1 /******************************************************************************
2 *
3 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 ******************************************************************************/
18 /**
19 *******************************************************************************
20 * @file
21 * ihevc_itrans_recon_32x32_atom_intr.c
22 *
23 * @brief
24 * Contains function definitions for inverse quantization, inverse
25 * transform and reconstruction
26 *
27 * @author
28 * 100470
29 *
30 * @par List of Functions:
31 * - ihevc_itrans_recon_32x32_ssse3()
32 *
33 * @remarks
34 * None
35 *
36 *******************************************************************************
37 */
38 #include <stdio.h>
39 #include <string.h>
40 #include "ihevc_typedefs.h"
41 #include "ihevc_platform_macros.h"
42 #include "ihevc_macros.h"
43 #include "ihevc_defs.h"
44 #include "ihevc_trans_tables.h"
45 #include "ihevc_iquant_itrans_recon.h"
46 #include "ihevc_func_selector.h"
47 #include "ihevc_trans_macros.h"
48
49
50
51
52 #include <immintrin.h>
53 #include <emmintrin.h>
54
55 #include <tmmintrin.h>
56
57
58
59 /**
60 *******************************************************************************
61 *
62 * @brief
63 * This function performs inverse transform and reconstruction
64 * for 32x32 input block
65 *
66 * @par Description:
67 * Performs inverse transform, adds the prediction data and
68 * clips output to 8 bit
69 *
70 * @param[in] pi2_src
71 * Input 32x32 coefficients
72 *
73 * @param[in] pi2_tmp
74 * Temporary buffer for scratch data and for storing inverse
75 * transform 1st stage output of the 32x32 block
76 *
77 * @param[in] pu1_pred
78 * Prediction 32x32 block
79 *
80 * @param[in] pi2_dequant_coeff
81 * Dequant Coeffs (stale entry: no such argument in this function's signature)
82 *
83 * @param[out] pu1_dst
84 * Output 32x32 block
85 *
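86 * @param[in] qp_div
87 * Quantization parameter / 6 (stale entry: no such argument in this function's signature)
88 *
89 * @param[in] qp_rem
90 * Quantization parameter % 6 (stale entry: no such argument in this function's signature)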
91 *
92 * @param[in] src_strd
93 * Input stride
94 *
95 * @param[in] pred_strd
96 * Prediction stride
97 *
98 * @param[in] dst_strd
99 * Output Stride
100 *
101 * @param[in] zero_cols
102 * Bit mask of zero columns in pi2_src
103 * @param[in] zero_rows Bit mask of zero rows in pi2_src
104 * @returns Void
105 *
106 * @remarks
107 * None
108 *
109 *******************************************************************************
110 */
111 /**/
112
ihevc_itrans_recon_32x32_ssse3(WORD16 * pi2_src,WORD16 * pi2_tmp,UWORD8 * pu1_pred,UWORD8 * pu1_dst,WORD32 src_strd,WORD32 pred_strd,WORD32 dst_strd,WORD32 zero_cols,WORD32 zero_rows)113 void ihevc_itrans_recon_32x32_ssse3(WORD16 *pi2_src,
114 WORD16 *pi2_tmp,
115 UWORD8 *pu1_pred,
116 UWORD8 *pu1_dst,
117 WORD32 src_strd,
118 WORD32 pred_strd,
119 WORD32 dst_strd,
120 WORD32 zero_cols,
121 WORD32 zero_rows)
122 {
123 /* Inverse Transform */
124
125 WORD32 j;
126
127
128 WORD16 *pi2_tmp_orig;
129
130
131 /*MEM_ALIGN16 WORD32 temp_array[1024];
132 MEM_ALIGN16 WORD16 temp1_array[1024];*/
133 WORD16 *o_temp_ptr;
134 WORD16 *temp_ptr;
135
136 __m128i m_temp_reg_0;
137 __m128i m_temp_reg_1;
138 __m128i m_temp_reg_2;
139 __m128i m_temp_reg_3;
140 __m128i m_temp_reg_4;
141 __m128i m_temp_reg_5;
142 __m128i m_temp_reg_6;
143 __m128i m_temp_reg_7;
144 __m128i m_temp_reg_10;
145 __m128i m_temp_reg_11;
146 __m128i m_temp_reg_12;
147 __m128i m_temp_reg_13;
148 __m128i m_temp_reg_14;
149 __m128i m_temp_reg_15;
150 __m128i m_temp_reg_16;
151 __m128i m_temp_reg_17;
152 __m128i m_temp_reg_18;
153 __m128i m_temp_reg_19;
154 __m128i m_temp_reg_20;
155 __m128i m_temp_reg_21;
156 __m128i m_temp_reg_22;
157 __m128i m_temp_reg_23;
158 __m128i m_temp_reg_30;
159 __m128i m_temp_reg_31;
160 __m128i m_temp_reg_32;
161 __m128i m_temp_reg_33;
162 __m128i m_temp_reg_34;
163 __m128i m_temp_reg_35;
164 __m128i m_temp_reg_36;
165 __m128i m_temp_reg_37;
166 __m128i m_temp_reg_40;
167 __m128i m_temp_reg_41;
168 __m128i m_temp_reg_42;
169 __m128i m_temp_reg_43;
170 __m128i m_temp_reg_44;
171 __m128i m_temp_reg_45;
172 __m128i m_temp_reg_46;
173 __m128i m_temp_reg_47;
174
175 __m128i m_temp_reg_70;
176 __m128i m_temp_reg_71;
177 __m128i m_temp_reg_72;
178 __m128i m_temp_reg_73;
179 __m128i m_temp_reg_74;
180 __m128i m_temp_reg_75;
181 __m128i m_temp_reg_76;
182 __m128i m_temp_reg_77;
183
184 __m128i m_temp_reg_80;
185 __m128i m_temp_reg_81;
186 __m128i m_temp_reg_82;
187 __m128i m_temp_reg_83;
188 __m128i m_temp_reg_84;
189 __m128i m_temp_reg_85;
190 __m128i m_temp_reg_86;
191 __m128i m_temp_reg_87;
192
193 __m128i m_temp_reg_90;
194 __m128i m_temp_reg_91;
195 __m128i m_temp_reg_92;
196 __m128i m_temp_reg_93;
197 __m128i m_temp_reg_94;
198 __m128i m_temp_reg_95;
199 __m128i m_temp_reg_96;
200 __m128i m_temp_reg_97;
201
202 __m128i m_rdng_factor;
203 __m128i m_count;
204 __m128i m_coeff1, m_coeff2, m_coeff3, m_coeff4;
205 __m128i m_coeff5, m_coeff6, m_coeff7, m_coeff8;
206
207 __m128i temp1, temp2, temp3, temp4;
208 __m128i temp5, temp6, temp7, temp8;
209
210 __m128i all_zero_reg;
211 WORD32 i;
212
213 /*Lokesh*/
214 WORD32 zero_last24_cols_stg1;
215 WORD32 zero_last24_rows_stg1;
216 WORD32 zero_last28_rows_stg1;
217
218 WORD32 zero_last28_rows_stg2;
219 WORD32 zero_last24_rows_stg2;
220
221 WORD32 trans_size_stg1;
222
223 WORD32 i4_shift = IT_SHIFT_STAGE_1;
224 WORD32 trans_size = TRANS_SIZE_32;
225
226
227 /* Last 8 cols of 16x16 block are skipped based on the below flag : Lokesh */
228 zero_last24_cols_stg1 = ((zero_cols & 0xFFFFFF00) == 0xFFFFFF00) ? 1 : 0;
229 zero_last24_rows_stg1 = ((zero_rows & 0xFFFFFF00) == 0xFFFFFF00) ? 1 : 0;
230 zero_last28_rows_stg1 = ((zero_rows & 0xFFFFFFF0) == 0xFFFFFFF0) ? 1 : 0;
231
232 zero_last28_rows_stg2 = ((zero_cols & 0xFFFFFFF0) == 0xFFFFFFF0) ? 1 : 0;
233 zero_last24_rows_stg2 = zero_last24_cols_stg1;
234
235 if((zero_last28_rows_stg2) || (zero_last24_cols_stg1))
236 {
237 trans_size_stg1 = 8;
238
239 }
240 else
241 {
242 trans_size_stg1 = 32;
243 }
244
245 all_zero_reg = _mm_setzero_si128();
246
247 o_temp_ptr = pi2_tmp;
248 temp_ptr = (pi2_tmp + 1024);
249
250 pi2_tmp += 2048;
251 pi2_tmp_orig = pi2_tmp;
252
253 for(i = 0; i < trans_size_stg1; i += 8)
254 {
255
256
257 {
258 WORD16 *pi2_tmp_src = pi2_src;
259
260 m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_tmp_src);
261 pi2_tmp_src += (src_strd << 1);
262 m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_tmp_src);
263 pi2_tmp_src += (src_strd << 1);
264 m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_tmp_src);
265 pi2_tmp_src += (src_strd << 1);
266 m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_tmp_src);
267 pi2_tmp_src += (src_strd << 1);
268 m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_tmp_src);
269 pi2_tmp_src += (src_strd << 1);
270 m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_tmp_src);
271 pi2_tmp_src += (src_strd << 1);
272 m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_tmp_src);
273 pi2_tmp_src += (src_strd << 1);
274 m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_tmp_src);
275 pi2_tmp_src += (src_strd << 1);
276
277 m_temp_reg_80 = _mm_load_si128((__m128i *)pi2_tmp_src);
278 pi2_tmp_src += (src_strd << 1);
279 m_temp_reg_81 = _mm_load_si128((__m128i *)pi2_tmp_src);
280 pi2_tmp_src += (src_strd << 1);
281 m_temp_reg_82 = _mm_load_si128((__m128i *)pi2_tmp_src);
282 pi2_tmp_src += (src_strd << 1);
283 m_temp_reg_83 = _mm_load_si128((__m128i *)pi2_tmp_src);
284 pi2_tmp_src += (src_strd << 1);
285 m_temp_reg_84 = _mm_load_si128((__m128i *)pi2_tmp_src);
286 pi2_tmp_src += (src_strd << 1);
287 m_temp_reg_85 = _mm_load_si128((__m128i *)pi2_tmp_src);
288 pi2_tmp_src += (src_strd << 1);
289 m_temp_reg_86 = _mm_load_si128((__m128i *)pi2_tmp_src);
290 pi2_tmp_src += (src_strd << 1);
291 m_temp_reg_87 = _mm_load_si128((__m128i *)pi2_tmp_src);
292 }
293
294 if(zero_last28_rows_stg1)
295 {
296 /* eeo */
297 /* eeeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
298 /* eeeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
299 {
300 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64
301
302 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
303
304 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
305
306 /* eeeo[0]= m_temp_reg_20 */
307 /* eeeo[1]= m_temp_reg_21 */
308 /* eeee[0]= m_temp_reg_22 */
309 /* eeee[1]= m_temp_reg_23 */
310
311 /* eee[0] = eeee[0] + eeeo[0]; */
312 m_temp_reg_40 = m_temp_reg_14;
313
314 /* eee[3] = eeee[0] - eeeo[0]; */
315 m_temp_reg_43 = m_temp_reg_14;
316
317 /* eee[2] = eeee[1] - eeeo[1]; */
318 m_temp_reg_42 = m_temp_reg_14; //m_temp_reg_16;
319
320 /* eee[1] = eeee[1] + eeeo[1];*/
321 m_temp_reg_41 = m_temp_reg_14; //m_temp_reg_16;
322
323 m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
324
325 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
326
327 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
328
329 /* eeeo[0]= m_temp_reg_20 */
330 /* eeeo[1]= m_temp_reg_21 */
331 /* eeee[0]= m_temp_reg_22 */
332 /* eeee[1]= m_temp_reg_23 */
333
334 /* eee[0] = eeee[0] + eeeo[0]; */
335 m_temp_reg_44 = m_temp_reg_14;
336
337 /* eee[3] = eeee[0] - eeeo[0]; */
338 m_temp_reg_47 = m_temp_reg_14;
339
340 /* eee[2] = eeee[1] - eeeo[1]; */
341 m_temp_reg_46 = m_temp_reg_14; //m_temp_reg_16;
342
343 /* eee[1] = eeee[1] + eeeo[1];*/
344 m_temp_reg_45 = m_temp_reg_14; //m_temp_reg_16;
345
346
347 }
348 /* eo */
349 {
350 WORD16 *pi2_scratch = o_temp_ptr;
351
352 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90
353 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87
354 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80
355 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70
356 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57
357 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43
358 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25
359 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //9
360
361 //m_temp_reg_10 = _mm_cvtepi16_epi32(m_temp_reg_71);
362 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, all_zero_reg);
363
364 m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
365
366 /* eo0[0-3] */
367 {
368 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
369
370 //m_temp_reg_14 = _mm_cvtepi16_epi32(m_temp_reg_71);
371 m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_71, all_zero_reg);
372
373 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
374 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
375
376 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
377 pi2_scratch += 8;
378 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
379 pi2_scratch += 8;
380
381 }
382
383 /* eo0[4-7] */
384 {
385 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
386
387 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
388 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
389
390 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
391 pi2_scratch += 8;
392 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
393 pi2_scratch += 8;
394
395 }
396 /* eo1[0-3] */
397 {
398 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff2);
399
400 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30);
401 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30);
402
403 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
404 pi2_scratch += 8;
405 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
406 pi2_scratch += 8;
407
408 }
409
410 /* eo1[4-7] */
411 {
412 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff2);
413
414 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30);
415 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30);
416
417 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
418 pi2_scratch += 8;
419 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
420 pi2_scratch += 8;
421
422 }
423
424 /* eo2[0-3] */
425 {
426 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
427
428 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
429 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
430
431 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
432 pi2_scratch += 8;
433 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
434 pi2_scratch += 8;
435
436 }
437
438 /* eo2[4-7] */
439 {
440 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
441
442 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
443 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
444
445 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
446 pi2_scratch += 8;
447 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
448 pi2_scratch += 8;
449
450 }
451
452 /**************************************************************************/
453
454
455 /* eo3[0-3] */
456 {
457 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff4);
458
459 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30);
460 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30);
461
462 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
463 pi2_scratch += 8;
464 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
465 pi2_scratch += 8;
466
467 }
468
469 /* eo3[4-7] */
470 {
471 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff4);
472
473 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30);
474 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30);
475
476 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
477 pi2_scratch += 8;
478 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
479 pi2_scratch += 8;
480
481 }
482
483
484 /* eo4[0-3] */
485 {
486 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
487
488 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30);
489 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30);
490
491 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
492 pi2_scratch += 8;
493 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
494 pi2_scratch += 8;
495
496 }
497 /* eo4[4-7] */
498 {
499 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
500
501 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30);
502 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30);
503
504 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
505 pi2_scratch += 8;
506 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
507 pi2_scratch += 8;
508
509 }
510
511 /***********************************************************************/
512
513 /* eo5[0-3] */
514 {
515 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff6);
516
517 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
518 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
519
520 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
521 pi2_scratch += 8;
522 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
523 pi2_scratch += 8;
524
525 }
526
527
528 /* eo5[4-7] */
529 {
530 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff6);
531
532 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
533 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
534
535 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
536 pi2_scratch += 8;
537 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
538 pi2_scratch += 8;
539
540 }
541
542 /* eo6[0-3] */
543 {
544 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff7);
545
546 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30);
547 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30);
548
549 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
550 pi2_scratch += 8;
551 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
552 pi2_scratch += 8;
553
554 }
555
556
557 /* eo6[4-7] */
558 {
559 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff7);
560
561 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30);
562 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30);
563
564 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
565 pi2_scratch += 8;
566 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
567 pi2_scratch += 8;
568
569 }
570
571
572 /* eo7[0-3] */
573 {
574 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff8);
575
576 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
577 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
578
579 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
580 pi2_scratch += 8;
581 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
582 pi2_scratch += 8;
583
584 }
585
586
587 /* eo7[4-7] */
588 {
589 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff8);
590
591 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
592 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
593
594 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
595 pi2_scratch += 8;
596 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
597 pi2_scratch += 8;
598
599 }
600
601 }
602 }
603 else if(zero_last24_rows_stg1)
604 {
605 {
606 /* eeo */
607 /* eeeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
608 /* eeeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
609
610 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83 36
611 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36 -83
612
613 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64 64
614
615 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
616
617 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
618
619 /* eeeo[0]= m_temp_reg_20 */
620 /* eeeo[1]= m_temp_reg_21 */
621 /* eeee[0]= m_temp_reg_22 */
622 /* eeee[1]= m_temp_reg_23 */
623
624 /* eee[0] = eeee[0] + eeeo[0]; */
625 m_temp_reg_40 = m_temp_reg_14;
626
627 /* eee[3] = eeee[0] - eeeo[0]; */
628 m_temp_reg_43 = m_temp_reg_14;
629
630 /* eee[2] = eeee[1] - eeeo[1]; */
631 m_temp_reg_42 = m_temp_reg_14; //m_temp_reg_16;
632
633 /* eee[1] = eeee[1] + eeeo[1];*/
634 m_temp_reg_41 = m_temp_reg_14; //m_temp_reg_16;
635
636 /* for row 4 to 7 */
637
638 m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
639
640 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
641
642 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
643
644 /* eeeo[0]= m_temp_reg_20 */
645 /* eeeo[1]= m_temp_reg_21 */
646 /* eeee[0]= m_temp_reg_22 */
647 /* eeee[1]= m_temp_reg_23 */
648
649 /* eee[0] = eeee[0] + eeeo[0]; */
650 m_temp_reg_44 = m_temp_reg_14;
651
652 /* eee[3] = eeee[0] - eeeo[0]; */
653 m_temp_reg_47 = m_temp_reg_14;
654
655 /* eee[2] = eeee[1] - eeeo[1]; */
656 m_temp_reg_46 = m_temp_reg_14; //m_temp_reg_16;
657
658 /* eee[1] = eeee[1] + eeeo[1];*/
659 m_temp_reg_45 = m_temp_reg_14; //m_temp_reg_16;
660
661
662 /* eeo[] */
663 /* for(k = 0; k < 4; k++) */
664
665 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75
666 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75
667 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18
668 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[5][0]); //50 18
669
670 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, all_zero_reg);
671
672 m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8);
673
674 m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_72, all_zero_reg);
675
676 m_temp_reg_33 = _mm_setzero_si128();
677
678 /* eeo */
679 {
680 /* eeo0[0-3] */
681 {
682 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
683
684 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
685 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
686
687 m_temp_reg_90 = m_temp_reg_34;
688 m_temp_reg_97 = m_temp_reg_35;
689 }
690 /* eeo0[4-7] */
691 {
692 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
693
694 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
695 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
696
697 m_temp_reg_91 = m_temp_reg_34;
698 m_temp_reg_96 = m_temp_reg_35;
699
700 }
701
702 /* eeo1[0-3] */
703 {
704 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff2);
705
706 /* e[1][0-3] stored in pi2_tmp[2][0-7] */
707 /* e[6][0-3] stored in pi2_tmp[2][8-15] */
708 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30);
709 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30);
710
711 m_temp_reg_92 = m_temp_reg_34;
712 m_temp_reg_95 = m_temp_reg_35;
713
714 }
715
716 /* eo1[4-7] */
717 {
718 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff2);
719
720 /* e[1][4-7] stored in pi2_tmp[3][0-7] */
721 /* e[6][4-7] stored in pi2_tmp[3][8-15] */
722 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30);
723 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30);
724
725 m_temp_reg_93 = m_temp_reg_34;
726 m_temp_reg_94 = m_temp_reg_35;
727
728
729 }
730
731 /* eo2[0-3] */
732 {
733 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff4);
734
735 /* e[2][0-3] stored in pi2_tmp[4][0-7] */
736 /* e[5][0-3] stored in pi2_tmp[4][8-15] */
737 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
738 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
739
740 temp1 = m_temp_reg_34;
741 temp7 = m_temp_reg_35;
742
743 }
744
745 /* eo2[4-7] */
746 {
747 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff4);
748
749 /* e[2][4-7] stored in pi2_tmp[5][0-7] */
750 /* e[5][4-7] stored in pi2_tmp[5][8-15] */
751 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
752 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
753
754 temp2 = m_temp_reg_34;
755 temp6 = m_temp_reg_35;
756
757 }
758
759 /* eo3[0-3] */
760 {
761 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
762
763 /* e[3][0-3] stored in pi2_tmp[6][0-7] */
764 /* e[4][0-3] stored in pi2_tmp[6][8-15] */
765 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30);
766 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30);
767
768 temp3 = m_temp_reg_34;
769 temp5 = m_temp_reg_35;
770
771 }
772
773
774 /* eo3[4-7] */
775 {
776 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
777
778 /* e[3][4-7] stored in pi2_tmp[7][0-7] */
779 /* e[4][4-7] stored in pi2_tmp[7][8-15] */
780 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30);
781 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30);
782
783 temp4 = m_temp_reg_34;
784 temp8 = m_temp_reg_35;
785
786
787 }
788 /* All values of ee[] array in pi2_temp */
789
790 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87
791 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[1][0]); //80 70
792 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57 43
793 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25 9
794
795 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
796
797 m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
798 m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8);
799
800 }
801 }
802 /* eo */
803 {
804 WORD16 *pi2_scratch = o_temp_ptr;
805
806 /* eo0[0-3] */
807 {
808 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
809
810 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_30);
811 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_30);
812
813 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
814 pi2_scratch += 8;
815 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
816 pi2_scratch += 8;
817
818 }
819
820
821 /* eo0[4-7] */
822 {
823 m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
824
825 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
826
827 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_30);
828 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_30);
829
830 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
831 pi2_scratch += 8;
832 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
833 pi2_scratch += 8;
834
835 }
836
837 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87 57
838
839 /* eo1[0-3] */
840 {
841 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
842
843 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_30);
844 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_30);
845
846 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
847 pi2_scratch += 8;
848 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
849 pi2_scratch += 8;
850
851 }
852
853
854 /* eo1[4-7] */
855 {
856 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
857
858 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_30);
859 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_30);
860
861 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
862 pi2_scratch += 8;
863 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
864 pi2_scratch += 8;
865
866 }
867
868 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80 9
869
870 /* eo2[0-3] */
871 {
872
873 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
874
875 m_temp_reg_34 = _mm_add_epi32(temp1, m_temp_reg_30);
876 m_temp_reg_35 = _mm_sub_epi32(temp1, m_temp_reg_30);
877
878 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
879 pi2_scratch += 8;
880 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
881 pi2_scratch += 8;
882
883 }
884
885 /* eo2[4-7] */
886 {
887
888 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
889
890 m_temp_reg_34 = _mm_add_epi32(temp2, m_temp_reg_30);
891 m_temp_reg_35 = _mm_sub_epi32(temp2, m_temp_reg_30);
892
893 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
894 pi2_scratch += 8;
895 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
896 pi2_scratch += 8;
897
898 }
899
900 /**************************************************************************/
901
902
903
904 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70 -43
905
906 /* eo3[0-3] */
907 {
908
909 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
910
911 m_temp_reg_34 = _mm_add_epi32(temp3, m_temp_reg_30);
912 m_temp_reg_35 = _mm_sub_epi32(temp3, m_temp_reg_30);
913
914 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
915 pi2_scratch += 8;
916 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
917 pi2_scratch += 8;
918
919 }
920
921
922 /* eo3[4-7] */
923 {
924
925 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
926
927 m_temp_reg_34 = _mm_add_epi32(temp4, m_temp_reg_30);
928 m_temp_reg_35 = _mm_sub_epi32(temp4, m_temp_reg_30);
929
930 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
931 pi2_scratch += 8;
932 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
933 pi2_scratch += 8;
934
935 }
936
937 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57 -80
938
939 /* eo4[0-3] */
940 {
941 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
942
943 m_temp_reg_34 = _mm_add_epi32(temp5, m_temp_reg_30);
944 m_temp_reg_35 = _mm_sub_epi32(temp5, m_temp_reg_30);
945
946 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
947 pi2_scratch += 8;
948 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
949 pi2_scratch += 8;
950
951 }
952 /* eo4[4-7] */
953 {
954 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
955
956 m_temp_reg_34 = _mm_add_epi32(temp8, m_temp_reg_30);
957 m_temp_reg_35 = _mm_sub_epi32(temp8, m_temp_reg_30);
958
959 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
960 pi2_scratch += 8;
961 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
962 pi2_scratch += 8;
963
964 }
965
966 /***********************************************************************/
967
968 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43 -90
969
970 /* eo5[0-3] */
971 {
972
973 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
974
975 m_temp_reg_34 = _mm_add_epi32(temp7, m_temp_reg_30);
976 m_temp_reg_35 = _mm_sub_epi32(temp7, m_temp_reg_30);
977
978 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
979 pi2_scratch += 8;
980 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
981 pi2_scratch += 8;
982
983 }
984
985
986 /* eo5[4-7] */
987 {
988 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
989
990 m_temp_reg_34 = _mm_add_epi32(temp6, m_temp_reg_30);
991 m_temp_reg_35 = _mm_sub_epi32(temp6, m_temp_reg_30);
992
993 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
994 pi2_scratch += 8;
995 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
996 pi2_scratch += 8;
997
998 }
999
1000 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25 -70
1001
1002 /* eo6[0-3] */
1003 {
1004 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1005
1006 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_30);
1007 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_30);
1008
1009 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1010 pi2_scratch += 8;
1011 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1012 pi2_scratch += 8;
1013
1014 }
1015
1016
1017 /* eo6[4-7] */
1018 {
1019
1020 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1021
1022 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_30);
1023 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_30);
1024
1025 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1026 pi2_scratch += 8;
1027 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1028 pi2_scratch += 8;
1029
1030 }
1031
1032 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); //9 -25
1033
1034 /* eo7[0-3] */
1035 {
1036
1037 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1038
1039 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_30);
1040 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_30);
1041
1042 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1043 pi2_scratch += 8;
1044 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1045 pi2_scratch += 8;
1046
1047 }
1048
1049
1050 /* eo7[4-7] */
1051 {
1052 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1053
1054 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_30);
1055 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_30);
1056
1057 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1058 pi2_scratch += 8;
1059 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1060 pi2_scratch += 8;
1061
1062 }
1063
1064 }
1065
1066 }
1067 else
1068 {
1069
1070 {
1071 /* eeo */
1072 /* eeeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
1073 /* eeeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
1074
1075 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83 36
1076 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36 -83
1077
1078 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64 64
1079 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[3][0]); //64 -64
1080
1081 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_84);
1082
1083 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_80);
1084
1085 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); /* eeeo[0] */
1086 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); /* eeeo[1] */
1087
1088 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); /* eeee[0] */
1089 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff4); /* eeee[1] */
1090
1091
1092 /* eeeo[0]= m_temp_reg_20 */
1093 /* eeeo[1]= m_temp_reg_21 */
1094 /* eeee[0]= m_temp_reg_22 */
1095 /* eeee[1]= m_temp_reg_23 */
1096
1097 /* eee[0] = eeee[0] + eeeo[0]; */
1098 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20); /* eeeo[0] */
1099
1100 /* eee[3] = eeee[0] - eeeo[0]; */
1101 m_temp_reg_43 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20); /* eeeo[1] */
1102
1103 /* eee[2] = eeee[1] - eeeo[1]; */
1104 m_temp_reg_42 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_22); /* eeee[1] */
1105
1106 /* eee[1] = eeee[1] + eeeo[1];*/
1107 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_22); /* eeee[0] */
1108
1109 /* for row 4 to 7 */
1110
1111 m_temp_reg_74 = _mm_srli_si128(m_temp_reg_74, 8);
1112 m_temp_reg_84 = _mm_srli_si128(m_temp_reg_84, 8);
1113
1114 /* Interleaving row 8 and row 24*/
1115 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_84);
1116
1117 m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
1118 m_temp_reg_80 = _mm_srli_si128(m_temp_reg_80, 8);
1119
1120 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_80);
1121
1122 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); /* eeeo[0] */
1123 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); /* eeeo[1] */
1124
1125 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); /* eeee[0] */
1126 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff4); /* eeee[1] */
1127
1128
1129 /* eeeo[0]= m_temp_reg_20 */
1130 /* eeeo[1]= m_temp_reg_21 */
1131 /* eeee[0]= m_temp_reg_22 */
1132 /* eeee[1]= m_temp_reg_23 */
1133
1134 /* eee[0] = eeee[0] + eeeo[0]; */
1135 m_temp_reg_44 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20); /* eeeo[0] */
1136
1137 /* eee[3] = eeee[0] - eeeo[0]; */
1138 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20); /* eeeo[1] */
1139
1140 /* eee[2] = eeee[1] - eeeo[1]; */
1141 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_22); /* eeee[1] */
1142
1143 /* eee[1] = eeee[1] + eeeo[1];*/
1144 m_temp_reg_45 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_22); /* eeee[0] */
1145
1146
1147 // eeo[]
1148 /* for(k = 0; k < 4; k++) */
1149
1150
1151 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75
1152 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[5][0]); //50 18
1153
1154 /* eeo */
1155 {
1156 /* eeo0[0-3] */
1157 {
1158 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
1159 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_86);
1160
1161 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1162 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1163
1164 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1165
1166 m_temp_reg_90 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
1167 m_temp_reg_97 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
1168
1169 }
1170
1171 m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8);
1172 m_temp_reg_76 = _mm_srli_si128(m_temp_reg_76, 8);
1173 m_temp_reg_82 = _mm_srli_si128(m_temp_reg_82, 8);
1174 m_temp_reg_86 = _mm_srli_si128(m_temp_reg_86, 8);
1175
1176 /* eeo0[4-7] */
1177 {
1178 m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
1179 m_temp_reg_15 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_86);
1180
1181 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1182 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
1183
1184 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1185
1186 m_temp_reg_91 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
1187 m_temp_reg_96 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
1188
1189 }
1190
1191
1192 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75 -18
1193 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[7][0]); //89 50
1194
1195 /* eeo1[0-3] */
1196 {
1197 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
1198 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
1199
1200 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30);
1201 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30);
1202
1203 m_temp_reg_92 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_31);
1204 m_temp_reg_95 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_31);
1205
1206 }
1207
1208 /* eeo1[4-7] */
1209 {
1210
1211 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
1212 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff4);
1213
1214 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30);
1215 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30);
1216
1217 m_temp_reg_93 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_31);
1218 m_temp_reg_94 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_31);
1219
1220
1221 }
1222
1223 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[8][0]); //50 -89
1224 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18 75
1225
1226 /* eeo2[0-3] */
1227 {
1228
1229 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
1230 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
1231
1232 /* e[2][0-3] stored in pi2_tmp[4][0-7] */
1233 /* e[5][0-3] stored in pi2_tmp[4][8-15] */
1234
1235 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
1236 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
1237
1238 temp1 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31);
1239 temp7 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31);
1240
1241 }
1242
1243 /* eeo2[4-7] */
1244 {
1245
1246 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
1247 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff4);
1248
1249 /* e[2][4-7] stored in pi2_tmp[5][0-7] */
1250 /* e[5][4-7] stored in pi2_tmp[5][8-15] */
1251
1252 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
1253 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
1254
1255 temp2 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31);
1256 temp6 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31);
1257
1258 }
1259
1260 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[10][0]); //18 -50
1261 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[11][0]); //75 -89
1262
1263 /* eeo3[0-3] */
1264 {
1265
1266 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
1267 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
1268
1269 /* e[3][0-3] stored in pi2_tmp[6][0-7] */
1270 /* e[4][0-3] stored in pi2_tmp[6][8-15] */
1271
1272 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30);
1273 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30);
1274
1275 temp3 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31);
1276 temp5 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31);
1277
1278
1279 }
1280
1281 /* eeo3[4-7] */
1282 {
1283
1284 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
1285 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff4);
1286
1287 /* e[3][4-7] stored in pi2_tmp[7][0-7] */
1288 /* e[4][4-7] stored in pi2_tmp[7][8-15] */
1289
1290 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30);
1291 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30);
1292 temp4 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31);
1293 temp8 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31);
1294
1295 }
1296
1297
1298 /* All values of ee[] array in pi2_temp */
1299
1300 /* for(k = 0; k < 8; k++) */
1301 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87
1302 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[1][0]); //80 70
1303 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57 43
1304 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25 9
1305 }
1306 }
1307 /* eo */
1308 {
1309 WORD16 *pi2_scratch = o_temp_ptr;
1310
1311 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
1312 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
1313 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_81, m_temp_reg_83);
1314 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_85, m_temp_reg_87);
1315
1316 m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
1317 m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8);
1318 m_temp_reg_75 = _mm_srli_si128(m_temp_reg_75, 8);
1319 m_temp_reg_77 = _mm_srli_si128(m_temp_reg_77, 8);
1320
1321 m_temp_reg_81 = _mm_srli_si128(m_temp_reg_81, 8);
1322 m_temp_reg_83 = _mm_srli_si128(m_temp_reg_83, 8);
1323 m_temp_reg_85 = _mm_srli_si128(m_temp_reg_85, 8);
1324 m_temp_reg_87 = _mm_srli_si128(m_temp_reg_87, 8);
1325
1326 /* eo0[0-3] */
1327 {
1328 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1329 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1330
1331 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1332
1333 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
1334 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
1335
1336 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
1337
1338 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1339
1340 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_30);
1341 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_30);
1342
1343 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1344 pi2_scratch += 8;
1345 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1346 pi2_scratch += 8;
1347
1348 }
1349 /* eo0[4-7] */
1350 {
1351 m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
1352 m_temp_reg_15 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
1353 m_temp_reg_16 = _mm_unpacklo_epi16(m_temp_reg_81, m_temp_reg_83);
1354 m_temp_reg_17 = _mm_unpacklo_epi16(m_temp_reg_85, m_temp_reg_87);
1355
1356 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1357 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
1358
1359 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1360
1361 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
1362 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
1363
1364 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
1365
1366 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1367
1368 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_30);
1369 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_30);
1370
1371 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1372 pi2_scratch += 8;
1373 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1374 pi2_scratch += 8;
1375
1376 }
1377
1378 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87 57
1379 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //0 -43
1380 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80 90
1381 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70 25
1382
1383 /* eo1[0-3] */
1384 {
1385
1386 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1387 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1388
1389 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1390
1391 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
1392 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
1393
1394 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
1395
1396 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_32);
1397
1398 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_30);
1399 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_30);
1400
1401 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1402 pi2_scratch += 8;
1403 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1404 pi2_scratch += 8;
1405
1406 }
1407
1408 /* eo1[4-7] */
1409 {
1410 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1411 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
1412
1413 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1414
1415 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
1416 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
1417
1418 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
1419
1420 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_32);
1421
1422 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_30);
1423 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_30);
1424
1425 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1426 pi2_scratch += 8;
1427 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1428 pi2_scratch += 8;
1429
1430 }
1431
1432 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80 9
1433 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[9][0]); //70 87
1434 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[10][0]); //-25 57
1435 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[11][0]); //90 43
1436
1437 /* eo2[0-3] */
1438 {
1439 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1440 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1441
1442 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31);
1443
1444 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
1445 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
1446
1447 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
1448
1449 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1450
1451 m_temp_reg_34 = _mm_add_epi32(temp1, m_temp_reg_30);
1452 m_temp_reg_35 = _mm_sub_epi32(temp1, m_temp_reg_30);
1453
1454 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1455 pi2_scratch += 8;
1456 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1457 pi2_scratch += 8;
1458
1459 }
1460
1461
1462 /* eo2[4-7] */
1463 {
1464
1465 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1466 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
1467
1468 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31);
1469
1470 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
1471 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
1472
1473 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
1474
1475 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1476
1477 m_temp_reg_34 = _mm_add_epi32(temp2, m_temp_reg_30);
1478 m_temp_reg_35 = _mm_sub_epi32(temp2, m_temp_reg_30);
1479
1480 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1481 pi2_scratch += 8;
1482 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1483 pi2_scratch += 8;
1484
1485 }
1486 /**************************************************************************/
1487
1488 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70 -43
1489 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[13][0]); //-87 9
1490 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[14][0]); //90 25
1491 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[15][0]); //80 57
1492
1493 /* eo3[0-3] */
1494 {
1495 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1496 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1497
1498 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1499
1500 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
1501 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
1502
1503 m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_32, m_temp_reg_33);
1504
1505 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1506
1507 m_temp_reg_34 = _mm_add_epi32(temp3, m_temp_reg_30);
1508 m_temp_reg_35 = _mm_sub_epi32(temp3, m_temp_reg_30);
1509
1510 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1511 pi2_scratch += 8;
1512 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1513 pi2_scratch += 8;
1514
1515 }
1516
1517
1518 /* eo3[4-7] */
1519 {
1520 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1521 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
1522
1523 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1524
1525 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
1526 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
1527
1528 m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_32, m_temp_reg_33);
1529
1530 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1531
1532 m_temp_reg_34 = _mm_add_epi32(temp4, m_temp_reg_30);
1533 m_temp_reg_35 = _mm_sub_epi32(temp4, m_temp_reg_30);
1534
1535 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1536 pi2_scratch += 8;
1537 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1538 pi2_scratch += 8;
1539
1540 }
1541
1542 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57 -80
1543 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[17][0]); //-25 90
1544 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[18][0]); //9 87
1545 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43 70
1546
1547 /* eo4[0-3] */
1548 {
1549
1550 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1551 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1552
1553 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1554
1555 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
1556 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
1557
1558 m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32);
1559
1560 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1561
1562 m_temp_reg_34 = _mm_add_epi32(temp5, m_temp_reg_30);
1563 m_temp_reg_35 = _mm_sub_epi32(temp5, m_temp_reg_30);
1564
1565 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1566 pi2_scratch += 8;
1567 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1568 pi2_scratch += 8;
1569
1570 }
1571
1572
1573 /* eo4[4-7] */
1574 {
1575 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1576 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
1577
1578 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1579
1580 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
1581 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
1582
1583 m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32);
1584
1585 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1586
1587 m_temp_reg_34 = _mm_add_epi32(temp8, m_temp_reg_30);
1588 m_temp_reg_35 = _mm_sub_epi32(temp8, m_temp_reg_30);
1589
1590 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1591 pi2_scratch += 8;
1592 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1593 pi2_scratch += 8;
1594
1595 }
1596
1597 /***********************************************************************/
1598
1599 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43 -90
1600 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[21][0]); //57 25
1601 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[22][0]); //-87 70
1602 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[23][0]); //9 -80
1603
1604 /* eo5[0-3] */
1605 {
1606 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1607 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1608
1609 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1610
1611 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
1612 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
1613
1614 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
1615
1616 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1617
1618 m_temp_reg_34 = _mm_add_epi32(temp7, m_temp_reg_30);
1619 m_temp_reg_35 = _mm_sub_epi32(temp7, m_temp_reg_30);
1620
1621 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1622 pi2_scratch += 8;
1623 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1624 pi2_scratch += 8;
1625
1626 }
1627
1628
1629 /* eo5[4-7] */
1630 {
1631 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1632 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
1633
1634 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1635
1636 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
1637 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
1638
1639 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
1640
1641 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1642
1643 m_temp_reg_34 = _mm_add_epi32(temp6, m_temp_reg_30);
1644 m_temp_reg_35 = _mm_sub_epi32(temp6, m_temp_reg_30);
1645
1646 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1647 pi2_scratch += 8;
1648 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1649 pi2_scratch += 8;
1650
1651 }
1652
1653 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25 -70
1654 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[25][0]); //90 -80
1655 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[26][0]); //43 9
1656 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[27][0]); //-57 87
1657
1658 /* eo6[0-3] */
1659 {
1660
1661 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1662 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1663
1664 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1665
1666 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
1667 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
1668
1669 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
1670
1671 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1672
1673 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_30);
1674 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_30);
1675
1676 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1677 pi2_scratch += 8;
1678 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1679 pi2_scratch += 8;
1680
1681 }
1682
1683
1684 /* eo6[4-7] */
1685 {
1686 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1687 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
1688
1689 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1690
1691 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
1692 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
1693
1694 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
1695
1696 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1697
1698 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_30);
1699 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_30);
1700
1701 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1702 pi2_scratch += 8;
1703 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1704 pi2_scratch += 8;
1705
1706 }
1707
1708 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); //9 -25
1709 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[29][0]); //43 -57
1710 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[30][0]); //70 -80
1711 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[31][0]); //87 -90
1712
1713 /* eo7[0-3] */
1714 {
1715
1716 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1717 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1718
1719 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1720
1721 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
1722 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
1723
1724 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
1725
1726 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1727
1728 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_30);
1729 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_30);
1730
1731 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1732 pi2_scratch += 8;
1733 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1734 pi2_scratch += 8;
1735
1736 }
1737
1738
1739 /* eo7[4-7] */
1740 {
1741
1742 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1743 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
1744
1745 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1746
1747 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
1748 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
1749
1750 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
1751
1752 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1753
1754 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_30);
1755 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_30);
1756
1757 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1758 pi2_scratch += 8;
1759 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1760 pi2_scratch += 8;
1761
1762 }
1763
1764 }
1765
1766 }
1767 /* All e[] are done */
1768 /****************************/
1769
1770
1771 {
1772
1773 WORD16 *pi2_tmp_src = pi2_src + src_strd;
1774
1775 m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_tmp_src);
1776 pi2_tmp_src += (src_strd << 1);
1777 m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_tmp_src);
1778 pi2_tmp_src += (src_strd << 1);
1779 m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_tmp_src);
1780 pi2_tmp_src += (src_strd << 1);
1781 m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_tmp_src);
1782 pi2_tmp_src += (src_strd << 1);
1783 m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_tmp_src);
1784 pi2_tmp_src += (src_strd << 1);
1785 m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_tmp_src);
1786 pi2_tmp_src += (src_strd << 1);
1787 m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_tmp_src);
1788 pi2_tmp_src += (src_strd << 1);
1789 m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_tmp_src);
1790 pi2_tmp_src += (src_strd << 1);
1791
1792 m_temp_reg_80 = _mm_load_si128((__m128i *)pi2_tmp_src);
1793 pi2_tmp_src += (src_strd << 1);
1794 m_temp_reg_81 = _mm_load_si128((__m128i *)pi2_tmp_src);
1795 pi2_tmp_src += (src_strd << 1);
1796 m_temp_reg_82 = _mm_load_si128((__m128i *)pi2_tmp_src);
1797 pi2_tmp_src += (src_strd << 1);
1798 m_temp_reg_83 = _mm_load_si128((__m128i *)pi2_tmp_src);
1799 pi2_tmp_src += (src_strd << 1);
1800 m_temp_reg_84 = _mm_load_si128((__m128i *)pi2_tmp_src);
1801 pi2_tmp_src += (src_strd << 1);
1802 m_temp_reg_85 = _mm_load_si128((__m128i *)pi2_tmp_src);
1803 pi2_tmp_src += (src_strd << 1);
1804 m_temp_reg_86 = _mm_load_si128((__m128i *)pi2_tmp_src);
1805 pi2_tmp_src += (src_strd << 1);
1806 m_temp_reg_87 = _mm_load_si128((__m128i *)pi2_tmp_src);
1807 }
1808
1809 if(zero_last28_rows_stg1)
1810 {
1811 /* o & stage 1 out */
1812 {
1813 WORD32 j;
1814 WORD16 *pi2_src_scratch = o_temp_ptr;
1815 WORD16 *pi2_dst_scratch = temp_ptr;
1816 WORD32 out_stride = (trans_size << 1);
1817 WORD32 in_stride = trans_size;
1818
1819 for(j = 0; j < 2; j++)
1820 {
1821 if(j)
1822 {
1823 m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
1824 m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
1825 }
1826
1827 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71);
1828
1829 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
1830
1831 /* o0[0-3] */
1832 {
1833 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1834
1835 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
1836 pi2_src_scratch += in_stride;
1837
1838 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1839 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1840
1841 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
1842 m_count = _mm_cvtsi32_si128(i4_shift);
1843 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
1844 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
1845
1846 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1847 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1848 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1849 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1850
1851 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1852
1853 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1854 pi2_dst_scratch += out_stride;
1855
1856 }
1857
1858 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
1859
1860 /* o1[0-3] */
1861 {
1862
1863 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1864
1865 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
1866 pi2_src_scratch += in_stride;
1867
1868 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1869 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1870
1871 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
1872 m_count = _mm_cvtsi32_si128(i4_shift);
1873 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
1874 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
1875
1876 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1877 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1878 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1879 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1880
1881 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1882
1883 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1884 pi2_dst_scratch += out_stride;
1885
1886 }
1887
1888 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
1889
1890 /* o2[0-3] */
1891 {
1892
1893 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1894
1895 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
1896 pi2_src_scratch += in_stride;
1897
1898 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1899 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1900
1901 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
1902 m_count = _mm_cvtsi32_si128(i4_shift);
1903 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
1904 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
1905
1906 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1907 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1908 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1909 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1910
1911 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1912
1913 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1914 pi2_dst_scratch += out_stride;
1915
1916 }
1917
1918 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
1919
1920 /* o3[0-3] */
1921 {
1922 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1923
1924 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
1925 pi2_src_scratch += in_stride;
1926
1927 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1928 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1929
1930 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
1931 m_count = _mm_cvtsi32_si128(i4_shift);
1932 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
1933 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
1934
1935 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1936 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1937 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1938 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1939
1940 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1941
1942 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1943 pi2_dst_scratch += out_stride;
1944
1945 }
1946
1947 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
1948
1949 /* o4[0-3] */
1950 {
1951 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1952
1953 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
1954 pi2_src_scratch += in_stride;
1955
1956 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1957 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1958
1959 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
1960 m_count = _mm_cvtsi32_si128(i4_shift);
1961 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
1962 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
1963
1964 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1965 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1966 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1967 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1968
1969 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1970
1971 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1972 pi2_dst_scratch += out_stride;
1973
1974 }
1975
1976 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
1977
1978 /* o5[0-3] */
1979 {
1980
1981 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1982
1983 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
1984 pi2_src_scratch += in_stride;
1985
1986 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1987 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1988
1989 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
1990 m_count = _mm_cvtsi32_si128(i4_shift);
1991 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
1992 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
1993
1994 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1995 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1996 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1997 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1998
1999 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2000
2001 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2002 pi2_dst_scratch += out_stride;
2003
2004 }
2005
2006 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
2007
2008 /* o6[0-3] */
2009 {
2010 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2011
2012 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2013 pi2_src_scratch += in_stride;
2014
2015 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2016 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2017
2018 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2019 m_count = _mm_cvtsi32_si128(i4_shift);
2020 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2021 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2022
2023 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2024 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2025 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2026 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2027
2028 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2029
2030 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2031 pi2_dst_scratch += out_stride;
2032
2033 }
2034
2035 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
2036
2037 /* o7[0-3] */
2038 {
2039
2040 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2041
2042 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2043 pi2_src_scratch += 8;
2044
2045 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2046 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2047
2048 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2049 m_count = _mm_cvtsi32_si128(i4_shift);
2050 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2051 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2052
2053 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2054 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2055 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2056 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2057
2058 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2059
2060 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2061 pi2_dst_scratch += 8;
2062
2063 }
2064
2065 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
2066
2067 /* o8[0-3] */
2068 {
2069 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2070
2071 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2072 pi2_src_scratch -= in_stride;
2073
2074 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2075 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2076
2077 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2078 m_count = _mm_cvtsi32_si128(i4_shift);
2079 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2080 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2081
2082 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2083 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2084 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2085 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2086
2087 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2088
2089 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2090 pi2_dst_scratch -= out_stride;
2091 }
2092
2093 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
2094
2095 /* o9[0-3] */
2096 {
2097 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2098
2099 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2100 pi2_src_scratch -= in_stride;
2101
2102 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2103 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2104
2105 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2106 m_count = _mm_cvtsi32_si128(i4_shift);
2107 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2108 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2109
2110 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2111 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2112 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2113 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2114
2115 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2116
2117 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2118 pi2_dst_scratch -= out_stride;
2119 }
2120
2121 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
2122
2123 /* o10[0-3] */
2124 {
2125 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2126
2127 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2128 pi2_src_scratch -= in_stride;
2129
2130 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2131 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2132
2133 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2134 m_count = _mm_cvtsi32_si128(i4_shift);
2135 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2136 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2137
2138 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2139 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2140 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2141 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2142
2143 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2144
2145 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2146 pi2_dst_scratch -= out_stride;
2147 }
2148
2149 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
2150
2151 /* o11[0-3] */
2152 {
2153 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2154
2155 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2156 pi2_src_scratch -= in_stride;
2157
2158 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2159 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2160
2161 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2162 m_count = _mm_cvtsi32_si128(i4_shift);
2163 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2164 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2165
2166 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2167 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2168 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2169 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2170
2171 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2172
2173 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2174 pi2_dst_scratch -= out_stride;
2175
2176 }
2177
2178 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
2179
2180 /* o12[0-3] */
2181 {
2182 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2183
2184 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2185 pi2_src_scratch -= in_stride;
2186
2187 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2188 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2189
2190 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2191 m_count = _mm_cvtsi32_si128(i4_shift);
2192 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2193 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2194
2195 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2196 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2197 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2198 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2199
2200 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2201
2202 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2203 pi2_dst_scratch -= out_stride;
2204
2205 }
2206
2207 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
2208
2209 /* o13[0-3] */
2210 {
2211 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2212
2213 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2214 pi2_src_scratch -= in_stride;
2215
2216 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2217 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2218
2219 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2220 m_count = _mm_cvtsi32_si128(i4_shift);
2221 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2222 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2223
2224 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2225 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2226 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2227 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2228
2229 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2230
2231 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2232 pi2_dst_scratch -= out_stride;
2233 }
2234
2235 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
2236
2237 /* o14[0-3] */
2238 {
2239 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2240
2241 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2242 pi2_src_scratch -= in_stride;
2243
2244 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2245 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2246
2247 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2248 m_count = _mm_cvtsi32_si128(i4_shift);
2249 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2250 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2251
2252 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2253 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2254 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2255 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2256
2257 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2258
2259 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2260 pi2_dst_scratch -= out_stride;
2261
2262 }
2263
2264 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
2265
2266 /* o15[0-3] */
2267 {
2268 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2269
2270 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2271 pi2_src_scratch += 8;
2272
2273 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2274 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2275
2276 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2277 m_count = _mm_cvtsi32_si128(i4_shift);
2278 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2279 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2280
2281 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2282 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2283 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2284 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2285
2286 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2287
2288 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2289 pi2_dst_scratch += 8;
2290 }
2291
2292 }
2293 }
2294 }
2295 else if(zero_last24_rows_stg1)
2296 {
2297 /* o & stage 1 out */
2298 {
2299 WORD32 j;
2300 WORD16 *pi2_src_scratch = o_temp_ptr;
2301 WORD16 *pi2_dst_scratch = temp_ptr;
2302 WORD32 out_stride = (trans_size << 1);
2303 WORD32 in_stride = trans_size;
2304
2305 for(j = 0; j < 2; j++)
2306 {
2307 if(j)
2308 {
2309 m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
2310 m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
2311 m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8);
2312 m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8);
2313 }
2314
2315 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved
2316 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 interleaved
2317
2318 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
2319 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]);
2320
2321 /* o0[0-3] */
2322 {
2323
2324 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2325 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2326
2327 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2328
2329 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2330 pi2_src_scratch += in_stride;
2331
2332 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2333 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2334
2335 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2336 m_count = _mm_cvtsi32_si128(i4_shift);
2337 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2338 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2339
2340 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2341 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2342 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2343 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2344
2345 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2346
2347 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2348 pi2_dst_scratch += out_stride;
2349
2350 }
2351
2352 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
2353 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]);
2354
2355 /* o1[0-3] */
2356 {
2357 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2358 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2359
2360 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2361
2362 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2363 pi2_src_scratch += in_stride;
2364
2365 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2366 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2367
2368 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2369 m_count = _mm_cvtsi32_si128(i4_shift);
2370 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2371 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2372
2373 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2374 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2375 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2376 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2377
2378 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2379
2380 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2381 pi2_dst_scratch += out_stride;
2382
2383 }
2384
2385 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
2386 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]);
2387
2388 /* o2[0-3] */
2389 {
2390 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2391 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2392
2393 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
2394
2395 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2396 pi2_src_scratch += in_stride;
2397
2398 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2399 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2400
2401 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2402 m_count = _mm_cvtsi32_si128(i4_shift);
2403 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2404 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2405
2406 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2407 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2408 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2409 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2410
2411 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2412
2413 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2414 pi2_dst_scratch += out_stride;
2415
2416 }
2417
2418 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
2419 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[25][0]);
2420
2421 /* o3[0-3] */
2422 {
2423 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2424 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2425
2426 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
2427
2428 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2429 pi2_src_scratch += in_stride;
2430
2431 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2432 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2433
2434 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2435 m_count = _mm_cvtsi32_si128(i4_shift);
2436 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2437 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2438
2439 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2440 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2441 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2442 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2443
2444 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2445
2446 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2447 pi2_dst_scratch += out_stride;
2448
2449 }
2450
2451 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
2452 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[33][0]);
2453
2454 /* o4[0-3] */
2455 {
2456 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2457 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2458
2459 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2460
2461 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2462 pi2_src_scratch += in_stride;
2463
2464 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2465 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2466
2467 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2468 m_count = _mm_cvtsi32_si128(i4_shift);
2469 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2470 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2471
2472 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2473 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2474 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2475 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2476
2477 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2478
2479 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2480 pi2_dst_scratch += out_stride;
2481
2482 }
2483
2484 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
2485 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[41][0]);
2486
2487 /* o5[0-3] */
2488 {
2489 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2490 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2491
2492 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2493
2494 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2495 pi2_src_scratch += in_stride;
2496
2497 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2498 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2499
2500 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2501 m_count = _mm_cvtsi32_si128(i4_shift);
2502 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2503 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2504
2505 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2506 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2507 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2508 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2509
2510 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2511
2512 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2513 pi2_dst_scratch += out_stride;
2514
2515 }
2516
2517 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
2518 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[49][0]);
2519
2520 /* o6[0-3] */
2521 {
2522 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2523 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2524
2525 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2526
2527 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2528 pi2_src_scratch += in_stride;
2529
2530 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2531 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2532
2533 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2534 m_count = _mm_cvtsi32_si128(i4_shift);
2535 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2536 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2537
2538 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2539 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2540 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2541 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2542
2543 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2544
2545 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2546 pi2_dst_scratch += out_stride;
2547
2548 }
2549
2550 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
2551 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[57][0]);
2552
2553 /* o7[0-3] */
2554 {
2555 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2556 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2557
2558 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2559
2560 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2561 pi2_src_scratch += 8;
2562
2563 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2564 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2565
2566 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2567 m_count = _mm_cvtsi32_si128(i4_shift);
2568 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2569 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2570
2571 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2572 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2573 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2574 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2575
2576 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2577
2578 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2579 pi2_dst_scratch += 8;
2580
2581 }
2582
2583 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
2584 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[65][0]);
2585
2586 /* o8[0-3] */
2587 {
2588 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2589 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2590
2591 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2592
2593 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2594 pi2_src_scratch -= in_stride;
2595
2596 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2597 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2598
2599 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2600 m_count = _mm_cvtsi32_si128(i4_shift);
2601 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2602 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2603
2604 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2605 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2606 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2607 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2608
2609 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2610
2611 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2612 pi2_dst_scratch -= out_stride;
2613 }
2614
2615 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
2616 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[73][0]);
2617
2618 /* o9[0-3] */
2619 {
2620 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2621 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2622
2623 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2624
2625 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2626 pi2_src_scratch -= in_stride;
2627
2628 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2629 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2630
2631 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2632 m_count = _mm_cvtsi32_si128(i4_shift);
2633 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2634 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2635
2636 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2637 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2638 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2639 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2640
2641 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2642
2643 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2644 pi2_dst_scratch -= out_stride;
2645 }
2646
2647 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
2648 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[81][0]);
2649
2650 /* o10[0-3] */
2651 {
2652 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2653 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2654
2655 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2656
2657 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2658 pi2_src_scratch -= in_stride;
2659
2660 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2661 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2662
2663 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2664 m_count = _mm_cvtsi32_si128(i4_shift);
2665 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2666 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2667
2668 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2669 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2670 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2671 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2672
2673 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2674
2675 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2676 pi2_dst_scratch -= out_stride;
2677 }
2678
2679 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
2680 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[89][0]);
2681
2682 /* o11[0-3] */
2683 {
2684
2685 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2686 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2687
2688 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2689
2690 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2691 pi2_src_scratch -= in_stride;
2692
2693 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2694 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2695
2696 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2697 m_count = _mm_cvtsi32_si128(i4_shift);
2698 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2699 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2700
2701 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2702 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2703 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2704 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2705
2706 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2707
2708 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2709 pi2_dst_scratch -= out_stride;
2710
2711 }
2712
2713 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
2714 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[97][0]);
2715
2716 /* o12[0-3] */
2717 {
2718 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2719 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2720
2721 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2722
2723 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2724 pi2_src_scratch -= in_stride;
2725
2726 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2727 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2728
2729 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2730 m_count = _mm_cvtsi32_si128(i4_shift);
2731 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2732 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2733
2734 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2735 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2736 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2737 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2738
2739 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2740
2741 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2742 pi2_dst_scratch -= out_stride;
2743
2744 }
2745
2746 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
2747 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[105][0]);
2748
2749 /* o13[0-3] */
2750 {
2751 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2752 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2753
2754 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2755
2756 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2757 pi2_src_scratch -= in_stride;
2758
2759 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2760 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2761
2762 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2763 m_count = _mm_cvtsi32_si128(i4_shift);
2764 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2765 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2766
2767 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2768 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2769 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2770 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2771
2772 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2773
2774 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2775 pi2_dst_scratch -= out_stride;
2776 }
2777
2778 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
2779 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[113][0]);
2780
2781 /* o14[0-3] */
2782 {
2783 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2784 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2785
2786 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2787
2788 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2789 pi2_src_scratch -= in_stride;
2790
2791 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2792 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2793
2794 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2795 m_count = _mm_cvtsi32_si128(i4_shift);
2796 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2797 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2798
2799 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2800 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2801 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2802 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2803
2804 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2805
2806 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2807 pi2_dst_scratch -= out_stride;
2808
2809 }
2810
2811 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
2812 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[121][0]);
2813
2814 /* o15[0-3] */
2815 {
2816 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2817 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2818
2819 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2820
2821 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2822 pi2_src_scratch += 8;
2823
2824 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2825 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2826
2827 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2828 m_count = _mm_cvtsi32_si128(i4_shift);
2829 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2830 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2831
2832 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2833 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2834 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2835 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2836
2837 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2838
2839 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2840 pi2_dst_scratch += 8;
2841 }
2842
2843 }
2844 }
2845 }
2846 else
2847 {
2848 /* o & stage 1 out */
2849 {
2850 WORD32 j;
2851 WORD16 *pi2_src_scratch = o_temp_ptr;
2852 WORD16 *pi2_dst_scratch = temp_ptr;
2853 WORD32 out_stride = (trans_size << 1);
2854 WORD32 in_stride = trans_size;
2855
2856
2857 for(j = 0; j < 2; j++)
2858 {
2859 if(j)
2860 {
2861 m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
2862 m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
2863 m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8);
2864 m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8);
2865 m_temp_reg_74 = _mm_srli_si128(m_temp_reg_74, 8);
2866 m_temp_reg_75 = _mm_srli_si128(m_temp_reg_75, 8);
2867 m_temp_reg_76 = _mm_srli_si128(m_temp_reg_76, 8);
2868 m_temp_reg_77 = _mm_srli_si128(m_temp_reg_77, 8);
2869
2870 m_temp_reg_80 = _mm_srli_si128(m_temp_reg_80, 8);
2871 m_temp_reg_81 = _mm_srli_si128(m_temp_reg_81, 8);
2872 m_temp_reg_82 = _mm_srli_si128(m_temp_reg_82, 8);
2873 m_temp_reg_83 = _mm_srli_si128(m_temp_reg_83, 8);
2874 m_temp_reg_84 = _mm_srli_si128(m_temp_reg_84, 8);
2875 m_temp_reg_85 = _mm_srli_si128(m_temp_reg_85, 8);
2876 m_temp_reg_86 = _mm_srli_si128(m_temp_reg_86, 8);
2877 m_temp_reg_87 = _mm_srli_si128(m_temp_reg_87, 8);
2878 }
2879
2880 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
2881 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]);
2882 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[2][0]);
2883 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[3][0]);
2884 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[4][0]);
2885 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[5][0]);
2886 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[6][0]);
2887 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[7][0]);
2888
2889 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved
2890 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 interleaved
2891 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 interleaved
2892 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 interleaved
2893 temp1 = _mm_unpacklo_epi16(m_temp_reg_80, m_temp_reg_81); //row 17 and row 19 interleaved
2894 temp2 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_83); //row 21 and row 23 interleaved
2895 temp3 = _mm_unpacklo_epi16(m_temp_reg_84, m_temp_reg_85); //row 25 and row 27 interleaved
2896 temp4 = _mm_unpacklo_epi16(m_temp_reg_86, m_temp_reg_87); //row 29 and row 31 interleaved
2897
2898
2899 /* o0[0-3] */
2900 {
2901 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2902 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2903 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
2904 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
2905
2906 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2907 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
2908
2909 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
2910
2911 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
2912 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
2913 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
2914 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
2915
2916 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
2917 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
2918
2919 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
2920
2921 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
2922
2923 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2924 pi2_src_scratch += in_stride;
2925
2926 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2927 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2928
2929 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2930 m_count = _mm_cvtsi32_si128(i4_shift);
2931 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2932 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2933
2934 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2935 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2936 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2937 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2938
2939 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2940
2941 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2942 pi2_dst_scratch += out_stride;
2943
2944 }
2945
2946 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
2947 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]);
2948 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[10][0]);
2949 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[11][0]);
2950 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[12][0]);
2951 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[13][0]);
2952 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[14][0]);
2953 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[15][0]);
2954
2955
2956 /* o1[0-3] */
2957 {
2958 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2959 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2960 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
2961 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
2962
2963 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2964 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
2965
2966 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_20);
2967
2968 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
2969 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
2970 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
2971 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
2972
2973 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
2974 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
2975
2976 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
2977
2978 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
2979
2980 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2981 pi2_src_scratch += in_stride;
2982
2983 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2984 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2985
2986 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2987 m_count = _mm_cvtsi32_si128(i4_shift);
2988 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2989 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2990
2991 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2992 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2993 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2994 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2995
2996 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2997
2998 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2999 pi2_dst_scratch += out_stride;
3000
3001 }
3002
3003 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
3004 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]);
3005 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[18][0]);
3006 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[19][0]);
3007 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[20][0]);
3008 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[21][0]);
3009 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[22][0]);
3010 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[23][0]);
3011
3012 /* o2[0-3] */
3013 {
3014 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3015 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3016 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3017 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3018
3019 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
3020 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3021
3022 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3023
3024 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3025 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3026 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3027 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3028
3029 m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_41);
3030 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3031
3032 m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_42);
3033
3034 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3035
3036 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
3037 pi2_src_scratch += in_stride;
3038
3039 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3040 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3041
3042 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3043 m_count = _mm_cvtsi32_si128(i4_shift);
3044 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3045 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3046
3047 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3048 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3049 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3050 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3051
3052 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3053
3054 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3055 pi2_dst_scratch += out_stride;
3056
3057 }
3058
3059
3060 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
3061 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[25][0]);
3062 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[26][0]);
3063 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[27][0]);
3064 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[28][0]);
3065 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[29][0]);
3066 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[30][0]);
3067 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[31][0]);
3068
3069 /* o3[0-3] */
3070 {
3071 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3072 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3073 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3074 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3075
3076 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
3077 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3078
3079 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3080
3081 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3082 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3083 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3084 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3085
3086 m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_40);
3087 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3088
3089 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3090
3091 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3092
3093 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
3094 pi2_src_scratch += in_stride;
3095
3096 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3097 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3098
3099 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3100 m_count = _mm_cvtsi32_si128(i4_shift);
3101 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3102 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3103
3104 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3105 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3106 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3107 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3108
3109 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3110
3111 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3112 pi2_dst_scratch += out_stride;
3113
3114 }
3115
3116 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
3117 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[33][0]);
3118 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[34][0]);
3119 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[35][0]);
3120 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[36][0]);
3121 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[37][0]);
3122 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[38][0]);
3123 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[39][0]);
3124
3125 /* o4[0-3] */
3126 {
3127 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3128 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3129 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3130 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3131
3132 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3133 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3134
3135 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3136
3137 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3138 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3139 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3140 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3141
3142 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
3143 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3144
3145 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3146
3147 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3148
3149 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
3150 pi2_src_scratch += in_stride;
3151
3152 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3153 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3154
3155 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3156 m_count = _mm_cvtsi32_si128(i4_shift);
3157 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3158 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3159
3160 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3161 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3162 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3163 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3164
3165 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3166
3167 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3168 pi2_dst_scratch += out_stride;
3169
3170 }
3171
3172
3173 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
3174 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[41][0]);
3175 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[42][0]);
3176 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[43][0]);
3177 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[44][0]);
3178 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[45][0]);
3179 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[46][0]);
3180 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[47][0]);
3181
3182 /* o5[0-3] */
3183 {
3184 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3185 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3186 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3187 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3188
3189 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3190 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3191
3192 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3193
3194 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3195 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3196 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3197 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3198
3199 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
3200 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3201
3202 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3203
3204 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3205
3206 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
3207 pi2_src_scratch += in_stride;
3208
3209 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3210 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3211
3212 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3213 m_count = _mm_cvtsi32_si128(i4_shift);
3214 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3215 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3216
3217 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3218 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3219 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3220 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3221
3222 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3223
3224 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3225 pi2_dst_scratch += out_stride;
3226
3227 }
3228
3229 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
3230 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[49][0]);
3231 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[50][0]);
3232 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[51][0]);
3233 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[52][0]);
3234 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[53][0]);
3235 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[54][0]);
3236 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[55][0]);
3237
3238
3239 /* o6[0-3] */
3240 {
3241 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3242 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3243 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3244 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3245
3246 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3247 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3248
3249 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3250
3251 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3252 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3253 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3254 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3255
3256 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
3257 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3258
3259 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3260
3261 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3262
3263 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
3264 pi2_src_scratch += in_stride;
3265
3266 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3267 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3268
3269 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3270 m_count = _mm_cvtsi32_si128(i4_shift);
3271 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3272 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3273
3274 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3275 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3276 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3277 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3278
3279 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3280
3281 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3282 pi2_dst_scratch += out_stride;
3283
3284 }
3285
3286 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
3287 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[57][0]);
3288 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[58][0]);
3289 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[59][0]);
3290 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[60][0]);
3291 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[61][0]);
3292 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[62][0]);
3293 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[63][0]);
3294
3295 /* o7[0-3] */
3296 {
3297 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3298 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3299 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3300 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3301
3302 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3303 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3304
3305 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3306
3307 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3308 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3309 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3310 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3311
3312 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
3313 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3314
3315 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3316
3317 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3318
3319 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
3320 pi2_src_scratch += 8;
3321
3322 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3323 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3324
3325 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3326 m_count = _mm_cvtsi32_si128(i4_shift);
3327 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3328 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3329
3330 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3331 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3332 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3333 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3334
3335 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3336
3337 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3338 pi2_dst_scratch += 8;
3339
3340 }
3341
3342 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
3343 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[65][0]);
3344 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[66][0]);
3345 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[67][0]);
3346 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[68][0]);
3347 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[69][0]);
3348 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[70][0]);
3349 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[71][0]);
3350
3351
3352 /* o8[0-3] */
3353 {
3354
3355 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3356 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3357 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3358 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3359
3360 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3361 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3362
3363 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3364
3365 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3366 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3367 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3368 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3369
3370 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
3371 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3372
3373 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3374
3375 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3376
3377 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
3378 pi2_src_scratch -= in_stride;
3379
3380 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3381 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3382
3383 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3384 m_count = _mm_cvtsi32_si128(i4_shift);
3385 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3386 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3387
3388 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3389 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3390 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3391 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3392
3393 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3394
3395 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3396 pi2_dst_scratch -= out_stride;
3397 }
3398
3399 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
3400 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[73][0]);
3401 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[74][0]);
3402 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[75][0]);
3403 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[76][0]);
3404 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[77][0]);
3405 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[78][0]);
3406 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[79][0]);
3407
3408
3409 /* o9[0-3] */
3410 {
3411 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3412 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3413 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3414 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3415
3416 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3417 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3418
3419 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3420
3421 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3422 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3423 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3424 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3425
3426 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
3427 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3428
3429 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3430
3431 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3432
3433 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
3434 pi2_src_scratch -= in_stride;
3435
3436 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3437 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3438
3439 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3440 m_count = _mm_cvtsi32_si128(i4_shift);
3441 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3442 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3443
3444 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3445 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3446 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3447 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3448
3449 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3450
3451 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3452 pi2_dst_scratch -= out_stride;
3453 }
3454
3455 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
3456 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[81][0]);
3457 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[82][0]);
3458 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[83][0]);
3459 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[84][0]);
3460 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[85][0]);
3461 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[86][0]);
3462 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[87][0]);
3463
3464 /* o10[0-3] */
3465 {
3466 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3467 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3468 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3469 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3470
3471 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3472 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3473
3474 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3475
3476 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3477 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3478 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3479 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3480
3481 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
3482 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3483
3484 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3485
3486 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3487
3488 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
3489 pi2_src_scratch -= in_stride;
3490
3491 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3492 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3493
3494 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3495 m_count = _mm_cvtsi32_si128(i4_shift);
3496 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3497 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3498
3499 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3500 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3501 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3502 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3503
3504 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3505
3506 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3507 pi2_dst_scratch -= out_stride;
3508 }
3509
3510 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
3511 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[89][0]);
3512 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[90][0]);
3513 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[91][0]);
3514 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[92][0]);
3515 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[93][0]);
3516 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[94][0]);
3517 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[95][0]);
3518
3519 /* o11[0-3] */
3520 {
3521 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3522 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3523 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3524 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3525
3526 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3527 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3528
3529 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3530
3531 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3532 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3533 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3534 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3535
3536 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
3537 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3538
3539 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3540
3541 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3542
3543 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
3544 pi2_src_scratch -= in_stride;
3545
3546 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3547 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3548
3549 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3550 m_count = _mm_cvtsi32_si128(i4_shift);
3551 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3552 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3553
3554 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3555 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3556 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3557 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3558
3559 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3560
3561 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3562 pi2_dst_scratch -= out_stride;
3563
3564 }
3565
3566 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
3567 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[97][0]);
3568 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[98][0]);
3569 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[99][0]);
3570 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[100][0]);
3571 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[101][0]);
3572 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[102][0]);
3573 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[103][0]);
3574
3575
3576 /* o12[0-3] */
3577 {
3578 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3579 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3580 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3581 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3582
3583 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3584 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3585
3586 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3587
3588 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3589 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3590 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3591 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3592
3593 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
3594 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3595
3596 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3597
3598 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3599
3600 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
3601 pi2_src_scratch -= in_stride;
3602
3603 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3604 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3605
3606 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3607 m_count = _mm_cvtsi32_si128(i4_shift);
3608 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3609 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3610
3611 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3612 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3613 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3614 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3615
3616 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3617
3618 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3619 pi2_dst_scratch -= out_stride;
3620
3621 }
3622
3623 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
3624 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[105][0]);
3625 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[106][0]);
3626 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[107][0]);
3627 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[108][0]);
3628 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[109][0]);
3629 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[110][0]);
3630 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[111][0]);
3631
3632
3633 /* o13[0-3] */
3634 {
3635 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3636 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3637 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3638 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3639
3640 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3641 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3642
3643 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3644
3645 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3646 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3647 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3648 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3649
3650 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
3651 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3652
3653 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3654
3655 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3656
3657 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
3658 pi2_src_scratch -= in_stride;
3659
3660 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3661 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3662
3663 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3664 m_count = _mm_cvtsi32_si128(i4_shift);
3665 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3666 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3667
3668 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3669 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3670 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3671 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3672
3673 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3674
3675 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3676 pi2_dst_scratch -= out_stride;
3677 }
3678
3679 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
3680 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[113][0]);
3681 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[114][0]);
3682 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[115][0]);
3683 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[116][0]);
3684 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[117][0]);
3685 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[118][0]);
3686 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[119][0]);
3687
3688
3689 /* o14[0-3] */
3690 {
3691 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3692 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3693 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3694 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3695
3696 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3697 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3698
3699 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3700
3701 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3702 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3703 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3704 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3705
3706 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
3707 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3708
3709 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3710
3711 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3712
3713 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
3714 pi2_src_scratch -= in_stride;
3715
3716 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3717 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3718
3719 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3720 m_count = _mm_cvtsi32_si128(i4_shift);
3721 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3722 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3723
3724 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3725 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3726 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3727 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3728
3729 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3730
3731 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3732 pi2_dst_scratch -= out_stride;
3733
3734 }
3735
3736 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
3737 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[121][0]);
3738 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[122][0]);
3739 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[123][0]);
3740 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[124][0]);
3741 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[125][0]);
3742 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[126][0]);
3743 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[127][0]);
3744
3745 /* o15[0-3] */
3746 {
3747 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3748 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3749 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3750 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3751
3752 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3753 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3754
3755 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3756
3757 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3758 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3759 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3760 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3761
3762 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
3763 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3764
3765 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3766
3767 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3768
3769 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
3770 pi2_src_scratch += 8;
3771
3772 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3773 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3774
3775 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3776 m_count = _mm_cvtsi32_si128(i4_shift);
3777 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3778 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3779
3780 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3781 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3782 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3783 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3784
3785 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3786
3787 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3788 pi2_dst_scratch += 8;
3789 }
3790
3791 }
3792 }
3793 }
3794 /* Transpose */
3795 {
3796 WORD16 *pi2_src_scratch = temp_ptr;
3797 WORD16 *pi2_dst_scratch = pi2_tmp;
3798 WORD32 in_stride = (trans_size << 1);
3799
3800 for(j = 0; j < 2; j++)
3801 {
3802 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
3803 pi2_src_scratch += in_stride;
3804 m_temp_reg_31 = _mm_load_si128((__m128i *)pi2_src_scratch);
3805 pi2_src_scratch += in_stride;
3806 m_temp_reg_32 = _mm_load_si128((__m128i *)pi2_src_scratch);
3807 pi2_src_scratch += in_stride;
3808 m_temp_reg_33 = _mm_load_si128((__m128i *)pi2_src_scratch);
3809 pi2_src_scratch += in_stride;
3810 m_temp_reg_34 = _mm_load_si128((__m128i *)pi2_src_scratch);
3811 pi2_src_scratch += in_stride;
3812 m_temp_reg_35 = _mm_load_si128((__m128i *)pi2_src_scratch);
3813 pi2_src_scratch += in_stride;
3814 m_temp_reg_36 = _mm_load_si128((__m128i *)pi2_src_scratch);
3815 pi2_src_scratch += in_stride;
3816 m_temp_reg_37 = _mm_load_si128((__m128i *)pi2_src_scratch);
3817 pi2_src_scratch += 8;
3818
3819 m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_src_scratch);
3820 pi2_src_scratch -= in_stride;
3821 m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_src_scratch);
3822 pi2_src_scratch -= in_stride;
3823 m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_src_scratch);
3824 pi2_src_scratch -= in_stride;
3825 m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_src_scratch);
3826 pi2_src_scratch -= in_stride;
3827 m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_src_scratch);
3828 pi2_src_scratch -= in_stride;
3829 m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_src_scratch);
3830 pi2_src_scratch -= in_stride;
3831 m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_src_scratch);
3832 pi2_src_scratch -= in_stride;
3833 m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_src_scratch);
3834 pi2_src_scratch += 8;
3835
3836
3837 m_temp_reg_40 = _mm_unpacklo_epi16(m_temp_reg_30, m_temp_reg_31);
3838 m_temp_reg_41 = _mm_unpackhi_epi16(m_temp_reg_31, m_temp_reg_30);
3839
3840 m_temp_reg_42 = _mm_unpacklo_epi16(m_temp_reg_32, m_temp_reg_33);
3841 m_temp_reg_43 = _mm_unpackhi_epi16(m_temp_reg_33, m_temp_reg_32);
3842
3843 m_temp_reg_44 = _mm_unpacklo_epi16(m_temp_reg_34, m_temp_reg_35);
3844 m_temp_reg_45 = _mm_unpackhi_epi16(m_temp_reg_35, m_temp_reg_34);
3845
3846 m_temp_reg_46 = _mm_unpacklo_epi16(m_temp_reg_36, m_temp_reg_37);
3847 m_temp_reg_47 = _mm_unpackhi_epi16(m_temp_reg_37, m_temp_reg_36);
3848
3849 m_temp_reg_80 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71);
3850 m_temp_reg_81 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_70);
3851
3852 m_temp_reg_82 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73);
3853 m_temp_reg_83 = _mm_unpackhi_epi16(m_temp_reg_73, m_temp_reg_72);
3854
3855 m_temp_reg_84 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75);
3856 m_temp_reg_85 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_74);
3857
3858 m_temp_reg_86 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77);
3859 m_temp_reg_87 = _mm_unpackhi_epi16(m_temp_reg_77, m_temp_reg_76);
3860
3861 /****************/
3862
3863 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_40, m_temp_reg_42);
3864 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_40, m_temp_reg_42);
3865
3866 m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_44, m_temp_reg_46);
3867 m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_44, m_temp_reg_46);
3868
3869 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_80, m_temp_reg_82);
3870 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_80, m_temp_reg_82);
3871
3872 m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_84, m_temp_reg_86);
3873 m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_84, m_temp_reg_86);
3874
3875 m_temp_reg_90 = _mm_unpacklo_epi32(m_temp_reg_43, m_temp_reg_41);
3876 m_temp_reg_91 = _mm_unpackhi_epi32(m_temp_reg_43, m_temp_reg_41);
3877
3878 m_temp_reg_92 = _mm_unpacklo_epi32(m_temp_reg_47, m_temp_reg_45);
3879 m_temp_reg_93 = _mm_unpackhi_epi32(m_temp_reg_47, m_temp_reg_45);
3880
3881 m_temp_reg_94 = _mm_unpacklo_epi32(m_temp_reg_83, m_temp_reg_81);
3882 m_temp_reg_95 = _mm_unpackhi_epi32(m_temp_reg_83, m_temp_reg_81);
3883
3884 m_temp_reg_96 = _mm_unpacklo_epi32(m_temp_reg_87, m_temp_reg_85);
3885 m_temp_reg_97 = _mm_unpackhi_epi32(m_temp_reg_87, m_temp_reg_85);
3886
3887 /******************/
3888
3889 m_temp_reg_30 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_2);
3890 m_temp_reg_31 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_2);
3891
3892 m_temp_reg_32 = _mm_unpacklo_epi64(m_temp_reg_92, m_temp_reg_90);
3893 m_temp_reg_33 = _mm_unpackhi_epi64(m_temp_reg_92, m_temp_reg_90);
3894
3895 m_temp_reg_34 = _mm_unpacklo_epi64(m_temp_reg_4, m_temp_reg_6);
3896 m_temp_reg_35 = _mm_unpackhi_epi64(m_temp_reg_4, m_temp_reg_6);
3897
3898 m_temp_reg_36 = _mm_unpacklo_epi64(m_temp_reg_96, m_temp_reg_94);
3899 m_temp_reg_37 = _mm_unpackhi_epi64(m_temp_reg_96, m_temp_reg_94);
3900
3901 m_temp_reg_80 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_3);
3902 m_temp_reg_81 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_3);
3903
3904 m_temp_reg_82 = _mm_unpacklo_epi64(m_temp_reg_93, m_temp_reg_91);
3905 m_temp_reg_83 = _mm_unpackhi_epi64(m_temp_reg_93, m_temp_reg_91);
3906
3907 m_temp_reg_84 = _mm_unpacklo_epi64(m_temp_reg_5, m_temp_reg_7);
3908 m_temp_reg_85 = _mm_unpackhi_epi64(m_temp_reg_5, m_temp_reg_7);
3909
3910 m_temp_reg_86 = _mm_unpacklo_epi64(m_temp_reg_97, m_temp_reg_95);
3911 m_temp_reg_87 = _mm_unpackhi_epi64(m_temp_reg_97, m_temp_reg_95);
3912
3913 _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size), m_temp_reg_30);
3914 _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 8), m_temp_reg_34);
3915 _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 16), m_temp_reg_36);
3916 _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 24), m_temp_reg_32);
3917
3918 _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size), m_temp_reg_31);
3919 _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 8), m_temp_reg_35);
3920 _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 16), m_temp_reg_37);
3921 _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 24), m_temp_reg_33);
3922
3923 _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size), m_temp_reg_80);
3924 _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 8), m_temp_reg_84);
3925 _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 16), m_temp_reg_86);
3926 _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 24), m_temp_reg_82);
3927
3928 _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size), m_temp_reg_81);
3929 _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 8), m_temp_reg_85);
3930 _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 16), m_temp_reg_87);
3931 _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 24), m_temp_reg_83);
3932
3933 pi2_dst_scratch += 4 * trans_size;
3934 }
3935 }
3936 pi2_src += 8;
3937 // pi2_dequant_coeff +=8;
3938 pi2_tmp += 8 * trans_size;
3939 zero_cols = zero_cols >> 1;
3940 }
3941
3942 if(trans_size_stg1 != TRANS_SIZE_32)
3943 {
3944 m_temp_reg_10 = _mm_setzero_si128();
3945
3946 for(i = trans_size_stg1; i < 32; i += 8)
3947 {
3948 WORD16 *pi2_dst_scratch = pi2_tmp;
3949
3950 _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size), m_temp_reg_10);
3951 _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 8), m_temp_reg_10);
3952 _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 16), m_temp_reg_10);
3953 _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 24), m_temp_reg_10);
3954
3955 _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size), m_temp_reg_10);
3956 _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 8), m_temp_reg_10);
3957 _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 16), m_temp_reg_10);
3958 _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 24), m_temp_reg_10);
3959
3960 _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size), m_temp_reg_10);
3961 _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 8), m_temp_reg_10);
3962 _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 16), m_temp_reg_10);
3963 _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 24), m_temp_reg_10);
3964
3965 _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size), m_temp_reg_10);
3966 _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 8), m_temp_reg_10);
3967 _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 16), m_temp_reg_10);
3968 _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 24), m_temp_reg_10);
3969
3970 _mm_store_si128((__m128i *)(pi2_dst_scratch + 4 * trans_size), m_temp_reg_10);
3971 _mm_store_si128((__m128i *)(pi2_dst_scratch + 4 * trans_size + 8), m_temp_reg_10);
3972 _mm_store_si128((__m128i *)(pi2_dst_scratch + 4 * trans_size + 16), m_temp_reg_10);
3973 _mm_store_si128((__m128i *)(pi2_dst_scratch + 4 * trans_size + 24), m_temp_reg_10);
3974
3975 _mm_store_si128((__m128i *)(pi2_dst_scratch + 5 * trans_size), m_temp_reg_10);
3976 _mm_store_si128((__m128i *)(pi2_dst_scratch + 5 * trans_size + 8), m_temp_reg_10);
3977 _mm_store_si128((__m128i *)(pi2_dst_scratch + 5 * trans_size + 16), m_temp_reg_10);
3978 _mm_store_si128((__m128i *)(pi2_dst_scratch + 5 * trans_size + 24), m_temp_reg_10);
3979
3980 _mm_store_si128((__m128i *)(pi2_dst_scratch + 6 * trans_size), m_temp_reg_10);
3981 _mm_store_si128((__m128i *)(pi2_dst_scratch + 6 * trans_size + 8), m_temp_reg_10);
3982 _mm_store_si128((__m128i *)(pi2_dst_scratch + 6 * trans_size + 16), m_temp_reg_10);
3983 _mm_store_si128((__m128i *)(pi2_dst_scratch + 6 * trans_size + 24), m_temp_reg_10);
3984
3985 _mm_store_si128((__m128i *)(pi2_dst_scratch + 7 * trans_size), m_temp_reg_10);
3986 _mm_store_si128((__m128i *)(pi2_dst_scratch + 7 * trans_size + 8), m_temp_reg_10);
3987 _mm_store_si128((__m128i *)(pi2_dst_scratch + 7 * trans_size + 16), m_temp_reg_10);
3988 _mm_store_si128((__m128i *)(pi2_dst_scratch + 7 * trans_size + 24), m_temp_reg_10);
3989
3990 pi2_tmp += 8 * trans_size;
3991 }
3992 }
3993
3994 pi2_tmp = pi2_tmp_orig;
3995
3996 /* Inverse Transform 2nd stage */
3997
3998 for(j = 0; j < trans_size; j += 4)
3999 {
4000 i4_shift = IT_SHIFT_STAGE_2;
4001
4002 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
4003 if(zero_last28_rows_stg2)
4004 {
4005 {
4006
4007 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87
4008 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87
4009 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80
4010 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70
4011 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57
4012 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43
4013 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25
4014 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //9
4015
4016 m_temp_reg_10 = _mm_loadu_si128((__m128i *)&pi2_tmp[2 * trans_size]);
4017
4018 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_10, all_zero_reg);
4019
4020 /* eo0[0-3] */
4021 {
4022 m_temp_reg_90 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4023
4024 }
4025 /* eo1[0-3] */
4026 {
4027 m_temp_reg_91 = _mm_madd_epi16(m_temp_reg_10, m_coeff2);
4028
4029 }
4030 /* eo2[0-3] */
4031 {
4032 m_temp_reg_92 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
4033 }
4034
4035 /* eo3[0-3] */
4036 {
4037 m_temp_reg_93 = _mm_madd_epi16(m_temp_reg_10, m_coeff4);
4038 }
4039 /* eo4[0-3] */
4040 {
4041 m_temp_reg_94 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
4042 }
4043
4044 /* eo5[0-3] */
4045 {
4046 m_temp_reg_95 = _mm_madd_epi16(m_temp_reg_10, m_coeff6);
4047 }
4048
4049 /* eo6[0-3] */
4050 {
4051 m_temp_reg_96 = _mm_madd_epi16(m_temp_reg_10, m_coeff7);
4052 }
4053 /* eo7[0-3] */
4054 {
4055 m_temp_reg_97 = _mm_madd_epi16(m_temp_reg_10, m_coeff8);
4056 }
4057 }
4058
4059 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64
4060
4061 m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[0 * trans_size]);
4062
4063 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
4064
4065 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
4066
4067 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
4068
4069 /* e[]*/
4070
4071 temp1 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_90); /* ee[0] */
4072 temp2 = _mm_sub_epi32(m_temp_reg_14, m_temp_reg_90); /* ee[15] */
4073
4074 temp3 = _mm_add_epi32(m_temp_reg_16, m_temp_reg_91); /* ee[1] */
4075 temp4 = _mm_sub_epi32(m_temp_reg_16, m_temp_reg_91); /* ee[14] */
4076
4077 temp5 = _mm_add_epi32(m_temp_reg_16, m_temp_reg_92); /* ee[2] */
4078 temp6 = _mm_sub_epi32(m_temp_reg_16, m_temp_reg_92); /* ee[13] */
4079
4080 temp7 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_93); /* ee[3] */
4081 temp8 = _mm_sub_epi32(m_temp_reg_14, m_temp_reg_93); /* ee[12] */
4082
4083 m_temp_reg_90 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_94); /* ee[4] */
4084 m_temp_reg_91 = _mm_sub_epi32(m_temp_reg_14, m_temp_reg_94); /* ee[11] */
4085
4086 m_temp_reg_92 = _mm_add_epi32(m_temp_reg_16, m_temp_reg_95); /* ee[5] */
4087 m_temp_reg_93 = _mm_sub_epi32(m_temp_reg_16, m_temp_reg_95); /* ee[10] */
4088
4089 m_temp_reg_94 = _mm_add_epi32(m_temp_reg_16, m_temp_reg_96); /* ee[6] */
4090 m_temp_reg_95 = _mm_sub_epi32(m_temp_reg_16, m_temp_reg_96); /* ee[9] */
4091
4092 m_temp_reg_96 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_97); /* ee[7] */
4093 m_temp_reg_97 = _mm_sub_epi32(m_temp_reg_14, m_temp_reg_97); /* ee[8] */
4094
4095 /*o[k]*/
4096 {
4097
4098 WORD16 *pi2_dst_scratch = temp_ptr;
4099 WORD32 out_stride = 8;
4100
4101 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
4102
4103 m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[trans_size]);
4104 m_temp_reg_71 = _mm_loadu_si128((__m128i *)&pi2_tmp[3 * trans_size]);
4105
4106 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved
4107
4108
4109 /* o0[0-3] */
4110 {
4111 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4112
4113 m_temp_reg_31 = _mm_sub_epi32(temp1, m_temp_reg_20);
4114 m_temp_reg_30 = _mm_add_epi32(temp1, m_temp_reg_20);
4115
4116 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4117 m_count = _mm_cvtsi32_si128(i4_shift);
4118 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4119 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4120
4121 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4122 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4123 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4124 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4125
4126 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4127
4128 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4129 pi2_dst_scratch += out_stride;
4130
4131 }
4132
4133 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
4134
4135 /* o1[0-3] */
4136 {
4137 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4138
4139 m_temp_reg_31 = _mm_sub_epi32(temp3, m_temp_reg_20);
4140 m_temp_reg_30 = _mm_add_epi32(temp3, m_temp_reg_20);
4141
4142 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4143 m_count = _mm_cvtsi32_si128(i4_shift);
4144 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4145 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4146
4147 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4148 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4149 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4150 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4151
4152 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4153
4154 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4155 pi2_dst_scratch += out_stride;
4156
4157 }
4158
4159 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
4160
4161 /* o2[0-3] */
4162 {
4163 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4164
4165 m_temp_reg_31 = _mm_sub_epi32(temp5, m_temp_reg_20);
4166 m_temp_reg_30 = _mm_add_epi32(temp5, m_temp_reg_20);
4167
4168 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4169 m_count = _mm_cvtsi32_si128(i4_shift);
4170 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4171 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4172
4173 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4174 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4175 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4176 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4177
4178 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4179
4180 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4181 pi2_dst_scratch += out_stride;
4182
4183 }
4184
4185 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
4186
4187 /* o3[0-3] */
4188 {
4189 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4190
4191 m_temp_reg_31 = _mm_sub_epi32(temp7, m_temp_reg_20);
4192 m_temp_reg_30 = _mm_add_epi32(temp7, m_temp_reg_20);
4193
4194 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4195 m_count = _mm_cvtsi32_si128(i4_shift);
4196 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4197 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4198
4199 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4200 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4201 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4202 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4203
4204 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4205
4206 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4207 pi2_dst_scratch += out_stride;
4208
4209 }
4210
4211 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
4212
4213 /* o4[0-3] */
4214 {
4215 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4216
4217 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_20);
4218 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_20);
4219
4220 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4221 m_count = _mm_cvtsi32_si128(i4_shift);
4222 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4223 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4224
4225 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4226 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4227 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4228 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4229
4230 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4231
4232 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4233 pi2_dst_scratch += out_stride;
4234
4235 }
4236
4237 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
4238
4239 /* o5[0-3] */
4240 {
4241 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4242
4243 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_20);
4244 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_20);
4245
4246 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4247 m_count = _mm_cvtsi32_si128(i4_shift);
4248 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4249 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4250
4251 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4252 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4253 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4254 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4255
4256 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4257
4258 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4259 pi2_dst_scratch += out_stride;
4260
4261 }
4262
4263 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
4264
4265 /* o6[0-3] */
4266 {
4267 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4268
4269 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_20);
4270 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_20);
4271
4272 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4273 m_count = _mm_cvtsi32_si128(i4_shift);
4274 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4275 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4276
4277 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4278 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4279 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4280 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4281
4282 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4283
4284 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4285 pi2_dst_scratch += out_stride;
4286
4287 }
4288
4289 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
4290
4291 /* o7[0-3] */
4292 {
4293 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4294
4295 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_20);
4296 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_20);
4297
4298 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4299 m_count = _mm_cvtsi32_si128(i4_shift);
4300 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4301 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4302
4303 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4304 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4305 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4306 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4307
4308 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4309
4310 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4311 pi2_dst_scratch += 8;
4312
4313 }
4314
4315 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
4316
4317 /* o8[0-3] */
4318 {
4319 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4320
4321 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_20);
4322 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_20);
4323
4324 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4325 m_count = _mm_cvtsi32_si128(i4_shift);
4326 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4327 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4328
4329 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4330 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4331 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4332 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4333
4334 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4335
4336 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4337 pi2_dst_scratch += out_stride;
4338 }
4339
4340 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
4341
4342 /* o9[0-3] */
4343 {
4344 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4345
4346 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_20);
4347 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_20);
4348
4349 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4350 m_count = _mm_cvtsi32_si128(i4_shift);
4351 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4352 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4353
4354 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4355 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4356 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4357 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4358
4359 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4360
4361 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4362 pi2_dst_scratch += out_stride;
4363
4364 }
4365
4366 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
4367
4368 /* o10[0-3] */
4369 {
4370 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4371
4372 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_20);
4373 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_20);
4374
4375 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4376 m_count = _mm_cvtsi32_si128(i4_shift);
4377 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4378 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4379
4380 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4381 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4382 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4383 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4384
4385 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4386
4387 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4388 pi2_dst_scratch += out_stride;
4389 }
4390
4391 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
4392
4393 /* o11[0-3] */
4394 {
4395 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4396
4397 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_20);
4398 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_20);
4399
4400 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4401 m_count = _mm_cvtsi32_si128(i4_shift);
4402 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4403 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4404
4405 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4406 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4407 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4408 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4409
4410 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4411
4412 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4413 pi2_dst_scratch += out_stride;
4414
4415 }
4416
4417 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
4418
4419 /* o12[0-3] */
4420 {
4421 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4422
4423 m_temp_reg_31 = _mm_add_epi32(temp8, m_temp_reg_20);
4424 m_temp_reg_30 = _mm_sub_epi32(temp8, m_temp_reg_20);
4425
4426 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4427 m_count = _mm_cvtsi32_si128(i4_shift);
4428 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4429 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4430
4431 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4432 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4433 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4434 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4435
4436 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4437
4438 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4439 pi2_dst_scratch += out_stride;
4440
4441 }
4442
4443 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
4444
4445 /* o13[0-3] */
4446 {
4447 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4448
4449 m_temp_reg_31 = _mm_add_epi32(temp6, m_temp_reg_20);
4450 m_temp_reg_30 = _mm_sub_epi32(temp6, m_temp_reg_20);
4451
4452 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4453 m_count = _mm_cvtsi32_si128(i4_shift);
4454 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4455 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4456
4457 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4458 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4459 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4460 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4461
4462 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4463
4464 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4465 pi2_dst_scratch += out_stride;
4466 }
4467
4468 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
4469
4470 /* o14[0-3] */
4471 {
4472 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4473
4474 m_temp_reg_31 = _mm_add_epi32(temp4, m_temp_reg_20);
4475 m_temp_reg_30 = _mm_sub_epi32(temp4, m_temp_reg_20);
4476
4477 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4478 m_count = _mm_cvtsi32_si128(i4_shift);
4479 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4480 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4481
4482 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4483 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4484 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4485 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4486
4487 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4488
4489 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4490 pi2_dst_scratch += out_stride;
4491
4492 }
4493
4494 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
4495
4496 /* o15[0-3] */
4497 {
4498 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4499
4500 m_temp_reg_31 = _mm_add_epi32(temp2, m_temp_reg_20);
4501 m_temp_reg_30 = _mm_sub_epi32(temp2, m_temp_reg_20);
4502
4503 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4504 m_count = _mm_cvtsi32_si128(i4_shift);
4505 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4506 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4507
4508 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4509 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4510 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4511 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4512
4513 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4514
4515 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4516 pi2_dst_scratch += 8;
4517 }
4518
4519 }
4520
4521 }
4522 else if(zero_last24_rows_stg2)
4523 {
4524 /* eo */
4525 {
4526 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87
4527
4528 m_temp_reg_10 = _mm_loadu_si128((__m128i *)&pi2_tmp[2 * trans_size]);
4529 m_temp_reg_11 = _mm_loadu_si128((__m128i *)&pi2_tmp[6 * trans_size]);
4530
4531 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_10, m_temp_reg_11);
4532
4533
4534 /* eo0[0-3] */
4535 {
4536 m_temp_reg_90 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4537
4538 }
4539
4540 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87 57
4541
4542 /* eo1[0-3] */
4543 {
4544 m_temp_reg_91 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4545
4546 }
4547 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80 9
4548
4549 /* eo2[0-3] */
4550 {
4551 m_temp_reg_92 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4552
4553 }
4554
4555 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70 -43
4556
4557 /* eo3[0-3] */
4558 {
4559
4560 m_temp_reg_93 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4561
4562 }
4563
4564 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57 -80
4565
4566 /* eo4[0-3] */
4567 {
4568 m_temp_reg_94 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4569
4570 }
4571
4572 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43 -90
4573
4574 /* eo5[0-3] */
4575 {
4576 m_temp_reg_95 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4577 }
4578
4579 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25 -70
4580 /* eo6[0-3] */
4581 {
4582 m_temp_reg_96 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4583 }
4584
4585 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); //9 -25
4586 /* eo7[0-3] */
4587 {
4588 m_temp_reg_97 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4589
4590 }
4591
4592 }
4593
4594 /* eeo */
4595 {
4596
4597 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75
4598 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75
4599 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18
4600 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[8][0]); //50
4601
4602 m_temp_reg_72 = _mm_loadu_si128((__m128i *)&pi2_tmp[4 * trans_size]);
4603
4604 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, all_zero_reg);
4605
4606 /* eeo0[0-3] */
4607 {
4608 temp1 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4609
4610 }
4611
4612 /* eeo1[0-3] */
4613 {
4614 temp2 = _mm_madd_epi16(m_temp_reg_10, m_coeff2);
4615
4616 }
4617
4618 /* eeo2[0-3] */
4619 {
4620 temp3 = _mm_madd_epi16(m_temp_reg_10, m_coeff4);
4621
4622 }
4623
4624
4625 /* eeo3[0-3] */
4626 {
4627 temp4 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
4628
4629 }
4630
4631 }
4632
4633 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83
4634 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36
4635 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64
4636
4637 m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[0 * trans_size]);
4638
4639 //m_temp_reg_1 = _mm_cvtepi16_epi32(m_temp_reg_70);
4640 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
4641
4642 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
4643 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
4644
4645 m_temp_reg_70 = _mm_add_epi32(m_temp_reg_14, temp1); /* ee[0] */
4646 m_temp_reg_71 = _mm_sub_epi32(m_temp_reg_14, temp1); /* ee[7] */
4647
4648 m_temp_reg_72 = _mm_add_epi32(m_temp_reg_16, temp2); /* ee[1] */
4649 m_temp_reg_73 = _mm_sub_epi32(m_temp_reg_16, temp2); /* ee[6] */
4650
4651 m_temp_reg_74 = _mm_add_epi32(m_temp_reg_16, temp3); /* ee[2] */
4652 m_temp_reg_75 = _mm_sub_epi32(m_temp_reg_16, temp3); /* ee[5] */
4653
4654 m_temp_reg_76 = _mm_add_epi32(m_temp_reg_14, temp4); /* ee[3] */
4655 m_temp_reg_77 = _mm_sub_epi32(m_temp_reg_14, temp4); /* ee[4] */
4656
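4657 /* e[k]: combine the ee[] terms with the eo[] terms; the trailing "ee[n]" tags on the lines below denote e[n] */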
4658
4659 temp1 = _mm_add_epi32(m_temp_reg_70, m_temp_reg_90); /* ee[0] */
4660 temp2 = _mm_sub_epi32(m_temp_reg_70, m_temp_reg_90); /* ee[15] */
4661
4662 temp3 = _mm_add_epi32(m_temp_reg_72, m_temp_reg_91); /* ee[1] */
4663 temp4 = _mm_sub_epi32(m_temp_reg_72, m_temp_reg_91); /* ee[14] */
4664
4665 temp5 = _mm_add_epi32(m_temp_reg_74, m_temp_reg_92); /* ee[2] */
4666 temp6 = _mm_sub_epi32(m_temp_reg_74, m_temp_reg_92); /* ee[13] */
4667
4668 temp7 = _mm_add_epi32(m_temp_reg_76, m_temp_reg_93); /* ee[3] */
4669 temp8 = _mm_sub_epi32(m_temp_reg_76, m_temp_reg_93); /* ee[12] */
4670
4671 m_temp_reg_90 = _mm_add_epi32(m_temp_reg_77, m_temp_reg_94); /* ee[4] */
4672 m_temp_reg_91 = _mm_sub_epi32(m_temp_reg_77, m_temp_reg_94); /* ee[11] */
4673
4674 m_temp_reg_92 = _mm_add_epi32(m_temp_reg_75, m_temp_reg_95); /* ee[5] */
4675 m_temp_reg_93 = _mm_sub_epi32(m_temp_reg_75, m_temp_reg_95); /* ee[10] */
4676
4677 m_temp_reg_94 = _mm_add_epi32(m_temp_reg_73, m_temp_reg_96); /* ee[6] */
4678 m_temp_reg_95 = _mm_sub_epi32(m_temp_reg_73, m_temp_reg_96); /* ee[9] */
4679
4680 m_temp_reg_96 = _mm_add_epi32(m_temp_reg_71, m_temp_reg_97); /* ee[7] */
4681 m_temp_reg_97 = _mm_sub_epi32(m_temp_reg_71, m_temp_reg_97); /* ee[8] */
4682
4683 /*o[k] */
4684 {
4685
4686 WORD16 *pi2_dst_scratch = temp_ptr;
4687 WORD32 out_stride = 8;
4688
4689 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
4690 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]);
4691
4692 m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[trans_size]);
4693 m_temp_reg_71 = _mm_loadu_si128((__m128i *)&pi2_tmp[3 * trans_size]);
4694 m_temp_reg_72 = _mm_loadu_si128((__m128i *)&pi2_tmp[5 * trans_size]);
4695 m_temp_reg_73 = _mm_loadu_si128((__m128i *)&pi2_tmp[7 * trans_size]);
4696
4697 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71);
4698 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73);
4699
4700 /* o0[0-3] */
4701 {
4702 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4703 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
4704
4705 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
4706
4707 m_temp_reg_31 = _mm_sub_epi32(temp1, m_temp_reg_20);
4708 m_temp_reg_30 = _mm_add_epi32(temp1, m_temp_reg_20);
4709
4710 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4711 m_count = _mm_cvtsi32_si128(i4_shift);
4712 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4713 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4714
4715 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4716 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4717 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4718 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4719
4720 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4721
4722 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4723 pi2_dst_scratch += out_stride;
4724
4725 }
4726
4727
4728 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
4729 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]);
4730
4731 /* o1[0-3] */
4732 {
4733 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4734 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
4735
4736 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
4737
4738 m_temp_reg_31 = _mm_sub_epi32(temp3, m_temp_reg_20);
4739 m_temp_reg_30 = _mm_add_epi32(temp3, m_temp_reg_20);
4740
4741 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4742 m_count = _mm_cvtsi32_si128(i4_shift);
4743 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4744 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4745
4746 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4747 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4748 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4749 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4750
4751 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4752
4753 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4754 pi2_dst_scratch += out_stride;
4755
4756 }
4757
4758 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
4759 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]);
4760
4761 /* o2[0-3] */
4762 {
4763 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4764 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
4765
4766 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
4767
4768 m_temp_reg_31 = _mm_add_epi32(temp5, m_temp_reg_20);
4769 m_temp_reg_30 = _mm_sub_epi32(temp5, m_temp_reg_20);
4770
4771 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4772 m_count = _mm_cvtsi32_si128(i4_shift);
4773 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4774 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4775
4776 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4777 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4778 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4779 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4780
4781 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4782
4783 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4784 pi2_dst_scratch += out_stride;
4785
4786 }
4787
4788 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
4789 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[25][0]);
4790
4791 /* o3[0-3] */
4792 {
4793 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4794 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
4795
4796 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
4797
4798 m_temp_reg_31 = _mm_add_epi32(temp7, m_temp_reg_20);
4799 m_temp_reg_30 = _mm_sub_epi32(temp7, m_temp_reg_20);
4800
4801 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4802 m_count = _mm_cvtsi32_si128(i4_shift);
4803 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4804 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4805
4806 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4807 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4808 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4809 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4810
4811 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4812
4813 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4814 pi2_dst_scratch += out_stride;
4815
4816 }
4817
4818 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
4819 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[33][0]);
4820
4821 /* o4[0-3] */
4822 {
4823 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4824 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
4825
4826 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
4827
4828 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_20);
4829 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_20);
4830
4831 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4832 m_count = _mm_cvtsi32_si128(i4_shift);
4833 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4834 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4835
4836 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4837 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4838 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4839 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4840
4841 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4842
4843 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4844 pi2_dst_scratch += out_stride;
4845
4846 }
4847
4848 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
4849 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[41][0]);
4850
4851 /* o5[0-3] */
4852 {
4853 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4854 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
4855
4856 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
4857
4858 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_20);
4859 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_20);
4860
4861 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4862 m_count = _mm_cvtsi32_si128(i4_shift);
4863 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4864 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4865
4866 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4867 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4868 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4869 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4870
4871 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4872
4873 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4874 pi2_dst_scratch += out_stride;
4875
4876 }
4877
4878 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
4879 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[49][0]);
4880
4881 /* o6[0-3] */
4882 {
4883 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4884 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
4885
4886 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
4887
4888 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_20);
4889 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_20);
4890
4891 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4892 m_count = _mm_cvtsi32_si128(i4_shift);
4893 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4894 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4895
4896 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4897 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4898 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4899 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4900
4901 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4902
4903 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4904 pi2_dst_scratch += out_stride;
4905
4906 }
4907
4908 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
4909 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[57][0]);
4910
4911 /* o7[0-3] */
4912 {
4913 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4914 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
4915
4916 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
4917
4918 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_20);
4919 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_20);
4920
4921 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4922 m_count = _mm_cvtsi32_si128(i4_shift);
4923 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4924 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4925
4926 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4927 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4928 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4929 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4930
4931 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4932
4933 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4934 pi2_dst_scratch += 8;
4935
4936 }
4937
4938 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
4939 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[65][0]);
4940
4941 /* o8[0-3] */
4942 {
4943 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4944 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
4945
4946 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
4947
4948 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_20);
4949 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_20);
4950
4951 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4952 m_count = _mm_cvtsi32_si128(i4_shift);
4953 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4954 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4955
4956 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4957 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4958 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4959 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4960
4961 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4962
4963 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4964 pi2_dst_scratch += out_stride;
4965 }
4966
4967 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
4968 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[73][0]);
4969
4970 /* o9[0-3] */
4971 {
4972 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4973 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
4974
4975 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
4976
4977 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_20);
4978 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_20);
4979
4980 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4981 m_count = _mm_cvtsi32_si128(i4_shift);
4982 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4983 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4984
4985 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4986 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4987 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4988 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4989
4990 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4991
4992 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4993 pi2_dst_scratch += out_stride;
4994 }
4995
4996 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
4997 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[81][0]);
4998
4999 /* o10[0-3] */
5000 {
5001 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5002 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5003
5004 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
5005
5006 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_20);
5007 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_20);
5008
5009 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
5010 m_count = _mm_cvtsi32_si128(i4_shift);
5011 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
5012 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
5013
5014 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5015 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5016 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
5017 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5018
5019 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
5020
5021 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
5022 pi2_dst_scratch += out_stride;
5023 }
5024
5025 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
5026 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[89][0]);
5027
5028 /* o11[0-3] */
5029 {
5030 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5031 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5032
5033 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
5034
5035 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_20);
5036 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_20);
5037
5038 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
5039 m_count = _mm_cvtsi32_si128(i4_shift);
5040 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
5041 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
5042
5043 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5044 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5045 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
5046 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5047
5048 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
5049
5050 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
5051 pi2_dst_scratch += out_stride;
5052
5053 }
5054
5055 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
5056 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[97][0]);
5057
5058 /* o12[0-3] */
5059 {
5060 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5061 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5062
5063 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
5064
5065 m_temp_reg_31 = _mm_add_epi32(temp8, m_temp_reg_20);
5066 m_temp_reg_30 = _mm_sub_epi32(temp8, m_temp_reg_20);
5067
5068 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
5069 m_count = _mm_cvtsi32_si128(i4_shift);
5070 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
5071 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
5072
5073 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5074 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5075 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
5076 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5077
5078 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
5079
5080 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
5081 pi2_dst_scratch += out_stride;
5082
5083 }
5084
5085 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
5086 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[105][0]);
5087
5088 /* o13[0-3] */
5089 {
5090 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5091 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5092
5093 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
5094
5095 m_temp_reg_31 = _mm_add_epi32(temp6, m_temp_reg_20);
5096 m_temp_reg_30 = _mm_sub_epi32(temp6, m_temp_reg_20);
5097
5098 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
5099 m_count = _mm_cvtsi32_si128(i4_shift);
5100 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
5101 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
5102
5103 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5104 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5105 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
5106 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5107
5108 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
5109
5110 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
5111 pi2_dst_scratch += out_stride;
5112 }
5113
5114 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
5115 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[113][0]);
5116
5117 /* o14[0-3] */
5118 {
5119 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5120 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5121
5122 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
5123
5124 m_temp_reg_31 = _mm_add_epi32(temp4, m_temp_reg_20);
5125 m_temp_reg_30 = _mm_sub_epi32(temp4, m_temp_reg_20);
5126
5127 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
5128 m_count = _mm_cvtsi32_si128(i4_shift);
5129 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
5130 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
5131
5132 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5133 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5134 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
5135 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5136
5137 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
5138
5139 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
5140 pi2_dst_scratch += out_stride;
5141 }
5142
5143 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
5144 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[121][0]);
5145
5146 /* o15[0-3] */
5147 {
5148 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5149 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5150
5151 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
5152
5153 m_temp_reg_31 = _mm_add_epi32(temp2, m_temp_reg_20);
5154 m_temp_reg_30 = _mm_sub_epi32(temp2, m_temp_reg_20);
5155
5156 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
5157 m_count = _mm_cvtsi32_si128(i4_shift);
5158 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
5159 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
5160
5161 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5162 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5163 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
5164 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5165
5166 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
5167
5168 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
5169 pi2_dst_scratch += 8;
5170 }
5171
5172 }
5173 }
5174 else
5175 {
5176 /* eo */
5177 {
5178
5179 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87
5180 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[1][0]); //80 70
5181 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57 43
5182 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25 9
5183
5184
5185 m_temp_reg_10 = _mm_loadu_si128((__m128i *)&pi2_tmp[2 * trans_size]);
5186 m_temp_reg_11 = _mm_loadu_si128((__m128i *)&pi2_tmp[6 * trans_size]);
5187 m_temp_reg_12 = _mm_loadu_si128((__m128i *)&pi2_tmp[10 * trans_size]);
5188 m_temp_reg_13 = _mm_loadu_si128((__m128i *)&pi2_tmp[14 * trans_size]);
5189 m_temp_reg_18 = _mm_loadu_si128((__m128i *)&pi2_tmp[18 * trans_size]);
5190 m_temp_reg_19 = _mm_loadu_si128((__m128i *)&pi2_tmp[22 * trans_size]);
5191 m_temp_reg_20 = _mm_loadu_si128((__m128i *)&pi2_tmp[26 * trans_size]);
5192 m_temp_reg_21 = _mm_loadu_si128((__m128i *)&pi2_tmp[30 * trans_size]);
5193
5194 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_10, m_temp_reg_11);
5195 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_12, m_temp_reg_13);
5196 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_18, m_temp_reg_19);
5197 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_21);
5198
5199 /* eo0[0-3] */
5200 {
5201 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5202 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5203
5204 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
5205
5206 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5207 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5208
5209 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
5210
5211 m_temp_reg_90 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
5212
5213 }
5214
5215 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87 57
5216 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //0 -43
5217 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80 90
5218 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70 25
5219
5220 /* eo1[0-3] */
5221 {
5222 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5223 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5224
5225 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
5226
5227 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5228 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5229
5230 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
5231
5232 m_temp_reg_91 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_32);
5233
5234 }
5235
5236 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80 9
5237 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[9][0]); //70 87
5238 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[10][0]); //-25 57
5239 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[11][0]); //90 43
5240
5241 /* eo2[0-3] */
5242 {
5243 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5244 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5245
5246 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31);
5247
5248 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5249 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5250
5251 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
5252
5253 m_temp_reg_92 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
5254
5255 }
5256
5257 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70 -43
5258 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[13][0]); //-87 9
5259 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[14][0]); //90 25
5260 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[15][0]); //80 57
5261
5262 /* eo3[0-3] */
5263 {
5264 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5265 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5266
5267 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
5268
5269 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5270 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5271
5272 m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_32, m_temp_reg_33);
5273
5274 m_temp_reg_93 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
5275
5276 }
5277
5278 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57 -80
5279 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[17][0]); //-25 90
5280 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[18][0]); //9 87
5281 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43 70
5282
5283
5284 /* eo4[0-3] */
5285 {
5286 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5287 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5288
5289 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
5290
5291 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5292 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5293
5294 m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32);
5295
5296 m_temp_reg_94 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
5297
5298 }
5299
5300 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43 -90
5301 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[21][0]); //57 25
5302 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[22][0]); //-87 70
5303 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[23][0]); //9 -80
5304
5305 /* eo5[0-3] */
5306 {
5307 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5308 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5309
5310 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
5311
5312 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5313 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5314
5315 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
5316
5317 m_temp_reg_95 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
5318 }
5319
5320 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25 -70
5321 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[25][0]); //90 -80
5322 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[26][0]); //43 9
5323 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[27][0]); //-57 87
5324
5325 /* eo6[0-3] */
5326 {
5327 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5328 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5329
5330 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
5331
5332 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5333 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5334
5335 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
5336
5337 m_temp_reg_96 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
5338
5339 }
5340
5341 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); //9 -25
5342 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[29][0]); //43 -57
5343 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[30][0]); //70 -80
5344 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[31][0]); //87 -90
5345
5346 /* eo7[0-3] */
5347 {
5348 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5349 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5350
5351 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
5352
5353 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5354 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5355
5356 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
5357
5358 m_temp_reg_97 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
5359
5360
5361 }
5362
5363 }
5364
5365 /* eeo */
5366 {
5367 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75
5368 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[5][0]); //50 18
5369
5370 m_temp_reg_72 = _mm_loadu_si128((__m128i *)&pi2_tmp[4 * trans_size]);
5371 m_temp_reg_76 = _mm_loadu_si128((__m128i *)&pi2_tmp[12 * trans_size]);
5372 m_temp_reg_82 = _mm_loadu_si128((__m128i *)&pi2_tmp[20 * trans_size]);
5373 m_temp_reg_86 = _mm_loadu_si128((__m128i *)&pi2_tmp[28 * trans_size]);
5374
5375 /* eeo0[0-3] */
5376 {
5377
5378 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
5379 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_86);
5380
5381 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5382 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5383
5384 temp1 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
5385
5386 }
5387
5388 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75 -18
5389 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[7][0]); //89 50
5390
5391 /* eeo1[0-3] */
5392 {
5393 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
5394 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
5395
5396 temp2 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31);
5397
5398 }
5399
5400 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[8][0]); //50 -89
5401 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18 75
5402
5403 /* eeo2[0-3] */
5404 {
5405 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
5406 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
5407
5408 temp3 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
5409
5410 }
5411
5412 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[10][0]); //18 -50
5413 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[11][0]); //75 -89
5414
5415 /* eeo3[0-3] */
5416 {
5417 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
5418 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
5419
5420 temp4 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
5421
5422 }
5423
5424
5425 }
5426
5427 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83 36
5428 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36 -83
5429
5430 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64 64
5431 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[3][0]); //64 -64
5432
5433 m_temp_reg_74 = _mm_loadu_si128((__m128i *)&pi2_tmp[8 * trans_size]);
5434 m_temp_reg_84 = _mm_loadu_si128((__m128i *)&pi2_tmp[24 * trans_size]);
5435
5436 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_84);
5437
5438 m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[0 * trans_size]);
5439 m_temp_reg_80 = _mm_loadu_si128((__m128i *)&pi2_tmp[16 * trans_size]);
5440
5441 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_80);
5442
5443 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); /* eeeo[0] */
5444 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); /* eeeo[1] */
5445
5446 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); /* eeee[0] */
5447 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff4); /* eeee[1] */
5448
5449 /* eeeo[0]= m_temp_reg_20 */
5450 /* eeeo[1]= m_temp_reg_22 */
5451 /* eeee[0]= m_temp_reg_21 */
5452 /* eeee[1]= m_temp_reg_23 */
5453
5454 /* eee[0] = eeee[0] + eeeo[0]; */
5455 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20); /* eeeo[0] */
5456
5457 /* eee[3] = eeee[0] - eeeo[0]; */
5458 m_temp_reg_43 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20); /* eeeo[1] */
5459
5460 /* eee[2] = eeee[1] - eeeo[1]; */
5461 m_temp_reg_42 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_22); /* eeee[1] */
5462
5463 /* eee[1] = eeee[1] + eeeo[1];*/
5464 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_22); /* eeee[0] */
5465
5466 m_temp_reg_70 = _mm_add_epi32(m_temp_reg_40, temp1); /* ee[0] */
5467 m_temp_reg_71 = _mm_sub_epi32(m_temp_reg_40, temp1); /* ee[7] */
5468
5469 m_temp_reg_72 = _mm_add_epi32(m_temp_reg_41, temp2); /* ee[1] */
5470 m_temp_reg_73 = _mm_sub_epi32(m_temp_reg_41, temp2); /* ee[6] */
5471
5472 m_temp_reg_74 = _mm_add_epi32(m_temp_reg_42, temp3); /* ee[2] */
5473 m_temp_reg_75 = _mm_sub_epi32(m_temp_reg_42, temp3); /* ee[5] */
5474
5475 m_temp_reg_76 = _mm_add_epi32(m_temp_reg_43, temp4); /* ee[3] */
5476 m_temp_reg_77 = _mm_sub_epi32(m_temp_reg_43, temp4); /* ee[4] */
5477
5478 /* e[]*/
5479
5480 temp1 = _mm_add_epi32(m_temp_reg_70, m_temp_reg_90); /* ee[0] */
5481 temp2 = _mm_sub_epi32(m_temp_reg_70, m_temp_reg_90); /* ee[15] */
5482
5483 temp3 = _mm_add_epi32(m_temp_reg_72, m_temp_reg_91); /* ee[1] */
5484 temp4 = _mm_sub_epi32(m_temp_reg_72, m_temp_reg_91); /* ee[14] */
5485
5486 temp5 = _mm_add_epi32(m_temp_reg_74, m_temp_reg_92); /* ee[2] */
5487 temp6 = _mm_sub_epi32(m_temp_reg_74, m_temp_reg_92); /* ee[13] */
5488
5489 temp7 = _mm_add_epi32(m_temp_reg_76, m_temp_reg_93); /* ee[3] */
5490 temp8 = _mm_sub_epi32(m_temp_reg_76, m_temp_reg_93); /* ee[12] */
5491
5492 m_temp_reg_90 = _mm_add_epi32(m_temp_reg_77, m_temp_reg_94); /* ee[4] */
5493 m_temp_reg_91 = _mm_sub_epi32(m_temp_reg_77, m_temp_reg_94); /* ee[11] */
5494
5495 m_temp_reg_92 = _mm_add_epi32(m_temp_reg_75, m_temp_reg_95); /* ee[5] */
5496 m_temp_reg_93 = _mm_sub_epi32(m_temp_reg_75, m_temp_reg_95); /* ee[10] */
5497
5498 m_temp_reg_94 = _mm_add_epi32(m_temp_reg_73, m_temp_reg_96); /* ee[6] */
5499 m_temp_reg_95 = _mm_sub_epi32(m_temp_reg_73, m_temp_reg_96); /* ee[9] */
5500
5501 m_temp_reg_96 = _mm_add_epi32(m_temp_reg_71, m_temp_reg_97); /* ee[7] */
5502 m_temp_reg_97 = _mm_sub_epi32(m_temp_reg_71, m_temp_reg_97); /* ee[8] */
5503
5504 /*o[k] */
5505 {
5506
5507 WORD16 *pi2_dst_scratch = temp_ptr;
5508 WORD32 out_stride = 8;
5509
5510 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
5511 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]);
5512 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[2][0]);
5513 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[3][0]);
5514 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[4][0]);
5515 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[5][0]);
5516 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[6][0]);
5517 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[7][0]);
5518
5519
5520 m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[trans_size]);
5521 m_temp_reg_71 = _mm_loadu_si128((__m128i *)&pi2_tmp[3 * trans_size]);
5522 m_temp_reg_72 = _mm_loadu_si128((__m128i *)&pi2_tmp[5 * trans_size]);
5523 m_temp_reg_73 = _mm_loadu_si128((__m128i *)&pi2_tmp[7 * trans_size]);
5524 m_temp_reg_74 = _mm_loadu_si128((__m128i *)&pi2_tmp[9 * trans_size]);
5525 m_temp_reg_75 = _mm_loadu_si128((__m128i *)&pi2_tmp[11 * trans_size]);
5526 m_temp_reg_76 = _mm_loadu_si128((__m128i *)&pi2_tmp[13 * trans_size]);
5527 m_temp_reg_77 = _mm_loadu_si128((__m128i *)&pi2_tmp[15 * trans_size]);
5528
5529 m_temp_reg_80 = _mm_loadu_si128((__m128i *)&pi2_tmp[17 * trans_size]);
5530 m_temp_reg_81 = _mm_loadu_si128((__m128i *)&pi2_tmp[19 * trans_size]);
5531 m_temp_reg_82 = _mm_loadu_si128((__m128i *)&pi2_tmp[21 * trans_size]);
5532 m_temp_reg_83 = _mm_loadu_si128((__m128i *)&pi2_tmp[23 * trans_size]);
5533 m_temp_reg_84 = _mm_loadu_si128((__m128i *)&pi2_tmp[25 * trans_size]);
5534 m_temp_reg_85 = _mm_loadu_si128((__m128i *)&pi2_tmp[27 * trans_size]);
5535 m_temp_reg_86 = _mm_loadu_si128((__m128i *)&pi2_tmp[29 * trans_size]);
5536 m_temp_reg_87 = _mm_loadu_si128((__m128i *)&pi2_tmp[31 * trans_size]);
5537
5538 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved
5539 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 interleaved
5540 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 interleaved
5541 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 interleaved
5542 m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_80, m_temp_reg_81); //row 17 and row 19 interleaved
5543 m_temp_reg_15 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_83); //row 21 and row 23 interleaved
5544 m_temp_reg_16 = _mm_unpacklo_epi16(m_temp_reg_84, m_temp_reg_85); //row 25 and row 27 interleaved
5545 m_temp_reg_17 = _mm_unpacklo_epi16(m_temp_reg_86, m_temp_reg_87); //row 29 and row 31 interleaved
5546
5547 /* o0[0-3] */
5548 {
5549 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5550 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5551 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5552 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5553
5554 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
5555 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
5556
5557 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
5558
5559 m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
5560 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
5561 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
5562 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
5563
5564 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
5565 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
5566
5567 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
5568
5569 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
5570
5571 m_temp_reg_31 = _mm_sub_epi32(temp1, m_temp_reg_20);
5572 m_temp_reg_30 = _mm_add_epi32(temp1, m_temp_reg_20);
5573
5574 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
5575 m_count = _mm_cvtsi32_si128(i4_shift);
5576 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
5577 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
5578
5579 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5580 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5581 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
5582 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5583
5584 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
5585
5586 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
5587 pi2_dst_scratch += out_stride;
5588
5589 }
5590
5591 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
5592 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]);
5593 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[10][0]);
5594 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[11][0]);
5595 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[12][0]);
5596 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[13][0]);
5597 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[14][0]);
5598 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[15][0]);
5599
5600 /* o1[0-3] */
5601 {
5602 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5603 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5604 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5605 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5606
5607 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
5608 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
5609
5610 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_20);
5611
5612 m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
5613 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
5614 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
5615 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
5616
5617 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
5618 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
5619
5620 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
5621
5622 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
5623
5624 m_temp_reg_31 = _mm_add_epi32(temp3, m_temp_reg_20);
5625 m_temp_reg_30 = _mm_sub_epi32(temp3, m_temp_reg_20);
5626
5627 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
5628 m_count = _mm_cvtsi32_si128(i4_shift);
5629 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
5630 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
5631
5632 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5633 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5634 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
5635 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5636
5637 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
5638
5639 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
5640 pi2_dst_scratch += out_stride;
5641
5642 }
5643
5644 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
5645 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]);
5646 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[18][0]);
5647 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[19][0]);
5648 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[20][0]);
5649 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[21][0]);
5650 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[22][0]);
5651 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[23][0]);
5652
5653 /* o2[0-3] */
5654 {
5655 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5656 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5657 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5658 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5659
5660 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
5661 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
5662
5663 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
5664
5665 m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
5666 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
5667 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
5668 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
5669
5670 m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_41);
5671 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
5672
5673 m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_42);
5674
5675 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
5676
5677 m_temp_reg_31 = _mm_add_epi32(temp5, m_temp_reg_20);
5678 m_temp_reg_30 = _mm_sub_epi32(temp5, m_temp_reg_20);
5679
5680 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
5681 m_count = _mm_cvtsi32_si128(i4_shift);
5682 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
5683 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
5684
5685 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5686 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5687 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
5688 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5689
5690 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
5691
5692 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
5693 pi2_dst_scratch += out_stride;
5694
5695 }
5696
5697 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
5698 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[25][0]);
5699 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[26][0]);
5700 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[27][0]);
5701 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[28][0]);
5702 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[29][0]);
5703 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[30][0]);
5704 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[31][0]);
5705
5706 /* o3[0-3] */
5707 {
5708 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5709 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5710 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5711 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5712
5713 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
5714 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
5715
5716 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
5717
5718 m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
5719 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
5720 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
5721 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
5722
5723 m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_40);
5724 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
5725
5726 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
5727
5728 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
5729
5730 m_temp_reg_31 = _mm_add_epi32(temp7, m_temp_reg_20);
5731 m_temp_reg_30 = _mm_sub_epi32(temp7, m_temp_reg_20);
5732
5733 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
5734 m_count = _mm_cvtsi32_si128(i4_shift);
5735 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
5736 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
5737
5738 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5739 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5740 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
5741 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5742
5743 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
5744
5745 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
5746 pi2_dst_scratch += out_stride;
5747
5748 }
5749
5750 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
5751 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[33][0]);
5752 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[34][0]);
5753 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[35][0]);
5754 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[36][0]);
5755 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[37][0]);
5756 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[38][0]);
5757 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[39][0]);
5758
5759 /* o4[0-3] */
5760 {
5761 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5762 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5763 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5764 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5765
5766 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
5767 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
5768
5769 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
5770
5771 m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
5772 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
5773 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
5774 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
5775
5776 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
5777 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
5778
5779 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
5780
5781 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
5782
5783 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_20);
5784 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_20);
5785 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
5786 m_count = _mm_cvtsi32_si128(i4_shift);
5787 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
5788 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
5789
5790 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5791 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5792 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
5793 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5794
5795 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
5796
5797 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
5798 pi2_dst_scratch += out_stride;
5799
5800 }
5801
5802 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
5803 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[41][0]);
5804 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[42][0]);
5805 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[43][0]);
5806 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[44][0]);
5807 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[45][0]);
5808 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[46][0]);
5809 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[47][0]);
5810
5811 /* o5[0-3] */
5812 {
5813 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5814 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5815 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5816 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5817
5818 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
5819 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
5820
5821 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
5822
5823 m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
5824 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
5825 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
5826 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
5827
5828 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
5829 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
5830
5831 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
5832
5833 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
5834
5835 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_20);
5836 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_20);
5837
5838 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
5839 m_count = _mm_cvtsi32_si128(i4_shift);
5840 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
5841 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
5842
5843 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5844 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5845 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
5846 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5847
5848 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
5849
5850 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
5851 pi2_dst_scratch += out_stride;
5852
5853 }
5854
5855 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
5856 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[49][0]);
5857 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[50][0]);
5858 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[51][0]);
5859 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[52][0]);
5860 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[53][0]);
5861 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[54][0]);
5862 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[55][0]);
5863
5864 /* o6[0-3] */
5865 {
5866 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5867 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5868 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5869 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5870
5871 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
5872 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
5873
5874 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
5875
5876 m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
5877 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
5878 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
5879 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
5880
5881 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
5882 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
5883
5884 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
5885
5886 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
5887
5888 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_20);
5889 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_20);
5890
5891 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
5892 m_count = _mm_cvtsi32_si128(i4_shift);
5893 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
5894 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
5895
5896 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5897 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5898 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
5899 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5900
5901 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
5902
5903 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
5904 pi2_dst_scratch += out_stride;
5905
5906 }
5907
5908 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
5909 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[57][0]);
5910 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[58][0]);
5911 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[59][0]);
5912 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[60][0]);
5913 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[61][0]);
5914 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[62][0]);
5915 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[63][0]);
5916
5917 /* o7[0-3] */
5918 {
5919 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5920 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5921 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5922 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5923
5924 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
5925 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
5926
5927 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
5928
5929 m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
5930 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
5931 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
5932 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
5933
5934 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
5935 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
5936
5937 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
5938
5939 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
5940
5941 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_20);
5942 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_20);
5943
5944 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
5945 m_count = _mm_cvtsi32_si128(i4_shift);
5946 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
5947 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
5948
5949 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5950 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5951 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
5952 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5953
5954 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
5955
5956 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
5957 pi2_dst_scratch += 8;
5958
5959 }
5960
5961 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
5962 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[65][0]);
5963 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[66][0]);
5964 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[67][0]);
5965 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[68][0]);
5966 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[69][0]);
5967 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[70][0]);
5968 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[71][0]);
5969
5970 /* o8[0-3] */
5971 {
5972 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5973 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5974 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5975 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5976
5977 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
5978 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
5979
5980 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
5981
5982 m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
5983 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
5984 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
5985 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
5986
5987 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
5988 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
5989
5990 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
5991
5992 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
5993
5994 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_20);
5995 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_20);
5996
5997 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
5998 m_count = _mm_cvtsi32_si128(i4_shift);
5999 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
6000 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
6001
6002 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
6003 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
6004 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
6005 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
6006
6007 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
6008
6009 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
6010 pi2_dst_scratch += out_stride;
6011 }
6012
6013 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
6014 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[73][0]);
6015 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[74][0]);
6016 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[75][0]);
6017 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[76][0]);
6018 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[77][0]);
6019 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[78][0]);
6020 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[79][0]);
6021
6022 /* o9[0-3] */
6023 {
6024 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
6025 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
6026 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
6027 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
6028
6029 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
6030 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
6031
6032 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
6033
6034 m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
6035 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
6036 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
6037 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
6038
6039 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
6040 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
6041
6042 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
6043
6044 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
6045
6046 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_20);
6047 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_20);
6048
6049 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
6050 m_count = _mm_cvtsi32_si128(i4_shift);
6051 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
6052 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
6053
6054 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
6055 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
6056 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
6057 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
6058
6059 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
6060
6061 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
6062 pi2_dst_scratch += out_stride;
6063 }
6064
6065 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
6066 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[81][0]);
6067 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[82][0]);
6068 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[83][0]);
6069 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[84][0]);
6070 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[85][0]);
6071 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[86][0]);
6072 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[87][0]);
6073
6074 /* o10[0-3] */
6075 {
6076 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
6077 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
6078 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
6079 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
6080
6081 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
6082 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
6083
6084 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
6085
6086 m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
6087 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
6088 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
6089 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
6090
6091 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
6092 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
6093
6094 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
6095
6096 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
6097
6098 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_20);
6099 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_20);
6100
6101 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
6102 m_count = _mm_cvtsi32_si128(i4_shift);
6103 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
6104 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
6105
6106 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
6107 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
6108 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
6109 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
6110
6111 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
6112
6113 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
6114 pi2_dst_scratch += out_stride;
6115 }
6116
6117
6118 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
6119 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[89][0]);
6120 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[90][0]);
6121 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[91][0]);
6122 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[92][0]);
6123 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[93][0]);
6124 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[94][0]);
6125 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[95][0]);
6126
6127 /* o11[0-3] */
6128 {
6129 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
6130 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
6131 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
6132 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
6133
6134 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
6135 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
6136
6137 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
6138
6139 m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
6140 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
6141 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
6142 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
6143
6144 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
6145 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
6146
6147 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
6148
6149 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
6150
6151 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_20);
6152 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_20);
6153
6154 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
6155 m_count = _mm_cvtsi32_si128(i4_shift);
6156 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
6157 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
6158
6159 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
6160 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
6161 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
6162 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
6163
6164 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
6165
6166 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
6167 pi2_dst_scratch += out_stride;
6168
6169 }
6170
6171 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
6172 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[97][0]);
6173 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[98][0]);
6174 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[99][0]);
6175 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[100][0]);
6176 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[101][0]);
6177 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[102][0]);
6178 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[103][0]);
6179
6180 /* o12[0-3] */
6181 {
6182 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
6183 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
6184 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
6185 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
6186
6187 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
6188 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
6189
6190 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
6191
6192 m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
6193 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
6194 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
6195 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
6196
6197 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
6198 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
6199
6200 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
6201
6202 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
6203
6204 m_temp_reg_31 = _mm_add_epi32(temp8, m_temp_reg_20);
6205 m_temp_reg_30 = _mm_sub_epi32(temp8, m_temp_reg_20);
6206
6207 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
6208 m_count = _mm_cvtsi32_si128(i4_shift);
6209 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
6210 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
6211
6212 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
6213 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
6214 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
6215 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
6216
6217 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
6218
6219 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
6220 pi2_dst_scratch += out_stride;
6221
6222 }
6223
6224 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
6225 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[105][0]);
6226 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[106][0]);
6227 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[107][0]);
6228 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[108][0]);
6229 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[109][0]);
6230 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[110][0]);
6231 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[111][0]);
6232
6233 /* o13[0-3] */
6234 {
6235 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
6236 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
6237 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
6238 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
6239
6240 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
6241 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
6242
6243 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
6244
6245 m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
6246 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
6247 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
6248 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
6249
6250 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
6251 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
6252
6253 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
6254
6255 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
6256
6257 m_temp_reg_31 = _mm_add_epi32(temp6, m_temp_reg_20);
6258 m_temp_reg_30 = _mm_sub_epi32(temp6, m_temp_reg_20);
6259
6260 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
6261 m_count = _mm_cvtsi32_si128(i4_shift);
6262 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
6263 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
6264
6265 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
6266 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
6267 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
6268 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
6269
6270 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
6271
6272 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
6273 pi2_dst_scratch += out_stride;
6274 }
6275
6276 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
6277 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[113][0]);
6278 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[114][0]);
6279 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[115][0]);
6280 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[116][0]);
6281 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[117][0]);
6282 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[118][0]);
6283 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[119][0]);
6284
6285 /* o14[0-3] */
6286 {
6287 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
6288 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
6289 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
6290 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
6291
6292 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
6293 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
6294
6295 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
6296
6297 m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
6298 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
6299 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
6300 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
6301
6302 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
6303 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
6304
6305 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
6306
6307 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
6308
6309 m_temp_reg_31 = _mm_add_epi32(temp4, m_temp_reg_20);
6310 m_temp_reg_30 = _mm_sub_epi32(temp4, m_temp_reg_20);
6311
6312 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
6313 m_count = _mm_cvtsi32_si128(i4_shift);
6314 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
6315 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
6316
6317 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
6318 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
6319 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
6320 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
6321
6322 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
6323
6324 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
6325 pi2_dst_scratch += out_stride;
6326
6327 }
6328
6329 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
6330 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[121][0]);
6331 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[122][0]);
6332 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[123][0]);
6333 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[124][0]);
6334 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[125][0]);
6335 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[126][0]);
6336 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[127][0]);
6337
6338 /* o15[0-3] */
6339 {
6340 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
6341 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
6342 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
6343 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
6344
6345 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
6346 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
6347
6348 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
6349
6350 m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
6351 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
6352 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
6353 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
6354
6355 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
6356 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
6357
6358 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
6359
6360 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
6361
6362 m_temp_reg_31 = _mm_add_epi32(temp2, m_temp_reg_20);
6363 m_temp_reg_30 = _mm_sub_epi32(temp2, m_temp_reg_20);
6364
6365 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
6366 m_count = _mm_cvtsi32_si128(i4_shift);
6367 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
6368 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
6369
6370 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
6371 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
6372 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
6373 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
6374
6375 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
6376
6377 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
6378 pi2_dst_scratch += 8;
6379 }
6380
6381 }
6382
6383 }
6384
6385 /* Transpose */
6386 {
6387
6388 WORD16 *pi2_src_scratch = temp_ptr;
6389 WORD32 out_stride = dst_strd;
6390 WORD32 in_stride = 8;
6391
6392 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
6393 pi2_src_scratch += in_stride;
6394 m_temp_reg_31 = _mm_load_si128((__m128i *)pi2_src_scratch);
6395 pi2_src_scratch += in_stride;
6396 m_temp_reg_32 = _mm_load_si128((__m128i *)pi2_src_scratch);
6397 pi2_src_scratch += in_stride;
6398 m_temp_reg_33 = _mm_load_si128((__m128i *)pi2_src_scratch);
6399 pi2_src_scratch += in_stride;
6400 m_temp_reg_34 = _mm_load_si128((__m128i *)pi2_src_scratch);
6401 pi2_src_scratch += in_stride;
6402 m_temp_reg_35 = _mm_load_si128((__m128i *)pi2_src_scratch);
6403 pi2_src_scratch += in_stride;
6404 m_temp_reg_36 = _mm_load_si128((__m128i *)pi2_src_scratch);
6405 pi2_src_scratch += in_stride;
6406 m_temp_reg_37 = _mm_load_si128((__m128i *)pi2_src_scratch);
6407 pi2_src_scratch += 8;
6408
6409 m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_src_scratch);
6410 pi2_src_scratch += in_stride;
6411 m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_src_scratch);
6412 pi2_src_scratch += in_stride;
6413 m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_src_scratch);
6414 pi2_src_scratch += in_stride;
6415 m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_src_scratch);
6416 pi2_src_scratch += in_stride;
6417 m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_src_scratch);
6418 pi2_src_scratch += in_stride;
6419 m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_src_scratch);
6420 pi2_src_scratch += in_stride;
6421 m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_src_scratch);
6422 pi2_src_scratch += in_stride;
6423 m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_src_scratch);
6424 pi2_src_scratch += 8;
6425
6426
6427 m_temp_reg_40 = _mm_unpacklo_epi16(m_temp_reg_30, m_temp_reg_31);
6428 m_temp_reg_41 = _mm_unpackhi_epi16(m_temp_reg_31, m_temp_reg_30);
6429
6430 m_temp_reg_42 = _mm_unpacklo_epi16(m_temp_reg_32, m_temp_reg_33);
6431 m_temp_reg_43 = _mm_unpackhi_epi16(m_temp_reg_33, m_temp_reg_32);
6432
6433 m_temp_reg_44 = _mm_unpacklo_epi16(m_temp_reg_34, m_temp_reg_35);
6434 m_temp_reg_45 = _mm_unpackhi_epi16(m_temp_reg_35, m_temp_reg_34);
6435
6436 m_temp_reg_46 = _mm_unpacklo_epi16(m_temp_reg_36, m_temp_reg_37);
6437 m_temp_reg_47 = _mm_unpackhi_epi16(m_temp_reg_37, m_temp_reg_36);
6438
6439 m_temp_reg_80 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71);
6440 m_temp_reg_81 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_70);
6441
6442 m_temp_reg_82 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73);
6443 m_temp_reg_83 = _mm_unpackhi_epi16(m_temp_reg_73, m_temp_reg_72);
6444
6445 m_temp_reg_84 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75);
6446 m_temp_reg_85 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_74);
6447
6448 m_temp_reg_86 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77);
6449 m_temp_reg_87 = _mm_unpackhi_epi16(m_temp_reg_77, m_temp_reg_76);
6450
6451
6452 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_40, m_temp_reg_42);
6453 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_40, m_temp_reg_42);
6454
6455 m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_44, m_temp_reg_46);
6456 m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_44, m_temp_reg_46);
6457
6458 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_80, m_temp_reg_82);
6459 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_80, m_temp_reg_82);
6460
6461 m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_84, m_temp_reg_86);
6462 m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_84, m_temp_reg_86);
6463
6464 m_temp_reg_90 = _mm_unpacklo_epi32(m_temp_reg_43, m_temp_reg_41);
6465 m_temp_reg_91 = _mm_unpackhi_epi32(m_temp_reg_43, m_temp_reg_41);
6466
6467 m_temp_reg_92 = _mm_unpacklo_epi32(m_temp_reg_47, m_temp_reg_45);
6468 m_temp_reg_93 = _mm_unpackhi_epi32(m_temp_reg_47, m_temp_reg_45);
6469
6470 m_temp_reg_94 = _mm_unpacklo_epi32(m_temp_reg_83, m_temp_reg_81);
6471 m_temp_reg_95 = _mm_unpackhi_epi32(m_temp_reg_83, m_temp_reg_81);
6472
6473 m_temp_reg_96 = _mm_unpacklo_epi32(m_temp_reg_87, m_temp_reg_85);
6474 m_temp_reg_97 = _mm_unpackhi_epi32(m_temp_reg_87, m_temp_reg_85);
6475
6476
6477 m_temp_reg_30 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_2); // row0 = 0-7
6478 m_temp_reg_31 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_2); // row1 = 0-7
6479
6480 m_temp_reg_32 = _mm_unpacklo_epi64(m_temp_reg_92, m_temp_reg_90); // row0=24-31
6481 m_temp_reg_33 = _mm_unpackhi_epi64(m_temp_reg_92, m_temp_reg_90); // row1=24-31
6482
6483 m_temp_reg_34 = _mm_unpacklo_epi64(m_temp_reg_4, m_temp_reg_6); // row0=8-15
6484 m_temp_reg_35 = _mm_unpackhi_epi64(m_temp_reg_4, m_temp_reg_6); // row1=8-15
6485
6486 m_temp_reg_36 = _mm_unpacklo_epi64(m_temp_reg_96, m_temp_reg_94); // row0=16-23
6487 m_temp_reg_37 = _mm_unpackhi_epi64(m_temp_reg_96, m_temp_reg_94); // row1=16-23
6488
6489 m_temp_reg_80 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_3); // row2 =0-7
6490 m_temp_reg_81 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_3); // row3 =0-7
6491
6492 m_temp_reg_82 = _mm_unpacklo_epi64(m_temp_reg_93, m_temp_reg_91); // row2=24-31
6493 m_temp_reg_83 = _mm_unpackhi_epi64(m_temp_reg_93, m_temp_reg_91); // row3=24-31
6494
6495 m_temp_reg_84 = _mm_unpacklo_epi64(m_temp_reg_5, m_temp_reg_7); // row2=8-15
6496 m_temp_reg_85 = _mm_unpackhi_epi64(m_temp_reg_5, m_temp_reg_7); // row3=8-15
6497
6498 m_temp_reg_86 = _mm_unpacklo_epi64(m_temp_reg_97, m_temp_reg_95); // row2=16-23
6499 m_temp_reg_87 = _mm_unpackhi_epi64(m_temp_reg_97, m_temp_reg_95); // row3=16-23
6500
6501 m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred);
6502
6503 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
6504
6505 m_temp_reg_40 = _mm_add_epi16(m_temp_reg_30, m_temp_reg_0);
6506 m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
6507
6508 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
6509
6510 m_temp_reg_44 = _mm_add_epi16(m_temp_reg_34, m_temp_reg_0);
6511 m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
6512
6513 _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
6514
6515 m_temp_reg_20 = _mm_loadu_si128((__m128i *)(pu1_pred + 16));
6516
6517 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
6518
6519 m_temp_reg_40 = _mm_add_epi16(m_temp_reg_36, m_temp_reg_0);
6520 m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
6521
6522 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
6523
6524 m_temp_reg_44 = _mm_add_epi16(m_temp_reg_32, m_temp_reg_0);
6525 m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
6526
6527 _mm_storeu_si128((__m128i *)(pu1_dst + 16), m_temp_reg_20);
6528 pu1_dst += out_stride;
6529 pu1_pred += pred_strd;
6530
6531
6532 m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred);
6533
6534 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
6535
6536 m_temp_reg_40 = _mm_add_epi16(m_temp_reg_31, m_temp_reg_0);
6537 m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
6538
6539 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
6540
6541 m_temp_reg_44 = _mm_add_epi16(m_temp_reg_35, m_temp_reg_0);
6542 m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
6543
6544 _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
6545
6546 m_temp_reg_20 = _mm_loadu_si128((__m128i *)(pu1_pred + 16));
6547
6548 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
6549
6550 m_temp_reg_40 = _mm_add_epi16(m_temp_reg_37, m_temp_reg_0);
6551 m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
6552
6553 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
6554
6555 m_temp_reg_44 = _mm_add_epi16(m_temp_reg_33, m_temp_reg_0);
6556 m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
6557
6558 _mm_storeu_si128((__m128i *)(pu1_dst + 16), m_temp_reg_20);
6559 pu1_dst += out_stride;
6560 pu1_pred += pred_strd;
6561
6562 m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred);
6563
6564 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
6565
6566 m_temp_reg_40 = _mm_add_epi16(m_temp_reg_80, m_temp_reg_0);
6567 m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
6568
6569 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
6570
6571 m_temp_reg_44 = _mm_add_epi16(m_temp_reg_84, m_temp_reg_0);
6572 m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
6573
6574 _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
6575
6576 m_temp_reg_20 = _mm_loadu_si128((__m128i *)(pu1_pred + 16));
6577
6578 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
6579
6580 m_temp_reg_40 = _mm_add_epi16(m_temp_reg_86, m_temp_reg_0);
6581 m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
6582
6583 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
6584
6585 m_temp_reg_44 = _mm_add_epi16(m_temp_reg_82, m_temp_reg_0);
6586 m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
6587
6588 _mm_storeu_si128((__m128i *)(pu1_dst + 16), m_temp_reg_20);
6589 pu1_dst += out_stride;
6590 pu1_pred += pred_strd;
6591
6592
6593 m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred);
6594
6595 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
6596
6597 m_temp_reg_40 = _mm_add_epi16(m_temp_reg_81, m_temp_reg_0);
6598 m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
6599
6600 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
6601
6602 m_temp_reg_44 = _mm_add_epi16(m_temp_reg_85, m_temp_reg_0);
6603 m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
6604
6605 _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
6606
6607 m_temp_reg_20 = _mm_loadu_si128((__m128i *)(pu1_pred + 16));
6608
6609 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
6610
6611 m_temp_reg_40 = _mm_add_epi16(m_temp_reg_87, m_temp_reg_0);
6612 m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
6613
6614 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
6615
6616 m_temp_reg_44 = _mm_add_epi16(m_temp_reg_83, m_temp_reg_0);
6617 m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
6618
6619 _mm_storeu_si128((__m128i *)(pu1_dst + 16), m_temp_reg_20);
6620 pu1_dst += out_stride;
6621 pu1_pred += pred_strd;
6622
6623 }
6624 pi2_tmp += 4;
6625 }
6626 }
6627
6628
6629