1 /******************************************************************************
2 *
3 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 ******************************************************************************/
18 /**
19  *******************************************************************************
20  * @file
21  *  ihevc_itrans_recon_32x32_atom_intr.c
22  *
23  * @brief
24  *  Contains function definitions for inverse  quantization, inverse
25  * transform and reconstruction
26  *
27  * @author
28  *  100470
29  *
30  * @par List of Functions:
31  *  - ihevc_itrans_recon_32x32_ssse3()
32  *
33  * @remarks
34  *  None
35  *
36  *******************************************************************************
37  */
38 #include <stdio.h>
39 #include <string.h>
40 #include "ihevc_typedefs.h"
41 #include "ihevc_platform_macros.h"
42 #include "ihevc_macros.h"
43 #include "ihevc_defs.h"
44 #include "ihevc_trans_tables.h"
45 #include "ihevc_iquant_itrans_recon.h"
46 #include "ihevc_func_selector.h"
47 #include "ihevc_trans_macros.h"
48 
49 
50 
51 
52 #include <immintrin.h>
53 #include <emmintrin.h>
54 
55 #include <tmmintrin.h>
56 
57 
58 
59 /**
60  *******************************************************************************
61  *
62  * @brief
63  *  This function performs inverse transform and reconstruction for a
64  * 32x32 input block
65  *
66  * @par Description:
67  *  Performs inverse transform, adds the prediction data and clips the
68  * output to 8 bit
69  *
70  * @param[in] pi2_src
71  *  Input 32x32 coefficients; read with aligned 128-bit loads
72  *  (_mm_load_si128)
73  *
74  * @param[in] pi2_tmp
75  *  Temporary buffer for scratch data and inverse transform 1st stage
76  *  output; written with aligned 128-bit stores, and the function also
77  *  uses regions starting 1024 and 2048 WORD16 entries into it
78  *
79  * @param[in] pu1_pred
80  *  Prediction 32x32 block
81  *
82  * @param[out] pu1_dst
83  *  Output 32x32 block
84  *
85  * @param[in] src_strd
86  *  Input stride
87  *
88  * @param[in] pred_strd
89  *  Prediction stride
90  *
91  * @param[in] dst_strd
92  *  Output Stride
93  *
94  * @param[in] zero_cols
95  *  Bit mask of zero columns in pi2_src; when bits 8-31 are all set
96  *  (last 24 columns zero) the 1st stage loop covers 8 instead of 32
97  *
98  * @param[in] zero_rows
99  *  Bit mask of zero rows in pi2_src; when bits 4-31 (last 28 rows) or
100  *  bits 8-31 (last 24 rows) are all set, a reduced 1st stage
101  *  computation is selected
102  *
103  *
104  * @returns  Void
105  *
106  * @remarks
107  *  None
108  *
109  *******************************************************************************
110  */
111 /**/
112 
113 void ihevc_itrans_recon_32x32_ssse3(WORD16 *pi2_src,
114                                     WORD16 *pi2_tmp,
115                                     UWORD8 *pu1_pred,
116                                     UWORD8 *pu1_dst,
117                                     WORD32 src_strd,
118                                     WORD32 pred_strd,
119                                     WORD32 dst_strd,
120                                     WORD32 zero_cols,
121                                     WORD32 zero_rows)
122 {
123     /* Inverse Transform */
124 
125     WORD32 j;
126 
127 
128     WORD16 *pi2_tmp_orig;
129 
130 
131     /*MEM_ALIGN16  WORD32 temp_array[1024];
132     MEM_ALIGN16  WORD16 temp1_array[1024];*/
133     WORD16 *o_temp_ptr;
134     WORD16 *temp_ptr;
135 
136     __m128i m_temp_reg_0;
137     __m128i m_temp_reg_1;
138     __m128i m_temp_reg_2;
139     __m128i m_temp_reg_3;
140     __m128i m_temp_reg_4;
141     __m128i m_temp_reg_5;
142     __m128i m_temp_reg_6;
143     __m128i m_temp_reg_7;
144     __m128i m_temp_reg_10;
145     __m128i m_temp_reg_11;
146     __m128i m_temp_reg_12;
147     __m128i m_temp_reg_13;
148     __m128i m_temp_reg_14;
149     __m128i m_temp_reg_15;
150     __m128i m_temp_reg_16;
151     __m128i m_temp_reg_17;
152     __m128i m_temp_reg_18;
153     __m128i m_temp_reg_19;
154     __m128i m_temp_reg_20;
155     __m128i m_temp_reg_21;
156     __m128i m_temp_reg_22;
157     __m128i m_temp_reg_23;
158     __m128i m_temp_reg_30;
159     __m128i m_temp_reg_31;
160     __m128i m_temp_reg_32;
161     __m128i m_temp_reg_33;
162     __m128i m_temp_reg_34;
163     __m128i m_temp_reg_35;
164     __m128i m_temp_reg_36;
165     __m128i m_temp_reg_37;
166     __m128i m_temp_reg_40;
167     __m128i m_temp_reg_41;
168     __m128i m_temp_reg_42;
169     __m128i m_temp_reg_43;
170     __m128i m_temp_reg_44;
171     __m128i m_temp_reg_45;
172     __m128i m_temp_reg_46;
173     __m128i m_temp_reg_47;
174 
175     __m128i m_temp_reg_70;
176     __m128i m_temp_reg_71;
177     __m128i m_temp_reg_72;
178     __m128i m_temp_reg_73;
179     __m128i m_temp_reg_74;
180     __m128i m_temp_reg_75;
181     __m128i m_temp_reg_76;
182     __m128i m_temp_reg_77;
183 
184     __m128i m_temp_reg_80;
185     __m128i m_temp_reg_81;
186     __m128i m_temp_reg_82;
187     __m128i m_temp_reg_83;
188     __m128i m_temp_reg_84;
189     __m128i m_temp_reg_85;
190     __m128i m_temp_reg_86;
191     __m128i m_temp_reg_87;
192 
193     __m128i m_temp_reg_90;
194     __m128i m_temp_reg_91;
195     __m128i m_temp_reg_92;
196     __m128i m_temp_reg_93;
197     __m128i m_temp_reg_94;
198     __m128i m_temp_reg_95;
199     __m128i m_temp_reg_96;
200     __m128i m_temp_reg_97;
201 
202     __m128i m_rdng_factor;
203     __m128i m_count;
204     __m128i m_coeff1, m_coeff2, m_coeff3, m_coeff4;
205     __m128i m_coeff5, m_coeff6, m_coeff7, m_coeff8;
206 
207     __m128i temp1, temp2, temp3, temp4;
208     __m128i temp5, temp6, temp7, temp8;
209 
210     __m128i all_zero_reg;
211     WORD32 i;
212 
213     /*Lokesh*/
214     WORD32  zero_last24_cols_stg1;
215     WORD32  zero_last24_rows_stg1;
216     WORD32  zero_last28_rows_stg1;
217 
218     WORD32  zero_last28_rows_stg2;
219     WORD32  zero_last24_rows_stg2;
220 
221     WORD32  trans_size_stg1;
222 
223     WORD32 i4_shift = IT_SHIFT_STAGE_1;
224     WORD32 trans_size = TRANS_SIZE_32;
225 
226 
227     /* Last 8 cols of 16x16 block are skipped based on the below flag : Lokesh */
228     zero_last24_cols_stg1 = ((zero_cols & 0xFFFFFF00) == 0xFFFFFF00) ? 1 : 0;
229     zero_last24_rows_stg1 = ((zero_rows & 0xFFFFFF00) == 0xFFFFFF00) ? 1 : 0;
230     zero_last28_rows_stg1 = ((zero_rows & 0xFFFFFFF0) == 0xFFFFFFF0) ? 1 : 0;
231 
232     zero_last28_rows_stg2 = ((zero_cols & 0xFFFFFFF0) == 0xFFFFFFF0) ? 1 : 0;
233     zero_last24_rows_stg2 = zero_last24_cols_stg1;
234 
235     if((zero_last28_rows_stg2) || (zero_last24_cols_stg1))
236     {
237         trans_size_stg1 = 8;
238 
239     }
240     else
241     {
242         trans_size_stg1 = 32;
243     }
244 
245     all_zero_reg = _mm_setzero_si128();
246 
247     o_temp_ptr  = pi2_tmp;
248     temp_ptr = (pi2_tmp + 1024);
249 
250     pi2_tmp += 2048;
251     pi2_tmp_orig = pi2_tmp;
252 
253     for(i = 0; i < trans_size_stg1; i += 8)
254     {
255 
256 
257         {
258             WORD16 *pi2_tmp_src = pi2_src;
259 
260             m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_tmp_src);
261             pi2_tmp_src += (src_strd << 1);
262             m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_tmp_src);
263             pi2_tmp_src += (src_strd << 1);
264             m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_tmp_src);
265             pi2_tmp_src += (src_strd << 1);
266             m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_tmp_src);
267             pi2_tmp_src += (src_strd << 1);
268             m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_tmp_src);
269             pi2_tmp_src += (src_strd << 1);
270             m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_tmp_src);
271             pi2_tmp_src += (src_strd << 1);
272             m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_tmp_src);
273             pi2_tmp_src += (src_strd << 1);
274             m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_tmp_src);
275             pi2_tmp_src += (src_strd << 1);
276 
277             m_temp_reg_80 = _mm_load_si128((__m128i *)pi2_tmp_src);
278             pi2_tmp_src += (src_strd << 1);
279             m_temp_reg_81 = _mm_load_si128((__m128i *)pi2_tmp_src);
280             pi2_tmp_src += (src_strd << 1);
281             m_temp_reg_82 = _mm_load_si128((__m128i *)pi2_tmp_src);
282             pi2_tmp_src += (src_strd << 1);
283             m_temp_reg_83 = _mm_load_si128((__m128i *)pi2_tmp_src);
284             pi2_tmp_src += (src_strd << 1);
285             m_temp_reg_84 = _mm_load_si128((__m128i *)pi2_tmp_src);
286             pi2_tmp_src += (src_strd << 1);
287             m_temp_reg_85 = _mm_load_si128((__m128i *)pi2_tmp_src);
288             pi2_tmp_src += (src_strd << 1);
289             m_temp_reg_86 = _mm_load_si128((__m128i *)pi2_tmp_src);
290             pi2_tmp_src += (src_strd << 1);
291             m_temp_reg_87 = _mm_load_si128((__m128i *)pi2_tmp_src);
292         }
293 
294         if(zero_last28_rows_stg1)
295         {
296             /* eeo */
297             /* eeeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
298             /* eeeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
299             {
300                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64
301 
302                 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
303 
304                 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
305 
306 /* eeeo[0]= m_temp_reg_20  */
307 /* eeeo[1]= m_temp_reg_21  */
308 /* eeee[0]= m_temp_reg_22  */
309 /* eeee[1]= m_temp_reg_23  */
310 
311                 /* eee[0] = eeee[0] + eeeo[0]; */
312                 m_temp_reg_40 = m_temp_reg_14;
313 
314                 /* eee[3] = eeee[0] - eeeo[0]; */
315                 m_temp_reg_43 = m_temp_reg_14;
316 
317                 /* eee[2] = eeee[1] - eeeo[1]; */
318                 m_temp_reg_42 = m_temp_reg_14; //m_temp_reg_16;
319 
320                 /* eee[1] = eeee[1] + eeeo[1];*/
321                 m_temp_reg_41 = m_temp_reg_14; //m_temp_reg_16;
322 
323                 m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
324 
325                 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
326 
327                 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
328 
329 /* eeeo[0]= m_temp_reg_20  */
330 /* eeeo[1]= m_temp_reg_21  */
331 /* eeee[0]= m_temp_reg_22  */
332 /* eeee[1]= m_temp_reg_23  */
333 
334                 /* eee[0] = eeee[0] + eeeo[0]; */
335                 m_temp_reg_44 = m_temp_reg_14;
336 
337                 /* eee[3] = eeee[0] - eeeo[0]; */
338                 m_temp_reg_47 = m_temp_reg_14;
339 
340                 /* eee[2] = eeee[1] - eeeo[1]; */
341                 m_temp_reg_46 = m_temp_reg_14; //m_temp_reg_16;
342 
343                 /* eee[1] = eeee[1] + eeeo[1];*/
344                 m_temp_reg_45 = m_temp_reg_14; //m_temp_reg_16;
345 
346 
347             }
348             /* eo */
349             {
350                 WORD16 *pi2_scratch = o_temp_ptr;
351 
352                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90
353                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87
354                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80
355                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70
356                 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57
357                 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43
358                 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25
359                 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //9
360 
361                 //m_temp_reg_10 = _mm_cvtepi16_epi32(m_temp_reg_71);
362                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, all_zero_reg);
363 
364                 m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
365 
366                 /* eo0[0-3] */
367                 {
368                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
369 
370                     //m_temp_reg_14 = _mm_cvtepi16_epi32(m_temp_reg_71);
371                     m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_71, all_zero_reg);
372 
373                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
374                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
375 
376                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
377                     pi2_scratch += 8;
378                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
379                     pi2_scratch += 8;
380 
381                 }
382 
383                 /* eo0[4-7] */
384                 {
385                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
386 
387                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
388                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
389 
390                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
391                     pi2_scratch += 8;
392                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
393                     pi2_scratch += 8;
394 
395                 }
396                 /* eo1[0-3] */
397                 {
398                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff2);
399 
400                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30);
401                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30);
402 
403                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
404                     pi2_scratch += 8;
405                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
406                     pi2_scratch += 8;
407 
408                 }
409 
410                 /* eo1[4-7] */
411                 {
412                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff2);
413 
414                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30);
415                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30);
416 
417                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
418                     pi2_scratch += 8;
419                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
420                     pi2_scratch += 8;
421 
422                 }
423 
424                 /* eo2[0-3] */
425                 {
426                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
427 
428                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
429                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
430 
431                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
432                     pi2_scratch += 8;
433                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
434                     pi2_scratch += 8;
435 
436                 }
437 
438                 /* eo2[4-7] */
439                 {
440                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
441 
442                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
443                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
444 
445                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
446                     pi2_scratch += 8;
447                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
448                     pi2_scratch += 8;
449 
450                 }
451 
452                 /**************************************************************************/
453 
454 
455                 /* eo3[0-3] */
456                 {
457                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff4);
458 
459                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30);
460                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30);
461 
462                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
463                     pi2_scratch += 8;
464                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
465                     pi2_scratch += 8;
466 
467                 }
468 
469                 /* eo3[4-7] */
470                 {
471                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff4);
472 
473                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30);
474                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30);
475 
476                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
477                     pi2_scratch += 8;
478                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
479                     pi2_scratch += 8;
480 
481                 }
482 
483 
484                 /* eo4[0-3] */
485                 {
486                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
487 
488                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30);
489                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30);
490 
491                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
492                     pi2_scratch += 8;
493                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
494                     pi2_scratch += 8;
495 
496                 }
497                 /* eo4[4-7] */
498                 {
499                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
500 
501                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30);
502                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30);
503 
504                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
505                     pi2_scratch += 8;
506                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
507                     pi2_scratch += 8;
508 
509                 }
510 
511                 /***********************************************************************/
512 
513                 /* eo5[0-3] */
514                 {
515                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff6);
516 
517                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
518                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
519 
520                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
521                     pi2_scratch += 8;
522                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
523                     pi2_scratch += 8;
524 
525                 }
526 
527 
528                 /* eo5[4-7] */
529                 {
530                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff6);
531 
532                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
533                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
534 
535                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
536                     pi2_scratch += 8;
537                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
538                     pi2_scratch += 8;
539 
540                 }
541 
542                 /* eo6[0-3] */
543                 {
544                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff7);
545 
546                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30);
547                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30);
548 
549                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
550                     pi2_scratch += 8;
551                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
552                     pi2_scratch += 8;
553 
554                 }
555 
556 
557                 /* eo6[4-7] */
558                 {
559                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff7);
560 
561                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30);
562                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30);
563 
564                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
565                     pi2_scratch += 8;
566                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
567                     pi2_scratch += 8;
568 
569                 }
570 
571 
572                 /* eo7[0-3] */
573                 {
574                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff8);
575 
576                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
577                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
578 
579                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
580                     pi2_scratch += 8;
581                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
582                     pi2_scratch += 8;
583 
584                 }
585 
586 
587                 /* eo7[4-7] */
588                 {
589                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff8);
590 
591                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
592                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
593 
594                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
595                     pi2_scratch += 8;
596                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
597                     pi2_scratch += 8;
598 
599                 }
600 
601             }
602         }
603         else if(zero_last24_rows_stg1)
604         {
605             {
606                 /* eeo */
607                 /* eeeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
608                 /* eeeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
609 
610                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83 36
611                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36 -83
612 
613                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64 64
614 
615                 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
616 
617                 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
618 
619                 /* eeeo[0]= m_temp_reg_20  */
620                 /* eeeo[1]= m_temp_reg_21  */
621                 /* eeee[0]= m_temp_reg_22  */
622                 /* eeee[1]= m_temp_reg_23  */
623 
624                 /* eee[0] = eeee[0] + eeeo[0]; */
625                 m_temp_reg_40 = m_temp_reg_14;
626 
627                 /* eee[3] = eeee[0] - eeeo[0]; */
628                 m_temp_reg_43 = m_temp_reg_14;
629 
630                 /* eee[2] = eeee[1] - eeeo[1]; */
631                 m_temp_reg_42 = m_temp_reg_14; //m_temp_reg_16;
632 
633                 /* eee[1] = eeee[1] + eeeo[1];*/
634                 m_temp_reg_41 = m_temp_reg_14; //m_temp_reg_16;
635 
636                 /* for row 4 to 7 */
637 
638                 m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
639 
640                 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
641 
642                 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
643 
644                 /* eeeo[0]= m_temp_reg_20  */
645                 /* eeeo[1]= m_temp_reg_21  */
646                 /* eeee[0]= m_temp_reg_22  */
647                 /* eeee[1]= m_temp_reg_23  */
648 
649                 /* eee[0] = eeee[0] + eeeo[0]; */
650                 m_temp_reg_44 = m_temp_reg_14;
651 
652                 /* eee[3] = eeee[0] - eeeo[0]; */
653                 m_temp_reg_47 = m_temp_reg_14;
654 
655                 /* eee[2] = eeee[1] - eeeo[1]; */
656                 m_temp_reg_46 = m_temp_reg_14; //m_temp_reg_16;
657 
658                 /* eee[1] = eeee[1] + eeeo[1];*/
659                 m_temp_reg_45 = m_temp_reg_14; //m_temp_reg_16;
660 
661 
662                 /* eeo[] */
663                 /* for(k = 0; k < 4; k++) */
664 
665                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75
666                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75
667                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18
668                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[5][0]); //50 18
669 
670                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, all_zero_reg);
671 
672                 m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8);
673 
674                 m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_72, all_zero_reg);
675 
676                 m_temp_reg_33 = _mm_setzero_si128();
677 
678                 /* eeo */
679                 {
680                     /* eeo0[0-3] */
681                     {
682                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
683 
684                         m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
685                         m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
686 
687                         m_temp_reg_90 = m_temp_reg_34;
688                         m_temp_reg_97 = m_temp_reg_35;
689                     }
690                     /* eeo0[4-7] */
691                     {
692                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
693 
694                         m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
695                         m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
696 
697                         m_temp_reg_91 = m_temp_reg_34;
698                         m_temp_reg_96 = m_temp_reg_35;
699 
700                     }
701 
702                     /* eeo1[0-3] */
703                     {
704                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff2);
705 
706                         /* e[1][0-3] stored in pi2_tmp[2][0-7] */
707                         /* e[6][0-3] stored in pi2_tmp[2][8-15] */
708                         m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30);
709                         m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30);
710 
711                         m_temp_reg_92 = m_temp_reg_34;
712                         m_temp_reg_95 = m_temp_reg_35;
713 
714                     }
715 
716                     /* eo1[4-7] */
717                     {
718                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff2);
719 
720                         /* e[1][4-7] stored in pi2_tmp[3][0-7] */
721                         /* e[6][4-7] stored in pi2_tmp[3][8-15] */
722                         m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30);
723                         m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30);
724 
725                         m_temp_reg_93 = m_temp_reg_34;
726                         m_temp_reg_94 = m_temp_reg_35;
727 
728 
729                     }
730 
731                     /* eo2[0-3] */
732                     {
733                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff4);
734 
735                         /* e[2][0-3] stored in pi2_tmp[4][0-7] */
736                         /* e[5][0-3] stored in pi2_tmp[4][8-15] */
737                         m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
738                         m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
739 
740                         temp1 = m_temp_reg_34;
741                         temp7 = m_temp_reg_35;
742 
743                     }
744 
745                     /* eo2[4-7] */
746                     {
747                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff4);
748 
749                         /* e[2][4-7] stored in pi2_tmp[5][0-7] */
750                         /* e[5][4-7] stored in pi2_tmp[5][8-15] */
751                         m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
752                         m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
753 
754                         temp2 = m_temp_reg_34;
755                         temp6 = m_temp_reg_35;
756 
757                     }
758 
759                     /* eo3[0-3] */
760                     {
761                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
762 
763                         /* e[3][0-3] stored in pi2_tmp[6][0-7] */
764                         /* e[4][0-3] stored in pi2_tmp[6][8-15] */
765                         m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30);
766                         m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30);
767 
768                         temp3 = m_temp_reg_34;
769                         temp5 = m_temp_reg_35;
770 
771                     }
772 
773 
774                     /* eo3[4-7] */
775                     {
776                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
777 
778                         /* e[3][4-7] stored in pi2_tmp[7][0-7] */
779                         /* e[4][4-7] stored in pi2_tmp[7][8-15] */
780                         m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30);
781                         m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30);
782 
783                         temp4 = m_temp_reg_34;
784                         temp8 = m_temp_reg_35;
785 
786 
787                     }
788                     /* All values of ee[] array in pi2_temp */
789 
790                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87
791                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[1][0]); //80 70
792                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57 43
793                     m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25 9
794 
795                     m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
796 
797                     m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
798                     m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8);
799 
800                 }
801             }
802             /* eo */
803             {
804                 WORD16 *pi2_scratch = o_temp_ptr;
805 
806                 /* eo0[0-3] */
807                 {
808                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
809 
810                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_30);
811                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_30);
812 
813                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
814                     pi2_scratch += 8;
815                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
816                     pi2_scratch += 8;
817 
818                 }
819 
820 
821                 /* eo0[4-7] */
822                 {
823                     m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
824 
825                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
826 
827                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_30);
828                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_30);
829 
830                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
831                     pi2_scratch += 8;
832                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
833                     pi2_scratch += 8;
834 
835                 }
836 
837                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87  57
838 
839                 /* eo1[0-3] */
840                 {
841                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
842 
843                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_30);
844                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_30);
845 
846                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
847                     pi2_scratch += 8;
848                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
849                     pi2_scratch += 8;
850 
851                 }
852 
853 
854                 /* eo1[4-7] */
855                 {
856                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
857 
858                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_30);
859                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_30);
860 
861                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
862                     pi2_scratch += 8;
863                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
864                     pi2_scratch += 8;
865 
866                 }
867 
868                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80  9
869 
870                 /* eo2[0-3] */
871                 {
872 
873                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
874 
875                     m_temp_reg_34 = _mm_add_epi32(temp1, m_temp_reg_30);
876                     m_temp_reg_35 = _mm_sub_epi32(temp1, m_temp_reg_30);
877 
878                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
879                     pi2_scratch += 8;
880                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
881                     pi2_scratch += 8;
882 
883                 }
884 
885                 /* eo2[4-7] */
886                 {
887 
888                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
889 
890                     m_temp_reg_34 = _mm_add_epi32(temp2, m_temp_reg_30);
891                     m_temp_reg_35 = _mm_sub_epi32(temp2, m_temp_reg_30);
892 
893                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
894                     pi2_scratch += 8;
895                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
896                     pi2_scratch += 8;
897 
898                 }
899 
900                 /**************************************************************************/
901 
902 
903 
904                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70  -43
905 
906                 /* eo3[0-3] */
907                 {
908 
909                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
910 
911                     m_temp_reg_34 = _mm_add_epi32(temp3, m_temp_reg_30);
912                     m_temp_reg_35 = _mm_sub_epi32(temp3, m_temp_reg_30);
913 
914                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
915                     pi2_scratch += 8;
916                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
917                     pi2_scratch += 8;
918 
919                 }
920 
921 
922                 /* eo3[4-7] */
923                 {
924 
925                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
926 
927                     m_temp_reg_34 = _mm_add_epi32(temp4, m_temp_reg_30);
928                     m_temp_reg_35 = _mm_sub_epi32(temp4, m_temp_reg_30);
929 
930                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
931                     pi2_scratch += 8;
932                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
933                     pi2_scratch += 8;
934 
935                 }
936 
937                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57  -80
938 
939                 /* eo4[0-3] */
940                 {
941                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
942 
943                     m_temp_reg_34 = _mm_add_epi32(temp5, m_temp_reg_30);
944                     m_temp_reg_35 = _mm_sub_epi32(temp5, m_temp_reg_30);
945 
946                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
947                     pi2_scratch += 8;
948                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
949                     pi2_scratch += 8;
950 
951                 }
952                 /* eo4[4-7] */
953                 {
954                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
955 
956                     m_temp_reg_34 = _mm_add_epi32(temp8, m_temp_reg_30);
957                     m_temp_reg_35 = _mm_sub_epi32(temp8, m_temp_reg_30);
958 
959                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
960                     pi2_scratch += 8;
961                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
962                     pi2_scratch += 8;
963 
964                 }
965 
966                 /***********************************************************************/
967 
968                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43  -90
969 
970                 /* eo5[0-3] */
971                 {
972 
973                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
974 
975                     m_temp_reg_34 = _mm_add_epi32(temp7, m_temp_reg_30);
976                     m_temp_reg_35 = _mm_sub_epi32(temp7, m_temp_reg_30);
977 
978                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
979                     pi2_scratch += 8;
980                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
981                     pi2_scratch += 8;
982 
983                 }
984 
985 
986                 /* eo5[4-7] */
987                 {
988                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
989 
990                     m_temp_reg_34 = _mm_add_epi32(temp6, m_temp_reg_30);
991                     m_temp_reg_35 = _mm_sub_epi32(temp6, m_temp_reg_30);
992 
993                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
994                     pi2_scratch += 8;
995                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
996                     pi2_scratch += 8;
997 
998                 }
999 
1000                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25  -70
1001 
1002                 /* eo6[0-3] */
1003                 {
1004                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1005 
1006                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_30);
1007                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_30);
1008 
1009                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1010                     pi2_scratch += 8;
1011                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1012                     pi2_scratch += 8;
1013 
1014                 }
1015 
1016 
1017                 /* eo6[4-7] */
1018                 {
1019 
1020                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1021 
1022                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_30);
1023                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_30);
1024 
1025                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1026                     pi2_scratch += 8;
1027                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1028                     pi2_scratch += 8;
1029 
1030                 }
1031 
1032                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); //9  -25
1033 
1034                 /* eo7[0-3] */
1035                 {
1036 
1037                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1038 
1039                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_30);
1040                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_30);
1041 
1042                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1043                     pi2_scratch += 8;
1044                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1045                     pi2_scratch += 8;
1046 
1047                 }
1048 
1049 
1050                 /* eo7[4-7] */
1051                 {
1052                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1053 
1054                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_30);
1055                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_30);
1056 
1057                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1058                     pi2_scratch += 8;
1059                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1060                     pi2_scratch += 8;
1061 
1062                 }
1063 
1064             }
1065 
1066         }
1067         else
1068         {
1069 
1070             {
1071                 /* eeo */
1072                 /* eeeo[0] stored in m_temp_reg_20, eeee[0] stored in m_temp_reg_21 */
1073                 /* eeeo[1] stored in m_temp_reg_22, eeee[1] stored in m_temp_reg_23 */
1074 
1075                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83 36
1076                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36 -83
1077 
1078                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64 64
1079                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[3][0]); //64 -64
1080 
1081                 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_84);
1082 
1083                 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_80);
1084 
1085                 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);  /* eeeo[0] */
1086                 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);  /* eeeo[1] */
1087 
1088                 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);  /* eeee[0] */
1089                 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff4);  /* eeee[1] */
1090 
1091 
1092                 /* eeeo[0]= m_temp_reg_20  */
1093                 /* eeee[0]= m_temp_reg_21  */
1094                 /* eeeo[1]= m_temp_reg_22  */
1095                 /* eeee[1]= m_temp_reg_23  */
1096 
1097                 /* eee[0] = eeee[0] + eeeo[0]; */
1098                 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);  /* eeeo[0] */
1099 
1100                 /* eee[3] = eeee[0] - eeeo[0]; */
1101                 m_temp_reg_43 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);  /* eeeo[1] */
1102 
1103                 /* eee[2] = eeee[1] - eeeo[1]; */
1104                 m_temp_reg_42 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_22);  /* eeee[1] */
1105 
1106                 /* eee[1] = eeee[1] + eeeo[1];*/
1107                 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_22);  /* eeee[0] */
1108 
1109                 /* for row 4 to 7 */
1110 
1111                 m_temp_reg_74 = _mm_srli_si128(m_temp_reg_74, 8);
1112                 m_temp_reg_84 = _mm_srli_si128(m_temp_reg_84, 8);
1113 
1114                 /* Interleaving row 8 and row 24*/
1115                 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_84);
1116 
1117                 m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
1118                 m_temp_reg_80 = _mm_srli_si128(m_temp_reg_80, 8);
1119 
1120                 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_80);
1121 
1122                 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);  /* eeeo[0] */
1123                 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);  /* eeeo[1] */
1124 
1125                 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);  /* eeee[0] */
1126                 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff4);  /* eeee[1] */
1127 
1128 
1129                 /* eeeo[0]= m_temp_reg_20  */
1130                 /* eeee[0]= m_temp_reg_21  */
1131                 /* eeeo[1]= m_temp_reg_22  */
1132                 /* eeee[1]= m_temp_reg_23  */
1133 
1134                 /* eee[0] = eeee[0] + eeeo[0]; */
1135                 m_temp_reg_44 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);  /* eeeo[0] */
1136 
1137                 /* eee[3] = eeee[0] - eeeo[0]; */
1138                 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);  /* eeeo[1] */
1139 
1140                 /* eee[2] = eeee[1] - eeeo[1]; */
1141                 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_22);  /* eeee[1] */
1142 
1143                 /* eee[1] = eeee[1] + eeeo[1];*/
1144                 m_temp_reg_45 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_22);  /* eeee[0] */
1145 
1146 
1147                 // eeo[]
1148                 /* for(k = 0; k < 4; k++) */
1149 
1150 
1151                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75
1152                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[5][0]); //50 18
1153 
1154                 /* eeo */
1155                 {
1156                     /* eeo0[0-3] */
1157                     {
1158                         m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
1159                         m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_86);
1160 
1161                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1162                         m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1163 
1164                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1165 
1166                         m_temp_reg_90 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
1167                         m_temp_reg_97 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
1168 
1169                     }
1170 
1171                     m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8);
1172                     m_temp_reg_76 = _mm_srli_si128(m_temp_reg_76, 8);
1173                     m_temp_reg_82 = _mm_srli_si128(m_temp_reg_82, 8);
1174                     m_temp_reg_86 = _mm_srli_si128(m_temp_reg_86, 8);
1175 
1176                     /* eeo0[4-7] */
1177                     {
1178                         m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
1179                         m_temp_reg_15 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_86);
1180 
1181                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1182                         m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
1183 
1184                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1185 
1186                         m_temp_reg_91 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
1187                         m_temp_reg_96 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
1188 
1189                     }
1190 
1191 
1192                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75 -18
1193                     m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[7][0]); //89  50
1194 
1195                     /* eeo1[0-3] */
1196                     {
1197                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
1198                         m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
1199 
1200                         m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30);
1201                         m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30);
1202 
1203                         m_temp_reg_92 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_31);
1204                         m_temp_reg_95 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_31);
1205 
1206                     }
1207 
1208                     /* eeo1[4-7] */
1209                     {
1210 
1211                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
1212                         m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff4);
1213 
1214                         m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30);
1215                         m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30);
1216 
1217                         m_temp_reg_93 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_31);
1218                         m_temp_reg_94 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_31);
1219 
1220 
1221                     }
1222 
1223                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[8][0]); //50 -89
1224                     m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18  75
1225 
1226                     /* eeo2[0-3] */
1227                     {
1228 
1229                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
1230                         m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
1231 
1232                         /* e[2][0-3] stored in pi2_tmp[4][0-7] */
1233                         /* e[5][0-3] stored in pi2_tmp[4][8-15] */
1234 
1235                         m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
1236                         m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
1237 
1238                         temp1 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31);
1239                         temp7 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31);
1240 
1241                     }
1242 
1243                     /* eeo2[4-7] */
1244                     {
1245 
1246                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
1247                         m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff4);
1248 
1249                         /* e[2][4-7] stored in pi2_tmp[5][0-7] */
1250                         /* e[5][4-7] stored in pi2_tmp[5][8-15] */
1251 
1252                         m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
1253                         m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
1254 
1255                         temp2 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31);
1256                         temp6 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31);
1257 
1258                     }
1259 
1260                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[10][0]); //18 -50
1261                     m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[11][0]); //75  -89
1262 
1263                     /* eeo3[0-3] */
1264                     {
1265 
1266                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
1267                         m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
1268 
1269                         /* e[3][0-3] stored in pi2_tmp[6][0-7] */
1270                         /* e[4][0-3] stored in pi2_tmp[6][8-15] */
1271 
1272                         m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30);
1273                         m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30);
1274 
1275                         temp3 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31);
1276                         temp5 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31);
1277 
1278 
1279                     }
1280 
1281                     /* eeo3[4-7] */
1282                     {
1283 
1284                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
1285                         m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff4);
1286 
1287                         /* e[3][4-7] stored in pi2_tmp[7][0-7] */
1288                         /* e[4][4-7] stored in pi2_tmp[7][8-15] */
1289 
1290                         m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30);
1291                         m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30);
1292                         temp4 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31);
1293                         temp8 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31);
1294 
1295                     }
1296 
1297 
1298                     /* All values of ee[] array in pi2_temp */
1299 
1300                     /* for(k = 0; k < 8; k++) */
1301                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87
1302                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[1][0]); //80 70
1303                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57 43
1304                     m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25 9
1305                 }
1306             }
1307             /* eo */
1308             {
1309                 WORD16 *pi2_scratch = o_temp_ptr;
1310 
1311                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
1312                 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
1313                 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_81, m_temp_reg_83);
1314                 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_85, m_temp_reg_87);
1315 
1316                 m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
1317                 m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8);
1318                 m_temp_reg_75 = _mm_srli_si128(m_temp_reg_75, 8);
1319                 m_temp_reg_77 = _mm_srli_si128(m_temp_reg_77, 8);
1320 
1321                 m_temp_reg_81 = _mm_srli_si128(m_temp_reg_81, 8);
1322                 m_temp_reg_83 = _mm_srli_si128(m_temp_reg_83, 8);
1323                 m_temp_reg_85 = _mm_srli_si128(m_temp_reg_85, 8);
1324                 m_temp_reg_87 = _mm_srli_si128(m_temp_reg_87, 8);
1325 
1326                 /* eo0[0-3] */
1327                 {
1328                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1329                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1330 
1331                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1332 
1333                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
1334                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
1335 
1336                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
1337 
1338                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1339 
1340                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_30);
1341                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_30);
1342 
1343                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1344                     pi2_scratch += 8;
1345                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1346                     pi2_scratch += 8;
1347 
1348                 }
1349                 /* eo0[4-7] */
1350                 {
1351                     m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
1352                     m_temp_reg_15 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
1353                     m_temp_reg_16 = _mm_unpacklo_epi16(m_temp_reg_81, m_temp_reg_83);
1354                     m_temp_reg_17 = _mm_unpacklo_epi16(m_temp_reg_85, m_temp_reg_87);
1355 
1356                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1357                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
1358 
1359                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1360 
1361                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
1362                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
1363 
1364                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
1365 
1366                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1367 
1368                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_30);
1369                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_30);
1370 
1371                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1372                     pi2_scratch += 8;
1373                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1374                     pi2_scratch += 8;
1375 
1376                 }
1377 
1378                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87  57
1379                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //0  -43
1380                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80  90
1381                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70  25
1382 
1383                 /* eo1[0-3] */
1384                 {
1385 
1386                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1387                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1388 
1389                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1390 
1391                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
1392                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
1393 
1394                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
1395 
1396                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_32);
1397 
1398                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_30);
1399                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_30);
1400 
1401                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1402                     pi2_scratch += 8;
1403                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1404                     pi2_scratch += 8;
1405 
1406                 }
1407 
1408                 /* eo1[4-7] */
1409                 {
1410                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1411                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
1412 
1413                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1414 
1415                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
1416                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
1417 
1418                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
1419 
1420                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_32);
1421 
1422                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_30);
1423                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_30);
1424 
1425                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1426                     pi2_scratch += 8;
1427                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1428                     pi2_scratch += 8;
1429 
1430                 }
1431 
1432                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80  9
1433                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[9][0]); //70  87
1434                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[10][0]); //-25  57
1435                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[11][0]); //90  43
1436 
1437                 /* eo2[0-3] */
1438                 {
1439                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1440                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1441 
1442                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31);
1443 
1444                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
1445                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
1446 
1447                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
1448 
1449                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1450 
1451                     m_temp_reg_34 = _mm_add_epi32(temp1, m_temp_reg_30);
1452                     m_temp_reg_35 = _mm_sub_epi32(temp1, m_temp_reg_30);
1453 
1454                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1455                     pi2_scratch += 8;
1456                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1457                     pi2_scratch += 8;
1458 
1459                 }
1460 
1461 
1462                 /* eo2[4-7] */
1463                 {
1464 
1465                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1466                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
1467 
1468                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31);
1469 
1470                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
1471                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
1472 
1473                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
1474 
1475                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1476 
1477                     m_temp_reg_34 = _mm_add_epi32(temp2, m_temp_reg_30);
1478                     m_temp_reg_35 = _mm_sub_epi32(temp2, m_temp_reg_30);
1479 
1480                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1481                     pi2_scratch += 8;
1482                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1483                     pi2_scratch += 8;
1484 
1485                 }
1486                 /**************************************************************************/
1487 
1488                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70  -43
1489                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[13][0]); //-87  9
1490                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[14][0]); //90  25
1491                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[15][0]); //80  57
1492                 /* Packed-table rows 12..15 stay in m_coeff1..4 for both eo3 halves below. */
1493                 /* eo3[0-3]: four madd products of m_temp_reg_10..13, then a butterfly against temp3 */
1494                 {
1495                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1496                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1497 
1498                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1499 
1500                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
1501                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
1502                     /* The m_coeff4 product is subtracted; row 15 is annotated 80 57 - presumably sign-flipped taps, TODO confirm in the table */
1503                     m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_32, m_temp_reg_33);
1504 
1505                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1506                     /* Butterfly: temp3 + eo3 -> m_temp_reg_34, temp3 - eo3 -> m_temp_reg_35 */
1507                     m_temp_reg_34 = _mm_add_epi32(temp3, m_temp_reg_30);
1508                     m_temp_reg_35 = _mm_sub_epi32(temp3, m_temp_reg_30);
1509                     /* Sum then difference go to consecutive slots; += 8 equals one 128-bit store if pi2_scratch is WORD16* (TODO confirm) */
1510                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1511                     pi2_scratch += 8;
1512                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1513                     pi2_scratch += 8;
1514 
1515                 }
1516 
1517 
1518                 /* eo3[4-7]: identical arithmetic on m_temp_reg_14..17, butterfly against temp4 */
1519                 {
1520                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1521                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
1522 
1523                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1524 
1525                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
1526                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
1527                     /* m_coeff4 product subtracted, same as in the [0-3] half */
1528                     m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_32, m_temp_reg_33);
1529 
1530                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1531                     /* Butterfly: temp4 + eo3 -> m_temp_reg_34, temp4 - eo3 -> m_temp_reg_35 */
1532                     m_temp_reg_34 = _mm_add_epi32(temp4, m_temp_reg_30);
1533                     m_temp_reg_35 = _mm_sub_epi32(temp4, m_temp_reg_30);
1534 
1535                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1536                     pi2_scratch += 8;
1537                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1538                     pi2_scratch += 8;
1539 
1540                 }
1541 
1542                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57  -80
1543                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[17][0]); //-25  90
1544                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[18][0]); //9  87
1545                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43  70
1546                 /* Packed-table rows 16..19 stay in m_coeff1..4 for both eo4 halves below. */
1547                 /* eo4[0-3]: four madd products of m_temp_reg_10..13, then a butterfly against temp5 */
1548                 {
1549 
1550                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1551                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1552 
1553                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1554 
1555                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
1556                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
1557                     /* Operands reversed: m_coeff4 product minus m_coeff3 product (row 18 is annotated 9 87 - presumably sign-flipped, TODO confirm) */
1558                     m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32);
1559 
1560                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1561                     /* Butterfly: temp5 + eo4 -> m_temp_reg_34, temp5 - eo4 -> m_temp_reg_35 */
1562                     m_temp_reg_34 = _mm_add_epi32(temp5, m_temp_reg_30);
1563                     m_temp_reg_35 = _mm_sub_epi32(temp5, m_temp_reg_30);
1564                     /* Sum then difference go to consecutive slots; += 8 equals one 128-bit store if pi2_scratch is WORD16* (TODO confirm) */
1565                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1566                     pi2_scratch += 8;
1567                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1568                     pi2_scratch += 8;
1569 
1570                 }
1571 
1572 
1573                 /* eo4[4-7]: identical arithmetic on m_temp_reg_14..17, butterfly against temp8 */
1574                 {
1575                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1576                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
1577 
1578                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1579 
1580                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
1581                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
1582                     /* m_coeff4 product minus m_coeff3 product, same as in the [0-3] half */
1583                     m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32);
1584 
1585                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1586                     /* NOTE(review): the [4-7] partner of temp5 is temp8, not temp6; both are assigned before this point - confirm the pairing there */
1587                     m_temp_reg_34 = _mm_add_epi32(temp8, m_temp_reg_30);
1588                     m_temp_reg_35 = _mm_sub_epi32(temp8, m_temp_reg_30);
1589 
1590                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1591                     pi2_scratch += 8;
1592                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1593                     pi2_scratch += 8;
1594 
1595                 }
1596 
1597                 /***********************************************************************/
1598 
1599                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43  -90
1600                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[21][0]); //57  25
1601                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[22][0]); //-87  70
1602                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[23][0]); //9  -80
1603                 /* Packed-table rows 20..23 stay in m_coeff1..4 for both eo5 halves below. */
1604                 /* eo5[0-3]: four madd products of m_temp_reg_10..13 (all added), then a butterfly against temp7 */
1605                 {
1606                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1607                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1608 
1609                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1610 
1611                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
1612                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
1613 
1614                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
1615 
1616                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1617                     /* Butterfly: temp7 + eo5 -> m_temp_reg_34, temp7 - eo5 -> m_temp_reg_35 */
1618                     m_temp_reg_34 = _mm_add_epi32(temp7, m_temp_reg_30);
1619                     m_temp_reg_35 = _mm_sub_epi32(temp7, m_temp_reg_30);
1620                     /* Sum then difference go to consecutive slots; += 8 equals one 128-bit store if pi2_scratch is WORD16* (TODO confirm) */
1621                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1622                     pi2_scratch += 8;
1623                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1624                     pi2_scratch += 8;
1625 
1626                 }
1627 
1628 
1629                 /* eo5[4-7]: identical arithmetic on m_temp_reg_14..17, butterfly against temp6 */
1630                 {
1631                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1632                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
1633 
1634                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1635 
1636                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
1637                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
1638 
1639                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
1640 
1641                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1642                     /* NOTE(review): the [4-7] partner of temp7 is temp6; both are assigned before this point - confirm the pairing there */
1643                     m_temp_reg_34 = _mm_add_epi32(temp6, m_temp_reg_30);
1644                     m_temp_reg_35 = _mm_sub_epi32(temp6, m_temp_reg_30);
1645 
1646                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1647                     pi2_scratch += 8;
1648                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1649                     pi2_scratch += 8;
1650 
1651                 }
1652 
1653                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25  -70
1654                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[25][0]); //90  -80
1655                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[26][0]); //43  9
1656                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[27][0]); //-57  87
1657                 /* Packed-table rows 24..27 stay in m_coeff1..4 for both eo6 halves below. */
1658                 /* eo6[0-3]: four madd products of m_temp_reg_10..13 (all added), then a butterfly against m_temp_reg_95 */
1659                 {
1660 
1661                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1662                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1663 
1664                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1665 
1666                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
1667                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
1668 
1669                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
1670 
1671                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1672                     /* Butterfly: m_temp_reg_95 + eo6 -> m_temp_reg_34, m_temp_reg_95 - eo6 -> m_temp_reg_35 */
1673                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_30);
1674                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_30);
1675                     /* Sum then difference go to consecutive slots; += 8 equals one 128-bit store if pi2_scratch is WORD16* (TODO confirm) */
1676                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1677                     pi2_scratch += 8;
1678                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1679                     pi2_scratch += 8;
1680 
1681                 }
1682 
1683 
1684                 /* eo6[4-7]: identical arithmetic on m_temp_reg_14..17, butterfly against m_temp_reg_94 */
1685                 {
1686                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1687                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
1688 
1689                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1690 
1691                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
1692                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
1693 
1694                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
1695 
1696                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1697                     /* Butterfly: m_temp_reg_94 + eo6 -> m_temp_reg_34, m_temp_reg_94 - eo6 -> m_temp_reg_35 */
1698                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_30);
1699                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_30);
1700 
1701                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1702                     pi2_scratch += 8;
1703                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1704                     pi2_scratch += 8;
1705 
1706                 }
1707 
1708                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); //9  -25
1709                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[29][0]); //43  -57
1710                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[30][0]); //70  -80
1711                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[31][0]); //87  -90
1712                 /* Packed-table rows 28..31 stay in m_coeff1..4 for both eo7 halves below. */
1713                 /* eo7[0-3]: four madd products of m_temp_reg_10..13 (all added), then a butterfly against m_temp_reg_97 */
1714                 {
1715 
1716                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1717                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1718 
1719                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1720 
1721                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
1722                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
1723 
1724                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
1725 
1726                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1727                     /* Butterfly: m_temp_reg_97 + eo7 -> m_temp_reg_34, m_temp_reg_97 - eo7 -> m_temp_reg_35 */
1728                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_30);
1729                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_30);
1730                     /* Sum then difference go to consecutive slots; += 8 equals one 128-bit store if pi2_scratch is WORD16* (TODO confirm) */
1731                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1732                     pi2_scratch += 8;
1733                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1734                     pi2_scratch += 8;
1735 
1736                 }
1737 
1738 
1739                 /* eo7[4-7]: identical arithmetic on m_temp_reg_14..17, butterfly against m_temp_reg_96 */
1740                 {
1741 
1742                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1743                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
1744 
1745                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1746 
1747                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
1748                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
1749 
1750                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
1751 
1752                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1753                     /* Butterfly: m_temp_reg_96 + eo7 -> m_temp_reg_34, m_temp_reg_96 - eo7 -> m_temp_reg_35 */
1754                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_30);
1755                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_30);
1756 
1757                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1758                     pi2_scratch += 8;
1759                     _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1760                     pi2_scratch += 8;
1761 
1762                 }
1763 
1764             }
1765 
1766         }
1767         /*  All e[] are done */
1768         /****************************/
1769 
1770 
1771         {
1772             /* Load 8 WORD16 values from each odd-numbered source row (1, 3, ..., 31) into m_temp_reg_70..77 and 80..87. */
1773             WORD16 *pi2_tmp_src = pi2_src + src_strd;
1774             /* The pointer steps by two rows per load; _mm_load_si128 requires every row start to be 16-byte aligned. */
1775             m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_tmp_src); /* row 1 */
1776             pi2_tmp_src += (src_strd << 1);
1777             m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_tmp_src); /* row 3 */
1778             pi2_tmp_src += (src_strd << 1);
1779             m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_tmp_src); /* row 5 */
1780             pi2_tmp_src += (src_strd << 1);
1781             m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_tmp_src); /* row 7 */
1782             pi2_tmp_src += (src_strd << 1);
1783             m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_tmp_src); /* row 9 */
1784             pi2_tmp_src += (src_strd << 1);
1785             m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_tmp_src); /* row 11 */
1786             pi2_tmp_src += (src_strd << 1);
1787             m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_tmp_src); /* row 13 */
1788             pi2_tmp_src += (src_strd << 1);
1789             m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_tmp_src); /* row 15 */
1790             pi2_tmp_src += (src_strd << 1);
1791             /* Second group of eight registers: rows 17..31 */
1792             m_temp_reg_80 = _mm_load_si128((__m128i *)pi2_tmp_src); /* row 17 */
1793             pi2_tmp_src += (src_strd << 1);
1794             m_temp_reg_81 = _mm_load_si128((__m128i *)pi2_tmp_src); /* row 19 */
1795             pi2_tmp_src += (src_strd << 1);
1796             m_temp_reg_82 = _mm_load_si128((__m128i *)pi2_tmp_src); /* row 21 */
1797             pi2_tmp_src += (src_strd << 1);
1798             m_temp_reg_83 = _mm_load_si128((__m128i *)pi2_tmp_src); /* row 23 */
1799             pi2_tmp_src += (src_strd << 1);
1800             m_temp_reg_84 = _mm_load_si128((__m128i *)pi2_tmp_src); /* row 25 */
1801             pi2_tmp_src += (src_strd << 1);
1802             m_temp_reg_85 = _mm_load_si128((__m128i *)pi2_tmp_src); /* row 27 */
1803             pi2_tmp_src += (src_strd << 1);
1804             m_temp_reg_86 = _mm_load_si128((__m128i *)pi2_tmp_src); /* row 29 */
1805             pi2_tmp_src += (src_strd << 1);
1806             m_temp_reg_87 = _mm_load_si128((__m128i *)pi2_tmp_src); /* row 31; the pointer is not advanced after the last load */
1807         }
1808 
1809         if(zero_last28_rows_stg1)
1810         {
1811             /* o & stage 1 out */
1812             {
1813                 WORD32 j;
1814                 WORD16 *pi2_src_scratch = o_temp_ptr;
1815                 WORD16 *pi2_dst_scratch = temp_ptr;
1816                 WORD32 out_stride = (trans_size << 1);
1817                 WORD32 in_stride = trans_size;
1818 
1819                 for(j = 0; j < 2; j++)
1820                 {
1821                     if(j) /* non-zero j: bring the upper 64 bits of both rows down; this overwrites m_temp_reg_70/71 */
1822                     {
1823                         m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
1824                         m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
1825                     }
1826                     /* Interleave the low four 16-bit lanes: {r70[0], r71[0], r70[1], r71[1], ...} so one madd covers both rows */
1827                     m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71);
1828 
1829                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
1830                     /* Only m_temp_reg_10 is multiplied in this block, so a single coefficient vector is loaded. */
1831                     /* o0[0-3] */
1832                     {
1833                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1834                         /* Four 32-bit values read back from scratch (presumably the e[] terms computed earlier - TODO confirm) */
1835                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
1836                         pi2_src_scratch += in_stride;
1837                         /* difference -> m_temp_reg_31, sum -> m_temp_reg_30 */
1838                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1839                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1840                         /* Broadcast 1 << (i4_shift - 1) to all four lanes; NOTE(review): rebuilt in every o-block although it depends only on i4_shift */
1841                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
1842                         m_count = _mm_cvtsi32_si128(i4_shift);
1843                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
1844                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
1845                         /* Add the rounding constant, then arithmetic shift right by i4_shift */
1846                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1847                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1848                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1849                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1850                         /* Saturating pack to 16 bits: sums in the low four lanes, differences in the high four */
1851                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1852 
1853                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1854                         pi2_dst_scratch += out_stride;
1855 
1856                     }
1857 
1858                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
1859 
1860                     /* o1[0-3]: same sequence as o0 with odd-table row 8 */
1861                     {
1862 
1863                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1864                         /* Four 32-bit values read back from scratch, one in_stride further on */
1865                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
1866                         pi2_src_scratch += in_stride;
1867                         /* difference -> m_temp_reg_31, sum -> m_temp_reg_30 */
1868                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1869                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1870                         /* Broadcast the rounding constant 1 << (i4_shift - 1); shift count goes in m_count */
1871                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
1872                         m_count = _mm_cvtsi32_si128(i4_shift);
1873                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
1874                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
1875                         /* Round and arithmetic shift right by i4_shift */
1876                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1877                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1878                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1879                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1880                         /* Saturating pack to 16 bits: sums in the low four lanes, differences in the high four */
1881                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1882 
1883                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1884                         pi2_dst_scratch += out_stride;
1885 
1886                     }
1887 
1888                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
1889 
1890                     /* o2[0-3]: same sequence as o0 with odd-table row 16 */
1891                     {
1892 
1893                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1894                         /* Four 32-bit values read back from scratch, one in_stride further on */
1895                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
1896                         pi2_src_scratch += in_stride;
1897                         /* difference -> m_temp_reg_31, sum -> m_temp_reg_30 */
1898                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1899                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1900                         /* Broadcast the rounding constant 1 << (i4_shift - 1); shift count goes in m_count */
1901                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
1902                         m_count = _mm_cvtsi32_si128(i4_shift);
1903                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
1904                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
1905                         /* Round and arithmetic shift right by i4_shift */
1906                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1907                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1908                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1909                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1910                         /* Saturating pack to 16 bits: sums in the low four lanes, differences in the high four */
1911                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1912 
1913                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1914                         pi2_dst_scratch += out_stride;
1915 
1916                     }
1917 
1918                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
1919 
1920                     /* o3[0-3]: same sequence as o0 with odd-table row 24 */
1921                     {
1922                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1923                         /* Four 32-bit values read back from scratch, one in_stride further on */
1924                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
1925                         pi2_src_scratch += in_stride;
1926                         /* difference -> m_temp_reg_31, sum -> m_temp_reg_30 */
1927                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1928                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1929                         /* Broadcast the rounding constant 1 << (i4_shift - 1); shift count goes in m_count */
1930                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
1931                         m_count = _mm_cvtsi32_si128(i4_shift);
1932                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
1933                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
1934                         /* Round and arithmetic shift right by i4_shift */
1935                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1936                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1937                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1938                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1939                         /* Saturating pack to 16 bits: sums in the low four lanes, differences in the high four */
1940                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1941 
1942                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1943                         pi2_dst_scratch += out_stride;
1944 
1945                     }
1946 
1947                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
1948 
1949                     /* o4[0-3]: odd-table row 32 */
1950                     {
1951                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1952                         /* Four 32-bit values read back from scratch, one in_stride further on */
1953                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
1954                         pi2_src_scratch += in_stride;
1955                         /* NOTE(review): sum -> m_temp_reg_31, difference -> m_temp_reg_30, the reverse of o0..o3; presumably tied to the table sign convention - confirm */
1956                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1957                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1958                         /* Broadcast the rounding constant 1 << (i4_shift - 1); shift count goes in m_count */
1959                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
1960                         m_count = _mm_cvtsi32_si128(i4_shift);
1961                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
1962                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
1963                         /* Round and arithmetic shift right by i4_shift */
1964                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1965                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1966                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1967                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1968                         /* Saturating pack to 16 bits: differences in the low four lanes, sums in the high four */
1969                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1970 
1971                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1972                         pi2_dst_scratch += out_stride;
1973 
1974                     }
1975 
1976                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
1977 
1978                     /* o5[0-3]: odd-table row 40 */
1979                     {
1980 
1981                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1982                         /* Four 32-bit values read back from scratch, one in_stride further on */
1983                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
1984                         pi2_src_scratch += in_stride;
1985                         /* sum -> m_temp_reg_31, difference -> m_temp_reg_30 */
1986                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1987                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1988                         /* Broadcast the rounding constant 1 << (i4_shift - 1); shift count goes in m_count */
1989                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
1990                         m_count = _mm_cvtsi32_si128(i4_shift);
1991                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
1992                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
1993                         /* Round and arithmetic shift right by i4_shift */
1994                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1995                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1996                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1997                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1998                         /* Saturating pack to 16 bits: differences in the low four lanes, sums in the high four */
1999                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2000 
2001                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2002                         pi2_dst_scratch += out_stride;
2003 
2004                     }
2005 
2006                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
2007 
2008                     /* o6[0-3]: odd-table row 48 */
2009                     {
2010                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2011                         /* Four 32-bit values read back from scratch, one in_stride further on */
2012                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2013                         pi2_src_scratch += in_stride;
2014                         /* sum -> m_temp_reg_31, difference -> m_temp_reg_30 */
2015                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2016                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2017                         /* Broadcast the rounding constant 1 << (i4_shift - 1); shift count goes in m_count */
2018                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2019                         m_count = _mm_cvtsi32_si128(i4_shift);
2020                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2021                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2022                         /* Round and arithmetic shift right by i4_shift */
2023                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2024                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2025                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2026                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2027                         /* Saturating pack to 16 bits: differences in the low four lanes, sums in the high four */
2028                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2029 
2030                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2031                         pi2_dst_scratch += out_stride;
2032 
2033                     }
2034 
2035                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
2036 
2037                     /* o7[0-3]: odd-table row 56; last block before the scratch pointers change direction */
2038                     {
2039 
2040                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2041                         /* Source pointer advances by 8 here, not by in_stride; the o8 block that follows steps it back by in_stride */
2042                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2043                         pi2_src_scratch += 8;
2044                         /* sum -> m_temp_reg_31, difference -> m_temp_reg_30 */
2045                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2046                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2047                         /* Broadcast the rounding constant 1 << (i4_shift - 1); shift count goes in m_count */
2048                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2049                         m_count = _mm_cvtsi32_si128(i4_shift);
2050                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2051                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2052                         /* Round and arithmetic shift right by i4_shift */
2053                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2054                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2055                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2056                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2057                         /* Saturating pack to 16 bits: differences in the low four lanes, sums in the high four */
2058                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2059                         /* Destination pointer likewise advances by 8, not by out_stride; the o8 block steps it back by out_stride */
2060                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2061                         pi2_dst_scratch += 8;
2062 
2063                     }
2064 
2065                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
2066 
2067                     /* o8[0-3]: odd-table row 64; from here both scratch pointers step backwards */
2068                     {
2069                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2070                         /* Read four 32-bit values, then move the source pointer back by in_stride */
2071                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2072                         pi2_src_scratch -= in_stride;
2073                         /* sum -> m_temp_reg_31, difference -> m_temp_reg_30 */
2074                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2075                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2076                         /* Broadcast the rounding constant 1 << (i4_shift - 1); shift count goes in m_count */
2077                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2078                         m_count = _mm_cvtsi32_si128(i4_shift);
2079                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2080                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2081                         /* Round and arithmetic shift right by i4_shift */
2082                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2083                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2084                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2085                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2086                         /* Saturating pack to 16 bits: differences in the low four lanes, sums in the high four */
2087                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2088                         /* Store, then move the destination pointer back by out_stride */
2089                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2090                         pi2_dst_scratch -= out_stride;
2091                     }
2092 
2093                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
2094 
2095                     /* o9[0-3]: odd-table row 72; scratch pointers keep stepping backwards */
2096                     {
2097                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2098                         /* Read four 32-bit values, then move the source pointer back by in_stride */
2099                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2100                         pi2_src_scratch -= in_stride;
2101                         /* sum -> m_temp_reg_31, difference -> m_temp_reg_30 */
2102                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2103                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2104                         /* Broadcast the rounding constant 1 << (i4_shift - 1); shift count goes in m_count */
2105                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2106                         m_count = _mm_cvtsi32_si128(i4_shift);
2107                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2108                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2109                         /* Round and arithmetic shift right by i4_shift */
2110                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2111                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2112                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2113                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2114                         /* Saturating pack to 16 bits: differences in the low four lanes, sums in the high four */
2115                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2116                         /* Store, then move the destination pointer back by out_stride */
2117                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2118                         pi2_dst_scratch -= out_stride;
2119                     }
2120 
2121                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
2122 
2123                     /* o10[0-3]: odd-table row 80; scratch pointers keep stepping backwards */
2124                     {
2125                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2126                         /* Read four 32-bit values, then move the source pointer back by in_stride */
2127                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2128                         pi2_src_scratch -= in_stride;
2129                         /* sum -> m_temp_reg_31, difference -> m_temp_reg_30 */
2130                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2131                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2132                         /* Broadcast the rounding constant 1 << (i4_shift - 1); shift count goes in m_count */
2133                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2134                         m_count = _mm_cvtsi32_si128(i4_shift);
2135                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2136                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2137                         /* Round and arithmetic shift right by i4_shift */
2138                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2139                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2140                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2141                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2142                         /* Saturating pack to 16 bits: differences in the low four lanes, sums in the high four */
2143                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2144                         /* Store, then move the destination pointer back by out_stride */
2145                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2146                         pi2_dst_scratch -= out_stride;
2147                     }
2148 
2149                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
2150 
2151                     /* o11[0-3] */
2152                     {
2153                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2154 
2155                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2156                         pi2_src_scratch -= in_stride;
2157 
2158                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2159                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2160 
2161                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2162                         m_count = _mm_cvtsi32_si128(i4_shift);
2163                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2164                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2165 
2166                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2167                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2168                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2169                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2170 
2171                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2172 
2173                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2174                         pi2_dst_scratch -= out_stride;
2175 
2176                     }
2177 
2178                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
2179 
2180                     /* o12[0-3] */
2181                     {
2182                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2183 
2184                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2185                         pi2_src_scratch -= in_stride;
2186 
2187                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2188                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2189 
2190                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2191                         m_count = _mm_cvtsi32_si128(i4_shift);
2192                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2193                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2194 
2195                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2196                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2197                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2198                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2199 
2200                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2201 
2202                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2203                         pi2_dst_scratch -= out_stride;
2204 
2205                     }
2206 
2207                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
2208 
2209                     /* o13[0-3] */
2210                     {
2211                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2212 
2213                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2214                         pi2_src_scratch -= in_stride;
2215 
2216                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2217                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2218 
2219                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2220                         m_count = _mm_cvtsi32_si128(i4_shift);
2221                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2222                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2223 
2224                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2225                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2226                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2227                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2228 
2229                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2230 
2231                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2232                         pi2_dst_scratch -= out_stride;
2233                     }
2234 
2235                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
2236 
2237                     /* o14[0-3] */
2238                     {
2239                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2240 
2241                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2242                         pi2_src_scratch -= in_stride;
2243 
2244                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2245                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2246 
2247                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2248                         m_count = _mm_cvtsi32_si128(i4_shift);
2249                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2250                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2251 
2252                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2253                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2254                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2255                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2256 
2257                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2258 
2259                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2260                         pi2_dst_scratch -= out_stride;
2261 
2262                     }
2263 
2264                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
2265 
2266                     /* o15[0-3] */
2267                     {
2268                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2269 
2270                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2271                         pi2_src_scratch += 8;
2272 
2273                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2274                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2275 
2276                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2277                         m_count = _mm_cvtsi32_si128(i4_shift);
2278                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2279                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2280 
2281                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2282                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2283                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2284                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2285 
2286                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2287 
2288                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2289                         pi2_dst_scratch += 8;
2290                     }
2291 
2292                 }
2293             }
2294         }
2295         else if(zero_last24_rows_stg1)
2296         {
2297             /* o & stage 1 out */
2298             {
2299                 WORD32 j;
2300                 WORD16 *pi2_src_scratch = o_temp_ptr;
2301                 WORD16 *pi2_dst_scratch = temp_ptr;
2302                 WORD32 out_stride = (trans_size << 1);
2303                 WORD32 in_stride = trans_size;
2304 
2305                 for(j = 0; j < 2; j++)
2306                 {
2307                     if(j)
2308                     {
2309                         m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
2310                         m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
2311                         m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8);
2312                         m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8);
2313                     }
2314 
2315                     m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved
2316                     m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 interleaved
2317 
2318                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
2319                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]);
2320 
2321                     /* o0[0-3] */
2322                     {
2323 
2324                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2325                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2326 
2327                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2328 
2329                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2330                         pi2_src_scratch += in_stride;
2331 
2332                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2333                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2334 
2335                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2336                         m_count = _mm_cvtsi32_si128(i4_shift);
2337                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2338                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2339 
2340                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2341                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2342                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2343                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2344 
2345                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2346 
2347                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2348                         pi2_dst_scratch += out_stride;
2349 
2350                     }
2351 
2352                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
2353                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]);
2354 
2355                     /* o1[0-3] */
2356                     {
2357                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2358                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2359 
2360                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2361 
2362                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2363                         pi2_src_scratch += in_stride;
2364 
2365                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2366                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2367 
2368                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2369                         m_count = _mm_cvtsi32_si128(i4_shift);
2370                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2371                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2372 
2373                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2374                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2375                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2376                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2377 
2378                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2379 
2380                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2381                         pi2_dst_scratch += out_stride;
2382 
2383                     }
2384 
2385                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
2386                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]);
2387 
2388                     /* o2[0-3] */
2389                     {
2390                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2391                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2392 
2393                         m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
2394 
2395                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2396                         pi2_src_scratch += in_stride;
2397 
2398                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2399                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2400 
2401                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2402                         m_count = _mm_cvtsi32_si128(i4_shift);
2403                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2404                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2405 
2406                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2407                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2408                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2409                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2410 
2411                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2412 
2413                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2414                         pi2_dst_scratch += out_stride;
2415 
2416                     }
2417 
2418                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
2419                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[25][0]);
2420 
2421                     /* o3[0-3] */
2422                     {
2423                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2424                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2425 
2426                         m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
2427 
2428                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2429                         pi2_src_scratch += in_stride;
2430 
2431                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2432                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2433 
2434                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2435                         m_count = _mm_cvtsi32_si128(i4_shift);
2436                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2437                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2438 
2439                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2440                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2441                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2442                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2443 
2444                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2445 
2446                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2447                         pi2_dst_scratch += out_stride;
2448 
2449                     }
2450 
2451                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
2452                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[33][0]);
2453 
2454                     /* o4[0-3] */
2455                     {
2456                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2457                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2458 
2459                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2460 
2461                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2462                         pi2_src_scratch += in_stride;
2463 
2464                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2465                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2466 
2467                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2468                         m_count = _mm_cvtsi32_si128(i4_shift);
2469                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2470                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2471 
2472                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2473                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2474                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2475                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2476 
2477                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2478 
2479                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2480                         pi2_dst_scratch += out_stride;
2481 
2482                     }
2483 
2484                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
2485                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[41][0]);
2486 
2487                     /* o5[0-3] */
2488                     {
2489                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2490                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2491 
2492                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2493 
2494                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2495                         pi2_src_scratch += in_stride;
2496 
2497                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2498                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2499 
2500                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2501                         m_count = _mm_cvtsi32_si128(i4_shift);
2502                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2503                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2504 
2505                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2506                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2507                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2508                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2509 
2510                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2511 
2512                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2513                         pi2_dst_scratch += out_stride;
2514 
2515                     }
2516 
2517                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
2518                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[49][0]);
2519 
2520                     /* o6[0-3] */
2521                     {
2522                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2523                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2524 
2525                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2526 
2527                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2528                         pi2_src_scratch += in_stride;
2529 
2530                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2531                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2532 
2533                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2534                         m_count = _mm_cvtsi32_si128(i4_shift);
2535                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2536                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2537 
2538                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2539                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2540                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2541                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2542 
2543                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2544 
2545                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2546                         pi2_dst_scratch += out_stride;
2547 
2548                     }
2549 
2550                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
2551                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[57][0]);
2552 
2553                     /* o7[0-3] */
2554                     {
2555                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2556                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2557 
2558                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2559 
2560                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2561                         pi2_src_scratch += 8;
2562 
2563                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2564                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2565 
2566                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2567                         m_count = _mm_cvtsi32_si128(i4_shift);
2568                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2569                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2570 
2571                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2572                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2573                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2574                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2575 
2576                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2577 
2578                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2579                         pi2_dst_scratch += 8;
2580 
2581                     }
2582 
2583                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
2584                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[65][0]);
2585 
2586                     /* o8[0-3] */
2587                     {
2588                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2589                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2590 
2591                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2592 
2593                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2594                         pi2_src_scratch -= in_stride;
2595 
2596                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2597                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2598 
2599                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2600                         m_count = _mm_cvtsi32_si128(i4_shift);
2601                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2602                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2603 
2604                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2605                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2606                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2607                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2608 
2609                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2610 
2611                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2612                         pi2_dst_scratch -= out_stride;
2613                     }
2614 
2615                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
2616                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[73][0]);
2617 
2618                     /* o9[0-3] */
2619                     {
2620                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2621                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2622 
2623                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2624 
2625                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2626                         pi2_src_scratch -= in_stride;
2627 
2628                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2629                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2630 
2631                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2632                         m_count = _mm_cvtsi32_si128(i4_shift);
2633                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2634                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2635 
2636                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2637                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2638                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2639                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2640 
2641                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2642 
2643                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2644                         pi2_dst_scratch -= out_stride;
2645                     }
2646 
2647                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
2648                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[81][0]);
2649 
2650                     /* o10[0-3] */
2651                     {
2652                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2653                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2654 
2655                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2656 
2657                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2658                         pi2_src_scratch -= in_stride;
2659 
2660                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2661                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2662 
2663                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2664                         m_count = _mm_cvtsi32_si128(i4_shift);
2665                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2666                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2667 
2668                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2669                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2670                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2671                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2672 
2673                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2674 
2675                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2676                         pi2_dst_scratch -= out_stride;
2677                     }
2678 
2679                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
2680                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[89][0]);
2681 
2682                     /* o11[0-3] */
2683                     {
2684 
2685                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2686                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2687 
2688                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2689 
2690                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2691                         pi2_src_scratch -= in_stride;
2692 
2693                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2694                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2695 
2696                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2697                         m_count = _mm_cvtsi32_si128(i4_shift);
2698                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2699                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2700 
2701                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2702                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2703                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2704                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2705 
2706                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2707 
2708                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2709                         pi2_dst_scratch -= out_stride;
2710 
2711                     }
2712 
2713                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
2714                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[97][0]);
2715 
2716                     /* o12[0-3] */
2717                     {
2718                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2719                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2720 
2721                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2722 
2723                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2724                         pi2_src_scratch -= in_stride;
2725 
2726                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2727                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2728 
2729                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2730                         m_count = _mm_cvtsi32_si128(i4_shift);
2731                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2732                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2733 
2734                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2735                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2736                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2737                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2738 
2739                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2740 
2741                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2742                         pi2_dst_scratch -= out_stride;
2743 
2744                     }
2745 
2746                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
2747                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[105][0]);
2748 
2749                     /* o13[0-3] */
2750                     {
2751                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2752                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2753 
2754                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2755 
2756                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2757                         pi2_src_scratch -= in_stride;
2758 
2759                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2760                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2761 
2762                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2763                         m_count = _mm_cvtsi32_si128(i4_shift);
2764                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2765                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2766 
2767                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2768                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2769                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2770                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2771 
2772                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2773 
2774                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2775                         pi2_dst_scratch -= out_stride;
2776                     }
2777 
2778                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
2779                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[113][0]);
2780 
2781                     /* o14[0-3] */
2782                     {
2783                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2784                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2785 
2786                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2787 
2788                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2789                         pi2_src_scratch -= in_stride;
2790 
2791                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2792                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2793 
2794                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2795                         m_count = _mm_cvtsi32_si128(i4_shift);
2796                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2797                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2798 
2799                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2800                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2801                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2802                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2803 
2804                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2805 
2806                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2807                         pi2_dst_scratch -= out_stride;
2808 
2809                     }
2810 
2811                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
2812                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[121][0]);
2813 
2814                     /* o15[0-3] */
2815                     {
2816                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2817                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2818 
2819                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2820 
2821                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2822                         pi2_src_scratch += 8;
2823 
2824                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2825                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2826 
2827                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2828                         m_count = _mm_cvtsi32_si128(i4_shift);
2829                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2830                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2831 
2832                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2833                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2834                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2835                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2836 
2837                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2838 
2839                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2840                         pi2_dst_scratch += 8;
2841                     }
2842 
2843                 }
2844             }
2845         }
2846         else
2847         {
2848             /* o & stage 1 out */
2849             {
2850                 WORD32 j;
2851                 WORD16 *pi2_src_scratch = o_temp_ptr;
2852                 WORD16 *pi2_dst_scratch = temp_ptr;
2853                 WORD32 out_stride = (trans_size << 1);
2854                 WORD32 in_stride = trans_size;
2855 
2856 
2857                 for(j = 0; j < 2; j++)
2858                 {
2859                     if(j)
2860                     {
2861                         m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
2862                         m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
2863                         m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8);
2864                         m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8);
2865                         m_temp_reg_74 = _mm_srli_si128(m_temp_reg_74, 8);
2866                         m_temp_reg_75 = _mm_srli_si128(m_temp_reg_75, 8);
2867                         m_temp_reg_76 = _mm_srli_si128(m_temp_reg_76, 8);
2868                         m_temp_reg_77 = _mm_srli_si128(m_temp_reg_77, 8);
2869 
2870                         m_temp_reg_80 = _mm_srli_si128(m_temp_reg_80, 8);
2871                         m_temp_reg_81 = _mm_srli_si128(m_temp_reg_81, 8);
2872                         m_temp_reg_82 = _mm_srli_si128(m_temp_reg_82, 8);
2873                         m_temp_reg_83 = _mm_srli_si128(m_temp_reg_83, 8);
2874                         m_temp_reg_84 = _mm_srli_si128(m_temp_reg_84, 8);
2875                         m_temp_reg_85 = _mm_srli_si128(m_temp_reg_85, 8);
2876                         m_temp_reg_86 = _mm_srli_si128(m_temp_reg_86, 8);
2877                         m_temp_reg_87 = _mm_srli_si128(m_temp_reg_87, 8);
2878                     }
2879 
2880                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
2881                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]);
2882                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[2][0]);
2883                     m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[3][0]);
2884                     m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[4][0]);
2885                     m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[5][0]);
2886                     m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[6][0]);
2887                     m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[7][0]);
2888 
2889                     m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved
2890                     m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 interleaved
2891                     m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 interleaved
2892                     m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 interleaved
2893                     temp1 = _mm_unpacklo_epi16(m_temp_reg_80, m_temp_reg_81); //row 17 and row 19 interleaved
2894                     temp2 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_83); //row 21 and row 23 interleaved
2895                     temp3 = _mm_unpacklo_epi16(m_temp_reg_84, m_temp_reg_85); //row 25 and row 27 interleaved
2896                     temp4 = _mm_unpacklo_epi16(m_temp_reg_86, m_temp_reg_87); //row 29 and row 31 interleaved
2897 
2898 
2899                     /* o0[0-3] */
2900                     {
2901                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2902                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2903                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
2904                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
2905 
2906                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2907                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
2908 
2909                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
2910 
2911                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
2912                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
2913                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
2914                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
2915 
2916                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
2917                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
2918 
2919                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
2920 
2921                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
2922 
2923                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2924                         pi2_src_scratch += in_stride;
2925 
2926                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2927                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2928 
2929                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2930                         m_count = _mm_cvtsi32_si128(i4_shift);
2931                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2932                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2933 
2934                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2935                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2936                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2937                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2938 
2939                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2940 
2941                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2942                         pi2_dst_scratch += out_stride;
2943 
2944                     }
2945 
2946                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
2947                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]);
2948                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[10][0]);
2949                     m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[11][0]);
2950                     m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[12][0]);
2951                     m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[13][0]);
2952                     m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[14][0]);
2953                     m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[15][0]);
2954 
2955 
2956                     /* o1[0-3] */
2957                     {
2958                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2959                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2960                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
2961                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
2962 
2963                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2964                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
2965 
2966                         m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_20);
2967 
2968                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
2969                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
2970                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
2971                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
2972 
2973                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
2974                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
2975 
2976                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
2977 
2978                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
2979 
2980                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
2981                         pi2_src_scratch += in_stride;
2982 
2983                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2984                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2985 
2986                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2987                         m_count = _mm_cvtsi32_si128(i4_shift);
2988                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2989                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2990 
2991                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2992                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2993                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2994                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2995 
2996                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2997 
2998                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2999                         pi2_dst_scratch += out_stride;
3000 
3001                     }
3002 
3003                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
3004                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]);
3005                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[18][0]);
3006                     m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[19][0]);
3007                     m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[20][0]);
3008                     m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[21][0]);
3009                     m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[22][0]);
3010                     m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[23][0]);
3011 
3012                     /* o2[0-3] */
3013                     {
3014                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3015                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3016                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3017                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3018 
3019                         m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
3020                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3021 
3022                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3023 
3024                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3025                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3026                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3027                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3028 
3029                         m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_41);
3030                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3031 
3032                         m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_42);
3033 
3034                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3035 
3036                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
3037                         pi2_src_scratch += in_stride;
3038 
3039                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3040                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3041 
3042                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3043                         m_count = _mm_cvtsi32_si128(i4_shift);
3044                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3045                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3046 
3047                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3048                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3049                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3050                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3051 
3052                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3053 
3054                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3055                         pi2_dst_scratch += out_stride;
3056 
3057                     }
3058 
3059 
3060                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
3061                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[25][0]);
3062                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[26][0]);
3063                     m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[27][0]);
3064                     m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[28][0]);
3065                     m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[29][0]);
3066                     m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[30][0]);
3067                     m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[31][0]);
3068 
3069                     /* o3[0-3] */
3070                     {
3071                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3072                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3073                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3074                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3075 
3076                         m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
3077                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3078 
3079                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3080 
3081                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3082                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3083                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3084                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3085 
3086                         m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_40);
3087                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3088 
3089                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3090 
3091                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3092 
3093                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
3094                         pi2_src_scratch += in_stride;
3095 
3096                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3097                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3098 
3099                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3100                         m_count = _mm_cvtsi32_si128(i4_shift);
3101                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3102                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3103 
3104                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3105                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3106                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3107                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3108 
3109                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3110 
3111                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3112                         pi2_dst_scratch += out_stride;
3113 
3114                     }
3115 
3116                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
3117                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[33][0]);
3118                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[34][0]);
3119                     m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[35][0]);
3120                     m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[36][0]);
3121                     m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[37][0]);
3122                     m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[38][0]);
3123                     m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[39][0]);
3124 
3125                     /* o4[0-3] */
3126                     {
3127                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3128                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3129                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3130                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3131 
3132                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3133                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3134 
3135                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3136 
3137                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3138                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3139                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3140                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3141 
3142                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
3143                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3144 
3145                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3146 
3147                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3148 
3149                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
3150                         pi2_src_scratch += in_stride;
3151 
3152                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3153                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3154 
3155                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3156                         m_count = _mm_cvtsi32_si128(i4_shift);
3157                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3158                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3159 
3160                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3161                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3162                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3163                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3164 
3165                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3166 
3167                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3168                         pi2_dst_scratch += out_stride;
3169 
3170                     }
3171 
3172 
3173                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
3174                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[41][0]);
3175                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[42][0]);
3176                     m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[43][0]);
3177                     m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[44][0]);
3178                     m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[45][0]);
3179                     m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[46][0]);
3180                     m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[47][0]);
3181 
3182                     /* o5[0-3] */
3183                     {
3184                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3185                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3186                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3187                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3188 
3189                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3190                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3191 
3192                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3193 
3194                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3195                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3196                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3197                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3198 
3199                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
3200                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3201 
3202                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3203 
3204                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3205 
3206                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
3207                         pi2_src_scratch += in_stride;
3208 
3209                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3210                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3211 
3212                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3213                         m_count = _mm_cvtsi32_si128(i4_shift);
3214                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3215                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3216 
3217                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3218                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3219                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3220                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3221 
3222                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3223 
3224                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3225                         pi2_dst_scratch += out_stride;
3226 
3227                     }
3228 
3229                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
3230                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[49][0]);
3231                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[50][0]);
3232                     m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[51][0]);
3233                     m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[52][0]);
3234                     m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[53][0]);
3235                     m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[54][0]);
3236                     m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[55][0]);
3237 
3238 
3239                     /* o6[0-3] */
3240                     {
3241                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3242                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3243                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3244                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3245 
3246                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3247                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3248 
3249                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3250 
3251                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3252                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3253                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3254                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3255 
3256                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
3257                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3258 
3259                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3260 
3261                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3262 
3263                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
3264                         pi2_src_scratch += in_stride;
3265 
3266                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3267                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3268 
3269                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3270                         m_count = _mm_cvtsi32_si128(i4_shift);
3271                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3272                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3273 
3274                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3275                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3276                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3277                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3278 
3279                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3280 
3281                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3282                         pi2_dst_scratch += out_stride;
3283 
3284                     }
3285 
3286                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
3287                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[57][0]);
3288                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[58][0]);
3289                     m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[59][0]);
3290                     m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[60][0]);
3291                     m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[61][0]);
3292                     m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[62][0]);
3293                     m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[63][0]);
3294 
3295                     /* o7[0-3] */
3296                     {
3297                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3298                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3299                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3300                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3301 
3302                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3303                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3304 
3305                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3306 
3307                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3308                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3309                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3310                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3311 
3312                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
3313                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3314 
3315                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3316 
3317                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3318 
3319                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
3320                         pi2_src_scratch += 8;
3321 
3322                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3323                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3324 
3325                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3326                         m_count = _mm_cvtsi32_si128(i4_shift);
3327                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3328                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3329 
3330                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3331                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3332                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3333                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3334 
3335                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3336 
3337                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3338                         pi2_dst_scratch += 8;
3339 
3340                     }
3341 
3342                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
3343                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[65][0]);
3344                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[66][0]);
3345                     m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[67][0]);
3346                     m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[68][0]);
3347                     m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[69][0]);
3348                     m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[70][0]);
3349                     m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[71][0]);
3350 
3351 
3352                     /* o8[0-3] */
3353                     {
3354 
3355                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3356                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3357                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3358                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3359 
3360                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3361                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3362 
3363                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3364 
3365                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3366                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3367                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3368                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3369 
3370                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
3371                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3372 
3373                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3374 
3375                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3376 
3377                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
3378                         pi2_src_scratch -= in_stride;
3379 
3380                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3381                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3382 
3383                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3384                         m_count = _mm_cvtsi32_si128(i4_shift);
3385                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3386                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3387 
3388                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3389                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3390                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3391                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3392 
3393                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3394 
3395                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3396                         pi2_dst_scratch -= out_stride;
3397                     }
3398 
3399                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
3400                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[73][0]);
3401                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[74][0]);
3402                     m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[75][0]);
3403                     m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[76][0]);
3404                     m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[77][0]);
3405                     m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[78][0]);
3406                     m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[79][0]);
3407 
3408 
3409                     /* o9[0-3] */
3410                     {
3411                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3412                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3413                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3414                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3415 
3416                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3417                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3418 
3419                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3420 
3421                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3422                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3423                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3424                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3425 
3426                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
3427                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3428 
3429                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3430 
3431                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3432 
3433                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
3434                         pi2_src_scratch -= in_stride;
3435 
3436                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3437                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3438 
3439                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3440                         m_count = _mm_cvtsi32_si128(i4_shift);
3441                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3442                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3443 
3444                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3445                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3446                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3447                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3448 
3449                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3450 
3451                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3452                         pi2_dst_scratch -= out_stride;
3453                     }
3454 
3455                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
3456                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[81][0]);
3457                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[82][0]);
3458                     m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[83][0]);
3459                     m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[84][0]);
3460                     m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[85][0]);
3461                     m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[86][0]);
3462                     m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[87][0]);
3463 
3464                     /* o10[0-3] */
3465                     {
3466                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3467                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3468                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3469                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3470 
3471                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3472                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3473 
3474                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3475 
3476                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3477                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3478                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3479                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3480 
3481                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
3482                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3483 
3484                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3485 
3486                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3487 
3488                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
3489                         pi2_src_scratch -= in_stride;
3490 
3491                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3492                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3493 
3494                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3495                         m_count = _mm_cvtsi32_si128(i4_shift);
3496                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3497                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3498 
3499                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3500                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3501                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3502                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3503 
3504                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3505 
3506                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3507                         pi2_dst_scratch -= out_stride;
3508                     }
3509 
3510                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
3511                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[89][0]);
3512                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[90][0]);
3513                     m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[91][0]);
3514                     m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[92][0]);
3515                     m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[93][0]);
3516                     m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[94][0]);
3517                     m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[95][0]);
3518 
3519                     /* o11[0-3] */
3520                     {
3521                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3522                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3523                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3524                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3525 
3526                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3527                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3528 
3529                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3530 
3531                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3532                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3533                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3534                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3535 
3536                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
3537                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3538 
3539                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3540 
3541                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3542 
3543                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
3544                         pi2_src_scratch -= in_stride;
3545 
3546                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3547                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3548 
3549                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3550                         m_count = _mm_cvtsi32_si128(i4_shift);
3551                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3552                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3553 
3554                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3555                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3556                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3557                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3558 
3559                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3560 
3561                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3562                         pi2_dst_scratch -= out_stride;
3563 
3564                     }
3565 
3566                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
3567                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[97][0]);
3568                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[98][0]);
3569                     m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[99][0]);
3570                     m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[100][0]);
3571                     m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[101][0]);
3572                     m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[102][0]);
3573                     m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[103][0]);
3574 
3575 
3576                     /* o12[0-3] */
3577                     {
3578                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3579                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3580                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3581                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3582 
3583                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3584                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3585 
3586                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3587 
3588                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3589                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3590                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3591                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3592 
3593                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
3594                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3595 
3596                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3597 
3598                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3599 
3600                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
3601                         pi2_src_scratch -= in_stride;
3602 
3603                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3604                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3605 
3606                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3607                         m_count = _mm_cvtsi32_si128(i4_shift);
3608                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3609                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3610 
3611                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3612                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3613                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3614                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3615 
3616                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3617 
3618                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3619                         pi2_dst_scratch -= out_stride;
3620 
3621                     }
3622 
3623                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
3624                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[105][0]);
3625                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[106][0]);
3626                     m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[107][0]);
3627                     m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[108][0]);
3628                     m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[109][0]);
3629                     m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[110][0]);
3630                     m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[111][0]);
3631 
3632 
3633                     /* o13[0-3] */
3634                     {
3635                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3636                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3637                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3638                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3639 
3640                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3641                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3642 
3643                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3644 
3645                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3646                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3647                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3648                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3649 
3650                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
3651                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3652 
3653                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3654 
3655                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3656 
3657                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
3658                         pi2_src_scratch -= in_stride;
3659 
3660                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3661                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3662 
3663                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3664                         m_count = _mm_cvtsi32_si128(i4_shift);
3665                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3666                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3667 
3668                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3669                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3670                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3671                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3672 
3673                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3674 
3675                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3676                         pi2_dst_scratch -= out_stride;
3677                     }
3678 
3679                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
3680                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[113][0]);
3681                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[114][0]);
3682                     m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[115][0]);
3683                     m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[116][0]);
3684                     m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[117][0]);
3685                     m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[118][0]);
3686                     m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[119][0]);
3687 
3688 
3689                     /* o14[0-3] */
3690                     {
3691                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3692                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3693                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3694                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3695 
3696                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3697                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3698 
3699                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3700 
3701                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3702                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3703                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3704                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3705 
3706                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
3707                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3708 
3709                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3710 
3711                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3712 
3713                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
3714                         pi2_src_scratch -= in_stride;
3715 
3716                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3717                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3718 
3719                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3720                         m_count = _mm_cvtsi32_si128(i4_shift);
3721                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3722                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3723 
3724                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3725                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3726                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3727                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3728 
3729                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3730 
3731                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3732                         pi2_dst_scratch -= out_stride;
3733 
3734                     }
3735 
3736                     m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
3737                     m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[121][0]);
3738                     m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[122][0]);
3739                     m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[123][0]);
3740                     m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[124][0]);
3741                     m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[125][0]);
3742                     m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[126][0]);
3743                     m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[127][0]);
3744 
3745                     /* o15[0-3] */
3746                     {
3747                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3748                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3749                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3750                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3751 
3752                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3753                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3754 
3755                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3756 
3757                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3758                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3759                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3760                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3761 
3762                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
3763                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3764 
3765                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3766 
3767                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3768 
3769                         m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
3770                         pi2_src_scratch += 8;
3771 
3772                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3773                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3774 
3775                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3776                         m_count = _mm_cvtsi32_si128(i4_shift);
3777                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3778                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3779 
3780                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3781                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3782                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3783                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3784 
3785                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3786 
3787                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3788                         pi2_dst_scratch += 8;
3789                     }
3790 
3791                 }
3792             }
3793         }
3794         /* Transpose */
3795         {
3796             WORD16 *pi2_src_scratch = temp_ptr;
3797             WORD16 *pi2_dst_scratch = pi2_tmp;
3798             WORD32 in_stride = (trans_size << 1);
3799 
3800             for(j = 0; j < 2; j++)
3801             {
3802                 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
3803                 pi2_src_scratch += in_stride;
3804                 m_temp_reg_31 = _mm_load_si128((__m128i *)pi2_src_scratch);
3805                 pi2_src_scratch += in_stride;
3806                 m_temp_reg_32 = _mm_load_si128((__m128i *)pi2_src_scratch);
3807                 pi2_src_scratch += in_stride;
3808                 m_temp_reg_33 = _mm_load_si128((__m128i *)pi2_src_scratch);
3809                 pi2_src_scratch += in_stride;
3810                 m_temp_reg_34 = _mm_load_si128((__m128i *)pi2_src_scratch);
3811                 pi2_src_scratch += in_stride;
3812                 m_temp_reg_35 = _mm_load_si128((__m128i *)pi2_src_scratch);
3813                 pi2_src_scratch += in_stride;
3814                 m_temp_reg_36 = _mm_load_si128((__m128i *)pi2_src_scratch);
3815                 pi2_src_scratch += in_stride;
3816                 m_temp_reg_37 = _mm_load_si128((__m128i *)pi2_src_scratch);
3817                 pi2_src_scratch += 8;
3818 
3819                 m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_src_scratch);
3820                 pi2_src_scratch -= in_stride;
3821                 m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_src_scratch);
3822                 pi2_src_scratch -= in_stride;
3823                 m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_src_scratch);
3824                 pi2_src_scratch -= in_stride;
3825                 m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_src_scratch);
3826                 pi2_src_scratch -= in_stride;
3827                 m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_src_scratch);
3828                 pi2_src_scratch -= in_stride;
3829                 m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_src_scratch);
3830                 pi2_src_scratch -= in_stride;
3831                 m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_src_scratch);
3832                 pi2_src_scratch -= in_stride;
3833                 m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_src_scratch);
3834                 pi2_src_scratch += 8;
3835 
3836 
3837                 m_temp_reg_40 = _mm_unpacklo_epi16(m_temp_reg_30, m_temp_reg_31);
3838                 m_temp_reg_41 = _mm_unpackhi_epi16(m_temp_reg_31, m_temp_reg_30);
3839 
3840                 m_temp_reg_42 = _mm_unpacklo_epi16(m_temp_reg_32, m_temp_reg_33);
3841                 m_temp_reg_43 = _mm_unpackhi_epi16(m_temp_reg_33, m_temp_reg_32);
3842 
3843                 m_temp_reg_44 = _mm_unpacklo_epi16(m_temp_reg_34, m_temp_reg_35);
3844                 m_temp_reg_45 = _mm_unpackhi_epi16(m_temp_reg_35, m_temp_reg_34);
3845 
3846                 m_temp_reg_46 = _mm_unpacklo_epi16(m_temp_reg_36, m_temp_reg_37);
3847                 m_temp_reg_47 = _mm_unpackhi_epi16(m_temp_reg_37, m_temp_reg_36);
3848 
3849                 m_temp_reg_80 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71);
3850                 m_temp_reg_81 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_70);
3851 
3852                 m_temp_reg_82 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73);
3853                 m_temp_reg_83 = _mm_unpackhi_epi16(m_temp_reg_73, m_temp_reg_72);
3854 
3855                 m_temp_reg_84 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75);
3856                 m_temp_reg_85 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_74);
3857 
3858                 m_temp_reg_86 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77);
3859                 m_temp_reg_87 = _mm_unpackhi_epi16(m_temp_reg_77, m_temp_reg_76);
3860 
3861                 /****************/
3862 
3863                 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_40, m_temp_reg_42);
3864                 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_40, m_temp_reg_42);
3865 
3866                 m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_44, m_temp_reg_46);
3867                 m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_44, m_temp_reg_46);
3868 
3869                 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_80, m_temp_reg_82);
3870                 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_80, m_temp_reg_82);
3871 
3872                 m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_84, m_temp_reg_86);
3873                 m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_84, m_temp_reg_86);
3874 
3875                 m_temp_reg_90 = _mm_unpacklo_epi32(m_temp_reg_43, m_temp_reg_41);
3876                 m_temp_reg_91 = _mm_unpackhi_epi32(m_temp_reg_43, m_temp_reg_41);
3877 
3878                 m_temp_reg_92 = _mm_unpacklo_epi32(m_temp_reg_47, m_temp_reg_45);
3879                 m_temp_reg_93 = _mm_unpackhi_epi32(m_temp_reg_47, m_temp_reg_45);
3880 
3881                 m_temp_reg_94 = _mm_unpacklo_epi32(m_temp_reg_83, m_temp_reg_81);
3882                 m_temp_reg_95 = _mm_unpackhi_epi32(m_temp_reg_83, m_temp_reg_81);
3883 
3884                 m_temp_reg_96 = _mm_unpacklo_epi32(m_temp_reg_87, m_temp_reg_85);
3885                 m_temp_reg_97 = _mm_unpackhi_epi32(m_temp_reg_87, m_temp_reg_85);
3886 
3887                 /******************/
3888 
3889                 m_temp_reg_30 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_2);
3890                 m_temp_reg_31 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_2);
3891 
3892                 m_temp_reg_32 = _mm_unpacklo_epi64(m_temp_reg_92, m_temp_reg_90);
3893                 m_temp_reg_33 = _mm_unpackhi_epi64(m_temp_reg_92, m_temp_reg_90);
3894 
3895                 m_temp_reg_34 = _mm_unpacklo_epi64(m_temp_reg_4, m_temp_reg_6);
3896                 m_temp_reg_35 = _mm_unpackhi_epi64(m_temp_reg_4, m_temp_reg_6);
3897 
3898                 m_temp_reg_36 = _mm_unpacklo_epi64(m_temp_reg_96, m_temp_reg_94);
3899                 m_temp_reg_37 = _mm_unpackhi_epi64(m_temp_reg_96, m_temp_reg_94);
3900 
3901                 m_temp_reg_80 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_3);
3902                 m_temp_reg_81 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_3);
3903 
3904                 m_temp_reg_82 = _mm_unpacklo_epi64(m_temp_reg_93, m_temp_reg_91);
3905                 m_temp_reg_83 = _mm_unpackhi_epi64(m_temp_reg_93, m_temp_reg_91);
3906 
3907                 m_temp_reg_84 = _mm_unpacklo_epi64(m_temp_reg_5, m_temp_reg_7);
3908                 m_temp_reg_85 = _mm_unpackhi_epi64(m_temp_reg_5, m_temp_reg_7);
3909 
3910                 m_temp_reg_86 = _mm_unpacklo_epi64(m_temp_reg_97, m_temp_reg_95);
3911                 m_temp_reg_87 = _mm_unpackhi_epi64(m_temp_reg_97, m_temp_reg_95);
3912 
3913                 _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size), m_temp_reg_30);
3914                 _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 8), m_temp_reg_34);
3915                 _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 16), m_temp_reg_36);
3916                 _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 24), m_temp_reg_32);
3917 
3918                 _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size), m_temp_reg_31);
3919                 _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 8), m_temp_reg_35);
3920                 _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 16), m_temp_reg_37);
3921                 _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 24), m_temp_reg_33);
3922 
3923                 _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size), m_temp_reg_80);
3924                 _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 8), m_temp_reg_84);
3925                 _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 16), m_temp_reg_86);
3926                 _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 24), m_temp_reg_82);
3927 
3928                 _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size), m_temp_reg_81);
3929                 _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 8), m_temp_reg_85);
3930                 _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 16), m_temp_reg_87);
3931                 _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 24), m_temp_reg_83);
3932 
3933                 pi2_dst_scratch += 4 * trans_size;
3934             }
3935         }
3936         pi2_src += 8;
3937 //      pi2_dequant_coeff +=8;
3938         pi2_tmp += 8 * trans_size;
3939         zero_cols = zero_cols >> 1;
3940     }
3941 
3942     if(trans_size_stg1 != TRANS_SIZE_32)
3943     {
3944         m_temp_reg_10 = _mm_setzero_si128();
3945 
3946         for(i = trans_size_stg1; i < 32; i += 8)
3947         {
3948             WORD16 *pi2_dst_scratch = pi2_tmp;
3949 
3950             _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size), m_temp_reg_10);
3951             _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 8), m_temp_reg_10);
3952             _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 16), m_temp_reg_10);
3953             _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 24), m_temp_reg_10);
3954 
3955             _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size), m_temp_reg_10);
3956             _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 8), m_temp_reg_10);
3957             _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 16), m_temp_reg_10);
3958             _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 24), m_temp_reg_10);
3959 
3960             _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size), m_temp_reg_10);
3961             _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 8), m_temp_reg_10);
3962             _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 16), m_temp_reg_10);
3963             _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 24), m_temp_reg_10);
3964 
3965             _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size), m_temp_reg_10);
3966             _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 8), m_temp_reg_10);
3967             _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 16), m_temp_reg_10);
3968             _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 24), m_temp_reg_10);
3969 
3970             _mm_store_si128((__m128i *)(pi2_dst_scratch + 4 * trans_size), m_temp_reg_10);
3971             _mm_store_si128((__m128i *)(pi2_dst_scratch + 4 * trans_size + 8), m_temp_reg_10);
3972             _mm_store_si128((__m128i *)(pi2_dst_scratch + 4 * trans_size + 16), m_temp_reg_10);
3973             _mm_store_si128((__m128i *)(pi2_dst_scratch + 4 * trans_size + 24), m_temp_reg_10);
3974 
3975             _mm_store_si128((__m128i *)(pi2_dst_scratch + 5 * trans_size), m_temp_reg_10);
3976             _mm_store_si128((__m128i *)(pi2_dst_scratch + 5 * trans_size + 8), m_temp_reg_10);
3977             _mm_store_si128((__m128i *)(pi2_dst_scratch + 5 * trans_size + 16), m_temp_reg_10);
3978             _mm_store_si128((__m128i *)(pi2_dst_scratch + 5 * trans_size + 24), m_temp_reg_10);
3979 
3980             _mm_store_si128((__m128i *)(pi2_dst_scratch + 6 * trans_size), m_temp_reg_10);
3981             _mm_store_si128((__m128i *)(pi2_dst_scratch + 6 * trans_size + 8), m_temp_reg_10);
3982             _mm_store_si128((__m128i *)(pi2_dst_scratch + 6 * trans_size + 16), m_temp_reg_10);
3983             _mm_store_si128((__m128i *)(pi2_dst_scratch + 6 * trans_size + 24), m_temp_reg_10);
3984 
3985             _mm_store_si128((__m128i *)(pi2_dst_scratch + 7 * trans_size), m_temp_reg_10);
3986             _mm_store_si128((__m128i *)(pi2_dst_scratch + 7 * trans_size + 8), m_temp_reg_10);
3987             _mm_store_si128((__m128i *)(pi2_dst_scratch + 7 * trans_size + 16), m_temp_reg_10);
3988             _mm_store_si128((__m128i *)(pi2_dst_scratch + 7 * trans_size + 24), m_temp_reg_10);
3989 
3990             pi2_tmp += 8 * trans_size;
3991         }
3992     }
3993 
3994     pi2_tmp = pi2_tmp_orig;
3995 
3996     /* Inverse Transform 2nd stage */
3997 
3998     for(j = 0; j < trans_size; j += 4)
3999     {
4000         i4_shift = IT_SHIFT_STAGE_2;
4001 
4002         /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
4003         if(zero_last28_rows_stg2)
4004         {
4005             {
4006 
4007                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87
4008                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87
4009                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80
4010                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70
4011                 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57
4012                 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43
4013                 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25
4014                 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //9
4015 
4016                 m_temp_reg_10 = _mm_loadu_si128((__m128i *)&pi2_tmp[2 * trans_size]);
4017 
4018                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_10, all_zero_reg);
4019 
4020                 /* eo0[0-3] */
4021                 {
4022                     m_temp_reg_90 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4023 
4024                 }
4025                 /* eo1[0-3] */
4026                 {
4027                     m_temp_reg_91 = _mm_madd_epi16(m_temp_reg_10, m_coeff2);
4028 
4029                 }
4030                 /* eo2[0-3] */
4031                 {
4032                     m_temp_reg_92 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
4033                 }
4034 
4035                 /* eo3[0-3] */
4036                 {
4037                     m_temp_reg_93 = _mm_madd_epi16(m_temp_reg_10, m_coeff4);
4038                 }
4039                 /* eo4[0-3] */
4040                 {
4041                     m_temp_reg_94 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
4042                 }
4043 
4044                 /* eo5[0-3] */
4045                 {
4046                     m_temp_reg_95 = _mm_madd_epi16(m_temp_reg_10, m_coeff6);
4047                 }
4048 
4049                 /* eo6[0-3] */
4050                 {
4051                     m_temp_reg_96 = _mm_madd_epi16(m_temp_reg_10, m_coeff7);
4052                 }
4053                 /* eo7[0-3] */
4054                 {
4055                     m_temp_reg_97 = _mm_madd_epi16(m_temp_reg_10, m_coeff8);
4056                 }
4057             }
4058 
4059             m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64
4060 
4061             m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[0 * trans_size]);
4062 
4063             m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
4064 
4065             m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
4066 
4067             m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
4068 
4069             /* e[]*/
4070 
4071             temp1 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_90);  /* ee[0] */
4072             temp2 = _mm_sub_epi32(m_temp_reg_14, m_temp_reg_90);  /* ee[15] */
4073 
4074             temp3 = _mm_add_epi32(m_temp_reg_16, m_temp_reg_91);  /* ee[1] */
4075             temp4 = _mm_sub_epi32(m_temp_reg_16, m_temp_reg_91);  /* ee[14] */
4076 
4077             temp5 = _mm_add_epi32(m_temp_reg_16, m_temp_reg_92);  /* ee[2] */
4078             temp6 = _mm_sub_epi32(m_temp_reg_16, m_temp_reg_92);  /* ee[13] */
4079 
4080             temp7 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_93);  /* ee[3] */
4081             temp8 = _mm_sub_epi32(m_temp_reg_14, m_temp_reg_93);  /* ee[12] */
4082 
4083             m_temp_reg_90 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_94);  /* ee[4] */
4084             m_temp_reg_91 = _mm_sub_epi32(m_temp_reg_14, m_temp_reg_94);  /* ee[11] */
4085 
4086             m_temp_reg_92 = _mm_add_epi32(m_temp_reg_16, m_temp_reg_95);  /* ee[5] */
4087             m_temp_reg_93 = _mm_sub_epi32(m_temp_reg_16, m_temp_reg_95);  /* ee[10] */
4088 
4089             m_temp_reg_94 = _mm_add_epi32(m_temp_reg_16, m_temp_reg_96);  /* ee[6] */
4090             m_temp_reg_95 = _mm_sub_epi32(m_temp_reg_16, m_temp_reg_96);  /* ee[9] */
4091 
4092             m_temp_reg_96 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_97);  /* ee[7] */
4093             m_temp_reg_97 = _mm_sub_epi32(m_temp_reg_14, m_temp_reg_97);  /* ee[8] */
4094 
4095             /*o[k]*/
4096             {
4097 
4098                 WORD16 *pi2_dst_scratch = temp_ptr;
4099                 WORD32 out_stride = 8;
4100 
4101                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
4102 
4103                 m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[trans_size]);
4104                 m_temp_reg_71 = _mm_loadu_si128((__m128i *)&pi2_tmp[3 * trans_size]);
4105 
4106                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved
4107 
4108 
4109                 /* o0[0-3] */
4110                 {
4111                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4112 
4113                     m_temp_reg_31 = _mm_sub_epi32(temp1, m_temp_reg_20);
4114                     m_temp_reg_30 = _mm_add_epi32(temp1, m_temp_reg_20);
4115 
4116                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4117                     m_count = _mm_cvtsi32_si128(i4_shift);
4118                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4119                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4120 
4121                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4122                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4123                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4124                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4125 
4126                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4127 
4128                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4129                     pi2_dst_scratch += out_stride;
4130 
4131                 }
4132 
4133                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
4134 
4135                 /* o1[0-3] */
4136                 {
4137                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4138 
4139                     m_temp_reg_31 = _mm_sub_epi32(temp3, m_temp_reg_20);
4140                     m_temp_reg_30 = _mm_add_epi32(temp3, m_temp_reg_20);
4141 
4142                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4143                     m_count = _mm_cvtsi32_si128(i4_shift);
4144                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4145                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4146 
4147                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4148                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4149                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4150                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4151 
4152                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4153 
4154                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4155                     pi2_dst_scratch += out_stride;
4156 
4157                 }
4158 
4159                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
4160 
4161                 /* o2[0-3] */
4162                 {
4163                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4164 
4165                     m_temp_reg_31 = _mm_sub_epi32(temp5, m_temp_reg_20);
4166                     m_temp_reg_30 = _mm_add_epi32(temp5, m_temp_reg_20);
4167 
4168                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4169                     m_count = _mm_cvtsi32_si128(i4_shift);
4170                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4171                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4172 
4173                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4174                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4175                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4176                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4177 
4178                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4179 
4180                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4181                     pi2_dst_scratch += out_stride;
4182 
4183                 }
4184 
4185                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
4186 
4187                 /* o3[0-3] */
4188                 {
4189                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4190 
4191                     m_temp_reg_31 = _mm_sub_epi32(temp7, m_temp_reg_20);
4192                     m_temp_reg_30 = _mm_add_epi32(temp7, m_temp_reg_20);
4193 
4194                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4195                     m_count = _mm_cvtsi32_si128(i4_shift);
4196                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4197                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4198 
4199                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4200                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4201                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4202                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4203 
4204                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4205 
4206                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4207                     pi2_dst_scratch += out_stride;
4208 
4209                 }
4210 
4211                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
4212 
4213                 /* o4[0-3] */
4214                 {
4215                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4216 
4217                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_20);
4218                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_20);
4219 
4220                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4221                     m_count = _mm_cvtsi32_si128(i4_shift);
4222                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4223                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4224 
4225                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4226                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4227                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4228                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4229 
4230                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4231 
4232                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4233                     pi2_dst_scratch += out_stride;
4234 
4235                 }
4236 
4237                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
4238 
4239                 /* o5[0-3] */
4240                 {
4241                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4242 
4243                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_20);
4244                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_20);
4245 
4246                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4247                     m_count = _mm_cvtsi32_si128(i4_shift);
4248                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4249                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4250 
4251                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4252                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4253                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4254                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4255 
4256                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4257 
4258                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4259                     pi2_dst_scratch += out_stride;
4260 
4261                 }
4262 
4263                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
4264 
4265                 /* o6[0-3] */
4266                 {
4267                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4268 
4269                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_20);
4270                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_20);
4271 
4272                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4273                     m_count = _mm_cvtsi32_si128(i4_shift);
4274                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4275                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4276 
4277                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4278                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4279                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4280                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4281 
4282                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4283 
4284                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4285                     pi2_dst_scratch += out_stride;
4286 
4287                 }
4288 
4289                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
4290 
4291                 /* o7[0-3] */
4292                 {
4293                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4294 
4295                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_20);
4296                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_20);
4297 
4298                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4299                     m_count = _mm_cvtsi32_si128(i4_shift);
4300                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4301                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4302 
4303                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4304                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4305                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4306                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4307 
4308                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4309 
4310                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4311                     pi2_dst_scratch += 8;
4312 
4313                 }
4314 
4315                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
4316 
4317                 /* o8[0-3] */
4318                 {
4319                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4320 
4321                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_20);
4322                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_20);
4323 
4324                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4325                     m_count = _mm_cvtsi32_si128(i4_shift);
4326                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4327                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4328 
4329                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4330                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4331                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4332                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4333 
4334                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4335 
4336                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4337                     pi2_dst_scratch += out_stride;
4338                 }
4339 
4340                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
4341 
4342                 /* o9[0-3] */
4343                 {
4344                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4345 
4346                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_20);
4347                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_20);
4348 
4349                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4350                     m_count = _mm_cvtsi32_si128(i4_shift);
4351                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4352                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4353 
4354                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4355                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4356                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4357                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4358 
4359                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4360 
4361                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4362                     pi2_dst_scratch += out_stride;
4363 
4364                 }
4365 
4366                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
4367 
4368                 /* o10[0-3] */
4369                 {
4370                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4371 
4372                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_20);
4373                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_20);
4374 
4375                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4376                     m_count = _mm_cvtsi32_si128(i4_shift);
4377                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4378                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4379 
4380                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4381                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4382                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4383                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4384 
4385                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4386 
4387                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4388                     pi2_dst_scratch += out_stride;
4389                 }
4390 
4391                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
4392 
4393                 /* o11[0-3] */
4394                 {
4395                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4396 
4397                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_20);
4398                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_20);
4399 
4400                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4401                     m_count = _mm_cvtsi32_si128(i4_shift);
4402                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4403                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4404 
4405                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4406                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4407                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4408                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4409 
4410                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4411 
4412                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4413                     pi2_dst_scratch += out_stride;
4414 
4415                 }
4416 
4417                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
4418 
4419                 /* o12[0-3] */
4420                 {
4421                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4422 
4423                     m_temp_reg_31 = _mm_add_epi32(temp8, m_temp_reg_20);
4424                     m_temp_reg_30 = _mm_sub_epi32(temp8, m_temp_reg_20);
4425 
4426                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4427                     m_count = _mm_cvtsi32_si128(i4_shift);
4428                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4429                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4430 
4431                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4432                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4433                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4434                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4435 
4436                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4437 
4438                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4439                     pi2_dst_scratch += out_stride;
4440 
4441                 }
4442 
4443                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
4444 
4445                 /* o13[0-3] */
4446                 {
4447                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4448 
4449                     m_temp_reg_31 = _mm_add_epi32(temp6, m_temp_reg_20);
4450                     m_temp_reg_30 = _mm_sub_epi32(temp6, m_temp_reg_20);
4451 
4452                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4453                     m_count = _mm_cvtsi32_si128(i4_shift);
4454                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4455                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4456 
4457                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4458                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4459                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4460                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4461 
4462                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4463 
4464                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4465                     pi2_dst_scratch += out_stride;
4466                 }
4467 
4468                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
4469 
4470                 /* o14[0-3] */
4471                 {
4472                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4473 
4474                     m_temp_reg_31 = _mm_add_epi32(temp4, m_temp_reg_20);
4475                     m_temp_reg_30 = _mm_sub_epi32(temp4, m_temp_reg_20);
4476 
4477                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4478                     m_count = _mm_cvtsi32_si128(i4_shift);
4479                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4480                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4481 
4482                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4483                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4484                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4485                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4486 
4487                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4488 
4489                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4490                     pi2_dst_scratch += out_stride;
4491 
4492                 }
4493 
4494                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
4495 
4496                 /* o15[0-3] */
4497                 {
4498                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4499 
4500                     m_temp_reg_31 = _mm_add_epi32(temp2, m_temp_reg_20);
4501                     m_temp_reg_30 = _mm_sub_epi32(temp2, m_temp_reg_20);
4502 
4503                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4504                     m_count = _mm_cvtsi32_si128(i4_shift);
4505                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4506                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4507 
4508                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4509                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4510                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4511                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4512 
4513                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4514 
4515                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4516                     pi2_dst_scratch += 8;
4517                 }
4518 
4519             }
4520 
4521         }
4522         else if(zero_last24_rows_stg2)
4523         {
4524             /* eo */
4525             {
4526                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87
4527 
4528                 m_temp_reg_10 = _mm_loadu_si128((__m128i *)&pi2_tmp[2 * trans_size]);
4529                 m_temp_reg_11 = _mm_loadu_si128((__m128i *)&pi2_tmp[6 * trans_size]);
4530 
4531                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_10, m_temp_reg_11);
4532 
4533 
4534                 /* eo0[0-3] */
4535                 {
4536                     m_temp_reg_90 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4537 
4538                 }
4539 
4540                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87  57
4541 
4542                 /* eo1[0-3] */
4543                 {
4544                     m_temp_reg_91 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4545 
4546                 }
4547                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80  9
4548 
4549                 /* eo2[0-3] */
4550                 {
4551                     m_temp_reg_92 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4552 
4553                 }
4554 
4555                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70  -43
4556 
4557                 /* eo3[0-3] */
4558                 {
4559 
4560                     m_temp_reg_93 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4561 
4562                 }
4563 
4564                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57  -80
4565 
4566                 /* eo4[0-3] */
4567                 {
4568                     m_temp_reg_94 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4569 
4570                 }
4571 
4572                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43  -90
4573 
4574                 /* eo5[0-3] */
4575                 {
4576                     m_temp_reg_95 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4577                 }
4578 
4579                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25  -70
4580                 /* eo6[0-3] */
4581                 {
4582                     m_temp_reg_96 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4583                 }
4584 
4585                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); //9  -25
4586                 /* eo7[0-3] */
4587                 {
4588                     m_temp_reg_97 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4589 
4590                 }
4591 
4592             }
4593 
4594             /* eeo: products of the row at pi2_tmp[4 * trans_size] with four coefficient vectors */
4595             {
4596 
4597                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75
4598                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75
4599                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18
4600                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[8][0]); //50
4601 
4602                 m_temp_reg_72 = _mm_loadu_si128((__m128i *)&pi2_tmp[4 * trans_size]);
4603 
4604                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, all_zero_reg); /* interleave the low four row values with all_zero_reg (presumably zero -- confirm) */
4605 
4606                 /* eeo0[0-3] */
4607                 {
4608                     temp1 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4609 
4610                 }
4611 
4612                 /* eeo1[0-3] */
4613                 {
4614                     temp2 = _mm_madd_epi16(m_temp_reg_10, m_coeff2);
4615 
4616                 }
4617 
4618                 /* eeo2[0-3] (note: uses m_coeff4, not m_coeff3) */
4619                 {
4620                     temp3 = _mm_madd_epi16(m_temp_reg_10, m_coeff4);
4621 
4622                 }
4623 
4624 
4625                 /* eeo3[0-3] (note: uses m_coeff3) */
4626                 {
4627                     temp4 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
4628 
4629                 }
4630 
4631             }
4632 
4633             m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83
4634             m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36  NOTE(review): m_coeff1/m_coeff2 are reloaded in the o[k] stage below without being used in between
4635             m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64
4636 
4637             m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[0 * trans_size]);
4638 
4639             //m_temp_reg_1 = _mm_cvtepi16_epi32(m_temp_reg_70);
4640             m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
4641 
4642             m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
4643             m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); /* same operands as m_temp_reg_14, so the two registers hold identical values */
4644 
4645             m_temp_reg_70 = _mm_add_epi32(m_temp_reg_14, temp1);  /* ee[0] */
4646             m_temp_reg_71 = _mm_sub_epi32(m_temp_reg_14, temp1);  /* ee[7] */
4647 
4648             m_temp_reg_72 = _mm_add_epi32(m_temp_reg_16, temp2);  /* ee[1] */
4649             m_temp_reg_73 = _mm_sub_epi32(m_temp_reg_16, temp2);  /* ee[6] */
4650 
4651             m_temp_reg_74 = _mm_add_epi32(m_temp_reg_16, temp3);  /* ee[2] */
4652             m_temp_reg_75 = _mm_sub_epi32(m_temp_reg_16, temp3);  /* ee[5] */
4653 
4654             m_temp_reg_76 = _mm_add_epi32(m_temp_reg_14, temp4);  /* ee[3] */
4655             m_temp_reg_77 = _mm_sub_epi32(m_temp_reg_14, temp4);  /* ee[4] */
4656 
4657             /* e[k] = ee[k] + eo[k], e[15 - k] = ee[k] - eo[k]; results overwrite temp1..temp8 and m_temp_reg_90..97 */
4658 
4659             temp1 = _mm_add_epi32(m_temp_reg_70, m_temp_reg_90);  /* e[0] */
4660             temp2 = _mm_sub_epi32(m_temp_reg_70, m_temp_reg_90);  /* e[15] */
4661 
4662             temp3 = _mm_add_epi32(m_temp_reg_72, m_temp_reg_91);  /* e[1] */
4663             temp4 = _mm_sub_epi32(m_temp_reg_72, m_temp_reg_91);  /* e[14] */
4664 
4665             temp5 = _mm_add_epi32(m_temp_reg_74, m_temp_reg_92);  /* e[2] */
4666             temp6 = _mm_sub_epi32(m_temp_reg_74, m_temp_reg_92);  /* e[13] */
4667 
4668             temp7 = _mm_add_epi32(m_temp_reg_76, m_temp_reg_93);  /* e[3] */
4669             temp8 = _mm_sub_epi32(m_temp_reg_76, m_temp_reg_93);  /* e[12] */
4670 
4671             m_temp_reg_90 = _mm_add_epi32(m_temp_reg_77, m_temp_reg_94);  /* e[4] */
4672             m_temp_reg_91 = _mm_sub_epi32(m_temp_reg_77, m_temp_reg_94);  /* e[11] */
4673 
4674             m_temp_reg_92 = _mm_add_epi32(m_temp_reg_75, m_temp_reg_95);  /* e[5] */
4675             m_temp_reg_93 = _mm_sub_epi32(m_temp_reg_75, m_temp_reg_95);  /* e[10] */
4676 
4677             m_temp_reg_94 = _mm_add_epi32(m_temp_reg_73, m_temp_reg_96);  /* e[6] */
4678             m_temp_reg_95 = _mm_sub_epi32(m_temp_reg_73, m_temp_reg_96);  /* e[9] */
4679 
4680             m_temp_reg_96 = _mm_add_epi32(m_temp_reg_71, m_temp_reg_97);  /* e[7] */
4681             m_temp_reg_97 = _mm_sub_epi32(m_temp_reg_71, m_temp_reg_97);  /* e[8] */
4682 
4683             /*o[k] */
4684             {
4685 
4686                 WORD16 *pi2_dst_scratch = temp_ptr;
4687                 WORD32 out_stride = 8;
4688 
4689                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
4690                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]);
4691 
4692                 m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[trans_size]);
4693                 m_temp_reg_71 = _mm_loadu_si128((__m128i *)&pi2_tmp[3 * trans_size]);
4694                 m_temp_reg_72 = _mm_loadu_si128((__m128i *)&pi2_tmp[5 * trans_size]);
4695                 m_temp_reg_73 = _mm_loadu_si128((__m128i *)&pi2_tmp[7 * trans_size]);
4696 
4697                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71);
4698                 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73);
4699 
4700                 /* o0[0-3] */
4701                 {
4702                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4703                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
4704 
4705                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
4706 
4707                     m_temp_reg_31 = _mm_sub_epi32(temp1, m_temp_reg_20);
4708                     m_temp_reg_30 = _mm_add_epi32(temp1, m_temp_reg_20);
4709 
4710                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4711                     m_count = _mm_cvtsi32_si128(i4_shift);
4712                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4713                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4714 
4715                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4716                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4717                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4718                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4719 
4720                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4721 
4722                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4723                     pi2_dst_scratch += out_stride;
4724 
4725                 }
4726 
4727 
4728                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
4729                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]);
4730 
4731                 /* o1[0-3] */
4732                 {
4733                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4734                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
4735 
4736                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
4737 
4738                     m_temp_reg_31 = _mm_sub_epi32(temp3, m_temp_reg_20);
4739                     m_temp_reg_30 = _mm_add_epi32(temp3, m_temp_reg_20);
4740 
4741                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4742                     m_count = _mm_cvtsi32_si128(i4_shift);
4743                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4744                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4745 
4746                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4747                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4748                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4749                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4750 
4751                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4752 
4753                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4754                     pi2_dst_scratch += out_stride;
4755 
4756                 }
4757 
4758                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
4759                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]);
4760 
4761                 /* o2[0-3] */
4762                 {
4763                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4764                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
4765 
4766                     m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
4767 
4768                     m_temp_reg_31 = _mm_add_epi32(temp5, m_temp_reg_20);
4769                     m_temp_reg_30 = _mm_sub_epi32(temp5, m_temp_reg_20);
4770 
4771                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4772                     m_count = _mm_cvtsi32_si128(i4_shift);
4773                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4774                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4775 
4776                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4777                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4778                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4779                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4780 
4781                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4782 
4783                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4784                     pi2_dst_scratch += out_stride;
4785 
4786                 }
4787 
4788                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
4789                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[25][0]);
4790 
4791                 /* o3[0-3] */
4792                 {
4793                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4794                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
4795 
4796                     m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
4797 
4798                     m_temp_reg_31 = _mm_add_epi32(temp7, m_temp_reg_20);
4799                     m_temp_reg_30 = _mm_sub_epi32(temp7, m_temp_reg_20);
4800 
4801                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4802                     m_count = _mm_cvtsi32_si128(i4_shift);
4803                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4804                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4805 
4806                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4807                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4808                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4809                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4810 
4811                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4812 
4813                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4814                     pi2_dst_scratch += out_stride;
4815 
4816                 }
4817 
4818                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
4819                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[33][0]);
4820 
4821                 /* o4[0-3] */
4822                 {
4823                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4824                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
4825 
4826                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
4827 
4828                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_20);
4829                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_20);
4830 
4831                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4832                     m_count = _mm_cvtsi32_si128(i4_shift);
4833                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4834                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4835 
4836                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4837                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4838                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4839                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4840 
4841                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4842 
4843                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4844                     pi2_dst_scratch += out_stride;
4845 
4846                 }
4847 
4848                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
4849                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[41][0]);
4850 
4851                 /* o5[0-3] */
4852                 {
4853                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4854                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
4855 
4856                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
4857 
4858                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_20);
4859                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_20);
4860 
4861                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4862                     m_count = _mm_cvtsi32_si128(i4_shift);
4863                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4864                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4865 
4866                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4867                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4868                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4869                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4870 
4871                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4872 
4873                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4874                     pi2_dst_scratch += out_stride;
4875 
4876                 }
4877 
4878                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
4879                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[49][0]);
4880 
4881                 /* o6[0-3] */
4882                 {
4883                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4884                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
4885 
4886                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
4887 
4888                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_20);
4889                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_20);
4890 
4891                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4892                     m_count = _mm_cvtsi32_si128(i4_shift);
4893                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4894                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4895 
4896                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4897                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4898                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4899                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4900 
4901                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4902 
4903                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4904                     pi2_dst_scratch += out_stride;
4905 
4906                 }
4907 
4908                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
4909                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[57][0]);
4910 
4911                 /* o7[0-3] */
4912                 {
4913                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4914                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
4915 
4916                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
4917 
4918                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_20);
4919                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_20);
4920 
4921                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4922                     m_count = _mm_cvtsi32_si128(i4_shift);
4923                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4924                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4925 
4926                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4927                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4928                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4929                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4930 
4931                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4932 
4933                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4934                     pi2_dst_scratch += 8;
4935 
4936                 }
4937 
4938                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
4939                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[65][0]);
4940 
4941                 /* o8[0-3] */
4942                 {
4943                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4944                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
4945 
4946                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
4947 
4948                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_20);
4949                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_20);
4950 
4951                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4952                     m_count = _mm_cvtsi32_si128(i4_shift);
4953                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4954                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4955 
4956                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4957                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4958                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4959                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4960 
4961                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4962 
4963                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4964                     pi2_dst_scratch += out_stride;
4965                 }
4966 
4967                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
4968                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[73][0]);
4969 
4970                 /* o9[0-3] */
4971                 {
4972                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4973                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
4974 
4975                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
4976 
4977                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_20);
4978                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_20);
4979 
4980                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4981                     m_count = _mm_cvtsi32_si128(i4_shift);
4982                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4983                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4984 
4985                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4986                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4987                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4988                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4989 
4990                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4991 
4992                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4993                     pi2_dst_scratch += out_stride;
4994                 }
4995 
4996                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
4997                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[81][0]);
4998 
4999                 /* o10[0-3] */
5000                 {
5001                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5002                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5003 
5004                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
5005 
5006                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_20);
5007                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_20);
5008 
5009                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
5010                     m_count = _mm_cvtsi32_si128(i4_shift);
5011                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
5012                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
5013 
5014                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5015                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5016                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
5017                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5018 
5019                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
5020 
5021                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
5022                     pi2_dst_scratch += out_stride;
5023                 }
5024 
5025                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
5026                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[89][0]);
5027 
5028                 /* o11[0-3] */
5029                 {
5030                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5031                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5032 
5033                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
5034 
5035                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_20);
5036                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_20);
5037 
5038                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
5039                     m_count = _mm_cvtsi32_si128(i4_shift);
5040                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
5041                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
5042 
5043                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5044                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5045                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
5046                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5047 
5048                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
5049 
5050                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
5051                     pi2_dst_scratch += out_stride;
5052 
5053                 }
5054 
5055                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
5056                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[97][0]);
5057 
5058                 /* o12[0-3] */
5059                 {
5060                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5061                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5062 
5063                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
5064 
5065                     m_temp_reg_31 = _mm_add_epi32(temp8, m_temp_reg_20);
5066                     m_temp_reg_30 = _mm_sub_epi32(temp8, m_temp_reg_20);
5067 
5068                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
5069                     m_count = _mm_cvtsi32_si128(i4_shift);
5070                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
5071                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
5072 
5073                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5074                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5075                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
5076                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5077 
5078                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
5079 
5080                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
5081                     pi2_dst_scratch += out_stride;
5082 
5083                 }
5084 
5085                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
5086                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[105][0]);
5087 
5088                 /* o13[0-3] */
5089                 {
5090                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5091                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5092 
5093                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
5094 
5095                     m_temp_reg_31 = _mm_add_epi32(temp6, m_temp_reg_20);
5096                     m_temp_reg_30 = _mm_sub_epi32(temp6, m_temp_reg_20);
5097 
5098                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
5099                     m_count = _mm_cvtsi32_si128(i4_shift);
5100                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
5101                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
5102 
5103                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5104                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5105                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
5106                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5107 
5108                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
5109 
5110                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
5111                     pi2_dst_scratch += out_stride;
5112                 }
5113 
5114                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
5115                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[113][0]);
5116 
5117                 /* o14[0-3] */
5118                 {
5119                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5120                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5121 
5122                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
5123 
5124                     m_temp_reg_31 = _mm_add_epi32(temp4, m_temp_reg_20);
5125                     m_temp_reg_30 = _mm_sub_epi32(temp4, m_temp_reg_20);
5126 
5127                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
5128                     m_count = _mm_cvtsi32_si128(i4_shift);
5129                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
5130                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
5131 
5132                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5133                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5134                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
5135                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5136 
5137                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
5138 
5139                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
5140                     pi2_dst_scratch += out_stride;
5141                 }
5142 
5143                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
5144                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[121][0]);
5145 
5146                 /* o15[0-3] */
5147                 {
5148                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5149                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5150 
5151                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
5152 
5153                     m_temp_reg_31 = _mm_add_epi32(temp2, m_temp_reg_20);
5154                     m_temp_reg_30 = _mm_sub_epi32(temp2, m_temp_reg_20);
5155 
5156                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
5157                     m_count = _mm_cvtsi32_si128(i4_shift);
5158                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
5159                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
5160 
5161                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5162                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5163                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
5164                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5165 
5166                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
5167 
5168                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
5169                     pi2_dst_scratch += 8;
5170                 }
5171 
5172             }
5173         }
5174         else
5175         {
5176             /* eo */
5177             {
5178 
5179                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87
5180                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[1][0]); //80 70
5181                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57 43
5182                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25 9
5183 
5184 
5185                 m_temp_reg_10 = _mm_loadu_si128((__m128i *)&pi2_tmp[2 * trans_size]);
5186                 m_temp_reg_11 = _mm_loadu_si128((__m128i *)&pi2_tmp[6 * trans_size]);
5187                 m_temp_reg_12 = _mm_loadu_si128((__m128i *)&pi2_tmp[10 * trans_size]);
5188                 m_temp_reg_13 = _mm_loadu_si128((__m128i *)&pi2_tmp[14 * trans_size]);
5189                 m_temp_reg_18 = _mm_loadu_si128((__m128i *)&pi2_tmp[18 * trans_size]);
5190                 m_temp_reg_19 = _mm_loadu_si128((__m128i *)&pi2_tmp[22 * trans_size]);
5191                 m_temp_reg_20 = _mm_loadu_si128((__m128i *)&pi2_tmp[26 * trans_size]);
5192                 m_temp_reg_21 = _mm_loadu_si128((__m128i *)&pi2_tmp[30 * trans_size]);
5193 
5194                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_10, m_temp_reg_11);
5195                 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_12, m_temp_reg_13);
5196                 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_18, m_temp_reg_19);
5197                 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_21);
5198 
5199                 /* eo0[0-3] */
5200                 {
5201                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5202                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5203 
5204                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
5205 
5206                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5207                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5208 
5209                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
5210 
5211                     m_temp_reg_90 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
5212 
5213                 }
5214 
5215                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87  57
5216                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //0  -43
5217                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80  90
5218                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70  25
5219 
5220                 /* eo1[0-3] */
5221                 {
5222                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5223                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5224 
5225                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
5226 
5227                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5228                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5229 
5230                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
5231 
5232                     m_temp_reg_91 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_32);
5233 
5234                 }
5235 
5236                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80  9
5237                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[9][0]); //70  87
5238                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[10][0]); //-25  57
5239                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[11][0]); //90  43
5240 
5241                 /* eo2[0-3] */
5242                 {
5243                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5244                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5245 
5246                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31);
5247 
5248                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5249                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5250 
5251                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
5252 
5253                     m_temp_reg_92 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
5254 
5255                 }
5256 
5257                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70  -43
5258                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[13][0]); //-87  9
5259                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[14][0]); //90  25
5260                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[15][0]); //80  57
5261 
5262                 /* eo3[0-3] */
5263                 {
5264                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5265                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5266 
5267                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
5268 
5269                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5270                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5271 
5272                     m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_32, m_temp_reg_33);
5273 
5274                     m_temp_reg_93 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
5275 
5276                 }
5277 
5278                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57  -80
5279                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[17][0]); //-25  90
5280                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[18][0]); //9  87
5281                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43  70
5282 
5283 
5284                 /* eo4[0-3] */
5285                 {
5286                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5287                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5288 
5289                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
5290 
5291                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5292                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5293 
5294                     m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32);
5295 
5296                     m_temp_reg_94 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
5297 
5298                 }
5299 
5300                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43  -90
5301                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[21][0]); //57  25
5302                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[22][0]); //-87  70
5303                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[23][0]); //9  -80
5304 
5305                 /* eo5[0-3] */
5306                 {
5307                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5308                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5309 
5310                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
5311 
5312                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5313                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5314 
5315                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
5316 
5317                     m_temp_reg_95 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
5318                 }
5319 
5320                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25  -70
5321                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[25][0]); //90  -80
5322                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[26][0]); //43  9
5323                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[27][0]); //-57  87
5324 
5325                 /* eo6[0-3] */
5326                 {
5327                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5328                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5329 
5330                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
5331 
5332                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5333                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5334 
5335                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
5336 
5337                     m_temp_reg_96 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
5338 
5339                 }
5340 
5341                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); //9  -25
5342                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[29][0]); //43  -57
5343                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[30][0]); //70  -80
5344                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[31][0]); //87  -90
5345 
5346                 /* eo7[0-3] */
5347                 {
5348                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5349                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5350 
5351                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
5352 
5353                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5354                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5355 
5356                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
5357 
5358                     m_temp_reg_97 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
5359 
5360 
5361                 }
5362 
5363             }
5364 
5365             /* eeo */
5366             {
5367                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75
5368                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[5][0]); //50 18
5369 
5370                 m_temp_reg_72 = _mm_loadu_si128((__m128i *)&pi2_tmp[4 * trans_size]);
5371                 m_temp_reg_76 = _mm_loadu_si128((__m128i *)&pi2_tmp[12 * trans_size]);
5372                 m_temp_reg_82 = _mm_loadu_si128((__m128i *)&pi2_tmp[20 * trans_size]);
5373                 m_temp_reg_86 = _mm_loadu_si128((__m128i *)&pi2_tmp[28 * trans_size]);
5374 
5375                 /* eeo0[0-3] */
5376                 {
5377 
5378                     m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
5379                     m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_86);
5380 
5381                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5382                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5383 
5384                     temp1 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
5385 
5386                 }
5387 
5388                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75 -18
5389                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[7][0]); //89  50
5390 
5391                 /* eeo1[0-3] */
5392                 {
5393                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
5394                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
5395 
5396                     temp2 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31);
5397 
5398                 }
5399 
5400                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[8][0]); //50 -89
5401                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18  75
5402 
5403                 /* eeo2[0-3] */
5404                 {
5405                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
5406                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
5407 
5408                     temp3 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
5409 
5410                 }
5411 
5412                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[10][0]); //18 -50
5413                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[11][0]); //75  -89
5414 
5415                 /* eeo3[0-3] */
5416                 {
5417                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
5418                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
5419 
5420                     temp4 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
5421 
5422                 }
5423 
5424 
5425             }
5426 
5427             m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83 36
5428             m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36 -83
5429 
5430             m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64 64
5431             m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[3][0]); //64 -64
5432 
5433             m_temp_reg_74 = _mm_loadu_si128((__m128i *)&pi2_tmp[8 * trans_size]);
5434             m_temp_reg_84 = _mm_loadu_si128((__m128i *)&pi2_tmp[24 * trans_size]);
5435 
5436             m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_84);
5437 
5438             m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[0 * trans_size]);
5439             m_temp_reg_80 = _mm_loadu_si128((__m128i *)&pi2_tmp[16 * trans_size]);
5440 
5441             m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_80);
5442 
5443             m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);  /* eeeo[0] */
5444             m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);  /* eeeo[1] */
5445 
5446             m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);  /* eeee[0] */
5447             m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff4);  /* eeee[1] */
5448 
5449 /* eeeo[0]= m_temp_reg_20  */
5450 /* eeee[0]= m_temp_reg_21  */
5451 /* eeeo[1]= m_temp_reg_22  */
5452 /* eeee[1]= m_temp_reg_23  */
5453 
5454             /* eee[0] = eeee[0] + eeeo[0]; */
5455             m_temp_reg_40 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);  /* eeeo[0] */
5456 
5457             /* eee[3] = eeee[0] - eeeo[0]; */
5458             m_temp_reg_43 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);  /* eeeo[1] */
5459 
5460             /* eee[2] = eeee[1] - eeeo[1]; */
5461             m_temp_reg_42 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_22);  /* eeee[1] */
5462 
5463             /* eee[1] = eeee[1] + eeeo[1];*/
5464             m_temp_reg_41 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_22);  /* eeee[0] */
5465 
            /* ee[k] / ee[7-k]: sum and difference of m_temp_reg_4x and tempN.   */
            /* Both operands are produced earlier in the function (not visible   */
            /* in this span).                                                    */
            m_temp_reg_70 = _mm_add_epi32(m_temp_reg_40, temp1);  /* ee[0] */
            m_temp_reg_71 = _mm_sub_epi32(m_temp_reg_40, temp1);  /* ee[7] */

            m_temp_reg_72 = _mm_add_epi32(m_temp_reg_41, temp2);  /* ee[1] */
            m_temp_reg_73 = _mm_sub_epi32(m_temp_reg_41, temp2);  /* ee[6] */

            m_temp_reg_74 = _mm_add_epi32(m_temp_reg_42, temp3);  /* ee[2] */
            m_temp_reg_75 = _mm_sub_epi32(m_temp_reg_42, temp3);  /* ee[5] */

            m_temp_reg_76 = _mm_add_epi32(m_temp_reg_43, temp4);  /* ee[3] */
            m_temp_reg_77 = _mm_sub_epi32(m_temp_reg_43, temp4);  /* ee[4] */

/* e[]*/
            /* e[k] = ee[k] + m_temp_reg_9k and e[15-k] = ee[k] - m_temp_reg_9k. */
            /* NOTE(review): m_temp_reg_90..97 presumably hold eo[0..7] on       */
            /* entry - confirm against the code above this span.                 */
            /* temp1..temp8 are reused here as outputs (their old values were    */
            /* consumed by the ee[] stage above).                                */

            temp1 = _mm_add_epi32(m_temp_reg_70, m_temp_reg_90);  /* e[0] */
            temp2 = _mm_sub_epi32(m_temp_reg_70, m_temp_reg_90);  /* e[15] */

            temp3 = _mm_add_epi32(m_temp_reg_72, m_temp_reg_91);  /* e[1] */
            temp4 = _mm_sub_epi32(m_temp_reg_72, m_temp_reg_91);  /* e[14] */

            temp5 = _mm_add_epi32(m_temp_reg_74, m_temp_reg_92);  /* e[2] */
            temp6 = _mm_sub_epi32(m_temp_reg_74, m_temp_reg_92);  /* e[13] */

            temp7 = _mm_add_epi32(m_temp_reg_76, m_temp_reg_93);  /* e[3] */
            temp8 = _mm_sub_epi32(m_temp_reg_76, m_temp_reg_93);  /* e[12] */

            /* From here on m_temp_reg_90..97 are overwritten with results while */
            /* later ones are still pending as inputs. Each register is written  */
            /* only after its last read, so this statement order must be kept.   */
            m_temp_reg_90 = _mm_add_epi32(m_temp_reg_77, m_temp_reg_94);  /* e[4] */
            m_temp_reg_91 = _mm_sub_epi32(m_temp_reg_77, m_temp_reg_94);  /* e[11] */

            m_temp_reg_92 = _mm_add_epi32(m_temp_reg_75, m_temp_reg_95);  /* e[5] */
            m_temp_reg_93 = _mm_sub_epi32(m_temp_reg_75, m_temp_reg_95);  /* e[10] */

            m_temp_reg_94 = _mm_add_epi32(m_temp_reg_73, m_temp_reg_96);  /* e[6] */
            m_temp_reg_95 = _mm_sub_epi32(m_temp_reg_73, m_temp_reg_96);  /* e[9] */

            m_temp_reg_96 = _mm_add_epi32(m_temp_reg_71, m_temp_reg_97);  /* e[7] */
            m_temp_reg_97 = _mm_sub_epi32(m_temp_reg_71, m_temp_reg_97);  /* e[8] */
5503 
5504 /*o[k] */
5505             {
5506 
                /* Output cursor into the scratch buffer; each o-block below      */
                /* stores one 128-bit vector and then advances by out_stride      */
                /* WORD16 elements.                                               */
                WORD16 *pi2_dst_scratch = temp_ptr;
                WORD32 out_stride = 8;

                /* Packed odd coefficients, table rows 0..7 (used by the o0 block). */
                /* Aligned loads: the table rows must be 16-byte aligned.           */
                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]);
                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[2][0]);
                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[3][0]);
                m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[4][0]);
                m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[5][0]);
                m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[6][0]);
                m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[7][0]);


                /* Unaligned loads of the odd-numbered rows 1,3,...,15 of pi2_tmp   */
                /* (row pitch is trans_size elements).                              */
                m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[trans_size]);
                m_temp_reg_71 = _mm_loadu_si128((__m128i *)&pi2_tmp[3 * trans_size]);
                m_temp_reg_72 = _mm_loadu_si128((__m128i *)&pi2_tmp[5 * trans_size]);
                m_temp_reg_73 = _mm_loadu_si128((__m128i *)&pi2_tmp[7 * trans_size]);
                m_temp_reg_74 = _mm_loadu_si128((__m128i *)&pi2_tmp[9 * trans_size]);
                m_temp_reg_75 = _mm_loadu_si128((__m128i *)&pi2_tmp[11 * trans_size]);
                m_temp_reg_76 = _mm_loadu_si128((__m128i *)&pi2_tmp[13 * trans_size]);
                m_temp_reg_77 = _mm_loadu_si128((__m128i *)&pi2_tmp[15 * trans_size]);

                /* Odd-numbered rows 17,19,...,31. */
                m_temp_reg_80 = _mm_loadu_si128((__m128i *)&pi2_tmp[17 * trans_size]);
                m_temp_reg_81 = _mm_loadu_si128((__m128i *)&pi2_tmp[19 * trans_size]);
                m_temp_reg_82 = _mm_loadu_si128((__m128i *)&pi2_tmp[21 * trans_size]);
                m_temp_reg_83 = _mm_loadu_si128((__m128i *)&pi2_tmp[23 * trans_size]);
                m_temp_reg_84 = _mm_loadu_si128((__m128i *)&pi2_tmp[25 * trans_size]);
                m_temp_reg_85 = _mm_loadu_si128((__m128i *)&pi2_tmp[27 * trans_size]);
                m_temp_reg_86 = _mm_loadu_si128((__m128i *)&pi2_tmp[29 * trans_size]);
                m_temp_reg_87 = _mm_loadu_si128((__m128i *)&pi2_tmp[31 * trans_size]);

                /* Interleave the low four 16-bit elements of each row pair so     */
                /* that one _mm_madd_epi16 yields rowA*cA + rowB*cB per 32-bit     */
                /* lane. Only the low halves of the loaded rows are used here.     */
                m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved
                m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 interleaved
                m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 interleaved
                m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 interleaved
                m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_80, m_temp_reg_81); //row 17 and row 19 interleaved
                m_temp_reg_15 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_83); //row 21 and row 23 interleaved
                m_temp_reg_16 = _mm_unpacklo_epi16(m_temp_reg_84, m_temp_reg_85); //row 25 and row 27 interleaved
                m_temp_reg_17 = _mm_unpacklo_epi16(m_temp_reg_86, m_temp_reg_87); //row 29 and row 31 interleaved
5546 
                /* o0[0-3] */
                /* Accumulates the eight multiply-add products of the interleaved  */
                /* odd rows with m_coeff1..8 into four 32-bit lanes, combines the   */
                /* result with temp1 as sum and difference, rounds, shifts, packs   */
                /* to 16 bit and stores one vector to the scratch buffer.           */
                {
                    /* Products for rows 1..15 */
                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);

                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);

                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);

                    /* Products for rows 17..31 */
                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);

                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);

                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);

                    /* Full accumulated odd term */
                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);

                    /* Here m_temp_reg_30 is the sum and m_temp_reg_31 the          */
                    /* difference; the following o-blocks assign them the other     */
                    /* way round.                                                   */
                    m_temp_reg_31 = _mm_sub_epi32(temp1, m_temp_reg_20);
                    m_temp_reg_30 = _mm_add_epi32(temp1, m_temp_reg_20);

                    /* Broadcast the rounding constant 1 << (i4_shift - 1) to all   */
                    /* four 32-bit lanes; m_count carries the shift amount.         */
                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
                    m_count = _mm_cvtsi32_si128(i4_shift);
                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);

                    /* Round and arithmetic-shift right by i4_shift */
                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);

                    /* Saturating pack to 16 bit: lanes 0-3 from m_temp_reg_30,     */
                    /* lanes 4-7 from m_temp_reg_31.                                */
                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);

                    /* Aligned store: pi2_dst_scratch must be 16-byte aligned       */
                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
                    pi2_dst_scratch += out_stride;

                }
5590 
                /* Packed odd coefficients, table rows 8..15 (used by the o1 block) */
                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]);
                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[10][0]);
                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[11][0]);
                m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[12][0]);
                m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[13][0]);
                m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[14][0]);
                m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[15][0]);

                /* o1[0-3] */
                /* Same pipeline as the o0 block, combined with temp3. The first    */
                /* four products enter as (p2 + p3) - (p0 + p1).                    */
                /* NOTE(review): the sign pattern presumably matches how the packed */
                /* table stores its coefficients - confirm against the table.       */
                {
                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);

                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);

                    /* (p2 + p3) - (p0 + p1) */
                    m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_20);

                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);

                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);

                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);

                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);

                    /* m_temp_reg_31 = temp3 + acc, m_temp_reg_30 = temp3 - acc     */
                    m_temp_reg_31 = _mm_add_epi32(temp3, m_temp_reg_20);
                    m_temp_reg_30 = _mm_sub_epi32(temp3, m_temp_reg_20);

                    /* Rounding constant broadcast to all lanes; shift count        */
                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
                    m_count = _mm_cvtsi32_si128(i4_shift);
                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);

                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);

                    /* Saturating pack: difference in lanes 0-3, sum in lanes 4-7   */
                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);

                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
                    pi2_dst_scratch += out_stride;

                }
5643 
                /* Packed odd coefficients, table rows 16..23 (used by the o2 block) */
                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]);
                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[18][0]);
                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[19][0]);
                m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[20][0]);
                m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[21][0]);
                m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[22][0]);
                m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[23][0]);

                /* o2[0-3] */
                /* Same pipeline as the o0 block, combined with temp5. The products */
                /* enter as (p1 - p0) + (p2 + p3) + (p4 - p5) - (p6 + p7).          */
                /* NOTE(review): the sign pattern presumably matches how the packed */
                /* table stores its coefficients - confirm against the table.       */
                {
                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);

                    /* p1 - p0 */
                    m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);

                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);

                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);

                    /* (p4 - p5) - (p6 + p7) */
                    m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_41);
                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);

                    m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_42);

                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);

                    /* m_temp_reg_31 = temp5 + acc, m_temp_reg_30 = temp5 - acc     */
                    m_temp_reg_31 = _mm_add_epi32(temp5, m_temp_reg_20);
                    m_temp_reg_30 = _mm_sub_epi32(temp5, m_temp_reg_20);

                    /* Rounding constant broadcast to all lanes; shift count        */
                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
                    m_count = _mm_cvtsi32_si128(i4_shift);
                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);

                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);

                    /* Saturating pack: difference in lanes 0-3, sum in lanes 4-7   */
                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);

                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
                    pi2_dst_scratch += out_stride;

                }
5696 
                /* Packed odd coefficients, table rows 24..31 (used by the o3 block) */
                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[25][0]);
                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[26][0]);
                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[27][0]);
                m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[28][0]);
                m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[29][0]);
                m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[30][0]);
                m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[31][0]);

                /* o3[0-3] */
                /* Same pipeline as the o0 block, combined with temp7. The products */
                /* enter as (p1 - p0) + (p2 + p3) + (p5 - p4) + (p6 + p7).          */
                /* NOTE(review): the sign pattern presumably matches how the packed */
                /* table stores its coefficients - confirm against the table.       */
                {
                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);

                    /* p1 - p0 */
                    m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);

                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);

                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);

                    /* p5 - p4 */
                    m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_40);
                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);

                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);

                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);

                    /* m_temp_reg_31 = temp7 + acc, m_temp_reg_30 = temp7 - acc     */
                    m_temp_reg_31 = _mm_add_epi32(temp7, m_temp_reg_20);
                    m_temp_reg_30 = _mm_sub_epi32(temp7, m_temp_reg_20);

                    /* Rounding constant broadcast to all lanes; shift count        */
                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
                    m_count = _mm_cvtsi32_si128(i4_shift);
                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);

                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);

                    /* Saturating pack: difference in lanes 0-3, sum in lanes 4-7   */
                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);

                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
                    pi2_dst_scratch += out_stride;

                }
5749 
                /* Packed odd coefficients, table rows 32..39 (used by the o4 block) */
                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[33][0]);
                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[34][0]);
                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[35][0]);
                m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[36][0]);
                m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[37][0]);
                m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[38][0]);
                m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[39][0]);

                /* o4[0-3] */
                /* Same pipeline as the o0 block: all eight products are added,     */
                /* then combined with m_temp_reg_90 (labelled e[4] where it is      */
                /* computed).                                                       */
                {
                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);

                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);

                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);

                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);

                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);

                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);

                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);

                    /* m_temp_reg_31 = e + acc, m_temp_reg_30 = e - acc             */
                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_20);
                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_20);
                    /* Rounding constant broadcast to all lanes; shift count        */
                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
                    m_count = _mm_cvtsi32_si128(i4_shift);
                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);

                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);

                    /* Saturating pack: difference in lanes 0-3, sum in lanes 4-7   */
                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);

                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
                    pi2_dst_scratch += out_stride;

                }
5801 
                /* Packed odd coefficients, table rows 40..47 (used by the o5 block) */
                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[41][0]);
                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[42][0]);
                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[43][0]);
                m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[44][0]);
                m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[45][0]);
                m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[46][0]);
                m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[47][0]);

                /* o5[0-3] */
                /* Same pipeline as the o0 block: all eight products are added,     */
                /* then combined with m_temp_reg_92 (labelled e[5] where it is      */
                /* computed).                                                       */
                {
                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);

                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);

                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);

                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);

                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);

                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);

                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);

                    /* m_temp_reg_31 = e + acc, m_temp_reg_30 = e - acc             */
                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_20);
                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_20);

                    /* Rounding constant broadcast to all lanes; shift count        */
                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
                    m_count = _mm_cvtsi32_si128(i4_shift);
                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);

                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);

                    /* Saturating pack: difference in lanes 0-3, sum in lanes 4-7   */
                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);

                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
                    pi2_dst_scratch += out_stride;

                }
5854 
                /* Packed odd coefficients, table rows 48..55 (used by the o6 block) */
                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[49][0]);
                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[50][0]);
                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[51][0]);
                m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[52][0]);
                m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[53][0]);
                m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[54][0]);
                m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[55][0]);

                /* o6[0-3] */
                /* Same pipeline as the o0 block: all eight products are added,     */
                /* then combined with m_temp_reg_94 (labelled e[6] where it is      */
                /* computed).                                                       */
                {
                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);

                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);

                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);

                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);

                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);

                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);

                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);

                    /* m_temp_reg_31 = e + acc, m_temp_reg_30 = e - acc             */
                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_20);
                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_20);

                    /* Rounding constant broadcast to all lanes; shift count        */
                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
                    m_count = _mm_cvtsi32_si128(i4_shift);
                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);

                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);

                    /* Saturating pack: difference in lanes 0-3, sum in lanes 4-7   */
                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);

                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
                    pi2_dst_scratch += out_stride;

                }
5907 
5908                 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
5909                 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[57][0]);
5910                 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[58][0]);
5911                 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[59][0]);
5912                 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[60][0]);
5913                 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[61][0]);
5914                 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[62][0]);
5915                 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[63][0]);
5916 
5917                 /* o7[0-3] */
5918                 {
5919                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5920                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5921                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5922                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5923 
5924                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
5925                     m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
5926 
5927                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
5928 
5929                     m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
5930                     m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
5931                     m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
5932                     m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
5933 
5934                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
5935                     m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
5936 
5937                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
5938 
5939                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
5940 
5941                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_20);
5942                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_20);
5943 
5944                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
5945                     m_count = _mm_cvtsi32_si128(i4_shift);
5946                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
5947                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
5948 
5949                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5950                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5951                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
5952                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5953 
5954                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
5955 
5956                     _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
5957                     pi2_dst_scratch += 8;
5958 
5959                 }
5960 
                /* Packed odd coefficients, table rows 64..71 (used by the o8 block) */
                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[65][0]);
                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[66][0]);
                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[67][0]);
                m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[68][0]);
                m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[69][0]);
                m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[70][0]);
                m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[71][0]);

                /* o8[0-3] */
                /* Same pipeline as the o0 block: all eight products are added,     */
                /* then combined with m_temp_reg_97 (labelled e[8] where it is      */
                /* computed).                                                       */
                {
                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);

                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);

                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);

                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);

                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);

                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);

                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);

                    /* m_temp_reg_31 = e + acc, m_temp_reg_30 = e - acc             */
                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_20);
                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_20);

                    /* Rounding constant broadcast to all lanes; shift count        */
                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
                    m_count = _mm_cvtsi32_si128(i4_shift);
                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);

                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);

                    /* Saturating pack: difference in lanes 0-3, sum in lanes 4-7   */
                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);

                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
                    pi2_dst_scratch += out_stride;
                }
6012 
                /* Coefficients for the o9 stage: rows 72..79 of                     */
                /* g_ai2_ihevc_trans_32_intr_odd_packed. _mm_load_si128 is an aligned */
                /* load, so each row must start on a 16-byte boundary.               */
                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[73][0]);
                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[74][0]);
                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[75][0]);
                m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[76][0]);
                m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[77][0]);
                m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[78][0]);
                m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[79][0]);

                /* o9[0-3] */
                /* Forms (m_temp_reg_95 + o9) and (m_temp_reg_95 - o9), rounds, shifts, */
                /* saturates to 16 bit and stores one vector. m_temp_reg_10..17 and     */
                /* m_temp_reg_95 are computed outside this part of the file.            */
                {
                    /* _mm_madd_epi16 multiplies signed 16-bit lanes and adds each  */
                    /* adjacent pair of products, giving four 32-bit partial sums.  */
                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);

                    /* Reduce the first four partial sums into m_temp_reg_20 */
                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);

                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);

                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);

                    /* Reduce the last four partial sums into m_temp_reg_40 */
                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);

                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);

                    /* o9: total of all eight multiply-accumulate results */
                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);

                    /* Sum and difference against m_temp_reg_95 */
                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_20);
                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_20);

                    /* Rounding constant 1 << (i4_shift - 1), replicated to all four */
                    /* 32-bit lanes by the two unpacks; m_count holds the shift.     */
                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
                    m_count = _mm_cvtsi32_si128(i4_shift);
                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);

                    /* (x + rounding) >> i4_shift, arithmetic shift */
                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);

                    /* Signed-saturating pack: 16-bit lanes 0-3 hold the difference, */
                    /* lanes 4-7 hold the sum.                                       */
                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);

                    /* Aligned 128-bit store, then step to the next output slot */
                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
                    pi2_dst_scratch += out_stride;
                }
6064 
                /* Coefficients for the o10 stage: rows 80..87 of                    */
                /* g_ai2_ihevc_trans_32_intr_odd_packed. _mm_load_si128 is an aligned */
                /* load, so each row must start on a 16-byte boundary.               */
                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[81][0]);
                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[82][0]);
                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[83][0]);
                m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[84][0]);
                m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[85][0]);
                m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[86][0]);
                m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[87][0]);

                /* o10[0-3] */
                /* Forms (m_temp_reg_93 + o10) and (m_temp_reg_93 - o10), rounds,     */
                /* shifts, saturates to 16 bit and stores one vector.                 */
                /* m_temp_reg_10..17 and m_temp_reg_93 are computed outside this part */
                /* of the file.                                                       */
                {
                    /* _mm_madd_epi16 multiplies signed 16-bit lanes and adds each  */
                    /* adjacent pair of products, giving four 32-bit partial sums.  */
                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);

                    /* Reduce the first four partial sums into m_temp_reg_20 */
                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);

                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);

                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);

                    /* Reduce the last four partial sums into m_temp_reg_40 */
                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);

                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);

                    /* o10: total of all eight multiply-accumulate results */
                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);

                    /* Sum and difference against m_temp_reg_93 */
                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_20);
                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_20);

                    /* Rounding constant 1 << (i4_shift - 1), replicated to all four */
                    /* 32-bit lanes by the two unpacks; m_count holds the shift.     */
                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
                    m_count = _mm_cvtsi32_si128(i4_shift);
                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);

                    /* (x + rounding) >> i4_shift, arithmetic shift */
                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);

                    /* Signed-saturating pack: 16-bit lanes 0-3 hold the difference, */
                    /* lanes 4-7 hold the sum.                                       */
                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);

                    /* Aligned 128-bit store, then step to the next output slot */
                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
                    pi2_dst_scratch += out_stride;
                }
6116 
6117 
                /* Coefficients for the o11 stage: rows 88..95 of                    */
                /* g_ai2_ihevc_trans_32_intr_odd_packed. _mm_load_si128 is an aligned */
                /* load, so each row must start on a 16-byte boundary.               */
                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[89][0]);
                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[90][0]);
                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[91][0]);
                m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[92][0]);
                m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[93][0]);
                m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[94][0]);
                m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[95][0]);

                /* o11[0-3] */
                /* Forms (m_temp_reg_91 + o11) and (m_temp_reg_91 - o11), rounds,     */
                /* shifts, saturates to 16 bit and stores one vector.                 */
                /* m_temp_reg_10..17 and m_temp_reg_91 are computed outside this part */
                /* of the file.                                                       */
                {
                    /* _mm_madd_epi16 multiplies signed 16-bit lanes and adds each  */
                    /* adjacent pair of products, giving four 32-bit partial sums.  */
                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);

                    /* Reduce the first four partial sums into m_temp_reg_20 */
                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);

                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);

                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);

                    /* Reduce the last four partial sums into m_temp_reg_40 */
                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);

                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);

                    /* o11: total of all eight multiply-accumulate results */
                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);

                    /* Sum and difference against m_temp_reg_91 */
                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_20);
                    m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_20);

                    /* Rounding constant 1 << (i4_shift - 1), replicated to all four */
                    /* 32-bit lanes by the two unpacks; m_count holds the shift.     */
                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
                    m_count = _mm_cvtsi32_si128(i4_shift);
                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);

                    /* (x + rounding) >> i4_shift, arithmetic shift */
                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);

                    /* Signed-saturating pack: 16-bit lanes 0-3 hold the difference, */
                    /* lanes 4-7 hold the sum.                                       */
                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);

                    /* Aligned 128-bit store, then step to the next output slot */
                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
                    pi2_dst_scratch += out_stride;

                }
6170 
                /* Coefficients for the o12 stage: rows 96..103 of                   */
                /* g_ai2_ihevc_trans_32_intr_odd_packed. _mm_load_si128 is an aligned */
                /* load, so each row must start on a 16-byte boundary.               */
                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[97][0]);
                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[98][0]);
                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[99][0]);
                m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[100][0]);
                m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[101][0]);
                m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[102][0]);
                m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[103][0]);

                /* o12[0-3] */
                /* Forms (temp8 + o12) and (temp8 - o12), rounds, shifts, saturates to */
                /* 16 bit and stores one vector. m_temp_reg_10..17 and temp8 are       */
                /* computed outside this part of the file.                             */
                {
                    /* _mm_madd_epi16 multiplies signed 16-bit lanes and adds each  */
                    /* adjacent pair of products, giving four 32-bit partial sums.  */
                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);

                    /* Reduce the first four partial sums into m_temp_reg_20 */
                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);

                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);

                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);

                    /* Reduce the last four partial sums into m_temp_reg_40 */
                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);

                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);

                    /* o12: total of all eight multiply-accumulate results */
                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);

                    /* Sum and difference against temp8 */
                    m_temp_reg_31 = _mm_add_epi32(temp8, m_temp_reg_20);
                    m_temp_reg_30 = _mm_sub_epi32(temp8, m_temp_reg_20);

                    /* Rounding constant 1 << (i4_shift - 1), replicated to all four */
                    /* 32-bit lanes by the two unpacks; m_count holds the shift.     */
                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
                    m_count = _mm_cvtsi32_si128(i4_shift);
                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);

                    /* (x + rounding) >> i4_shift, arithmetic shift */
                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);

                    /* Signed-saturating pack: 16-bit lanes 0-3 hold the difference, */
                    /* lanes 4-7 hold the sum.                                       */
                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);

                    /* Aligned 128-bit store, then step to the next output slot */
                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
                    pi2_dst_scratch += out_stride;

                }
6223 
                /* Coefficients for the o13 stage: rows 104..111 of                  */
                /* g_ai2_ihevc_trans_32_intr_odd_packed. _mm_load_si128 is an aligned */
                /* load, so each row must start on a 16-byte boundary.               */
                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[105][0]);
                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[106][0]);
                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[107][0]);
                m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[108][0]);
                m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[109][0]);
                m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[110][0]);
                m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[111][0]);

                /* o13[0-3] */
                /* Forms (temp6 + o13) and (temp6 - o13), rounds, shifts, saturates to */
                /* 16 bit and stores one vector. m_temp_reg_10..17 and temp6 are       */
                /* computed outside this part of the file.                             */
                {
                    /* _mm_madd_epi16 multiplies signed 16-bit lanes and adds each  */
                    /* adjacent pair of products, giving four 32-bit partial sums.  */
                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);

                    /* Reduce the first four partial sums into m_temp_reg_20 */
                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);

                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);

                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);

                    /* Reduce the last four partial sums into m_temp_reg_40 */
                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);

                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);

                    /* o13: total of all eight multiply-accumulate results */
                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);

                    /* Sum and difference against temp6 */
                    m_temp_reg_31 = _mm_add_epi32(temp6, m_temp_reg_20);
                    m_temp_reg_30 = _mm_sub_epi32(temp6, m_temp_reg_20);

                    /* Rounding constant 1 << (i4_shift - 1), replicated to all four */
                    /* 32-bit lanes by the two unpacks; m_count holds the shift.     */
                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
                    m_count = _mm_cvtsi32_si128(i4_shift);
                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);

                    /* (x + rounding) >> i4_shift, arithmetic shift */
                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);

                    /* Signed-saturating pack: 16-bit lanes 0-3 hold the difference, */
                    /* lanes 4-7 hold the sum.                                       */
                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);

                    /* Aligned 128-bit store, then step to the next output slot */
                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
                    pi2_dst_scratch += out_stride;
                }
6275 
                /* Coefficients for the o14 stage: rows 112..119 of                  */
                /* g_ai2_ihevc_trans_32_intr_odd_packed. _mm_load_si128 is an aligned */
                /* load, so each row must start on a 16-byte boundary.               */
                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[113][0]);
                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[114][0]);
                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[115][0]);
                m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[116][0]);
                m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[117][0]);
                m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[118][0]);
                m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[119][0]);

                /* o14[0-3] */
                /* Forms (temp4 + o14) and (temp4 - o14), rounds, shifts, saturates to */
                /* 16 bit and stores one vector. m_temp_reg_10..17 and temp4 are       */
                /* computed outside this part of the file.                             */
                {
                    /* _mm_madd_epi16 multiplies signed 16-bit lanes and adds each  */
                    /* adjacent pair of products, giving four 32-bit partial sums.  */
                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);

                    /* Reduce the first four partial sums into m_temp_reg_20 */
                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);

                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);

                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);

                    /* Reduce the last four partial sums into m_temp_reg_40 */
                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);

                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);

                    /* o14: total of all eight multiply-accumulate results */
                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);

                    /* Sum and difference against temp4 */
                    m_temp_reg_31 = _mm_add_epi32(temp4, m_temp_reg_20);
                    m_temp_reg_30 = _mm_sub_epi32(temp4, m_temp_reg_20);

                    /* Rounding constant 1 << (i4_shift - 1), replicated to all four */
                    /* 32-bit lanes by the two unpacks; m_count holds the shift.     */
                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
                    m_count = _mm_cvtsi32_si128(i4_shift);
                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);

                    /* (x + rounding) >> i4_shift, arithmetic shift */
                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);

                    /* Signed-saturating pack: 16-bit lanes 0-3 hold the difference, */
                    /* lanes 4-7 hold the sum.                                       */
                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);

                    /* Aligned 128-bit store, then step to the next output slot */
                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
                    pi2_dst_scratch += out_stride;

                }
6328 
                /* Coefficients for the o15 stage: rows 120..127 of                  */
                /* g_ai2_ihevc_trans_32_intr_odd_packed. _mm_load_si128 is an aligned */
                /* load, so each row must start on a 16-byte boundary.               */
                m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
                m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[121][0]);
                m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[122][0]);
                m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[123][0]);
                m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[124][0]);
                m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[125][0]);
                m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[126][0]);
                m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[127][0]);

                /* o15[0-3] */
                /* Forms (temp2 + o15) and (temp2 - o15), rounds, shifts, saturates to */
                /* 16 bit and stores one vector. m_temp_reg_10..17 and temp2 are       */
                /* computed outside this part of the file.                             */
                {
                    /* _mm_madd_epi16 multiplies signed 16-bit lanes and adds each  */
                    /* adjacent pair of products, giving four 32-bit partial sums.  */
                    m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
                    m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
                    m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
                    m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);

                    /* Reduce the first four partial sums into m_temp_reg_20 */
                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
                    m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);

                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);

                    m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
                    m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
                    m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
                    m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);

                    /* Reduce the last four partial sums into m_temp_reg_40 */
                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
                    m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);

                    m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);

                    /* o15: total of all eight multiply-accumulate results */
                    m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);

                    /* Sum and difference against temp2 */
                    m_temp_reg_31 = _mm_add_epi32(temp2, m_temp_reg_20);
                    m_temp_reg_30 = _mm_sub_epi32(temp2, m_temp_reg_20);

                    /* Rounding constant 1 << (i4_shift - 1), replicated to all four */
                    /* 32-bit lanes by the two unpacks; m_count holds the shift.     */
                    m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
                    m_count = _mm_cvtsi32_si128(i4_shift);
                    m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
                    m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);

                    /* (x + rounding) >> i4_shift, arithmetic shift */
                    m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
                    m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
                    m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
                    m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);

                    /* Signed-saturating pack: 16-bit lanes 0-3 hold the difference, */
                    /* lanes 4-7 hold the sum.                                       */
                    m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);

                    /* Aligned 128-bit store of the result vector */
                    _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
                    /* NOTE(review): this stage advances by a literal 8, whereas the   */
                    /* o9..o14 stages advance by out_stride. out_stride is defined     */
                    /* outside this view - confirm the two are equal or that the       */
                    /* difference is intentional.                                      */
                    pi2_dst_scratch += 8;
                }
6380 
6381             }
6382 
6383         }
6384 
6385         /* Transpose */
6386         {
6387 
6388             WORD16 *pi2_src_scratch = temp_ptr;
6389             WORD32 out_stride = dst_strd;
6390             WORD32 in_stride = 8;
6391 
6392             m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
6393             pi2_src_scratch += in_stride;
6394             m_temp_reg_31 = _mm_load_si128((__m128i *)pi2_src_scratch);
6395             pi2_src_scratch += in_stride;
6396             m_temp_reg_32 = _mm_load_si128((__m128i *)pi2_src_scratch);
6397             pi2_src_scratch += in_stride;
6398             m_temp_reg_33 = _mm_load_si128((__m128i *)pi2_src_scratch);
6399             pi2_src_scratch += in_stride;
6400             m_temp_reg_34 = _mm_load_si128((__m128i *)pi2_src_scratch);
6401             pi2_src_scratch += in_stride;
6402             m_temp_reg_35 = _mm_load_si128((__m128i *)pi2_src_scratch);
6403             pi2_src_scratch += in_stride;
6404             m_temp_reg_36 = _mm_load_si128((__m128i *)pi2_src_scratch);
6405             pi2_src_scratch += in_stride;
6406             m_temp_reg_37 = _mm_load_si128((__m128i *)pi2_src_scratch);
6407             pi2_src_scratch += 8;
6408 
6409             m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_src_scratch);
6410             pi2_src_scratch += in_stride;
6411             m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_src_scratch);
6412             pi2_src_scratch += in_stride;
6413             m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_src_scratch);
6414             pi2_src_scratch += in_stride;
6415             m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_src_scratch);
6416             pi2_src_scratch += in_stride;
6417             m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_src_scratch);
6418             pi2_src_scratch += in_stride;
6419             m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_src_scratch);
6420             pi2_src_scratch += in_stride;
6421             m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_src_scratch);
6422             pi2_src_scratch += in_stride;
6423             m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_src_scratch);
6424             pi2_src_scratch += 8;
6425 
6426 
6427             m_temp_reg_40 = _mm_unpacklo_epi16(m_temp_reg_30, m_temp_reg_31);
6428             m_temp_reg_41 = _mm_unpackhi_epi16(m_temp_reg_31, m_temp_reg_30);
6429 
6430             m_temp_reg_42 = _mm_unpacklo_epi16(m_temp_reg_32, m_temp_reg_33);
6431             m_temp_reg_43 = _mm_unpackhi_epi16(m_temp_reg_33, m_temp_reg_32);
6432 
6433             m_temp_reg_44 = _mm_unpacklo_epi16(m_temp_reg_34, m_temp_reg_35);
6434             m_temp_reg_45 = _mm_unpackhi_epi16(m_temp_reg_35, m_temp_reg_34);
6435 
6436             m_temp_reg_46 = _mm_unpacklo_epi16(m_temp_reg_36, m_temp_reg_37);
6437             m_temp_reg_47 = _mm_unpackhi_epi16(m_temp_reg_37, m_temp_reg_36);
6438 
6439             m_temp_reg_80 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71);
6440             m_temp_reg_81 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_70);
6441 
6442             m_temp_reg_82 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73);
6443             m_temp_reg_83 = _mm_unpackhi_epi16(m_temp_reg_73, m_temp_reg_72);
6444 
6445             m_temp_reg_84 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75);
6446             m_temp_reg_85 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_74);
6447 
6448             m_temp_reg_86 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77);
6449             m_temp_reg_87 = _mm_unpackhi_epi16(m_temp_reg_77, m_temp_reg_76);
6450 
6451 
6452             m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_40, m_temp_reg_42);
6453             m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_40, m_temp_reg_42);
6454 
6455             m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_44, m_temp_reg_46);
6456             m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_44, m_temp_reg_46);
6457 
6458             m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_80, m_temp_reg_82);
6459             m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_80, m_temp_reg_82);
6460 
6461             m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_84, m_temp_reg_86);
6462             m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_84, m_temp_reg_86);
6463 
6464             m_temp_reg_90 = _mm_unpacklo_epi32(m_temp_reg_43, m_temp_reg_41);
6465             m_temp_reg_91 = _mm_unpackhi_epi32(m_temp_reg_43, m_temp_reg_41);
6466 
6467             m_temp_reg_92 = _mm_unpacklo_epi32(m_temp_reg_47, m_temp_reg_45);
6468             m_temp_reg_93 = _mm_unpackhi_epi32(m_temp_reg_47, m_temp_reg_45);
6469 
6470             m_temp_reg_94 = _mm_unpacklo_epi32(m_temp_reg_83, m_temp_reg_81);
6471             m_temp_reg_95 = _mm_unpackhi_epi32(m_temp_reg_83, m_temp_reg_81);
6472 
6473             m_temp_reg_96 = _mm_unpacklo_epi32(m_temp_reg_87, m_temp_reg_85);
6474             m_temp_reg_97 = _mm_unpackhi_epi32(m_temp_reg_87, m_temp_reg_85);
6475 
6476 
6477             m_temp_reg_30 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_2);       // row0 = 0-7
6478             m_temp_reg_31 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_2);       // row1 = 0-7
6479 
6480             m_temp_reg_32 = _mm_unpacklo_epi64(m_temp_reg_92, m_temp_reg_90);     // row0=24-31
6481             m_temp_reg_33 = _mm_unpackhi_epi64(m_temp_reg_92, m_temp_reg_90);     // row1=24-31
6482 
6483             m_temp_reg_34 = _mm_unpacklo_epi64(m_temp_reg_4, m_temp_reg_6);       // row0=8-15
6484             m_temp_reg_35 = _mm_unpackhi_epi64(m_temp_reg_4, m_temp_reg_6);       // row1=8-15
6485 
6486             m_temp_reg_36 = _mm_unpacklo_epi64(m_temp_reg_96, m_temp_reg_94);     // row0=16-23
6487             m_temp_reg_37 = _mm_unpackhi_epi64(m_temp_reg_96, m_temp_reg_94);     // row1=16-23
6488 
6489             m_temp_reg_80 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_3);      // row2 =0-7
6490             m_temp_reg_81 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_3);      // row3 =0-7
6491 
6492             m_temp_reg_82 = _mm_unpacklo_epi64(m_temp_reg_93, m_temp_reg_91);    // row2=24-31
6493             m_temp_reg_83 = _mm_unpackhi_epi64(m_temp_reg_93, m_temp_reg_91);    // row3=24-31
6494 
6495             m_temp_reg_84 = _mm_unpacklo_epi64(m_temp_reg_5, m_temp_reg_7);      // row2=8-15
6496             m_temp_reg_85 = _mm_unpackhi_epi64(m_temp_reg_5, m_temp_reg_7);      // row3=8-15
6497 
6498             m_temp_reg_86 = _mm_unpacklo_epi64(m_temp_reg_97, m_temp_reg_95);    // row2=16-23
6499             m_temp_reg_87 = _mm_unpackhi_epi64(m_temp_reg_97, m_temp_reg_95);    // row3=16-23
6500 
6501             m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred);
6502 
6503             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
6504 
6505             m_temp_reg_40 = _mm_add_epi16(m_temp_reg_30, m_temp_reg_0);
6506             m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
6507 
6508             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
6509 
6510             m_temp_reg_44 = _mm_add_epi16(m_temp_reg_34, m_temp_reg_0);
6511             m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
6512 
6513             _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
6514 
6515             m_temp_reg_20 = _mm_loadu_si128((__m128i *)(pu1_pred + 16));
6516 
6517             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
6518 
6519             m_temp_reg_40 = _mm_add_epi16(m_temp_reg_36, m_temp_reg_0);
6520             m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
6521 
6522             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
6523 
6524             m_temp_reg_44 = _mm_add_epi16(m_temp_reg_32, m_temp_reg_0);
6525             m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
6526 
6527             _mm_storeu_si128((__m128i *)(pu1_dst + 16), m_temp_reg_20);
6528             pu1_dst += out_stride;
6529             pu1_pred += pred_strd;
6530 
6531 
6532             m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred);
6533 
6534             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
6535 
6536             m_temp_reg_40 = _mm_add_epi16(m_temp_reg_31, m_temp_reg_0);
6537             m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
6538 
6539             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
6540 
6541             m_temp_reg_44 = _mm_add_epi16(m_temp_reg_35, m_temp_reg_0);
6542             m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
6543 
6544             _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
6545 
6546             m_temp_reg_20 = _mm_loadu_si128((__m128i *)(pu1_pred + 16));
6547 
6548             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
6549 
6550             m_temp_reg_40 = _mm_add_epi16(m_temp_reg_37, m_temp_reg_0);
6551             m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
6552 
6553             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
6554 
6555             m_temp_reg_44 = _mm_add_epi16(m_temp_reg_33, m_temp_reg_0);
6556             m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
6557 
6558             _mm_storeu_si128((__m128i *)(pu1_dst + 16), m_temp_reg_20);
6559             pu1_dst += out_stride;
6560             pu1_pred += pred_strd;
6561 
6562             m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred);
6563 
6564             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
6565 
6566             m_temp_reg_40 = _mm_add_epi16(m_temp_reg_80, m_temp_reg_0);
6567             m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
6568 
6569             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
6570 
6571             m_temp_reg_44 = _mm_add_epi16(m_temp_reg_84, m_temp_reg_0);
6572             m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
6573 
6574             _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
6575 
6576             m_temp_reg_20 = _mm_loadu_si128((__m128i *)(pu1_pred + 16));
6577 
6578             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
6579 
6580             m_temp_reg_40 = _mm_add_epi16(m_temp_reg_86, m_temp_reg_0);
6581             m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
6582 
6583             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
6584 
6585             m_temp_reg_44 = _mm_add_epi16(m_temp_reg_82, m_temp_reg_0);
6586             m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
6587 
6588             _mm_storeu_si128((__m128i *)(pu1_dst + 16), m_temp_reg_20);
6589             pu1_dst += out_stride;
6590             pu1_pred += pred_strd;
6591 
6592 
6593             m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred);
6594 
6595             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
6596 
6597             m_temp_reg_40 = _mm_add_epi16(m_temp_reg_81, m_temp_reg_0);
6598             m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
6599 
6600             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
6601 
6602             m_temp_reg_44 = _mm_add_epi16(m_temp_reg_85, m_temp_reg_0);
6603             m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
6604 
6605             _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
6606 
6607             m_temp_reg_20 = _mm_loadu_si128((__m128i *)(pu1_pred + 16));
6608 
6609             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
6610 
6611             m_temp_reg_40 = _mm_add_epi16(m_temp_reg_87, m_temp_reg_0);
6612             m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
6613 
6614             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
6615 
6616             m_temp_reg_44 = _mm_add_epi16(m_temp_reg_83, m_temp_reg_0);
6617             m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
6618 
6619             _mm_storeu_si128((__m128i *)(pu1_dst + 16), m_temp_reg_20);
6620             pu1_dst += out_stride;
6621             pu1_pred += pred_strd;
6622 
6623         }
6624         pi2_tmp += 4;
6625     }
6626 }
6627 
6628 
6629