1 /******************************************************************************
2 *
3 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at:
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 *
17 ******************************************************************************/
18 /**
19  *******************************************************************************
20  * @file
21  *  ihevc_32x32_itrans_recon_x86_intr.c
22  *
23  * @brief
24  *  Contains function definitions for inverse quantization, inverse
25  * transform and reconstruction
26  *
27  * @author
28  *  100470
29  *
30  * @par List of Functions:
31  *  - ihevc_itrans_recon_32x32_sse42()
32  *
33  * @remarks
34  *  None
35  *
36  *******************************************************************************
37  */
38 #include <stdio.h>
39 #include <string.h>
40 #include "ihevc_typedefs.h"
41 #include "ihevc_platform_macros.h"
42 #include "ihevc_macros.h"
43 #include "ihevc_defs.h"
44 #include "ihevc_trans_tables.h"
45 #include "ihevc_iquant_itrans_recon.h"
46 #include "ihevc_func_selector.h"
47 #include "ihevc_trans_macros.h"
48 
49 #include <emmintrin.h>
50 #include <smmintrin.h>
51 #include <tmmintrin.h>
52 
53 /**
54  *******************************************************************************
55  *
56  * @brief
57  *  This function performs inverse quantization, inverse transform and
58  * reconstruction for 32x32 input block
59  *
60  * @par Description:
61  *  Performs inverse quantization, inverse transform and adds the
62  * prediction data and clips output to 8 bit
63  *
64  * @param[in] pi2_src
65  *  Input 32x32 coefficients
66  *
67  * @param[in] pi2_tmp
68  *  Temporary buffer for storing scratch data and the inverse
69  *  transform 1st stage output of the 32x32 block
70  *
71  * @param[in] pu1_pred
72  *  Prediction 32x32 block
73  *
74  * @param[in] pi2_dequant_coeff
75  *  Dequant Coeffs
76  *
77  * @param[out] pu1_dst
78  *  Output 32x32 block
79  *
80  * @param[in] qp_div
81  *  Quantization parameter / 6
82  *
83  * @param[in] qp_rem
84  *  Quantization parameter % 6
85  *
86  * @param[in] src_strd
87  *  Input stride
88  *
89  * @param[in] pred_strd
90  *  Prediction stride
91  *
92  * @param[in] dst_strd
93  *  Output Stride
94  *
95  * @param[in] zero_cols
96  *  Zero columns in pi2_src
97  *
98  * @returns  Void
99  *
100  * @remarks
101  *  None
102  *
103  *******************************************************************************
104  */
105 /**/
106 
ihevc_itrans_recon_32x32_sse42(WORD16 * pi2_src,WORD16 * pi2_tmp,UWORD8 * pu1_pred,UWORD8 * pu1_dst,WORD32 src_strd,WORD32 pred_strd,WORD32 dst_strd,WORD32 zero_cols,WORD32 zero_rows)107 void ihevc_itrans_recon_32x32_sse42(WORD16 *pi2_src,
108                                     WORD16 *pi2_tmp,
109                                     UWORD8 *pu1_pred,
110                                     UWORD8 *pu1_dst,
111                                     WORD32 src_strd,
112                                     WORD32 pred_strd,
113                                     WORD32 dst_strd,
114                                     WORD32 zero_cols,
115                                     WORD32 zero_rows)
116 {
117     /* Inverse Transform */
118 
119     WORD32 j;
120 
121 
122     WORD16 *pi2_tmp_orig;
123 
124 
125     WORD16 *o_temp_ptr;
126     WORD16 *temp_ptr;
127 
128     __m128i m_temp_reg_0;
129     __m128i m_temp_reg_1;
130     __m128i m_temp_reg_2;
131     __m128i m_temp_reg_3;
132     __m128i m_temp_reg_4;
133     __m128i m_temp_reg_5;
134     __m128i m_temp_reg_6;
135     __m128i m_temp_reg_7;
136     __m128i m_temp_reg_10;
137     __m128i m_temp_reg_11;
138     __m128i m_temp_reg_12;
139     __m128i m_temp_reg_13;
140     __m128i m_temp_reg_14;
141     __m128i m_temp_reg_15;
142     __m128i m_temp_reg_16;
143     __m128i m_temp_reg_17;
144     __m128i m_temp_reg_18;
145     __m128i m_temp_reg_19;
146     __m128i m_temp_reg_20;
147     __m128i m_temp_reg_21;
148     __m128i m_temp_reg_22;
149     __m128i m_temp_reg_23;
150     __m128i m_temp_reg_30;
151     __m128i m_temp_reg_31;
152     __m128i m_temp_reg_32;
153     __m128i m_temp_reg_33;
154     __m128i m_temp_reg_34;
155     __m128i m_temp_reg_35;
156     __m128i m_temp_reg_36;
157     __m128i m_temp_reg_37;
158     __m128i m_temp_reg_40;
159     __m128i m_temp_reg_41;
160     __m128i m_temp_reg_42;
161     __m128i m_temp_reg_43;
162     __m128i m_temp_reg_44;
163     __m128i m_temp_reg_45;
164     __m128i m_temp_reg_46;
165     __m128i m_temp_reg_47;
166 
167     __m128i m_temp_reg_70;
168     __m128i m_temp_reg_71;
169     __m128i m_temp_reg_72;
170     __m128i m_temp_reg_73;
171     __m128i m_temp_reg_74;
172     __m128i m_temp_reg_75;
173     __m128i m_temp_reg_76;
174     __m128i m_temp_reg_77;
175 
176     __m128i m_temp_reg_80;
177     __m128i m_temp_reg_81;
178     __m128i m_temp_reg_82;
179     __m128i m_temp_reg_83;
180     __m128i m_temp_reg_84;
181     __m128i m_temp_reg_85;
182     __m128i m_temp_reg_86;
183     __m128i m_temp_reg_87;
184 
185     __m128i m_temp_reg_90;
186     __m128i m_temp_reg_91;
187     __m128i m_temp_reg_92;
188     __m128i m_temp_reg_93;
189     __m128i m_temp_reg_94;
190     __m128i m_temp_reg_95;
191     __m128i m_temp_reg_96;
192     __m128i m_temp_reg_97;
193 
194     __m128i m_rdng_factor;
195     __m128i m_count;
196     __m128i m_coeff1, m_coeff2, m_coeff3, m_coeff4;
197     __m128i m_coeff5, m_coeff6, m_coeff7, m_coeff8;
198 
199     __m128i temp1, temp2, temp3, temp4;
200     __m128i temp5, temp6, temp7, temp8;
201 
202     __m128i all_zero_reg;
203     WORD32 i;
204 
205     /*Lokesh*/
206     WORD32  zero_last24_cols_stg1;
207     WORD32  zero_last24_rows_stg1;
208     WORD32  zero_last28_rows_stg1;
209 
210     WORD32  zero_last28_rows_stg2;
211     WORD32  zero_last24_rows_stg2;
212 
213     WORD32  trans_size_stg1;
214 
215     WORD32 i4_shift = IT_SHIFT_STAGE_1;
216     WORD32 trans_size = TRANS_SIZE_32;
217 
218 
219     /* Last 24 (or last 28) cols/rows of the 32x32 block are skipped based on the below flags : Lokesh */
220     zero_last24_cols_stg1 = ((zero_cols & 0xFFFFFF00) == 0xFFFFFF00) ? 1 : 0;
221     zero_last24_rows_stg1 = ((zero_rows & 0xFFFFFF00) == 0xFFFFFF00) ? 1 : 0;
222     zero_last28_rows_stg1 = ((zero_rows & 0xFFFFFFF0) == 0xFFFFFFF0) ? 1 : 0;
223 
224     zero_last28_rows_stg2 = ((zero_cols & 0xFFFFFFF0) == 0xFFFFFFF0) ? 1 : 0;
225     zero_last24_rows_stg2 = zero_last24_cols_stg1;
226 
227     if((zero_last28_rows_stg2) || (zero_last24_cols_stg1))
228     {
229         trans_size_stg1 = 8;
230 
231     }
232     else
233     {
234         trans_size_stg1 = 32;
235     }
236 
237     all_zero_reg = _mm_setzero_si128();
238 
239     o_temp_ptr  = pi2_tmp;
240     temp_ptr = (pi2_tmp + 1024);
241 
242     pi2_tmp += 2048;
243     pi2_tmp_orig = pi2_tmp;
244 
245     for(i = 0; i < trans_size_stg1; i += 8)
246     {
247 
248         {
249             WORD16 *pi2_tmp_src = pi2_src;
250 
251             m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
252             pi2_tmp_src += (src_strd << 1);
253             m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
254             pi2_tmp_src += (src_strd << 1);
255             m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
256             pi2_tmp_src += (src_strd << 1);
257             m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
258             pi2_tmp_src += (src_strd << 1);
259             m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
260             pi2_tmp_src += (src_strd << 1);
261             m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
262             pi2_tmp_src += (src_strd << 1);
263             m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
264             pi2_tmp_src += (src_strd << 1);
265             m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
266             pi2_tmp_src += (src_strd << 1);
267 
268             m_temp_reg_80 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
269             pi2_tmp_src += (src_strd << 1);
270             m_temp_reg_81 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
271             pi2_tmp_src += (src_strd << 1);
272             m_temp_reg_82 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
273             pi2_tmp_src += (src_strd << 1);
274             m_temp_reg_83 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
275             pi2_tmp_src += (src_strd << 1);
276             m_temp_reg_84 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
277             pi2_tmp_src += (src_strd << 1);
278             m_temp_reg_85 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
279             pi2_tmp_src += (src_strd << 1);
280             m_temp_reg_86 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
281             pi2_tmp_src += (src_strd << 1);
282             m_temp_reg_87 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
283         }
284 
285         if(zero_last28_rows_stg1)
286         {
287             /* eeo */
288             /* eeeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
289             /* eeeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
290             {
291                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64
292 
293                 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
294 
295                 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
296 
297 /* eeeo[0]= m_temp_reg_20  */
298 /* eeeo[1]= m_temp_reg_21  */
299 /* eeee[0]= m_temp_reg_22  */
300 /* eeee[1]= m_temp_reg_23  */
301 
302                 /* eee[0] = eeee[0] + eeeo[0]; */
303                 m_temp_reg_40 = m_temp_reg_14;
304 
305                 /* eee[3] = eeee[0] - eeeo[0]; */
306                 m_temp_reg_43 = m_temp_reg_14;
307 
308                 /* eee[2] = eeee[1] - eeeo[1]; */
309                 m_temp_reg_42 = m_temp_reg_14; //m_temp_reg_16;
310 
311                 /* eee[1] = eeee[1] + eeeo[1];*/
312                 m_temp_reg_41 = m_temp_reg_14; //m_temp_reg_16;
313 
314                 m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
315 
316                 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
317 
318                 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
319 
320 /* eeeo[0]= m_temp_reg_20  */
321 /* eeeo[1]= m_temp_reg_21  */
322 /* eeee[0]= m_temp_reg_22  */
323 /* eeee[1]= m_temp_reg_23  */
324 
325                 /* eee[0] = eeee[0] + eeeo[0]; */
326                 m_temp_reg_44 = m_temp_reg_14;
327 
328                 /* eee[3] = eeee[0] - eeeo[0]; */
329                 m_temp_reg_47 = m_temp_reg_14;
330 
331                 /* eee[2] = eeee[1] - eeeo[1]; */
332                 m_temp_reg_46 = m_temp_reg_14; //m_temp_reg_16;
333 
334                 /* eee[1] = eeee[1] + eeeo[1];*/
335                 m_temp_reg_45 = m_temp_reg_14; //m_temp_reg_16;
336 
337 
338             }
339             /* eo */
340             {
341                 WORD16 *pi2_scratch = o_temp_ptr;
342 
343                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90
344                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87
345                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80
346                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70
347                 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57
348                 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43
349                 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25
350                 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //9
351 
352                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, all_zero_reg);
353 
354                 m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
355 
356                 /* eo0[0-3] */
357                 {
358                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
359 
360                     m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_71, all_zero_reg);
361 
362                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
363                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
364 
365                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
366                     pi2_scratch += 8;
367                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
368                     pi2_scratch += 8;
369 
370                 }
371 
372                 /* eo0[4-7] */
373                 {
374                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
375 
376                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
377                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
378 
379                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
380                     pi2_scratch += 8;
381                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
382                     pi2_scratch += 8;
383 
384                 }
385                 /* eo1[0-3] */
386                 {
387                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff2);
388 
389                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30);
390                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30);
391 
392                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
393                     pi2_scratch += 8;
394                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
395                     pi2_scratch += 8;
396 
397                 }
398 
399                 /* eo1[4-7] */
400                 {
401                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff2);
402 
403                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30);
404                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30);
405 
406                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
407                     pi2_scratch += 8;
408                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
409                     pi2_scratch += 8;
410 
411                 }
412 
413                 /* eo2[0-3] */
414                 {
415                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
416 
417                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
418                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
419 
420                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
421                     pi2_scratch += 8;
422                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
423                     pi2_scratch += 8;
424 
425                 }
426 
427                 /* eo2[4-7] */
428                 {
429                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
430 
431                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
432                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
433 
434                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
435                     pi2_scratch += 8;
436                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
437                     pi2_scratch += 8;
438 
439                 }
440 
441                 /**************************************************************************/
442 
443 
444                 /* eo3[0-3] */
445                 {
446                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff4);
447 
448                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30);
449                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30);
450 
451                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
452                     pi2_scratch += 8;
453                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
454                     pi2_scratch += 8;
455 
456                 }
457 
458                 /* eo3[4-7] */
459                 {
460                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff4);
461 
462                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30);
463                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30);
464 
465                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
466                     pi2_scratch += 8;
467                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
468                     pi2_scratch += 8;
469 
470                 }
471 
472 
473                 /* eo4[0-3] */
474                 {
475                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
476 
477                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30);
478                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30);
479 
480                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
481                     pi2_scratch += 8;
482                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
483                     pi2_scratch += 8;
484 
485                 }
486                 /* eo4[4-7] */
487                 {
488                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
489 
490                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30);
491                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30);
492 
493                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
494                     pi2_scratch += 8;
495                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
496                     pi2_scratch += 8;
497 
498                 }
499 
500                 /***********************************************************************/
501 
502                 /* eo5[0-3] */
503                 {
504                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff6);
505 
506                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
507                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
508 
509                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
510                     pi2_scratch += 8;
511                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
512                     pi2_scratch += 8;
513 
514                 }
515 
516 
517                 /* eo5[4-7] */
518                 {
519                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff6);
520 
521                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
522                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
523 
524                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
525                     pi2_scratch += 8;
526                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
527                     pi2_scratch += 8;
528 
529                 }
530 
531                 /* eo6[0-3] */
532                 {
533                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff7);
534 
535                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30);
536                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30);
537 
538                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
539                     pi2_scratch += 8;
540                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
541                     pi2_scratch += 8;
542 
543                 }
544 
545 
546                 /* eo6[4-7] */
547                 {
548                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff7);
549 
550                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30);
551                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30);
552 
553                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
554                     pi2_scratch += 8;
555                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
556                     pi2_scratch += 8;
557 
558                 }
559 
560 
561                 /* eo7[0-3] */
562                 {
563                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff8);
564 
565                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
566                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
567 
568                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
569                     pi2_scratch += 8;
570                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
571                     pi2_scratch += 8;
572 
573                 }
574 
575 
576                 /* eo7[4-7] */
577                 {
578                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff8);
579 
580                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
581                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
582 
583                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
584                     pi2_scratch += 8;
585                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
586                     pi2_scratch += 8;
587 
588                 }
589 
590             }
591         }
592         else if(zero_last24_rows_stg1)
593         {
594             {
595                 /* eeo */
596                 /* eeeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */
597                 /* eeeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */
598 
599                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83 36
600                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36 -83
601 
602                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64 64
603 
604                 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
605 
606                 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
607 
608                 /* eeeo[0]= m_temp_reg_20  */
609                 /* eeeo[1]= m_temp_reg_21  */
610                 /* eeee[0]= m_temp_reg_22  */
611                 /* eeee[1]= m_temp_reg_23  */
612 
613                 /* eee[0] = eeee[0] + eeeo[0]; */
614                 m_temp_reg_40 = m_temp_reg_14;
615 
616                 /* eee[3] = eeee[0] - eeeo[0]; */
617                 m_temp_reg_43 = m_temp_reg_14;
618 
619                 /* eee[2] = eeee[1] - eeeo[1]; */
620                 m_temp_reg_42 = m_temp_reg_14; //m_temp_reg_16;
621 
622                 /* eee[1] = eeee[1] + eeeo[1];*/
623                 m_temp_reg_41 = m_temp_reg_14; //m_temp_reg_16;
624 
625                 /* for row 4 to 7 */
626 
627                 m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
628 
629                 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
630 
631                 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
632 
633                 /* eeeo[0]= m_temp_reg_20  */
634                 /* eeeo[1]= m_temp_reg_21  */
635                 /* eeee[0]= m_temp_reg_22  */
636                 /* eeee[1]= m_temp_reg_23  */
637 
638                 /* eee[0] = eeee[0] + eeeo[0]; */
639                 m_temp_reg_44 = m_temp_reg_14;
640 
641                 /* eee[3] = eeee[0] - eeeo[0]; */
642                 m_temp_reg_47 = m_temp_reg_14;
643 
644                 /* eee[2] = eeee[1] - eeeo[1]; */
645                 m_temp_reg_46 = m_temp_reg_14; //m_temp_reg_16;
646 
647                 /* eee[1] = eeee[1] + eeeo[1];*/
648                 m_temp_reg_45 = m_temp_reg_14; //m_temp_reg_16;
649 
650 
651                 // eeo[]
652                 /* for(k = 0; k < 4; k++) */
653 
654                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75
655                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75
656                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18
657                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[5][0]); //50 18
658 
659                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, all_zero_reg);
660 
661                 m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8);
662 
663                 m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_72, all_zero_reg);
664 
665                 m_temp_reg_33 = _mm_setzero_si128();
666 
667                 /* eeo */
668                 {
669                     /* eeo0[0-3] */
670                     {
671                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
672 
673                         m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
674                         m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
675 
676                         m_temp_reg_90 = m_temp_reg_34;
677                         m_temp_reg_97 = m_temp_reg_35;
678                     }
679                     /* eeo0[4-7] */
680                     {
681                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
682 
683                         m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
684                         m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
685 
686                         m_temp_reg_91 = m_temp_reg_34;
687                         m_temp_reg_96 = m_temp_reg_35;
688 
689                     }
690 
691                     /* eeo1[0-3] */
692                     {
693                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff2);
694 
695                         /* e[1][0-3] stored in pi2_tmp[2][0-7] */
696                         /* e[6][0-3] stored in pi2_tmp[2][8-15] */
697                         m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30);
698                         m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30);
699 
700                         m_temp_reg_92 = m_temp_reg_34;
701                         m_temp_reg_95 = m_temp_reg_35;
702 
703                     }
704 
705                     /* eo1[4-7] */
706                     {
707                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff2);
708 
709                         /* e[1][4-7] stored in pi2_tmp[3][0-7] */
710                         /* e[6][4-7] stored in pi2_tmp[3][8-15] */
711                         m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30);
712                         m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30);
713 
714                         m_temp_reg_93 = m_temp_reg_34;
715                         m_temp_reg_94 = m_temp_reg_35;
716 
717 
718                     }
719 
720                     /* eo2[0-3] */
721                     {
722                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff4);
723 
724                         /* e[2][0-3] stored in pi2_tmp[4][0-7] */
725                         /* e[5][0-3] stored in pi2_tmp[4][8-15] */
726                         m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
727                         m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
728 
729                         temp1 = m_temp_reg_34;
730                         temp7 = m_temp_reg_35;
731 
732                     }
733 
734                     /* eo2[4-7] */
735                     {
736                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff4);
737 
738                         /* e[2][4-7] stored in pi2_tmp[5][0-7] */
739                         /* e[5][4-7] stored in pi2_tmp[5][8-15] */
740                         m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
741                         m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
742 
743                         temp2 = m_temp_reg_34;
744                         temp6 = m_temp_reg_35;
745 
746                     }
747 
748                     /* eo3[0-3] */
749                     {
750                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
751 
752                         /* e[3][0-3] stored in pi2_tmp[6][0-7] */
753                         /* e[4][0-3] stored in pi2_tmp[6][8-15] */
754                         m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30);
755                         m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30);
756 
757                         temp3 = m_temp_reg_34;
758                         temp5 = m_temp_reg_35;
759 
760                     }
761 
762 
763                     /* eo3[4-7] */
764                     {
765                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
766 
767                         /* e[3][4-7] stored in pi2_tmp[7][0-7] */
768                         /* e[4][4-7] stored in pi2_tmp[7][8-15] */
769                         m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30);
770                         m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30);
771 
772                         temp4 = m_temp_reg_34;
773                         temp8 = m_temp_reg_35;
774 
775 
776                     }
777                     /* All values of ee[] array in pi2_temp */
778 
779                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87
780                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[1][0]); //80 70
781                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57 43
782                     m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25 9
783 
784                     m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
785 
786                     m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
787                     m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8);
788 
789                 }
790             }
791             /* eo */
792             {
793 
794                 WORD16 *pi2_scratch = o_temp_ptr;
795 
796                 /* eo0[0-3] */
797                 {
798                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
799 
800                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_30);
801                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_30);
802 
803                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
804                     pi2_scratch += 8;
805                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
806                     pi2_scratch += 8;
807 
808                 }
809 
810 
811                 /* eo0[4-7] */
812                 {
813                     m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
814 
815                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
816 
817                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_30);
818                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_30);
819 
820                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
821                     pi2_scratch += 8;
822                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
823                     pi2_scratch += 8;
824 
825                 }
826 
827                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87  57
828 
829                 /* eo1[0-3] */
830                 {
831                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
832 
833                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_30);
834                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_30);
835 
836                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
837                     pi2_scratch += 8;
838                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
839                     pi2_scratch += 8;
840 
841                 }
842 
843 
844                 /* eo1[4-7] */
845                 {
846                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
847 
848                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_30);
849                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_30);
850 
851                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
852                     pi2_scratch += 8;
853                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
854                     pi2_scratch += 8;
855 
856                 }
857 
858                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80  9
859 
860                 /* eo2[0-3] */
861                 {
862 
863                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
864 
865                     m_temp_reg_34 = _mm_add_epi32(temp1, m_temp_reg_30);
866                     m_temp_reg_35 = _mm_sub_epi32(temp1, m_temp_reg_30);
867 
868                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
869                     pi2_scratch += 8;
870                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
871                     pi2_scratch += 8;
872 
873                 }
874 
875                 /* eo2[4-7] */
876                 {
877 
878                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
879 
880                     m_temp_reg_34 = _mm_add_epi32(temp2, m_temp_reg_30);
881                     m_temp_reg_35 = _mm_sub_epi32(temp2, m_temp_reg_30);
882 
883                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
884                     pi2_scratch += 8;
885                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
886                     pi2_scratch += 8;
887 
888                 }
889 
890                 /**************************************************************************/
891 
892 
893 
894                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70  -43
895 
896                 /* eo3[0-3] */
897                 {
898 
899                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
900 
901                     m_temp_reg_34 = _mm_add_epi32(temp3, m_temp_reg_30);
902                     m_temp_reg_35 = _mm_sub_epi32(temp3, m_temp_reg_30);
903 
904                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
905                     pi2_scratch += 8;
906                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
907                     pi2_scratch += 8;
908 
909                 }
910 
911 
912                 /* eo3[4-7] */
913                 {
914 
915                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
916 
917                     m_temp_reg_34 = _mm_add_epi32(temp4, m_temp_reg_30);
918                     m_temp_reg_35 = _mm_sub_epi32(temp4, m_temp_reg_30);
919 
920                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
921                     pi2_scratch += 8;
922                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
923                     pi2_scratch += 8;
924 
925                 }
926 
927                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57  -80
928 
929                 /* eo4[0-3] */
930                 {
931                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
932 
933                     m_temp_reg_34 = _mm_add_epi32(temp5, m_temp_reg_30);
934                     m_temp_reg_35 = _mm_sub_epi32(temp5, m_temp_reg_30);
935 
936                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
937                     pi2_scratch += 8;
938                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
939                     pi2_scratch += 8;
940 
941                 }
942                 /* eo4[4-7] */
943                 {
944                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
945 
946                     m_temp_reg_34 = _mm_add_epi32(temp8, m_temp_reg_30);
947                     m_temp_reg_35 = _mm_sub_epi32(temp8, m_temp_reg_30);
948 
949                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
950                     pi2_scratch += 8;
951                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
952                     pi2_scratch += 8;
953 
954                 }
955 
956                 /***********************************************************************/
957 
958                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43  -90
959 
960                 /* eo5[0-3] */
961                 {
962 
963                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
964 
965                     m_temp_reg_34 = _mm_add_epi32(temp7, m_temp_reg_30);
966                     m_temp_reg_35 = _mm_sub_epi32(temp7, m_temp_reg_30);
967 
968                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
969                     pi2_scratch += 8;
970                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
971                     pi2_scratch += 8;
972 
973                 }
974 
975 
976                 /* eo5[4-7] */
977                 {
978                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
979 
980                     m_temp_reg_34 = _mm_add_epi32(temp6, m_temp_reg_30);
981                     m_temp_reg_35 = _mm_sub_epi32(temp6, m_temp_reg_30);
982 
983                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
984                     pi2_scratch += 8;
985                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
986                     pi2_scratch += 8;
987 
988                 }
989 
990                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25  -70
991 
992                 /* eo6[0-3] */
993                 {
994                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
995 
996                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_30);
997                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_30);
998 
999                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1000                     pi2_scratch += 8;
1001                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1002                     pi2_scratch += 8;
1003 
1004                 }
1005 
1006 
1007                 /* eo6[4-7] */
1008                 {
1009 
1010                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1011 
1012                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_30);
1013                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_30);
1014 
1015                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1016                     pi2_scratch += 8;
1017                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1018                     pi2_scratch += 8;
1019 
1020                 }
1021 
1022                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); //9  -25
1023 
1024                 /* eo7[0-3] */
1025                 {
1026 
1027                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1028 
1029                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_30);
1030                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_30);
1031 
1032                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1033                     pi2_scratch += 8;
1034                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1035                     pi2_scratch += 8;
1036 
1037                 }
1038 
1039 
1040                 /* eo7[4-7] */
1041                 {
1042                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1043 
1044                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_30);
1045                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_30);
1046 
1047                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1048                     pi2_scratch += 8;
1049                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1050                     pi2_scratch += 8;
1051 
1052                 }
1053 
1054             }
1055 
1056         }
1057         else
1058         {
1059 
1060             {
1061                 /* eeo */
                /* eeeo[0] is computed in m_temp_reg_20, eeee[0] in m_temp_reg_21 */
                /* eeeo[1] is computed in m_temp_reg_22, eeee[1] in m_temp_reg_23 */
1064 
1065                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83 36
1066                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36 -83
1067 
1068                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64 64
1069                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[3][0]); //64 -64
1070 
1071                 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_84);
1072 
1073                 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_80);
1074 
1075                 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);  /* eeeo[0] */
1076                 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);  /* eeeo[1] */
1077 
1078                 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);  /* eeee[0] */
1079                 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff4);  /* eeee[1] */
1080 
1081 
                /* eeeo[0]= m_temp_reg_20  */
                /* eeee[0]= m_temp_reg_21  */
                /* eeeo[1]= m_temp_reg_22  */
                /* eeee[1]= m_temp_reg_23  */
1086 
1087                 /* eee[0] = eeee[0] + eeeo[0]; */
1088                 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);  /* eeeo[0] */
1089 
1090                 /* eee[3] = eeee[0] - eeeo[0]; */
1091                 m_temp_reg_43 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);  /* eeeo[1] */
1092 
1093                 /* eee[2] = eeee[1] - eeeo[1]; */
1094                 m_temp_reg_42 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_22);  /* eeee[1] */
1095 
1096                 /* eee[1] = eeee[1] + eeeo[1];*/
1097                 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_22);  /* eeee[0] */
1098 
1099                 /* for row 4 to 7 */
1100 
1101                 m_temp_reg_74 = _mm_srli_si128(m_temp_reg_74, 8);
1102                 m_temp_reg_84 = _mm_srli_si128(m_temp_reg_84, 8);
1103 
1104                 /* Interleaving row 8 and row 24*/
1105                 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_84);
1106 
1107                 m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
1108                 m_temp_reg_80 = _mm_srli_si128(m_temp_reg_80, 8);
1109 
1110                 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_80);
1111 
1112                 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);  /* eeeo[0] */
1113                 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);  /* eeeo[1] */
1114 
1115                 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);  /* eeee[0] */
1116                 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff4);  /* eeee[1] */
1117 
1118 
                /* eeeo[0]= m_temp_reg_20  */
                /* eeee[0]= m_temp_reg_21  */
                /* eeeo[1]= m_temp_reg_22  */
                /* eeee[1]= m_temp_reg_23  */
1123 
1124                 /* eee[0] = eeee[0] + eeeo[0]; */
1125                 m_temp_reg_44 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);  /* eeeo[0] */
1126 
1127                 /* eee[3] = eeee[0] - eeeo[0]; */
1128                 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);  /* eeeo[1] */
1129 
1130                 /* eee[2] = eeee[1] - eeeo[1]; */
1131                 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_22);  /* eeee[1] */
1132 
1133                 /* eee[1] = eeee[1] + eeeo[1];*/
1134                 m_temp_reg_45 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_22);  /* eeee[0] */
1135 
1136 
1137                 // eeo[]
1138                 /* for(k = 0; k < 4; k++) */
1139 
1140                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75
1141                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[5][0]); //50 18
1142 
1143                 /* eeo */
1144                 {
1145                     /* eeo0[0-3] */
1146                     {
1147                         m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
1148                         m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_86);
1149 
1150                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1151                         m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1152 
1153                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1154 
1155                         m_temp_reg_90 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
1156                         m_temp_reg_97 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
1157 
1158                     }
1159 
1160                     m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8);
1161                     m_temp_reg_76 = _mm_srli_si128(m_temp_reg_76, 8);
1162                     m_temp_reg_82 = _mm_srli_si128(m_temp_reg_82, 8);
1163                     m_temp_reg_86 = _mm_srli_si128(m_temp_reg_86, 8);
1164 
1165                     /* eeo0[4-7] */
1166                     {
1167                         m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
1168                         m_temp_reg_15 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_86);
1169 
1170                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1171                         m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
1172 
1173                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1174 
1175                         m_temp_reg_91 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30);
1176                         m_temp_reg_96 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30);
1177 
1178                     }
1179 
1180 
1181                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75 -18
1182                     m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[7][0]); //89  50
1183 
1184                     /* eeo1[0-3] */
1185                     {
1186                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
1187                         m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
1188 
1189                         m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30);
1190                         m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30);
1191 
1192                         m_temp_reg_92 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_31);
1193                         m_temp_reg_95 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_31);
1194 
1195                     }
1196 
1197                     /* eeo1[4-7] */
1198                     {
1199 
1200                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
1201                         m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff4);
1202 
1203                         m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30);
1204                         m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30);
1205 
1206                         m_temp_reg_93 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_31);
1207                         m_temp_reg_94 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_31);
1208 
1209 
1210                     }
1211 
1212                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[8][0]); //50 -89
1213                     m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18  75
1214 
1215                     /* eeo2[0-3] */
1216                     {
1217 
1218                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
1219                         m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
1220 
                        /* ee[2][0-3] is kept in temp1 */
                        /* ee[5][0-3] is kept in temp7 */
1223 
1224                         m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30);
1225                         m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30);
1226 
1227                         temp1 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31);
1228                         temp7 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31);
1229 
1230                     }
1231 
1232                     /* eeo2[4-7] */
1233                     {
1234 
1235                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
1236                         m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff4);
1237 
                        /* ee[2][4-7] is kept in temp2 */
                        /* ee[5][4-7] is kept in temp6 */
1240 
1241                         m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30);
1242                         m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30);
1243 
1244                         temp2 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31);
1245                         temp6 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31);
1246 
1247                     }
1248 
1249                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[10][0]); //18 -50
1250                     m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[11][0]); //75  -89
1251 
1252                     /* eeo3[0-3] */
1253                     {
1254 
1255                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
1256                         m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
1257 
                        /* ee[3][0-3] is kept in temp3 */
                        /* ee[4][0-3] is kept in temp5 */
1260 
1261                         m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30);
1262                         m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30);
1263 
1264                         temp3 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31);
1265                         temp5 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31);
1266 
1267 
1268                     }
1269 
1270                     /* eeo3[4-7] */
1271                     {
1272 
1273                         m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3);
1274                         m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff4);
1275 
                        /* ee[3][4-7] is kept in temp4 */
                        /* ee[4][4-7] is kept in temp8 */
1278 
1279                         m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30);
1280                         m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30);
1281                         temp4 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31);
1282                         temp8 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31);
1283 
1284                     }
1285 
1286 
1287                     /* All values of ee[] array in pi2_temp */
1288 
1289                     /* for(k = 0; k < 8; k++) */
1290                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87
1291                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[1][0]); //80 70
1292                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57 43
1293                     m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25 9
1294                 }
1295             }
1296             /* eo */
1297             {
1298 
1299                 WORD16 *pi2_scratch = o_temp_ptr;
1300 
1301                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
1302                 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
1303                 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_81, m_temp_reg_83);
1304                 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_85, m_temp_reg_87);
1305 
1306                 m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
1307                 m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8);
1308                 m_temp_reg_75 = _mm_srli_si128(m_temp_reg_75, 8);
1309                 m_temp_reg_77 = _mm_srli_si128(m_temp_reg_77, 8);
1310 
1311                 m_temp_reg_81 = _mm_srli_si128(m_temp_reg_81, 8);
1312                 m_temp_reg_83 = _mm_srli_si128(m_temp_reg_83, 8);
1313                 m_temp_reg_85 = _mm_srli_si128(m_temp_reg_85, 8);
1314                 m_temp_reg_87 = _mm_srli_si128(m_temp_reg_87, 8);
1315 
1316                 /* eo0[0-3] */
1317                 {
1318                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1319                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1320 
1321                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1322 
1323                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
1324                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
1325 
1326                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
1327 
1328                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1329 
1330                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_30);
1331                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_30);
1332 
1333                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1334                     pi2_scratch += 8;
1335                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1336                     pi2_scratch += 8;
1337 
1338                 }
1339                 /* eo0[4-7] */
1340                 {
1341                     m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
1342                     m_temp_reg_15 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
1343                     m_temp_reg_16 = _mm_unpacklo_epi16(m_temp_reg_81, m_temp_reg_83);
1344                     m_temp_reg_17 = _mm_unpacklo_epi16(m_temp_reg_85, m_temp_reg_87);
1345 
1346                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1347                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
1348 
1349                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1350 
1351                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
1352                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
1353 
1354                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
1355 
1356                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1357 
1358                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_30);
1359                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_30);
1360 
1361                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1362                     pi2_scratch += 8;
1363                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1364                     pi2_scratch += 8;
1365 
1366                 }
1367 
1368                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87  57
1369                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //0  -43
1370                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80  90
1371                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70  25
1372 
1373                 /* eo1[0-3] */
1374                 {
1375 
1376                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1377                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1378 
1379                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1380 
1381                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
1382                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
1383 
1384                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
1385 
1386                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_32);
1387 
1388                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_30);
1389                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_30);
1390 
1391                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1392                     pi2_scratch += 8;
1393                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1394                     pi2_scratch += 8;
1395 
1396                 }
1397 
1398                 /* eo1[4-7] */
1399                 {
1400                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1401                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
1402 
1403                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1404 
1405                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
1406                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
1407 
1408                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
1409 
1410                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_32);
1411 
1412                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_30);
1413                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_30);
1414 
1415                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1416                     pi2_scratch += 8;
1417                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1418                     pi2_scratch += 8;
1419 
1420                 }
1421 
1422                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80  9
1423                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[9][0]); //70  87
1424                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[10][0]); //-25  57
1425                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[11][0]); //90  43
1426 
1427                 /* eo2[0-3] */
1428                 {
1429                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1430                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1431 
1432                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31);
1433 
1434                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
1435                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
1436 
1437                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
1438 
1439                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1440 
1441                     m_temp_reg_34 = _mm_add_epi32(temp1, m_temp_reg_30);
1442                     m_temp_reg_35 = _mm_sub_epi32(temp1, m_temp_reg_30);
1443 
1444                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1445                     pi2_scratch += 8;
1446                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1447                     pi2_scratch += 8;
1448 
1449                 }
1450 
1451 
1452                 /* eo2[4-7] */
1453                 {
1454 
1455                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1456                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
1457 
1458                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31);
1459 
1460                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
1461                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
1462 
1463                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
1464 
1465                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1466 
1467                     m_temp_reg_34 = _mm_add_epi32(temp2, m_temp_reg_30);
1468                     m_temp_reg_35 = _mm_sub_epi32(temp2, m_temp_reg_30);
1469 
1470                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1471                     pi2_scratch += 8;
1472                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1473                     pi2_scratch += 8;
1474 
1475                 }
1476                 /**************************************************************************/
1477 
1478                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70  -43
1479                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[13][0]); //-87  9
1480                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[14][0]); //90  25
1481                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[15][0]); //80  57
1482 
1483                 /* eo3[0-3] */
1484                 {
1485                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1486                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1487 
1488                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1489 
1490                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
1491                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
1492 
1493                     m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_32, m_temp_reg_33);
1494 
1495                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1496 
1497                     m_temp_reg_34 = _mm_add_epi32(temp3, m_temp_reg_30);
1498                     m_temp_reg_35 = _mm_sub_epi32(temp3, m_temp_reg_30);
1499 
1500                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1501                     pi2_scratch += 8;
1502                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1503                     pi2_scratch += 8;
1504 
1505                 }
1506 
1507 
1508                 /* eo3[4-7] */
1509                 {
1510                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1511                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
1512 
1513                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1514 
1515                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
1516                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
1517 
1518                     m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_32, m_temp_reg_33);
1519 
1520                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1521 
1522                     m_temp_reg_34 = _mm_add_epi32(temp4, m_temp_reg_30);
1523                     m_temp_reg_35 = _mm_sub_epi32(temp4, m_temp_reg_30);
1524 
1525                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1526                     pi2_scratch += 8;
1527                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1528                     pi2_scratch += 8;
1529 
1530                 }
1531 
1532                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57  -80
1533                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[17][0]); //-25  90
1534                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[18][0]); //9  87
1535                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43  70
1536 
1537                 /* eo4[0-3] */
1538                 {
1539 
1540                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1541                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1542 
1543                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1544 
1545                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
1546                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
1547 
1548                     m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32);
1549 
1550                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1551 
1552                     m_temp_reg_34 = _mm_add_epi32(temp5, m_temp_reg_30);
1553                     m_temp_reg_35 = _mm_sub_epi32(temp5, m_temp_reg_30);
1554 
1555                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1556                     pi2_scratch += 8;
1557                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1558                     pi2_scratch += 8;
1559 
1560                 }
1561 
1562 
1563                 /* eo4[4-7] */
1564                 {
1565                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1566                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
1567 
1568                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1569 
1570                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
1571                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
1572 
1573                     m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32);
1574 
1575                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1576 
1577                     m_temp_reg_34 = _mm_add_epi32(temp8, m_temp_reg_30);
1578                     m_temp_reg_35 = _mm_sub_epi32(temp8, m_temp_reg_30);
1579 
1580                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1581                     pi2_scratch += 8;
1582                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1583                     pi2_scratch += 8;
1584 
1585                 }
1586 
1587                 /***********************************************************************/
1588 
1589                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43  -90
1590                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[21][0]); //57  25
1591                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[22][0]); //-87  70
1592                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[23][0]); //9  -80
1593 
1594                 /* eo5[0-3] */
1595                 {
1596                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1597                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1598 
1599                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1600 
1601                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
1602                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
1603 
1604                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
1605 
1606                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1607 
1608                     m_temp_reg_34 = _mm_add_epi32(temp7, m_temp_reg_30);
1609                     m_temp_reg_35 = _mm_sub_epi32(temp7, m_temp_reg_30);
1610 
1611                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1612                     pi2_scratch += 8;
1613                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1614                     pi2_scratch += 8;
1615 
1616                 }
1617 
1618 
1619                 /* eo5[4-7] */
1620                 {
1621                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1622                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
1623 
1624                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1625 
1626                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
1627                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
1628 
1629                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
1630 
1631                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1632 
1633                     m_temp_reg_34 = _mm_add_epi32(temp6, m_temp_reg_30);
1634                     m_temp_reg_35 = _mm_sub_epi32(temp6, m_temp_reg_30);
1635 
1636                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1637                     pi2_scratch += 8;
1638                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1639                     pi2_scratch += 8;
1640 
1641                 }
1642 
1643                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25  -70
1644                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[25][0]); //90  -80
1645                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[26][0]); //43  9
1646                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[27][0]); //-57  87
1647 
1648                 /* eo6[0-3] */
1649                 {
1650 
1651                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1652                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1653 
1654                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1655 
1656                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
1657                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
1658 
1659                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
1660 
1661                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1662 
1663                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_30);
1664                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_30);
1665 
1666                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1667                     pi2_scratch += 8;
1668                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1669                     pi2_scratch += 8;
1670 
1671                 }
1672 
1673 
1674                 /* eo6[4-7] */
1675                 {
1676                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1677                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
1678 
1679                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1680 
1681                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
1682                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
1683 
1684                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
1685 
1686                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1687 
1688                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_30);
1689                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_30);
1690 
1691                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1692                     pi2_scratch += 8;
1693                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1694                     pi2_scratch += 8;
1695 
1696                 }
1697 
1698                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); //9  -25
1699                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[29][0]); //43  -57
1700                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[30][0]); //70  -80
1701                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[31][0]); //87  -90
1702 
1703                 /* eo7[0-3] */
1704                 {
1705 
1706                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1707                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
1708 
1709                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1710 
1711                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
1712                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
1713 
1714                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
1715 
1716                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1717 
1718                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_30);
1719                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_30);
1720 
1721                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1722                     pi2_scratch += 8;
1723                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1724                     pi2_scratch += 8;
1725 
1726                 }
1727 
1728 
1729                 /* eo7[4-7] */
1730                 {
1731 
1732                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1);
1733                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2);
1734 
1735                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
1736 
1737                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3);
1738                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4);
1739 
1740                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
1741 
1742                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
1743 
1744                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_30);
1745                     m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_30);
1746 
1747                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34);
1748                     pi2_scratch += 8;
1749                     _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35);
1750                     pi2_scratch += 8;
1751 
1752                 }
1753 
1754             }
1755 
1756         }
1757         /*  All e[] are done */
1758         /****************************/
1759 
1760         {
1761 
1762             WORD16 *pi2_tmp_src = pi2_src + src_strd;
1763 
1764             m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
1765             pi2_tmp_src += (src_strd << 1);
1766             m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
1767             pi2_tmp_src += (src_strd << 1);
1768             m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
1769             pi2_tmp_src += (src_strd << 1);
1770             m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
1771             pi2_tmp_src += (src_strd << 1);
1772             m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
1773             pi2_tmp_src += (src_strd << 1);
1774             m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
1775             pi2_tmp_src += (src_strd << 1);
1776             m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
1777             pi2_tmp_src += (src_strd << 1);
1778             m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
1779             pi2_tmp_src += (src_strd << 1);
1780 
1781             m_temp_reg_80 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
1782             pi2_tmp_src += (src_strd << 1);
1783             m_temp_reg_81 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
1784             pi2_tmp_src += (src_strd << 1);
1785             m_temp_reg_82 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
1786             pi2_tmp_src += (src_strd << 1);
1787             m_temp_reg_83 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
1788             pi2_tmp_src += (src_strd << 1);
1789             m_temp_reg_84 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
1790             pi2_tmp_src += (src_strd << 1);
1791             m_temp_reg_85 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
1792             pi2_tmp_src += (src_strd << 1);
1793             m_temp_reg_86 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
1794             pi2_tmp_src += (src_strd << 1);
1795             m_temp_reg_87 = _mm_loadu_si128((__m128i *)pi2_tmp_src);
1796         }
1797 
1798         if(zero_last28_rows_stg1)
1799         {
1800             /* o & stage 1 out */
1801             {
1802                 WORD32 j;
1803                 WORD16 *pi2_src_scratch = o_temp_ptr;
1804                 WORD16 *pi2_dst_scratch = temp_ptr;
1805                 WORD32 out_stride = (trans_size << 1);
1806                 WORD32 in_stride = trans_size;
1807 
1808                 for(j = 0; j < 2; j++)
1809                 {
1810                     if(j)
1811                     {
1812                         m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
1813                         m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
1814                     }
1815 
1816                     m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71);
1817 
1818                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
1819 
1820                     /* o0[0-3] */
1821                     {
1822                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1823 
1824                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
1825                         pi2_src_scratch += in_stride;
1826 
1827                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1828                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1829 
1830                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
1831                         m_count = _mm_cvtsi32_si128(i4_shift);
1832                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
1833                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
1834 
1835                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1836                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1837                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1838                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1839 
1840                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1841 
1842                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1843                         pi2_dst_scratch += out_stride;
1844 
1845                     }
1846 
1847                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
1848 
1849                     /* o1[0-3] */
1850                     {
1851 
1852                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1853 
1854                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
1855                         pi2_src_scratch += in_stride;
1856 
1857                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1858                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1859 
1860                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
1861                         m_count = _mm_cvtsi32_si128(i4_shift);
1862                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
1863                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
1864 
1865                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1866                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1867                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1868                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1869 
1870                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1871 
1872                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1873                         pi2_dst_scratch += out_stride;
1874 
1875                     }
1876 
1877                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
1878 
1879                     /* o2[0-3] */
1880                     {
1881 
1882                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1883 
1884                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
1885                         pi2_src_scratch += in_stride;
1886 
1887                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1888                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1889 
1890                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
1891                         m_count = _mm_cvtsi32_si128(i4_shift);
1892                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
1893                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
1894 
1895                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1896                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1897                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1898                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1899 
1900                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1901 
1902                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1903                         pi2_dst_scratch += out_stride;
1904 
1905                     }
1906 
1907                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
1908 
1909                     /* o3[0-3] */
1910                     {
1911                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1912 
1913                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
1914                         pi2_src_scratch += in_stride;
1915 
1916                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1917                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1918 
1919                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
1920                         m_count = _mm_cvtsi32_si128(i4_shift);
1921                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
1922                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
1923 
1924                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1925                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1926                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1927                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1928 
1929                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1930 
1931                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1932                         pi2_dst_scratch += out_stride;
1933 
1934                     }
1935 
1936                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
1937 
1938                     /* o4[0-3] */
1939                     {
1940                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1941 
1942                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
1943                         pi2_src_scratch += in_stride;
1944 
1945                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1946                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1947 
1948                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
1949                         m_count = _mm_cvtsi32_si128(i4_shift);
1950                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
1951                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
1952 
1953                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1954                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1955                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1956                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1957 
1958                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1959 
1960                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1961                         pi2_dst_scratch += out_stride;
1962 
1963                     }
1964 
1965                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
1966 
1967                     /* o5[0-3] */
1968                     {
1969 
1970                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
1971 
1972                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
1973                         pi2_src_scratch += in_stride;
1974 
1975                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
1976                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
1977 
1978                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
1979                         m_count = _mm_cvtsi32_si128(i4_shift);
1980                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
1981                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
1982 
1983                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
1984                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
1985                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
1986                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
1987 
1988                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
1989 
1990                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
1991                         pi2_dst_scratch += out_stride;
1992 
1993                     }
1994 
1995                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
1996 
1997                     /* o6[0-3] */
1998                     {
1999                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2000 
2001                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2002                         pi2_src_scratch += in_stride;
2003 
2004                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2005                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2006 
2007                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2008                         m_count = _mm_cvtsi32_si128(i4_shift);
2009                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2010                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2011 
2012                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2013                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2014                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2015                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2016 
2017                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2018 
2019                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2020                         pi2_dst_scratch += out_stride;
2021 
2022                     }
2023 
2024                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
2025 
2026                     /* o7[0-3] */
2027                     {
2028 
2029                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2030 
2031                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2032                         pi2_src_scratch += 8;
2033 
2034                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2035                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2036 
2037                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2038                         m_count = _mm_cvtsi32_si128(i4_shift);
2039                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2040                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2041 
2042                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2043                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2044                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2045                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2046 
2047                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2048 
2049                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2050                         pi2_dst_scratch += 8;
2051 
2052                     }
2053 
2054                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
2055 
2056                     /* o8[0-3] */
2057                     {
2058                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2059 
2060                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2061                         pi2_src_scratch -= in_stride;
2062 
2063                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2064                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2065 
2066                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2067                         m_count = _mm_cvtsi32_si128(i4_shift);
2068                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2069                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2070 
2071                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2072                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2073                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2074                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2075 
2076                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2077 
2078                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2079                         pi2_dst_scratch -= out_stride;
2080                     }
2081 
2082                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
2083 
2084                     /* o9[0-3] */
2085                     {
2086                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2087 
2088                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2089                         pi2_src_scratch -= in_stride;
2090 
2091                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2092                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2093 
2094                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2095                         m_count = _mm_cvtsi32_si128(i4_shift);
2096                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2097                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2098 
2099                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2100                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2101                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2102                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2103 
2104                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2105 
2106                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2107                         pi2_dst_scratch -= out_stride;
2108                     }
2109 
2110                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
2111 
2112                     /* o10[0-3] */
2113                     {
2114                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2115 
2116                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2117                         pi2_src_scratch -= in_stride;
2118 
2119                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2120                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2121 
2122                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2123                         m_count = _mm_cvtsi32_si128(i4_shift);
2124                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2125                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2126 
2127                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2128                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2129                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2130                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2131 
2132                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2133 
2134                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2135                         pi2_dst_scratch -= out_stride;
2136                     }
2137 
2138                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
2139 
2140                     /* o11[0-3] */
2141                     {
2142                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2143 
2144                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2145                         pi2_src_scratch -= in_stride;
2146 
2147                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2148                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2149 
2150                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2151                         m_count = _mm_cvtsi32_si128(i4_shift);
2152                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2153                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2154 
2155                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2156                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2157                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2158                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2159 
2160                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2161 
2162                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2163                         pi2_dst_scratch -= out_stride;
2164 
2165                     }
2166 
2167                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
2168 
2169                     /* o12[0-3] */
2170                     {
2171                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2172 
2173                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2174                         pi2_src_scratch -= in_stride;
2175 
2176                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2177                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2178 
2179                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2180                         m_count = _mm_cvtsi32_si128(i4_shift);
2181                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2182                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2183 
2184                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2185                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2186                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2187                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2188 
2189                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2190 
2191                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2192                         pi2_dst_scratch -= out_stride;
2193 
2194                     }
2195 
2196                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
2197 
2198                     /* o13[0-3] */
2199                     {
2200                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2201 
2202                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2203                         pi2_src_scratch -= in_stride;
2204 
2205                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2206                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2207 
2208                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2209                         m_count = _mm_cvtsi32_si128(i4_shift);
2210                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2211                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2212 
2213                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2214                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2215                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2216                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2217 
2218                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2219 
2220                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2221                         pi2_dst_scratch -= out_stride;
2222                     }
2223 
2224                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
2225 
2226                     /* o14[0-3] */
2227                     {
2228                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2229 
2230                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2231                         pi2_src_scratch -= in_stride;
2232 
2233                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2234                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2235 
2236                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2237                         m_count = _mm_cvtsi32_si128(i4_shift);
2238                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2239                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2240 
2241                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2242                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2243                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2244                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2245 
2246                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2247 
2248                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2249                         pi2_dst_scratch -= out_stride;
2250 
2251                     }
2252 
2253                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
2254 
2255                     /* o15[0-3] */
2256                     {
2257                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2258 
2259                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2260                         pi2_src_scratch += 8;
2261 
2262                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2263                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2264 
2265                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2266                         m_count = _mm_cvtsi32_si128(i4_shift);
2267                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2268                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2269 
2270                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2271                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2272                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2273                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2274 
2275                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2276 
2277                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2278                         pi2_dst_scratch += 8;
2279                     }
2280 
2281                 }
2282             }
2283         }
2284         else if(zero_last24_rows_stg1)
2285         {
2286             /* o & stage 1 out */
2287             {
2288                 WORD32 j;
2289 
2290                 WORD16 *pi2_src_scratch = o_temp_ptr;
2291                 WORD16 *pi2_dst_scratch = temp_ptr;
2292                 WORD32 out_stride = (trans_size << 1);
2293 
2294                 WORD32 in_stride = trans_size;
2295 
2296                 for(j = 0; j < 2; j++)
2297                 {
2298                     if(j)
2299                     {
2300                         m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8);
2301                         m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
2302                         m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8);
2303                         m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8);
2304                     }
2305 
2306                     m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved
2307                     m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 interleaved
2308 
2309                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
2310                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]);
2311 
2312                     /* o0[0-3] */
2313                     {
2314 
2315                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2316                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2317 
2318                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2319 
2320                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2321                         pi2_src_scratch += in_stride;
2322 
2323                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2324                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2325 
2326                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2327                         m_count = _mm_cvtsi32_si128(i4_shift);
2328                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2329                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2330 
2331                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2332                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2333                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2334                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2335 
2336                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2337 
2338                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2339                         pi2_dst_scratch += out_stride;
2340 
2341                     }
2342 
2343                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
2344                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]);
2345 
2346                     /* o1[0-3] */
2347                     {
2348                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2349                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2350 
2351                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2352 
2353                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2354                         pi2_src_scratch += in_stride;
2355 
2356                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2357                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2358 
2359                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2360                         m_count = _mm_cvtsi32_si128(i4_shift);
2361                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2362                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2363 
2364                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2365                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2366                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2367                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2368 
2369                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2370 
2371                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2372                         pi2_dst_scratch += out_stride;
2373 
2374                     }
2375 
2376                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
2377                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]);
2378 
2379                     /* o2[0-3] */
2380                     {
2381                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2382                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2383 
2384                         m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
2385 
2386                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2387                         pi2_src_scratch += in_stride;
2388 
2389                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2390                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2391 
2392                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2393                         m_count = _mm_cvtsi32_si128(i4_shift);
2394                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2395                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2396 
2397                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2398                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2399                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2400                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2401 
2402                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2403 
2404                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2405                         pi2_dst_scratch += out_stride;
2406 
2407                     }
2408 
2409                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
2410                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[25][0]);
2411 
2412                     /* o3[0-3] */
2413                     {
2414                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2415                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2416 
2417                         m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
2418 
2419                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2420                         pi2_src_scratch += in_stride;
2421 
2422                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2423                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2424 
2425                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2426                         m_count = _mm_cvtsi32_si128(i4_shift);
2427                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2428                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2429 
2430                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2431                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2432                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2433                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2434 
2435                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2436 
2437                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2438                         pi2_dst_scratch += out_stride;
2439 
2440                     }
2441 
2442                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
2443                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[33][0]);
2444 
2445                     /* o4[0-3] */
2446                     {
2447                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2448                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2449 
2450                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2451 
2452                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2453                         pi2_src_scratch += in_stride;
2454 
2455                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2456                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2457 
2458                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2459                         m_count = _mm_cvtsi32_si128(i4_shift);
2460                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2461                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2462 
2463                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2464                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2465                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2466                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2467 
2468                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2469 
2470                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2471                         pi2_dst_scratch += out_stride;
2472 
2473                     }
2474 
2475                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
2476                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[41][0]);
2477 
2478                     /* o5[0-3] */
2479                     {
2480                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2481                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2482 
2483                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2484 
2485                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2486                         pi2_src_scratch += in_stride;
2487 
2488                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2489                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2490 
2491                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2492                         m_count = _mm_cvtsi32_si128(i4_shift);
2493                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2494                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2495 
2496                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2497                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2498                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2499                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2500 
2501                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2502 
2503                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2504                         pi2_dst_scratch += out_stride;
2505 
2506                     }
2507 
2508                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
2509                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[49][0]);
2510 
2511                     /* o6[0-3] */
2512                     {
2513                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2514                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2515 
2516                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2517 
2518                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2519                         pi2_src_scratch += in_stride;
2520 
2521                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2522                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2523 
2524                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2525                         m_count = _mm_cvtsi32_si128(i4_shift);
2526                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2527                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2528 
2529                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2530                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2531                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2532                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2533 
2534                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2535 
2536                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2537                         pi2_dst_scratch += out_stride;
2538 
2539                     }
2540 
2541                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
2542                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[57][0]);
2543 
2544                     /* o7[0-3] */
2545                     {
2546                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2547                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2548 
2549                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2550 
2551                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2552                         pi2_src_scratch += 8;
2553 
2554                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2555                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2556 
2557                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2558                         m_count = _mm_cvtsi32_si128(i4_shift);
2559                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2560                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2561 
2562                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2563                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2564                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2565                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2566 
2567                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2568 
2569                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2570                         pi2_dst_scratch += 8;
2571 
2572                     }
2573 
2574                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
2575                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[65][0]);
2576 
2577                     /* o8[0-3] */
2578                     {
2579                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2580                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2581 
2582                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2583 
2584                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2585                         pi2_src_scratch -= in_stride;
2586 
2587                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2588                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2589 
2590                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2591                         m_count = _mm_cvtsi32_si128(i4_shift);
2592                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2593                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2594 
2595                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2596                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2597                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2598                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2599 
2600                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2601 
2602                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2603                         pi2_dst_scratch -= out_stride;
2604                     }
2605 
2606                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
2607                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[73][0]);
2608 
2609                     /* o9[0-3] */
2610                     {
2611                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2612                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2613 
2614                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2615 
2616                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2617                         pi2_src_scratch -= in_stride;
2618 
2619                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2620                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2621 
2622                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2623                         m_count = _mm_cvtsi32_si128(i4_shift);
2624                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2625                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2626 
2627                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2628                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2629                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2630                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2631 
2632                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2633 
2634                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2635                         pi2_dst_scratch -= out_stride;
2636                     }
2637 
2638                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
2639                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[81][0]);
2640 
2641                     /* o10[0-3] */
2642                     {
2643                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
2644                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2645 
2646                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2647 
2648                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
2649                         pi2_src_scratch -= in_stride;
2650 
2651                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
2652                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
2653 
2654                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
2655                         m_count = _mm_cvtsi32_si128(i4_shift);
2656                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
2657                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
2658 
2659                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2660                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2661                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
2662                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2663 
2664                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
2665 
2666                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
2667                         pi2_dst_scratch -= out_stride;
2668                     }
2669 
2670                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]); /* table row 88: multiplied with m_temp_reg_10 below */
2671                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[89][0]); /* table row 89: multiplied with m_temp_reg_11 below */
2672 
2673                     /* o11[0-3]: form four 32-bit o11 terms, combine with the values at pi2_src_scratch, round, shift, pack and store */
2674                     {
2675                         /* NOTE(review): m_temp_reg_10/11 are prepared earlier in the function, outside this block */
2676                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); /* signed 16-bit multiply, adjacent products summed into 4 x 32-bit */
2677                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2678 
2679                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); /* o11 = sum of the two partial products */
2680 
2681                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); /* unaligned 128-bit load, used below as 4 x 32-bit (presumably the even part - confirm) */
2682                         pi2_src_scratch -= in_stride; /* step the source pointer back by in_stride elements */
2683 
2684                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); /* loaded + o11 */
2685                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); /* loaded - o11 */
2686 
2687                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); /* rounding constant 2^(i4_shift - 1) in the lowest 32-bit lane */
2688                         m_count = _mm_cvtsi32_si128(i4_shift); /* shift count operand for _mm_sra_epi32 */
2689                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); /* replicate into lanes 0 and 1 */
2690                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); /* replicate into all four lanes */
2691 
2692                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2693                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2694                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); /* arithmetic right shift by i4_shift */
2695                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2696 
2697                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); /* saturate to 16-bit: difference in the low four lanes, sum in the high four */
2698 
2699                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); /* aligned store: pi2_dst_scratch is expected to be 16-byte aligned */
2700                         pi2_dst_scratch -= out_stride; /* step the destination pointer back by out_stride elements */
2701 
2702                     }
2703 
2704                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]); /* table row 96: multiplied with m_temp_reg_10 below */
2705                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[97][0]); /* table row 97: multiplied with m_temp_reg_11 below */
2706 
2707                     /* o12[0-3]: form four 32-bit o12 terms, combine with the values at pi2_src_scratch, round, shift, pack and store */
2708                     {
2709                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); /* signed 16-bit multiply, adjacent products summed into 4 x 32-bit */
2710                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2711 
2712                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); /* o12 = sum of the two partial products */
2713 
2714                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); /* unaligned 128-bit load, used below as 4 x 32-bit (presumably the even part - confirm) */
2715                         pi2_src_scratch -= in_stride; /* step the source pointer back by in_stride elements */
2716 
2717                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); /* loaded + o12 */
2718                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); /* loaded - o12 */
2719 
2720                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); /* rounding constant 2^(i4_shift - 1) in the lowest 32-bit lane */
2721                         m_count = _mm_cvtsi32_si128(i4_shift); /* shift count operand for _mm_sra_epi32 */
2722                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); /* replicate into lanes 0 and 1 */
2723                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); /* replicate into all four lanes */
2724 
2725                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2726                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2727                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); /* arithmetic right shift by i4_shift */
2728                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2729 
2730                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); /* saturate to 16-bit: difference in the low four lanes, sum in the high four */
2731 
2732                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); /* aligned store: pi2_dst_scratch is expected to be 16-byte aligned */
2733                         pi2_dst_scratch -= out_stride; /* step the destination pointer back by out_stride elements */
2734 
2735                     }
2736 
2737                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]); /* table row 104: multiplied with m_temp_reg_10 below */
2738                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[105][0]); /* table row 105: multiplied with m_temp_reg_11 below */
2739 
2740                     /* o13[0-3]: form four 32-bit o13 terms, combine with the values at pi2_src_scratch, round, shift, pack and store */
2741                     {
2742                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); /* signed 16-bit multiply, adjacent products summed into 4 x 32-bit */
2743                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2744 
2745                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); /* o13 = sum of the two partial products */
2746 
2747                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); /* unaligned 128-bit load, used below as 4 x 32-bit (presumably the even part - confirm) */
2748                         pi2_src_scratch -= in_stride; /* step the source pointer back by in_stride elements */
2749 
2750                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); /* loaded + o13 */
2751                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); /* loaded - o13 */
2752 
2753                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); /* rounding constant 2^(i4_shift - 1) in the lowest 32-bit lane */
2754                         m_count = _mm_cvtsi32_si128(i4_shift); /* shift count operand for _mm_sra_epi32 */
2755                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); /* replicate into lanes 0 and 1 */
2756                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); /* replicate into all four lanes */
2757 
2758                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2759                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2760                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); /* arithmetic right shift by i4_shift */
2761                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2762 
2763                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); /* saturate to 16-bit: difference in the low four lanes, sum in the high four */
2764 
2765                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); /* aligned store: pi2_dst_scratch is expected to be 16-byte aligned */
2766                         pi2_dst_scratch -= out_stride; /* step the destination pointer back by out_stride elements */
2767                     }
2768 
2769                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]); /* table row 112: multiplied with m_temp_reg_10 below */
2770                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[113][0]); /* table row 113: multiplied with m_temp_reg_11 below */
2771 
2772                     /* o14[0-3]: form four 32-bit o14 terms, combine with the values at pi2_src_scratch, round, shift, pack and store */
2773                     {
2774                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); /* signed 16-bit multiply, adjacent products summed into 4 x 32-bit */
2775                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2776 
2777                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); /* o14 = sum of the two partial products */
2778 
2779                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); /* unaligned 128-bit load, used below as 4 x 32-bit (presumably the even part - confirm) */
2780                         pi2_src_scratch -= in_stride; /* step the source pointer back by in_stride elements */
2781 
2782                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); /* loaded + o14 */
2783                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); /* loaded - o14 */
2784 
2785                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); /* rounding constant 2^(i4_shift - 1) in the lowest 32-bit lane */
2786                         m_count = _mm_cvtsi32_si128(i4_shift); /* shift count operand for _mm_sra_epi32 */
2787                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); /* replicate into lanes 0 and 1 */
2788                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); /* replicate into all four lanes */
2789 
2790                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2791                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2792                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); /* arithmetic right shift by i4_shift */
2793                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2794 
2795                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); /* saturate to 16-bit: difference in the low four lanes, sum in the high four */
2796 
2797                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); /* aligned store: pi2_dst_scratch is expected to be 16-byte aligned */
2798                         pi2_dst_scratch -= out_stride; /* step the destination pointer back by out_stride elements */
2799 
2800                     }
2801 
2802                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]); /* table row 120: multiplied with m_temp_reg_10 below */
2803                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[121][0]); /* table row 121: multiplied with m_temp_reg_11 below */
2804 
2805                     /* o15[0-3]: form four 32-bit o15 terms, combine with the values at pi2_src_scratch, round, shift, pack and store */
2806                     {
2807                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); /* signed 16-bit multiply, adjacent products summed into 4 x 32-bit */
2808                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2809 
2810                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); /* o15 = sum of the two partial products */
2811 
2812                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); /* unaligned 128-bit load, used below as 4 x 32-bit (presumably the even part - confirm) */
2813                         pi2_src_scratch += 8; /* advances by 8 elements instead of stepping back by in_stride; NOTE(review): presumably positions the pointer for the next pass of the enclosing loop - confirm */
2814 
2815                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); /* loaded + o15 */
2816                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); /* loaded - o15 */
2817 
2818                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); /* rounding constant 2^(i4_shift - 1) in the lowest 32-bit lane */
2819                         m_count = _mm_cvtsi32_si128(i4_shift); /* shift count operand for _mm_sra_epi32 */
2820                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); /* replicate into lanes 0 and 1 */
2821                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); /* replicate into all four lanes */
2822 
2823                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2824                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2825                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); /* arithmetic right shift by i4_shift */
2826                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2827 
2828                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); /* saturate to 16-bit: difference in the low four lanes, sum in the high four */
2829 
2830                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); /* aligned store: pi2_dst_scratch is expected to be 16-byte aligned */
2831                         pi2_dst_scratch += 8; /* advances by 8 elements instead of stepping back by out_stride */
2832                     }
2833 
2834                 }
2835             }
2836         }
2837         else
2838         {
2839             /* o & stage 1 out */
2840             {
2841                 WORD32 j;
2842 
2843                 WORD16 *pi2_src_scratch = o_temp_ptr;
2844                 WORD16 *pi2_dst_scratch = temp_ptr;
2845                 WORD32 out_stride = (trans_size << 1);
2846 
2847                 WORD32 in_stride = trans_size;
2848 
2849 
2850                 for(j = 0; j < 2; j++)
2851                 {
2852                     if(j) /* second pass only: move the upper 64 bits of each register into the lower 64 bits (upper half becomes zero) so the _mm_unpacklo_epi16 calls after this block read the other four 16-bit values */
2853                     {
2854                         m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8); /* byte-wise right shift of the whole 128-bit register by 8 bytes */
2855                         m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8);
2856                         m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8);
2857                         m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8);
2858                         m_temp_reg_74 = _mm_srli_si128(m_temp_reg_74, 8);
2859                         m_temp_reg_75 = _mm_srli_si128(m_temp_reg_75, 8);
2860                         m_temp_reg_76 = _mm_srli_si128(m_temp_reg_76, 8);
2861                         m_temp_reg_77 = _mm_srli_si128(m_temp_reg_77, 8);
2862                         /* same shift for the second group of registers (m_temp_reg_80..87) */
2863                         m_temp_reg_80 = _mm_srli_si128(m_temp_reg_80, 8);
2864                         m_temp_reg_81 = _mm_srli_si128(m_temp_reg_81, 8);
2865                         m_temp_reg_82 = _mm_srli_si128(m_temp_reg_82, 8);
2866                         m_temp_reg_83 = _mm_srli_si128(m_temp_reg_83, 8);
2867                         m_temp_reg_84 = _mm_srli_si128(m_temp_reg_84, 8);
2868                         m_temp_reg_85 = _mm_srli_si128(m_temp_reg_85, 8);
2869                         m_temp_reg_86 = _mm_srli_si128(m_temp_reg_86, 8);
2870                         m_temp_reg_87 = _mm_srli_si128(m_temp_reg_87, 8);
2871                     }
2872 
2873                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]); /* table rows 0-7: one coefficient vector per interleaved row pair, used for o0 */
2874                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]);
2875                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[2][0]);
2876                     m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[3][0]);
2877                     m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[4][0]);
2878                     m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[5][0]);
2879                     m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[6][0]);
2880                     m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[7][0]);
2881                     /* interleave the low four 16-bit values of each register pair so _mm_madd_epi16 can multiply-accumulate two rows at once */
2882                     m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved
2883                     m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 interleaved
2884                     m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 interleaved
2885                     m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 interleaved
2886                     temp1 = _mm_unpacklo_epi16(m_temp_reg_80, m_temp_reg_81); //row 17 and row 19 interleaved
2887                     temp2 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_83); //row 21 and row 23 interleaved
2888                     temp3 = _mm_unpacklo_epi16(m_temp_reg_84, m_temp_reg_85); //row 25 and row 27 interleaved
2889                     temp4 = _mm_unpacklo_epi16(m_temp_reg_86, m_temp_reg_87); //row 29 and row 31 interleaved
2890 
2891 
2892                     /* o0[0-3]: accumulate four 32-bit o0 terms over all eight row pairs, combine with the values at pi2_src_scratch, round, shift, pack and store */
2893                     {
2894                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); /* signed 16-bit multiply, adjacent products summed into 4 x 32-bit */
2895                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2896                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
2897                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
2898 
2899                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2900                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
2901 
2902                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); /* contribution of rows 1-15 */
2903 
2904                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
2905                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
2906                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
2907                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
2908 
2909                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
2910                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
2911 
2912                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); /* contribution of rows 17-31 */
2913 
2914                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); /* o0 = both contributions added */
2915 
2916                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); /* unaligned 128-bit load, used below as 4 x 32-bit (presumably the even part - confirm) */
2917                         pi2_src_scratch += in_stride; /* advance the source pointer by in_stride elements */
2918 
2919                         m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); /* loaded - o0 */
2920                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); /* loaded + o0 */
2921 
2922                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); /* rounding constant 2^(i4_shift - 1) in the lowest 32-bit lane */
2923                         m_count = _mm_cvtsi32_si128(i4_shift); /* shift count operand for _mm_sra_epi32 */
2924                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); /* replicate into lanes 0 and 1 */
2925                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); /* replicate into all four lanes */
2926 
2927                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2928                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2929                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); /* arithmetic right shift by i4_shift */
2930                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2931 
2932                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); /* saturate to 16-bit: sum in the low four lanes, difference in the high four */
2933 
2934                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); /* aligned store: pi2_dst_scratch is expected to be 16-byte aligned */
2935                         pi2_dst_scratch += out_stride; /* advance the destination pointer by out_stride elements */
2936 
2937                     }
2938 
2939                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]); /* table rows 8-15: one coefficient vector per interleaved row pair, used for o1 */
2940                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]);
2941                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[10][0]);
2942                     m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[11][0]);
2943                     m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[12][0]);
2944                     m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[13][0]);
2945                     m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[14][0]);
2946                     m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[15][0]);
2947 
2948 
2949                     /* o1[0-3]: accumulate four 32-bit o1 terms over the interleaved row pairs, combine with the values at pi2_src_scratch, round, shift, pack and store */
2950                     {
2951                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); /* signed 16-bit multiply, adjacent products summed into 4 x 32-bit */
2952                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
2953                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
2954                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
2955 
2956                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
2957                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
2958                         /* NOTE(review): the add/subtract pattern differs between oN blocks; presumably it matches how signs are stored in the packed table - confirm */
2959                         m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_20); /* (m_temp_reg_12/13 products) minus (m_temp_reg_10/11 products) */
2960 
2961                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
2962                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
2963                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
2964                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
2965 
2966                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
2967                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
2968 
2969                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); /* temp1..temp4 products all added */
2970 
2971                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); /* o1 = both contributions added */
2972 
2973                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); /* unaligned 128-bit load, used below as 4 x 32-bit (presumably the even part - confirm) */
2974                         pi2_src_scratch += in_stride; /* advance the source pointer by in_stride elements */
2975 
2976                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); /* loaded + o1 */
2977                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); /* loaded - o1 */
2978 
2979                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); /* rounding constant 2^(i4_shift - 1) in the lowest 32-bit lane */
2980                         m_count = _mm_cvtsi32_si128(i4_shift); /* shift count operand for _mm_sra_epi32 */
2981                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); /* replicate into lanes 0 and 1 */
2982                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); /* replicate into all four lanes */
2983 
2984                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
2985                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
2986                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); /* arithmetic right shift by i4_shift */
2987                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
2988 
2989                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); /* saturate to 16-bit: difference in the low four lanes, sum in the high four */
2990 
2991                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); /* aligned store: pi2_dst_scratch is expected to be 16-byte aligned */
2992                         pi2_dst_scratch += out_stride; /* advance the destination pointer by out_stride elements */
2993 
2994                     }
2995 
2996                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]); /* table rows 16-23: one coefficient vector per interleaved row pair, used for o2 */
2997                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]);
2998                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[18][0]);
2999                     m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[19][0]);
3000                     m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[20][0]);
3001                     m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[21][0]);
3002                     m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[22][0]);
3003                     m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[23][0]);
3004 
3005                     /* o2[0-3]: accumulate four 32-bit o2 terms over the interleaved row pairs, combine with the values at pi2_src_scratch, round, shift, pack and store */
3006                     {
3007                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); /* signed 16-bit multiply, adjacent products summed into 4 x 32-bit */
3008                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3009                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3010                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3011                         /* NOTE(review): the add/subtract pattern differs between oN blocks; presumably it matches how signs are stored in the packed table - confirm */
3012                         m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20); /* m_temp_reg_11 product minus m_temp_reg_10 product */
3013                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3014 
3015                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3016 
3017                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3018                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3019                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3020                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3021 
3022                         m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_41); /* temp1 product minus temp2 product */
3023                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3024 
3025                         m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_42); /* then minus the temp3 + temp4 products */
3026 
3027                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); /* o2 = both contributions added */
3028 
3029                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); /* unaligned 128-bit load, used below as 4 x 32-bit (presumably the even part - confirm) */
3030                         pi2_src_scratch += in_stride; /* advance the source pointer by in_stride elements */
3031 
3032                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); /* loaded + o2 */
3033                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); /* loaded - o2 */
3034 
3035                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); /* rounding constant 2^(i4_shift - 1) in the lowest 32-bit lane */
3036                         m_count = _mm_cvtsi32_si128(i4_shift); /* shift count operand for _mm_sra_epi32 */
3037                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); /* replicate into lanes 0 and 1 */
3038                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); /* replicate into all four lanes */
3039 
3040                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3041                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3042                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); /* arithmetic right shift by i4_shift */
3043                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3044 
3045                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); /* saturate to 16-bit: difference in the low four lanes, sum in the high four */
3046 
3047                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); /* aligned store: pi2_dst_scratch is expected to be 16-byte aligned */
3048                         pi2_dst_scratch += out_stride; /* advance the destination pointer by out_stride elements */
3049 
3050                     }
3051 
3052 
3053                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]); /* table rows 24-31: one coefficient vector per interleaved row pair, used for o3 */
3054                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[25][0]);
3055                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[26][0]);
3056                     m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[27][0]);
3057                     m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[28][0]);
3058                     m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[29][0]);
3059                     m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[30][0]);
3060                     m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[31][0]);
3061 
3062                     /* o3[0-3]: accumulate four 32-bit o3 terms over the interleaved row pairs, combine with the values at pi2_src_scratch, round, shift, pack and store */
3063                     {
3064                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); /* signed 16-bit multiply, adjacent products summed into 4 x 32-bit */
3065                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3066                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3067                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3068                         /* NOTE(review): the add/subtract pattern differs between oN blocks; presumably it matches how signs are stored in the packed table - confirm */
3069                         m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20); /* m_temp_reg_11 product minus m_temp_reg_10 product */
3070                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3071 
3072                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3073 
3074                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3075                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3076                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3077                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3078 
3079                         m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_40); /* temp2 product minus temp1 product */
3080                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3081 
3082                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3083 
3084                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); /* o3 = both contributions added */
3085 
3086                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); /* unaligned 128-bit load, used below as 4 x 32-bit (presumably the even part - confirm) */
3087                         pi2_src_scratch += in_stride; /* advance the source pointer by in_stride elements */
3088 
3089                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); /* loaded + o3 */
3090                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); /* loaded - o3 */
3091 
3092                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); /* rounding constant 2^(i4_shift - 1) in the lowest 32-bit lane */
3093                         m_count = _mm_cvtsi32_si128(i4_shift); /* shift count operand for _mm_sra_epi32 */
3094                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); /* replicate into lanes 0 and 1 */
3095                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); /* replicate into all four lanes */
3096 
3097                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3098                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3099                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); /* arithmetic right shift by i4_shift */
3100                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3101 
3102                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); /* saturate to 16-bit: difference in the low four lanes, sum in the high four */
3103 
3104                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); /* aligned store: pi2_dst_scratch is expected to be 16-byte aligned */
3105                         pi2_dst_scratch += out_stride; /* advance the destination pointer by out_stride elements */
3106 
3107                     }
3108 
3109                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]); /* table rows 32-39: one coefficient vector per interleaved row pair, used for o4 */
3110                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[33][0]);
3111                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[34][0]);
3112                     m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[35][0]);
3113                     m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[36][0]);
3114                     m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[37][0]);
3115                     m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[38][0]);
3116                     m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[39][0]);
3117 
3118                     /* o4[0-3]: accumulate four 32-bit o4 terms over the interleaved row pairs, combine with the values at pi2_src_scratch, round, shift, pack and store */
3119                     {
3120                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); /* signed 16-bit multiply, adjacent products summed into 4 x 32-bit */
3121                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3122                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3123                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3124 
3125                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3126                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3127 
3128                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); /* m_temp_reg_10..13 products all added */
3129 
3130                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3131                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3132                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3133                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3134 
3135                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
3136                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3137 
3138                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); /* temp1..temp4 products all added */
3139 
3140                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); /* o4 = both contributions added */
3141 
3142                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); /* unaligned 128-bit load, used below as 4 x 32-bit (presumably the even part - confirm) */
3143                         pi2_src_scratch += in_stride; /* advance the source pointer by in_stride elements */
3144 
3145                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); /* loaded + o4 */
3146                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); /* loaded - o4 */
3147 
3148                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); /* rounding constant 2^(i4_shift - 1) in the lowest 32-bit lane */
3149                         m_count = _mm_cvtsi32_si128(i4_shift); /* shift count operand for _mm_sra_epi32 */
3150                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); /* replicate into lanes 0 and 1 */
3151                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); /* replicate into all four lanes */
3152 
3153                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3154                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3155                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); /* arithmetic right shift by i4_shift */
3156                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3157 
3158                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); /* saturate to 16-bit: difference in the low four lanes, sum in the high four */
3159 
3160                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); /* aligned store: pi2_dst_scratch is expected to be 16-byte aligned */
3161                         pi2_dst_scratch += out_stride; /* advance the destination pointer by out_stride elements */
3162 
3163                     }
3164 
3165 
3166                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]); /* table rows 40-47: one coefficient vector per interleaved row pair, used for o5 */
3167                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[41][0]);
3168                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[42][0]);
3169                     m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[43][0]);
3170                     m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[44][0]);
3171                     m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[45][0]);
3172                     m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[46][0]);
3173                     m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[47][0]);
3174 
3175                     /* o5[0-3]: accumulate four 32-bit o5 terms over the interleaved row pairs, combine with the values at pi2_src_scratch, round, shift, pack and store */
3176                     {
3177                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); /* signed 16-bit multiply, adjacent products summed into 4 x 32-bit */
3178                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3179                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3180                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3181 
3182                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3183                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3184 
3185                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); /* m_temp_reg_10..13 products all added */
3186 
3187                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3188                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3189                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3190                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3191 
3192                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
3193                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3194 
3195                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); /* temp1..temp4 products all added */
3196 
3197                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); /* o5 = both contributions added */
3198 
3199                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); /* unaligned 128-bit load, used below as 4 x 32-bit (presumably the even part - confirm) */
3200                         pi2_src_scratch += in_stride; /* advance the source pointer by in_stride elements */
3201 
3202                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); /* loaded + o5 */
3203                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); /* loaded - o5 */
3204 
3205                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); /* rounding constant 2^(i4_shift - 1) in the lowest 32-bit lane */
3206                         m_count = _mm_cvtsi32_si128(i4_shift); /* shift count operand for _mm_sra_epi32 */
3207                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); /* replicate into lanes 0 and 1 */
3208                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); /* replicate into all four lanes */
3209 
3210                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3211                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3212                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); /* arithmetic right shift by i4_shift */
3213                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3214 
3215                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); /* saturate to 16-bit: difference in the low four lanes, sum in the high four */
3216 
3217                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); /* aligned store: pi2_dst_scratch is expected to be 16-byte aligned */
3218                         pi2_dst_scratch += out_stride; /* advance the destination pointer by out_stride elements */
3219 
3220                     }
3221 
3222                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
3223                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[49][0]);
3224                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[50][0]);
3225                     m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[51][0]);
3226                     m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[52][0]);
3227                     m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[53][0]);
3228                     m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[54][0]);
3229                     m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[55][0]);
3230 
3231 
3232                     /* o6[0-3] */
3233                     {
3234                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3235                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3236                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3237                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3238 
3239                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3240                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3241 
3242                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3243 
3244                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3245                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3246                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3247                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3248 
3249                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
3250                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3251 
3252                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3253 
3254                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3255 
3256                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
3257                         pi2_src_scratch += in_stride;
3258 
3259                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3260                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3261 
3262                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3263                         m_count = _mm_cvtsi32_si128(i4_shift);
3264                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3265                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3266 
3267                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3268                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3269                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3270                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3271 
3272                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3273 
3274                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3275                         pi2_dst_scratch += out_stride;
3276 
3277                     }
3278 
3279                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
3280                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[57][0]);
3281                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[58][0]);
3282                     m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[59][0]);
3283                     m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[60][0]);
3284                     m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[61][0]);
3285                     m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[62][0]);
3286                     m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[63][0]);
3287 
3288                     /* o7[0-3] */
3289                     {
3290                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3291                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3292                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3293                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3294 
3295                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3296                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3297 
3298                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3299 
3300                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3301                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3302                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3303                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3304 
3305                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
3306                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3307 
3308                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3309 
3310                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3311 
3312                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
3313                         pi2_src_scratch += 8;
3314 
3315                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3316                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3317 
3318                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3319                         m_count = _mm_cvtsi32_si128(i4_shift);
3320                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3321                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3322 
3323                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3324                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3325                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3326                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3327 
3328                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3329 
3330                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3331                         pi2_dst_scratch += 8;
3332 
3333                     }
3334 
3335                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
3336                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[65][0]);
3337                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[66][0]);
3338                     m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[67][0]);
3339                     m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[68][0]);
3340                     m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[69][0]);
3341                     m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[70][0]);
3342                     m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[71][0]);
3343 
3344 
3345                     /* o8[0-3] */
3346                     {
3347 
3348                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3349                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3350                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3351                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3352 
3353                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3354                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3355 
3356                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3357 
3358                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3359                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3360                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3361                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3362 
3363                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
3364                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3365 
3366                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3367 
3368                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3369 
3370                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
3371                         pi2_src_scratch -= in_stride;
3372 
3373                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3374                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3375 
3376                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3377                         m_count = _mm_cvtsi32_si128(i4_shift);
3378                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3379                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3380 
3381                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3382                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3383                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3384                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3385 
3386                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3387 
3388                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3389                         pi2_dst_scratch -= out_stride;
3390                     }
3391 
3392                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
3393                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[73][0]);
3394                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[74][0]);
3395                     m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[75][0]);
3396                     m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[76][0]);
3397                     m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[77][0]);
3398                     m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[78][0]);
3399                     m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[79][0]);
3400 
3401 
3402                     /* o9[0-3] */
3403                     {
3404                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3405                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3406                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3407                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3408 
3409                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3410                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3411 
3412                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3413 
3414                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3415                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3416                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3417                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3418 
3419                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
3420                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3421 
3422                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3423 
3424                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3425 
3426                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
3427                         pi2_src_scratch -= in_stride;
3428 
3429                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3430                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3431 
3432                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3433                         m_count = _mm_cvtsi32_si128(i4_shift);
3434                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3435                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3436 
3437                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3438                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3439                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3440                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3441 
3442                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3443 
3444                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3445                         pi2_dst_scratch -= out_stride;
3446                     }
3447 
3448                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
3449                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[81][0]);
3450                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[82][0]);
3451                     m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[83][0]);
3452                     m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[84][0]);
3453                     m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[85][0]);
3454                     m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[86][0]);
3455                     m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[87][0]);
3456 
3457                     /* o10[0-3] */
3458                     {
3459                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3460                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3461                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3462                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3463 
3464                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3465                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3466 
3467                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3468 
3469                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3470                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3471                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3472                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3473 
3474                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
3475                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3476 
3477                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3478 
3479                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3480 
3481                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
3482                         pi2_src_scratch -= in_stride;
3483 
3484                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3485                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3486 
3487                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3488                         m_count = _mm_cvtsi32_si128(i4_shift);
3489                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3490                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3491 
3492                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3493                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3494                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3495                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3496 
3497                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3498 
3499                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3500                         pi2_dst_scratch -= out_stride;
3501                     }
3502 
3503                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
3504                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[89][0]);
3505                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[90][0]);
3506                     m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[91][0]);
3507                     m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[92][0]);
3508                     m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[93][0]);
3509                     m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[94][0]);
3510                     m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[95][0]);
3511 
3512                     /* o11[0-3] */
3513                     {
3514                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3515                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3516                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3517                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3518 
3519                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3520                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3521 
3522                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3523 
3524                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3525                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3526                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3527                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3528 
3529                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
3530                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3531 
3532                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3533 
3534                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3535 
3536                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
3537                         pi2_src_scratch -= in_stride;
3538 
3539                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3540                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3541 
3542                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3543                         m_count = _mm_cvtsi32_si128(i4_shift);
3544                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3545                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3546 
3547                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3548                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3549                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3550                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3551 
3552                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3553 
3554                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3555                         pi2_dst_scratch -= out_stride;
3556 
3557                     }
3558 
3559                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
3560                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[97][0]);
3561                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[98][0]);
3562                     m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[99][0]);
3563                     m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[100][0]);
3564                     m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[101][0]);
3565                     m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[102][0]);
3566                     m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[103][0]);
3567 
3568 
3569                     /* o12[0-3] */
3570                     {
3571                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3572                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3573                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3574                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3575 
3576                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3577                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3578 
3579                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3580 
3581                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3582                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3583                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3584                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3585 
3586                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
3587                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3588 
3589                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3590 
3591                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3592 
3593                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
3594                         pi2_src_scratch -= in_stride;
3595 
3596                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3597                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3598 
3599                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3600                         m_count = _mm_cvtsi32_si128(i4_shift);
3601                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3602                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3603 
3604                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3605                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3606                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3607                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3608 
3609                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3610 
3611                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3612                         pi2_dst_scratch -= out_stride;
3613 
3614                     }
3615 
3616                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
3617                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[105][0]);
3618                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[106][0]);
3619                     m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[107][0]);
3620                     m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[108][0]);
3621                     m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[109][0]);
3622                     m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[110][0]);
3623                     m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[111][0]);
3624 
3625 
3626                     /* o13[0-3] */
3627                     {
3628                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3629                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3630                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3631                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3632 
3633                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3634                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3635 
3636                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3637 
3638                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3639                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3640                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3641                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3642 
3643                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
3644                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3645 
3646                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3647 
3648                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3649 
3650                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
3651                         pi2_src_scratch -= in_stride;
3652 
3653                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3654                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3655 
3656                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3657                         m_count = _mm_cvtsi32_si128(i4_shift);
3658                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3659                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3660 
3661                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3662                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3663                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3664                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3665 
3666                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3667 
3668                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3669                         pi2_dst_scratch -= out_stride;
3670                     }
3671 
3672                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
3673                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[113][0]);
3674                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[114][0]);
3675                     m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[115][0]);
3676                     m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[116][0]);
3677                     m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[117][0]);
3678                     m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[118][0]);
3679                     m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[119][0]);
3680 
3681 
3682                     /* o14[0-3] */
3683                     {
3684                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3685                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3686                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3687                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3688 
3689                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3690                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3691 
3692                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3693 
3694                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3695                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3696                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3697                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3698 
3699                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
3700                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3701 
3702                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3703 
3704                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3705 
3706                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
3707                         pi2_src_scratch -= in_stride;
3708 
3709                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3710                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3711 
3712                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3713                         m_count = _mm_cvtsi32_si128(i4_shift);
3714                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3715                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3716 
3717                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3718                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3719                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3720                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3721 
3722                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3723 
3724                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3725                         pi2_dst_scratch -= out_stride;
3726 
3727                     }
3728 
3729                     m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
3730                     m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[121][0]);
3731                     m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[122][0]);
3732                     m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[123][0]);
3733                     m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[124][0]);
3734                     m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[125][0]);
3735                     m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[126][0]);
3736                     m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[127][0]);
3737 
3738                     /* o15[0-3] */
3739                     {
3740                         m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
3741                         m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
3742                         m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
3743                         m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
3744 
3745                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
3746                         m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
3747 
3748                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
3749 
3750                         m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5);
3751                         m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6);
3752                         m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7);
3753                         m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8);
3754 
3755                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
3756                         m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
3757 
3758                         m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
3759 
3760                         m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
3761 
3762                         m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch);
3763                         pi2_src_scratch += 8;
3764 
3765                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20);
3766                         m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20);
3767 
3768                         m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
3769                         m_count = _mm_cvtsi32_si128(i4_shift);
3770                         m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
3771                         m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
3772 
3773                         m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
3774                         m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
3775                         m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
3776                         m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
3777 
3778                         m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
3779 
3780                         _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
3781                         pi2_dst_scratch += 8;
3782                     }
3783 
3784                 }
3785             }
3786         }
3787         /* Transpose */
3788         {
3789             WORD16 *pi2_src_scratch = temp_ptr;
3790             WORD16 *pi2_dst_scratch = pi2_tmp;
3791             WORD32 in_stride = (trans_size << 1);
3792 
3793             for(j = 0; j < 2; j++)
3794             {
3795                 m_temp_reg_30 =  _mm_load_si128((__m128i *)pi2_src_scratch);
3796                 pi2_src_scratch += in_stride;
3797                 m_temp_reg_31 = _mm_load_si128((__m128i *)pi2_src_scratch);
3798                 pi2_src_scratch += in_stride;
3799                 m_temp_reg_32 = _mm_load_si128((__m128i *)pi2_src_scratch);
3800                 pi2_src_scratch += in_stride;
3801                 m_temp_reg_33 = _mm_load_si128((__m128i *)pi2_src_scratch);
3802                 pi2_src_scratch += in_stride;
3803                 m_temp_reg_34 = _mm_load_si128((__m128i *)pi2_src_scratch);
3804                 pi2_src_scratch += in_stride;
3805                 m_temp_reg_35 = _mm_load_si128((__m128i *)pi2_src_scratch);
3806                 pi2_src_scratch += in_stride;
3807                 m_temp_reg_36 = _mm_load_si128((__m128i *)pi2_src_scratch);
3808                 pi2_src_scratch += in_stride;
3809                 m_temp_reg_37 = _mm_load_si128((__m128i *)pi2_src_scratch);
3810                 pi2_src_scratch += 8;
3811 
3812                 m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_src_scratch);
3813                 pi2_src_scratch -= in_stride;
3814                 m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_src_scratch);
3815                 pi2_src_scratch -= in_stride;
3816                 m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_src_scratch);
3817                 pi2_src_scratch -= in_stride;
3818                 m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_src_scratch);
3819                 pi2_src_scratch -= in_stride;
3820                 m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_src_scratch);
3821                 pi2_src_scratch -= in_stride;
3822                 m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_src_scratch);
3823                 pi2_src_scratch -= in_stride;
3824                 m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_src_scratch);
3825                 pi2_src_scratch -= in_stride;
3826                 m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_src_scratch);
3827                 pi2_src_scratch += 8;
3828 
3829 
3830                 m_temp_reg_40 = _mm_unpacklo_epi16(m_temp_reg_30, m_temp_reg_31);
3831                 m_temp_reg_41 = _mm_unpackhi_epi16(m_temp_reg_31, m_temp_reg_30);
3832 
3833                 m_temp_reg_42 = _mm_unpacklo_epi16(m_temp_reg_32, m_temp_reg_33);
3834                 m_temp_reg_43 = _mm_unpackhi_epi16(m_temp_reg_33, m_temp_reg_32);
3835 
3836                 m_temp_reg_44 = _mm_unpacklo_epi16(m_temp_reg_34, m_temp_reg_35);
3837                 m_temp_reg_45 = _mm_unpackhi_epi16(m_temp_reg_35, m_temp_reg_34);
3838 
3839                 m_temp_reg_46 = _mm_unpacklo_epi16(m_temp_reg_36, m_temp_reg_37);
3840                 m_temp_reg_47 = _mm_unpackhi_epi16(m_temp_reg_37, m_temp_reg_36);
3841 
3842                 m_temp_reg_80 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71);
3843                 m_temp_reg_81 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_70);
3844 
3845                 m_temp_reg_82 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73);
3846                 m_temp_reg_83 = _mm_unpackhi_epi16(m_temp_reg_73, m_temp_reg_72);
3847 
3848                 m_temp_reg_84 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75);
3849                 m_temp_reg_85 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_74);
3850 
3851                 m_temp_reg_86 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77);
3852                 m_temp_reg_87 = _mm_unpackhi_epi16(m_temp_reg_77, m_temp_reg_76);
3853 
3854                 /****************/
3855 
3856                 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_40, m_temp_reg_42);
3857                 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_40, m_temp_reg_42);
3858 
3859                 m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_44, m_temp_reg_46);
3860                 m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_44, m_temp_reg_46);
3861 
3862                 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_80, m_temp_reg_82);
3863                 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_80, m_temp_reg_82);
3864 
3865                 m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_84, m_temp_reg_86);
3866                 m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_84, m_temp_reg_86);
3867 
3868                 m_temp_reg_90 = _mm_unpacklo_epi32(m_temp_reg_43, m_temp_reg_41);
3869                 m_temp_reg_91 = _mm_unpackhi_epi32(m_temp_reg_43, m_temp_reg_41);
3870 
3871                 m_temp_reg_92 = _mm_unpacklo_epi32(m_temp_reg_47, m_temp_reg_45);
3872                 m_temp_reg_93 = _mm_unpackhi_epi32(m_temp_reg_47, m_temp_reg_45);
3873 
3874                 m_temp_reg_94 = _mm_unpacklo_epi32(m_temp_reg_83, m_temp_reg_81);
3875                 m_temp_reg_95 = _mm_unpackhi_epi32(m_temp_reg_83, m_temp_reg_81);
3876 
3877                 m_temp_reg_96 = _mm_unpacklo_epi32(m_temp_reg_87, m_temp_reg_85);
3878                 m_temp_reg_97 = _mm_unpackhi_epi32(m_temp_reg_87, m_temp_reg_85);
3879 
3880                 /******************/
3881 
3882                 m_temp_reg_30 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_2);
3883                 m_temp_reg_31 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_2);
3884 
3885                 m_temp_reg_32 = _mm_unpacklo_epi64(m_temp_reg_92, m_temp_reg_90);
3886                 m_temp_reg_33 = _mm_unpackhi_epi64(m_temp_reg_92, m_temp_reg_90);
3887 
3888                 m_temp_reg_34 = _mm_unpacklo_epi64(m_temp_reg_4, m_temp_reg_6);
3889                 m_temp_reg_35 = _mm_unpackhi_epi64(m_temp_reg_4, m_temp_reg_6);
3890 
3891                 m_temp_reg_36 = _mm_unpacklo_epi64(m_temp_reg_96, m_temp_reg_94);
3892                 m_temp_reg_37 = _mm_unpackhi_epi64(m_temp_reg_96, m_temp_reg_94);
3893 
3894                 m_temp_reg_80 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_3);
3895                 m_temp_reg_81 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_3);
3896 
3897                 m_temp_reg_82 = _mm_unpacklo_epi64(m_temp_reg_93, m_temp_reg_91);
3898                 m_temp_reg_83 = _mm_unpackhi_epi64(m_temp_reg_93, m_temp_reg_91);
3899 
3900                 m_temp_reg_84 = _mm_unpacklo_epi64(m_temp_reg_5, m_temp_reg_7);
3901                 m_temp_reg_85 = _mm_unpackhi_epi64(m_temp_reg_5, m_temp_reg_7);
3902 
3903                 m_temp_reg_86 = _mm_unpacklo_epi64(m_temp_reg_97, m_temp_reg_95);
3904                 m_temp_reg_87 = _mm_unpackhi_epi64(m_temp_reg_97, m_temp_reg_95);
3905 
3906                 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size), m_temp_reg_30);
3907                 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 8), m_temp_reg_34);
3908                 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 16), m_temp_reg_36);
3909                 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 24), m_temp_reg_32);
3910 
3911                 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size), m_temp_reg_31);
3912                 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 8), m_temp_reg_35);
3913                 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 16), m_temp_reg_37);
3914                 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 24), m_temp_reg_33);
3915 
3916                 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size), m_temp_reg_80);
3917                 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 8), m_temp_reg_84);
3918                 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 16), m_temp_reg_86);
3919                 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 24), m_temp_reg_82);
3920 
3921                 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size), m_temp_reg_81);
3922                 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 8), m_temp_reg_85);
3923                 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 16), m_temp_reg_87);
3924                 _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 24), m_temp_reg_83);
3925 
3926                 pi2_dst_scratch += 4 * trans_size;
3927             }
3928         }
3929         pi2_src += 8;
3930 //      pi2_dequant_coeff +=8;
3931         pi2_tmp += 8 * trans_size;
3932         zero_cols = zero_cols >> 1;
3933     }
3934 
3935     if(trans_size_stg1 != TRANS_SIZE_32)
3936     {
3937         m_temp_reg_10 = _mm_setzero_si128();
3938 
3939         for(i = trans_size_stg1; i < 32; i += 8)
3940         {
3941             WORD16 *pi2_dst_scratch = pi2_tmp;
3942 
3943             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size), m_temp_reg_10);
3944             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 8), m_temp_reg_10);
3945             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 16), m_temp_reg_10);
3946             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 24), m_temp_reg_10);
3947 
3948             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size), m_temp_reg_10);
3949             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 8), m_temp_reg_10);
3950             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 16), m_temp_reg_10);
3951             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 24), m_temp_reg_10);
3952 
3953             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size), m_temp_reg_10);
3954             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 8), m_temp_reg_10);
3955             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 16), m_temp_reg_10);
3956             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 24), m_temp_reg_10);
3957 
3958             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size), m_temp_reg_10);
3959             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 8), m_temp_reg_10);
3960             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 16), m_temp_reg_10);
3961             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 24), m_temp_reg_10);
3962 
3963             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 4 * trans_size), m_temp_reg_10);
3964             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 4 * trans_size + 8), m_temp_reg_10);
3965             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 4 * trans_size + 16), m_temp_reg_10);
3966             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 4 * trans_size + 24), m_temp_reg_10);
3967 
3968             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 5 * trans_size), m_temp_reg_10);
3969             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 5 * trans_size + 8), m_temp_reg_10);
3970             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 5 * trans_size + 16), m_temp_reg_10);
3971             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 5 * trans_size + 24), m_temp_reg_10);
3972 
3973             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 6 * trans_size), m_temp_reg_10);
3974             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 6 * trans_size + 8), m_temp_reg_10);
3975             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 6 * trans_size + 16), m_temp_reg_10);
3976             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 6 * trans_size + 24), m_temp_reg_10);
3977 
3978             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 7 * trans_size), m_temp_reg_10);
3979             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 7 * trans_size + 8), m_temp_reg_10);
3980             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 7 * trans_size + 16), m_temp_reg_10);
3981             _mm_storeu_si128((__m128i *)(pi2_dst_scratch + 7 * trans_size + 24), m_temp_reg_10);
3982 
3983             pi2_tmp += 8 * trans_size;
3984         }
3985     }
3986 
3987     pi2_tmp = pi2_tmp_orig;
3988 
3989     /* Inverse Transform 2nd stage */
3990 
3991 
3992     for(j = 0; j < trans_size; j += 4)
3993     {
3994         i4_shift = IT_SHIFT_STAGE_2;
3995 
3996         /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
3997         if(zero_last28_rows_stg2)
3998         {
3999             {
4000 
4001                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87
4002                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87
4003                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80
4004                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70
4005                 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57
4006                 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43
4007                 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25
4008                 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //9
4009 
4010                 m_temp_reg_10 = _mm_loadu_si128((__m128i *)&pi2_tmp[2 * trans_size]);
4011 
4012                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_10, all_zero_reg);
4013 
4014                 /* eo0[0-3] */
4015                 {
4016                     m_temp_reg_90 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4017 
4018                 }
4019                 /* eo1[0-3] */
4020                 {
4021                     m_temp_reg_91 = _mm_madd_epi16(m_temp_reg_10, m_coeff2);
4022 
4023                 }
4024                 /* eo2[0-3] */
4025                 {
4026                     m_temp_reg_92 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
4027                 }
4028 
4029                 /* eo3[0-3] */
4030                 {
4031                     m_temp_reg_93 = _mm_madd_epi16(m_temp_reg_10, m_coeff4);
4032                 }
4033                 /* eo4[0-3] */
4034                 {
4035                     m_temp_reg_94 = _mm_madd_epi16(m_temp_reg_10, m_coeff5);
4036                 }
4037 
4038                 /* eo5[0-3] */
4039                 {
4040                     m_temp_reg_95 = _mm_madd_epi16(m_temp_reg_10, m_coeff6);
4041                 }
4042 
4043                 /* eo6[0-3] */
4044                 {
4045                     m_temp_reg_96 = _mm_madd_epi16(m_temp_reg_10, m_coeff7);
4046                 }
4047                 /* eo7[0-3] */
4048                 {
4049                     m_temp_reg_97 = _mm_madd_epi16(m_temp_reg_10, m_coeff8);
4050                 }
4051             }
4052 
4053             m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64
4054 
4055             m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[0 * trans_size]);
4056 
4057             m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
4058 
4059             m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
4060 
4061             m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
4062 
4063             /* e[]*/
4064 
4065             temp1 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_90);  /* ee[0] */
4066             temp2 = _mm_sub_epi32(m_temp_reg_14, m_temp_reg_90);  /* ee[15] */
4067 
4068             temp3 = _mm_add_epi32(m_temp_reg_16, m_temp_reg_91);  /* ee[1] */
4069             temp4 = _mm_sub_epi32(m_temp_reg_16, m_temp_reg_91);  /* ee[14] */
4070 
4071             temp5 = _mm_add_epi32(m_temp_reg_16, m_temp_reg_92);  /* ee[2] */
4072             temp6 = _mm_sub_epi32(m_temp_reg_16, m_temp_reg_92);  /* ee[13] */
4073 
4074             temp7 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_93);  /* ee[3] */
4075             temp8 = _mm_sub_epi32(m_temp_reg_14, m_temp_reg_93);  /* ee[12] */
4076 
4077             m_temp_reg_90 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_94);  /* ee[4] */
4078             m_temp_reg_91 = _mm_sub_epi32(m_temp_reg_14, m_temp_reg_94);  /* ee[11] */
4079 
4080             m_temp_reg_92 = _mm_add_epi32(m_temp_reg_16, m_temp_reg_95);  /* ee[5] */
4081             m_temp_reg_93 = _mm_sub_epi32(m_temp_reg_16, m_temp_reg_95);  /* ee[10] */
4082 
4083             m_temp_reg_94 = _mm_add_epi32(m_temp_reg_16, m_temp_reg_96);  /* ee[6] */
4084             m_temp_reg_95 = _mm_sub_epi32(m_temp_reg_16, m_temp_reg_96);  /* ee[9] */
4085 
4086             m_temp_reg_96 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_97);  /* ee[7] */
4087             m_temp_reg_97 = _mm_sub_epi32(m_temp_reg_14, m_temp_reg_97);  /* ee[8] */
4088 
4089             /*o[k]*/
4090             {
4091 
4092                 WORD16 *pi2_dst_scratch = temp_ptr;
4093                 WORD32 out_stride = 8;
4094 
4095                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
4096 
4097                 m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[trans_size]);
4098                 m_temp_reg_71 = _mm_loadu_si128((__m128i *)&pi2_tmp[3 * trans_size]);
4099 
4100                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved
4101 
4102 
4103                 /* o0[0-3] */
4104                 {
4105                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4106 
4107                     m_temp_reg_31 = _mm_sub_epi32(temp1, m_temp_reg_20);
4108                     m_temp_reg_30 = _mm_add_epi32(temp1, m_temp_reg_20);
4109 
4110                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4111                     m_count = _mm_cvtsi32_si128(i4_shift);
4112                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4113                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4114 
4115                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4116                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4117                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4118                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4119 
4120                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4121 
4122                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4123                     pi2_dst_scratch += out_stride;
4124 
4125                 }
4126 
4127                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
4128 
4129                 /* o1[0-3] */
4130                 {
4131                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4132 
4133                     m_temp_reg_31 = _mm_sub_epi32(temp3, m_temp_reg_20);
4134                     m_temp_reg_30 = _mm_add_epi32(temp3, m_temp_reg_20);
4135 
4136                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4137                     m_count = _mm_cvtsi32_si128(i4_shift);
4138                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4139                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4140 
4141                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4142                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4143                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4144                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4145 
4146                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4147 
4148                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4149                     pi2_dst_scratch += out_stride;
4150 
4151                 }
4152 
4153                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
4154 
4155                 /* o2[0-3] */
4156                 {
4157                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4158 
4159                     m_temp_reg_31 = _mm_sub_epi32(temp5, m_temp_reg_20);
4160                     m_temp_reg_30 = _mm_add_epi32(temp5, m_temp_reg_20);
4161 
4162                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4163                     m_count = _mm_cvtsi32_si128(i4_shift);
4164                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4165                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4166 
4167                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4168                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4169                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4170                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4171 
4172                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4173 
4174                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4175                     pi2_dst_scratch += out_stride;
4176 
4177                 }
4178 
4179                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
4180 
4181                 /* o3[0-3] */
4182                 {
4183                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4184 
4185                     m_temp_reg_31 = _mm_sub_epi32(temp7, m_temp_reg_20);
4186                     m_temp_reg_30 = _mm_add_epi32(temp7, m_temp_reg_20);
4187 
4188                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4189                     m_count = _mm_cvtsi32_si128(i4_shift);
4190                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4191                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4192 
4193                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4194                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4195                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4196                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4197 
4198                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4199 
4200                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4201                     pi2_dst_scratch += out_stride;
4202 
4203                 }
4204 
4205                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
4206 
4207                 /* o4[0-3] */
4208                 {
4209                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4210 
4211                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_20);
4212                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_20);
4213 
4214                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4215                     m_count = _mm_cvtsi32_si128(i4_shift);
4216                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4217                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4218 
4219                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4220                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4221                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4222                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4223 
4224                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4225 
4226                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4227                     pi2_dst_scratch += out_stride;
4228 
4229                 }
4230 
4231                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
4232 
4233                 /* o5[0-3] */
4234                 {
4235                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4236 
4237                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_20);
4238                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_20);
4239 
4240                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4241                     m_count = _mm_cvtsi32_si128(i4_shift);
4242                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4243                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4244 
4245                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4246                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4247                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4248                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4249 
4250                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4251 
4252                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4253                     pi2_dst_scratch += out_stride;
4254 
4255                 }
4256 
4257                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
4258 
4259                 /* o6[0-3] */
4260                 {
4261                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4262 
4263                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_20);
4264                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_20);
4265 
4266                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4267                     m_count = _mm_cvtsi32_si128(i4_shift);
4268                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4269                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4270 
4271                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4272                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4273                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4274                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4275 
4276                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4277 
4278                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4279                     pi2_dst_scratch += out_stride;
4280 
4281                 }
4282 
4283                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
4284 
4285                 /* o7[0-3] */
4286                 {
4287                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4288 
4289                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_20);
4290                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_20);
4291 
4292                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4293                     m_count = _mm_cvtsi32_si128(i4_shift);
4294                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4295                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4296 
4297                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4298                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4299                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4300                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4301 
4302                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4303 
4304                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4305                     pi2_dst_scratch += 8;
4306 
4307                 }
4308 
4309                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
4310 
4311                 /* o8[0-3] */
4312                 {
4313                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4314 
4315                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_20);
4316                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_20);
4317 
4318                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4319                     m_count = _mm_cvtsi32_si128(i4_shift);
4320                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4321                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4322 
4323                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4324                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4325                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4326                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4327 
4328                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4329 
4330                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4331                     pi2_dst_scratch += out_stride;
4332                 }
4333 
4334                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
4335 
4336                 /* o9[0-3] */
4337                 {
4338                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4339 
4340                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_20);
4341                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_20);
4342 
4343                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4344                     m_count = _mm_cvtsi32_si128(i4_shift);
4345                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4346                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4347 
4348                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4349                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4350                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4351                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4352 
4353                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4354 
4355                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4356                     pi2_dst_scratch += out_stride;
4357 
4358                 }
4359 
4360                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
4361 
4362                 /* o10[0-3] */
4363                 {
4364                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4365 
4366                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_20);
4367                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_20);
4368 
4369                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4370                     m_count = _mm_cvtsi32_si128(i4_shift);
4371                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4372                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4373 
4374                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4375                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4376                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4377                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4378 
4379                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4380 
4381                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4382                     pi2_dst_scratch += out_stride;
4383                 }
4384 
4385                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
4386 
4387                 /* o11[0-3] */
4388                 {
4389                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4390 
4391                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_20);
4392                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_20);
4393 
4394                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4395                     m_count = _mm_cvtsi32_si128(i4_shift);
4396                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4397                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4398 
4399                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4400                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4401                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4402                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4403 
4404                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4405 
4406                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4407                     pi2_dst_scratch += out_stride;
4408 
4409                 }
4410 
4411                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
4412 
4413                 /* o12[0-3] */
4414                 {
4415                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4416 
4417                     m_temp_reg_31 = _mm_add_epi32(temp8, m_temp_reg_20);
4418                     m_temp_reg_30 = _mm_sub_epi32(temp8, m_temp_reg_20);
4419 
4420                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4421                     m_count = _mm_cvtsi32_si128(i4_shift);
4422                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4423                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4424 
4425                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4426                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4427                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4428                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4429 
4430                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4431 
4432                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4433                     pi2_dst_scratch += out_stride;
4434 
4435                 }
4436 
4437                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
4438 
4439                 /* o13[0-3] */
4440                 {
4441                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4442 
4443                     m_temp_reg_31 = _mm_add_epi32(temp6, m_temp_reg_20);
4444                     m_temp_reg_30 = _mm_sub_epi32(temp6, m_temp_reg_20);
4445 
4446                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4447                     m_count = _mm_cvtsi32_si128(i4_shift);
4448                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4449                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4450 
4451                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4452                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4453                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4454                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4455 
4456                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4457 
4458                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4459                     pi2_dst_scratch += out_stride;
4460                 }
4461 
4462                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
4463 
4464                 /* o14[0-3] */
4465                 {
4466                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4467 
4468                     m_temp_reg_31 = _mm_add_epi32(temp4, m_temp_reg_20);
4469                     m_temp_reg_30 = _mm_sub_epi32(temp4, m_temp_reg_20);
4470 
4471                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4472                     m_count = _mm_cvtsi32_si128(i4_shift);
4473                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4474                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4475 
4476                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4477                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4478                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4479                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4480 
4481                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4482 
4483                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4484                     pi2_dst_scratch += out_stride;
4485 
4486                 }
4487 
4488                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
4489 
4490                 /* o15[0-3] */
4491                 {
4492                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4493 
4494                     m_temp_reg_31 = _mm_add_epi32(temp2, m_temp_reg_20);
4495                     m_temp_reg_30 = _mm_sub_epi32(temp2, m_temp_reg_20);
4496 
4497                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4498                     m_count = _mm_cvtsi32_si128(i4_shift);
4499                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4500                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4501 
4502                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4503                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4504                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4505                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4506 
4507                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4508 
4509                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4510                     pi2_dst_scratch += 8;
4511                 }
4512 
4513             }
4514 
4515         }
4516         else if(zero_last24_rows_stg2)
4517         {
4518             /* eo */
4519             {
4520                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87
4521 
4522                 m_temp_reg_10 = _mm_loadu_si128((__m128i *)&pi2_tmp[2 * trans_size]);
4523                 m_temp_reg_11 = _mm_loadu_si128((__m128i *)&pi2_tmp[6 * trans_size]);
4524 
4525                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_10, m_temp_reg_11);
4526 
4527 
4528                 /* eo0[0-3] */
4529                 {
4530                     m_temp_reg_90 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4531 
4532                 }
4533 
4534                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87  57
4535 
4536                 /* eo1[0-3] */
4537                 {
4538                     m_temp_reg_91 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4539 
4540                 }
4541                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80  9
4542 
4543                 /* eo2[0-3] */
4544                 {
4545                     m_temp_reg_92 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4546 
4547                 }
4548 
4549                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70  -43
4550 
4551                 /* eo3[0-3] */
4552                 {
4553 
4554                     m_temp_reg_93 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4555 
4556                 }
4557 
4558                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57  -80
4559 
4560                 /* eo4[0-3] */
4561                 {
4562                     m_temp_reg_94 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4563 
4564                 }
4565 
4566                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43  -90
4567 
4568                 /* eo5[0-3] */
4569                 {
4570                     m_temp_reg_95 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4571                 }
4572 
4573                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25  -70
4574                 /* eo6[0-3] */
4575                 {
4576                     m_temp_reg_96 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4577                 }
4578 
4579                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); //9  -25
4580                 /* eo7[0-3] */
4581                 {
4582                     m_temp_reg_97 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4583 
4584                 }
4585 
4586             }
4587 
4588             /* eeo */
4589             {
4590 
4591                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75
4592                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75
4593                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18
4594                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[8][0]); //50
4595 
4596                 m_temp_reg_72 = _mm_loadu_si128((__m128i *)&pi2_tmp[4 * trans_size]);
4597 
4598                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, all_zero_reg);
4599 
4600                 /* eeo0[0-3] */
4601                 {
4602                     temp1 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4603 
4604                 }
4605 
4606                 /* eeo1[0-3] */
4607                 {
4608                     temp2 = _mm_madd_epi16(m_temp_reg_10, m_coeff2);
4609 
4610                 }
4611 
4612                 /* eeo2[0-3] */
4613                 {
4614                     temp3 = _mm_madd_epi16(m_temp_reg_10, m_coeff4);
4615 
4616                 }
4617 
4618 
4619                 /* eeo3[0-3] */
4620                 {
4621                     temp4 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
4622 
4623                 }
4624 
4625             }
4626 
4627             m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83
4628             m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36
4629             m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64
4630 
4631             m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[0 * trans_size]);
4632 
4633             //m_temp_reg_1 = _mm_cvtepi16_epi32(m_temp_reg_70);
4634             m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg);
4635 
4636             m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
4637             m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
4638 
4639             m_temp_reg_70 = _mm_add_epi32(m_temp_reg_14, temp1);  /* ee[0] */
4640             m_temp_reg_71 = _mm_sub_epi32(m_temp_reg_14, temp1);  /* ee[7] */
4641 
4642             m_temp_reg_72 = _mm_add_epi32(m_temp_reg_16, temp2);  /* ee[1] */
4643             m_temp_reg_73 = _mm_sub_epi32(m_temp_reg_16, temp2);  /* ee[6] */
4644 
4645             m_temp_reg_74 = _mm_add_epi32(m_temp_reg_16, temp3);  /* ee[2] */
4646             m_temp_reg_75 = _mm_sub_epi32(m_temp_reg_16, temp3);  /* ee[5] */
4647 
4648             m_temp_reg_76 = _mm_add_epi32(m_temp_reg_14, temp4);  /* ee[3] */
4649             m_temp_reg_77 = _mm_sub_epi32(m_temp_reg_14, temp4);  /* ee[4] */
4650 
4651             /* e[]*/
4652 
4653             temp1 = _mm_add_epi32(m_temp_reg_70, m_temp_reg_90);  /* ee[0] */
4654             temp2 = _mm_sub_epi32(m_temp_reg_70, m_temp_reg_90);  /* ee[15] */
4655 
4656             temp3 = _mm_add_epi32(m_temp_reg_72, m_temp_reg_91);  /* ee[1] */
4657             temp4 = _mm_sub_epi32(m_temp_reg_72, m_temp_reg_91);  /* ee[14] */
4658 
4659             temp5 = _mm_add_epi32(m_temp_reg_74, m_temp_reg_92);  /* ee[2] */
4660             temp6 = _mm_sub_epi32(m_temp_reg_74, m_temp_reg_92);  /* ee[13] */
4661 
4662             temp7 = _mm_add_epi32(m_temp_reg_76, m_temp_reg_93);  /* ee[3] */
4663             temp8 = _mm_sub_epi32(m_temp_reg_76, m_temp_reg_93);  /* ee[12] */
4664 
4665             m_temp_reg_90 = _mm_add_epi32(m_temp_reg_77, m_temp_reg_94);  /* ee[4] */
4666             m_temp_reg_91 = _mm_sub_epi32(m_temp_reg_77, m_temp_reg_94);  /* ee[11] */
4667 
4668             m_temp_reg_92 = _mm_add_epi32(m_temp_reg_75, m_temp_reg_95);  /* ee[5] */
4669             m_temp_reg_93 = _mm_sub_epi32(m_temp_reg_75, m_temp_reg_95);  /* ee[10] */
4670 
4671             m_temp_reg_94 = _mm_add_epi32(m_temp_reg_73, m_temp_reg_96);  /* ee[6] */
4672             m_temp_reg_95 = _mm_sub_epi32(m_temp_reg_73, m_temp_reg_96);  /* ee[9] */
4673 
4674             m_temp_reg_96 = _mm_add_epi32(m_temp_reg_71, m_temp_reg_97);  /* ee[7] */
4675             m_temp_reg_97 = _mm_sub_epi32(m_temp_reg_71, m_temp_reg_97);  /* ee[8] */
4676 
4677             /*o[k] */
4678             {
4679 
4680                 WORD16 *pi2_dst_scratch = temp_ptr;
4681                 WORD32 out_stride = 8;
4682 
4683                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
4684                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]);
4685 
4686                 m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[trans_size]);
4687                 m_temp_reg_71 = _mm_loadu_si128((__m128i *)&pi2_tmp[3 * trans_size]);
4688                 m_temp_reg_72 = _mm_loadu_si128((__m128i *)&pi2_tmp[5 * trans_size]);
4689                 m_temp_reg_73 = _mm_loadu_si128((__m128i *)&pi2_tmp[7 * trans_size]);
4690 
4691                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71);
4692                 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73);
4693 
4694                 /* o0[0-3] */
4695                 {
4696                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4697                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
4698 
4699                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
4700 
4701                     m_temp_reg_31 = _mm_sub_epi32(temp1, m_temp_reg_20);
4702                     m_temp_reg_30 = _mm_add_epi32(temp1, m_temp_reg_20);
4703 
4704                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4705                     m_count = _mm_cvtsi32_si128(i4_shift);
4706                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4707                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4708 
4709                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4710                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4711                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4712                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4713 
4714                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4715 
4716                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4717                     pi2_dst_scratch += out_stride;
4718 
4719                 }
4720 
4721 
4722                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
4723                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]);
4724 
4725                 /* o1[0-3] */
4726                 {
4727                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4728                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
4729 
4730                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
4731 
4732                     m_temp_reg_31 = _mm_sub_epi32(temp3, m_temp_reg_20);
4733                     m_temp_reg_30 = _mm_add_epi32(temp3, m_temp_reg_20);
4734 
4735                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4736                     m_count = _mm_cvtsi32_si128(i4_shift);
4737                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4738                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4739 
4740                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4741                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4742                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4743                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4744 
4745                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4746 
4747                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4748                     pi2_dst_scratch += out_stride;
4749 
4750                 }
4751 
4752                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
4753                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]);
4754 
4755                 /* o2[0-3] */
4756                 {
4757                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4758                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
4759 
4760                     m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
4761 
4762                     m_temp_reg_31 = _mm_add_epi32(temp5, m_temp_reg_20);
4763                     m_temp_reg_30 = _mm_sub_epi32(temp5, m_temp_reg_20);
4764 
4765                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4766                     m_count = _mm_cvtsi32_si128(i4_shift);
4767                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4768                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4769 
4770                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4771                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4772                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4773                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4774 
4775                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4776 
4777                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4778                     pi2_dst_scratch += out_stride;
4779 
4780                 }
4781 
4782                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
4783                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[25][0]);
4784 
4785                 /* o3[0-3] */
4786                 {
4787                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4788                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
4789 
4790                     m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
4791 
4792                     m_temp_reg_31 = _mm_add_epi32(temp7, m_temp_reg_20);
4793                     m_temp_reg_30 = _mm_sub_epi32(temp7, m_temp_reg_20);
4794 
4795                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4796                     m_count = _mm_cvtsi32_si128(i4_shift);
4797                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4798                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4799 
4800                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4801                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4802                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4803                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4804 
4805                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4806 
4807                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4808                     pi2_dst_scratch += out_stride;
4809 
4810                 }
4811 
4812                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
4813                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[33][0]);
4814 
4815                 /* o4[0-3] */
4816                 {
4817                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4818                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
4819 
4820                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
4821 
4822                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_20);
4823                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_20);
4824 
4825                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4826                     m_count = _mm_cvtsi32_si128(i4_shift);
4827                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4828                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4829 
4830                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4831                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4832                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4833                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4834 
4835                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4836 
4837                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4838                     pi2_dst_scratch += out_stride;
4839 
4840                 }
4841 
4842                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
4843                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[41][0]);
4844 
4845                 /* o5[0-3] */
4846                 {
4847                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4848                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
4849 
4850                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
4851 
4852                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_20);
4853                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_20);
4854 
4855                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4856                     m_count = _mm_cvtsi32_si128(i4_shift);
4857                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4858                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4859 
4860                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4861                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4862                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4863                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4864 
4865                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4866 
4867                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4868                     pi2_dst_scratch += out_stride;
4869 
4870                 }
4871 
4872                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
4873                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[49][0]);
4874 
4875                 /* o6[0-3] */
4876                 {
4877                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4878                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
4879 
4880                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
4881 
4882                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_20);
4883                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_20);
4884 
4885                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4886                     m_count = _mm_cvtsi32_si128(i4_shift);
4887                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4888                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4889 
4890                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4891                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4892                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4893                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4894 
4895                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4896 
4897                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4898                     pi2_dst_scratch += out_stride;
4899 
4900                 }
4901 
4902                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
4903                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[57][0]);
4904 
4905                 /* o7[0-3] */
4906                 {
4907                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4908                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
4909 
4910                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
4911 
4912                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_20);
4913                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_20);
4914 
4915                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4916                     m_count = _mm_cvtsi32_si128(i4_shift);
4917                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4918                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4919 
4920                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4921                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4922                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4923                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4924 
4925                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4926 
4927                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4928                     pi2_dst_scratch += 8;
4929 
4930                 }
4931 
4932                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
4933                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[65][0]);
4934 
4935                 /* o8[0-3] */
4936                 {
4937                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4938                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
4939 
4940                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
4941 
4942                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_20);
4943                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_20);
4944 
4945                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4946                     m_count = _mm_cvtsi32_si128(i4_shift);
4947                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4948                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4949 
4950                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4951                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4952                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4953                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4954 
4955                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4956 
4957                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4958                     pi2_dst_scratch += out_stride;
4959                 }
4960 
4961                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
4962                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[73][0]);
4963 
4964                 /* o9[0-3] */
4965                 {
4966                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4967                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
4968 
4969                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
4970 
4971                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_20);
4972                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_20);
4973 
4974                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
4975                     m_count = _mm_cvtsi32_si128(i4_shift);
4976                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
4977                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
4978 
4979                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
4980                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
4981                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
4982                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
4983 
4984                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
4985 
4986                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
4987                     pi2_dst_scratch += out_stride;
4988                 }
4989 
4990                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
4991                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[81][0]);
4992 
4993                 /* o10[0-3] */
4994                 {
4995                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
4996                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
4997 
4998                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
4999 
5000                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_20);
5001                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_20);
5002 
5003                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
5004                     m_count = _mm_cvtsi32_si128(i4_shift);
5005                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
5006                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
5007 
5008                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5009                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5010                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
5011                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5012 
5013                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
5014 
5015                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
5016                     pi2_dst_scratch += out_stride;
5017                 }
5018 
5019                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
5020                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[89][0]);
5021 
5022                 /* o11[0-3] */
5023                 {
5024                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5025                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5026 
5027                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
5028 
5029                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_20);
5030                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_20);
5031 
5032                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
5033                     m_count = _mm_cvtsi32_si128(i4_shift);
5034                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
5035                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
5036 
5037                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5038                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5039                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
5040                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5041 
5042                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
5043 
5044                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
5045                     pi2_dst_scratch += out_stride;
5046 
5047                 }
5048 
5049                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
5050                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[97][0]);
5051 
5052                 /* o12[0-3] */
5053                 {
5054                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5055                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5056 
5057                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
5058 
5059                     m_temp_reg_31 = _mm_add_epi32(temp8, m_temp_reg_20);
5060                     m_temp_reg_30 = _mm_sub_epi32(temp8, m_temp_reg_20);
5061 
5062                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
5063                     m_count = _mm_cvtsi32_si128(i4_shift);
5064                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
5065                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
5066 
5067                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5068                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5069                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
5070                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5071 
5072                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
5073 
5074                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
5075                     pi2_dst_scratch += out_stride;
5076 
5077                 }
5078 
5079                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
5080                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[105][0]);
5081 
5082                 /* o13[0-3] */
5083                 {
5084                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5085                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5086 
5087                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
5088 
5089                     m_temp_reg_31 = _mm_add_epi32(temp6, m_temp_reg_20);
5090                     m_temp_reg_30 = _mm_sub_epi32(temp6, m_temp_reg_20);
5091 
5092                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
5093                     m_count = _mm_cvtsi32_si128(i4_shift);
5094                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
5095                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
5096 
5097                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5098                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5099                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
5100                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5101 
5102                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
5103 
5104                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
5105                     pi2_dst_scratch += out_stride;
5106                 }
5107 
5108                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
5109                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[113][0]);
5110 
5111                 /* o14[0-3] */
5112                 {
5113                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5114                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5115 
5116                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
5117 
5118                     m_temp_reg_31 = _mm_add_epi32(temp4, m_temp_reg_20);
5119                     m_temp_reg_30 = _mm_sub_epi32(temp4, m_temp_reg_20);
5120 
5121                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
5122                     m_count = _mm_cvtsi32_si128(i4_shift);
5123                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
5124                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
5125 
5126                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5127                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5128                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
5129                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5130 
5131                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
5132 
5133                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
5134                     pi2_dst_scratch += out_stride;
5135                 }
5136 
5137                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
5138                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[121][0]);
5139 
5140                 /* o15[0-3] */
5141                 {
5142                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5143                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5144 
5145                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);
5146 
5147                     m_temp_reg_31 = _mm_add_epi32(temp2, m_temp_reg_20);
5148                     m_temp_reg_30 = _mm_sub_epi32(temp2, m_temp_reg_20);
5149 
5150                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
5151                     m_count = _mm_cvtsi32_si128(i4_shift);
5152                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
5153                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
5154 
5155                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5156                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5157                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
5158                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5159 
5160                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
5161 
5162                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
5163                     pi2_dst_scratch += 8;
5164                 }
5165 
5166             }
5167         }
5168         else
5169         {
5170             /* eo */
5171             {
5172 
5173                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87
5174                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[1][0]); //80 70
5175                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57 43
5176                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25 9
5177 
5178 
5179                 m_temp_reg_10 = _mm_loadu_si128((__m128i *)&pi2_tmp[2 * trans_size]);
5180                 m_temp_reg_11 = _mm_loadu_si128((__m128i *)&pi2_tmp[6 * trans_size]);
5181                 m_temp_reg_12 = _mm_loadu_si128((__m128i *)&pi2_tmp[10 * trans_size]);
5182                 m_temp_reg_13 = _mm_loadu_si128((__m128i *)&pi2_tmp[14 * trans_size]);
5183                 m_temp_reg_18 = _mm_loadu_si128((__m128i *)&pi2_tmp[18 * trans_size]);
5184                 m_temp_reg_19 = _mm_loadu_si128((__m128i *)&pi2_tmp[22 * trans_size]);
5185                 m_temp_reg_20 = _mm_loadu_si128((__m128i *)&pi2_tmp[26 * trans_size]);
5186                 m_temp_reg_21 = _mm_loadu_si128((__m128i *)&pi2_tmp[30 * trans_size]);
5187 
5188                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_10, m_temp_reg_11);
5189                 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_12, m_temp_reg_13);
5190                 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_18, m_temp_reg_19);
5191                 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_21);
5192 
5193                 /* eo0[0-3] */
5194                 {
5195                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5196                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5197 
5198                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
5199 
5200                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5201                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5202 
5203                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
5204 
5205                     m_temp_reg_90 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
5206 
5207                 }
5208 
5209                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87  57
5210                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //0  -43
5211                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80  90
5212                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70  25
5213 
5214                 /* eo1[0-3] */
5215                 {
5216                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5217                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5218 
5219                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
5220 
5221                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5222                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5223 
5224                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
5225 
5226                     m_temp_reg_91 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_32);
5227 
5228                 }
5229 
5230                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80  9
5231                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[9][0]); //70  87
5232                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[10][0]); //-25  57
5233                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[11][0]); //90  43
5234 
5235                 /* eo2[0-3] */
5236                 {
5237                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5238                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5239 
5240                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31);
5241 
5242                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5243                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5244 
5245                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
5246 
5247                     m_temp_reg_92 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
5248 
5249                 }
5250 
5251                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70  -43
5252                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[13][0]); //-87  9
5253                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[14][0]); //90  25
5254                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[15][0]); //80  57
5255 
5256                 /* eo3[0-3] */
5257                 {
5258                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5259                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5260 
5261                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
5262 
5263                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5264                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5265 
5266                     m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_32, m_temp_reg_33);
5267 
5268                     m_temp_reg_93 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
5269 
5270                 }
5271 
5272                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57  -80
5273                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[17][0]); //-25  90
5274                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[18][0]); //9  87
5275                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43  70
5276 
5277 
5278                 /* eo4[0-3] */
5279                 {
5280                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5281                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5282 
5283                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
5284 
5285                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5286                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5287 
5288                     m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32);
5289 
5290                     m_temp_reg_94 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
5291 
5292                 }
5293 
5294                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43  -90
5295                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[21][0]); //57  25
5296                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[22][0]); //-87  70
5297                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[23][0]); //9  -80
5298 
5299                 /* eo5[0-3] */
5300                 {
5301                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5302                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5303 
5304                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
5305 
5306                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5307                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5308 
5309                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
5310 
5311                     m_temp_reg_95 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
5312                 }
5313 
5314                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25  -70
5315                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[25][0]); //90  -80
5316                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[26][0]); //43  9
5317                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[27][0]); //-57  87
5318 
5319                 /* eo6[0-3] */
5320                 {
5321                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5322                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5323 
5324                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
5325 
5326                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5327                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5328 
5329                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
5330 
5331                     m_temp_reg_96 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
5332 
5333                 }
5334 
5335                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); //9  -25
5336                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[29][0]); //43  -57
5337                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[30][0]); //70  -80
5338                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[31][0]); //87  -90
5339 
5340                 /* eo7[0-3] */
5341                 {
5342                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5343                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5344 
5345                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
5346 
5347                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5348                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5349 
5350                     m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33);
5351 
5352                     m_temp_reg_97 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32);
5353 
5354 
5355                 }
5356 
5357             }
5358 
5359             /* eeo */
5360             {
5361                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75
5362                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[5][0]); //50 18
5363 
5364                 m_temp_reg_72 = _mm_loadu_si128((__m128i *)&pi2_tmp[4 * trans_size]);
5365                 m_temp_reg_76 = _mm_loadu_si128((__m128i *)&pi2_tmp[12 * trans_size]);
5366                 m_temp_reg_82 = _mm_loadu_si128((__m128i *)&pi2_tmp[20 * trans_size]);
5367                 m_temp_reg_86 = _mm_loadu_si128((__m128i *)&pi2_tmp[28 * trans_size]);
5368 
5369                 /* eeo0[0-3] */
5370                 {
5371 
5372                     m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
5373                     m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_86);
5374 
5375                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5376                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5377 
5378                     temp1 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
5379 
5380                 }
5381 
5382                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75 -18
5383                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[7][0]); //89  50
5384 
5385                 /* eeo1[0-3] */
5386                 {
5387                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
5388                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
5389 
5390                     temp2 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31);
5391 
5392                 }
5393 
5394                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[8][0]); //50 -89
5395                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18  75
5396 
5397                 /* eeo2[0-3] */
5398                 {
5399                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
5400                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
5401 
5402                     temp3 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
5403 
5404                 }
5405 
5406                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[10][0]); //18 -50
5407                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[11][0]); //75  -89
5408 
5409                 /* eeo3[0-3] */
5410                 {
5411                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3);
5412                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4);
5413 
5414                     temp4 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31);
5415 
5416                 }
5417 
5418 
5419             }
5420 
5421             m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83 36
5422             m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36 -83
5423 
5424             m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64 64
5425             m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[3][0]); //64 -64
5426 
5427             m_temp_reg_74 = _mm_loadu_si128((__m128i *)&pi2_tmp[8 * trans_size]);
5428             m_temp_reg_84 = _mm_loadu_si128((__m128i *)&pi2_tmp[24 * trans_size]);
5429 
5430             m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_84);
5431 
5432             m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[0 * trans_size]);
5433             m_temp_reg_80 = _mm_loadu_si128((__m128i *)&pi2_tmp[16 * trans_size]);
5434 
5435             m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_80);
5436 
5437             m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);  /* eeeo[0] */
5438             m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);  /* eeeo[1] */
5439 
5440             m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);  /* eeee[0] */
5441             m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff4);  /* eeee[1] */
5442 
5443 /* eeeo[0]= m_temp_reg_20  */
5444 /* eeeo[1]= m_temp_reg_22  */
5445 /* eeee[0]= m_temp_reg_21  */
5446 /* eeee[1]= m_temp_reg_23  */
5447 
5448             /* eee[0] = eeee[0] + eeeo[0]; */
5449             m_temp_reg_40 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20);  /* eeeo[0] */
5450 
5451             /* eee[3] = eeee[0] - eeeo[0]; */
5452             m_temp_reg_43 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);  /* eeeo[1] */
5453 
5454             /* eee[2] = eeee[1] - eeeo[1]; */
5455             m_temp_reg_42 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_22);  /* eeee[1] */
5456 
5457             /* eee[1] = eeee[1] + eeeo[1];*/
5458             m_temp_reg_41 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_22);  /* eeee[0] */
5459 
5460             m_temp_reg_70 = _mm_add_epi32(m_temp_reg_40, temp1);  /* ee[0] */
5461             m_temp_reg_71 = _mm_sub_epi32(m_temp_reg_40, temp1);  /* ee[7] */
5462 
5463             m_temp_reg_72 = _mm_add_epi32(m_temp_reg_41, temp2);  /* ee[1] */
5464             m_temp_reg_73 = _mm_sub_epi32(m_temp_reg_41, temp2);  /* ee[6] */
5465 
5466             m_temp_reg_74 = _mm_add_epi32(m_temp_reg_42, temp3);  /* ee[2] */
5467             m_temp_reg_75 = _mm_sub_epi32(m_temp_reg_42, temp3);  /* ee[5] */
5468 
5469             m_temp_reg_76 = _mm_add_epi32(m_temp_reg_43, temp4);  /* ee[3] */
5470             m_temp_reg_77 = _mm_sub_epi32(m_temp_reg_43, temp4);  /* ee[4] */
5471 
5472 /* e[]*/
5473 
5474             temp1 = _mm_add_epi32(m_temp_reg_70, m_temp_reg_90);  /* ee[0] */
5475             temp2 = _mm_sub_epi32(m_temp_reg_70, m_temp_reg_90);  /* ee[15] */
5476 
5477             temp3 = _mm_add_epi32(m_temp_reg_72, m_temp_reg_91);  /* ee[1] */
5478             temp4 = _mm_sub_epi32(m_temp_reg_72, m_temp_reg_91);  /* ee[14] */
5479 
5480             temp5 = _mm_add_epi32(m_temp_reg_74, m_temp_reg_92);  /* ee[2] */
5481             temp6 = _mm_sub_epi32(m_temp_reg_74, m_temp_reg_92);  /* ee[13] */
5482 
5483             temp7 = _mm_add_epi32(m_temp_reg_76, m_temp_reg_93);  /* ee[3] */
5484             temp8 = _mm_sub_epi32(m_temp_reg_76, m_temp_reg_93);  /* ee[12] */
5485 
5486             m_temp_reg_90 = _mm_add_epi32(m_temp_reg_77, m_temp_reg_94);  /* ee[4] */
5487             m_temp_reg_91 = _mm_sub_epi32(m_temp_reg_77, m_temp_reg_94);  /* ee[11] */
5488 
5489             m_temp_reg_92 = _mm_add_epi32(m_temp_reg_75, m_temp_reg_95);  /* ee[5] */
5490             m_temp_reg_93 = _mm_sub_epi32(m_temp_reg_75, m_temp_reg_95);  /* ee[10] */
5491 
5492             m_temp_reg_94 = _mm_add_epi32(m_temp_reg_73, m_temp_reg_96);  /* ee[6] */
5493             m_temp_reg_95 = _mm_sub_epi32(m_temp_reg_73, m_temp_reg_96);  /* ee[9] */
5494 
5495             m_temp_reg_96 = _mm_add_epi32(m_temp_reg_71, m_temp_reg_97);  /* ee[7] */
5496             m_temp_reg_97 = _mm_sub_epi32(m_temp_reg_71, m_temp_reg_97);  /* ee[8] */
5497 
5498 /*o[k] */
5499             {
5500 
5501                 WORD16 *pi2_dst_scratch = temp_ptr;
5502                 WORD32 out_stride = 8;
5503 
5504                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]);
5505                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]);
5506                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[2][0]);
5507                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[3][0]);
5508                 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[4][0]);
5509                 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[5][0]);
5510                 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[6][0]);
5511                 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[7][0]);
5512 
5513 
5514                 m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[trans_size]);
5515                 m_temp_reg_71 = _mm_loadu_si128((__m128i *)&pi2_tmp[3 * trans_size]);
5516                 m_temp_reg_72 = _mm_loadu_si128((__m128i *)&pi2_tmp[5 * trans_size]);
5517                 m_temp_reg_73 = _mm_loadu_si128((__m128i *)&pi2_tmp[7 * trans_size]);
5518                 m_temp_reg_74 = _mm_loadu_si128((__m128i *)&pi2_tmp[9 * trans_size]);
5519                 m_temp_reg_75 = _mm_loadu_si128((__m128i *)&pi2_tmp[11 * trans_size]);
5520                 m_temp_reg_76 = _mm_loadu_si128((__m128i *)&pi2_tmp[13 * trans_size]);
5521                 m_temp_reg_77 = _mm_loadu_si128((__m128i *)&pi2_tmp[15 * trans_size]);
5522 
5523                 m_temp_reg_80 = _mm_loadu_si128((__m128i *)&pi2_tmp[17 * trans_size]);
5524                 m_temp_reg_81 = _mm_loadu_si128((__m128i *)&pi2_tmp[19 * trans_size]);
5525                 m_temp_reg_82 = _mm_loadu_si128((__m128i *)&pi2_tmp[21 * trans_size]);
5526                 m_temp_reg_83 = _mm_loadu_si128((__m128i *)&pi2_tmp[23 * trans_size]);
5527                 m_temp_reg_84 = _mm_loadu_si128((__m128i *)&pi2_tmp[25 * trans_size]);
5528                 m_temp_reg_85 = _mm_loadu_si128((__m128i *)&pi2_tmp[27 * trans_size]);
5529                 m_temp_reg_86 = _mm_loadu_si128((__m128i *)&pi2_tmp[29 * trans_size]);
5530                 m_temp_reg_87 = _mm_loadu_si128((__m128i *)&pi2_tmp[31 * trans_size]);
5531 
5532                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved
5533                 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 interleaved
5534                 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 interleaved
5535                 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 interleaved
5536                 m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_80, m_temp_reg_81); //row 17 and row 19 interleaved
5537                 m_temp_reg_15 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_83); //row 21 and row 23 interleaved
5538                 m_temp_reg_16 = _mm_unpacklo_epi16(m_temp_reg_84, m_temp_reg_85); //row 25 and row 27 interleaved
5539                 m_temp_reg_17 = _mm_unpacklo_epi16(m_temp_reg_86, m_temp_reg_87); //row 29 and row 31 interleaved
5540 
5541                 /* o0[0-3] */
5542                 {
5543                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5544                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5545                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5546                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5547 
5548                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
5549                     m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
5550 
5551                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
5552 
5553                     m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
5554                     m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
5555                     m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
5556                     m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
5557 
5558                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
5559                     m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
5560 
5561                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
5562 
5563                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
5564 
5565                     m_temp_reg_31 = _mm_sub_epi32(temp1, m_temp_reg_20);
5566                     m_temp_reg_30 = _mm_add_epi32(temp1, m_temp_reg_20);
5567 
5568                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
5569                     m_count = _mm_cvtsi32_si128(i4_shift);
5570                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
5571                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
5572 
5573                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5574                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5575                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
5576                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5577 
5578                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
5579 
5580                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
5581                     pi2_dst_scratch += out_stride;
5582 
5583                 }
5584 
5585                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]);
5586                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]);
5587                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[10][0]);
5588                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[11][0]);
5589                 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[12][0]);
5590                 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[13][0]);
5591                 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[14][0]);
5592                 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[15][0]);
5593 
5594                 /* o1[0-3] */
5595                 {
5596                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5597                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5598                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5599                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5600 
5601                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
5602                     m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
5603 
5604                     m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_20);
5605 
5606                     m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
5607                     m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
5608                     m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
5609                     m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
5610 
5611                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
5612                     m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
5613 
5614                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
5615 
5616                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
5617 
5618                     m_temp_reg_31 = _mm_add_epi32(temp3, m_temp_reg_20);
5619                     m_temp_reg_30 = _mm_sub_epi32(temp3, m_temp_reg_20);
5620 
5621                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
5622                     m_count = _mm_cvtsi32_si128(i4_shift);
5623                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
5624                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
5625 
5626                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5627                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5628                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
5629                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5630 
5631                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
5632 
5633                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
5634                     pi2_dst_scratch += out_stride;
5635 
5636                 }
5637 
5638                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]);
5639                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]);
5640                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[18][0]);
5641                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[19][0]);
5642                 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[20][0]);
5643                 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[21][0]);
5644                 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[22][0]);
5645                 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[23][0]);
5646 
5647                 /* o2[0-3] */
5648                 {
5649                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5650                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5651                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5652                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5653 
5654                     m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
5655                     m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
5656 
5657                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
5658 
5659                     m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
5660                     m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
5661                     m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
5662                     m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
5663 
5664                     m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_41);
5665                     m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
5666 
5667                     m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_42);
5668 
5669                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
5670 
5671                     m_temp_reg_31 = _mm_add_epi32(temp5, m_temp_reg_20);
5672                     m_temp_reg_30 = _mm_sub_epi32(temp5, m_temp_reg_20);
5673 
5674                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
5675                     m_count = _mm_cvtsi32_si128(i4_shift);
5676                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
5677                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
5678 
5679                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5680                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5681                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
5682                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5683 
5684                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
5685 
5686                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
5687                     pi2_dst_scratch += out_stride;
5688 
5689                 }
5690 
5691                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]);
5692                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[25][0]);
5693                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[26][0]);
5694                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[27][0]);
5695                 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[28][0]);
5696                 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[29][0]);
5697                 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[30][0]);
5698                 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[31][0]);
5699 
5700                 /* o3[0-3] */
5701                 {
5702                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5703                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5704                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5705                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5706 
5707                     m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20);
5708                     m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
5709 
5710                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
5711 
5712                     m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
5713                     m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
5714                     m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
5715                     m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
5716 
5717                     m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_40);
5718                     m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
5719 
5720                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
5721 
5722                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
5723 
5724                     m_temp_reg_31 = _mm_add_epi32(temp7, m_temp_reg_20);
5725                     m_temp_reg_30 = _mm_sub_epi32(temp7, m_temp_reg_20);
5726 
5727                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
5728                     m_count = _mm_cvtsi32_si128(i4_shift);
5729                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
5730                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
5731 
5732                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5733                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5734                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
5735                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5736 
5737                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
5738 
5739                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
5740                     pi2_dst_scratch += out_stride;
5741 
5742                 }
5743 
5744                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]);
5745                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[33][0]);
5746                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[34][0]);
5747                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[35][0]);
5748                 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[36][0]);
5749                 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[37][0]);
5750                 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[38][0]);
5751                 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[39][0]);
5752 
5753                 /* o4[0-3] */
5754                 {
5755                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5756                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5757                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5758                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5759 
5760                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
5761                     m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
5762 
5763                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
5764 
5765                     m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
5766                     m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
5767                     m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
5768                     m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
5769 
5770                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
5771                     m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
5772 
5773                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
5774 
5775                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
5776 
5777                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_20);
5778                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_20);
5779                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
5780                     m_count = _mm_cvtsi32_si128(i4_shift);
5781                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
5782                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
5783 
5784                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5785                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5786                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
5787                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5788 
5789                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
5790 
5791                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
5792                     pi2_dst_scratch += out_stride;
5793 
5794                 }
5795 
5796                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]);
5797                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[41][0]);
5798                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[42][0]);
5799                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[43][0]);
5800                 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[44][0]);
5801                 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[45][0]);
5802                 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[46][0]);
5803                 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[47][0]);
5804 
5805                 /* o5[0-3] */
5806                 {
5807                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5808                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5809                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5810                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5811 
5812                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
5813                     m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
5814 
5815                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
5816 
5817                     m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
5818                     m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
5819                     m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
5820                     m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
5821 
5822                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
5823                     m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
5824 
5825                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
5826 
5827                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
5828 
5829                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_20);
5830                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_20);
5831 
5832                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
5833                     m_count = _mm_cvtsi32_si128(i4_shift);
5834                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
5835                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
5836 
5837                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5838                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5839                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
5840                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5841 
5842                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
5843 
5844                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
5845                     pi2_dst_scratch += out_stride;
5846 
5847                 }
5848 
5849                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]);
5850                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[49][0]);
5851                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[50][0]);
5852                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[51][0]);
5853                 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[52][0]);
5854                 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[53][0]);
5855                 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[54][0]);
5856                 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[55][0]);
5857 
5858                 /* o6[0-3] */
5859                 {
5860                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5861                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5862                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5863                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5864 
5865                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
5866                     m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
5867 
5868                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
5869 
5870                     m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
5871                     m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
5872                     m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
5873                     m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
5874 
5875                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
5876                     m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
5877 
5878                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
5879 
5880                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
5881 
5882                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_20);
5883                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_20);
5884 
5885                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
5886                     m_count = _mm_cvtsi32_si128(i4_shift);
5887                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
5888                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
5889 
5890                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5891                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5892                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
5893                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5894 
5895                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
5896 
5897                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
5898                     pi2_dst_scratch += out_stride;
5899 
5900                 }
5901 
5902                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]);
5903                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[57][0]);
5904                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[58][0]);
5905                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[59][0]);
5906                 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[60][0]);
5907                 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[61][0]);
5908                 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[62][0]);
5909                 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[63][0]);
5910 
5911                 /* o7[0-3] */
5912                 {
5913                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5914                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5915                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5916                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5917 
5918                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
5919                     m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
5920 
5921                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
5922 
5923                     m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
5924                     m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
5925                     m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
5926                     m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
5927 
5928                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
5929                     m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
5930 
5931                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
5932 
5933                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
5934 
5935                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_20);
5936                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_20);
5937 
5938                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
5939                     m_count = _mm_cvtsi32_si128(i4_shift);
5940                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
5941                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
5942 
5943                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5944                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5945                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
5946                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
5947 
5948                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
5949 
5950                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
5951                     pi2_dst_scratch += 8;
5952 
5953                 }
5954 
5955                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]);
5956                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[65][0]);
5957                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[66][0]);
5958                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[67][0]);
5959                 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[68][0]);
5960                 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[69][0]);
5961                 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[70][0]);
5962                 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[71][0]);
5963 
5964                 /* o8[0-3] */
5965                 {
5966                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
5967                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
5968                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
5969                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
5970 
5971                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
5972                     m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
5973 
5974                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
5975 
5976                     m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
5977                     m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
5978                     m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
5979                     m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
5980 
5981                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
5982                     m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
5983 
5984                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
5985 
5986                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
5987 
5988                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_20);
5989                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_20);
5990 
5991                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
5992                     m_count = _mm_cvtsi32_si128(i4_shift);
5993                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
5994                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
5995 
5996                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
5997                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
5998                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
5999                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
6000 
6001                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
6002 
6003                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
6004                     pi2_dst_scratch += out_stride;
6005                 }
6006 
6007                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]);
6008                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[73][0]);
6009                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[74][0]);
6010                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[75][0]);
6011                 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[76][0]);
6012                 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[77][0]);
6013                 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[78][0]);
6014                 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[79][0]);
6015 
6016                 /* o9[0-3] */
6017                 {
6018                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
6019                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
6020                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
6021                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
6022 
6023                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
6024                     m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
6025 
6026                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
6027 
6028                     m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
6029                     m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
6030                     m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
6031                     m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
6032 
6033                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
6034                     m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
6035 
6036                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
6037 
6038                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
6039 
6040                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_20);
6041                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_20);
6042 
6043                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
6044                     m_count = _mm_cvtsi32_si128(i4_shift);
6045                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
6046                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
6047 
6048                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
6049                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
6050                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
6051                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
6052 
6053                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
6054 
6055                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
6056                     pi2_dst_scratch += out_stride;
6057                 }
6058 
6059                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]);
6060                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[81][0]);
6061                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[82][0]);
6062                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[83][0]);
6063                 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[84][0]);
6064                 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[85][0]);
6065                 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[86][0]);
6066                 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[87][0]);
6067 
6068                 /* o10[0-3] */
6069                 {
6070                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
6071                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
6072                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
6073                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
6074 
6075                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
6076                     m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
6077 
6078                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
6079 
6080                     m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
6081                     m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
6082                     m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
6083                     m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
6084 
6085                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
6086                     m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
6087 
6088                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
6089 
6090                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
6091 
6092                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_20);
6093                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_20);
6094 
6095                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
6096                     m_count = _mm_cvtsi32_si128(i4_shift);
6097                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
6098                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
6099 
6100                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
6101                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
6102                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
6103                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
6104 
6105                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
6106 
6107                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
6108                     pi2_dst_scratch += out_stride;
6109                 }
6110 
6111 
6112                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]);
6113                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[89][0]);
6114                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[90][0]);
6115                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[91][0]);
6116                 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[92][0]);
6117                 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[93][0]);
6118                 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[94][0]);
6119                 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[95][0]);
6120 
6121                 /* o11[0-3] */
6122                 {
6123                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
6124                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
6125                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
6126                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
6127 
6128                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
6129                     m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
6130 
6131                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
6132 
6133                     m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
6134                     m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
6135                     m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
6136                     m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
6137 
6138                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
6139                     m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
6140 
6141                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
6142 
6143                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
6144 
6145                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_20);
6146                     m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_20);
6147 
6148                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
6149                     m_count = _mm_cvtsi32_si128(i4_shift);
6150                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
6151                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
6152 
6153                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
6154                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
6155                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
6156                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
6157 
6158                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
6159 
6160                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
6161                     pi2_dst_scratch += out_stride;
6162 
6163                 }
6164 
6165                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]);
6166                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[97][0]);
6167                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[98][0]);
6168                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[99][0]);
6169                 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[100][0]);
6170                 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[101][0]);
6171                 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[102][0]);
6172                 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[103][0]);
6173 
6174                 /* o12[0-3] */
6175                 {
6176                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
6177                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
6178                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
6179                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
6180 
6181                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
6182                     m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
6183 
6184                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
6185 
6186                     m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
6187                     m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
6188                     m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
6189                     m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
6190 
6191                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
6192                     m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
6193 
6194                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
6195 
6196                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
6197 
6198                     m_temp_reg_31 = _mm_add_epi32(temp8, m_temp_reg_20);
6199                     m_temp_reg_30 = _mm_sub_epi32(temp8, m_temp_reg_20);
6200 
6201                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
6202                     m_count = _mm_cvtsi32_si128(i4_shift);
6203                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
6204                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
6205 
6206                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
6207                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
6208                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
6209                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
6210 
6211                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
6212 
6213                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
6214                     pi2_dst_scratch += out_stride;
6215 
6216                 }
6217 
6218                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]);
6219                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[105][0]);
6220                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[106][0]);
6221                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[107][0]);
6222                 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[108][0]);
6223                 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[109][0]);
6224                 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[110][0]);
6225                 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[111][0]);
6226 
6227                 /* o13[0-3] */
6228                 {
6229                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
6230                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
6231                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
6232                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
6233 
6234                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
6235                     m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
6236 
6237                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
6238 
6239                     m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
6240                     m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
6241                     m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
6242                     m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
6243 
6244                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
6245                     m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
6246 
6247                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
6248 
6249                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
6250 
6251                     m_temp_reg_31 = _mm_add_epi32(temp6, m_temp_reg_20);
6252                     m_temp_reg_30 = _mm_sub_epi32(temp6, m_temp_reg_20);
6253 
6254                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
6255                     m_count = _mm_cvtsi32_si128(i4_shift);
6256                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
6257                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
6258 
6259                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
6260                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
6261                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
6262                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
6263 
6264                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
6265 
6266                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
6267                     pi2_dst_scratch += out_stride;
6268                 }
6269 
6270                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]);
6271                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[113][0]);
6272                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[114][0]);
6273                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[115][0]);
6274                 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[116][0]);
6275                 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[117][0]);
6276                 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[118][0]);
6277                 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[119][0]);
6278 
6279                 /* o14[0-3] */
6280                 {
6281                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
6282                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
6283                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
6284                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
6285 
6286                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
6287                     m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
6288 
6289                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
6290 
6291                     m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
6292                     m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
6293                     m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
6294                     m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
6295 
6296                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
6297                     m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
6298 
6299                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
6300 
6301                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
6302 
6303                     m_temp_reg_31 = _mm_add_epi32(temp4, m_temp_reg_20);
6304                     m_temp_reg_30 = _mm_sub_epi32(temp4, m_temp_reg_20);
6305 
6306                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
6307                     m_count = _mm_cvtsi32_si128(i4_shift);
6308                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
6309                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
6310 
6311                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
6312                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
6313                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
6314                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
6315 
6316                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
6317 
6318                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
6319                     pi2_dst_scratch += out_stride;
6320 
6321                 }
6322 
6323                 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]);
6324                 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[121][0]);
6325                 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[122][0]);
6326                 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[123][0]);
6327                 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[124][0]);
6328                 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[125][0]);
6329                 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[126][0]);
6330                 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[127][0]);
6331 
6332                 /* o15[0-3] */
6333                 {
6334                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1);
6335                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2);
6336                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3);
6337                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4);
6338 
6339                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21);
6340                     m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23);
6341 
6342                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22);
6343 
6344                     m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5);
6345                     m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6);
6346                     m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7);
6347                     m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8);
6348 
6349                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41);
6350                     m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43);
6351 
6352                     m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42);
6353 
6354                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40);
6355 
6356                     m_temp_reg_31 = _mm_add_epi32(temp2, m_temp_reg_20);
6357                     m_temp_reg_30 = _mm_sub_epi32(temp2, m_temp_reg_20);
6358 
6359                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
6360                     m_count = _mm_cvtsi32_si128(i4_shift);
6361                     m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor);
6362                     m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor);
6363 
6364                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor);
6365                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor);
6366                     m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count);
6367                     m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count);
6368 
6369                     m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31);
6370 
6371                     _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30);
6372                     pi2_dst_scratch += 8;
6373                 }
6374 
6375             }
6376         }
6377 
6378         /* Transpose */
6379         {
6380 
6381             WORD16 *pi2_src_scratch = temp_ptr;
6382             WORD32 out_stride = dst_strd;
6383             WORD32 in_stride = 8;
6384 
6385             m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch);
6386             pi2_src_scratch += in_stride;
6387             m_temp_reg_31 = _mm_load_si128((__m128i *)pi2_src_scratch);
6388             pi2_src_scratch += in_stride;
6389             m_temp_reg_32 = _mm_load_si128((__m128i *)pi2_src_scratch);
6390             pi2_src_scratch += in_stride;
6391             m_temp_reg_33 = _mm_load_si128((__m128i *)pi2_src_scratch);
6392             pi2_src_scratch += in_stride;
6393             m_temp_reg_34 = _mm_load_si128((__m128i *)pi2_src_scratch);
6394             pi2_src_scratch += in_stride;
6395             m_temp_reg_35 = _mm_load_si128((__m128i *)pi2_src_scratch);
6396             pi2_src_scratch += in_stride;
6397             m_temp_reg_36 = _mm_load_si128((__m128i *)pi2_src_scratch);
6398             pi2_src_scratch += in_stride;
6399             m_temp_reg_37 = _mm_load_si128((__m128i *)pi2_src_scratch);
6400             pi2_src_scratch += 8;
6401 
6402             m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_src_scratch);
6403             pi2_src_scratch += in_stride;
6404             m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_src_scratch);
6405             pi2_src_scratch += in_stride;
6406             m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_src_scratch);
6407             pi2_src_scratch += in_stride;
6408             m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_src_scratch);
6409             pi2_src_scratch += in_stride;
6410             m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_src_scratch);
6411             pi2_src_scratch += in_stride;
6412             m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_src_scratch);
6413             pi2_src_scratch += in_stride;
6414             m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_src_scratch);
6415             pi2_src_scratch += in_stride;
6416             m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_src_scratch);
6417             pi2_src_scratch += 8;
6418 
6419 
6420             m_temp_reg_40 = _mm_unpacklo_epi16(m_temp_reg_30, m_temp_reg_31);
6421             m_temp_reg_41 = _mm_unpackhi_epi16(m_temp_reg_31, m_temp_reg_30);
6422 
6423             m_temp_reg_42 = _mm_unpacklo_epi16(m_temp_reg_32, m_temp_reg_33);
6424             m_temp_reg_43 = _mm_unpackhi_epi16(m_temp_reg_33, m_temp_reg_32);
6425 
6426             m_temp_reg_44 = _mm_unpacklo_epi16(m_temp_reg_34, m_temp_reg_35);
6427             m_temp_reg_45 = _mm_unpackhi_epi16(m_temp_reg_35, m_temp_reg_34);
6428 
6429             m_temp_reg_46 = _mm_unpacklo_epi16(m_temp_reg_36, m_temp_reg_37);
6430             m_temp_reg_47 = _mm_unpackhi_epi16(m_temp_reg_37, m_temp_reg_36);
6431 
6432             m_temp_reg_80 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71);
6433             m_temp_reg_81 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_70);
6434 
6435             m_temp_reg_82 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73);
6436             m_temp_reg_83 = _mm_unpackhi_epi16(m_temp_reg_73, m_temp_reg_72);
6437 
6438             m_temp_reg_84 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75);
6439             m_temp_reg_85 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_74);
6440 
6441             m_temp_reg_86 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77);
6442             m_temp_reg_87 = _mm_unpackhi_epi16(m_temp_reg_77, m_temp_reg_76);
6443 
6444 
6445             m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_40, m_temp_reg_42);
6446             m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_40, m_temp_reg_42);
6447 
6448             m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_44, m_temp_reg_46);
6449             m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_44, m_temp_reg_46);
6450 
6451             m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_80, m_temp_reg_82);
6452             m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_80, m_temp_reg_82);
6453 
6454             m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_84, m_temp_reg_86);
6455             m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_84, m_temp_reg_86);
6456 
6457             m_temp_reg_90 = _mm_unpacklo_epi32(m_temp_reg_43, m_temp_reg_41);
6458             m_temp_reg_91 = _mm_unpackhi_epi32(m_temp_reg_43, m_temp_reg_41);
6459 
6460             m_temp_reg_92 = _mm_unpacklo_epi32(m_temp_reg_47, m_temp_reg_45);
6461             m_temp_reg_93 = _mm_unpackhi_epi32(m_temp_reg_47, m_temp_reg_45);
6462 
6463             m_temp_reg_94 = _mm_unpacklo_epi32(m_temp_reg_83, m_temp_reg_81);
6464             m_temp_reg_95 = _mm_unpackhi_epi32(m_temp_reg_83, m_temp_reg_81);
6465 
6466             m_temp_reg_96 = _mm_unpacklo_epi32(m_temp_reg_87, m_temp_reg_85);
6467             m_temp_reg_97 = _mm_unpackhi_epi32(m_temp_reg_87, m_temp_reg_85);
6468 
6469 
6470             m_temp_reg_30 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_2);       // row0 = 0-7
6471             m_temp_reg_31 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_2);       // row1 = 0-7
6472 
6473             m_temp_reg_32 = _mm_unpacklo_epi64(m_temp_reg_92, m_temp_reg_90);     // row0=24-31
6474             m_temp_reg_33 = _mm_unpackhi_epi64(m_temp_reg_92, m_temp_reg_90);     // row1=24-31
6475 
6476             m_temp_reg_34 = _mm_unpacklo_epi64(m_temp_reg_4, m_temp_reg_6);       // row0=8-15
6477             m_temp_reg_35 = _mm_unpackhi_epi64(m_temp_reg_4, m_temp_reg_6);       // row1=8-15
6478 
6479             m_temp_reg_36 = _mm_unpacklo_epi64(m_temp_reg_96, m_temp_reg_94);     // row0=16-23
6480             m_temp_reg_37 = _mm_unpackhi_epi64(m_temp_reg_96, m_temp_reg_94);     // row1=16-23
6481 
6482             m_temp_reg_80 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_3);      // row2 =0-7
6483             m_temp_reg_81 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_3);      // row3 =0-7
6484 
6485             m_temp_reg_82 = _mm_unpacklo_epi64(m_temp_reg_93, m_temp_reg_91);    // row2=24-31
6486             m_temp_reg_83 = _mm_unpackhi_epi64(m_temp_reg_93, m_temp_reg_91);    // row3=24-31
6487 
6488             m_temp_reg_84 = _mm_unpacklo_epi64(m_temp_reg_5, m_temp_reg_7);      // row2=8-15
6489             m_temp_reg_85 = _mm_unpackhi_epi64(m_temp_reg_5, m_temp_reg_7);      // row3=8-15
6490 
6491             m_temp_reg_86 = _mm_unpacklo_epi64(m_temp_reg_97, m_temp_reg_95);    // row2=16-23
6492             m_temp_reg_87 = _mm_unpackhi_epi64(m_temp_reg_97, m_temp_reg_95);    // row3=16-23
6493 
6494             m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred);
6495 
6496             //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_20);
6497             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
6498 
6499             m_temp_reg_40 = _mm_add_epi16(m_temp_reg_30, m_temp_reg_0);
6500             m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
6501 
6502             //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
6503             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
6504 
6505             m_temp_reg_44 = _mm_add_epi16(m_temp_reg_34, m_temp_reg_0);
6506             m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
6507 
6508             _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
6509 
6510             m_temp_reg_20 = _mm_loadu_si128((__m128i *)(pu1_pred + 16));
6511 
6512             //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_20);
6513             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
6514 
6515             m_temp_reg_40 = _mm_add_epi16(m_temp_reg_36, m_temp_reg_0);
6516             m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
6517 
6518             //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
6519             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
6520 
6521             m_temp_reg_44 = _mm_add_epi16(m_temp_reg_32, m_temp_reg_0);
6522             m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
6523 
6524             _mm_storeu_si128((__m128i *)(pu1_dst + 16), m_temp_reg_20);
6525             pu1_dst += out_stride;
6526             pu1_pred += pred_strd;
6527 
6528 
6529             m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred);
6530 
6531             //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_20);
6532             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
6533 
6534             m_temp_reg_40 = _mm_add_epi16(m_temp_reg_31, m_temp_reg_0);
6535             m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
6536 
6537             //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
6538             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
6539 
6540             m_temp_reg_44 = _mm_add_epi16(m_temp_reg_35, m_temp_reg_0);
6541             m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
6542 
6543             _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
6544 
6545             m_temp_reg_20 = _mm_loadu_si128((__m128i *)(pu1_pred + 16));
6546 
6547             //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_20);
6548             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
6549 
6550             m_temp_reg_40 = _mm_add_epi16(m_temp_reg_37, m_temp_reg_0);
6551             m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
6552 
6553             //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
6554             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
6555 
6556             m_temp_reg_44 = _mm_add_epi16(m_temp_reg_33, m_temp_reg_0);
6557             m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
6558 
6559             _mm_storeu_si128((__m128i *)(pu1_dst + 16), m_temp_reg_20);
6560             pu1_dst += out_stride;
6561             pu1_pred += pred_strd;
6562 
6563             m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred);
6564 
6565             //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_20);
6566             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
6567 
6568             m_temp_reg_40 = _mm_add_epi16(m_temp_reg_80, m_temp_reg_0);
6569             m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
6570 
6571             //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
6572             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
6573 
6574             m_temp_reg_44 = _mm_add_epi16(m_temp_reg_84, m_temp_reg_0);
6575             m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
6576 
6577             _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
6578 
6579             m_temp_reg_20 = _mm_loadu_si128((__m128i *)(pu1_pred + 16));
6580 
6581             //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_20);
6582             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
6583 
6584             m_temp_reg_40 = _mm_add_epi16(m_temp_reg_86, m_temp_reg_0);
6585             m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
6586 
6587             //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
6588             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
6589 
6590             m_temp_reg_44 = _mm_add_epi16(m_temp_reg_82, m_temp_reg_0);
6591             m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
6592 
6593             _mm_storeu_si128((__m128i *)(pu1_dst + 16), m_temp_reg_20);
6594             pu1_dst += out_stride;
6595             pu1_pred += pred_strd;
6596 
6597 
6598             m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred);
6599 
6600             //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_20);
6601             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
6602 
6603             m_temp_reg_40 = _mm_add_epi16(m_temp_reg_81, m_temp_reg_0);
6604             m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
6605 
6606             //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
6607             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
6608 
6609             m_temp_reg_44 = _mm_add_epi16(m_temp_reg_85, m_temp_reg_0);
6610             m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
6611 
6612             _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20);
6613 
6614             m_temp_reg_20 = _mm_loadu_si128((__m128i *)(pu1_pred + 16));
6615 
6616             //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_20);
6617             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg);
6618 
6619             m_temp_reg_40 = _mm_add_epi16(m_temp_reg_87, m_temp_reg_0);
6620             m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8);
6621 
6622             //m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0);
6623             m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg);
6624 
6625             m_temp_reg_44 = _mm_add_epi16(m_temp_reg_83, m_temp_reg_0);
6626             m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44);
6627 
6628             _mm_storeu_si128((__m128i *)(pu1_dst + 16), m_temp_reg_20);
6629             pu1_dst += out_stride;
6630             pu1_pred += pred_strd;
6631 
6632         }
6633         pi2_tmp += 4;
6634     }
6635 }
6636 
6637