1 /******************************************************************************
2  *
3  * Copyright (C) 2015 The Android Open Source Project
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at:
8  *
9  * http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  *
17  *****************************************************************************
18  * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
19 */
20 
21 /**
22  *******************************************************************************
23  * @file
24  *  impeg2_itrans_recon_x86_intr.c
25  *
26  * @brief
27  *  Contains function definitions for inverse  quantization, inverse
28  * transform and reconstruction
29  *
30  * @author
31  *  100470
32  *  100592 (edited by)
33  *
34  * @par List of Functions:
35  *  - impeg2_idct_recon_sse42()
36  *
37  * @remarks
38  *  None
39  *
40  *******************************************************************************
41  */
42 #include <stdio.h>
43 #include <string.h>
44 #include "iv_datatypedef.h"
45 #include "impeg2_macros.h"
46 #include "impeg2_defs.h"
47 #include "impeg2_globals.h"
48 
49 #include <immintrin.h>
50 #include <emmintrin.h>
51 #include <smmintrin.h>
52 #include <tmmintrin.h>
53 
54 
55 /**
56  *******************************************************************************
57  *
58  * @brief
59  *  This function performs inverse quantization, inverse transform and
60  * reconstruction for an 8x8 input block
61  *
62  * @par Description:
63  *  Performs inverse quantization, inverse transform, adds the
64  * prediction data and clips the output to 8 bits
65  *
66  * @param[in] pi2_src
67  *  Input 8x8 coefficients
68  *
69  * @param[in] pi2_tmp
70  *  Temporary 8x8 buffer for storing inverse transform
71  *  1st stage output (marked UNUSED in this implementation)
72  *
73  * @param[in] pu1_pred
74  *  Prediction 8x8 block
75  *
76  * @param[out] pu1_dst
77  *  Output 8x8 block
78  *
79  * @param[in] src_strd
80  *  Input stride
81  *
82  * @param[in] pred_strd
83  *  Prediction stride
84  *
85  * @param[in] dst_strd
86  *  Output Stride
87  *
88  * @param[in] zero_cols
89  *  Bit mask of zero columns in pi2_src. If every bit of
90  *  the upper nibble (0xF0) is set, only the first four
91  *  coefficients of each input row are used by the
92  *  first stage
93  *
94  * @param[in] zero_rows
95  *  Bit mask of zero rows in pi2_src. If every bit of
96  *  the upper nibble (0xF0) is set, a shorter first-stage
97  *  computation is selected (e.g. the odd-part terms of
98  *  input rows 5 and 7 are skipped)
99  *
100  * @returns  Void
101  *
102  * @remarks
103  *  None
104  *
105  *******************************************************************************
106  */
107 
108 
109 void impeg2_idct_recon_sse42(WORD16 *pi2_src,
110                                   WORD16 *pi2_tmp,
111                                   UWORD8 *pu1_pred,
112                                   UWORD8 *pu1_dst,
113                                   WORD32 src_strd,
114                                   WORD32 pred_strd,
115                                   WORD32 dst_strd,
116                                   WORD32 zero_cols,
117                                   WORD32 zero_rows)
118 {
119     __m128i m_temp_reg_0;
120     __m128i m_temp_reg_1;
121     __m128i m_temp_reg_2;
122     __m128i m_temp_reg_3;
123     __m128i m_temp_reg_5;
124     __m128i m_temp_reg_6;
125     __m128i m_temp_reg_7;
126     __m128i m_temp_reg_4;
127     __m128i m_temp_reg_10;
128     __m128i m_temp_reg_11;
129     __m128i m_temp_reg_12;
130     __m128i m_temp_reg_13;
131     __m128i m_temp_reg_14;
132     __m128i m_temp_reg_15;
133     __m128i m_temp_reg_16;
134     __m128i m_temp_reg_17;
135     __m128i m_temp_reg_20;
136     __m128i m_temp_reg_21;
137     __m128i m_temp_reg_22;
138     __m128i m_temp_reg_23;
139     __m128i m_temp_reg_24;
140     __m128i m_temp_reg_25;
141     __m128i m_temp_reg_26;
142     __m128i m_temp_reg_27;
143     __m128i m_temp_reg_30;
144     __m128i m_temp_reg_31;
145     __m128i m_temp_reg_32;
146     __m128i m_temp_reg_33;
147     __m128i m_temp_reg_34;
148     __m128i m_temp_reg_35;
149     __m128i m_temp_reg_36;
150     __m128i m_temp_reg_37;
151     __m128i m_temp_reg_40;
152     __m128i m_temp_reg_41;
153     __m128i m_temp_reg_42;
154     __m128i m_temp_reg_43;
155     __m128i m_temp_reg_44;
156     __m128i m_temp_reg_45;
157     __m128i m_temp_reg_46;
158     __m128i m_temp_reg_47;
159     __m128i m_temp_reg_50;
160     __m128i m_temp_reg_51;
161     __m128i m_temp_reg_52;
162     __m128i m_temp_reg_53;
163     __m128i m_temp_reg_54;
164     __m128i m_temp_reg_55;
165     __m128i m_temp_reg_56;
166     __m128i m_temp_reg_57;
167     __m128i m_temp_reg_60;
168     __m128i m_temp_reg_61;
169     __m128i m_temp_reg_62;
170     __m128i m_temp_reg_63;
171     __m128i m_temp_reg_64;
172     __m128i m_temp_reg_65;
173     __m128i m_temp_reg_66;
174     __m128i m_temp_reg_67;
175     __m128i m_temp_reg_70;
176     __m128i m_temp_reg_71;
177     __m128i m_temp_reg_72;
178     __m128i m_temp_reg_73;
179     __m128i m_temp_reg_74;
180     __m128i m_temp_reg_75;
181     __m128i m_temp_reg_76;
182     __m128i m_temp_reg_77;
183     __m128i m_coeff1, m_coeff2, m_coeff3, m_coeff4;
184 
185     WORD32 check_row_stage_1;   /* Lokesh */
186     WORD32 check_row_stage_2;   /* Lokesh */
187 
188     __m128i m_rdng_factor;
189     WORD32 i4_shift = IDCT_STG1_SHIFT;
190     UNUSED(pi2_tmp);
191     check_row_stage_1   = ((zero_rows & 0xF0) != 0xF0) ? 1 : 0;
192     check_row_stage_2   = ((zero_cols & 0xF0) != 0xF0) ? 1 : 0;
193 
194     m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src);
195     pi2_src += src_strd;
196     m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src);
197     pi2_src += src_strd;
198     m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src);
199     pi2_src += src_strd;
200     m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src);
201     pi2_src += src_strd;
202 
203     m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_src);
204     pi2_src += src_strd;
205     m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_src);
206     pi2_src += src_strd;
207     m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_src);
208     pi2_src += src_strd;
209     m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_src);
210 
211     if(!check_row_stage_2)
212     {
213         if(!check_row_stage_1)
214         {
215             /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
216             /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
217             {
218                 //Interleaving 0,4 row in 0 , 1 Rishab
219                 /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/
220                 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[3][0]);
221                 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[0][0]);
222 
223                 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
224 
225                 m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
226                 m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
227 
228             }
229 
230 
231             /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
232             /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
233             /* as upper 8 bytes are zeros so m_temp_reg_15 and m_temp_reg_17 are not used*/
234             {
235 
236                 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
237                 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36
238 
239                 /* Combining instructions to eliminate them based on zero_rows : Lokesh */
240                 //Interleaving 2,6 row in 4, 5 Rishab
241                 m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
242 
243                 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
244                 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
245 
246 
247                 /* Loading coeff for computing o0, o1, o2 and o3 in the next block */
248 
249                 m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[2][0]);
250                 m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[3][0]);
251 
252                 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[0][0]);
253                 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[1][0]);
254 
255 
256 
257                 /* e */
258 
259                 /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
260                 /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
261                 /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
262                 /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
263                 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
264                 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
265 
266                 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
267                 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
268 
269             }
270 
271             /* o */
272             {
273 
274                 /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
275                 {
276 
277                     m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
278                     //o0:1B*89+3B*75,5B*50+7B*18
279                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
280 
281                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
282                     m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
283 
284 
285 
286                     /* Column 0 of destination computed here */
287                     /* It is stored in m_temp_reg_50 */
288                     /* Column 7 of destination computed here */
289                     /* It is stored in m_temp_reg_57 */
290                     /* Upper 8 bytes of both registers are zero due to zero_cols*/
291 
292 
293 
294                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
295                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
296 
297                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
298                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
299 
300                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
301                     m_temp_reg_63 = _mm_setzero_si128();
302                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
303 
304                     //o1:1B*75-3B*18,5B*89+7B*50
305                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
306 
307                     m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
308                     m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
309 
310                     /* Loading coeff for computing o2  in the next block */
311 
312                     m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[4][0]);
313                     m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[5][0]);
314 
315                     /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
316 
317 
318 
319                     /* Column 1 of destination computed here */
320                     /* It is stored in m_temp_reg_51 */
321                     /* Column 6 of destination computed here */
322                     /* It is stored in m_temp_reg_56 */
323 
324                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
325                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
326 
327                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
328                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
329 
330                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
331                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
332 
333                     //o2:1B*50-3B*89,5B*18+7B*75
334                     m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
335 
336                     m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
337                     m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
338 
339 
340                     /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
341 
342                     /* Loading coeff for computing o3  in the next block */
343 
344                     m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[6][0]);
345                     m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[7][0]);
346 
347 
348 
349                     /* Column 2 of destination computed here */
350                     /* It is stored in m_temp_reg_52 */
351                     /* Column 5 of destination computed here */
352                     /* It is stored in m_temp_reg_55 */
353 
354                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
355                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
356 
357                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
358                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
359 
360                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
361                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
362 
363                     //o3:1B*18-3B*50,5B*75-7B*89
364                     m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
365 
366                     m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
367                     m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
368 
369 
370 
371                     /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
372 
373 
374 
375                     /* Column 3 of destination computed here */
376                     /* It is stored in m_temp_reg_53 */
377                     /* Column 4 of destination computed here */
378                     /* It is stored in m_temp_reg_54 */
379 
380                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
381                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
382 
383                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
384                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
385 
386                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
387                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
388 
389 
390                     m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
391                     m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
392                 }
393             }
394 
395             /* Transpose of the destination 8x8 matrix done here */
396             /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
397             /* respectively */
398             {
399                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
400                 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
401                 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
402                 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
403 
404                 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
405                 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
406 
407                 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
408                 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
409 
410                 m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
411                 m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
412                 m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
413                 m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
414 
415                 m_temp_reg_54 = _mm_setzero_si128();
416                 m_temp_reg_55 = _mm_setzero_si128();
417                 m_temp_reg_56 = _mm_setzero_si128();
418                 m_temp_reg_57 = _mm_setzero_si128();
419             }
420         }
421         else
422         {
423             /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
424             /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
425             {
426                 //Interleaving 0,4 row in 0 , 1 Rishab
427                 /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/
428                 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[3][0]);
429                 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[0][0]);
430 
431                 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
432 
433                 m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
434                 m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
435 
436             }
437 
438 
439             /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
440             /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
441             /* as upper 8 bytes are zeros so m_temp_reg_15 and m_temp_reg_17 are not used*/
442             {
443 
444                 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
445                 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36
446 
447                 /* Combining instructions to eliminate them based on zero_rows : Lokesh */
448                 //Interleaving 2,6 row in 4, 5 Rishab
449                 m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
450 
451                 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
452                 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
453 
454 
455                 /* Loading coeff for computing o0, o1, o2 and o3 in the next block */
456 
457                 m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[2][0]);
458                 m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[3][0]);
459 
460                 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[0][0]);
461                 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[1][0]);
462 
463 
464 
465                 /* e */
466 
467                 /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
468                 /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
469                 /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
470                 /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
471                 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
472                 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
473 
474                 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
475                 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
476 
477             }
478 
479             /* o */
480             {
481 
482                 /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
483                 {
484 
485                     m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
486                     m_temp_reg_64 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
487                     //o0:1B*89+3B*75,5B*50+7B*18
488                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
489                     m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
490 
491                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
492                     m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
493 
494                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
495 
496 
497 
498                     /* Column 0 of destination computed here */
499                     /* It is stored in m_temp_reg_50 */
500                     /* Column 7 of destination computed here */
501                     /* It is stored in m_temp_reg_57 */
502                     /* Upper 8 bytes of both registers are zero due to zero_cols*/
503 
504 
505 
506                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
507                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
508 
509                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
510                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
511 
512                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
513                     m_temp_reg_63 = _mm_setzero_si128();
514                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
515 
516                     //o1:1B*75-3B*18,5B*89+7B*50
517                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
518                     m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
519 
520                     m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
521                     m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
522 
523                     /* Loading coeff for computing o2  in the next block */
524 
525                     m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[4][0]);
526                     m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[5][0]);
527 
528                     /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
529                     m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26);
530 
531 
532 
533                     /* Column 1 of destination computed here */
534                     /* It is stored in m_temp_reg_51 */
535                     /* Column 6 of destination computed here */
536                     /* It is stored in m_temp_reg_56 */
537 
538                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
539                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
540 
541                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
542                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
543 
544                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
545                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
546 
547                     //o2:1B*50-3B*89,5B*18+7B*75
548                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
549                     m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
550 
551                     m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
552                     m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
553 
554 
555                     /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
556 
557                     /* Loading coeff for computing o3  in the next block */
558 
559                     m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[6][0]);
560                     m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[7][0]);
561 
562                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
563 
564 
565                     /* Column 2 of destination computed here */
566                     /* It is stored in m_temp_reg_52 */
567                     /* Column 5 of destination computed here */
568                     /* It is stored in m_temp_reg_55 */
569 
570                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
571                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
572 
573                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
574                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
575 
576                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
577                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
578 
579                     //o3:1B*18-3B*50,5B*75-7B*89
580                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
581                     m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
582 
583                     m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
584                     m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
585 
586 
587 
588                     /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
589 
590                     m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26);
591 
592 
593                     /* Column 3 of destination computed here */
594                     /* It is stored in m_temp_reg_53 */
595                     /* Column 4 of destination computed here */
596                     /* It is stored in m_temp_reg_54 */
597 
598                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
599                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
600 
601                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
602                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
603 
604                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
605                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
606 
607 
608                     m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
609                     m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63);
610                 }
611             }
612 
613             /* Transpose of the destination 8x8 matrix done here */
614             /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
615             /* respectively */
616             {
617                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
618                 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
619                 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
620                 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
621 
622                 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
623                 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
624                 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
625                 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
626 
627                 m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
628                 m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
629                 m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
630                 m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
631 
632                 m_temp_reg_54 = _mm_setzero_si128();
633                 m_temp_reg_55 = _mm_setzero_si128();
634                 m_temp_reg_56 = _mm_setzero_si128();
635                 m_temp_reg_57 = _mm_setzero_si128();
636             }
637         }
638 
639         /* Stage 2 */
640         i4_shift = IDCT_STG2_SHIFT;
641         {
642             /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
643             /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
644             {
645                 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[0][0]); //add
646                 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[3][0]); //sub
647 
648                 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_54);
649                 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_54);
650 
651                 m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
652                 m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
653                 m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
654                 m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
655 
656 
657                 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[1][0]);
658                 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[2][0]);
659             }
660 
661 
662             /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
663             /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
664             {
665 
666                 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_56);
667                 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_56);
668 
669 
670                 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
671                 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
672                 m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
673                 m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
674 
675                 /* Loading coeff for computing o0 in the next block */
676                 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[0][0]);
677 
678 
679                 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_51, m_temp_reg_53);
680                 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_51, m_temp_reg_53);
681 
682 
683 
684                 /* e */
685 
686                 /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
687                 /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
688                 /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
689                 /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
690                 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
691                 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
692 
693                 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
694                 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
695 
696                 m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
697                 m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);
698 
699                 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
700                 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);
701 
702             }
703 
704             /* o */
705             {
706 
707                 /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
708                 {
709                     //o0:1B*89+3B*75,1T*89+3T*75
710                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
711                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
712 
713                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
714                     m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
715                     /* Loading coeff for computing o1 in the next block */
716                     m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[2][0]);
717 
718 
719 
720                     /* Column 0 of destination computed here */
721                     /* It is stored in m_temp_reg_50 */
722                     /* Column 7 of destination computed here */
723                     /* It is stored in m_temp_reg_57 */
724 
725                     m_temp_reg_2 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
726                     m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
727 
728                     m_temp_reg_3 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
729                     m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
730 
731                     m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
732                     m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
733                     m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
734                     m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
735 
736                     //o1:1B*75-3B*18,1T*75-3T*18
737                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
738                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
739 
740                     m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
741                     m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
742                     m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
743                     m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
744 
745                     m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
746                     m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
747 
748 
749                     /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
750 
751 
752                     /* Loading coeff for computing o2  in the next block */
753                     m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[4][0]);
754 
755 
756 
757                     /* Column 1 of destination computed here */
758                     /* It is stored in m_temp_reg_51 */
759                     /* Column 6 of destination computed here */
760                     /* It is stored in m_temp_reg_56 */
761 
762                     m_temp_reg_2 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
763                     m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
764 
765                     m_temp_reg_3 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
766                     m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);
767 
768                     m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
769                     m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
770                     m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
771                     m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
772 
773                     //o2:1B*50-3B*89,1T*50-3T*89
774                     m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
775                     m_temp_reg_35 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
776 
777                     m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
778                     m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
779                     m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
780                     m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
781 
782                     m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
783                     m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
784 
785 
786                     /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
787 
788                     /* Loading coeff for computing o3  in the next block */
789 
790                     m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[6][0]);
791 
792 
793                     /* Column 2 of destination computed here */
794                     /* It is stored in m_temp_reg_52 */
795                     /* Column 5 of destination computed here */
796                     /* It is stored in m_temp_reg_55 */
797 
798                     m_temp_reg_2 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
799                     m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
800 
801                     m_temp_reg_3 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
802                     m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);
803 
804                     m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
805                     m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
806                     m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
807                     m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
808 
809                     //o3:1B*18-3B*50,1T*18-3T*50
810                     m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
811                     m_temp_reg_37 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
812 
813                     m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
814                     m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
815                     m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
816                     m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
817 
818 
819                     m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
820                     m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
821 
822 
823 
824                     /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
825 
826 
827                     /* Column 3 of destination computed here */
828                     /* It is stored in m_temp_reg_53 */
829                     /* Column 4 of destination computed here */
830                     /* It is stored in m_temp_reg_54 */
831 
832                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
833                     m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
834 
835                     m_temp_reg_21 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
836                     m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);
837 
838                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_rdng_factor);
839                     m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_rdng_factor);
840                     m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_rdng_factor);
841                     m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_rdng_factor);
842 
843                     m_temp_reg_20 = _mm_srai_epi32(m_temp_reg_20, i4_shift);
844                     m_temp_reg_21 = _mm_srai_epi32(m_temp_reg_21, i4_shift);
845                     m_temp_reg_22 = _mm_srai_epi32(m_temp_reg_22, i4_shift);
846                     m_temp_reg_23 = _mm_srai_epi32(m_temp_reg_23, i4_shift);
847 
848                     m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
849                     m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
850                 }
851             }
852 
853             /* Transpose of the destination 8x8 matrix done here */
854             /* and ultimately stored in registers m_temp_reg_10 to m_temp_reg_17 */
855             /* respectively */
856             {
857                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
858                 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
859                 m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
860                 m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
861                 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
862                 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
863                 m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
864                 m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
865 
866                 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
867                 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
868                 m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
869                 m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
870                 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
871                 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
872                 m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
873                 m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
874                 m_temp_reg_10 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
875                 m_temp_reg_11 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
876                 m_temp_reg_12 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
877                 m_temp_reg_13 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
878 
879                 m_temp_reg_14 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
880                 m_temp_reg_15 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
881                 m_temp_reg_16 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
882                 m_temp_reg_17 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
883             }
884 
885             /* Recon and store */
886             {
887                 m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred);
888                 pu1_pred += pred_strd;
889                 m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred);
890                 pu1_pred += pred_strd;
891                 m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred);
892                 pu1_pred += pred_strd;
893                 m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred);
894                 pu1_pred += pred_strd;
895                 m_temp_reg_4 = _mm_loadl_epi64((__m128i *)pu1_pred);
896                 pu1_pred += pred_strd;
897                 m_temp_reg_5 = _mm_loadl_epi64((__m128i *)pu1_pred);
898                 pu1_pred += pred_strd;
899                 m_temp_reg_6 = _mm_loadl_epi64((__m128i *)pu1_pred);
900                 pu1_pred += pred_strd;
901                 m_temp_reg_7 = _mm_loadl_epi64((__m128i *)pu1_pred);
902 
903                 m_temp_reg_50 = _mm_setzero_si128();
904                 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, m_temp_reg_50);
905                 m_temp_reg_1 = _mm_unpacklo_epi8(m_temp_reg_1, m_temp_reg_50);
906                 m_temp_reg_2 = _mm_unpacklo_epi8(m_temp_reg_2, m_temp_reg_50);
907                 m_temp_reg_3 = _mm_unpacklo_epi8(m_temp_reg_3, m_temp_reg_50);
908                 m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_50);
909                 m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_50);
910                 m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_6, m_temp_reg_50);
911                 m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_7, m_temp_reg_50);
912 
913                 m_temp_reg_50 = _mm_add_epi16(m_temp_reg_10, m_temp_reg_0);
914                 m_temp_reg_51 = _mm_add_epi16(m_temp_reg_11, m_temp_reg_1);
915                 m_temp_reg_52 = _mm_add_epi16(m_temp_reg_12, m_temp_reg_2);
916                 m_temp_reg_53 = _mm_add_epi16(m_temp_reg_13, m_temp_reg_3);
917                 m_temp_reg_54 = _mm_add_epi16(m_temp_reg_14, m_temp_reg_4);
918                 m_temp_reg_55 = _mm_add_epi16(m_temp_reg_15, m_temp_reg_5);
919                 m_temp_reg_56 = _mm_add_epi16(m_temp_reg_16, m_temp_reg_6);
920                 m_temp_reg_57 = _mm_add_epi16(m_temp_reg_17, m_temp_reg_7);
921 
922                 m_temp_reg_50 = _mm_packus_epi16(m_temp_reg_50, m_temp_reg_50);
923                 m_temp_reg_51 = _mm_packus_epi16(m_temp_reg_51, m_temp_reg_51);
924                 m_temp_reg_52 = _mm_packus_epi16(m_temp_reg_52, m_temp_reg_52);
925                 m_temp_reg_53 = _mm_packus_epi16(m_temp_reg_53, m_temp_reg_53);
926                 m_temp_reg_54 = _mm_packus_epi16(m_temp_reg_54, m_temp_reg_54);
927                 m_temp_reg_55 = _mm_packus_epi16(m_temp_reg_55, m_temp_reg_55);
928                 m_temp_reg_56 = _mm_packus_epi16(m_temp_reg_56, m_temp_reg_56);
929                 m_temp_reg_57 = _mm_packus_epi16(m_temp_reg_57, m_temp_reg_57);
930 
931                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_50);
932                 pu1_dst += dst_strd;
933                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_51);
934                 pu1_dst += dst_strd;
935                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_52);
936                 pu1_dst += dst_strd;
937                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_53);
938                 pu1_dst += dst_strd;
939                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_54);
940                 pu1_dst += dst_strd;
941                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_55);
942                 pu1_dst += dst_strd;
943                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_56);
944                 pu1_dst += dst_strd;
945                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_57);
946                 pu1_dst += dst_strd;
947             }
948         }
949     }
950     else
951 
952     {
953 
954         /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
955         /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
956         if(!check_row_stage_1)
957         {
958             /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
959             /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
960             {
961                 //Interleaving 0,4 row in 0 , 1 Rishab
962                 /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/
963                 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[3][0]);
964                 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[0][0]);
965 
966                 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
967                 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74);
968 
969                 m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
970                 m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
971 
972 
973                 m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
974                 m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
975             }
976 
977 
978             /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
979             /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
980             {
981 
982                 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
983                 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36
984 
985                 /* Combining instructions to eliminate them based on zero_rows : Lokesh */
986                 //Interleaving 2,6 row in 4, 5 Rishab
987                 m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
988                 m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76);
989 
990                 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
991                 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
992 
993                 m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_5, m_coeff1);
994                 m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);
995 
996 
997 
998                 /* Loading coeff for computing o0, o1, o2 and o3 in the next block */
999 
1000                 m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[2][0]);
1001                 //m_coeff4 = _mm_loadu_si128((__m128i *) &gai2_impeg2_idct_odd_8_q15[3][0]);
1002 
1003                 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[0][0]);
1004                 //m_coeff2 = _mm_loadu_si128((__m128i *) &gai2_impeg2_idct_odd_8_q15[1][0]);
1005 
1006             }
1007 
1008             /* e */
1009             {
1010                 /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
1011                 /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
1012                 /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
1013                 /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
1014                 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
1015                 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
1016 
1017                 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
1018                 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
1019 
1020                 m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
1021                 m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);
1022 
1023                 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
1024                 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);
1025 
1026             }
1027 
1028             /* o */
1029             {
1030 
1031                 /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
1032                 {
1033 
1034                     m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
1035                     m_temp_reg_61 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
1036                     //o0:1B*89+3B*75,1T*89+3T*75
1037                     m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
1038                     m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
1039 
1040                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
1041                     m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
1042 
1043                 }
1044 
1045                 /* Column 0 of destination computed here */
1046                 /* It is stored in m_temp_reg_50 */
1047                 /* Column 7 of destination computed here */
1048                 /* It is stored in m_temp_reg_57 */
1049                 {
1050 
1051 
1052                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
1053                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
1054 
1055                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
1056                     m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
1057 
1058                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
1059                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
1060                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
1061                     m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
1062 
1063                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
1064                     m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
1065                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
1066                     m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
1067 
1068                     //o1:1B*75-3B*18,1T*75-3T*18 (rows 5 and 7 are not used in this branch)
1069                     m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
1070                     m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
1071 
1072                     m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
1073                     m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
1074 
1075                     /* Loading coeff for computing o2  in the next block */
1076 
1077                     m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[4][0]);
1078 
1079                 }
1080 
1081                 /* Column 1 of destination computed here */
1082                 /* It is stored in m_temp_reg_51 */
1083                 /* Column 6 of destination computed here */
1084                 /* It is stored in m_temp_reg_56 */
1085                 {
1086                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
1087                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
1088 
1089                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
1090                     m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);
1091 
1092                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
1093                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
1094                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
1095                     m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
1096 
1097                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
1098                     m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
1099                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
1100                     m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
1101 
1102                     //o2:1B*50-3B*89,1T*50-3T*89
1103                     m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
1104                     m_temp_reg_35 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
1105 
1106                     m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
1107                     m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
1108 
1109 
1110                     /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
1111 
1112 
1113                     /* Loading coeff for computing o3  in the next block */
1114 
1115                     m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[6][0]);
1116 
1117                 }
1118 
1119                 /* Column 2 of destination computed here */
1120                 /* It is stored in m_temp_reg_52 */
1121                 /* Column 5 of destination computed here */
1122                 /* It is stored in m_temp_reg_55 */
1123                 {
1124                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
1125                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
1126 
1127                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
1128                     m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);
1129 
1130                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
1131                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
1132                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
1133                     m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
1134 
1135                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
1136                     m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
1137                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
1138                     m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
1139 
1140                     //o3:1B*18-3B*50,1T*18-3T*50
1141                     m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
1142                     m_temp_reg_37 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
1143 
1144                     m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
1145                     m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
1146 
1147 
1148 
1149                     /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
1150 
1151 
1152                 }
1153 
1154                 /* Column 3 of destination computed here */
1155                 /* It is stored in m_temp_reg_53 */
1156                 /* Column 4 of destination computed here */
1157                 /* It is stored in m_temp_reg_54 */
1158                 {
1159                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
1160                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
1161 
1162                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
1163                     m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);
1164 
1165                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
1166                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
1167                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
1168                     m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
1169 
1170                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
1171                     m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
1172                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
1173                     m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
1174 
1175                     m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
1176                     m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
1177                 }
1178             }
1179 
1180             /* Transpose of the destination 8x8 matrix done here */
1181             /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
1182             /* respectively */
1183             {
1184 
1185 
1186                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
1187                 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
1188                 m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
1189                 m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
1190                 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
1191                 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
1192                 m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
1193                 m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
1194 
1195                 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
1196                 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
1197                 m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
1198                 m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
1199                 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
1200                 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
1201                 m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
1202                 m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
1203 
1204                 m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
1205                 m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
1206                 m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
1207                 m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
1208 
1209                 m_temp_reg_54 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
1210                 m_temp_reg_55 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
1211                 m_temp_reg_56 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
1212                 m_temp_reg_57 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
1213             }
1214         }
1215         else
1216         {
1217 
1218             /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
1219             /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
1220             {
1221                 //Interleaving 0,4 row in 0 , 1 Rishab
1222                 /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/
1223                 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[3][0]);
1224                 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[0][0]);
1225 
1226                 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74);
1227                 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74);
1228 
1229                 m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
1230                 m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
1231 
1232 
1233                 m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
1234                 m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
1235             }
1236 
1237 
1238             /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
1239             /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
1240             {
1241 
1242                 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83
1243                 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36
1244 
1245                 /* Combining instructions to eliminate them based on zero_rows : Lokesh */
1246                 //Interleaving 2,6 row in 4, 5 Rishab
1247                 m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76);
1248                 m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76);
1249 
1250                 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1);
1251                 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
1252 
1253                 m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_5, m_coeff1);
1254                 m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);
1255 
1256 
1257 
1258                 /* Loading coeff for computing o0, o1, o2 and o3 in the next block */
1259 
1260                 m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[2][0]);
1261                 m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[3][0]);
1262 
1263                 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[0][0]);
1264                 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[1][0]);
1265 
1266             }
1267 
1268             /* e */
1269             {
1270                 /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
1271                 /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
1272                 /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
1273                 /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
1274                 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
1275                 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
1276 
1277                 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
1278                 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
1279 
1280                 m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
1281                 m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);
1282 
1283                 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
1284                 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);
1285 
1286             }
1287 
1288             /* o */
1289             {
1290 
1291                 /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
1292                 {
1293 
1294                     m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73);
1295                     m_temp_reg_61 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73);
1296                     m_temp_reg_64 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77);
1297                     m_temp_reg_65 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_77);
1298                     //o0:1B*89+3B*75,1T*89+3T*75,5B*50+7B*18,5T*50+7T*18
1299                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
1300                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
1301                     m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
1302                     m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_65, m_coeff2);
1303 
1304 
1305                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
1306                     m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
1307 
1308                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
1309                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
1310                 }
1311 
1312                 /* Column 0 of destination computed here */
1313                 /* It is stored in m_temp_reg_50 */
1314                 /* Column 7 of destination computed here */
1315                 /* It is stored in m_temp_reg_57 */
1316                 {
1317 
1318 
1319                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
1320                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
1321 
1322                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
1323                     m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
1324 
1325                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
1326                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
1327                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
1328                     m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
1329 
1330                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
1331                     m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
1332                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
1333                     m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
1334 
1335                     //o1:1B*75-3B*18,1T*75-3T*18,5B*89+7B*50,5T*89+7T*50
1336                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
1337                     m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
1338                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
1339                     m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_65, m_coeff4);
1340 
1341                     m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
1342                     m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
1343 
1344                     /* Loading coeff for computing o2  in the next block */
1345 
1346                     m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[4][0]);
1347                     m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[5][0]);
1348 
1349                     /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
1350                     m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26);
1351                     m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_27);
1352                 }
1353 
1354                 /* Column 1 of destination computed here */
1355                 /* It is stored in m_temp_reg_51 */
1356                 /* Column 6 of destination computed here */
1357                 /* It is stored in m_temp_reg_56 */
1358                 {
1359                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
1360                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
1361 
1362                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
1363                     m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);
1364 
1365                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
1366                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
1367                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
1368                     m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
1369 
1370                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
1371                     m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
1372                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
1373                     m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
1374 
1375                     //o2:1B*50-3B*89,1T*50-3T*89,5B*18+7B*75,5T*18+7T*75
1376                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1);
1377                     m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2);
1378                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_61, m_coeff1);
1379                     m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_65, m_coeff2);
1380 
1381                     m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
1382                     m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
1383 
1384 
1385                     /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
1386 
1387 
1388                     /* Loading coeff for computing o3  in the next block */
1389 
1390                     m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[6][0]);
1391                     m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[7][0]);
1392 
1393                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
1394                     m_temp_reg_35 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
1395                 }
1396 
1397                 /* Column 2 of destination computed here */
1398                 /* It is stored in m_temp_reg_52 */
1399                 /* Column 5 of destination computed here */
1400                 /* It is stored in m_temp_reg_55 */
1401                 {
1402                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
1403                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
1404 
1405                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
1406                     m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);
1407 
1408                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
1409                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
1410                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
1411                     m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
1412 
1413                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
1414                     m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
1415                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
1416                     m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
1417 
1418                     //o3:1B*18-3B*50,1T*18-3T*50,5B*75-7B*89,5T*75-7T*89
1419                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3);
1420                     m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4);
1421                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_61, m_coeff3);
1422                     m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_65, m_coeff4);
1423 
1424                     m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
1425                     m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
1426 
1427 
1428 
1429                     /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
1430 
1431 
1432                     m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26);
1433                     m_temp_reg_37 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_27);
1434                 }
1435 
1436                 /* Column 3 of destination computed here */
1437                 /* It is stored in m_temp_reg_53 */
1438                 /* Column 4 of destination computed here */
1439                 /* It is stored in m_temp_reg_54 */
1440                 {
1441                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
1442                     m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
1443 
1444                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
1445                     m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);
1446 
1447                     m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor);
1448                     m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor);
1449                     m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor);
1450                     m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor);
1451 
1452                     m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift);
1453                     m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift);
1454                     m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift);
1455                     m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift);
1456 
1457                     m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63);
1458                     m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67);
1459                 }
1460             }
1461 
1462             /* Transpose of the destination 8x8 matrix done here */
1463             /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */
1464             /* respectively */
1465             {
1466 
1467 
1468                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
1469                 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
1470                 m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
1471                 m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
1472                 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
1473                 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
1474                 m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
1475                 m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
1476 
1477                 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
1478                 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
1479                 m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
1480                 m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
1481                 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
1482                 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
1483                 m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
1484                 m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
1485 
1486                 m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
1487                 m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
1488                 m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
1489                 m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
1490 
1491                 m_temp_reg_54 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
1492                 m_temp_reg_55 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
1493                 m_temp_reg_56 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
1494                 m_temp_reg_57 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
1495             }
1496         }
1497         /* Stage 2 */
1498 
1499         i4_shift = IDCT_STG2_SHIFT;
1500 
1501         {
1502 
1503             /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */
1504             /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */
1505             {
1506                 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[0][0]); //add
1507                 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[3][0]); //sub
1508 
1509                 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_54);
1510                 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_54);
1511 
1512                 m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
1513                 m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
1514                 m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
1515                 m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
1516 
1517 
1518                 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[1][0]);
1519                 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[2][0]);
1520             }
1521 
1522 
1523             /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */
1524             /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */
1525             {
1526                 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_56);
1527                 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_56);
1528 
1529 
1530                 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
1531                 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_0, m_coeff2);
1532                 m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
1533                 m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_1, m_coeff2);
1534 
1535                 /* Loading coeff for computing o0 in the next block */
1536                 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[0][0]);
1537                 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[1][0]);
1538 
1539 
1540                 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_51, m_temp_reg_53);
1541                 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_51, m_temp_reg_53);
1542             }
1543 
1544             /* e */
1545             {
1546                 /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */
1547                 /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */
1548                 /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */
1549                 /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */
1550                 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16);
1551                 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16);
1552 
1553                 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14);
1554                 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14);
1555 
1556                 m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17);
1557                 m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17);
1558 
1559                 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15);
1560                 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15);
1561 
1562             }
1563 
1564             /* o */
1565             {
1566                 m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_55, m_temp_reg_57);
1567                 m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_55, m_temp_reg_57);
1568 
1569                 /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */
1570                 {
1571                     //o0:1B*89+3B*75,1T*89+3T*75,5B*50+7B*18,5T*50+7T*18
1572                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
1573                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
1574                     m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
1575                     m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);
1576 
1577                     m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1)));
1578                     m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000);
1579                     /* Loading coeff for computing o1 in the next block */
1580                     m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[2][0]);
1581                     m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[3][0]);
1582 
1583                     m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
1584                     m_temp_reg_31 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
1585                 }
1586 
1587                 /* Column 0 of destination computed here */
1588                 /* It is stored in m_temp_reg_50 */
1589                 /* Column 7 of destination computed here */
1590                 /* It is stored in m_temp_reg_57 */
1591                 {
1592                     m_temp_reg_2 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30);
1593                     m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30);
1594 
1595                     m_temp_reg_3 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31);
1596                     m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31);
1597 
1598                     m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
1599                     m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
1600                     m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
1601                     m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
1602 
1603                     m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
1604                     m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
1605                     m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
1606                     m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
1607 
1608                     //o1:1B*75-3B*18,1T*75-3T*18,5B*89+7B*50,5T*89+7T*50
1609                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
1610                     m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_4, m_coeff4);
1611                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
1612                     m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_5, m_coeff4);
1613 
1614                     m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
1615                     m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
1616 
1617 
1618                     /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */
1619 
1620 
1621                     /* Loading coeff for computing o2  in the next block */
1622                     m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[4][0]);
1623                     m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[5][0]);
1624 
1625                     m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26);
1626                     m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_27);
1627                 }
1628 
1629                 /* Column 1 of destination computed here */
1630                 /* It is stored in m_temp_reg_51 */
1631                 /* Column 6 of destination computed here */
1632                 /* It is stored in m_temp_reg_56 */
1633                 {
1634                     m_temp_reg_2 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32);
1635                     m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32);
1636 
1637                     m_temp_reg_3 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33);
1638                     m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33);
1639 
1640                     m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
1641                     m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
1642                     m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
1643                     m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
1644 
1645                     m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
1646                     m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
1647                     m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
1648                     m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
1649 
1650                     //o2:1B*50-3B*89,1T*50-3T*89,5B*18+7B*75,5T*18+7T*75
1651                     m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1);
1652                     m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_4, m_coeff2);
1653                     m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1);
1654                     m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_5, m_coeff2);
1655 
1656                     m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
1657                     m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
1658 
1659 
1660                     /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */
1661 
1662                     /* Loading coeff for computing o3  in the next block */
1663 
1664                     m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[6][0]);
1665                     m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[7][0]);
1666 
1667                     m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24);
1668                     m_temp_reg_35 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25);
1669                 }
1670 
1671                 /* Column 2 of destination computed here */
1672                 /* It is stored in m_temp_reg_52 */
1673                 /* Column 5 of destination computed here */
1674                 /* It is stored in m_temp_reg_55 */
1675                 {
1676                     m_temp_reg_2 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34);
1677                     m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34);
1678 
1679                     m_temp_reg_3 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35);
1680                     m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35);
1681 
1682                     m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor);
1683                     m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor);
1684                     m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor);
1685                     m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor);
1686 
1687                     m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift);
1688                     m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift);
1689                     m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift);
1690                     m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift);
1691 
1692                     //o3:1B*18-3B*50,1T*18-3T*50,5B*75-7B*89,5T*75-7T*89
1693                     m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff3);
1694                     m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_4, m_coeff4);
1695                     m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff3);
1696                     m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_5, m_coeff4);
1697 
1698                     m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3);
1699                     m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7);
1700 
1701 
1702 
1703                     /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */
1704 
1705 
1706                     m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26);
1707                     m_temp_reg_37 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_27);
1708                 }
1709 
1710                 /* Column 3 of destination computed here */
1711                 /* It is stored in m_temp_reg_53 */
1712                 /* Column 4 of destination computed here */
1713                 /* It is stored in m_temp_reg_54 */
1714                 {
1715                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36);
1716                     m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36);
1717 
1718                     m_temp_reg_21 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37);
1719                     m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37);
1720 
1721                     m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_rdng_factor);
1722                     m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_rdng_factor);
1723                     m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_rdng_factor);
1724                     m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_rdng_factor);
1725 
1726                     m_temp_reg_20 = _mm_srai_epi32(m_temp_reg_20, i4_shift);
1727                     m_temp_reg_21 = _mm_srai_epi32(m_temp_reg_21, i4_shift);
1728                     m_temp_reg_22 = _mm_srai_epi32(m_temp_reg_22, i4_shift);
1729                     m_temp_reg_23 = _mm_srai_epi32(m_temp_reg_23, i4_shift);
1730 
1731                     m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21);
1732                     m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23);
1733                 }
1734             }
1735 
1736             /* Transpose of the destination 8x8 matrix done here */
1737             /* and ultimately stored in registers m_temp_reg_10 to m_temp_reg_17 */
1738             /* respectively */
1739             {
1740                 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51);
1741                 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53);
1742                 m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51);
1743                 m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53);
1744                 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11);
1745                 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11);
1746                 m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15);
1747                 m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15);
1748 
1749                 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55);
1750                 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57);
1751                 m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55);
1752                 m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57);
1753                 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13);
1754                 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13);
1755                 m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17);
1756                 m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17);
1757                 m_temp_reg_10 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4);
1758                 m_temp_reg_11 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4);
1759                 m_temp_reg_12 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5);
1760                 m_temp_reg_13 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5);
1761 
1762                 m_temp_reg_14 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6);
1763                 m_temp_reg_15 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6);
1764                 m_temp_reg_16 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7);
1765                 m_temp_reg_17 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7);
1766             }
1767 
1768             /* Recon and store */
1769             {
1770                 m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred);
1771                 pu1_pred += pred_strd;
1772                 m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred);
1773                 pu1_pred += pred_strd;
1774                 m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred);
1775                 pu1_pred += pred_strd;
1776                 m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred);
1777                 pu1_pred += pred_strd;
1778                 m_temp_reg_4 = _mm_loadl_epi64((__m128i *)pu1_pred);
1779                 pu1_pred += pred_strd;
1780                 m_temp_reg_5 = _mm_loadl_epi64((__m128i *)pu1_pred);
1781                 pu1_pred += pred_strd;
1782                 m_temp_reg_6 = _mm_loadl_epi64((__m128i *)pu1_pred);
1783                 pu1_pred += pred_strd;
1784                 m_temp_reg_7 = _mm_loadl_epi64((__m128i *)pu1_pred);
1785 
1786 
1787                 m_temp_reg_50 = _mm_setzero_si128();
1788                 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, m_temp_reg_50);
1789                 m_temp_reg_1 = _mm_unpacklo_epi8(m_temp_reg_1, m_temp_reg_50);
1790                 m_temp_reg_2 = _mm_unpacklo_epi8(m_temp_reg_2, m_temp_reg_50);
1791                 m_temp_reg_3 = _mm_unpacklo_epi8(m_temp_reg_3, m_temp_reg_50);
1792                 m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_50);
1793                 m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_50);
1794                 m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_6, m_temp_reg_50);
1795                 m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_7, m_temp_reg_50);
1796 
1797                 m_temp_reg_50 = _mm_add_epi16(m_temp_reg_10, m_temp_reg_0);
1798                 m_temp_reg_51 = _mm_add_epi16(m_temp_reg_11, m_temp_reg_1);
1799                 m_temp_reg_52 = _mm_add_epi16(m_temp_reg_12, m_temp_reg_2);
1800                 m_temp_reg_53 = _mm_add_epi16(m_temp_reg_13, m_temp_reg_3);
1801                 m_temp_reg_54 = _mm_add_epi16(m_temp_reg_14, m_temp_reg_4);
1802                 m_temp_reg_55 = _mm_add_epi16(m_temp_reg_15, m_temp_reg_5);
1803                 m_temp_reg_56 = _mm_add_epi16(m_temp_reg_16, m_temp_reg_6);
1804                 m_temp_reg_57 = _mm_add_epi16(m_temp_reg_17, m_temp_reg_7);
1805 
1806                 m_temp_reg_50 = _mm_packus_epi16(m_temp_reg_50, m_temp_reg_50);
1807                 m_temp_reg_51 = _mm_packus_epi16(m_temp_reg_51, m_temp_reg_51);
1808                 m_temp_reg_52 = _mm_packus_epi16(m_temp_reg_52, m_temp_reg_52);
1809                 m_temp_reg_53 = _mm_packus_epi16(m_temp_reg_53, m_temp_reg_53);
1810                 m_temp_reg_54 = _mm_packus_epi16(m_temp_reg_54, m_temp_reg_54);
1811                 m_temp_reg_55 = _mm_packus_epi16(m_temp_reg_55, m_temp_reg_55);
1812                 m_temp_reg_56 = _mm_packus_epi16(m_temp_reg_56, m_temp_reg_56);
1813                 m_temp_reg_57 = _mm_packus_epi16(m_temp_reg_57, m_temp_reg_57);
1814 
1815                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_50);
1816                 pu1_dst += dst_strd;
1817                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_51);
1818                 pu1_dst += dst_strd;
1819                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_52);
1820                 pu1_dst += dst_strd;
1821                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_53);
1822                 pu1_dst += dst_strd;
1823                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_54);
1824                 pu1_dst += dst_strd;
1825                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_55);
1826                 pu1_dst += dst_strd;
1827                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_56);
1828                 pu1_dst += dst_strd;
1829                 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_57);
1830                 pu1_dst += dst_strd;
1831 
1832             }
1833 
1834 
1835         }
1836 
1837 
1838     }
1839 }
1840 
1841 void impeg2_idct_recon_dc_mismatch_sse42(WORD16 *pi2_src,
1842                             WORD16 *pi2_tmp,
1843                             UWORD8 *pu1_pred,
1844                             UWORD8 *pu1_dst,
1845                             WORD32 src_strd,
1846                             WORD32 pred_strd,
1847                             WORD32 dst_strd,
1848                             WORD32 zero_cols,
1849                             WORD32 zero_rows)
1850 {
1851     WORD32 val;
1852     __m128i value_4x32b, mismatch_stg2_additive;
1853     __m128i pred_r, pred_half0, pred_half1;
1854     __m128i temp0, temp1;
1855     __m128i round_stg2 = _mm_set1_epi32(IDCT_STG2_ROUND);
1856 
1857     UNUSED(pi2_tmp);
1858     UNUSED(src_strd);
1859     UNUSED(zero_cols);
1860     UNUSED(zero_rows);
1861 
1862     val = pi2_src[0] * gai2_impeg2_idct_q15[0];
1863     val = ((val + IDCT_STG1_ROUND) >> IDCT_STG1_SHIFT);
1864     val *= gai2_impeg2_idct_q11[0];
1865     value_4x32b = _mm_set1_epi32(val);
1866 
1867     // Row 0 processing
1868     mismatch_stg2_additive = _mm_loadu_si128((__m128i *) gai2_impeg2_mismatch_stg2_additive);
1869     pred_r = _mm_loadl_epi64((__m128i *) pu1_pred);
1870     pred_r =  _mm_cvtepu8_epi16(pred_r);
1871     temp0 = _mm_cvtepi16_epi32(mismatch_stg2_additive);
1872     mismatch_stg2_additive = _mm_srli_si128(mismatch_stg2_additive, 8);
1873     pred_half0 = _mm_cvtepu16_epi32(pred_r);
1874     temp1 = _mm_cvtepi16_epi32(mismatch_stg2_additive);
1875 
1876     pred_r = _mm_srli_si128(pred_r, 8);
1877 
1878     temp0 = _mm_add_epi32(temp0, value_4x32b);
1879     temp1 = _mm_add_epi32(temp1, value_4x32b);
1880     temp0 = _mm_add_epi32(temp0, round_stg2);
1881     temp1 = _mm_add_epi32(temp1, round_stg2);
1882     pred_half1 = _mm_cvtepu16_epi32(pred_r);
1883     temp0 = _mm_srai_epi32(temp0, IDCT_STG2_SHIFT);
1884     temp1 = _mm_srai_epi32(temp1, IDCT_STG2_SHIFT);
1885     temp0 = _mm_add_epi32(temp0, pred_half0);
1886     temp1 = _mm_add_epi32(temp1, pred_half1);
1887 
1888     temp0 = _mm_packus_epi32(temp0, temp1);
1889     temp0 = _mm_packus_epi16(temp0, temp1);
1890 
1891     _mm_storel_epi64((__m128i *)pu1_dst, temp0);
1892 
1893     // Row 1 processing
1894     mismatch_stg2_additive = _mm_loadu_si128((__m128i *) (gai2_impeg2_mismatch_stg2_additive + 8));
1895     pred_r = _mm_loadl_epi64((__m128i *) (pu1_pred + pred_strd));
1896     pred_r =  _mm_cvtepu8_epi16(pred_r);
1897     temp0 = _mm_cvtepi16_epi32(mismatch_stg2_additive);
1898     mismatch_stg2_additive = _mm_srli_si128(mismatch_stg2_additive, 8);
1899     pred_half0 = _mm_cvtepu16_epi32(pred_r);
1900     temp1 = _mm_cvtepi16_epi32(mismatch_stg2_additive);
1901 
1902     pred_r = _mm_srli_si128(pred_r, 8);
1903 
1904     temp0 = _mm_add_epi32(temp0, value_4x32b);
1905     temp1 = _mm_add_epi32(temp1, value_4x32b);
1906     temp0 = _mm_add_epi32(temp0, round_stg2);
1907     temp1 = _mm_add_epi32(temp1, round_stg2);
1908     pred_half1 = _mm_cvtepu16_epi32(pred_r);
1909     temp0 = _mm_srai_epi32(temp0, IDCT_STG2_SHIFT);
1910     temp1 = _mm_srai_epi32(temp1, IDCT_STG2_SHIFT);
1911     temp0 = _mm_add_epi32(temp0, pred_half0);
1912     temp1 = _mm_add_epi32(temp1, pred_half1);
1913 
1914     temp0 = _mm_packus_epi32(temp0, temp1);
1915     temp0 = _mm_packus_epi16(temp0, temp1);
1916 
1917     _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), temp0);
1918 
1919     // Row 2 processing
1920     mismatch_stg2_additive = _mm_loadu_si128((__m128i *) (gai2_impeg2_mismatch_stg2_additive + 16));
1921     pred_r = _mm_loadl_epi64((__m128i *) (pu1_pred + 2 * pred_strd));
1922     pred_r =  _mm_cvtepu8_epi16(pred_r);
1923     temp0 = _mm_cvtepi16_epi32(mismatch_stg2_additive);
1924     mismatch_stg2_additive = _mm_srli_si128(mismatch_stg2_additive, 8);
1925     pred_half0 = _mm_cvtepu16_epi32(pred_r);
1926     temp1 = _mm_cvtepi16_epi32(mismatch_stg2_additive);
1927 
1928     pred_r = _mm_srli_si128(pred_r, 8);
1929 
1930     temp0 = _mm_add_epi32(temp0, value_4x32b);
1931     temp1 = _mm_add_epi32(temp1, value_4x32b);
1932     temp0 = _mm_add_epi32(temp0, round_stg2);
1933     temp1 = _mm_add_epi32(temp1, round_stg2);
1934     pred_half1 = _mm_cvtepu16_epi32(pred_r);
1935     temp0 = _mm_srai_epi32(temp0, IDCT_STG2_SHIFT);
1936     temp1 = _mm_srai_epi32(temp1, IDCT_STG2_SHIFT);
1937     temp0 = _mm_add_epi32(temp0, pred_half0);
1938     temp1 = _mm_add_epi32(temp1, pred_half1);
1939 
1940     temp0 = _mm_packus_epi32(temp0, temp1);
1941     temp0 = _mm_packus_epi16(temp0, temp1);
1942 
1943     _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), temp0);
1944 
1945     // Row 3 processing
1946     mismatch_stg2_additive = _mm_loadu_si128((__m128i *) (gai2_impeg2_mismatch_stg2_additive + 24));
1947     pred_r = _mm_loadl_epi64((__m128i *) (pu1_pred + 3 * pred_strd));
1948     pred_r =  _mm_cvtepu8_epi16(pred_r);
1949     temp0 = _mm_cvtepi16_epi32(mismatch_stg2_additive);
1950     mismatch_stg2_additive = _mm_srli_si128(mismatch_stg2_additive, 8);
1951     pred_half0 = _mm_cvtepu16_epi32(pred_r);
1952     temp1 = _mm_cvtepi16_epi32(mismatch_stg2_additive);
1953 
1954     pred_r = _mm_srli_si128(pred_r, 8);
1955 
1956     temp0 = _mm_add_epi32(temp0, value_4x32b);
1957     temp1 = _mm_add_epi32(temp1, value_4x32b);
1958     temp0 = _mm_add_epi32(temp0, round_stg2);
1959     temp1 = _mm_add_epi32(temp1, round_stg2);
1960     pred_half1 = _mm_cvtepu16_epi32(pred_r);
1961     temp0 = _mm_srai_epi32(temp0, IDCT_STG2_SHIFT);
1962     temp1 = _mm_srai_epi32(temp1, IDCT_STG2_SHIFT);
1963     temp0 = _mm_add_epi32(temp0, pred_half0);
1964     temp1 = _mm_add_epi32(temp1, pred_half1);
1965 
1966     temp0 = _mm_packus_epi32(temp0, temp1);
1967     temp0 = _mm_packus_epi16(temp0, temp1);
1968 
1969     _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), temp0);
1970 
1971     // Row 4 processing
1972     mismatch_stg2_additive = _mm_loadu_si128((__m128i *) (gai2_impeg2_mismatch_stg2_additive + 32));
1973     pred_r = _mm_loadl_epi64((__m128i *) (pu1_pred + 4 * pred_strd));
1974     pred_r =  _mm_cvtepu8_epi16(pred_r);
1975     temp0 = _mm_cvtepi16_epi32(mismatch_stg2_additive);
1976     mismatch_stg2_additive = _mm_srli_si128(mismatch_stg2_additive, 8);
1977     pred_half0 = _mm_cvtepu16_epi32(pred_r);
1978     temp1 = _mm_cvtepi16_epi32(mismatch_stg2_additive);
1979 
1980     pred_r = _mm_srli_si128(pred_r, 8);
1981 
1982     temp0 = _mm_add_epi32(temp0, value_4x32b);
1983     temp1 = _mm_add_epi32(temp1, value_4x32b);
1984     temp0 = _mm_add_epi32(temp0, round_stg2);
1985     temp1 = _mm_add_epi32(temp1, round_stg2);
1986     pred_half1 = _mm_cvtepu16_epi32(pred_r);
1987     temp0 = _mm_srai_epi32(temp0, IDCT_STG2_SHIFT);
1988     temp1 = _mm_srai_epi32(temp1, IDCT_STG2_SHIFT);
1989     temp0 = _mm_add_epi32(temp0, pred_half0);
1990     temp1 = _mm_add_epi32(temp1, pred_half1);
1991 
1992     temp0 = _mm_packus_epi32(temp0, temp1);
1993     temp0 = _mm_packus_epi16(temp0, temp1);
1994 
1995     _mm_storel_epi64((__m128i *)(pu1_dst + 4 * dst_strd), temp0);
1996 
1997     // Row 5 processing
1998     mismatch_stg2_additive = _mm_loadu_si128((__m128i *) (gai2_impeg2_mismatch_stg2_additive + 40));
1999     pred_r = _mm_loadl_epi64((__m128i *) (pu1_pred + 5 * pred_strd));
2000     pred_r =  _mm_cvtepu8_epi16(pred_r);
2001     temp0 = _mm_cvtepi16_epi32(mismatch_stg2_additive);
2002     mismatch_stg2_additive = _mm_srli_si128(mismatch_stg2_additive, 8);
2003     pred_half0 = _mm_cvtepu16_epi32(pred_r);
2004     temp1 = _mm_cvtepi16_epi32(mismatch_stg2_additive);
2005 
2006     pred_r = _mm_srli_si128(pred_r, 8);
2007 
2008     temp0 = _mm_add_epi32(temp0, value_4x32b);
2009     temp1 = _mm_add_epi32(temp1, value_4x32b);
2010     temp0 = _mm_add_epi32(temp0, round_stg2);
2011     temp1 = _mm_add_epi32(temp1, round_stg2);
2012     pred_half1 = _mm_cvtepu16_epi32(pred_r);
2013     temp0 = _mm_srai_epi32(temp0, IDCT_STG2_SHIFT);
2014     temp1 = _mm_srai_epi32(temp1, IDCT_STG2_SHIFT);
2015     temp0 = _mm_add_epi32(temp0, pred_half0);
2016     temp1 = _mm_add_epi32(temp1, pred_half1);
2017 
2018     temp0 = _mm_packus_epi32(temp0, temp1);
2019     temp0 = _mm_packus_epi16(temp0, temp1);
2020 
2021     _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), temp0);
2022 
2023     // Row 6 processing
2024     mismatch_stg2_additive = _mm_loadu_si128((__m128i *) (gai2_impeg2_mismatch_stg2_additive + 48));
2025     pred_r = _mm_loadl_epi64((__m128i *) (pu1_pred + 6 * pred_strd));
2026     pred_r =  _mm_cvtepu8_epi16(pred_r);
2027     temp0 = _mm_cvtepi16_epi32(mismatch_stg2_additive);
2028     mismatch_stg2_additive = _mm_srli_si128(mismatch_stg2_additive, 8);
2029     pred_half0 = _mm_cvtepu16_epi32(pred_r);
2030     temp1 = _mm_cvtepi16_epi32(mismatch_stg2_additive);
2031 
2032     pred_r = _mm_srli_si128(pred_r, 8);
2033 
2034     temp0 = _mm_add_epi32(temp0, value_4x32b);
2035     temp1 = _mm_add_epi32(temp1, value_4x32b);
2036     temp0 = _mm_add_epi32(temp0, round_stg2);
2037     temp1 = _mm_add_epi32(temp1, round_stg2);
2038     pred_half1 = _mm_cvtepu16_epi32(pred_r);
2039     temp0 = _mm_srai_epi32(temp0, IDCT_STG2_SHIFT);
2040     temp1 = _mm_srai_epi32(temp1, IDCT_STG2_SHIFT);
2041     temp0 = _mm_add_epi32(temp0, pred_half0);
2042     temp1 = _mm_add_epi32(temp1, pred_half1);
2043 
2044     temp0 = _mm_packus_epi32(temp0, temp1);
2045     temp0 = _mm_packus_epi16(temp0, temp1);
2046 
2047     _mm_storel_epi64((__m128i *)(pu1_dst + 6 * dst_strd), temp0);
2048 
2049     // Row 7 processing
2050     mismatch_stg2_additive = _mm_loadu_si128((__m128i *) (gai2_impeg2_mismatch_stg2_additive + 56));
2051     pred_r = _mm_loadl_epi64((__m128i *) (pu1_pred + 7 * pred_strd));
2052     pred_r =  _mm_cvtepu8_epi16(pred_r);
2053     temp0 = _mm_cvtepi16_epi32(mismatch_stg2_additive);
2054     mismatch_stg2_additive = _mm_srli_si128(mismatch_stg2_additive, 8);
2055     pred_half0 = _mm_cvtepu16_epi32(pred_r);
2056     temp1 = _mm_cvtepi16_epi32(mismatch_stg2_additive);
2057 
2058     pred_r = _mm_srli_si128(pred_r, 8);
2059 
2060     temp0 = _mm_add_epi32(temp0, value_4x32b);
2061     temp1 = _mm_add_epi32(temp1, value_4x32b);
2062     temp0 = _mm_add_epi32(temp0, round_stg2);
2063     temp1 = _mm_add_epi32(temp1, round_stg2);
2064     pred_half1 = _mm_cvtepu16_epi32(pred_r);
2065     temp0 = _mm_srai_epi32(temp0, IDCT_STG2_SHIFT);
2066     temp1 = _mm_srai_epi32(temp1, IDCT_STG2_SHIFT);
2067     temp0 = _mm_add_epi32(temp0, pred_half0);
2068     temp1 = _mm_add_epi32(temp1, pred_half1);
2069 
2070     temp0 = _mm_packus_epi32(temp0, temp1);
2071     temp0 = _mm_packus_epi16(temp0, temp1);
2072 
2073     _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), temp0);
2074 }
2075 
2076 void impeg2_idct_recon_dc_sse42(WORD16 *pi2_src,
2077                             WORD16 *pi2_tmp,
2078                             UWORD8 *pu1_pred,
2079                             UWORD8 *pu1_dst,
2080                             WORD32 src_strd,
2081                             WORD32 pred_strd,
2082                             WORD32 dst_strd,
2083                             WORD32 zero_cols,
2084                             WORD32 zero_rows)
2085 {
2086     WORD32 val;
2087     __m128i value_4x32b, pred_r0, pred_r1, temp0, temp1, temp2, temp3;
2088 
2089     UNUSED(pi2_tmp);
2090     UNUSED(src_strd);
2091     UNUSED(zero_cols);
2092     UNUSED(zero_rows);
2093 
2094     val = pi2_src[0] * gai2_impeg2_idct_q15[0];
2095     val = ((val + IDCT_STG1_ROUND) >> IDCT_STG1_SHIFT);
2096     val = val * gai2_impeg2_idct_q11[0];
2097     val = ((val + IDCT_STG2_ROUND) >> IDCT_STG2_SHIFT);
2098 
2099     value_4x32b = _mm_set1_epi32(val);
2100 
2101     //Row 0-1 processing
2102     pred_r0 = _mm_loadl_epi64((__m128i *) pu1_pred);
2103     pred_r1 = _mm_loadl_epi64((__m128i *) (pu1_pred + pred_strd));
2104     pred_r0 =  _mm_cvtepu8_epi16(pred_r0);
2105     pred_r1 =  _mm_cvtepu8_epi16(pred_r1);
2106 
2107     temp0 = _mm_cvtepu16_epi32(pred_r0);
2108     pred_r0 = _mm_srli_si128(pred_r0, 8);
2109     temp2 = _mm_cvtepu16_epi32(pred_r1);
2110     pred_r1 = _mm_srli_si128(pred_r1, 8);
2111     temp1 = _mm_cvtepu16_epi32(pred_r0);
2112     temp3 = _mm_cvtepu16_epi32(pred_r1);
2113 
2114     temp0 = _mm_add_epi32(temp0, value_4x32b);
2115     temp2 = _mm_add_epi32(temp2, value_4x32b);
2116     temp1 = _mm_add_epi32(temp1, value_4x32b);
2117     temp3 = _mm_add_epi32(temp3, value_4x32b);
2118     temp0 = _mm_packus_epi32(temp0, temp1);
2119     temp2 = _mm_packus_epi32(temp2, temp3);
2120     temp0 = _mm_packus_epi16(temp0, temp1);
2121     temp2 = _mm_packus_epi16(temp2, temp3);
2122     _mm_storel_epi64((__m128i *)(pu1_dst), temp0);
2123     _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), temp2);
2124 
2125     //Row 2-3 processing
2126     pu1_pred += 2 * pred_strd;
2127     pu1_dst += 2 * dst_strd;
2128 
2129     pred_r0 = _mm_loadl_epi64((__m128i *) pu1_pred);
2130     pred_r1 = _mm_loadl_epi64((__m128i *) (pu1_pred + pred_strd));
2131     pred_r0 =  _mm_cvtepu8_epi16(pred_r0);
2132     pred_r1 =  _mm_cvtepu8_epi16(pred_r1);
2133 
2134     temp0 = _mm_cvtepu16_epi32(pred_r0);
2135     pred_r0 = _mm_srli_si128(pred_r0, 8);
2136     temp2 = _mm_cvtepu16_epi32(pred_r1);
2137     pred_r1 = _mm_srli_si128(pred_r1, 8);
2138     temp1 = _mm_cvtepu16_epi32(pred_r0);
2139     temp3 = _mm_cvtepu16_epi32(pred_r1);
2140 
2141     temp0 = _mm_add_epi32(temp0, value_4x32b);
2142     temp2 = _mm_add_epi32(temp2, value_4x32b);
2143     temp1 = _mm_add_epi32(temp1, value_4x32b);
2144     temp3 = _mm_add_epi32(temp3, value_4x32b);
2145     temp0 = _mm_packus_epi32(temp0, temp1);
2146     temp2 = _mm_packus_epi32(temp2, temp3);
2147     temp0 = _mm_packus_epi16(temp0, temp1);
2148     temp2 = _mm_packus_epi16(temp2, temp3);
2149     _mm_storel_epi64((__m128i *)(pu1_dst), temp0);
2150     _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), temp2);
2151 
2152     //Row 4-5 processing
2153     pu1_pred += 2 * pred_strd;
2154     pu1_dst += 2 * dst_strd;
2155 
2156     pred_r0 = _mm_loadl_epi64((__m128i *) pu1_pred);
2157     pred_r1 = _mm_loadl_epi64((__m128i *) (pu1_pred + pred_strd));
2158     pred_r0 =  _mm_cvtepu8_epi16(pred_r0);
2159     pred_r1 =  _mm_cvtepu8_epi16(pred_r1);
2160 
2161     temp0 = _mm_cvtepu16_epi32(pred_r0);
2162     pred_r0 = _mm_srli_si128(pred_r0, 8);
2163     temp2 = _mm_cvtepu16_epi32(pred_r1);
2164     pred_r1 = _mm_srli_si128(pred_r1, 8);
2165     temp1 = _mm_cvtepu16_epi32(pred_r0);
2166     temp3 = _mm_cvtepu16_epi32(pred_r1);
2167 
2168     temp0 = _mm_add_epi32(temp0, value_4x32b);
2169     temp2 = _mm_add_epi32(temp2, value_4x32b);
2170     temp1 = _mm_add_epi32(temp1, value_4x32b);
2171     temp3 = _mm_add_epi32(temp3, value_4x32b);
2172     temp0 = _mm_packus_epi32(temp0, temp1);
2173     temp2 = _mm_packus_epi32(temp2, temp3);
2174     temp0 = _mm_packus_epi16(temp0, temp1);
2175     temp2 = _mm_packus_epi16(temp2, temp3);
2176     _mm_storel_epi64((__m128i *)(pu1_dst), temp0);
2177     _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), temp2);
2178 
2179     //Row 6-7 processing
2180     pu1_pred += 2 * pred_strd;
2181     pu1_dst += 2 * dst_strd;
2182 
2183     pred_r0 = _mm_loadl_epi64((__m128i *) pu1_pred);
2184     pred_r1 = _mm_loadl_epi64((__m128i *) (pu1_pred + pred_strd));
2185     pred_r0 =  _mm_cvtepu8_epi16(pred_r0);
2186     pred_r1 =  _mm_cvtepu8_epi16(pred_r1);
2187 
2188     temp0 = _mm_cvtepu16_epi32(pred_r0);
2189     pred_r0 = _mm_srli_si128(pred_r0, 8);
2190     temp2 = _mm_cvtepu16_epi32(pred_r1);
2191     pred_r1 = _mm_srli_si128(pred_r1, 8);
2192     temp1 = _mm_cvtepu16_epi32(pred_r0);
2193     temp3 = _mm_cvtepu16_epi32(pred_r1);
2194 
2195     temp0 = _mm_add_epi32(temp0, value_4x32b);
2196     temp2 = _mm_add_epi32(temp2, value_4x32b);
2197     temp1 = _mm_add_epi32(temp1, value_4x32b);
2198     temp3 = _mm_add_epi32(temp3, value_4x32b);
2199     temp0 = _mm_packus_epi32(temp0, temp1);
2200     temp2 = _mm_packus_epi32(temp2, temp3);
2201     temp0 = _mm_packus_epi16(temp0, temp1);
2202     temp2 = _mm_packus_epi16(temp2, temp3);
2203     _mm_storel_epi64((__m128i *)(pu1_dst), temp0);
2204     _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), temp2);
2205 }
2206