1 /****************************************************************************** 2 * 3 * Copyright (C) 2015 The Android Open Source Project 4 * 5 * Licensed under the Apache License, Version 2.0 (the "License"); 6 * you may not use this file except in compliance with the License. 7 * You may obtain a copy of the License at: 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 * 17 ***************************************************************************** 18 * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore 19 */ 20 21 /** 22 ******************************************************************************* 23 * @file 24 * impeg2_itrans_recon_x86_intr.c 25 * 26 * @brief 27 * Contains function definitions for inverse quantization, inverse 28 * transform and reconstruction 29 * 30 * @author 31 * 100470 32 * 100592 (edited by) 33 * 34 * @par List of Functions: 35 * - impeg2_itrans_recon_8x8_sse42() 36 * 37 * @remarks 38 * None 39 * 40 ******************************************************************************* 41 */ 42 #include <stdio.h> 43 #include <string.h> 44 #include "iv_datatypedef.h" 45 #include "impeg2_macros.h" 46 #include "impeg2_defs.h" 47 #include "impeg2_globals.h" 48 49 #include <immintrin.h> 50 #include <emmintrin.h> 51 #include <smmintrin.h> 52 #include <tmmintrin.h> 53 54 55 /** 56 ******************************************************************************* 57 * 58 * @brief 59 * This function performs inverse quantization, inverse transform and 60 * reconstruction for 8x8 input block 61 * 62 * @par Description: 63 * Performs inverse quantization, inverse transform and adds the 64 * 
prediction data and clips output to 8 bit 65 * 66 * @param[in] pi2_src 67 * Input 8x8 coefficients 68 * 69 * @param[in] pi2_tmp 70 * Temporary 8x8 buffer for storing inverse 71 * transform 1st stage output 72 * 73 * @param[in] pu1_pred 74 * Prediction 8x8 block 75 * 76 * @param[in] pi2_dequant_coeff 77 * Dequant Coeffs 78 * 79 * @param[out] pu1_dst 80 * Output 8x8 block 81 * 82 * @param[in] src_strd 83 * Input stride 84 * 85 * @param[in] qp_div 86 * Quantization parameter / 6 87 * 88 * @param[in] qp_rem 89 * Quantization parameter % 6 90 * 91 * @param[in] pred_strd 92 * Prediction stride 93 * 94 * @param[in] dst_strd 95 * Output Stride 96 * 97 * @param[in] zero_cols 98 * Zero columns in pi2_src 99 * 100 * @returns Void 101 * 102 * @remarks 103 * None 104 * 105 ******************************************************************************* 106 */ 107 108 109 void impeg2_idct_recon_sse42(WORD16 *pi2_src, 110 WORD16 *pi2_tmp, 111 UWORD8 *pu1_pred, 112 UWORD8 *pu1_dst, 113 WORD32 src_strd, 114 WORD32 pred_strd, 115 WORD32 dst_strd, 116 WORD32 zero_cols, 117 WORD32 zero_rows) 118 { 119 __m128i m_temp_reg_0; 120 __m128i m_temp_reg_1; 121 __m128i m_temp_reg_2; 122 __m128i m_temp_reg_3; 123 __m128i m_temp_reg_5; 124 __m128i m_temp_reg_6; 125 __m128i m_temp_reg_7; 126 __m128i m_temp_reg_4; 127 __m128i m_temp_reg_10; 128 __m128i m_temp_reg_11; 129 __m128i m_temp_reg_12; 130 __m128i m_temp_reg_13; 131 __m128i m_temp_reg_14; 132 __m128i m_temp_reg_15; 133 __m128i m_temp_reg_16; 134 __m128i m_temp_reg_17; 135 __m128i m_temp_reg_20; 136 __m128i m_temp_reg_21; 137 __m128i m_temp_reg_22; 138 __m128i m_temp_reg_23; 139 __m128i m_temp_reg_24; 140 __m128i m_temp_reg_25; 141 __m128i m_temp_reg_26; 142 __m128i m_temp_reg_27; 143 __m128i m_temp_reg_30; 144 __m128i m_temp_reg_31; 145 __m128i m_temp_reg_32; 146 __m128i m_temp_reg_33; 147 __m128i m_temp_reg_34; 148 __m128i m_temp_reg_35; 149 __m128i m_temp_reg_36; 150 __m128i m_temp_reg_37; 151 __m128i m_temp_reg_40; 152 __m128i 
m_temp_reg_41; 153 __m128i m_temp_reg_42; 154 __m128i m_temp_reg_43; 155 __m128i m_temp_reg_44; 156 __m128i m_temp_reg_45; 157 __m128i m_temp_reg_46; 158 __m128i m_temp_reg_47; 159 __m128i m_temp_reg_50; 160 __m128i m_temp_reg_51; 161 __m128i m_temp_reg_52; 162 __m128i m_temp_reg_53; 163 __m128i m_temp_reg_54; 164 __m128i m_temp_reg_55; 165 __m128i m_temp_reg_56; 166 __m128i m_temp_reg_57; 167 __m128i m_temp_reg_60; 168 __m128i m_temp_reg_61; 169 __m128i m_temp_reg_62; 170 __m128i m_temp_reg_63; 171 __m128i m_temp_reg_64; 172 __m128i m_temp_reg_65; 173 __m128i m_temp_reg_66; 174 __m128i m_temp_reg_67; 175 __m128i m_temp_reg_70; 176 __m128i m_temp_reg_71; 177 __m128i m_temp_reg_72; 178 __m128i m_temp_reg_73; 179 __m128i m_temp_reg_74; 180 __m128i m_temp_reg_75; 181 __m128i m_temp_reg_76; 182 __m128i m_temp_reg_77; 183 __m128i m_coeff1, m_coeff2, m_coeff3, m_coeff4; 184 185 WORD32 check_row_stage_1; /* Lokesh */ 186 WORD32 check_row_stage_2; /* Lokesh */ 187 188 __m128i m_rdng_factor; 189 WORD32 i4_shift = IDCT_STG1_SHIFT; 190 UNUSED(pi2_tmp); 191 check_row_stage_1 = ((zero_rows & 0xF0) != 0xF0) ? 1 : 0; 192 check_row_stage_2 = ((zero_cols & 0xF0) != 0xF0) ? 
1 : 0; 193 194 m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src); 195 pi2_src += src_strd; 196 m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src); 197 pi2_src += src_strd; 198 m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src); 199 pi2_src += src_strd; 200 m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src); 201 pi2_src += src_strd; 202 203 m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_src); 204 pi2_src += src_strd; 205 m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_src); 206 pi2_src += src_strd; 207 m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_src); 208 pi2_src += src_strd; 209 m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_src); 210 211 if(!check_row_stage_2) 212 { 213 if(!check_row_stage_1) 214 { 215 /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */ 216 /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */ 217 { 218 //Interleaving 0,4 row in 0 , 1 Rishab 219 /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/ 220 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[3][0]); 221 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[0][0]); 222 223 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74); 224 225 m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); 226 m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); 227 228 } 229 230 231 /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */ 232 /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */ 233 /* as upper 8 bytes are zeros so m_temp_reg_15 and m_temp_reg_17 are not used*/ 234 { 235 236 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83 237 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36 238 239 /* Combining instructions to eliminate them based on zero_rows : Lokesh */ 240 //Interleaving 2,6 row in 4, 5 Rishab 241 m_temp_reg_4 = 
_mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); 242 243 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1); 244 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2); 245 246 247 /* Loading coeff for computing o0, o1, o2 and o3 in the next block */ 248 249 m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[2][0]); 250 m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[3][0]); 251 252 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[0][0]); 253 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[1][0]); 254 255 256 257 /* e */ 258 259 /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */ 260 /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */ 261 /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */ 262 /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */ 263 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16); 264 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16); 265 266 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14); 267 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14); 268 269 } 270 271 /* o */ 272 { 273 274 /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */ 275 { 276 277 m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); 278 //o0:1B*89+3B*75,5B*50+7B*18 279 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_60, m_coeff1); 280 281 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 282 m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000); 283 284 285 286 /* Column 0 of destination computed here */ 287 /* It is stored in m_temp_reg_50 */ 288 /* Column 7 of destination computed here */ 289 /* It is stored in m_temp_reg_57 */ 290 /* Upper 8 bytes of both registers are zero due to zero_cols*/ 291 292 293 294 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); 295 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); 296 297 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 298 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, 
m_rdng_factor); 299 300 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 301 m_temp_reg_63 = _mm_setzero_si128(); 302 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 303 304 //o1:1B*75-3B*18,5B*89+7B*50 305 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_60, m_coeff3); 306 307 m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 308 m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63); 309 310 /* Loading coeff for computing o2 in the next block */ 311 312 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[4][0]); 313 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[5][0]); 314 315 /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */ 316 317 318 319 /* Column 1 of destination computed here */ 320 /* It is stored in m_temp_reg_51 */ 321 /* Column 6 of destination computed here */ 322 /* It is stored in m_temp_reg_56 */ 323 324 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32); 325 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32); 326 327 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 328 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 329 330 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 331 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 332 333 //o2:1B*50-3B*89,5B*18+7B*75 334 m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_60, m_coeff1); 335 336 m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 337 m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63); 338 339 340 /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */ 341 342 /* Loading coeff for computing o3 in the next block */ 343 344 m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[6][0]); 345 m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[7][0]); 346 347 348 349 /* Column 2 of destination computed here */ 350 /* It is stored in m_temp_reg_52 */ 351 /* Column 5 of destination computed here */ 352 /* It is stored in m_temp_reg_55 */ 353 
354 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34); 355 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34); 356 357 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 358 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 359 360 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 361 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 362 363 //o3:1B*18-3B*50,5B*75-7B*89 364 m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_60, m_coeff3); 365 366 m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 367 m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63); 368 369 370 371 /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */ 372 373 374 375 /* Column 3 of destination computed here */ 376 /* It is stored in m_temp_reg_53 */ 377 /* Column 4 of destination computed here */ 378 /* It is stored in m_temp_reg_54 */ 379 380 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36); 381 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36); 382 383 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 384 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 385 386 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 387 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 388 389 390 m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 391 m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63); 392 } 393 } 394 395 /* Transpose of the destination 8x8 matrix done here */ 396 /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */ 397 /* respectively */ 398 { 399 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51); 400 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53); 401 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11); 402 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11); 403 404 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55); 405 m_temp_reg_13 = 
_mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57); 406 407 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13); 408 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13); 409 410 m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4); 411 m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4); 412 m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5); 413 m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5); 414 415 m_temp_reg_54 = _mm_setzero_si128(); 416 m_temp_reg_55 = _mm_setzero_si128(); 417 m_temp_reg_56 = _mm_setzero_si128(); 418 m_temp_reg_57 = _mm_setzero_si128(); 419 } 420 } 421 else 422 { 423 /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */ 424 /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */ 425 { 426 //Interleaving 0,4 row in 0 , 1 Rishab 427 /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/ 428 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[3][0]); 429 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[0][0]); 430 431 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74); 432 433 m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); 434 m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); 435 436 } 437 438 439 /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */ 440 /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */ 441 /* as upper 8 bytes are zeros so m_temp_reg_15 and m_temp_reg_17 are not used*/ 442 { 443 444 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83 445 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36 446 447 /* Combining instructions to eliminate them based on zero_rows : Lokesh */ 448 //Interleaving 2,6 row in 4, 5 Rishab 449 m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); 450 451 
m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1); 452 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2); 453 454 455 /* Loading coeff for computing o0, o1, o2 and o3 in the next block */ 456 457 m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[2][0]); 458 m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[3][0]); 459 460 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[0][0]); 461 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[1][0]); 462 463 464 465 /* e */ 466 467 /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */ 468 /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */ 469 /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */ 470 /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */ 471 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16); 472 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16); 473 474 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14); 475 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14); 476 477 } 478 479 /* o */ 480 { 481 482 /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */ 483 { 484 485 m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); 486 m_temp_reg_64 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77); 487 //o0:1B*89+3B*75,5B*50+7B*18 488 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1); 489 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2); 490 491 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 492 m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000); 493 494 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24); 495 496 497 498 /* Column 0 of destination computed here */ 499 /* It is stored in m_temp_reg_50 */ 500 /* Column 7 of destination computed here */ 501 /* It is stored in m_temp_reg_57 */ 502 /* Upper 8 bytes of both registers are zero due to zero_cols*/ 503 504 505 506 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); 507 m_temp_reg_66 = 
_mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); 508 509 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 510 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 511 512 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 513 m_temp_reg_63 = _mm_setzero_si128(); 514 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 515 516 //o1:1B*75-3B*18,5B*89+7B*50 517 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3); 518 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4); 519 520 m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 521 m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63); 522 523 /* Loading coeff for computing o2 in the next block */ 524 525 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[4][0]); 526 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[5][0]); 527 528 /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */ 529 m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26); 530 531 532 533 /* Column 1 of destination computed here */ 534 /* It is stored in m_temp_reg_51 */ 535 /* Column 6 of destination computed here */ 536 /* It is stored in m_temp_reg_56 */ 537 538 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32); 539 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32); 540 541 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 542 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 543 544 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 545 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 546 547 //o2:1B*50-3B*89,5B*18+7B*75 548 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1); 549 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2); 550 551 m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 552 m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63); 553 554 555 /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */ 556 557 /* Loading coeff for computing o3 in the next block */ 558 
559 m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[6][0]); 560 m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[7][0]); 561 562 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24); 563 564 565 /* Column 2 of destination computed here */ 566 /* It is stored in m_temp_reg_52 */ 567 /* Column 5 of destination computed here */ 568 /* It is stored in m_temp_reg_55 */ 569 570 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34); 571 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34); 572 573 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 574 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 575 576 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 577 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 578 579 //o3:1B*18-3B*50,5B*75-7B*89 580 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3); 581 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4); 582 583 m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 584 m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63); 585 586 587 588 /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */ 589 590 m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26); 591 592 593 /* Column 3 of destination computed here */ 594 /* It is stored in m_temp_reg_53 */ 595 /* Column 4 of destination computed here */ 596 /* It is stored in m_temp_reg_54 */ 597 598 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36); 599 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36); 600 601 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 602 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 603 604 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 605 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 606 607 608 m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 609 m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63); 610 } 611 } 612 613 /* Transpose of the 
destination 8x8 matrix done here */ 614 /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */ 615 /* respectively */ 616 { 617 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51); 618 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53); 619 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11); 620 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11); 621 622 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55); 623 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57); 624 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13); 625 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13); 626 627 m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4); 628 m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4); 629 m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5); 630 m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5); 631 632 m_temp_reg_54 = _mm_setzero_si128(); 633 m_temp_reg_55 = _mm_setzero_si128(); 634 m_temp_reg_56 = _mm_setzero_si128(); 635 m_temp_reg_57 = _mm_setzero_si128(); 636 } 637 } 638 639 /* Stage 2 */ 640 i4_shift = IDCT_STG2_SHIFT; 641 { 642 /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */ 643 /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */ 644 { 645 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[0][0]); //add 646 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[3][0]); //sub 647 648 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_54); 649 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_54); 650 651 m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); 652 m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); 653 m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); 654 m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2); 655 656 657 m_coeff1 = _mm_loadu_si128((__m128i 
*)&gai2_impeg2_idct_even_8_q11[1][0]); 658 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[2][0]); 659 } 660 661 662 /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */ 663 /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */ 664 { 665 666 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_56); 667 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_56); 668 669 670 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); 671 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); 672 m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); 673 m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_1, m_coeff2); 674 675 /* Loading coeff for computing o0 in the next block */ 676 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[0][0]); 677 678 679 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_51, m_temp_reg_53); 680 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_51, m_temp_reg_53); 681 682 683 684 /* e */ 685 686 /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */ 687 /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */ 688 /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */ 689 /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */ 690 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16); 691 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16); 692 693 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14); 694 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14); 695 696 m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17); 697 m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17); 698 699 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15); 700 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15); 701 702 } 703 704 /* o */ 705 { 706 707 /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */ 708 { 709 //o0:1B*89+3B*75,1T*89+3T*75 710 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); 711 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); 712 
713 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 714 m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000); 715 /* Loading coeff for computing o1 in the next block */ 716 m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[2][0]); 717 718 719 720 /* Column 0 of destination computed here */ 721 /* It is stored in m_temp_reg_50 */ 722 /* Column 7 of destination computed here */ 723 /* It is stored in m_temp_reg_57 */ 724 725 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); 726 m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); 727 728 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31); 729 m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31); 730 731 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor); 732 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor); 733 m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor); 734 m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor); 735 736 //o1:1B*75-3B*18,1T*75-3T*18 737 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_0, m_coeff3); 738 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); 739 740 m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift); 741 m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift); 742 m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift); 743 m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift); 744 745 m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3); 746 m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7); 747 748 749 /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */ 750 751 752 /* Loading coeff for computing o2 in the next block */ 753 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[4][0]); 754 755 756 757 /* Column 1 of destination computed here */ 758 /* It is stored in m_temp_reg_51 */ 759 /* Column 6 of destination computed here */ 760 /* It is stored in m_temp_reg_56 */ 761 762 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32); 763 m_temp_reg_6 = 
_mm_sub_epi32(m_temp_reg_42, m_temp_reg_32); 764 765 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33); 766 m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33); 767 768 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor); 769 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor); 770 m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor); 771 m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor); 772 773 //o2:1B*50-3B*89,5T*18+7T*75. 774 m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); 775 m_temp_reg_35 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); 776 777 m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift); 778 m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift); 779 m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift); 780 m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift); 781 782 m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3); 783 m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7); 784 785 786 /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */ 787 788 /* Loading coeff for computing o3 in the next block */ 789 790 m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[6][0]); 791 792 793 /* Column 2 of destination computed here */ 794 /* It is stored in m_temp_reg_52 */ 795 /* Column 5 of destination computed here */ 796 /* It is stored in m_temp_reg_55 */ 797 798 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34); 799 m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34); 800 801 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35); 802 m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35); 803 804 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor); 805 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor); 806 m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor); 807 m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor); 808 809 //o3:1B*18-3B*50,1T*18-3T*50 810 m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_0, m_coeff3); 811 m_temp_reg_37 = 
_mm_madd_epi16(m_temp_reg_1, m_coeff3); 812 813 m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift); 814 m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift); 815 m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift); 816 m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift); 817 818 819 m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3); 820 m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7); 821 822 823 824 /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */ 825 826 827 /* Column 3 of destination computed here */ 828 /* It is stored in m_temp_reg_53 */ 829 /* Column 4 of destination computed here */ 830 /* It is stored in m_temp_reg_54 */ 831 832 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36); 833 m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36); 834 835 m_temp_reg_21 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37); 836 m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37); 837 838 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_rdng_factor); 839 m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_rdng_factor); 840 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_rdng_factor); 841 m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_rdng_factor); 842 843 m_temp_reg_20 = _mm_srai_epi32(m_temp_reg_20, i4_shift); 844 m_temp_reg_21 = _mm_srai_epi32(m_temp_reg_21, i4_shift); 845 m_temp_reg_22 = _mm_srai_epi32(m_temp_reg_22, i4_shift); 846 m_temp_reg_23 = _mm_srai_epi32(m_temp_reg_23, i4_shift); 847 848 m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21); 849 m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23); 850 } 851 } 852 853 /* Transpose of the destination 8x8 matrix done here */ 854 /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */ 855 /* respectively */ 856 { 857 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51); 858 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53); 859 m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51); 860 m_temp_reg_15 = 
_mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53); 861 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11); 862 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11); 863 m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15); 864 m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15); 865 866 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55); 867 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57); 868 m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55); 869 m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57); 870 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13); 871 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13); 872 m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17); 873 m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17); 874 m_temp_reg_10 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4); 875 m_temp_reg_11 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4); 876 m_temp_reg_12 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5); 877 m_temp_reg_13 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5); 878 879 m_temp_reg_14 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6); 880 m_temp_reg_15 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6); 881 m_temp_reg_16 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7); 882 m_temp_reg_17 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7); 883 } 884 885 /* Recon and store */ 886 { 887 m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred); 888 pu1_pred += pred_strd; 889 m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred); 890 pu1_pred += pred_strd; 891 m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred); 892 pu1_pred += pred_strd; 893 m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred); 894 pu1_pred += pred_strd; 895 m_temp_reg_4 = _mm_loadl_epi64((__m128i *)pu1_pred); 896 pu1_pred += pred_strd; 897 m_temp_reg_5 = _mm_loadl_epi64((__m128i *)pu1_pred); 898 pu1_pred += pred_strd; 899 m_temp_reg_6 = 
_mm_loadl_epi64((__m128i *)pu1_pred); 900 pu1_pred += pred_strd; 901 m_temp_reg_7 = _mm_loadl_epi64((__m128i *)pu1_pred); 902 903 m_temp_reg_50 = _mm_setzero_si128(); 904 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, m_temp_reg_50); 905 m_temp_reg_1 = _mm_unpacklo_epi8(m_temp_reg_1, m_temp_reg_50); 906 m_temp_reg_2 = _mm_unpacklo_epi8(m_temp_reg_2, m_temp_reg_50); 907 m_temp_reg_3 = _mm_unpacklo_epi8(m_temp_reg_3, m_temp_reg_50); 908 m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_50); 909 m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_50); 910 m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_6, m_temp_reg_50); 911 m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_7, m_temp_reg_50); 912 913 m_temp_reg_50 = _mm_add_epi16(m_temp_reg_10, m_temp_reg_0); 914 m_temp_reg_51 = _mm_add_epi16(m_temp_reg_11, m_temp_reg_1); 915 m_temp_reg_52 = _mm_add_epi16(m_temp_reg_12, m_temp_reg_2); 916 m_temp_reg_53 = _mm_add_epi16(m_temp_reg_13, m_temp_reg_3); 917 m_temp_reg_54 = _mm_add_epi16(m_temp_reg_14, m_temp_reg_4); 918 m_temp_reg_55 = _mm_add_epi16(m_temp_reg_15, m_temp_reg_5); 919 m_temp_reg_56 = _mm_add_epi16(m_temp_reg_16, m_temp_reg_6); 920 m_temp_reg_57 = _mm_add_epi16(m_temp_reg_17, m_temp_reg_7); 921 922 m_temp_reg_50 = _mm_packus_epi16(m_temp_reg_50, m_temp_reg_50); 923 m_temp_reg_51 = _mm_packus_epi16(m_temp_reg_51, m_temp_reg_51); 924 m_temp_reg_52 = _mm_packus_epi16(m_temp_reg_52, m_temp_reg_52); 925 m_temp_reg_53 = _mm_packus_epi16(m_temp_reg_53, m_temp_reg_53); 926 m_temp_reg_54 = _mm_packus_epi16(m_temp_reg_54, m_temp_reg_54); 927 m_temp_reg_55 = _mm_packus_epi16(m_temp_reg_55, m_temp_reg_55); 928 m_temp_reg_56 = _mm_packus_epi16(m_temp_reg_56, m_temp_reg_56); 929 m_temp_reg_57 = _mm_packus_epi16(m_temp_reg_57, m_temp_reg_57); 930 931 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_50); 932 pu1_dst += dst_strd; 933 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_51); 934 pu1_dst += dst_strd; 935 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_52); 936 
pu1_dst += dst_strd; 937 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_53); 938 pu1_dst += dst_strd; 939 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_54); 940 pu1_dst += dst_strd; 941 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_55); 942 pu1_dst += dst_strd; 943 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_56); 944 pu1_dst += dst_strd; 945 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_57); 946 pu1_dst += dst_strd; 947 } 948 } 949 } 950 else 951 952 { 953 954 /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */ 955 /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */ 956 if(!check_row_stage_1) 957 { 958 /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */ 959 /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */ 960 { 961 //Interleaving 0,4 row in 0 , 1 Rishab 962 /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/ 963 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[3][0]); 964 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[0][0]); 965 966 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74); 967 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74); 968 969 m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); 970 m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); 971 972 973 m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); 974 m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2); 975 } 976 977 978 /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */ 979 /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */ 980 { 981 982 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83 983 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36 984 985 /* Combining instructions to eliminate them based on zero_rows : Lokesh */ 986 //Interleaving 2,6 row 
in 4, 5 Rishab 987 m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); 988 m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); 989 990 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1); 991 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2); 992 993 m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_5, m_coeff1); 994 m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_5, m_coeff2); 995 996 997 998 /* Loading coeff for computing o0, o1, o2 and o3 in the next block */ 999 1000 m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[2][0]); 1001 //m_coeff4 = _mm_loadu_si128((__m128i *) &gai2_impeg2_idct_odd_8_q15[3][0]); 1002 1003 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[0][0]); 1004 //m_coeff2 = _mm_loadu_si128((__m128i *) &gai2_impeg2_idct_odd_8_q15[1][0]); 1005 1006 } 1007 1008 /* e */ 1009 { 1010 /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */ 1011 /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */ 1012 /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */ 1013 /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */ 1014 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16); 1015 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16); 1016 1017 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14); 1018 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14); 1019 1020 m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17); 1021 m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17); 1022 1023 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15); 1024 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15); 1025 1026 } 1027 1028 /* o */ 1029 { 1030 1031 /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */ 1032 { 1033 1034 m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); 1035 m_temp_reg_61 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73); 1036 //o0:1B*89+3B*75,1T*89+3T*75 1037 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_60, m_coeff1); 1038 m_temp_reg_31 = 
_mm_madd_epi16(m_temp_reg_61, m_coeff1); 1039 1040 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 1041 m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000); 1042 1043 } 1044 1045 /* Column 0 of destination computed here */ 1046 /* It is stored in m_temp_reg_50 */ 1047 /* Column 7 of destination computed here */ 1048 /* It is stored in m_temp_reg_57 */ 1049 { 1050 1051 1052 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); 1053 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); 1054 1055 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31); 1056 m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31); 1057 1058 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 1059 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor); 1060 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 1061 m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor); 1062 1063 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 1064 m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift); 1065 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 1066 m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift); 1067 1068 //o1:1B*75-3B*18,1T*75-3T*18,5B*89+7B*50,5T*89+7T*50 1069 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_60, m_coeff3); 1070 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_61, m_coeff3); 1071 1072 m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 1073 m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67); 1074 1075 /* Loading coeff for computing o2 in the next block */ 1076 1077 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[4][0]); 1078 1079 } 1080 1081 /* Column 1 of destination computed here */ 1082 /* It is stored in m_temp_reg_51 */ 1083 /* Column 6 of destination computed here */ 1084 /* It is stored in m_temp_reg_56 */ 1085 { 1086 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32); 1087 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32); 1088 1089 
m_temp_reg_63 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33); 1090 m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33); 1091 1092 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 1093 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 1094 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor); 1095 m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor); 1096 1097 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 1098 m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift); 1099 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 1100 m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift); 1101 1102 //o2:1B*50-3B*89,1T*50-3T*89 1103 m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_60, m_coeff1); 1104 m_temp_reg_35 = _mm_madd_epi16(m_temp_reg_61, m_coeff1); 1105 1106 m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 1107 m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67); 1108 1109 1110 /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */ 1111 1112 1113 /* Loading coeff for computing o3 in the next block */ 1114 1115 m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[6][0]); 1116 1117 } 1118 1119 /* Column 2 of destination computed here */ 1120 /* It is stored in m_temp_reg_52 */ 1121 /* Column 5 of destination computed here */ 1122 /* It is stored in m_temp_reg_55 */ 1123 { 1124 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34); 1125 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34); 1126 1127 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35); 1128 m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35); 1129 1130 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 1131 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor); 1132 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 1133 m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor); 1134 1135 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 1136 
m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift); 1137 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 1138 m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift); 1139 1140 //o3:1B*18-3B*50,1T*18-3T*50 1141 m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_60, m_coeff3); 1142 m_temp_reg_37 = _mm_madd_epi16(m_temp_reg_61, m_coeff3); 1143 1144 m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 1145 m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67); 1146 1147 1148 1149 /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */ 1150 1151 1152 } 1153 1154 /* Column 3 of destination computed here */ 1155 /* It is stored in m_temp_reg_53 */ 1156 /* Column 4 of destination computed here */ 1157 /* It is stored in m_temp_reg_54 */ 1158 { 1159 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36); 1160 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36); 1161 1162 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37); 1163 m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37); 1164 1165 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 1166 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor); 1167 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 1168 m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor); 1169 1170 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 1171 m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift); 1172 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 1173 m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift); 1174 1175 m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 1176 m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67); 1177 } 1178 } 1179 1180 /* Transpose of the destination 8x8 matrix done here */ 1181 /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */ 1182 /* respectively */ 1183 { 1184 1185 1186 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51); 1187 m_temp_reg_11 = 
_mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53); 1188 m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51); 1189 m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53); 1190 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11); 1191 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11); 1192 m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15); 1193 m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15); 1194 1195 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55); 1196 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57); 1197 m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55); 1198 m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57); 1199 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13); 1200 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13); 1201 m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17); 1202 m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17); 1203 1204 m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4); 1205 m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4); 1206 m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5); 1207 m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5); 1208 1209 m_temp_reg_54 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6); 1210 m_temp_reg_55 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6); 1211 m_temp_reg_56 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7); 1212 m_temp_reg_57 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7); 1213 } 1214 } 1215 else 1216 { 1217 1218 /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */ 1219 /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */ 1220 { 1221 //Interleaving 0,4 row in 0 , 1 Rishab 1222 /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/ 1223 m_coeff2 = _mm_loadu_si128((__m128i 
*)&gai2_impeg2_idct_even_8_q15[3][0]); 1224 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[0][0]); 1225 1226 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74); 1227 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74); 1228 1229 m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); 1230 m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); 1231 1232 1233 m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); 1234 m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2); 1235 } 1236 1237 1238 /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */ 1239 /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */ 1240 { 1241 1242 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83 1243 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q15[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36 1244 1245 /* Combining instructions to eliminate them based on zero_rows : Lokesh */ 1246 //Interleaving 2,6 row in 4, 5 Rishab 1247 m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); 1248 m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); 1249 1250 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1); 1251 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2); 1252 1253 m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_5, m_coeff1); 1254 m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_5, m_coeff2); 1255 1256 1257 1258 /* Loading coeff for computing o0, o1, o2 and o3 in the next block */ 1259 1260 m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[2][0]); 1261 m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[3][0]); 1262 1263 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[0][0]); 1264 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[1][0]); 1265 1266 } 1267 1268 /* e */ 1269 { 1270 /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */ 1271 /* e1 stored in m_temp_reg_42 and 
m_temp_reg_43 */ 1272 /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */ 1273 /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */ 1274 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16); 1275 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16); 1276 1277 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14); 1278 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14); 1279 1280 m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17); 1281 m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17); 1282 1283 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15); 1284 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15); 1285 1286 } 1287 1288 /* o */ 1289 { 1290 1291 /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */ 1292 { 1293 1294 m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); 1295 m_temp_reg_61 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73); 1296 m_temp_reg_64 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77); 1297 m_temp_reg_65 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_77); 1298 //o0:1B*89+3B*75,1T*89+3T*75,5B*50+7B*18,5T*50+7T*18 1299 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1); 1300 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_61, m_coeff1); 1301 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2); 1302 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_65, m_coeff2); 1303 1304 1305 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 1306 m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000); 1307 1308 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24); 1309 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25); 1310 } 1311 1312 /* Column 0 of destination computed here */ 1313 /* It is stored in m_temp_reg_50 */ 1314 /* Column 7 of destination computed here */ 1315 /* It is stored in m_temp_reg_57 */ 1316 { 1317 1318 1319 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); 1320 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); 1321 1322 
m_temp_reg_63 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31); 1323 m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31); 1324 1325 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 1326 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor); 1327 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 1328 m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor); 1329 1330 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 1331 m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift); 1332 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 1333 m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift); 1334 1335 //o1:1B*75-3B*18,1T*75-3T*18,5B*89+7B*50,5T*89+7T*50 1336 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3); 1337 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4); 1338 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_61, m_coeff3); 1339 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_65, m_coeff4); 1340 1341 m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 1342 m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67); 1343 1344 /* Loading coeff for computing o2 in the next block */ 1345 1346 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[4][0]); 1347 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[5][0]); 1348 1349 /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */ 1350 m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26); 1351 m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_27); 1352 } 1353 1354 /* Column 1 of destination computed here */ 1355 /* It is stored in m_temp_reg_51 */ 1356 /* Column 6 of destination computed here */ 1357 /* It is stored in m_temp_reg_56 */ 1358 { 1359 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32); 1360 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32); 1361 1362 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33); 1363 m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33); 1364 1365 
m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 1366 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 1367 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor); 1368 m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor); 1369 1370 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 1371 m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift); 1372 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 1373 m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift); 1374 1375 //o2:1B*50-3B*89,1T*50-3T*89,5B*18+7B*75,5T*18+7T*75 1376 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1); 1377 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2); 1378 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_61, m_coeff1); 1379 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_65, m_coeff2); 1380 1381 m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 1382 m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67); 1383 1384 1385 /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */ 1386 1387 1388 /* Loading coeff for computing o3 in the next block */ 1389 1390 m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[6][0]); 1391 m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q15[7][0]); 1392 1393 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24); 1394 m_temp_reg_35 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25); 1395 } 1396 1397 /* Column 2 of destination computed here */ 1398 /* It is stored in m_temp_reg_52 */ 1399 /* Column 5 of destination computed here */ 1400 /* It is stored in m_temp_reg_55 */ 1401 { 1402 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34); 1403 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34); 1404 1405 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35); 1406 m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35); 1407 1408 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 1409 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, 
m_rdng_factor); 1410 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 1411 m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor); 1412 1413 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 1414 m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift); 1415 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 1416 m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift); 1417 1418 //o3:1B*18-3B*50,1T*18-3T*50,5B*75-7B*89,5T*75-7T*89 1419 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3); 1420 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4); 1421 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_61, m_coeff3); 1422 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_65, m_coeff4); 1423 1424 m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 1425 m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67); 1426 1427 1428 1429 /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */ 1430 1431 1432 m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26); 1433 m_temp_reg_37 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_27); 1434 } 1435 1436 /* Column 3 of destination computed here */ 1437 /* It is stored in m_temp_reg_53 */ 1438 /* Column 4 of destination computed here */ 1439 /* It is stored in m_temp_reg_54 */ 1440 { 1441 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36); 1442 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36); 1443 1444 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37); 1445 m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37); 1446 1447 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 1448 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor); 1449 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 1450 m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor); 1451 1452 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 1453 m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift); 1454 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 1455 
m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift); 1456 1457 m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 1458 m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67); 1459 } 1460 } 1461 1462 /* Transpose of the destination 8x8 matrix done here */ 1463 /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */ 1464 /* respectively */ 1465 { 1466 1467 1468 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51); 1469 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53); 1470 m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51); 1471 m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53); 1472 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11); 1473 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11); 1474 m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15); 1475 m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15); 1476 1477 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55); 1478 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57); 1479 m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55); 1480 m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57); 1481 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13); 1482 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13); 1483 m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17); 1484 m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17); 1485 1486 m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4); 1487 m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4); 1488 m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5); 1489 m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5); 1490 1491 m_temp_reg_54 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6); 1492 m_temp_reg_55 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6); 1493 m_temp_reg_56 = 
_mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7); 1494 m_temp_reg_57 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7); 1495 } 1496 } 1497 /* Stage 2 */ 1498 1499 i4_shift = IDCT_STG2_SHIFT; 1500 1501 { 1502 1503 /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */ 1504 /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */ 1505 { 1506 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[0][0]); //add 1507 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[3][0]); //sub 1508 1509 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_54); 1510 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_54); 1511 1512 m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); 1513 m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); 1514 m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); 1515 m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2); 1516 1517 1518 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[1][0]); 1519 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_even_8_q11[2][0]); 1520 } 1521 1522 1523 /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */ 1524 /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */ 1525 { 1526 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_56); 1527 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_56); 1528 1529 1530 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); 1531 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); 1532 m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); 1533 m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_1, m_coeff2); 1534 1535 /* Loading coeff for computing o0 in the next block */ 1536 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[0][0]); 1537 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[1][0]); 1538 1539 1540 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_51, m_temp_reg_53); 1541 m_temp_reg_1 = 
_mm_unpackhi_epi16(m_temp_reg_51, m_temp_reg_53); 1542 } 1543 1544 /* e */ 1545 { 1546 /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */ 1547 /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */ 1548 /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */ 1549 /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */ 1550 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16); 1551 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16); 1552 1553 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14); 1554 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14); 1555 1556 m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17); 1557 m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17); 1558 1559 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15); 1560 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15); 1561 1562 } 1563 1564 /* o */ 1565 { 1566 m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_55, m_temp_reg_57); 1567 m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_55, m_temp_reg_57); 1568 1569 /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */ 1570 { 1571 //o0:1B*89+3B*75,1T*89+3T*75,5B*50+7B*18,5T*50+7T*18 1572 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); 1573 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); 1574 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_4, m_coeff2); 1575 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_5, m_coeff2); 1576 1577 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 1578 m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000); 1579 /* Loading coeff for computing o1 in the next block */ 1580 m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[2][0]); 1581 m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[3][0]); 1582 1583 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24); 1584 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25); 1585 } 1586 1587 /* Column 0 of destination computed here */ 1588 /* It is stored in m_temp_reg_50 */ 1589 /* Column 
7 of destination computed here */ 1590 /* It is stored in m_temp_reg_57 */ 1591 { 1592 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); 1593 m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); 1594 1595 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31); 1596 m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31); 1597 1598 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor); 1599 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor); 1600 m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor); 1601 m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor); 1602 1603 m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift); 1604 m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift); 1605 m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift); 1606 m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift); 1607 1608 //o1:1B*75-3B*18,1T*75-3T*18,5B*89+7B*50,5T*89+7T*50 1609 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff3); 1610 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_4, m_coeff4); 1611 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); 1612 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_5, m_coeff4); 1613 1614 m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3); 1615 m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7); 1616 1617 1618 /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */ 1619 1620 1621 /* Loading coeff for computing o2 in the next block */ 1622 m_coeff1 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[4][0]); 1623 m_coeff2 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[5][0]); 1624 1625 m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26); 1626 m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_27); 1627 } 1628 1629 /* Column 1 of destination computed here */ 1630 /* It is stored in m_temp_reg_51 */ 1631 /* Column 6 of destination computed here */ 1632 /* It is stored in m_temp_reg_56 */ 1633 { 1634 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32); 1635 
m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32); 1636 1637 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33); 1638 m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33); 1639 1640 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor); 1641 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor); 1642 m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor); 1643 m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor); 1644 1645 m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift); 1646 m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift); 1647 m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift); 1648 m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift); 1649 1650 //o2:1B*50-3B*89,1T*50-3T*89,5B*18+7B*75,5T*18+7T*75 1651 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); 1652 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_4, m_coeff2); 1653 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); 1654 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_5, m_coeff2); 1655 1656 m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3); 1657 m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7); 1658 1659 1660 /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */ 1661 1662 /* Loading coeff for computing o3 in the next block */ 1663 1664 m_coeff3 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[6][0]); 1665 m_coeff4 = _mm_loadu_si128((__m128i *)&gai2_impeg2_idct_odd_8_q11[7][0]); 1666 1667 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24); 1668 m_temp_reg_35 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25); 1669 } 1670 1671 /* Column 2 of destination computed here */ 1672 /* It is stored in m_temp_reg_52 */ 1673 /* Column 5 of destination computed here */ 1674 /* It is stored in m_temp_reg_55 */ 1675 { 1676 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34); 1677 m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34); 1678 1679 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35); 1680 m_temp_reg_7 = 
_mm_sub_epi32(m_temp_reg_45, m_temp_reg_35); 1681 1682 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor); 1683 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor); 1684 m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor); 1685 m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor); 1686 1687 m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift); 1688 m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift); 1689 m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift); 1690 m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift); 1691 1692 //o3:1B*18-3B*50,1T*18-3T*50,5B*75-7B*89,5T*75-7T*89 1693 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff3); 1694 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_4, m_coeff4); 1695 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); 1696 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_5, m_coeff4); 1697 1698 m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3); 1699 m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7); 1700 1701 1702 1703 /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */ 1704 1705 1706 m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26); 1707 m_temp_reg_37 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_27); 1708 } 1709 1710 /* Column 3 of destination computed here */ 1711 /* It is stored in m_temp_reg_53 */ 1712 /* Column 4 of destination computed here */ 1713 /* It is stored in m_temp_reg_54 */ 1714 { 1715 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36); 1716 m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36); 1717 1718 m_temp_reg_21 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37); 1719 m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37); 1720 1721 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_rdng_factor); 1722 m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_rdng_factor); 1723 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_rdng_factor); 1724 m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_rdng_factor); 1725 1726 m_temp_reg_20 = 
_mm_srai_epi32(m_temp_reg_20, i4_shift); 1727 m_temp_reg_21 = _mm_srai_epi32(m_temp_reg_21, i4_shift); 1728 m_temp_reg_22 = _mm_srai_epi32(m_temp_reg_22, i4_shift); 1729 m_temp_reg_23 = _mm_srai_epi32(m_temp_reg_23, i4_shift); 1730 1731 m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21); 1732 m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23); 1733 } 1734 } 1735 1736 /* Transpose of the destination 8x8 matrix done here */ 1737 /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */ 1738 /* respectively */ 1739 { 1740 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51); 1741 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53); 1742 m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51); 1743 m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53); 1744 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11); 1745 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11); 1746 m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15); 1747 m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15); 1748 1749 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55); 1750 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57); 1751 m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55); 1752 m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57); 1753 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13); 1754 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13); 1755 m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17); 1756 m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17); 1757 m_temp_reg_10 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4); 1758 m_temp_reg_11 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4); 1759 m_temp_reg_12 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5); 1760 m_temp_reg_13 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5); 1761 1762 m_temp_reg_14 = 
_mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6); 1763 m_temp_reg_15 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6); 1764 m_temp_reg_16 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7); 1765 m_temp_reg_17 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7); 1766 } 1767 1768 /* Recon and store */ 1769 { 1770 m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred); 1771 pu1_pred += pred_strd; 1772 m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred); 1773 pu1_pred += pred_strd; 1774 m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred); 1775 pu1_pred += pred_strd; 1776 m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred); 1777 pu1_pred += pred_strd; 1778 m_temp_reg_4 = _mm_loadl_epi64((__m128i *)pu1_pred); 1779 pu1_pred += pred_strd; 1780 m_temp_reg_5 = _mm_loadl_epi64((__m128i *)pu1_pred); 1781 pu1_pred += pred_strd; 1782 m_temp_reg_6 = _mm_loadl_epi64((__m128i *)pu1_pred); 1783 pu1_pred += pred_strd; 1784 m_temp_reg_7 = _mm_loadl_epi64((__m128i *)pu1_pred); 1785 1786 1787 m_temp_reg_50 = _mm_setzero_si128(); 1788 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, m_temp_reg_50); 1789 m_temp_reg_1 = _mm_unpacklo_epi8(m_temp_reg_1, m_temp_reg_50); 1790 m_temp_reg_2 = _mm_unpacklo_epi8(m_temp_reg_2, m_temp_reg_50); 1791 m_temp_reg_3 = _mm_unpacklo_epi8(m_temp_reg_3, m_temp_reg_50); 1792 m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_50); 1793 m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_50); 1794 m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_6, m_temp_reg_50); 1795 m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_7, m_temp_reg_50); 1796 1797 m_temp_reg_50 = _mm_add_epi16(m_temp_reg_10, m_temp_reg_0); 1798 m_temp_reg_51 = _mm_add_epi16(m_temp_reg_11, m_temp_reg_1); 1799 m_temp_reg_52 = _mm_add_epi16(m_temp_reg_12, m_temp_reg_2); 1800 m_temp_reg_53 = _mm_add_epi16(m_temp_reg_13, m_temp_reg_3); 1801 m_temp_reg_54 = _mm_add_epi16(m_temp_reg_14, m_temp_reg_4); 1802 m_temp_reg_55 = _mm_add_epi16(m_temp_reg_15, m_temp_reg_5); 1803 m_temp_reg_56 = 
_mm_add_epi16(m_temp_reg_16, m_temp_reg_6); 1804 m_temp_reg_57 = _mm_add_epi16(m_temp_reg_17, m_temp_reg_7); 1805 1806 m_temp_reg_50 = _mm_packus_epi16(m_temp_reg_50, m_temp_reg_50); 1807 m_temp_reg_51 = _mm_packus_epi16(m_temp_reg_51, m_temp_reg_51); 1808 m_temp_reg_52 = _mm_packus_epi16(m_temp_reg_52, m_temp_reg_52); 1809 m_temp_reg_53 = _mm_packus_epi16(m_temp_reg_53, m_temp_reg_53); 1810 m_temp_reg_54 = _mm_packus_epi16(m_temp_reg_54, m_temp_reg_54); 1811 m_temp_reg_55 = _mm_packus_epi16(m_temp_reg_55, m_temp_reg_55); 1812 m_temp_reg_56 = _mm_packus_epi16(m_temp_reg_56, m_temp_reg_56); 1813 m_temp_reg_57 = _mm_packus_epi16(m_temp_reg_57, m_temp_reg_57); 1814 1815 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_50); 1816 pu1_dst += dst_strd; 1817 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_51); 1818 pu1_dst += dst_strd; 1819 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_52); 1820 pu1_dst += dst_strd; 1821 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_53); 1822 pu1_dst += dst_strd; 1823 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_54); 1824 pu1_dst += dst_strd; 1825 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_55); 1826 pu1_dst += dst_strd; 1827 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_56); 1828 pu1_dst += dst_strd; 1829 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_57); 1830 pu1_dst += dst_strd; 1831 1832 } 1833 1834 1835 } 1836 1837 1838 } 1839 } 1840 1841 void impeg2_idct_recon_dc_mismatch_sse42(WORD16 *pi2_src, 1842 WORD16 *pi2_tmp, 1843 UWORD8 *pu1_pred, 1844 UWORD8 *pu1_dst, 1845 WORD32 src_strd, 1846 WORD32 pred_strd, 1847 WORD32 dst_strd, 1848 WORD32 zero_cols, 1849 WORD32 zero_rows) 1850 { 1851 WORD32 val; 1852 __m128i value_4x32b, mismatch_stg2_additive; 1853 __m128i pred_r, pred_half0, pred_half1; 1854 __m128i temp0, temp1; 1855 __m128i round_stg2 = _mm_set1_epi32(IDCT_STG2_ROUND); 1856 1857 UNUSED(pi2_tmp); 1858 UNUSED(src_strd); 1859 UNUSED(zero_cols); 1860 UNUSED(zero_rows); 1861 1862 val = pi2_src[0] * 
gai2_impeg2_idct_q15[0]; 1863 val = ((val + IDCT_STG1_ROUND) >> IDCT_STG1_SHIFT); 1864 val *= gai2_impeg2_idct_q11[0]; 1865 value_4x32b = _mm_set1_epi32(val); 1866 1867 // Row 0 processing 1868 mismatch_stg2_additive = _mm_loadu_si128((__m128i *) gai2_impeg2_mismatch_stg2_additive); 1869 pred_r = _mm_loadl_epi64((__m128i *) pu1_pred); 1870 pred_r = _mm_cvtepu8_epi16(pred_r); 1871 temp0 = _mm_cvtepi16_epi32(mismatch_stg2_additive); 1872 mismatch_stg2_additive = _mm_srli_si128(mismatch_stg2_additive, 8); 1873 pred_half0 = _mm_cvtepu16_epi32(pred_r); 1874 temp1 = _mm_cvtepi16_epi32(mismatch_stg2_additive); 1875 1876 pred_r = _mm_srli_si128(pred_r, 8); 1877 1878 temp0 = _mm_add_epi32(temp0, value_4x32b); 1879 temp1 = _mm_add_epi32(temp1, value_4x32b); 1880 temp0 = _mm_add_epi32(temp0, round_stg2); 1881 temp1 = _mm_add_epi32(temp1, round_stg2); 1882 pred_half1 = _mm_cvtepu16_epi32(pred_r); 1883 temp0 = _mm_srai_epi32(temp0, IDCT_STG2_SHIFT); 1884 temp1 = _mm_srai_epi32(temp1, IDCT_STG2_SHIFT); 1885 temp0 = _mm_add_epi32(temp0, pred_half0); 1886 temp1 = _mm_add_epi32(temp1, pred_half1); 1887 1888 temp0 = _mm_packus_epi32(temp0, temp1); 1889 temp0 = _mm_packus_epi16(temp0, temp1); 1890 1891 _mm_storel_epi64((__m128i *)pu1_dst, temp0); 1892 1893 // Row 1 processing 1894 mismatch_stg2_additive = _mm_loadu_si128((__m128i *) (gai2_impeg2_mismatch_stg2_additive + 8)); 1895 pred_r = _mm_loadl_epi64((__m128i *) (pu1_pred + pred_strd)); 1896 pred_r = _mm_cvtepu8_epi16(pred_r); 1897 temp0 = _mm_cvtepi16_epi32(mismatch_stg2_additive); 1898 mismatch_stg2_additive = _mm_srli_si128(mismatch_stg2_additive, 8); 1899 pred_half0 = _mm_cvtepu16_epi32(pred_r); 1900 temp1 = _mm_cvtepi16_epi32(mismatch_stg2_additive); 1901 1902 pred_r = _mm_srli_si128(pred_r, 8); 1903 1904 temp0 = _mm_add_epi32(temp0, value_4x32b); 1905 temp1 = _mm_add_epi32(temp1, value_4x32b); 1906 temp0 = _mm_add_epi32(temp0, round_stg2); 1907 temp1 = _mm_add_epi32(temp1, round_stg2); 1908 pred_half1 = 
_mm_cvtepu16_epi32(pred_r); 1909 temp0 = _mm_srai_epi32(temp0, IDCT_STG2_SHIFT); 1910 temp1 = _mm_srai_epi32(temp1, IDCT_STG2_SHIFT); 1911 temp0 = _mm_add_epi32(temp0, pred_half0); 1912 temp1 = _mm_add_epi32(temp1, pred_half1); 1913 1914 temp0 = _mm_packus_epi32(temp0, temp1); 1915 temp0 = _mm_packus_epi16(temp0, temp1); 1916 1917 _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), temp0); 1918 1919 // Row 2 processing 1920 mismatch_stg2_additive = _mm_loadu_si128((__m128i *) (gai2_impeg2_mismatch_stg2_additive + 16)); 1921 pred_r = _mm_loadl_epi64((__m128i *) (pu1_pred + 2 * pred_strd)); 1922 pred_r = _mm_cvtepu8_epi16(pred_r); 1923 temp0 = _mm_cvtepi16_epi32(mismatch_stg2_additive); 1924 mismatch_stg2_additive = _mm_srli_si128(mismatch_stg2_additive, 8); 1925 pred_half0 = _mm_cvtepu16_epi32(pred_r); 1926 temp1 = _mm_cvtepi16_epi32(mismatch_stg2_additive); 1927 1928 pred_r = _mm_srli_si128(pred_r, 8); 1929 1930 temp0 = _mm_add_epi32(temp0, value_4x32b); 1931 temp1 = _mm_add_epi32(temp1, value_4x32b); 1932 temp0 = _mm_add_epi32(temp0, round_stg2); 1933 temp1 = _mm_add_epi32(temp1, round_stg2); 1934 pred_half1 = _mm_cvtepu16_epi32(pred_r); 1935 temp0 = _mm_srai_epi32(temp0, IDCT_STG2_SHIFT); 1936 temp1 = _mm_srai_epi32(temp1, IDCT_STG2_SHIFT); 1937 temp0 = _mm_add_epi32(temp0, pred_half0); 1938 temp1 = _mm_add_epi32(temp1, pred_half1); 1939 1940 temp0 = _mm_packus_epi32(temp0, temp1); 1941 temp0 = _mm_packus_epi16(temp0, temp1); 1942 1943 _mm_storel_epi64((__m128i *)(pu1_dst + 2 * dst_strd), temp0); 1944 1945 // Row 3 processing 1946 mismatch_stg2_additive = _mm_loadu_si128((__m128i *) (gai2_impeg2_mismatch_stg2_additive + 24)); 1947 pred_r = _mm_loadl_epi64((__m128i *) (pu1_pred + 3 * pred_strd)); 1948 pred_r = _mm_cvtepu8_epi16(pred_r); 1949 temp0 = _mm_cvtepi16_epi32(mismatch_stg2_additive); 1950 mismatch_stg2_additive = _mm_srli_si128(mismatch_stg2_additive, 8); 1951 pred_half0 = _mm_cvtepu16_epi32(pred_r); 1952 temp1 = _mm_cvtepi16_epi32(mismatch_stg2_additive); 
1953 1954 pred_r = _mm_srli_si128(pred_r, 8); 1955 1956 temp0 = _mm_add_epi32(temp0, value_4x32b); 1957 temp1 = _mm_add_epi32(temp1, value_4x32b); 1958 temp0 = _mm_add_epi32(temp0, round_stg2); 1959 temp1 = _mm_add_epi32(temp1, round_stg2); 1960 pred_half1 = _mm_cvtepu16_epi32(pred_r); 1961 temp0 = _mm_srai_epi32(temp0, IDCT_STG2_SHIFT); 1962 temp1 = _mm_srai_epi32(temp1, IDCT_STG2_SHIFT); 1963 temp0 = _mm_add_epi32(temp0, pred_half0); 1964 temp1 = _mm_add_epi32(temp1, pred_half1); 1965 1966 temp0 = _mm_packus_epi32(temp0, temp1); 1967 temp0 = _mm_packus_epi16(temp0, temp1); 1968 1969 _mm_storel_epi64((__m128i *)(pu1_dst + 3 * dst_strd), temp0); 1970 1971 // Row 4 processing 1972 mismatch_stg2_additive = _mm_loadu_si128((__m128i *) (gai2_impeg2_mismatch_stg2_additive + 32)); 1973 pred_r = _mm_loadl_epi64((__m128i *) (pu1_pred + 4 * pred_strd)); 1974 pred_r = _mm_cvtepu8_epi16(pred_r); 1975 temp0 = _mm_cvtepi16_epi32(mismatch_stg2_additive); 1976 mismatch_stg2_additive = _mm_srli_si128(mismatch_stg2_additive, 8); 1977 pred_half0 = _mm_cvtepu16_epi32(pred_r); 1978 temp1 = _mm_cvtepi16_epi32(mismatch_stg2_additive); 1979 1980 pred_r = _mm_srli_si128(pred_r, 8); 1981 1982 temp0 = _mm_add_epi32(temp0, value_4x32b); 1983 temp1 = _mm_add_epi32(temp1, value_4x32b); 1984 temp0 = _mm_add_epi32(temp0, round_stg2); 1985 temp1 = _mm_add_epi32(temp1, round_stg2); 1986 pred_half1 = _mm_cvtepu16_epi32(pred_r); 1987 temp0 = _mm_srai_epi32(temp0, IDCT_STG2_SHIFT); 1988 temp1 = _mm_srai_epi32(temp1, IDCT_STG2_SHIFT); 1989 temp0 = _mm_add_epi32(temp0, pred_half0); 1990 temp1 = _mm_add_epi32(temp1, pred_half1); 1991 1992 temp0 = _mm_packus_epi32(temp0, temp1); 1993 temp0 = _mm_packus_epi16(temp0, temp1); 1994 1995 _mm_storel_epi64((__m128i *)(pu1_dst + 4 * dst_strd), temp0); 1996 1997 // Row 5 processing 1998 mismatch_stg2_additive = _mm_loadu_si128((__m128i *) (gai2_impeg2_mismatch_stg2_additive + 40)); 1999 pred_r = _mm_loadl_epi64((__m128i *) (pu1_pred + 5 * pred_strd)); 2000 pred_r 
= _mm_cvtepu8_epi16(pred_r); 2001 temp0 = _mm_cvtepi16_epi32(mismatch_stg2_additive); 2002 mismatch_stg2_additive = _mm_srli_si128(mismatch_stg2_additive, 8); 2003 pred_half0 = _mm_cvtepu16_epi32(pred_r); 2004 temp1 = _mm_cvtepi16_epi32(mismatch_stg2_additive); 2005 2006 pred_r = _mm_srli_si128(pred_r, 8); 2007 2008 temp0 = _mm_add_epi32(temp0, value_4x32b); 2009 temp1 = _mm_add_epi32(temp1, value_4x32b); 2010 temp0 = _mm_add_epi32(temp0, round_stg2); 2011 temp1 = _mm_add_epi32(temp1, round_stg2); 2012 pred_half1 = _mm_cvtepu16_epi32(pred_r); 2013 temp0 = _mm_srai_epi32(temp0, IDCT_STG2_SHIFT); 2014 temp1 = _mm_srai_epi32(temp1, IDCT_STG2_SHIFT); 2015 temp0 = _mm_add_epi32(temp0, pred_half0); 2016 temp1 = _mm_add_epi32(temp1, pred_half1); 2017 2018 temp0 = _mm_packus_epi32(temp0, temp1); 2019 temp0 = _mm_packus_epi16(temp0, temp1); 2020 2021 _mm_storel_epi64((__m128i *)(pu1_dst + 5 * dst_strd), temp0); 2022 2023 // Row 6 processing 2024 mismatch_stg2_additive = _mm_loadu_si128((__m128i *) (gai2_impeg2_mismatch_stg2_additive + 48)); 2025 pred_r = _mm_loadl_epi64((__m128i *) (pu1_pred + 6 * pred_strd)); 2026 pred_r = _mm_cvtepu8_epi16(pred_r); 2027 temp0 = _mm_cvtepi16_epi32(mismatch_stg2_additive); 2028 mismatch_stg2_additive = _mm_srli_si128(mismatch_stg2_additive, 8); 2029 pred_half0 = _mm_cvtepu16_epi32(pred_r); 2030 temp1 = _mm_cvtepi16_epi32(mismatch_stg2_additive); 2031 2032 pred_r = _mm_srli_si128(pred_r, 8); 2033 2034 temp0 = _mm_add_epi32(temp0, value_4x32b); 2035 temp1 = _mm_add_epi32(temp1, value_4x32b); 2036 temp0 = _mm_add_epi32(temp0, round_stg2); 2037 temp1 = _mm_add_epi32(temp1, round_stg2); 2038 pred_half1 = _mm_cvtepu16_epi32(pred_r); 2039 temp0 = _mm_srai_epi32(temp0, IDCT_STG2_SHIFT); 2040 temp1 = _mm_srai_epi32(temp1, IDCT_STG2_SHIFT); 2041 temp0 = _mm_add_epi32(temp0, pred_half0); 2042 temp1 = _mm_add_epi32(temp1, pred_half1); 2043 2044 temp0 = _mm_packus_epi32(temp0, temp1); 2045 temp0 = _mm_packus_epi16(temp0, temp1); 2046 2047 
_mm_storel_epi64((__m128i *)(pu1_dst + 6 * dst_strd), temp0); 2048 2049 // Row 7 processing 2050 mismatch_stg2_additive = _mm_loadu_si128((__m128i *) (gai2_impeg2_mismatch_stg2_additive + 56)); 2051 pred_r = _mm_loadl_epi64((__m128i *) (pu1_pred + 7 * pred_strd)); 2052 pred_r = _mm_cvtepu8_epi16(pred_r); 2053 temp0 = _mm_cvtepi16_epi32(mismatch_stg2_additive); 2054 mismatch_stg2_additive = _mm_srli_si128(mismatch_stg2_additive, 8); 2055 pred_half0 = _mm_cvtepu16_epi32(pred_r); 2056 temp1 = _mm_cvtepi16_epi32(mismatch_stg2_additive); 2057 2058 pred_r = _mm_srli_si128(pred_r, 8); 2059 2060 temp0 = _mm_add_epi32(temp0, value_4x32b); 2061 temp1 = _mm_add_epi32(temp1, value_4x32b); 2062 temp0 = _mm_add_epi32(temp0, round_stg2); 2063 temp1 = _mm_add_epi32(temp1, round_stg2); 2064 pred_half1 = _mm_cvtepu16_epi32(pred_r); 2065 temp0 = _mm_srai_epi32(temp0, IDCT_STG2_SHIFT); 2066 temp1 = _mm_srai_epi32(temp1, IDCT_STG2_SHIFT); 2067 temp0 = _mm_add_epi32(temp0, pred_half0); 2068 temp1 = _mm_add_epi32(temp1, pred_half1); 2069 2070 temp0 = _mm_packus_epi32(temp0, temp1); 2071 temp0 = _mm_packus_epi16(temp0, temp1); 2072 2073 _mm_storel_epi64((__m128i *)(pu1_dst + 7 * dst_strd), temp0); 2074 } 2075 2076 void impeg2_idct_recon_dc_sse42(WORD16 *pi2_src, 2077 WORD16 *pi2_tmp, 2078 UWORD8 *pu1_pred, 2079 UWORD8 *pu1_dst, 2080 WORD32 src_strd, 2081 WORD32 pred_strd, 2082 WORD32 dst_strd, 2083 WORD32 zero_cols, 2084 WORD32 zero_rows) 2085 { 2086 WORD32 val; 2087 __m128i value_4x32b, pred_r0, pred_r1, temp0, temp1, temp2, temp3; 2088 2089 UNUSED(pi2_tmp); 2090 UNUSED(src_strd); 2091 UNUSED(zero_cols); 2092 UNUSED(zero_rows); 2093 2094 val = pi2_src[0] * gai2_impeg2_idct_q15[0]; 2095 val = ((val + IDCT_STG1_ROUND) >> IDCT_STG1_SHIFT); 2096 val = val * gai2_impeg2_idct_q11[0]; 2097 val = ((val + IDCT_STG2_ROUND) >> IDCT_STG2_SHIFT); 2098 2099 value_4x32b = _mm_set1_epi32(val); 2100 2101 //Row 0-1 processing 2102 pred_r0 = _mm_loadl_epi64((__m128i *) pu1_pred); 2103 pred_r1 = 
_mm_loadl_epi64((__m128i *) (pu1_pred + pred_strd)); 2104 pred_r0 = _mm_cvtepu8_epi16(pred_r0); 2105 pred_r1 = _mm_cvtepu8_epi16(pred_r1); 2106 2107 temp0 = _mm_cvtepu16_epi32(pred_r0); 2108 pred_r0 = _mm_srli_si128(pred_r0, 8); 2109 temp2 = _mm_cvtepu16_epi32(pred_r1); 2110 pred_r1 = _mm_srli_si128(pred_r1, 8); 2111 temp1 = _mm_cvtepu16_epi32(pred_r0); 2112 temp3 = _mm_cvtepu16_epi32(pred_r1); 2113 2114 temp0 = _mm_add_epi32(temp0, value_4x32b); 2115 temp2 = _mm_add_epi32(temp2, value_4x32b); 2116 temp1 = _mm_add_epi32(temp1, value_4x32b); 2117 temp3 = _mm_add_epi32(temp3, value_4x32b); 2118 temp0 = _mm_packus_epi32(temp0, temp1); 2119 temp2 = _mm_packus_epi32(temp2, temp3); 2120 temp0 = _mm_packus_epi16(temp0, temp1); 2121 temp2 = _mm_packus_epi16(temp2, temp3); 2122 _mm_storel_epi64((__m128i *)(pu1_dst), temp0); 2123 _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), temp2); 2124 2125 //Row 2-3 processing 2126 pu1_pred += 2 * pred_strd; 2127 pu1_dst += 2 * dst_strd; 2128 2129 pred_r0 = _mm_loadl_epi64((__m128i *) pu1_pred); 2130 pred_r1 = _mm_loadl_epi64((__m128i *) (pu1_pred + pred_strd)); 2131 pred_r0 = _mm_cvtepu8_epi16(pred_r0); 2132 pred_r1 = _mm_cvtepu8_epi16(pred_r1); 2133 2134 temp0 = _mm_cvtepu16_epi32(pred_r0); 2135 pred_r0 = _mm_srli_si128(pred_r0, 8); 2136 temp2 = _mm_cvtepu16_epi32(pred_r1); 2137 pred_r1 = _mm_srli_si128(pred_r1, 8); 2138 temp1 = _mm_cvtepu16_epi32(pred_r0); 2139 temp3 = _mm_cvtepu16_epi32(pred_r1); 2140 2141 temp0 = _mm_add_epi32(temp0, value_4x32b); 2142 temp2 = _mm_add_epi32(temp2, value_4x32b); 2143 temp1 = _mm_add_epi32(temp1, value_4x32b); 2144 temp3 = _mm_add_epi32(temp3, value_4x32b); 2145 temp0 = _mm_packus_epi32(temp0, temp1); 2146 temp2 = _mm_packus_epi32(temp2, temp3); 2147 temp0 = _mm_packus_epi16(temp0, temp1); 2148 temp2 = _mm_packus_epi16(temp2, temp3); 2149 _mm_storel_epi64((__m128i *)(pu1_dst), temp0); 2150 _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), temp2); 2151 2152 //Row 4-5 processing 2153 pu1_pred += 2 * 
pred_strd; 2154 pu1_dst += 2 * dst_strd; 2155 2156 pred_r0 = _mm_loadl_epi64((__m128i *) pu1_pred); 2157 pred_r1 = _mm_loadl_epi64((__m128i *) (pu1_pred + pred_strd)); 2158 pred_r0 = _mm_cvtepu8_epi16(pred_r0); 2159 pred_r1 = _mm_cvtepu8_epi16(pred_r1); 2160 2161 temp0 = _mm_cvtepu16_epi32(pred_r0); 2162 pred_r0 = _mm_srli_si128(pred_r0, 8); 2163 temp2 = _mm_cvtepu16_epi32(pred_r1); 2164 pred_r1 = _mm_srli_si128(pred_r1, 8); 2165 temp1 = _mm_cvtepu16_epi32(pred_r0); 2166 temp3 = _mm_cvtepu16_epi32(pred_r1); 2167 2168 temp0 = _mm_add_epi32(temp0, value_4x32b); 2169 temp2 = _mm_add_epi32(temp2, value_4x32b); 2170 temp1 = _mm_add_epi32(temp1, value_4x32b); 2171 temp3 = _mm_add_epi32(temp3, value_4x32b); 2172 temp0 = _mm_packus_epi32(temp0, temp1); 2173 temp2 = _mm_packus_epi32(temp2, temp3); 2174 temp0 = _mm_packus_epi16(temp0, temp1); 2175 temp2 = _mm_packus_epi16(temp2, temp3); 2176 _mm_storel_epi64((__m128i *)(pu1_dst), temp0); 2177 _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), temp2); 2178 2179 //Row 6-7 processing 2180 pu1_pred += 2 * pred_strd; 2181 pu1_dst += 2 * dst_strd; 2182 2183 pred_r0 = _mm_loadl_epi64((__m128i *) pu1_pred); 2184 pred_r1 = _mm_loadl_epi64((__m128i *) (pu1_pred + pred_strd)); 2185 pred_r0 = _mm_cvtepu8_epi16(pred_r0); 2186 pred_r1 = _mm_cvtepu8_epi16(pred_r1); 2187 2188 temp0 = _mm_cvtepu16_epi32(pred_r0); 2189 pred_r0 = _mm_srli_si128(pred_r0, 8); 2190 temp2 = _mm_cvtepu16_epi32(pred_r1); 2191 pred_r1 = _mm_srli_si128(pred_r1, 8); 2192 temp1 = _mm_cvtepu16_epi32(pred_r0); 2193 temp3 = _mm_cvtepu16_epi32(pred_r1); 2194 2195 temp0 = _mm_add_epi32(temp0, value_4x32b); 2196 temp2 = _mm_add_epi32(temp2, value_4x32b); 2197 temp1 = _mm_add_epi32(temp1, value_4x32b); 2198 temp3 = _mm_add_epi32(temp3, value_4x32b); 2199 temp0 = _mm_packus_epi32(temp0, temp1); 2200 temp2 = _mm_packus_epi32(temp2, temp3); 2201 temp0 = _mm_packus_epi16(temp0, temp1); 2202 temp2 = _mm_packus_epi16(temp2, temp3); 2203 _mm_storel_epi64((__m128i *)(pu1_dst), temp0); 
2204 _mm_storel_epi64((__m128i *)(pu1_dst + dst_strd), temp2); 2205 } 2206