1 /****************************************************************************** 2 * 3 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore 4 * 5 * Licensed under the Apache License, Version 2.0 (the "License"); 6 * you may not use this file except in compliance with the License. 7 * You may obtain a copy of the License at: 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 * 17 ******************************************************************************/ 18 /** 19 ******************************************************************************* 20 * @file 21 * ihevc_itrans_recon_32x32_atom_intr.c 22 * 23 * @brief 24 * Contains function definitions for inverse quantization, inverse 25 * transform and reconstruction 26 * 27 * @author 28 * 100470 29 * 30 * @par List of Functions: 31 * - ihevc_itrans_recon_32x32_ssse3() 32 * 33 * @remarks 34 * None 35 * 36 ******************************************************************************* 37 */ 38 #include <stdio.h> 39 #include <string.h> 40 #include "ihevc_typedefs.h" 41 #include "ihevc_platform_macros.h" 42 #include "ihevc_macros.h" 43 #include "ihevc_defs.h" 44 #include "ihevc_trans_tables.h" 45 #include "ihevc_iquant_itrans_recon.h" 46 #include "ihevc_func_selector.h" 47 #include "ihevc_trans_macros.h" 48 49 50 51 52 #include <immintrin.h> 53 #include <emmintrin.h> 54 55 #include <tmmintrin.h> 56 57 58 59 /** 60 ******************************************************************************* 61 * 62 * @brief 63 * This function performs inverse transform and 64 * reconstruction for 32x32 input block 65 * 66 * @par Description: 67 * Performs inverse 
transform and adds the 68 * prediction data and clips output to 8 bit 69 * 70 * @param[in] pi2_src 71 * Input 32x32 coefficients; rows are read with aligned 72 * 128-bit loads 73 * 74 * @param[in] pi2_tmp 75 * Temporary buffer for storing inverse transform 1st stage 76 * output; its leading 2048 WORD16 entries are set aside 77 * as scratch space 78 * 79 * @param[in] pu1_pred 80 * Prediction 32x32 block 81 * 82 * @param[out] pu1_dst 83 * Output 32x32 block 84 * 85 * @param[in] src_strd 86 * Input stride, in WORD16 elements 87 * 88 * @param[in] pred_strd 89 * Prediction stride 90 * 91 * @param[in] dst_strd 92 * Output Stride 93 * 94 * @param[in] zero_cols 95 * Bitmask of zero columns in pi2_src; when bits 4-31 96 * or bits 8-31 are all set, the 1st stage loop handles 97 * a single group of 8 columns 98 * 99 * @param[in] zero_rows 100 * Bitmask of zero rows in pi2_src; when bits 4-31 101 * or bits 8-31 are all set, the 1st stage takes a 102 * reduced-computation path 103 * 104 * @returns Void 105 * 106 * @remarks 107 * None 108 * 109 ******************************************************************************* 110 */ 111 /**/ 112 113 void ihevc_itrans_recon_32x32_ssse3(WORD16 *pi2_src, 114 WORD16 *pi2_tmp, 115 UWORD8 *pu1_pred, 116 UWORD8 *pu1_dst, 117 WORD32 src_strd, 118 WORD32 pred_strd, 119 WORD32 dst_strd, 120 WORD32 zero_cols, 121 WORD32 zero_rows) 122 { 123 /* Inverse Transform */ 124 125 WORD32 j; 126 127 128 WORD16 *pi2_tmp_orig; 129 130 131 /*MEM_ALIGN16 WORD32 temp_array[1024]; 132 MEM_ALIGN16 WORD16 temp1_array[1024];*/ 133 WORD16 *o_temp_ptr; 134 WORD16 *temp_ptr; 135 136 __m128i m_temp_reg_0; 137 __m128i m_temp_reg_1; 138 __m128i m_temp_reg_2; 139 __m128i m_temp_reg_3; 140 __m128i m_temp_reg_4; 141 __m128i m_temp_reg_5; 142 __m128i m_temp_reg_6; 143 __m128i m_temp_reg_7; 144 __m128i m_temp_reg_10; 145 __m128i m_temp_reg_11; 146 __m128i m_temp_reg_12; 147 __m128i m_temp_reg_13; 148 __m128i m_temp_reg_14; 149 __m128i m_temp_reg_15; 150 __m128i m_temp_reg_16; 151 __m128i m_temp_reg_17; 152 __m128i m_temp_reg_18; 153 __m128i m_temp_reg_19; 154 __m128i m_temp_reg_20; 155 __m128i m_temp_reg_21; 156 __m128i m_temp_reg_22; 157 __m128i m_temp_reg_23; 158 __m128i 
m_temp_reg_30; 159 __m128i m_temp_reg_31; 160 __m128i m_temp_reg_32; 161 __m128i m_temp_reg_33; 162 __m128i m_temp_reg_34; 163 __m128i m_temp_reg_35; 164 __m128i m_temp_reg_36; 165 __m128i m_temp_reg_37; 166 __m128i m_temp_reg_40; 167 __m128i m_temp_reg_41; 168 __m128i m_temp_reg_42; 169 __m128i m_temp_reg_43; 170 __m128i m_temp_reg_44; 171 __m128i m_temp_reg_45; 172 __m128i m_temp_reg_46; 173 __m128i m_temp_reg_47; 174 175 __m128i m_temp_reg_70; 176 __m128i m_temp_reg_71; 177 __m128i m_temp_reg_72; 178 __m128i m_temp_reg_73; 179 __m128i m_temp_reg_74; 180 __m128i m_temp_reg_75; 181 __m128i m_temp_reg_76; 182 __m128i m_temp_reg_77; 183 184 __m128i m_temp_reg_80; 185 __m128i m_temp_reg_81; 186 __m128i m_temp_reg_82; 187 __m128i m_temp_reg_83; 188 __m128i m_temp_reg_84; 189 __m128i m_temp_reg_85; 190 __m128i m_temp_reg_86; 191 __m128i m_temp_reg_87; 192 193 __m128i m_temp_reg_90; 194 __m128i m_temp_reg_91; 195 __m128i m_temp_reg_92; 196 __m128i m_temp_reg_93; 197 __m128i m_temp_reg_94; 198 __m128i m_temp_reg_95; 199 __m128i m_temp_reg_96; 200 __m128i m_temp_reg_97; 201 202 __m128i m_rdng_factor; 203 __m128i m_count; 204 __m128i m_coeff1, m_coeff2, m_coeff3, m_coeff4; 205 __m128i m_coeff5, m_coeff6, m_coeff7, m_coeff8; 206 207 __m128i temp1, temp2, temp3, temp4; 208 __m128i temp5, temp6, temp7, temp8; 209 210 __m128i all_zero_reg; 211 WORD32 i; 212 213 /*Lokesh*/ 214 WORD32 zero_last24_cols_stg1; 215 WORD32 zero_last24_rows_stg1; 216 WORD32 zero_last28_rows_stg1; 217 218 WORD32 zero_last28_rows_stg2; 219 WORD32 zero_last24_rows_stg2; 220 221 WORD32 trans_size_stg1; 222 223 WORD32 i4_shift = IT_SHIFT_STAGE_1; 224 WORD32 trans_size = TRANS_SIZE_32; 225 226 227 /* Last 8 cols of 16x16 block are skipped based on the below flag : Lokesh */ 228 zero_last24_cols_stg1 = ((zero_cols & 0xFFFFFF00) == 0xFFFFFF00) ? 1 : 0; 229 zero_last24_rows_stg1 = ((zero_rows & 0xFFFFFF00) == 0xFFFFFF00) ? 1 : 0; 230 zero_last28_rows_stg1 = ((zero_rows & 0xFFFFFFF0) == 0xFFFFFFF0) ? 
1 : 0; 231 232 zero_last28_rows_stg2 = ((zero_cols & 0xFFFFFFF0) == 0xFFFFFFF0) ? 1 : 0; 233 zero_last24_rows_stg2 = zero_last24_cols_stg1; 234 235 if((zero_last28_rows_stg2) || (zero_last24_cols_stg1)) 236 { 237 trans_size_stg1 = 8; 238 239 } 240 else 241 { 242 trans_size_stg1 = 32; 243 } 244 245 all_zero_reg = _mm_setzero_si128(); 246 247 o_temp_ptr = pi2_tmp; 248 temp_ptr = (pi2_tmp + 1024); 249 250 pi2_tmp += 2048; 251 pi2_tmp_orig = pi2_tmp; 252 253 for(i = 0; i < trans_size_stg1; i += 8) 254 { 255 256 257 { 258 WORD16 *pi2_tmp_src = pi2_src; 259 260 m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_tmp_src); 261 pi2_tmp_src += (src_strd << 1); 262 m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_tmp_src); 263 pi2_tmp_src += (src_strd << 1); 264 m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_tmp_src); 265 pi2_tmp_src += (src_strd << 1); 266 m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_tmp_src); 267 pi2_tmp_src += (src_strd << 1); 268 m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_tmp_src); 269 pi2_tmp_src += (src_strd << 1); 270 m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_tmp_src); 271 pi2_tmp_src += (src_strd << 1); 272 m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_tmp_src); 273 pi2_tmp_src += (src_strd << 1); 274 m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_tmp_src); 275 pi2_tmp_src += (src_strd << 1); 276 277 m_temp_reg_80 = _mm_load_si128((__m128i *)pi2_tmp_src); 278 pi2_tmp_src += (src_strd << 1); 279 m_temp_reg_81 = _mm_load_si128((__m128i *)pi2_tmp_src); 280 pi2_tmp_src += (src_strd << 1); 281 m_temp_reg_82 = _mm_load_si128((__m128i *)pi2_tmp_src); 282 pi2_tmp_src += (src_strd << 1); 283 m_temp_reg_83 = _mm_load_si128((__m128i *)pi2_tmp_src); 284 pi2_tmp_src += (src_strd << 1); 285 m_temp_reg_84 = _mm_load_si128((__m128i *)pi2_tmp_src); 286 pi2_tmp_src += (src_strd << 1); 287 m_temp_reg_85 = _mm_load_si128((__m128i *)pi2_tmp_src); 288 pi2_tmp_src += (src_strd << 1); 289 m_temp_reg_86 = _mm_load_si128((__m128i *)pi2_tmp_src); 290 pi2_tmp_src += (src_strd << 
1); 291 m_temp_reg_87 = _mm_load_si128((__m128i *)pi2_tmp_src); 292 } 293 294 if(zero_last28_rows_stg1) 295 { 296 /* eeo */ 297 /* eeeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */ 298 /* eeeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */ 299 { 300 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64 301 302 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg); 303 304 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); 305 306 /* eeeo[0]= m_temp_reg_20 */ 307 /* eeeo[1]= m_temp_reg_21 */ 308 /* eeee[0]= m_temp_reg_22 */ 309 /* eeee[1]= m_temp_reg_23 */ 310 311 /* eee[0] = eeee[0] + eeeo[0]; */ 312 m_temp_reg_40 = m_temp_reg_14; 313 314 /* eee[3] = eeee[0] - eeeo[0]; */ 315 m_temp_reg_43 = m_temp_reg_14; 316 317 /* eee[2] = eeee[1] - eeeo[1]; */ 318 m_temp_reg_42 = m_temp_reg_14; //m_temp_reg_16; 319 320 /* eee[1] = eeee[1] + eeeo[1];*/ 321 m_temp_reg_41 = m_temp_reg_14; //m_temp_reg_16; 322 323 m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8); 324 325 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg); 326 327 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); 328 329 /* eeeo[0]= m_temp_reg_20 */ 330 /* eeeo[1]= m_temp_reg_21 */ 331 /* eeee[0]= m_temp_reg_22 */ 332 /* eeee[1]= m_temp_reg_23 */ 333 334 /* eee[0] = eeee[0] + eeeo[0]; */ 335 m_temp_reg_44 = m_temp_reg_14; 336 337 /* eee[3] = eeee[0] - eeeo[0]; */ 338 m_temp_reg_47 = m_temp_reg_14; 339 340 /* eee[2] = eeee[1] - eeeo[1]; */ 341 m_temp_reg_46 = m_temp_reg_14; //m_temp_reg_16; 342 343 /* eee[1] = eeee[1] + eeeo[1];*/ 344 m_temp_reg_45 = m_temp_reg_14; //m_temp_reg_16; 345 346 347 } 348 /* eo */ 349 { 350 WORD16 *pi2_scratch = o_temp_ptr; 351 352 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 353 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87 354 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80 355 m_coeff4 = _mm_load_si128((__m128i 
*)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70 356 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57 357 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43 358 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25 359 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //9 360 361 //m_temp_reg_10 = _mm_cvtepi16_epi32(m_temp_reg_71); 362 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, all_zero_reg); 363 364 m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8); 365 366 /* eo0[0-3] */ 367 { 368 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 369 370 //m_temp_reg_14 = _mm_cvtepi16_epi32(m_temp_reg_71); 371 m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_71, all_zero_reg); 372 373 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); 374 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); 375 376 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 377 pi2_scratch += 8; 378 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 379 pi2_scratch += 8; 380 381 } 382 383 /* eo0[4-7] */ 384 { 385 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 386 387 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30); 388 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30); 389 390 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 391 pi2_scratch += 8; 392 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 393 pi2_scratch += 8; 394 395 } 396 /* eo1[0-3] */ 397 { 398 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff2); 399 400 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30); 401 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30); 402 403 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 404 pi2_scratch += 8; 405 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 406 pi2_scratch += 8; 407 408 } 409 410 /* eo1[4-7] */ 411 { 412 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, 
m_coeff2); 413 414 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30); 415 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30); 416 417 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 418 pi2_scratch += 8; 419 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 420 pi2_scratch += 8; 421 422 } 423 424 /* eo2[0-3] */ 425 { 426 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 427 428 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30); 429 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30); 430 431 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 432 pi2_scratch += 8; 433 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 434 pi2_scratch += 8; 435 436 } 437 438 /* eo2[4-7] */ 439 { 440 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3); 441 442 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30); 443 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30); 444 445 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 446 pi2_scratch += 8; 447 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 448 pi2_scratch += 8; 449 450 } 451 452 /**************************************************************************/ 453 454 455 /* eo3[0-3] */ 456 { 457 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff4); 458 459 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30); 460 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30); 461 462 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 463 pi2_scratch += 8; 464 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 465 pi2_scratch += 8; 466 467 } 468 469 /* eo3[4-7] */ 470 { 471 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff4); 472 473 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30); 474 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30); 475 476 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 477 pi2_scratch += 8; 478 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 479 
pi2_scratch += 8; 480 481 } 482 483 484 /* eo4[0-3] */ 485 { 486 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 487 488 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30); 489 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30); 490 491 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 492 pi2_scratch += 8; 493 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 494 pi2_scratch += 8; 495 496 } 497 /* eo4[4-7] */ 498 { 499 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff5); 500 501 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30); 502 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30); 503 504 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 505 pi2_scratch += 8; 506 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 507 pi2_scratch += 8; 508 509 } 510 511 /***********************************************************************/ 512 513 /* eo5[0-3] */ 514 { 515 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff6); 516 517 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30); 518 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30); 519 520 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 521 pi2_scratch += 8; 522 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 523 pi2_scratch += 8; 524 525 } 526 527 528 /* eo5[4-7] */ 529 { 530 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff6); 531 532 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30); 533 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30); 534 535 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 536 pi2_scratch += 8; 537 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 538 pi2_scratch += 8; 539 540 } 541 542 /* eo6[0-3] */ 543 { 544 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff7); 545 546 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30); 547 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30); 548 549 _mm_store_si128((__m128i *)pi2_scratch, 
m_temp_reg_34); 550 pi2_scratch += 8; 551 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 552 pi2_scratch += 8; 553 554 } 555 556 557 /* eo6[4-7] */ 558 { 559 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff7); 560 561 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30); 562 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30); 563 564 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 565 pi2_scratch += 8; 566 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 567 pi2_scratch += 8; 568 569 } 570 571 572 /* eo7[0-3] */ 573 { 574 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff8); 575 576 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); 577 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); 578 579 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 580 pi2_scratch += 8; 581 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 582 pi2_scratch += 8; 583 584 } 585 586 587 /* eo7[4-7] */ 588 { 589 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff8); 590 591 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30); 592 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30); 593 594 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 595 pi2_scratch += 8; 596 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 597 pi2_scratch += 8; 598 599 } 600 601 } 602 } 603 else if(zero_last24_rows_stg1) 604 { 605 { 606 /* eeo */ 607 /* eeeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */ 608 /* eeeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */ 609 610 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83 36 611 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36 -83 612 613 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64 64 614 615 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg); 616 617 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); 618 619 /* eeeo[0]= m_temp_reg_20 */ 
620 /* eeeo[1]= m_temp_reg_21 */ 621 /* eeee[0]= m_temp_reg_22 */ 622 /* eeee[1]= m_temp_reg_23 */ 623 624 /* eee[0] = eeee[0] + eeeo[0]; */ 625 m_temp_reg_40 = m_temp_reg_14; 626 627 /* eee[3] = eeee[0] - eeeo[0]; */ 628 m_temp_reg_43 = m_temp_reg_14; 629 630 /* eee[2] = eeee[1] - eeeo[1]; */ 631 m_temp_reg_42 = m_temp_reg_14; //m_temp_reg_16; 632 633 /* eee[1] = eeee[1] + eeeo[1];*/ 634 m_temp_reg_41 = m_temp_reg_14; //m_temp_reg_16; 635 636 /* for row 4 to 7 */ 637 638 m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8); 639 640 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg); 641 642 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); 643 644 /* eeeo[0]= m_temp_reg_20 */ 645 /* eeeo[1]= m_temp_reg_21 */ 646 /* eeee[0]= m_temp_reg_22 */ 647 /* eeee[1]= m_temp_reg_23 */ 648 649 /* eee[0] = eeee[0] + eeeo[0]; */ 650 m_temp_reg_44 = m_temp_reg_14; 651 652 /* eee[3] = eeee[0] - eeeo[0]; */ 653 m_temp_reg_47 = m_temp_reg_14; 654 655 /* eee[2] = eeee[1] - eeeo[1]; */ 656 m_temp_reg_46 = m_temp_reg_14; //m_temp_reg_16; 657 658 /* eee[1] = eeee[1] + eeeo[1];*/ 659 m_temp_reg_45 = m_temp_reg_14; //m_temp_reg_16; 660 661 662 /* eeo[] */ 663 /* for(k = 0; k < 4; k++) */ 664 665 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75 666 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75 667 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18 668 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[5][0]); //50 18 669 670 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, all_zero_reg); 671 672 m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8); 673 674 m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_72, all_zero_reg); 675 676 m_temp_reg_33 = _mm_setzero_si128(); 677 678 /* eeo */ 679 { 680 /* eeo0[0-3] */ 681 { 682 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 683 684 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); 685 
m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); 686 687 m_temp_reg_90 = m_temp_reg_34; 688 m_temp_reg_97 = m_temp_reg_35; 689 } 690 /* eeo0[4-7] */ 691 { 692 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 693 694 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30); 695 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30); 696 697 m_temp_reg_91 = m_temp_reg_34; 698 m_temp_reg_96 = m_temp_reg_35; 699 700 } 701 702 /* eeo1[0-3] */ 703 { 704 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff2); 705 706 /* e[1][0-3] stored in pi2_tmp[2][0-7] */ 707 /* e[6][0-3] stored in pi2_tmp[2][8-15] */ 708 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30); 709 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30); 710 711 m_temp_reg_92 = m_temp_reg_34; 712 m_temp_reg_95 = m_temp_reg_35; 713 714 } 715 716 /* eo1[4-7] */ 717 { 718 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff2); 719 720 /* e[1][4-7] stored in pi2_tmp[3][0-7] */ 721 /* e[6][4-7] stored in pi2_tmp[3][8-15] */ 722 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30); 723 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30); 724 725 m_temp_reg_93 = m_temp_reg_34; 726 m_temp_reg_94 = m_temp_reg_35; 727 728 729 } 730 731 /* eo2[0-3] */ 732 { 733 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff4); 734 735 /* e[2][0-3] stored in pi2_tmp[4][0-7] */ 736 /* e[5][0-3] stored in pi2_tmp[4][8-15] */ 737 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30); 738 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30); 739 740 temp1 = m_temp_reg_34; 741 temp7 = m_temp_reg_35; 742 743 } 744 745 /* eo2[4-7] */ 746 { 747 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff4); 748 749 /* e[2][4-7] stored in pi2_tmp[5][0-7] */ 750 /* e[5][4-7] stored in pi2_tmp[5][8-15] */ 751 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30); 752 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30); 753 754 temp2 = m_temp_reg_34; 755 temp6 = 
m_temp_reg_35; 756 757 } 758 759 /* eo3[0-3] */ 760 { 761 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 762 763 /* e[3][0-3] stored in pi2_tmp[6][0-7] */ 764 /* e[4][0-3] stored in pi2_tmp[6][8-15] */ 765 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30); 766 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30); 767 768 temp3 = m_temp_reg_34; 769 temp5 = m_temp_reg_35; 770 771 } 772 773 774 /* eo3[4-7] */ 775 { 776 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3); 777 778 /* e[3][4-7] stored in pi2_tmp[7][0-7] */ 779 /* e[4][4-7] stored in pi2_tmp[7][8-15] */ 780 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30); 781 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30); 782 783 temp4 = m_temp_reg_34; 784 temp8 = m_temp_reg_35; 785 786 787 } 788 /* All values of ee[] array in pi2_temp */ 789 790 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87 791 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[1][0]); //80 70 792 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57 43 793 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25 9 794 795 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); 796 797 m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8); 798 m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8); 799 800 } 801 } 802 /* eo */ 803 { 804 WORD16 *pi2_scratch = o_temp_ptr; 805 806 /* eo0[0-3] */ 807 { 808 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 809 810 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_30); 811 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_30); 812 813 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 814 pi2_scratch += 8; 815 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 816 pi2_scratch += 8; 817 818 } 819 820 821 /* eo0[4-7] */ 822 { 823 m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); 824 825 
m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 826 827 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_30); 828 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_30); 829 830 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 831 pi2_scratch += 8; 832 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 833 pi2_scratch += 8; 834 835 } 836 837 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87 57 838 839 /* eo1[0-3] */ 840 { 841 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 842 843 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_30); 844 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_30); 845 846 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 847 pi2_scratch += 8; 848 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 849 pi2_scratch += 8; 850 851 } 852 853 854 /* eo1[4-7] */ 855 { 856 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 857 858 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_30); 859 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_30); 860 861 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 862 pi2_scratch += 8; 863 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 864 pi2_scratch += 8; 865 866 } 867 868 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80 9 869 870 /* eo2[0-3] */ 871 { 872 873 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 874 875 m_temp_reg_34 = _mm_add_epi32(temp1, m_temp_reg_30); 876 m_temp_reg_35 = _mm_sub_epi32(temp1, m_temp_reg_30); 877 878 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 879 pi2_scratch += 8; 880 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 881 pi2_scratch += 8; 882 883 } 884 885 /* eo2[4-7] */ 886 { 887 888 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 889 890 m_temp_reg_34 = _mm_add_epi32(temp2, m_temp_reg_30); 891 m_temp_reg_35 = _mm_sub_epi32(temp2, m_temp_reg_30); 892 893 
_mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 894 pi2_scratch += 8; 895 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 896 pi2_scratch += 8; 897 898 } 899 900 /**************************************************************************/ 901 902 903 904 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70 -43 905 906 /* eo3[0-3] */ 907 { 908 909 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 910 911 m_temp_reg_34 = _mm_add_epi32(temp3, m_temp_reg_30); 912 m_temp_reg_35 = _mm_sub_epi32(temp3, m_temp_reg_30); 913 914 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 915 pi2_scratch += 8; 916 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 917 pi2_scratch += 8; 918 919 } 920 921 922 /* eo3[4-7] */ 923 { 924 925 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 926 927 m_temp_reg_34 = _mm_add_epi32(temp4, m_temp_reg_30); 928 m_temp_reg_35 = _mm_sub_epi32(temp4, m_temp_reg_30); 929 930 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 931 pi2_scratch += 8; 932 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 933 pi2_scratch += 8; 934 935 } 936 937 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57 -80 938 939 /* eo4[0-3] */ 940 { 941 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 942 943 m_temp_reg_34 = _mm_add_epi32(temp5, m_temp_reg_30); 944 m_temp_reg_35 = _mm_sub_epi32(temp5, m_temp_reg_30); 945 946 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 947 pi2_scratch += 8; 948 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 949 pi2_scratch += 8; 950 951 } 952 /* eo4[4-7] */ 953 { 954 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 955 956 m_temp_reg_34 = _mm_add_epi32(temp8, m_temp_reg_30); 957 m_temp_reg_35 = _mm_sub_epi32(temp8, m_temp_reg_30); 958 959 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 960 pi2_scratch += 8; 961 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 962 pi2_scratch += 8; 
963 964 } 965 966 /***********************************************************************/ 967 968 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43 -90 969 970 /* eo5[0-3] */ 971 { 972 973 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 974 975 m_temp_reg_34 = _mm_add_epi32(temp7, m_temp_reg_30); 976 m_temp_reg_35 = _mm_sub_epi32(temp7, m_temp_reg_30); 977 978 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 979 pi2_scratch += 8; 980 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 981 pi2_scratch += 8; 982 983 } 984 985 986 /* eo5[4-7] */ 987 { 988 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 989 990 m_temp_reg_34 = _mm_add_epi32(temp6, m_temp_reg_30); 991 m_temp_reg_35 = _mm_sub_epi32(temp6, m_temp_reg_30); 992 993 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 994 pi2_scratch += 8; 995 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 996 pi2_scratch += 8; 997 998 } 999 1000 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25 -70 1001 1002 /* eo6[0-3] */ 1003 { 1004 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 1005 1006 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_30); 1007 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_30); 1008 1009 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 1010 pi2_scratch += 8; 1011 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 1012 pi2_scratch += 8; 1013 1014 } 1015 1016 1017 /* eo6[4-7] */ 1018 { 1019 1020 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 1021 1022 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_30); 1023 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_30); 1024 1025 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 1026 pi2_scratch += 8; 1027 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 1028 pi2_scratch += 8; 1029 1030 } 1031 1032 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); 
//9 -25 1033 1034 /* eo7[0-3] */ 1035 { 1036 1037 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 1038 1039 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_30); 1040 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_30); 1041 1042 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 1043 pi2_scratch += 8; 1044 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 1045 pi2_scratch += 8; 1046 1047 } 1048 1049 1050 /* eo7[4-7] */ 1051 { 1052 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 1053 1054 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_30); 1055 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_30); 1056 1057 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 1058 pi2_scratch += 8; 1059 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 1060 pi2_scratch += 8; 1061 1062 } 1063 1064 } 1065 1066 } 1067 else 1068 { 1069 1070 { 1071 /* eeo */ 1072 /* eeeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */ 1073 /* eeeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */ 1074 1075 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83 36 1076 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36 -83 1077 1078 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64 64 1079 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[3][0]); //64 -64 1080 1081 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_84); 1082 1083 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_80); 1084 1085 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); /* eeeo[0] */ 1086 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); /* eeeo[1] */ 1087 1088 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); /* eeee[0] */ 1089 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff4); /* eeee[1] */ 1090 1091 1092 /* eeeo[0]= m_temp_reg_20 */ 1093 /* eeeo[1]= m_temp_reg_21 */ 1094 /* eeee[0]= m_temp_reg_22 */ 1095 /* 
eeee[1]= m_temp_reg_23 */ 1096 1097 /* eee[0] = eeee[0] + eeeo[0]; */ 1098 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20); /* eeeo[0] */ 1099 1100 /* eee[3] = eeee[0] - eeeo[0]; */ 1101 m_temp_reg_43 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20); /* eeeo[1] */ 1102 1103 /* eee[2] = eeee[1] - eeeo[1]; */ 1104 m_temp_reg_42 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_22); /* eeee[1] */ 1105 1106 /* eee[1] = eeee[1] + eeeo[1];*/ 1107 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_22); /* eeee[0] */ 1108 1109 /* for row 4 to 7 */ 1110 1111 m_temp_reg_74 = _mm_srli_si128(m_temp_reg_74, 8); 1112 m_temp_reg_84 = _mm_srli_si128(m_temp_reg_84, 8); 1113 1114 /* Interleaving row 8 and row 24*/ 1115 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_84); 1116 1117 m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8); 1118 m_temp_reg_80 = _mm_srli_si128(m_temp_reg_80, 8); 1119 1120 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_80); 1121 1122 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); /* eeeo[0] */ 1123 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); /* eeeo[1] */ 1124 1125 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); /* eeee[0] */ 1126 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff4); /* eeee[1] */ 1127 1128 1129 /* eeeo[0]= m_temp_reg_20 */ 1130 /* eeeo[1]= m_temp_reg_21 */ 1131 /* eeee[0]= m_temp_reg_22 */ 1132 /* eeee[1]= m_temp_reg_23 */ 1133 1134 /* eee[0] = eeee[0] + eeeo[0]; */ 1135 m_temp_reg_44 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20); /* eeeo[0] */ 1136 1137 /* eee[3] = eeee[0] - eeeo[0]; */ 1138 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20); /* eeeo[1] */ 1139 1140 /* eee[2] = eeee[1] - eeeo[1]; */ 1141 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_22); /* eeee[1] */ 1142 1143 /* eee[1] = eeee[1] + eeeo[1];*/ 1144 m_temp_reg_45 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_22); /* eeee[0] */ 1145 1146 1147 // eeo[] 1148 /* for(k = 0; k < 4; k++) */ 1149 1150 1151 
m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75 1152 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[5][0]); //50 18 1153 1154 /* eeo */ 1155 { 1156 /* eeo0[0-3] */ 1157 { 1158 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); 1159 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_86); 1160 1161 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 1162 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 1163 1164 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 1165 1166 m_temp_reg_90 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); 1167 m_temp_reg_97 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); 1168 1169 } 1170 1171 m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8); 1172 m_temp_reg_76 = _mm_srli_si128(m_temp_reg_76, 8); 1173 m_temp_reg_82 = _mm_srli_si128(m_temp_reg_82, 8); 1174 m_temp_reg_86 = _mm_srli_si128(m_temp_reg_86, 8); 1175 1176 /* eeo0[4-7] */ 1177 { 1178 m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); 1179 m_temp_reg_15 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_86); 1180 1181 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 1182 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2); 1183 1184 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 1185 1186 m_temp_reg_91 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30); 1187 m_temp_reg_96 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30); 1188 1189 } 1190 1191 1192 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75 -18 1193 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[7][0]); //89 50 1194 1195 /* eeo1[0-3] */ 1196 { 1197 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 1198 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4); 1199 1200 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_30); 1201 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_30); 1202 1203 m_temp_reg_92 = 
_mm_sub_epi32(m_temp_reg_34, m_temp_reg_31); 1204 m_temp_reg_95 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_31); 1205 1206 } 1207 1208 /* eeo1[4-7] */ 1209 { 1210 1211 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3); 1212 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff4); 1213 1214 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_30); 1215 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_30); 1216 1217 m_temp_reg_93 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_31); 1218 m_temp_reg_94 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_31); 1219 1220 1221 } 1222 1223 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[8][0]); //50 -89 1224 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18 75 1225 1226 /* eeo2[0-3] */ 1227 { 1228 1229 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 1230 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4); 1231 1232 /* e[2][0-3] stored in pi2_tmp[4][0-7] */ 1233 /* e[5][0-3] stored in pi2_tmp[4][8-15] */ 1234 1235 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30); 1236 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30); 1237 1238 temp1 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31); 1239 temp7 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31); 1240 1241 } 1242 1243 /* eeo2[4-7] */ 1244 { 1245 1246 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3); 1247 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff4); 1248 1249 /* e[2][4-7] stored in pi2_tmp[5][0-7] */ 1250 /* e[5][4-7] stored in pi2_tmp[5][8-15] */ 1251 1252 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30); 1253 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30); 1254 1255 temp2 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31); 1256 temp6 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31); 1257 1258 } 1259 1260 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[10][0]); //18 -50 1261 m_coeff4 = _mm_load_si128((__m128i 
*)&g_ai2_ihevc_trans_16_even_packed[11][0]); //75 -89 1262 1263 /* eeo3[0-3] */ 1264 { 1265 1266 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 1267 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4); 1268 1269 /* e[3][0-3] stored in pi2_tmp[6][0-7] */ 1270 /* e[4][0-3] stored in pi2_tmp[6][8-15] */ 1271 1272 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_30); 1273 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_30); 1274 1275 temp3 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31); 1276 temp5 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31); 1277 1278 1279 } 1280 1281 /* eeo3[4-7] */ 1282 { 1283 1284 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff3); 1285 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff4); 1286 1287 /* e[3][4-7] stored in pi2_tmp[7][0-7] */ 1288 /* e[4][4-7] stored in pi2_tmp[7][8-15] */ 1289 1290 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_30); 1291 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_30); 1292 temp4 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_31); 1293 temp8 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_31); 1294 1295 } 1296 1297 1298 /* All values of ee[] array in pi2_temp */ 1299 1300 /* for(k = 0; k < 8; k++) */ 1301 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87 1302 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[1][0]); //80 70 1303 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57 43 1304 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25 9 1305 } 1306 } 1307 /* eo */ 1308 { 1309 WORD16 *pi2_scratch = o_temp_ptr; 1310 1311 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); 1312 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77); 1313 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_81, m_temp_reg_83); 1314 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_85, m_temp_reg_87); 1315 1316 m_temp_reg_71 = 
_mm_srli_si128(m_temp_reg_71, 8); 1317 m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8); 1318 m_temp_reg_75 = _mm_srli_si128(m_temp_reg_75, 8); 1319 m_temp_reg_77 = _mm_srli_si128(m_temp_reg_77, 8); 1320 1321 m_temp_reg_81 = _mm_srli_si128(m_temp_reg_81, 8); 1322 m_temp_reg_83 = _mm_srli_si128(m_temp_reg_83, 8); 1323 m_temp_reg_85 = _mm_srli_si128(m_temp_reg_85, 8); 1324 m_temp_reg_87 = _mm_srli_si128(m_temp_reg_87, 8); 1325 1326 /* eo0[0-3] */ 1327 { 1328 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 1329 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 1330 1331 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 1332 1333 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 1334 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 1335 1336 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); 1337 1338 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 1339 1340 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_30); 1341 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_30); 1342 1343 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 1344 pi2_scratch += 8; 1345 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 1346 pi2_scratch += 8; 1347 1348 } 1349 /* eo0[4-7] */ 1350 { 1351 m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); 1352 m_temp_reg_15 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77); 1353 m_temp_reg_16 = _mm_unpacklo_epi16(m_temp_reg_81, m_temp_reg_83); 1354 m_temp_reg_17 = _mm_unpacklo_epi16(m_temp_reg_85, m_temp_reg_87); 1355 1356 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 1357 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2); 1358 1359 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 1360 1361 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3); 1362 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4); 1363 1364 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); 1365 1366 m_temp_reg_30 = 
_mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 1367 1368 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_30); 1369 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_30); 1370 1371 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 1372 pi2_scratch += 8; 1373 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 1374 pi2_scratch += 8; 1375 1376 } 1377 1378 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87 57 1379 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //0 -43 1380 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80 90 1381 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70 25 1382 1383 /* eo1[0-3] */ 1384 { 1385 1386 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 1387 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 1388 1389 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 1390 1391 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 1392 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 1393 1394 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); 1395 1396 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_32); 1397 1398 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_30); 1399 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_30); 1400 1401 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 1402 pi2_scratch += 8; 1403 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 1404 pi2_scratch += 8; 1405 1406 } 1407 1408 /* eo1[4-7] */ 1409 { 1410 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 1411 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2); 1412 1413 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 1414 1415 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3); 1416 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4); 1417 1418 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); 1419 
1420 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_32); 1421 1422 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_30); 1423 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_30); 1424 1425 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 1426 pi2_scratch += 8; 1427 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 1428 pi2_scratch += 8; 1429 1430 } 1431 1432 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80 9 1433 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[9][0]); //70 87 1434 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[10][0]); //-25 57 1435 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[11][0]); //90 43 1436 1437 /* eo2[0-3] */ 1438 { 1439 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 1440 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 1441 1442 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31); 1443 1444 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 1445 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 1446 1447 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); 1448 1449 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 1450 1451 m_temp_reg_34 = _mm_add_epi32(temp1, m_temp_reg_30); 1452 m_temp_reg_35 = _mm_sub_epi32(temp1, m_temp_reg_30); 1453 1454 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 1455 pi2_scratch += 8; 1456 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 1457 pi2_scratch += 8; 1458 1459 } 1460 1461 1462 /* eo2[4-7] */ 1463 { 1464 1465 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 1466 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2); 1467 1468 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31); 1469 1470 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3); 1471 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4); 1472 1473 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, 
m_temp_reg_33); 1474 1475 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 1476 1477 m_temp_reg_34 = _mm_add_epi32(temp2, m_temp_reg_30); 1478 m_temp_reg_35 = _mm_sub_epi32(temp2, m_temp_reg_30); 1479 1480 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 1481 pi2_scratch += 8; 1482 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 1483 pi2_scratch += 8; 1484 1485 } 1486 /**************************************************************************/ 1487 1488 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70 -43 1489 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[13][0]); //-87 9 1490 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[14][0]); //90 25 1491 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[15][0]); //80 57 1492 1493 /* eo3[0-3] */ 1494 { 1495 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 1496 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 1497 1498 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 1499 1500 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 1501 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 1502 1503 m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_32, m_temp_reg_33); 1504 1505 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 1506 1507 m_temp_reg_34 = _mm_add_epi32(temp3, m_temp_reg_30); 1508 m_temp_reg_35 = _mm_sub_epi32(temp3, m_temp_reg_30); 1509 1510 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 1511 pi2_scratch += 8; 1512 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 1513 pi2_scratch += 8; 1514 1515 } 1516 1517 1518 /* eo3[4-7] */ 1519 { 1520 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 1521 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2); 1522 1523 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 1524 1525 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3); 1526 m_temp_reg_33 = 
_mm_madd_epi16(m_temp_reg_17, m_coeff4); 1527 1528 m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_32, m_temp_reg_33); 1529 1530 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 1531 1532 m_temp_reg_34 = _mm_add_epi32(temp4, m_temp_reg_30); 1533 m_temp_reg_35 = _mm_sub_epi32(temp4, m_temp_reg_30); 1534 1535 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 1536 pi2_scratch += 8; 1537 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 1538 pi2_scratch += 8; 1539 1540 } 1541 1542 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57 -80 1543 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[17][0]); //-25 90 1544 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[18][0]); //9 87 1545 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43 70 1546 1547 /* eo4[0-3] */ 1548 { 1549 1550 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 1551 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 1552 1553 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 1554 1555 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 1556 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 1557 1558 m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32); 1559 1560 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 1561 1562 m_temp_reg_34 = _mm_add_epi32(temp5, m_temp_reg_30); 1563 m_temp_reg_35 = _mm_sub_epi32(temp5, m_temp_reg_30); 1564 1565 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 1566 pi2_scratch += 8; 1567 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 1568 pi2_scratch += 8; 1569 1570 } 1571 1572 1573 /* eo4[4-7] */ 1574 { 1575 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 1576 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2); 1577 1578 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 1579 1580 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3); 1581 m_temp_reg_33 = 
_mm_madd_epi16(m_temp_reg_17, m_coeff4); 1582 1583 m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32); 1584 1585 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 1586 1587 m_temp_reg_34 = _mm_add_epi32(temp8, m_temp_reg_30); 1588 m_temp_reg_35 = _mm_sub_epi32(temp8, m_temp_reg_30); 1589 1590 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 1591 pi2_scratch += 8; 1592 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 1593 pi2_scratch += 8; 1594 1595 } 1596 1597 /***********************************************************************/ 1598 1599 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43 -90 1600 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[21][0]); //57 25 1601 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[22][0]); //-87 70 1602 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[23][0]); //9 -80 1603 1604 /* eo5[0-3] */ 1605 { 1606 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 1607 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 1608 1609 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 1610 1611 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 1612 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 1613 1614 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); 1615 1616 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 1617 1618 m_temp_reg_34 = _mm_add_epi32(temp7, m_temp_reg_30); 1619 m_temp_reg_35 = _mm_sub_epi32(temp7, m_temp_reg_30); 1620 1621 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 1622 pi2_scratch += 8; 1623 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 1624 pi2_scratch += 8; 1625 1626 } 1627 1628 1629 /* eo5[4-7] */ 1630 { 1631 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 1632 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2); 1633 1634 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 1635 1636 
m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3); 1637 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4); 1638 1639 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); 1640 1641 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 1642 1643 m_temp_reg_34 = _mm_add_epi32(temp6, m_temp_reg_30); 1644 m_temp_reg_35 = _mm_sub_epi32(temp6, m_temp_reg_30); 1645 1646 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 1647 pi2_scratch += 8; 1648 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 1649 pi2_scratch += 8; 1650 1651 } 1652 1653 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25 -70 1654 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[25][0]); //90 -80 1655 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[26][0]); //43 9 1656 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[27][0]); //-57 87 1657 1658 /* eo6[0-3] */ 1659 { 1660 1661 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 1662 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 1663 1664 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 1665 1666 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 1667 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 1668 1669 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); 1670 1671 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 1672 1673 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_30); 1674 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_30); 1675 1676 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 1677 pi2_scratch += 8; 1678 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 1679 pi2_scratch += 8; 1680 1681 } 1682 1683 1684 /* eo6[4-7] */ 1685 { 1686 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 1687 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2); 1688 1689 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, 
m_temp_reg_31); 1690 1691 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3); 1692 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4); 1693 1694 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); 1695 1696 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 1697 1698 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_30); 1699 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_30); 1700 1701 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 1702 pi2_scratch += 8; 1703 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 1704 pi2_scratch += 8; 1705 1706 } 1707 1708 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); //9 -25 1709 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[29][0]); //43 -57 1710 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[30][0]); //70 -80 1711 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[31][0]); //87 -90 1712 1713 /* eo7[0-3] */ 1714 { 1715 1716 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 1717 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 1718 1719 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 1720 1721 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 1722 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 1723 1724 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); 1725 1726 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 1727 1728 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_30); 1729 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_30); 1730 1731 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 1732 pi2_scratch += 8; 1733 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 1734 pi2_scratch += 8; 1735 1736 } 1737 1738 1739 /* eo7[4-7] */ 1740 { 1741 1742 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 1743 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_15, m_coeff2); 1744 1745 
m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 1746 1747 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_16, m_coeff3); 1748 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_17, m_coeff4); 1749 1750 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); 1751 1752 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 1753 1754 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_30); 1755 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_30); 1756 1757 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_34); 1758 pi2_scratch += 8; 1759 _mm_store_si128((__m128i *)pi2_scratch, m_temp_reg_35); 1760 pi2_scratch += 8; 1761 1762 } 1763 1764 } 1765 1766 } 1767 /* All e[] are done */ 1768 /****************************/ 1769 1770 1771 { 1772 1773 WORD16 *pi2_tmp_src = pi2_src + src_strd; 1774 1775 m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_tmp_src); 1776 pi2_tmp_src += (src_strd << 1); 1777 m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_tmp_src); 1778 pi2_tmp_src += (src_strd << 1); 1779 m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_tmp_src); 1780 pi2_tmp_src += (src_strd << 1); 1781 m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_tmp_src); 1782 pi2_tmp_src += (src_strd << 1); 1783 m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_tmp_src); 1784 pi2_tmp_src += (src_strd << 1); 1785 m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_tmp_src); 1786 pi2_tmp_src += (src_strd << 1); 1787 m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_tmp_src); 1788 pi2_tmp_src += (src_strd << 1); 1789 m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_tmp_src); 1790 pi2_tmp_src += (src_strd << 1); 1791 1792 m_temp_reg_80 = _mm_load_si128((__m128i *)pi2_tmp_src); 1793 pi2_tmp_src += (src_strd << 1); 1794 m_temp_reg_81 = _mm_load_si128((__m128i *)pi2_tmp_src); 1795 pi2_tmp_src += (src_strd << 1); 1796 m_temp_reg_82 = _mm_load_si128((__m128i *)pi2_tmp_src); 1797 pi2_tmp_src += (src_strd << 1); 1798 m_temp_reg_83 = _mm_load_si128((__m128i *)pi2_tmp_src); 1799 pi2_tmp_src += (src_strd << 1); 
1800 m_temp_reg_84 = _mm_load_si128((__m128i *)pi2_tmp_src); 1801 pi2_tmp_src += (src_strd << 1); 1802 m_temp_reg_85 = _mm_load_si128((__m128i *)pi2_tmp_src); 1803 pi2_tmp_src += (src_strd << 1); 1804 m_temp_reg_86 = _mm_load_si128((__m128i *)pi2_tmp_src); 1805 pi2_tmp_src += (src_strd << 1); 1806 m_temp_reg_87 = _mm_load_si128((__m128i *)pi2_tmp_src); 1807 } 1808 1809 if(zero_last28_rows_stg1) 1810 { 1811 /* o & stage 1 out */ 1812 { 1813 WORD32 j; 1814 WORD16 *pi2_src_scratch = o_temp_ptr; 1815 WORD16 *pi2_dst_scratch = temp_ptr; 1816 WORD32 out_stride = (trans_size << 1); 1817 WORD32 in_stride = trans_size; 1818 1819 for(j = 0; j < 2; j++) 1820 { 1821 if(j) 1822 { 1823 m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8); 1824 m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8); 1825 } 1826 1827 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); 1828 1829 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]); 1830 1831 /* o0[0-3] */ 1832 { 1833 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 1834 1835 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 1836 pi2_src_scratch += in_stride; 1837 1838 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 1839 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 1840 1841 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 1842 m_count = _mm_cvtsi32_si128(i4_shift); 1843 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 1844 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 1845 1846 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 1847 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 1848 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 1849 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 1850 1851 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 1852 1853 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 1854 pi2_dst_scratch += out_stride; 1855 1856 } 
1857 1858 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]); 1859 1860 /* o1[0-3] */ 1861 { 1862 1863 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 1864 1865 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 1866 pi2_src_scratch += in_stride; 1867 1868 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 1869 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 1870 1871 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 1872 m_count = _mm_cvtsi32_si128(i4_shift); 1873 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 1874 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 1875 1876 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 1877 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 1878 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 1879 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 1880 1881 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 1882 1883 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 1884 pi2_dst_scratch += out_stride; 1885 1886 } 1887 1888 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]); 1889 1890 /* o2[0-3] */ 1891 { 1892 1893 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 1894 1895 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 1896 pi2_src_scratch += in_stride; 1897 1898 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 1899 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 1900 1901 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 1902 m_count = _mm_cvtsi32_si128(i4_shift); 1903 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 1904 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 1905 1906 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 1907 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 1908 m_temp_reg_31 = 
_mm_sra_epi32(m_temp_reg_31, m_count); 1909 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 1910 1911 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 1912 1913 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 1914 pi2_dst_scratch += out_stride; 1915 1916 } 1917 1918 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]); 1919 1920 /* o3[0-3] */ 1921 { 1922 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 1923 1924 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 1925 pi2_src_scratch += in_stride; 1926 1927 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 1928 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 1929 1930 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 1931 m_count = _mm_cvtsi32_si128(i4_shift); 1932 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 1933 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 1934 1935 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 1936 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 1937 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 1938 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 1939 1940 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 1941 1942 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 1943 pi2_dst_scratch += out_stride; 1944 1945 } 1946 1947 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]); 1948 1949 /* o4[0-3] */ 1950 { 1951 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 1952 1953 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 1954 pi2_src_scratch += in_stride; 1955 1956 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 1957 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 1958 1959 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 1960 m_count = _mm_cvtsi32_si128(i4_shift); 1961 m_rdng_factor = 
_mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 1962 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 1963 1964 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 1965 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 1966 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 1967 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 1968 1969 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 1970 1971 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 1972 pi2_dst_scratch += out_stride; 1973 1974 } 1975 1976 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]); 1977 1978 /* o5[0-3] */ 1979 { 1980 1981 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 1982 1983 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 1984 pi2_src_scratch += in_stride; 1985 1986 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 1987 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 1988 1989 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 1990 m_count = _mm_cvtsi32_si128(i4_shift); 1991 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 1992 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 1993 1994 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 1995 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 1996 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 1997 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 1998 1999 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2000 2001 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2002 pi2_dst_scratch += out_stride; 2003 2004 } 2005 2006 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]); 2007 2008 /* o6[0-3] */ 2009 { 2010 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2011 2012 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 2013 pi2_src_scratch += in_stride; 
2014 2015 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2016 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2017 2018 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2019 m_count = _mm_cvtsi32_si128(i4_shift); 2020 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 2021 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 2022 2023 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2024 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2025 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2026 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2027 2028 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2029 2030 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2031 pi2_dst_scratch += out_stride; 2032 2033 } 2034 2035 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]); 2036 2037 /* o7[0-3] */ 2038 { 2039 2040 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2041 2042 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 2043 pi2_src_scratch += 8; 2044 2045 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2046 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2047 2048 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2049 m_count = _mm_cvtsi32_si128(i4_shift); 2050 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 2051 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 2052 2053 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2054 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2055 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2056 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2057 2058 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2059 2060 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2061 pi2_dst_scratch += 8; 2062 2063 } 2064 2065 m_coeff1 = 
_mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]); 2066 2067 /* o8[0-3] */ 2068 { 2069 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2070 2071 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 2072 pi2_src_scratch -= in_stride; 2073 2074 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2075 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2076 2077 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2078 m_count = _mm_cvtsi32_si128(i4_shift); 2079 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 2080 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 2081 2082 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2083 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2084 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2085 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2086 2087 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2088 2089 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2090 pi2_dst_scratch -= out_stride; 2091 } 2092 2093 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]); 2094 2095 /* o9[0-3] */ 2096 { 2097 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2098 2099 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 2100 pi2_src_scratch -= in_stride; 2101 2102 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2103 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2104 2105 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2106 m_count = _mm_cvtsi32_si128(i4_shift); 2107 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 2108 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 2109 2110 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2111 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2112 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2113 
m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2114 2115 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2116 2117 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2118 pi2_dst_scratch -= out_stride; 2119 } 2120 2121 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]); 2122 2123 /* o10[0-3] */ 2124 { 2125 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2126 2127 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 2128 pi2_src_scratch -= in_stride; 2129 2130 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2131 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2132 2133 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2134 m_count = _mm_cvtsi32_si128(i4_shift); 2135 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 2136 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 2137 2138 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2139 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2140 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2141 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2142 2143 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2144 2145 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2146 pi2_dst_scratch -= out_stride; 2147 } 2148 2149 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]); 2150 2151 /* o11[0-3] */ 2152 { 2153 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2154 2155 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 2156 pi2_src_scratch -= in_stride; 2157 2158 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2159 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2160 2161 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2162 m_count = _mm_cvtsi32_si128(i4_shift); 2163 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 2164 m_rdng_factor 
= _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 2165 2166 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2167 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2168 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2169 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2170 2171 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2172 2173 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2174 pi2_dst_scratch -= out_stride; 2175 2176 } 2177 2178 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]); 2179 2180 /* o12[0-3] */ 2181 { 2182 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2183 2184 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 2185 pi2_src_scratch -= in_stride; 2186 2187 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2188 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2189 2190 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2191 m_count = _mm_cvtsi32_si128(i4_shift); 2192 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 2193 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 2194 2195 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2196 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2197 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2198 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2199 2200 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2201 2202 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2203 pi2_dst_scratch -= out_stride; 2204 2205 } 2206 2207 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]); 2208 2209 /* o13[0-3] */ 2210 { 2211 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2212 2213 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 2214 pi2_src_scratch -= in_stride; 2215 2216 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 
2217 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2218 2219 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2220 m_count = _mm_cvtsi32_si128(i4_shift); 2221 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 2222 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 2223 2224 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2225 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2226 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2227 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2228 2229 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2230 2231 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2232 pi2_dst_scratch -= out_stride; 2233 } 2234 2235 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]); 2236 2237 /* o14[0-3] */ 2238 { 2239 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2240 2241 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 2242 pi2_src_scratch -= in_stride; 2243 2244 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2245 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2246 2247 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2248 m_count = _mm_cvtsi32_si128(i4_shift); 2249 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 2250 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 2251 2252 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2253 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2254 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2255 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2256 2257 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2258 2259 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2260 pi2_dst_scratch -= out_stride; 2261 2262 } 2263 2264 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]); 2265 
2266 /* o15[0-3] */ 2267 { 2268 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2269 2270 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 2271 pi2_src_scratch += 8; 2272 2273 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2274 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2275 2276 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2277 m_count = _mm_cvtsi32_si128(i4_shift); 2278 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 2279 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 2280 2281 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2282 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2283 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2284 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2285 2286 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2287 2288 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2289 pi2_dst_scratch += 8; 2290 } 2291 2292 } 2293 } 2294 } 2295 else if(zero_last24_rows_stg1) 2296 { 2297 /* o & stage 1 out */ 2298 { 2299 WORD32 j; 2300 WORD16 *pi2_src_scratch = o_temp_ptr; 2301 WORD16 *pi2_dst_scratch = temp_ptr; 2302 WORD32 out_stride = (trans_size << 1); 2303 WORD32 in_stride = trans_size; 2304 2305 for(j = 0; j < 2; j++) 2306 { 2307 if(j) 2308 { 2309 m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8); 2310 m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8); 2311 m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8); 2312 m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8); 2313 } 2314 2315 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved 2316 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 interleaved 2317 2318 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]); 2319 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]); 2320 2321 /* o0[0-3] */ 2322 { 
2323 2324 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2325 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 2326 2327 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 2328 2329 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 2330 pi2_src_scratch += in_stride; 2331 2332 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2333 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2334 2335 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2336 m_count = _mm_cvtsi32_si128(i4_shift); 2337 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 2338 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 2339 2340 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2341 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2342 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2343 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2344 2345 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2346 2347 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2348 pi2_dst_scratch += out_stride; 2349 2350 } 2351 2352 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]); 2353 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]); 2354 2355 /* o1[0-3] */ 2356 { 2357 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2358 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 2359 2360 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 2361 2362 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 2363 pi2_src_scratch += in_stride; 2364 2365 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2366 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2367 2368 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2369 m_count = _mm_cvtsi32_si128(i4_shift); 2370 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 2371 m_rdng_factor = 
_mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 2372 2373 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2374 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2375 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2376 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2377 2378 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2379 2380 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2381 pi2_dst_scratch += out_stride; 2382 2383 } 2384 2385 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]); 2386 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]); 2387 2388 /* o2[0-3] */ 2389 { 2390 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2391 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 2392 2393 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20); 2394 2395 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 2396 pi2_src_scratch += in_stride; 2397 2398 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2399 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2400 2401 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2402 m_count = _mm_cvtsi32_si128(i4_shift); 2403 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 2404 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 2405 2406 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2407 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2408 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2409 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2410 2411 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2412 2413 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2414 pi2_dst_scratch += out_stride; 2415 2416 } 2417 2418 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]); 2419 m_coeff2 = _mm_load_si128((__m128i 
*)&g_ai2_ihevc_trans_32_intr_odd_packed[25][0]); 2420 2421 /* o3[0-3] */ 2422 { 2423 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2424 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 2425 2426 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20); 2427 2428 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 2429 pi2_src_scratch += in_stride; 2430 2431 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2432 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2433 2434 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2435 m_count = _mm_cvtsi32_si128(i4_shift); 2436 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 2437 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 2438 2439 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2440 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2441 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2442 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2443 2444 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2445 2446 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2447 pi2_dst_scratch += out_stride; 2448 2449 } 2450 2451 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]); 2452 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[33][0]); 2453 2454 /* o4[0-3] */ 2455 { 2456 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2457 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 2458 2459 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 2460 2461 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 2462 pi2_src_scratch += in_stride; 2463 2464 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2465 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2466 2467 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2468 m_count = _mm_cvtsi32_si128(i4_shift); 2469 
m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 2470 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 2471 2472 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2473 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2474 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2475 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2476 2477 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2478 2479 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2480 pi2_dst_scratch += out_stride; 2481 2482 } 2483 2484 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]); 2485 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[41][0]); 2486 2487 /* o5[0-3] */ 2488 { 2489 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2490 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 2491 2492 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 2493 2494 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 2495 pi2_src_scratch += in_stride; 2496 2497 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2498 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2499 2500 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2501 m_count = _mm_cvtsi32_si128(i4_shift); 2502 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 2503 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 2504 2505 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2506 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2507 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2508 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2509 2510 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2511 2512 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2513 pi2_dst_scratch += out_stride; 2514 2515 } 2516 2517 m_coeff1 = _mm_load_si128((__m128i 
*)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]); 2518 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[49][0]); 2519 2520 /* o6[0-3] */ 2521 { 2522 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2523 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 2524 2525 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 2526 2527 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 2528 pi2_src_scratch += in_stride; 2529 2530 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2531 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2532 2533 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2534 m_count = _mm_cvtsi32_si128(i4_shift); 2535 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 2536 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 2537 2538 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2539 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2540 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2541 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2542 2543 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2544 2545 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2546 pi2_dst_scratch += out_stride; 2547 2548 } 2549 2550 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]); 2551 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[57][0]); 2552 2553 /* o7[0-3] */ 2554 { 2555 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2556 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 2557 2558 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 2559 2560 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 2561 pi2_src_scratch += 8; 2562 2563 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2564 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2565 2566 m_rdng_factor = _mm_cvtsi32_si128((1 
<< (i4_shift - 1))); 2567 m_count = _mm_cvtsi32_si128(i4_shift); 2568 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 2569 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 2570 2571 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2572 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2573 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2574 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2575 2576 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2577 2578 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2579 pi2_dst_scratch += 8; 2580 2581 } 2582 2583 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]); 2584 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[65][0]); 2585 2586 /* o8[0-3] */ 2587 { 2588 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2589 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 2590 2591 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 2592 2593 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 2594 pi2_src_scratch -= in_stride; 2595 2596 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2597 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2598 2599 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2600 m_count = _mm_cvtsi32_si128(i4_shift); 2601 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 2602 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 2603 2604 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2605 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2606 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2607 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2608 2609 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2610 2611 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2612 pi2_dst_scratch -= out_stride; 2613 } 2614 
2615 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]); 2616 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[73][0]); 2617 2618 /* o9[0-3] */ 2619 { 2620 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2621 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 2622 2623 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 2624 2625 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 2626 pi2_src_scratch -= in_stride; 2627 2628 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2629 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2630 2631 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2632 m_count = _mm_cvtsi32_si128(i4_shift); 2633 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 2634 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 2635 2636 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2637 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2638 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2639 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2640 2641 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2642 2643 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2644 pi2_dst_scratch -= out_stride; 2645 } 2646 2647 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]); 2648 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[81][0]); 2649 2650 /* o10[0-3] */ 2651 { 2652 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2653 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 2654 2655 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 2656 2657 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 2658 pi2_src_scratch -= in_stride; 2659 2660 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2661 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2662 
2663 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2664 m_count = _mm_cvtsi32_si128(i4_shift); 2665 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 2666 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 2667 2668 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2669 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2670 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2671 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2672 2673 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2674 2675 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2676 pi2_dst_scratch -= out_stride; 2677 } 2678 2679 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]); 2680 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[89][0]); 2681 2682 /* o11[0-3] */ 2683 { 2684 2685 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2686 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 2687 2688 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 2689 2690 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 2691 pi2_src_scratch -= in_stride; 2692 2693 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2694 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2695 2696 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2697 m_count = _mm_cvtsi32_si128(i4_shift); 2698 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 2699 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 2700 2701 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2702 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2703 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2704 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2705 2706 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2707 2708 _mm_store_si128((__m128i *)pi2_dst_scratch, 
m_temp_reg_30); 2709 pi2_dst_scratch -= out_stride; 2710 2711 } 2712 2713 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]); 2714 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[97][0]); 2715 2716 /* o12[0-3] */ 2717 { 2718 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2719 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 2720 2721 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 2722 2723 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 2724 pi2_src_scratch -= in_stride; 2725 2726 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2727 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2728 2729 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2730 m_count = _mm_cvtsi32_si128(i4_shift); 2731 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 2732 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 2733 2734 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2735 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2736 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2737 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2738 2739 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2740 2741 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2742 pi2_dst_scratch -= out_stride; 2743 2744 } 2745 2746 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]); 2747 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[105][0]); 2748 2749 /* o13[0-3] */ 2750 { 2751 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2752 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 2753 2754 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 2755 2756 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 2757 pi2_src_scratch -= in_stride; 2758 2759 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, 
m_temp_reg_20); 2760 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2761 2762 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2763 m_count = _mm_cvtsi32_si128(i4_shift); 2764 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 2765 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 2766 2767 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2768 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2769 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2770 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2771 2772 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2773 2774 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2775 pi2_dst_scratch -= out_stride; 2776 } 2777 2778 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]); 2779 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[113][0]); 2780 2781 /* o14[0-3] */ 2782 { 2783 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2784 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 2785 2786 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 2787 2788 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 2789 pi2_src_scratch -= in_stride; 2790 2791 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2792 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2793 2794 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2795 m_count = _mm_cvtsi32_si128(i4_shift); 2796 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 2797 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 2798 2799 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2800 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2801 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2802 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2803 2804 m_temp_reg_30 = 
_mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2805 2806 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2807 pi2_dst_scratch -= out_stride; 2808 2809 } 2810 2811 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]); 2812 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[121][0]); 2813 2814 /* o15[0-3] */ 2815 { 2816 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2817 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 2818 2819 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 2820 2821 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 2822 pi2_src_scratch += 8; 2823 2824 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2825 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2826 2827 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2828 m_count = _mm_cvtsi32_si128(i4_shift); 2829 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 2830 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 2831 2832 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2833 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2834 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2835 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2836 2837 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2838 2839 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2840 pi2_dst_scratch += 8; 2841 } 2842 2843 } 2844 } 2845 } 2846 else 2847 { 2848 /* o & stage 1 out */ 2849 { 2850 WORD32 j; 2851 WORD16 *pi2_src_scratch = o_temp_ptr; 2852 WORD16 *pi2_dst_scratch = temp_ptr; 2853 WORD32 out_stride = (trans_size << 1); 2854 WORD32 in_stride = trans_size; 2855 2856 2857 for(j = 0; j < 2; j++) 2858 { 2859 if(j) 2860 { 2861 m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8); 2862 m_temp_reg_71 = _mm_srli_si128(m_temp_reg_71, 8); 2863 m_temp_reg_72 = _mm_srli_si128(m_temp_reg_72, 8); 2864 
m_temp_reg_73 = _mm_srli_si128(m_temp_reg_73, 8); 2865 m_temp_reg_74 = _mm_srli_si128(m_temp_reg_74, 8); 2866 m_temp_reg_75 = _mm_srli_si128(m_temp_reg_75, 8); 2867 m_temp_reg_76 = _mm_srli_si128(m_temp_reg_76, 8); 2868 m_temp_reg_77 = _mm_srli_si128(m_temp_reg_77, 8); 2869 2870 m_temp_reg_80 = _mm_srli_si128(m_temp_reg_80, 8); 2871 m_temp_reg_81 = _mm_srli_si128(m_temp_reg_81, 8); 2872 m_temp_reg_82 = _mm_srli_si128(m_temp_reg_82, 8); 2873 m_temp_reg_83 = _mm_srli_si128(m_temp_reg_83, 8); 2874 m_temp_reg_84 = _mm_srli_si128(m_temp_reg_84, 8); 2875 m_temp_reg_85 = _mm_srli_si128(m_temp_reg_85, 8); 2876 m_temp_reg_86 = _mm_srli_si128(m_temp_reg_86, 8); 2877 m_temp_reg_87 = _mm_srli_si128(m_temp_reg_87, 8); 2878 } 2879 2880 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]); 2881 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]); 2882 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[2][0]); 2883 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[3][0]); 2884 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[4][0]); 2885 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[5][0]); 2886 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[6][0]); 2887 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[7][0]); 2888 2889 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved 2890 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 interleaved 2891 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 interleaved 2892 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 interleaved 2893 temp1 = _mm_unpacklo_epi16(m_temp_reg_80, m_temp_reg_81); //row 17 and row 19 interleaved 2894 temp2 = _mm_unpacklo_epi16(m_temp_reg_82, 
m_temp_reg_83); //row 21 and row 23 interleaved 2895 temp3 = _mm_unpacklo_epi16(m_temp_reg_84, m_temp_reg_85); //row 25 and row 27 interleaved 2896 temp4 = _mm_unpacklo_epi16(m_temp_reg_86, m_temp_reg_87); //row 29 and row 31 interleaved 2897 2898 2899 /* o0[0-3] */ 2900 { 2901 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2902 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 2903 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 2904 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 2905 2906 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 2907 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 2908 2909 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 2910 2911 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5); 2912 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6); 2913 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7); 2914 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8); 2915 2916 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 2917 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 2918 2919 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 2920 2921 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 2922 2923 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 2924 pi2_src_scratch += in_stride; 2925 2926 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2927 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2928 2929 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2930 m_count = _mm_cvtsi32_si128(i4_shift); 2931 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 2932 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 2933 2934 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2935 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2936 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2937 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2938 2939 
m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2940 2941 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2942 pi2_dst_scratch += out_stride; 2943 2944 } 2945 2946 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]); 2947 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]); 2948 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[10][0]); 2949 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[11][0]); 2950 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[12][0]); 2951 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[13][0]); 2952 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[14][0]); 2953 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[15][0]); 2954 2955 2956 /* o1[0-3] */ 2957 { 2958 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2959 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 2960 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 2961 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 2962 2963 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 2964 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 2965 2966 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_20); 2967 2968 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5); 2969 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6); 2970 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7); 2971 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8); 2972 2973 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 2974 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 2975 2976 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 2977 2978 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 2979 2980 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 2981 pi2_src_scratch += in_stride; 2982 
2983 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2984 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2985 2986 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2987 m_count = _mm_cvtsi32_si128(i4_shift); 2988 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 2989 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 2990 2991 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2992 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2993 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2994 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2995 2996 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2997 2998 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2999 pi2_dst_scratch += out_stride; 3000 3001 } 3002 3003 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]); 3004 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]); 3005 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[18][0]); 3006 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[19][0]); 3007 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[20][0]); 3008 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[21][0]); 3009 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[22][0]); 3010 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[23][0]); 3011 3012 /* o2[0-3] */ 3013 { 3014 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 3015 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 3016 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 3017 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 3018 3019 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20); 3020 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 3021 3022 m_temp_reg_20 = 
_mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 3023 3024 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5); 3025 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6); 3026 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7); 3027 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8); 3028 3029 m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_41); 3030 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 3031 3032 m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_42); 3033 3034 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 3035 3036 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 3037 pi2_src_scratch += in_stride; 3038 3039 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 3040 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 3041 3042 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 3043 m_count = _mm_cvtsi32_si128(i4_shift); 3044 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 3045 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 3046 3047 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 3048 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 3049 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 3050 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 3051 3052 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 3053 3054 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 3055 pi2_dst_scratch += out_stride; 3056 3057 } 3058 3059 3060 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]); 3061 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[25][0]); 3062 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[26][0]); 3063 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[27][0]); 3064 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[28][0]); 3065 m_coeff6 = _mm_load_si128((__m128i 
*)&g_ai2_ihevc_trans_32_intr_odd_packed[29][0]); 3066 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[30][0]); 3067 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[31][0]); 3068 3069 /* o3[0-3] */ 3070 { 3071 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 3072 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 3073 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 3074 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 3075 3076 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20); 3077 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 3078 3079 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 3080 3081 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5); 3082 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6); 3083 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7); 3084 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8); 3085 3086 m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_40); 3087 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 3088 3089 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 3090 3091 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 3092 3093 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 3094 pi2_src_scratch += in_stride; 3095 3096 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 3097 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 3098 3099 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 3100 m_count = _mm_cvtsi32_si128(i4_shift); 3101 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 3102 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 3103 3104 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 3105 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 3106 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 3107 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 3108 3109 m_temp_reg_30 = 
_mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 3110 3111 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 3112 pi2_dst_scratch += out_stride; 3113 3114 } 3115 3116 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]); 3117 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[33][0]); 3118 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[34][0]); 3119 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[35][0]); 3120 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[36][0]); 3121 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[37][0]); 3122 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[38][0]); 3123 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[39][0]); 3124 3125 /* o4[0-3] */ 3126 { 3127 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 3128 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 3129 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 3130 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 3131 3132 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 3133 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 3134 3135 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 3136 3137 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5); 3138 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6); 3139 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7); 3140 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8); 3141 3142 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 3143 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 3144 3145 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 3146 3147 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 3148 3149 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 3150 pi2_src_scratch += in_stride; 3151 3152 m_temp_reg_31 = 
_mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 3153 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 3154 3155 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 3156 m_count = _mm_cvtsi32_si128(i4_shift); 3157 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 3158 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 3159 3160 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 3161 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 3162 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 3163 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 3164 3165 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 3166 3167 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 3168 pi2_dst_scratch += out_stride; 3169 3170 } 3171 3172 3173 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]); 3174 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[41][0]); 3175 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[42][0]); 3176 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[43][0]); 3177 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[44][0]); 3178 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[45][0]); 3179 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[46][0]); 3180 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[47][0]); 3181 3182 /* o5[0-3] */ 3183 { 3184 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 3185 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 3186 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 3187 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 3188 3189 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 3190 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 3191 3192 m_temp_reg_20 = 
_mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 3193 3194 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5); 3195 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6); 3196 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7); 3197 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8); 3198 3199 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 3200 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 3201 3202 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 3203 3204 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 3205 3206 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 3207 pi2_src_scratch += in_stride; 3208 3209 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 3210 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 3211 3212 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 3213 m_count = _mm_cvtsi32_si128(i4_shift); 3214 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 3215 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 3216 3217 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 3218 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 3219 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 3220 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 3221 3222 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 3223 3224 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 3225 pi2_dst_scratch += out_stride; 3226 3227 } 3228 3229 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]); 3230 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[49][0]); 3231 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[50][0]); 3232 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[51][0]); 3233 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[52][0]); 3234 m_coeff6 = _mm_load_si128((__m128i 
*)&g_ai2_ihevc_trans_32_intr_odd_packed[53][0]); 3235 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[54][0]); 3236 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[55][0]); 3237 3238 3239 /* o6[0-3] */ 3240 { 3241 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 3242 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 3243 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 3244 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 3245 3246 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 3247 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 3248 3249 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 3250 3251 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5); 3252 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6); 3253 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7); 3254 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8); 3255 3256 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 3257 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 3258 3259 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 3260 3261 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 3262 3263 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 3264 pi2_src_scratch += in_stride; 3265 3266 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 3267 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 3268 3269 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 3270 m_count = _mm_cvtsi32_si128(i4_shift); 3271 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 3272 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 3273 3274 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 3275 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 3276 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 3277 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 3278 3279 m_temp_reg_30 = 
_mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 3280 3281 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 3282 pi2_dst_scratch += out_stride; 3283 3284 } 3285 3286 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]); 3287 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[57][0]); 3288 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[58][0]); 3289 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[59][0]); 3290 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[60][0]); 3291 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[61][0]); 3292 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[62][0]); 3293 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[63][0]); 3294 3295 /* o7[0-3] */ 3296 { 3297 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 3298 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 3299 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 3300 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 3301 3302 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 3303 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 3304 3305 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 3306 3307 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5); 3308 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6); 3309 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7); 3310 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8); 3311 3312 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 3313 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 3314 3315 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 3316 3317 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 3318 3319 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 3320 pi2_src_scratch += 8; 3321 3322 m_temp_reg_31 = 
_mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 3323 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 3324 3325 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 3326 m_count = _mm_cvtsi32_si128(i4_shift); 3327 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 3328 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 3329 3330 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 3331 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 3332 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 3333 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 3334 3335 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 3336 3337 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 3338 pi2_dst_scratch += 8; 3339 3340 } 3341 3342 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]); 3343 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[65][0]); 3344 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[66][0]); 3345 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[67][0]); 3346 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[68][0]); 3347 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[69][0]); 3348 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[70][0]); 3349 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[71][0]); 3350 3351 3352 /* o8[0-3] */ 3353 { 3354 3355 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 3356 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 3357 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 3358 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 3359 3360 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 3361 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 3362 3363 m_temp_reg_20 = 
_mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 3364 3365 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5); 3366 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6); 3367 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7); 3368 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8); 3369 3370 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 3371 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 3372 3373 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 3374 3375 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 3376 3377 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 3378 pi2_src_scratch -= in_stride; 3379 3380 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 3381 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 3382 3383 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 3384 m_count = _mm_cvtsi32_si128(i4_shift); 3385 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 3386 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 3387 3388 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 3389 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 3390 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 3391 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 3392 3393 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 3394 3395 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 3396 pi2_dst_scratch -= out_stride; 3397 } 3398 3399 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]); 3400 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[73][0]); 3401 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[74][0]); 3402 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[75][0]); 3403 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[76][0]); 3404 m_coeff6 = _mm_load_si128((__m128i 
*)&g_ai2_ihevc_trans_32_intr_odd_packed[77][0]); 3405 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[78][0]); 3406 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[79][0]); 3407 3408 3409 /* o9[0-3] */ 3410 { 3411 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 3412 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 3413 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 3414 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 3415 3416 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 3417 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 3418 3419 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 3420 3421 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5); 3422 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6); 3423 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7); 3424 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8); 3425 3426 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 3427 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 3428 3429 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 3430 3431 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 3432 3433 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 3434 pi2_src_scratch -= in_stride; 3435 3436 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 3437 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 3438 3439 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 3440 m_count = _mm_cvtsi32_si128(i4_shift); 3441 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 3442 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 3443 3444 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 3445 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 3446 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 3447 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 3448 3449 m_temp_reg_30 = 
_mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 3450 3451 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 3452 pi2_dst_scratch -= out_stride; 3453 } 3454 3455 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]); 3456 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[81][0]); 3457 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[82][0]); 3458 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[83][0]); 3459 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[84][0]); 3460 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[85][0]); 3461 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[86][0]); 3462 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[87][0]); 3463 3464 /* o10[0-3] */ 3465 { 3466 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 3467 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 3468 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 3469 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 3470 3471 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 3472 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 3473 3474 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 3475 3476 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5); 3477 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6); 3478 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7); 3479 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8); 3480 3481 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 3482 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 3483 3484 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 3485 3486 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 3487 3488 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 3489 pi2_src_scratch -= in_stride; 3490 3491 m_temp_reg_31 = 
_mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 3492 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 3493 3494 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 3495 m_count = _mm_cvtsi32_si128(i4_shift); 3496 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 3497 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 3498 3499 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 3500 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 3501 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 3502 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 3503 3504 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 3505 3506 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 3507 pi2_dst_scratch -= out_stride; 3508 } 3509 3510 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]); 3511 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[89][0]); 3512 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[90][0]); 3513 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[91][0]); 3514 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[92][0]); 3515 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[93][0]); 3516 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[94][0]); 3517 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[95][0]); 3518 3519 /* o11[0-3] */ 3520 { 3521 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 3522 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 3523 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 3524 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 3525 3526 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 3527 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 3528 3529 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, 
m_temp_reg_22); 3530 3531 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5); 3532 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6); 3533 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7); 3534 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8); 3535 3536 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 3537 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 3538 3539 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 3540 3541 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 3542 3543 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 3544 pi2_src_scratch -= in_stride; 3545 3546 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 3547 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 3548 3549 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 3550 m_count = _mm_cvtsi32_si128(i4_shift); 3551 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 3552 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 3553 3554 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 3555 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 3556 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 3557 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 3558 3559 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 3560 3561 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 3562 pi2_dst_scratch -= out_stride; 3563 3564 } 3565 3566 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]); 3567 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[97][0]); 3568 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[98][0]); 3569 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[99][0]); 3570 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[100][0]); 3571 m_coeff6 = _mm_load_si128((__m128i 
*)&g_ai2_ihevc_trans_32_intr_odd_packed[101][0]); 3572 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[102][0]); 3573 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[103][0]); 3574 3575 3576 /* o12[0-3] */ 3577 { 3578 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 3579 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 3580 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 3581 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 3582 3583 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 3584 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 3585 3586 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 3587 3588 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5); 3589 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6); 3590 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7); 3591 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8); 3592 3593 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 3594 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 3595 3596 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 3597 3598 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 3599 3600 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 3601 pi2_src_scratch -= in_stride; 3602 3603 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 3604 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 3605 3606 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 3607 m_count = _mm_cvtsi32_si128(i4_shift); 3608 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 3609 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 3610 3611 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 3612 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 3613 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 3614 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 3615 3616 m_temp_reg_30 
= _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 3617 3618 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 3619 pi2_dst_scratch -= out_stride; 3620 3621 } 3622 3623 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]); 3624 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[105][0]); 3625 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[106][0]); 3626 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[107][0]); 3627 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[108][0]); 3628 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[109][0]); 3629 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[110][0]); 3630 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[111][0]); 3631 3632 3633 /* o13[0-3] */ 3634 { 3635 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 3636 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 3637 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 3638 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 3639 3640 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 3641 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 3642 3643 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 3644 3645 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5); 3646 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6); 3647 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7); 3648 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8); 3649 3650 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 3651 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 3652 3653 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 3654 3655 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 3656 3657 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 3658 pi2_src_scratch -= in_stride; 3659 3660 
m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 3661 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 3662 3663 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 3664 m_count = _mm_cvtsi32_si128(i4_shift); 3665 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 3666 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 3667 3668 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 3669 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 3670 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 3671 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 3672 3673 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 3674 3675 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 3676 pi2_dst_scratch -= out_stride; 3677 } 3678 3679 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]); 3680 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[113][0]); 3681 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[114][0]); 3682 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[115][0]); 3683 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[116][0]); 3684 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[117][0]); 3685 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[118][0]); 3686 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[119][0]); 3687 3688 3689 /* o14[0-3] */ 3690 { 3691 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 3692 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 3693 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 3694 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 3695 3696 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 3697 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 3698 3699 m_temp_reg_20 = 
_mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 3700 3701 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5); 3702 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6); 3703 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7); 3704 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8); 3705 3706 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 3707 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 3708 3709 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 3710 3711 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 3712 3713 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 3714 pi2_src_scratch -= in_stride; 3715 3716 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 3717 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 3718 3719 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 3720 m_count = _mm_cvtsi32_si128(i4_shift); 3721 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 3722 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 3723 3724 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 3725 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 3726 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 3727 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 3728 3729 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 3730 3731 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 3732 pi2_dst_scratch -= out_stride; 3733 3734 } 3735 3736 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]); 3737 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[121][0]); 3738 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[122][0]); 3739 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[123][0]); 3740 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[124][0]); 3741 m_coeff6 = _mm_load_si128((__m128i 
*)&g_ai2_ihevc_trans_32_intr_odd_packed[125][0]); 3742 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[126][0]); 3743 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[127][0]); 3744 3745 /* o15[0-3] */ 3746 { 3747 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 3748 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 3749 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 3750 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 3751 3752 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 3753 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 3754 3755 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 3756 3757 m_temp_reg_40 = _mm_madd_epi16(temp1, m_coeff5); 3758 m_temp_reg_41 = _mm_madd_epi16(temp2, m_coeff6); 3759 m_temp_reg_42 = _mm_madd_epi16(temp3, m_coeff7); 3760 m_temp_reg_43 = _mm_madd_epi16(temp4, m_coeff8); 3761 3762 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 3763 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 3764 3765 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 3766 3767 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 3768 3769 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 3770 pi2_src_scratch += 8; 3771 3772 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 3773 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 3774 3775 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 3776 m_count = _mm_cvtsi32_si128(i4_shift); 3777 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 3778 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 3779 3780 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 3781 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 3782 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 3783 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 3784 3785 m_temp_reg_30 = 
_mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 3786 3787 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 3788 pi2_dst_scratch += 8; 3789 } 3790 3791 } 3792 } 3793 } 3794 /* Transpose */ 3795 { 3796 WORD16 *pi2_src_scratch = temp_ptr; 3797 WORD16 *pi2_dst_scratch = pi2_tmp; 3798 WORD32 in_stride = (trans_size << 1); 3799 3800 for(j = 0; j < 2; j++) 3801 { 3802 m_temp_reg_30 = _mm_load_si128((__m128i *)pi2_src_scratch); 3803 pi2_src_scratch += in_stride; 3804 m_temp_reg_31 = _mm_load_si128((__m128i *)pi2_src_scratch); 3805 pi2_src_scratch += in_stride; 3806 m_temp_reg_32 = _mm_load_si128((__m128i *)pi2_src_scratch); 3807 pi2_src_scratch += in_stride; 3808 m_temp_reg_33 = _mm_load_si128((__m128i *)pi2_src_scratch); 3809 pi2_src_scratch += in_stride; 3810 m_temp_reg_34 = _mm_load_si128((__m128i *)pi2_src_scratch); 3811 pi2_src_scratch += in_stride; 3812 m_temp_reg_35 = _mm_load_si128((__m128i *)pi2_src_scratch); 3813 pi2_src_scratch += in_stride; 3814 m_temp_reg_36 = _mm_load_si128((__m128i *)pi2_src_scratch); 3815 pi2_src_scratch += in_stride; 3816 m_temp_reg_37 = _mm_load_si128((__m128i *)pi2_src_scratch); 3817 pi2_src_scratch += 8; 3818 3819 m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_src_scratch); 3820 pi2_src_scratch -= in_stride; 3821 m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_src_scratch); 3822 pi2_src_scratch -= in_stride; 3823 m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_src_scratch); 3824 pi2_src_scratch -= in_stride; 3825 m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_src_scratch); 3826 pi2_src_scratch -= in_stride; 3827 m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_src_scratch); 3828 pi2_src_scratch -= in_stride; 3829 m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_src_scratch); 3830 pi2_src_scratch -= in_stride; 3831 m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_src_scratch); 3832 pi2_src_scratch -= in_stride; 3833 m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_src_scratch); 3834 pi2_src_scratch += 8; 3835 3836 3837 m_temp_reg_40 = 
_mm_unpacklo_epi16(m_temp_reg_30, m_temp_reg_31); 3838 m_temp_reg_41 = _mm_unpackhi_epi16(m_temp_reg_31, m_temp_reg_30); 3839 3840 m_temp_reg_42 = _mm_unpacklo_epi16(m_temp_reg_32, m_temp_reg_33); 3841 m_temp_reg_43 = _mm_unpackhi_epi16(m_temp_reg_33, m_temp_reg_32); 3842 3843 m_temp_reg_44 = _mm_unpacklo_epi16(m_temp_reg_34, m_temp_reg_35); 3844 m_temp_reg_45 = _mm_unpackhi_epi16(m_temp_reg_35, m_temp_reg_34); 3845 3846 m_temp_reg_46 = _mm_unpacklo_epi16(m_temp_reg_36, m_temp_reg_37); 3847 m_temp_reg_47 = _mm_unpackhi_epi16(m_temp_reg_37, m_temp_reg_36); 3848 3849 m_temp_reg_80 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); 3850 m_temp_reg_81 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_70); 3851 3852 m_temp_reg_82 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); 3853 m_temp_reg_83 = _mm_unpackhi_epi16(m_temp_reg_73, m_temp_reg_72); 3854 3855 m_temp_reg_84 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); 3856 m_temp_reg_85 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_74); 3857 3858 m_temp_reg_86 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); 3859 m_temp_reg_87 = _mm_unpackhi_epi16(m_temp_reg_77, m_temp_reg_76); 3860 3861 /****************/ 3862 3863 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_40, m_temp_reg_42); 3864 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_40, m_temp_reg_42); 3865 3866 m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_44, m_temp_reg_46); 3867 m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_44, m_temp_reg_46); 3868 3869 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_80, m_temp_reg_82); 3870 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_80, m_temp_reg_82); 3871 3872 m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_84, m_temp_reg_86); 3873 m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_84, m_temp_reg_86); 3874 3875 m_temp_reg_90 = _mm_unpacklo_epi32(m_temp_reg_43, m_temp_reg_41); 3876 m_temp_reg_91 = _mm_unpackhi_epi32(m_temp_reg_43, m_temp_reg_41); 3877 3878 m_temp_reg_92 = _mm_unpacklo_epi32(m_temp_reg_47, m_temp_reg_45); 3879 
m_temp_reg_93 = _mm_unpackhi_epi32(m_temp_reg_47, m_temp_reg_45); 3880 3881 m_temp_reg_94 = _mm_unpacklo_epi32(m_temp_reg_83, m_temp_reg_81); 3882 m_temp_reg_95 = _mm_unpackhi_epi32(m_temp_reg_83, m_temp_reg_81); 3883 3884 m_temp_reg_96 = _mm_unpacklo_epi32(m_temp_reg_87, m_temp_reg_85); 3885 m_temp_reg_97 = _mm_unpackhi_epi32(m_temp_reg_87, m_temp_reg_85); 3886 3887 /******************/ 3888 3889 m_temp_reg_30 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_2); 3890 m_temp_reg_31 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_2); 3891 3892 m_temp_reg_32 = _mm_unpacklo_epi64(m_temp_reg_92, m_temp_reg_90); 3893 m_temp_reg_33 = _mm_unpackhi_epi64(m_temp_reg_92, m_temp_reg_90); 3894 3895 m_temp_reg_34 = _mm_unpacklo_epi64(m_temp_reg_4, m_temp_reg_6); 3896 m_temp_reg_35 = _mm_unpackhi_epi64(m_temp_reg_4, m_temp_reg_6); 3897 3898 m_temp_reg_36 = _mm_unpacklo_epi64(m_temp_reg_96, m_temp_reg_94); 3899 m_temp_reg_37 = _mm_unpackhi_epi64(m_temp_reg_96, m_temp_reg_94); 3900 3901 m_temp_reg_80 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_3); 3902 m_temp_reg_81 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_3); 3903 3904 m_temp_reg_82 = _mm_unpacklo_epi64(m_temp_reg_93, m_temp_reg_91); 3905 m_temp_reg_83 = _mm_unpackhi_epi64(m_temp_reg_93, m_temp_reg_91); 3906 3907 m_temp_reg_84 = _mm_unpacklo_epi64(m_temp_reg_5, m_temp_reg_7); 3908 m_temp_reg_85 = _mm_unpackhi_epi64(m_temp_reg_5, m_temp_reg_7); 3909 3910 m_temp_reg_86 = _mm_unpacklo_epi64(m_temp_reg_97, m_temp_reg_95); 3911 m_temp_reg_87 = _mm_unpackhi_epi64(m_temp_reg_97, m_temp_reg_95); 3912 3913 _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size), m_temp_reg_30); 3914 _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 8), m_temp_reg_34); 3915 _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 16), m_temp_reg_36); 3916 _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 24), m_temp_reg_32); 3917 3918 _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size), m_temp_reg_31); 3919 
_mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 8), m_temp_reg_35); 3920 _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 16), m_temp_reg_37); 3921 _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 24), m_temp_reg_33); 3922 3923 _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size), m_temp_reg_80); 3924 _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 8), m_temp_reg_84); 3925 _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 16), m_temp_reg_86); 3926 _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 24), m_temp_reg_82); 3927 3928 _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size), m_temp_reg_81); 3929 _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 8), m_temp_reg_85); 3930 _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 16), m_temp_reg_87); 3931 _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 24), m_temp_reg_83); 3932 3933 pi2_dst_scratch += 4 * trans_size; 3934 } 3935 } 3936 pi2_src += 8; 3937 // pi2_dequant_coeff +=8; 3938 pi2_tmp += 8 * trans_size; 3939 zero_cols = zero_cols >> 1; 3940 } 3941 3942 if(trans_size_stg1 != TRANS_SIZE_32) 3943 { 3944 m_temp_reg_10 = _mm_setzero_si128(); 3945 3946 for(i = trans_size_stg1; i < 32; i += 8) 3947 { 3948 WORD16 *pi2_dst_scratch = pi2_tmp; 3949 3950 _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size), m_temp_reg_10); 3951 _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 8), m_temp_reg_10); 3952 _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 16), m_temp_reg_10); 3953 _mm_store_si128((__m128i *)(pi2_dst_scratch + 0 * trans_size + 24), m_temp_reg_10); 3954 3955 _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size), m_temp_reg_10); 3956 _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 8), m_temp_reg_10); 3957 _mm_store_si128((__m128i *)(pi2_dst_scratch + 1 * trans_size + 16), m_temp_reg_10); 3958 _mm_store_si128((__m128i 
*)(pi2_dst_scratch + 1 * trans_size + 24), m_temp_reg_10); 3959 3960 _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size), m_temp_reg_10); 3961 _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 8), m_temp_reg_10); 3962 _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 16), m_temp_reg_10); 3963 _mm_store_si128((__m128i *)(pi2_dst_scratch + 2 * trans_size + 24), m_temp_reg_10); 3964 3965 _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size), m_temp_reg_10); 3966 _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 8), m_temp_reg_10); 3967 _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 16), m_temp_reg_10); 3968 _mm_store_si128((__m128i *)(pi2_dst_scratch + 3 * trans_size + 24), m_temp_reg_10); 3969 3970 _mm_store_si128((__m128i *)(pi2_dst_scratch + 4 * trans_size), m_temp_reg_10); 3971 _mm_store_si128((__m128i *)(pi2_dst_scratch + 4 * trans_size + 8), m_temp_reg_10); 3972 _mm_store_si128((__m128i *)(pi2_dst_scratch + 4 * trans_size + 16), m_temp_reg_10); 3973 _mm_store_si128((__m128i *)(pi2_dst_scratch + 4 * trans_size + 24), m_temp_reg_10); 3974 3975 _mm_store_si128((__m128i *)(pi2_dst_scratch + 5 * trans_size), m_temp_reg_10); 3976 _mm_store_si128((__m128i *)(pi2_dst_scratch + 5 * trans_size + 8), m_temp_reg_10); 3977 _mm_store_si128((__m128i *)(pi2_dst_scratch + 5 * trans_size + 16), m_temp_reg_10); 3978 _mm_store_si128((__m128i *)(pi2_dst_scratch + 5 * trans_size + 24), m_temp_reg_10); 3979 3980 _mm_store_si128((__m128i *)(pi2_dst_scratch + 6 * trans_size), m_temp_reg_10); 3981 _mm_store_si128((__m128i *)(pi2_dst_scratch + 6 * trans_size + 8), m_temp_reg_10); 3982 _mm_store_si128((__m128i *)(pi2_dst_scratch + 6 * trans_size + 16), m_temp_reg_10); 3983 _mm_store_si128((__m128i *)(pi2_dst_scratch + 6 * trans_size + 24), m_temp_reg_10); 3984 3985 _mm_store_si128((__m128i *)(pi2_dst_scratch + 7 * trans_size), m_temp_reg_10); 3986 _mm_store_si128((__m128i *)(pi2_dst_scratch + 7 * trans_size + 8), 
m_temp_reg_10); 3987 _mm_store_si128((__m128i *)(pi2_dst_scratch + 7 * trans_size + 16), m_temp_reg_10); 3988 _mm_store_si128((__m128i *)(pi2_dst_scratch + 7 * trans_size + 24), m_temp_reg_10); 3989 3990 pi2_tmp += 8 * trans_size; 3991 } 3992 } 3993 3994 pi2_tmp = pi2_tmp_orig; 3995 3996 /* Inverse Transform 2nd stage */ 3997 3998 for(j = 0; j < trans_size; j += 4) 3999 { 4000 i4_shift = IT_SHIFT_STAGE_2; 4001 4002 /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */ 4003 if(zero_last28_rows_stg2) 4004 { 4005 { 4006 4007 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87 4008 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87 4009 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80 4010 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70 4011 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57 4012 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43 4013 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25 4014 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //9 4015 4016 m_temp_reg_10 = _mm_loadu_si128((__m128i *)&pi2_tmp[2 * trans_size]); 4017 4018 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_10, all_zero_reg); 4019 4020 /* eo0[0-3] */ 4021 { 4022 m_temp_reg_90 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4023 4024 } 4025 /* eo1[0-3] */ 4026 { 4027 m_temp_reg_91 = _mm_madd_epi16(m_temp_reg_10, m_coeff2); 4028 4029 } 4030 /* eo2[0-3] */ 4031 { 4032 m_temp_reg_92 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 4033 } 4034 4035 /* eo3[0-3] */ 4036 { 4037 m_temp_reg_93 = _mm_madd_epi16(m_temp_reg_10, m_coeff4); 4038 } 4039 /* eo4[0-3] */ 4040 { 4041 m_temp_reg_94 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 4042 } 4043 4044 /* eo5[0-3] */ 4045 { 4046 m_temp_reg_95 = 
_mm_madd_epi16(m_temp_reg_10, m_coeff6); 4047 } 4048 4049 /* eo6[0-3] */ 4050 { 4051 m_temp_reg_96 = _mm_madd_epi16(m_temp_reg_10, m_coeff7); 4052 } 4053 /* eo7[0-3] */ 4054 { 4055 m_temp_reg_97 = _mm_madd_epi16(m_temp_reg_10, m_coeff8); 4056 } 4057 } 4058 4059 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64 4060 4061 m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[0 * trans_size]); 4062 4063 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, all_zero_reg); 4064 4065 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); 4066 4067 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); 4068 4069 /* e[]*/ 4070 4071 temp1 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_90); /* ee[0] */ 4072 temp2 = _mm_sub_epi32(m_temp_reg_14, m_temp_reg_90); /* ee[15] */ 4073 4074 temp3 = _mm_add_epi32(m_temp_reg_16, m_temp_reg_91); /* ee[1] */ 4075 temp4 = _mm_sub_epi32(m_temp_reg_16, m_temp_reg_91); /* ee[14] */ 4076 4077 temp5 = _mm_add_epi32(m_temp_reg_16, m_temp_reg_92); /* ee[2] */ 4078 temp6 = _mm_sub_epi32(m_temp_reg_16, m_temp_reg_92); /* ee[13] */ 4079 4080 temp7 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_93); /* ee[3] */ 4081 temp8 = _mm_sub_epi32(m_temp_reg_14, m_temp_reg_93); /* ee[12] */ 4082 4083 m_temp_reg_90 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_94); /* ee[4] */ 4084 m_temp_reg_91 = _mm_sub_epi32(m_temp_reg_14, m_temp_reg_94); /* ee[11] */ 4085 4086 m_temp_reg_92 = _mm_add_epi32(m_temp_reg_16, m_temp_reg_95); /* ee[5] */ 4087 m_temp_reg_93 = _mm_sub_epi32(m_temp_reg_16, m_temp_reg_95); /* ee[10] */ 4088 4089 m_temp_reg_94 = _mm_add_epi32(m_temp_reg_16, m_temp_reg_96); /* ee[6] */ 4090 m_temp_reg_95 = _mm_sub_epi32(m_temp_reg_16, m_temp_reg_96); /* ee[9] */ 4091 4092 m_temp_reg_96 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_97); /* ee[7] */ 4093 m_temp_reg_97 = _mm_sub_epi32(m_temp_reg_14, m_temp_reg_97); /* ee[8] */ 4094 4095 /*o[k]*/ 4096 { 4097 4098 WORD16 *pi2_dst_scratch = temp_ptr; 4099 WORD32 out_stride = 8; 4100 4101 m_coeff1 
= _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]); 4102 4103 m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[trans_size]); 4104 m_temp_reg_71 = _mm_loadu_si128((__m128i *)&pi2_tmp[3 * trans_size]); 4105 4106 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved 4107 4108 4109 /* o0[0-3] */ 4110 { 4111 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4112 4113 m_temp_reg_31 = _mm_sub_epi32(temp1, m_temp_reg_20); 4114 m_temp_reg_30 = _mm_add_epi32(temp1, m_temp_reg_20); 4115 4116 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 4117 m_count = _mm_cvtsi32_si128(i4_shift); 4118 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 4119 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 4120 4121 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 4122 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 4123 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 4124 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 4125 4126 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 4127 4128 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 4129 pi2_dst_scratch += out_stride; 4130 4131 } 4132 4133 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]); 4134 4135 /* o1[0-3] */ 4136 { 4137 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4138 4139 m_temp_reg_31 = _mm_sub_epi32(temp3, m_temp_reg_20); 4140 m_temp_reg_30 = _mm_add_epi32(temp3, m_temp_reg_20); 4141 4142 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 4143 m_count = _mm_cvtsi32_si128(i4_shift); 4144 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 4145 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 4146 4147 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 4148 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 4149 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, 
m_count); 4150 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 4151 4152 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 4153 4154 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 4155 pi2_dst_scratch += out_stride; 4156 4157 } 4158 4159 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]); 4160 4161 /* o2[0-3] */ 4162 { 4163 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4164 4165 m_temp_reg_31 = _mm_sub_epi32(temp5, m_temp_reg_20); 4166 m_temp_reg_30 = _mm_add_epi32(temp5, m_temp_reg_20); 4167 4168 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 4169 m_count = _mm_cvtsi32_si128(i4_shift); 4170 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 4171 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 4172 4173 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 4174 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 4175 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 4176 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 4177 4178 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 4179 4180 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 4181 pi2_dst_scratch += out_stride; 4182 4183 } 4184 4185 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]); 4186 4187 /* o3[0-3] */ 4188 { 4189 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4190 4191 m_temp_reg_31 = _mm_sub_epi32(temp7, m_temp_reg_20); 4192 m_temp_reg_30 = _mm_add_epi32(temp7, m_temp_reg_20); 4193 4194 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 4195 m_count = _mm_cvtsi32_si128(i4_shift); 4196 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 4197 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 4198 4199 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 4200 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 4201 m_temp_reg_31 = 
_mm_sra_epi32(m_temp_reg_31, m_count); 4202 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 4203 4204 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 4205 4206 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 4207 pi2_dst_scratch += out_stride; 4208 4209 } 4210 4211 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]); 4212 4213 /* o4[0-3] */ 4214 { 4215 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4216 4217 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_20); 4218 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_20); 4219 4220 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 4221 m_count = _mm_cvtsi32_si128(i4_shift); 4222 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 4223 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 4224 4225 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 4226 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 4227 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 4228 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 4229 4230 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 4231 4232 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 4233 pi2_dst_scratch += out_stride; 4234 4235 } 4236 4237 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]); 4238 4239 /* o5[0-3] */ 4240 { 4241 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4242 4243 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_20); 4244 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_20); 4245 4246 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 4247 m_count = _mm_cvtsi32_si128(i4_shift); 4248 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 4249 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 4250 4251 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 4252 m_temp_reg_30 = 
_mm_add_epi32(m_temp_reg_30, m_rdng_factor); 4253 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 4254 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 4255 4256 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 4257 4258 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 4259 pi2_dst_scratch += out_stride; 4260 4261 } 4262 4263 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]); 4264 4265 /* o6[0-3] */ 4266 { 4267 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4268 4269 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_20); 4270 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_20); 4271 4272 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 4273 m_count = _mm_cvtsi32_si128(i4_shift); 4274 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 4275 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 4276 4277 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 4278 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 4279 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 4280 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 4281 4282 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 4283 4284 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 4285 pi2_dst_scratch += out_stride; 4286 4287 } 4288 4289 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]); 4290 4291 /* o7[0-3] */ 4292 { 4293 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4294 4295 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_20); 4296 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_20); 4297 4298 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 4299 m_count = _mm_cvtsi32_si128(i4_shift); 4300 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 4301 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 4302 4303 m_temp_reg_31 = 
_mm_add_epi32(m_temp_reg_31, m_rdng_factor); 4304 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 4305 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 4306 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 4307 4308 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 4309 4310 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 4311 pi2_dst_scratch += 8; 4312 4313 } 4314 4315 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]); 4316 4317 /* o8[0-3] */ 4318 { 4319 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4320 4321 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_20); 4322 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_20); 4323 4324 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 4325 m_count = _mm_cvtsi32_si128(i4_shift); 4326 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 4327 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 4328 4329 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 4330 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 4331 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 4332 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 4333 4334 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 4335 4336 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 4337 pi2_dst_scratch += out_stride; 4338 } 4339 4340 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]); 4341 4342 /* o9[0-3] */ 4343 { 4344 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4345 4346 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_20); 4347 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_20); 4348 4349 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 4350 m_count = _mm_cvtsi32_si128(i4_shift); 4351 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 4352 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, 
m_rdng_factor); 4353 4354 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 4355 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 4356 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 4357 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 4358 4359 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 4360 4361 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 4362 pi2_dst_scratch += out_stride; 4363 4364 } 4365 4366 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]); 4367 4368 /* o10[0-3] */ 4369 { 4370 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4371 4372 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_20); 4373 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_20); 4374 4375 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 4376 m_count = _mm_cvtsi32_si128(i4_shift); 4377 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 4378 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 4379 4380 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 4381 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 4382 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 4383 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 4384 4385 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 4386 4387 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 4388 pi2_dst_scratch += out_stride; 4389 } 4390 4391 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]); 4392 4393 /* o11[0-3] */ 4394 { 4395 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4396 4397 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_20); 4398 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_20); 4399 4400 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 4401 m_count = _mm_cvtsi32_si128(i4_shift); 4402 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 4403 
m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 4404 4405 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 4406 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 4407 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 4408 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 4409 4410 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 4411 4412 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 4413 pi2_dst_scratch += out_stride; 4414 4415 } 4416 4417 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]); 4418 4419 /* o12[0-3] */ 4420 { 4421 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4422 4423 m_temp_reg_31 = _mm_add_epi32(temp8, m_temp_reg_20); 4424 m_temp_reg_30 = _mm_sub_epi32(temp8, m_temp_reg_20); 4425 4426 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 4427 m_count = _mm_cvtsi32_si128(i4_shift); 4428 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 4429 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 4430 4431 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 4432 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 4433 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 4434 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 4435 4436 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 4437 4438 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 4439 pi2_dst_scratch += out_stride; 4440 4441 } 4442 4443 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]); 4444 4445 /* o13[0-3] */ 4446 { 4447 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4448 4449 m_temp_reg_31 = _mm_add_epi32(temp6, m_temp_reg_20); 4450 m_temp_reg_30 = _mm_sub_epi32(temp6, m_temp_reg_20); 4451 4452 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 4453 m_count = _mm_cvtsi32_si128(i4_shift); 4454 m_rdng_factor = 
_mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 4455 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 4456 4457 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 4458 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 4459 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 4460 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 4461 4462 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 4463 4464 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 4465 pi2_dst_scratch += out_stride; 4466 } 4467 4468 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]); 4469 4470 /* o14[0-3] */ 4471 { 4472 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4473 4474 m_temp_reg_31 = _mm_add_epi32(temp4, m_temp_reg_20); 4475 m_temp_reg_30 = _mm_sub_epi32(temp4, m_temp_reg_20); 4476 4477 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 4478 m_count = _mm_cvtsi32_si128(i4_shift); 4479 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 4480 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 4481 4482 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 4483 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 4484 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 4485 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 4486 4487 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 4488 4489 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 4490 pi2_dst_scratch += out_stride; 4491 4492 } 4493 4494 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]); 4495 4496 /* o15[0-3] */ 4497 { 4498 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4499 4500 m_temp_reg_31 = _mm_add_epi32(temp2, m_temp_reg_20); 4501 m_temp_reg_30 = _mm_sub_epi32(temp2, m_temp_reg_20); 4502 4503 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 4504 m_count = _mm_cvtsi32_si128(i4_shift); 
4505 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 4506 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 4507 4508 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 4509 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 4510 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 4511 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 4512 4513 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 4514 4515 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 4516 pi2_dst_scratch += 8; 4517 } 4518 4519 } 4520 4521 } 4522 else if(zero_last24_rows_stg2) 4523 { 4524 /* eo */ 4525 { 4526 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87 4527 4528 m_temp_reg_10 = _mm_loadu_si128((__m128i *)&pi2_tmp[2 * trans_size]); 4529 m_temp_reg_11 = _mm_loadu_si128((__m128i *)&pi2_tmp[6 * trans_size]); 4530 4531 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_10, m_temp_reg_11); 4532 4533 4534 /* eo0[0-3] */ 4535 { 4536 m_temp_reg_90 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4537 4538 } 4539 4540 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87 57 4541 4542 /* eo1[0-3] */ 4543 { 4544 m_temp_reg_91 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4545 4546 } 4547 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80 9 4548 4549 /* eo2[0-3] */ 4550 { 4551 m_temp_reg_92 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4552 4553 } 4554 4555 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70 -43 4556 4557 /* eo3[0-3] */ 4558 { 4559 4560 m_temp_reg_93 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4561 4562 } 4563 4564 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57 -80 4565 4566 /* eo4[0-3] */ 4567 { 4568 m_temp_reg_94 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4569 4570 } 4571 4572 m_coeff1 = _mm_load_si128((__m128i 
*)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43 -90 4573 4574 /* eo5[0-3] */ 4575 { 4576 m_temp_reg_95 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4577 } 4578 4579 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25 -70 4580 /* eo6[0-3] */ 4581 { 4582 m_temp_reg_96 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4583 } 4584 4585 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); //9 -25 4586 /* eo7[0-3] */ 4587 { 4588 m_temp_reg_97 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4589 4590 } 4591 4592 } 4593 4594 /* eeo */ 4595 { 4596 4597 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75 4598 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75 4599 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18 4600 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[8][0]); //50 4601 4602 m_temp_reg_72 = _mm_loadu_si128((__m128i *)&pi2_tmp[4 * trans_size]); 4603 4604 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, all_zero_reg); 4605 4606 /* eeo0[0-3] */ 4607 { 4608 temp1 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4609 4610 } 4611 4612 /* eeo1[0-3] */ 4613 { 4614 temp2 = _mm_madd_epi16(m_temp_reg_10, m_coeff2); 4615 4616 } 4617 4618 /* eo2[0-3] */ 4619 { 4620 temp3 = _mm_madd_epi16(m_temp_reg_10, m_coeff4); 4621 4622 } 4623 4624 4625 /* eo3[0-3] */ 4626 { 4627 temp4 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 4628 4629 } 4630 4631 } 4632 4633 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83 4634 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36 4635 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64 4636 4637 m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[0 * trans_size]); 4638 4639 //m_temp_reg_1 = _mm_cvtepi16_epi32(m_temp_reg_70); 4640 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, 
all_zero_reg); 4641 4642 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); 4643 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); 4644 4645 m_temp_reg_70 = _mm_add_epi32(m_temp_reg_14, temp1); /* ee[0] */ 4646 m_temp_reg_71 = _mm_sub_epi32(m_temp_reg_14, temp1); /* ee[7] */ 4647 4648 m_temp_reg_72 = _mm_add_epi32(m_temp_reg_16, temp2); /* ee[1] */ 4649 m_temp_reg_73 = _mm_sub_epi32(m_temp_reg_16, temp2); /* ee[6] */ 4650 4651 m_temp_reg_74 = _mm_add_epi32(m_temp_reg_16, temp3); /* ee[2] */ 4652 m_temp_reg_75 = _mm_sub_epi32(m_temp_reg_16, temp3); /* ee[5] */ 4653 4654 m_temp_reg_76 = _mm_add_epi32(m_temp_reg_14, temp4); /* ee[3] */ 4655 m_temp_reg_77 = _mm_sub_epi32(m_temp_reg_14, temp4); /* ee[4] */ 4656 4657 /* e[]*/ 4658 4659 temp1 = _mm_add_epi32(m_temp_reg_70, m_temp_reg_90); /* ee[0] */ 4660 temp2 = _mm_sub_epi32(m_temp_reg_70, m_temp_reg_90); /* ee[15] */ 4661 4662 temp3 = _mm_add_epi32(m_temp_reg_72, m_temp_reg_91); /* ee[1] */ 4663 temp4 = _mm_sub_epi32(m_temp_reg_72, m_temp_reg_91); /* ee[14] */ 4664 4665 temp5 = _mm_add_epi32(m_temp_reg_74, m_temp_reg_92); /* ee[2] */ 4666 temp6 = _mm_sub_epi32(m_temp_reg_74, m_temp_reg_92); /* ee[13] */ 4667 4668 temp7 = _mm_add_epi32(m_temp_reg_76, m_temp_reg_93); /* ee[3] */ 4669 temp8 = _mm_sub_epi32(m_temp_reg_76, m_temp_reg_93); /* ee[12] */ 4670 4671 m_temp_reg_90 = _mm_add_epi32(m_temp_reg_77, m_temp_reg_94); /* ee[4] */ 4672 m_temp_reg_91 = _mm_sub_epi32(m_temp_reg_77, m_temp_reg_94); /* ee[11] */ 4673 4674 m_temp_reg_92 = _mm_add_epi32(m_temp_reg_75, m_temp_reg_95); /* ee[5] */ 4675 m_temp_reg_93 = _mm_sub_epi32(m_temp_reg_75, m_temp_reg_95); /* ee[10] */ 4676 4677 m_temp_reg_94 = _mm_add_epi32(m_temp_reg_73, m_temp_reg_96); /* ee[6] */ 4678 m_temp_reg_95 = _mm_sub_epi32(m_temp_reg_73, m_temp_reg_96); /* ee[9] */ 4679 4680 m_temp_reg_96 = _mm_add_epi32(m_temp_reg_71, m_temp_reg_97); /* ee[7] */ 4681 m_temp_reg_97 = _mm_sub_epi32(m_temp_reg_71, m_temp_reg_97); /* ee[8] */ 4682 4683 /*o[k] */ 4684 { 
4685 4686 WORD16 *pi2_dst_scratch = temp_ptr; 4687 WORD32 out_stride = 8; 4688 4689 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]); 4690 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]); 4691 4692 m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[trans_size]); 4693 m_temp_reg_71 = _mm_loadu_si128((__m128i *)&pi2_tmp[3 * trans_size]); 4694 m_temp_reg_72 = _mm_loadu_si128((__m128i *)&pi2_tmp[5 * trans_size]); 4695 m_temp_reg_73 = _mm_loadu_si128((__m128i *)&pi2_tmp[7 * trans_size]); 4696 4697 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); 4698 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); 4699 4700 /* o0[0-3] */ 4701 { 4702 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4703 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 4704 4705 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 4706 4707 m_temp_reg_31 = _mm_sub_epi32(temp1, m_temp_reg_20); 4708 m_temp_reg_30 = _mm_add_epi32(temp1, m_temp_reg_20); 4709 4710 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 4711 m_count = _mm_cvtsi32_si128(i4_shift); 4712 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 4713 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 4714 4715 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 4716 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 4717 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 4718 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 4719 4720 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 4721 4722 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 4723 pi2_dst_scratch += out_stride; 4724 4725 } 4726 4727 4728 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]); 4729 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]); 4730 4731 /* o1[0-3] */ 4732 { 4733 m_temp_reg_20 = 
_mm_madd_epi16(m_temp_reg_10, m_coeff1); 4734 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 4735 4736 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 4737 4738 m_temp_reg_31 = _mm_sub_epi32(temp3, m_temp_reg_20); 4739 m_temp_reg_30 = _mm_add_epi32(temp3, m_temp_reg_20); 4740 4741 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 4742 m_count = _mm_cvtsi32_si128(i4_shift); 4743 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 4744 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 4745 4746 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 4747 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 4748 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 4749 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 4750 4751 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 4752 4753 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 4754 pi2_dst_scratch += out_stride; 4755 4756 } 4757 4758 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]); 4759 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]); 4760 4761 /* o2[0-3] */ 4762 { 4763 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4764 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 4765 4766 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20); 4767 4768 m_temp_reg_31 = _mm_add_epi32(temp5, m_temp_reg_20); 4769 m_temp_reg_30 = _mm_sub_epi32(temp5, m_temp_reg_20); 4770 4771 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 4772 m_count = _mm_cvtsi32_si128(i4_shift); 4773 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 4774 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 4775 4776 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 4777 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 4778 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 4779 m_temp_reg_30 = 
_mm_sra_epi32(m_temp_reg_30, m_count); 4780 4781 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 4782 4783 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 4784 pi2_dst_scratch += out_stride; 4785 4786 } 4787 4788 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]); 4789 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[25][0]); 4790 4791 /* o3[0-3] */ 4792 { 4793 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4794 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 4795 4796 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20); 4797 4798 m_temp_reg_31 = _mm_add_epi32(temp7, m_temp_reg_20); 4799 m_temp_reg_30 = _mm_sub_epi32(temp7, m_temp_reg_20); 4800 4801 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 4802 m_count = _mm_cvtsi32_si128(i4_shift); 4803 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 4804 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 4805 4806 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 4807 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 4808 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 4809 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 4810 4811 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 4812 4813 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 4814 pi2_dst_scratch += out_stride; 4815 4816 } 4817 4818 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]); 4819 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[33][0]); 4820 4821 /* o4[0-3] */ 4822 { 4823 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4824 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 4825 4826 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20); 4827 4828 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_20); 4829 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_90, 
m_temp_reg_20); 4830 4831 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 4832 m_count = _mm_cvtsi32_si128(i4_shift); 4833 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 4834 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 4835 4836 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 4837 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 4838 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 4839 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 4840 4841 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 4842 4843 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 4844 pi2_dst_scratch += out_stride; 4845 4846 } 4847 4848 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]); 4849 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[41][0]); 4850 4851 /* o5[0-3] */ 4852 { 4853 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4854 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 4855 4856 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20); 4857 4858 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_20); 4859 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_20); 4860 4861 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 4862 m_count = _mm_cvtsi32_si128(i4_shift); 4863 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 4864 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 4865 4866 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 4867 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 4868 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 4869 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 4870 4871 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 4872 4873 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 4874 pi2_dst_scratch += out_stride; 4875 4876 } 4877 4878 m_coeff1 = 
_mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]); 4879 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[49][0]); 4880 4881 /* o6[0-3] */ 4882 { 4883 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4884 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 4885 4886 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20); 4887 4888 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_20); 4889 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_20); 4890 4891 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 4892 m_count = _mm_cvtsi32_si128(i4_shift); 4893 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 4894 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 4895 4896 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 4897 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 4898 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 4899 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 4900 4901 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 4902 4903 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 4904 pi2_dst_scratch += out_stride; 4905 4906 } 4907 4908 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]); 4909 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[57][0]); 4910 4911 /* o7[0-3] */ 4912 { 4913 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4914 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 4915 4916 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20); 4917 4918 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_20); 4919 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_20); 4920 4921 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 4922 m_count = _mm_cvtsi32_si128(i4_shift); 4923 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 4924 m_rdng_factor = 
_mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 4925 4926 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 4927 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 4928 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 4929 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 4930 4931 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 4932 4933 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 4934 pi2_dst_scratch += 8; 4935 4936 } 4937 4938 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]); 4939 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[65][0]); 4940 4941 /* o8[0-3] */ 4942 { 4943 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 4944 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 4945 4946 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20); 4947 4948 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_20); 4949 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_20); 4950 4951 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 4952 m_count = _mm_cvtsi32_si128(i4_shift); 4953 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 4954 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 4955 4956 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 4957 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 4958 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 4959 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 4960 4961 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 4962 4963 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 4964 pi2_dst_scratch += out_stride; 4965 } 4966 4967 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]); 4968 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[73][0]); 4969 4970 /* o9[0-3] */ 4971 { 4972 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, 
m_coeff1); 4973 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 4974 4975 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20); 4976 4977 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_20); 4978 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_20); 4979 4980 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 4981 m_count = _mm_cvtsi32_si128(i4_shift); 4982 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 4983 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 4984 4985 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 4986 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 4987 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 4988 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 4989 4990 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 4991 4992 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 4993 pi2_dst_scratch += out_stride; 4994 } 4995 4996 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]); 4997 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[81][0]); 4998 4999 /* o10[0-3] */ 5000 { 5001 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 5002 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 5003 5004 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20); 5005 5006 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_20); 5007 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_20); 5008 5009 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 5010 m_count = _mm_cvtsi32_si128(i4_shift); 5011 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 5012 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 5013 5014 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 5015 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 5016 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 5017 m_temp_reg_30 = 
_mm_sra_epi32(m_temp_reg_30, m_count); 5018 5019 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 5020 5021 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 5022 pi2_dst_scratch += out_stride; 5023 } 5024 5025 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]); 5026 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[89][0]); 5027 5028 /* o11[0-3] */ 5029 { 5030 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 5031 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 5032 5033 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20); 5034 5035 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_20); 5036 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_20); 5037 5038 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 5039 m_count = _mm_cvtsi32_si128(i4_shift); 5040 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 5041 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 5042 5043 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 5044 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 5045 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 5046 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 5047 5048 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 5049 5050 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 5051 pi2_dst_scratch += out_stride; 5052 5053 } 5054 5055 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]); 5056 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[97][0]); 5057 5058 /* o12[0-3] */ 5059 { 5060 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 5061 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 5062 5063 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20); 5064 5065 m_temp_reg_31 = _mm_add_epi32(temp8, m_temp_reg_20); 5066 m_temp_reg_30 = _mm_sub_epi32(temp8, 
m_temp_reg_20); 5067 5068 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 5069 m_count = _mm_cvtsi32_si128(i4_shift); 5070 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 5071 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 5072 5073 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 5074 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 5075 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 5076 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 5077 5078 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 5079 5080 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 5081 pi2_dst_scratch += out_stride; 5082 5083 } 5084 5085 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]); 5086 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[105][0]); 5087 5088 /* o13[0-3] */ 5089 { 5090 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 5091 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 5092 5093 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20); 5094 5095 m_temp_reg_31 = _mm_add_epi32(temp6, m_temp_reg_20); 5096 m_temp_reg_30 = _mm_sub_epi32(temp6, m_temp_reg_20); 5097 5098 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 5099 m_count = _mm_cvtsi32_si128(i4_shift); 5100 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 5101 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 5102 5103 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 5104 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 5105 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 5106 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 5107 5108 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 5109 5110 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 5111 pi2_dst_scratch += out_stride; 5112 } 5113 5114 m_coeff1 = _mm_load_si128((__m128i 
*)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]); 5115 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[113][0]); 5116 5117 /* o14[0-3] */ 5118 { 5119 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 5120 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 5121 5122 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20); 5123 5124 m_temp_reg_31 = _mm_add_epi32(temp4, m_temp_reg_20); 5125 m_temp_reg_30 = _mm_sub_epi32(temp4, m_temp_reg_20); 5126 5127 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 5128 m_count = _mm_cvtsi32_si128(i4_shift); 5129 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 5130 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 5131 5132 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 5133 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 5134 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 5135 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 5136 5137 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 5138 5139 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 5140 pi2_dst_scratch += out_stride; 5141 } 5142 5143 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]); 5144 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[121][0]); 5145 5146 /* o15[0-3] */ 5147 { 5148 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 5149 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 5150 5151 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20); 5152 5153 m_temp_reg_31 = _mm_add_epi32(temp2, m_temp_reg_20); 5154 m_temp_reg_30 = _mm_sub_epi32(temp2, m_temp_reg_20); 5155 5156 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 5157 m_count = _mm_cvtsi32_si128(i4_shift); 5158 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 5159 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 5160 5161 m_temp_reg_31 = 
_mm_add_epi32(m_temp_reg_31, m_rdng_factor); 5162 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 5163 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 5164 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 5165 5166 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 5167 5168 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 5169 pi2_dst_scratch += 8; 5170 } 5171 5172 } 5173 } 5174 else 5175 { 5176 /* eo */ 5177 { 5178 5179 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[0][0]); //90 87 5180 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[1][0]); //80 70 5181 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[2][0]); //57 43 5182 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[3][0]); //25 9 5183 5184 5185 m_temp_reg_10 = _mm_loadu_si128((__m128i *)&pi2_tmp[2 * trans_size]); 5186 m_temp_reg_11 = _mm_loadu_si128((__m128i *)&pi2_tmp[6 * trans_size]); 5187 m_temp_reg_12 = _mm_loadu_si128((__m128i *)&pi2_tmp[10 * trans_size]); 5188 m_temp_reg_13 = _mm_loadu_si128((__m128i *)&pi2_tmp[14 * trans_size]); 5189 m_temp_reg_18 = _mm_loadu_si128((__m128i *)&pi2_tmp[18 * trans_size]); 5190 m_temp_reg_19 = _mm_loadu_si128((__m128i *)&pi2_tmp[22 * trans_size]); 5191 m_temp_reg_20 = _mm_loadu_si128((__m128i *)&pi2_tmp[26 * trans_size]); 5192 m_temp_reg_21 = _mm_loadu_si128((__m128i *)&pi2_tmp[30 * trans_size]); 5193 5194 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_10, m_temp_reg_11); 5195 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_12, m_temp_reg_13); 5196 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_18, m_temp_reg_19); 5197 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_21); 5198 5199 /* eo0[0-3] */ 5200 { 5201 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 5202 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 5203 5204 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 5205 5206 m_temp_reg_32 = 
_mm_madd_epi16(m_temp_reg_12, m_coeff3); 5207 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 5208 5209 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); 5210 5211 m_temp_reg_90 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 5212 5213 } 5214 5215 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[4][0]); //87 57 5216 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[5][0]); //0 -43 5217 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[6][0]); //80 90 5218 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[7][0]); //70 25 5219 5220 /* eo1[0-3] */ 5221 { 5222 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 5223 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 5224 5225 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 5226 5227 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 5228 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 5229 5230 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); 5231 5232 m_temp_reg_91 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_32); 5233 5234 } 5235 5236 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[8][0]); //80 9 5237 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[9][0]); //70 87 5238 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[10][0]); //-25 57 5239 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[11][0]); //90 43 5240 5241 /* eo2[0-3] */ 5242 { 5243 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 5244 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 5245 5246 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31); 5247 5248 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 5249 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 5250 5251 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); 5252 5253 m_temp_reg_92 = _mm_add_epi32(m_temp_reg_30, 
m_temp_reg_32); 5254 5255 } 5256 5257 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[12][0]); //70 -43 5258 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[13][0]); //-87 9 5259 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[14][0]); //90 25 5260 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[15][0]); //80 57 5261 5262 /* eo3[0-3] */ 5263 { 5264 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 5265 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 5266 5267 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 5268 5269 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 5270 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 5271 5272 m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_32, m_temp_reg_33); 5273 5274 m_temp_reg_93 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 5275 5276 } 5277 5278 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[16][0]); //57 -80 5279 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[17][0]); //-25 90 5280 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[18][0]); //9 87 5281 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[19][0]); //43 70 5282 5283 5284 /* eo4[0-3] */ 5285 { 5286 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 5287 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 5288 5289 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 5290 5291 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 5292 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 5293 5294 m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32); 5295 5296 m_temp_reg_94 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 5297 5298 } 5299 5300 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[20][0]); //43 -90 5301 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[21][0]); //57 
25 5302 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[22][0]); //-87 70 5303 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[23][0]); //9 -80 5304 5305 /* eo5[0-3] */ 5306 { 5307 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 5308 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 5309 5310 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 5311 5312 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 5313 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 5314 5315 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); 5316 5317 m_temp_reg_95 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 5318 } 5319 5320 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[24][0]); //25 -70 5321 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[25][0]); //90 -80 5322 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[26][0]); //43 9 5323 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[27][0]); //-57 87 5324 5325 /* eo6[0-3] */ 5326 { 5327 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 5328 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 5329 5330 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 5331 5332 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 5333 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 5334 5335 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); 5336 5337 m_temp_reg_96 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 5338 5339 } 5340 5341 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[28][0]); //9 -25 5342 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[29][0]); //43 -57 5343 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[30][0]); //70 -80 5344 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_packed[31][0]); //87 -90 5345 5346 /* eo7[0-3] */ 5347 { 
5348 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 5349 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 5350 5351 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 5352 5353 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 5354 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 5355 5356 m_temp_reg_32 = _mm_add_epi32(m_temp_reg_32, m_temp_reg_33); 5357 5358 m_temp_reg_97 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_32); 5359 5360 5361 } 5362 5363 } 5364 5365 /* eeo */ 5366 { 5367 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[4][0]); //89 75 5368 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[5][0]); //50 18 5369 5370 m_temp_reg_72 = _mm_loadu_si128((__m128i *)&pi2_tmp[4 * trans_size]); 5371 m_temp_reg_76 = _mm_loadu_si128((__m128i *)&pi2_tmp[12 * trans_size]); 5372 m_temp_reg_82 = _mm_loadu_si128((__m128i *)&pi2_tmp[20 * trans_size]); 5373 m_temp_reg_86 = _mm_loadu_si128((__m128i *)&pi2_tmp[28 * trans_size]); 5374 5375 /* eeo0[0-3] */ 5376 { 5377 5378 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); 5379 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_86); 5380 5381 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 5382 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 5383 5384 temp1 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 5385 5386 } 5387 5388 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[6][0]); //75 -18 5389 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[7][0]); //89 50 5390 5391 /* eeo1[0-3] */ 5392 { 5393 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 5394 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4); 5395 5396 temp2 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_31); 5397 5398 } 5399 5400 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[8][0]); //50 -89 5401 m_coeff4 = _mm_load_si128((__m128i 
*)&g_ai2_ihevc_trans_16_even_packed[9][0]); //18 75 5402 5403 /* eo2[0-3] */ 5404 { 5405 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 5406 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4); 5407 5408 temp3 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 5409 5410 } 5411 5412 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[10][0]); //18 -50 5413 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[11][0]); //75 -89 5414 5415 /* eo3[0-3] */ 5416 { 5417 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 5418 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff4); 5419 5420 temp4 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 5421 5422 } 5423 5424 5425 } 5426 5427 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[0][0]); //83 36 5428 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[1][0]); //36 -83 5429 5430 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[2][0]); //64 64 5431 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_16_even_packed[3][0]); //64 -64 5432 5433 m_temp_reg_74 = _mm_loadu_si128((__m128i *)&pi2_tmp[8 * trans_size]); 5434 m_temp_reg_84 = _mm_loadu_si128((__m128i *)&pi2_tmp[24 * trans_size]); 5435 5436 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_84); 5437 5438 m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[0 * trans_size]); 5439 m_temp_reg_80 = _mm_loadu_si128((__m128i *)&pi2_tmp[16 * trans_size]); 5440 5441 m_temp_reg_1 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_80); 5442 5443 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); /* eeeo[0] */ 5444 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); /* eeeo[1] */ 5445 5446 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); /* eeee[0] */ 5447 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff4); /* eeee[1] */ 5448 5449 /* eeeo[0]= m_temp_reg_20 */ 5450 /* eeeo[1]= m_temp_reg_21 */ 5451 /* eeee[0]= m_temp_reg_22 */ 5452 /* eeee[1]= 
m_temp_reg_23 */ 5453 5454 /* eee[0] = eeee[0] + eeeo[0]; */ 5455 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_20); /* eeeo[0] */ 5456 5457 /* eee[3] = eeee[0] - eeeo[0]; */ 5458 m_temp_reg_43 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20); /* eeeo[1] */ 5459 5460 /* eee[2] = eeee[1] - eeeo[1]; */ 5461 m_temp_reg_42 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_22); /* eeee[1] */ 5462 5463 /* eee[1] = eeee[1] + eeeo[1];*/ 5464 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_22); /* eeee[0] */ 5465 5466 m_temp_reg_70 = _mm_add_epi32(m_temp_reg_40, temp1); /* ee[0] */ 5467 m_temp_reg_71 = _mm_sub_epi32(m_temp_reg_40, temp1); /* ee[7] */ 5468 5469 m_temp_reg_72 = _mm_add_epi32(m_temp_reg_41, temp2); /* ee[1] */ 5470 m_temp_reg_73 = _mm_sub_epi32(m_temp_reg_41, temp2); /* ee[6] */ 5471 5472 m_temp_reg_74 = _mm_add_epi32(m_temp_reg_42, temp3); /* ee[2] */ 5473 m_temp_reg_75 = _mm_sub_epi32(m_temp_reg_42, temp3); /* ee[5] */ 5474 5475 m_temp_reg_76 = _mm_add_epi32(m_temp_reg_43, temp4); /* ee[3] */ 5476 m_temp_reg_77 = _mm_sub_epi32(m_temp_reg_43, temp4); /* ee[4] */ 5477 5478 /* e[]*/ 5479 5480 temp1 = _mm_add_epi32(m_temp_reg_70, m_temp_reg_90); /* ee[0] */ 5481 temp2 = _mm_sub_epi32(m_temp_reg_70, m_temp_reg_90); /* ee[15] */ 5482 5483 temp3 = _mm_add_epi32(m_temp_reg_72, m_temp_reg_91); /* ee[1] */ 5484 temp4 = _mm_sub_epi32(m_temp_reg_72, m_temp_reg_91); /* ee[14] */ 5485 5486 temp5 = _mm_add_epi32(m_temp_reg_74, m_temp_reg_92); /* ee[2] */ 5487 temp6 = _mm_sub_epi32(m_temp_reg_74, m_temp_reg_92); /* ee[13] */ 5488 5489 temp7 = _mm_add_epi32(m_temp_reg_76, m_temp_reg_93); /* ee[3] */ 5490 temp8 = _mm_sub_epi32(m_temp_reg_76, m_temp_reg_93); /* ee[12] */ 5491 5492 m_temp_reg_90 = _mm_add_epi32(m_temp_reg_77, m_temp_reg_94); /* ee[4] */ 5493 m_temp_reg_91 = _mm_sub_epi32(m_temp_reg_77, m_temp_reg_94); /* ee[11] */ 5494 5495 m_temp_reg_92 = _mm_add_epi32(m_temp_reg_75, m_temp_reg_95); /* ee[5] */ 5496 m_temp_reg_93 = _mm_sub_epi32(m_temp_reg_75, 
m_temp_reg_95); /* ee[10] */ 5497 5498 m_temp_reg_94 = _mm_add_epi32(m_temp_reg_73, m_temp_reg_96); /* ee[6] */ 5499 m_temp_reg_95 = _mm_sub_epi32(m_temp_reg_73, m_temp_reg_96); /* ee[9] */ 5500 5501 m_temp_reg_96 = _mm_add_epi32(m_temp_reg_71, m_temp_reg_97); /* ee[7] */ 5502 m_temp_reg_97 = _mm_sub_epi32(m_temp_reg_71, m_temp_reg_97); /* ee[8] */ 5503 5504 /*o[k] */ 5505 { 5506 5507 WORD16 *pi2_dst_scratch = temp_ptr; 5508 WORD32 out_stride = 8; 5509 5510 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[0][0]); 5511 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[1][0]); 5512 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[2][0]); 5513 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[3][0]); 5514 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[4][0]); 5515 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[5][0]); 5516 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[6][0]); 5517 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[7][0]); 5518 5519 5520 m_temp_reg_70 = _mm_loadu_si128((__m128i *)&pi2_tmp[trans_size]); 5521 m_temp_reg_71 = _mm_loadu_si128((__m128i *)&pi2_tmp[3 * trans_size]); 5522 m_temp_reg_72 = _mm_loadu_si128((__m128i *)&pi2_tmp[5 * trans_size]); 5523 m_temp_reg_73 = _mm_loadu_si128((__m128i *)&pi2_tmp[7 * trans_size]); 5524 m_temp_reg_74 = _mm_loadu_si128((__m128i *)&pi2_tmp[9 * trans_size]); 5525 m_temp_reg_75 = _mm_loadu_si128((__m128i *)&pi2_tmp[11 * trans_size]); 5526 m_temp_reg_76 = _mm_loadu_si128((__m128i *)&pi2_tmp[13 * trans_size]); 5527 m_temp_reg_77 = _mm_loadu_si128((__m128i *)&pi2_tmp[15 * trans_size]); 5528 5529 m_temp_reg_80 = _mm_loadu_si128((__m128i *)&pi2_tmp[17 * trans_size]); 5530 m_temp_reg_81 = _mm_loadu_si128((__m128i *)&pi2_tmp[19 * trans_size]); 5531 m_temp_reg_82 = _mm_loadu_si128((__m128i *)&pi2_tmp[21 * 
trans_size]); 5532 m_temp_reg_83 = _mm_loadu_si128((__m128i *)&pi2_tmp[23 * trans_size]); 5533 m_temp_reg_84 = _mm_loadu_si128((__m128i *)&pi2_tmp[25 * trans_size]); 5534 m_temp_reg_85 = _mm_loadu_si128((__m128i *)&pi2_tmp[27 * trans_size]); 5535 m_temp_reg_86 = _mm_loadu_si128((__m128i *)&pi2_tmp[29 * trans_size]); 5536 m_temp_reg_87 = _mm_loadu_si128((__m128i *)&pi2_tmp[31 * trans_size]); 5537 5538 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 interleaved 5539 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 interleaved 5540 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 interleaved 5541 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 interleaved 5542 m_temp_reg_14 = _mm_unpacklo_epi16(m_temp_reg_80, m_temp_reg_81); //row 17 and row 19 interleaved 5543 m_temp_reg_15 = _mm_unpacklo_epi16(m_temp_reg_82, m_temp_reg_83); //row 21 and row 23 interleaved 5544 m_temp_reg_16 = _mm_unpacklo_epi16(m_temp_reg_84, m_temp_reg_85); //row 25 and row 27 interleaved 5545 m_temp_reg_17 = _mm_unpacklo_epi16(m_temp_reg_86, m_temp_reg_87); //row 29 and row 31 interleaved 5546 5547 /* o0[0-3] */ 5548 { 5549 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 5550 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 5551 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 5552 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 5553 5554 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 5555 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 5556 5557 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 5558 5559 m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5); 5560 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6); 5561 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7); 5562 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8); 5563 5564 m_temp_reg_40 = 
_mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 5565 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 5566 5567 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 5568 5569 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 5570 5571 m_temp_reg_31 = _mm_sub_epi32(temp1, m_temp_reg_20); 5572 m_temp_reg_30 = _mm_add_epi32(temp1, m_temp_reg_20); 5573 5574 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 5575 m_count = _mm_cvtsi32_si128(i4_shift); 5576 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 5577 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 5578 5579 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 5580 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 5581 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 5582 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 5583 5584 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 5585 5586 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 5587 pi2_dst_scratch += out_stride; 5588 5589 } 5590 5591 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[8][0]); 5592 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[9][0]); 5593 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[10][0]); 5594 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[11][0]); 5595 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[12][0]); 5596 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[13][0]); 5597 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[14][0]); 5598 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[15][0]); 5599 5600 /* o1[0-3] */ 5601 { 5602 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 5603 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 5604 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, 
m_coeff3); 5605 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 5606 5607 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 5608 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 5609 5610 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_20); 5611 5612 m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5); 5613 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6); 5614 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7); 5615 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8); 5616 5617 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 5618 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 5619 5620 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 5621 5622 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 5623 5624 m_temp_reg_31 = _mm_add_epi32(temp3, m_temp_reg_20); 5625 m_temp_reg_30 = _mm_sub_epi32(temp3, m_temp_reg_20); 5626 5627 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 5628 m_count = _mm_cvtsi32_si128(i4_shift); 5629 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 5630 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 5631 5632 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 5633 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 5634 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 5635 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 5636 5637 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 5638 5639 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 5640 pi2_dst_scratch += out_stride; 5641 5642 } 5643 5644 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[16][0]); 5645 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[17][0]); 5646 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[18][0]); 5647 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[19][0]); 
5648 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[20][0]); 5649 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[21][0]); 5650 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[22][0]); 5651 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[23][0]); 5652 5653 /* o2[0-3] */ 5654 { 5655 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 5656 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 5657 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 5658 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 5659 5660 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20); 5661 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 5662 5663 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 5664 5665 m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5); 5666 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6); 5667 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7); 5668 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8); 5669 5670 m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_41); 5671 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 5672 5673 m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_42); 5674 5675 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 5676 5677 m_temp_reg_31 = _mm_add_epi32(temp5, m_temp_reg_20); 5678 m_temp_reg_30 = _mm_sub_epi32(temp5, m_temp_reg_20); 5679 5680 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 5681 m_count = _mm_cvtsi32_si128(i4_shift); 5682 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 5683 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 5684 5685 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 5686 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 5687 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 5688 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, 
m_count); 5689 5690 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 5691 5692 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 5693 pi2_dst_scratch += out_stride; 5694 5695 } 5696 5697 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[24][0]); 5698 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[25][0]); 5699 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[26][0]); 5700 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[27][0]); 5701 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[28][0]); 5702 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[29][0]); 5703 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[30][0]); 5704 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[31][0]); 5705 5706 /* o3[0-3] */ 5707 { 5708 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 5709 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 5710 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 5711 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 5712 5713 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_21, m_temp_reg_20); 5714 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 5715 5716 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 5717 5718 m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5); 5719 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6); 5720 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7); 5721 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8); 5722 5723 m_temp_reg_40 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_40); 5724 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 5725 5726 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 5727 5728 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 5729 5730 m_temp_reg_31 = _mm_add_epi32(temp7, m_temp_reg_20); 
5731 m_temp_reg_30 = _mm_sub_epi32(temp7, m_temp_reg_20); 5732 5733 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 5734 m_count = _mm_cvtsi32_si128(i4_shift); 5735 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 5736 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 5737 5738 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 5739 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 5740 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 5741 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 5742 5743 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 5744 5745 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 5746 pi2_dst_scratch += out_stride; 5747 5748 } 5749 5750 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[32][0]); 5751 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[33][0]); 5752 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[34][0]); 5753 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[35][0]); 5754 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[36][0]); 5755 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[37][0]); 5756 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[38][0]); 5757 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[39][0]); 5758 5759 /* o4[0-3] */ 5760 { 5761 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 5762 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 5763 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 5764 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 5765 5766 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 5767 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 5768 5769 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 5770 5771 m_temp_reg_40 = 
_mm_madd_epi16(m_temp_reg_14, m_coeff5); 5772 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6); 5773 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7); 5774 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8); 5775 5776 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 5777 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 5778 5779 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 5780 5781 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 5782 5783 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_90, m_temp_reg_20); 5784 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_90, m_temp_reg_20); 5785 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 5786 m_count = _mm_cvtsi32_si128(i4_shift); 5787 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 5788 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 5789 5790 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 5791 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 5792 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 5793 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 5794 5795 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 5796 5797 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 5798 pi2_dst_scratch += out_stride; 5799 5800 } 5801 5802 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[40][0]); 5803 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[41][0]); 5804 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[42][0]); 5805 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[43][0]); 5806 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[44][0]); 5807 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[45][0]); 5808 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[46][0]); 5809 m_coeff8 = 
_mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[47][0]); 5810 5811 /* o5[0-3] */ 5812 { 5813 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 5814 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 5815 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 5816 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 5817 5818 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 5819 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 5820 5821 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 5822 5823 m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5); 5824 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6); 5825 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7); 5826 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8); 5827 5828 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 5829 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 5830 5831 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 5832 5833 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 5834 5835 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_92, m_temp_reg_20); 5836 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_92, m_temp_reg_20); 5837 5838 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 5839 m_count = _mm_cvtsi32_si128(i4_shift); 5840 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 5841 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 5842 5843 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 5844 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 5845 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 5846 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 5847 5848 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 5849 5850 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 5851 pi2_dst_scratch += out_stride; 5852 5853 } 5854 5855 m_coeff1 = _mm_load_si128((__m128i 
*)&g_ai2_ihevc_trans_32_intr_odd_packed[48][0]); 5856 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[49][0]); 5857 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[50][0]); 5858 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[51][0]); 5859 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[52][0]); 5860 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[53][0]); 5861 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[54][0]); 5862 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[55][0]); 5863 5864 /* o6[0-3] */ 5865 { 5866 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 5867 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 5868 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 5869 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 5870 5871 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 5872 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 5873 5874 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 5875 5876 m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5); 5877 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6); 5878 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7); 5879 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8); 5880 5881 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 5882 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 5883 5884 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 5885 5886 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 5887 5888 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_94, m_temp_reg_20); 5889 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_94, m_temp_reg_20); 5890 5891 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 5892 m_count = _mm_cvtsi32_si128(i4_shift); 5893 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, 
m_rdng_factor); 5894 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 5895 5896 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 5897 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 5898 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 5899 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 5900 5901 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 5902 5903 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 5904 pi2_dst_scratch += out_stride; 5905 5906 } 5907 5908 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[56][0]); 5909 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[57][0]); 5910 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[58][0]); 5911 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[59][0]); 5912 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[60][0]); 5913 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[61][0]); 5914 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[62][0]); 5915 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[63][0]); 5916 5917 /* o7[0-3] */ 5918 { 5919 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 5920 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 5921 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 5922 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 5923 5924 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 5925 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 5926 5927 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 5928 5929 m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5); 5930 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6); 5931 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7); 5932 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8); 5933 
5934 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 5935 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 5936 5937 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 5938 5939 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 5940 5941 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_96, m_temp_reg_20); 5942 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_96, m_temp_reg_20); 5943 5944 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 5945 m_count = _mm_cvtsi32_si128(i4_shift); 5946 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 5947 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 5948 5949 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 5950 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 5951 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 5952 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 5953 5954 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 5955 5956 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 5957 pi2_dst_scratch += 8; 5958 5959 } 5960 5961 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[64][0]); 5962 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[65][0]); 5963 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[66][0]); 5964 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[67][0]); 5965 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[68][0]); 5966 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[69][0]); 5967 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[70][0]); 5968 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[71][0]); 5969 5970 /* o8[0-3] */ 5971 { 5972 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 5973 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 5974 m_temp_reg_22 = 
_mm_madd_epi16(m_temp_reg_12, m_coeff3); 5975 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 5976 5977 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 5978 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 5979 5980 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 5981 5982 m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5); 5983 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6); 5984 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7); 5985 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8); 5986 5987 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 5988 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 5989 5990 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 5991 5992 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 5993 5994 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_97, m_temp_reg_20); 5995 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_97, m_temp_reg_20); 5996 5997 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 5998 m_count = _mm_cvtsi32_si128(i4_shift); 5999 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 6000 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 6001 6002 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 6003 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 6004 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 6005 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 6006 6007 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 6008 6009 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 6010 pi2_dst_scratch += out_stride; 6011 } 6012 6013 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[72][0]); 6014 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[73][0]); 6015 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[74][0]); 6016 m_coeff4 = _mm_load_si128((__m128i 
*)&g_ai2_ihevc_trans_32_intr_odd_packed[75][0]); 6017 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[76][0]); 6018 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[77][0]); 6019 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[78][0]); 6020 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[79][0]); 6021 6022 /* o9[0-3] */ 6023 { 6024 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 6025 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 6026 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 6027 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 6028 6029 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 6030 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 6031 6032 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 6033 6034 m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5); 6035 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6); 6036 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7); 6037 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8); 6038 6039 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 6040 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 6041 6042 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 6043 6044 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 6045 6046 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_95, m_temp_reg_20); 6047 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_95, m_temp_reg_20); 6048 6049 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 6050 m_count = _mm_cvtsi32_si128(i4_shift); 6051 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 6052 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 6053 6054 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 6055 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 6056 m_temp_reg_31 = 
_mm_sra_epi32(m_temp_reg_31, m_count); 6057 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 6058 6059 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 6060 6061 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 6062 pi2_dst_scratch += out_stride; 6063 } 6064 6065 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[80][0]); 6066 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[81][0]); 6067 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[82][0]); 6068 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[83][0]); 6069 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[84][0]); 6070 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[85][0]); 6071 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[86][0]); 6072 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[87][0]); 6073 6074 /* o10[0-3] */ 6075 { 6076 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 6077 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 6078 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 6079 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 6080 6081 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 6082 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 6083 6084 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 6085 6086 m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5); 6087 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6); 6088 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7); 6089 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8); 6090 6091 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 6092 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 6093 6094 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 6095 6096 m_temp_reg_20 = 
_mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 6097 6098 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_93, m_temp_reg_20); 6099 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_93, m_temp_reg_20); 6100 6101 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 6102 m_count = _mm_cvtsi32_si128(i4_shift); 6103 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 6104 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 6105 6106 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 6107 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 6108 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 6109 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 6110 6111 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 6112 6113 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 6114 pi2_dst_scratch += out_stride; 6115 } 6116 6117 6118 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[88][0]); 6119 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[89][0]); 6120 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[90][0]); 6121 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[91][0]); 6122 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[92][0]); 6123 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[93][0]); 6124 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[94][0]); 6125 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[95][0]); 6126 6127 /* o11[0-3] */ 6128 { 6129 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 6130 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 6131 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 6132 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 6133 6134 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 6135 m_temp_reg_22 = 
_mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 6136 6137 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 6138 6139 m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5); 6140 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6); 6141 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7); 6142 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8); 6143 6144 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 6145 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 6146 6147 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 6148 6149 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 6150 6151 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_91, m_temp_reg_20); 6152 m_temp_reg_30 = _mm_sub_epi32(m_temp_reg_91, m_temp_reg_20); 6153 6154 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 6155 m_count = _mm_cvtsi32_si128(i4_shift); 6156 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 6157 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 6158 6159 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 6160 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 6161 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 6162 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 6163 6164 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 6165 6166 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 6167 pi2_dst_scratch += out_stride; 6168 6169 } 6170 6171 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[96][0]); 6172 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[97][0]); 6173 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[98][0]); 6174 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[99][0]); 6175 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[100][0]); 6176 m_coeff6 = _mm_load_si128((__m128i 
*)&g_ai2_ihevc_trans_32_intr_odd_packed[101][0]); 6177 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[102][0]); 6178 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[103][0]); 6179 6180 /* o12[0-3] */ 6181 { 6182 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 6183 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 6184 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 6185 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 6186 6187 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 6188 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 6189 6190 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 6191 6192 m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5); 6193 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6); 6194 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7); 6195 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8); 6196 6197 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 6198 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 6199 6200 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 6201 6202 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 6203 6204 m_temp_reg_31 = _mm_add_epi32(temp8, m_temp_reg_20); 6205 m_temp_reg_30 = _mm_sub_epi32(temp8, m_temp_reg_20); 6206 6207 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 6208 m_count = _mm_cvtsi32_si128(i4_shift); 6209 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 6210 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 6211 6212 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 6213 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 6214 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 6215 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 6216 6217 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 6218 6219 _mm_store_si128((__m128i 
*)pi2_dst_scratch, m_temp_reg_30); 6220 pi2_dst_scratch += out_stride; 6221 6222 } 6223 6224 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[104][0]); 6225 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[105][0]); 6226 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[106][0]); 6227 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[107][0]); 6228 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[108][0]); 6229 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[109][0]); 6230 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[110][0]); 6231 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[111][0]); 6232 6233 /* o13[0-3] */ 6234 { 6235 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 6236 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 6237 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 6238 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 6239 6240 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 6241 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 6242 6243 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 6244 6245 m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5); 6246 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6); 6247 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7); 6248 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8); 6249 6250 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 6251 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 6252 6253 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 6254 6255 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 6256 6257 m_temp_reg_31 = _mm_add_epi32(temp6, m_temp_reg_20); 6258 m_temp_reg_30 = _mm_sub_epi32(temp6, m_temp_reg_20); 6259 6260 m_rdng_factor = _mm_cvtsi32_si128((1 << 
(i4_shift - 1))); 6261 m_count = _mm_cvtsi32_si128(i4_shift); 6262 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 6263 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 6264 6265 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 6266 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 6267 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 6268 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 6269 6270 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 6271 6272 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 6273 pi2_dst_scratch += out_stride; 6274 } 6275 6276 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[112][0]); 6277 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[113][0]); 6278 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[114][0]); 6279 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[115][0]); 6280 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[116][0]); 6281 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[117][0]); 6282 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[118][0]); 6283 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[119][0]); 6284 6285 /* o14[0-3] */ 6286 { 6287 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 6288 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 6289 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 6290 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 6291 6292 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 6293 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 6294 6295 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 6296 6297 m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5); 6298 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6); 6299 
m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7); 6300 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8); 6301 6302 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 6303 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 6304 6305 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 6306 6307 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 6308 6309 m_temp_reg_31 = _mm_add_epi32(temp4, m_temp_reg_20); 6310 m_temp_reg_30 = _mm_sub_epi32(temp4, m_temp_reg_20); 6311 6312 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 6313 m_count = _mm_cvtsi32_si128(i4_shift); 6314 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 6315 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 6316 6317 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 6318 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 6319 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 6320 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 6321 6322 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 6323 6324 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 6325 pi2_dst_scratch += out_stride; 6326 6327 } 6328 6329 m_coeff1 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[120][0]); 6330 m_coeff2 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[121][0]); 6331 m_coeff3 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[122][0]); 6332 m_coeff4 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[123][0]); 6333 m_coeff5 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[124][0]); 6334 m_coeff6 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[125][0]); 6335 m_coeff7 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[126][0]); 6336 m_coeff8 = _mm_load_si128((__m128i *)&g_ai2_ihevc_trans_32_intr_odd_packed[127][0]); 6337 6338 /* o15[0-3] */ 6339 { 6340 m_temp_reg_20 = 
_mm_madd_epi16(m_temp_reg_10, m_coeff1); 6341 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 6342 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 6343 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 6344 6345 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 6346 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 6347 6348 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 6349 6350 m_temp_reg_40 = _mm_madd_epi16(m_temp_reg_14, m_coeff5); 6351 m_temp_reg_41 = _mm_madd_epi16(m_temp_reg_15, m_coeff6); 6352 m_temp_reg_42 = _mm_madd_epi16(m_temp_reg_16, m_coeff7); 6353 m_temp_reg_43 = _mm_madd_epi16(m_temp_reg_17, m_coeff8); 6354 6355 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_41); 6356 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_43); 6357 6358 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_42); 6359 6360 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_40); 6361 6362 m_temp_reg_31 = _mm_add_epi32(temp2, m_temp_reg_20); 6363 m_temp_reg_30 = _mm_sub_epi32(temp2, m_temp_reg_20); 6364 6365 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 6366 m_count = _mm_cvtsi32_si128(i4_shift); 6367 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 6368 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 6369 6370 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 6371 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 6372 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 6373 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 6374 6375 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 6376 6377 _mm_store_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 6378 pi2_dst_scratch += 8; 6379 } 6380 6381 } 6382 6383 } 6384 6385 /* Transpose */ 6386 { 6387 6388 WORD16 *pi2_src_scratch = temp_ptr; 6389 WORD32 out_stride = dst_strd; 6390 WORD32 in_stride = 8; 6391 6392 m_temp_reg_30 = _mm_load_si128((__m128i 
*)pi2_src_scratch); 6393 pi2_src_scratch += in_stride; 6394 m_temp_reg_31 = _mm_load_si128((__m128i *)pi2_src_scratch); 6395 pi2_src_scratch += in_stride; 6396 m_temp_reg_32 = _mm_load_si128((__m128i *)pi2_src_scratch); 6397 pi2_src_scratch += in_stride; 6398 m_temp_reg_33 = _mm_load_si128((__m128i *)pi2_src_scratch); 6399 pi2_src_scratch += in_stride; 6400 m_temp_reg_34 = _mm_load_si128((__m128i *)pi2_src_scratch); 6401 pi2_src_scratch += in_stride; 6402 m_temp_reg_35 = _mm_load_si128((__m128i *)pi2_src_scratch); 6403 pi2_src_scratch += in_stride; 6404 m_temp_reg_36 = _mm_load_si128((__m128i *)pi2_src_scratch); 6405 pi2_src_scratch += in_stride; 6406 m_temp_reg_37 = _mm_load_si128((__m128i *)pi2_src_scratch); 6407 pi2_src_scratch += 8; 6408 6409 m_temp_reg_70 = _mm_load_si128((__m128i *)pi2_src_scratch); 6410 pi2_src_scratch += in_stride; 6411 m_temp_reg_71 = _mm_load_si128((__m128i *)pi2_src_scratch); 6412 pi2_src_scratch += in_stride; 6413 m_temp_reg_72 = _mm_load_si128((__m128i *)pi2_src_scratch); 6414 pi2_src_scratch += in_stride; 6415 m_temp_reg_73 = _mm_load_si128((__m128i *)pi2_src_scratch); 6416 pi2_src_scratch += in_stride; 6417 m_temp_reg_74 = _mm_load_si128((__m128i *)pi2_src_scratch); 6418 pi2_src_scratch += in_stride; 6419 m_temp_reg_75 = _mm_load_si128((__m128i *)pi2_src_scratch); 6420 pi2_src_scratch += in_stride; 6421 m_temp_reg_76 = _mm_load_si128((__m128i *)pi2_src_scratch); 6422 pi2_src_scratch += in_stride; 6423 m_temp_reg_77 = _mm_load_si128((__m128i *)pi2_src_scratch); 6424 pi2_src_scratch += 8; 6425 6426 6427 m_temp_reg_40 = _mm_unpacklo_epi16(m_temp_reg_30, m_temp_reg_31); 6428 m_temp_reg_41 = _mm_unpackhi_epi16(m_temp_reg_31, m_temp_reg_30); 6429 6430 m_temp_reg_42 = _mm_unpacklo_epi16(m_temp_reg_32, m_temp_reg_33); 6431 m_temp_reg_43 = _mm_unpackhi_epi16(m_temp_reg_33, m_temp_reg_32); 6432 6433 m_temp_reg_44 = _mm_unpacklo_epi16(m_temp_reg_34, m_temp_reg_35); 6434 m_temp_reg_45 = _mm_unpackhi_epi16(m_temp_reg_35, m_temp_reg_34); 6435 6436 
m_temp_reg_46 = _mm_unpacklo_epi16(m_temp_reg_36, m_temp_reg_37); 6437 m_temp_reg_47 = _mm_unpackhi_epi16(m_temp_reg_37, m_temp_reg_36); 6438 6439 m_temp_reg_80 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); 6440 m_temp_reg_81 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_70); 6441 6442 m_temp_reg_82 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); 6443 m_temp_reg_83 = _mm_unpackhi_epi16(m_temp_reg_73, m_temp_reg_72); 6444 6445 m_temp_reg_84 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); 6446 m_temp_reg_85 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_74); 6447 6448 m_temp_reg_86 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); 6449 m_temp_reg_87 = _mm_unpackhi_epi16(m_temp_reg_77, m_temp_reg_76); 6450 6451 6452 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_40, m_temp_reg_42); 6453 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_40, m_temp_reg_42); 6454 6455 m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_44, m_temp_reg_46); 6456 m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_44, m_temp_reg_46); 6457 6458 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_80, m_temp_reg_82); 6459 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_80, m_temp_reg_82); 6460 6461 m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_84, m_temp_reg_86); 6462 m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_84, m_temp_reg_86); 6463 6464 m_temp_reg_90 = _mm_unpacklo_epi32(m_temp_reg_43, m_temp_reg_41); 6465 m_temp_reg_91 = _mm_unpackhi_epi32(m_temp_reg_43, m_temp_reg_41); 6466 6467 m_temp_reg_92 = _mm_unpacklo_epi32(m_temp_reg_47, m_temp_reg_45); 6468 m_temp_reg_93 = _mm_unpackhi_epi32(m_temp_reg_47, m_temp_reg_45); 6469 6470 m_temp_reg_94 = _mm_unpacklo_epi32(m_temp_reg_83, m_temp_reg_81); 6471 m_temp_reg_95 = _mm_unpackhi_epi32(m_temp_reg_83, m_temp_reg_81); 6472 6473 m_temp_reg_96 = _mm_unpacklo_epi32(m_temp_reg_87, m_temp_reg_85); 6474 m_temp_reg_97 = _mm_unpackhi_epi32(m_temp_reg_87, m_temp_reg_85); 6475 6476 6477 m_temp_reg_30 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_2); // row0 = 0-7 6478 
m_temp_reg_31 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_2); // row1 = 0-7 6479 6480 m_temp_reg_32 = _mm_unpacklo_epi64(m_temp_reg_92, m_temp_reg_90); // row0=24-31 6481 m_temp_reg_33 = _mm_unpackhi_epi64(m_temp_reg_92, m_temp_reg_90); // row1=24-31 6482 6483 m_temp_reg_34 = _mm_unpacklo_epi64(m_temp_reg_4, m_temp_reg_6); // row0=8-15 6484 m_temp_reg_35 = _mm_unpackhi_epi64(m_temp_reg_4, m_temp_reg_6); // row1=8-15 6485 6486 m_temp_reg_36 = _mm_unpacklo_epi64(m_temp_reg_96, m_temp_reg_94); // row0=16-23 6487 m_temp_reg_37 = _mm_unpackhi_epi64(m_temp_reg_96, m_temp_reg_94); // row1=16-23 6488 6489 m_temp_reg_80 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_3); // row2 =0-7 6490 m_temp_reg_81 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_3); // row3 =0-7 6491 6492 m_temp_reg_82 = _mm_unpacklo_epi64(m_temp_reg_93, m_temp_reg_91); // row2=24-31 6493 m_temp_reg_83 = _mm_unpackhi_epi64(m_temp_reg_93, m_temp_reg_91); // row3=24-31 6494 6495 m_temp_reg_84 = _mm_unpacklo_epi64(m_temp_reg_5, m_temp_reg_7); // row2=8-15 6496 m_temp_reg_85 = _mm_unpackhi_epi64(m_temp_reg_5, m_temp_reg_7); // row3=8-15 6497 6498 m_temp_reg_86 = _mm_unpacklo_epi64(m_temp_reg_97, m_temp_reg_95); // row2=16-23 6499 m_temp_reg_87 = _mm_unpackhi_epi64(m_temp_reg_97, m_temp_reg_95); // row3=16-23 6500 6501 m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred); 6502 6503 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg); 6504 6505 m_temp_reg_40 = _mm_add_epi16(m_temp_reg_30, m_temp_reg_0); 6506 m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8); 6507 6508 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg); 6509 6510 m_temp_reg_44 = _mm_add_epi16(m_temp_reg_34, m_temp_reg_0); 6511 m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44); 6512 6513 _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20); 6514 6515 m_temp_reg_20 = _mm_loadu_si128((__m128i *)(pu1_pred + 16)); 6516 6517 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg); 6518 6519 m_temp_reg_40 = 
_mm_add_epi16(m_temp_reg_36, m_temp_reg_0); 6520 m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8); 6521 6522 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg); 6523 6524 m_temp_reg_44 = _mm_add_epi16(m_temp_reg_32, m_temp_reg_0); 6525 m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44); 6526 6527 _mm_storeu_si128((__m128i *)(pu1_dst + 16), m_temp_reg_20); 6528 pu1_dst += out_stride; 6529 pu1_pred += pred_strd; 6530 6531 6532 m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred); 6533 6534 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg); 6535 6536 m_temp_reg_40 = _mm_add_epi16(m_temp_reg_31, m_temp_reg_0); 6537 m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8); 6538 6539 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg); 6540 6541 m_temp_reg_44 = _mm_add_epi16(m_temp_reg_35, m_temp_reg_0); 6542 m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44); 6543 6544 _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20); 6545 6546 m_temp_reg_20 = _mm_loadu_si128((__m128i *)(pu1_pred + 16)); 6547 6548 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg); 6549 6550 m_temp_reg_40 = _mm_add_epi16(m_temp_reg_37, m_temp_reg_0); 6551 m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8); 6552 6553 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg); 6554 6555 m_temp_reg_44 = _mm_add_epi16(m_temp_reg_33, m_temp_reg_0); 6556 m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44); 6557 6558 _mm_storeu_si128((__m128i *)(pu1_dst + 16), m_temp_reg_20); 6559 pu1_dst += out_stride; 6560 pu1_pred += pred_strd; 6561 6562 m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred); 6563 6564 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg); 6565 6566 m_temp_reg_40 = _mm_add_epi16(m_temp_reg_80, m_temp_reg_0); 6567 m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8); 6568 6569 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg); 6570 6571 m_temp_reg_44 = _mm_add_epi16(m_temp_reg_84, m_temp_reg_0); 6572 
m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44); 6573 6574 _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20); 6575 6576 m_temp_reg_20 = _mm_loadu_si128((__m128i *)(pu1_pred + 16)); 6577 6578 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg); 6579 6580 m_temp_reg_40 = _mm_add_epi16(m_temp_reg_86, m_temp_reg_0); 6581 m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8); 6582 6583 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg); 6584 6585 m_temp_reg_44 = _mm_add_epi16(m_temp_reg_82, m_temp_reg_0); 6586 m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44); 6587 6588 _mm_storeu_si128((__m128i *)(pu1_dst + 16), m_temp_reg_20); 6589 pu1_dst += out_stride; 6590 pu1_pred += pred_strd; 6591 6592 6593 m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred); 6594 6595 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg); 6596 6597 m_temp_reg_40 = _mm_add_epi16(m_temp_reg_81, m_temp_reg_0); 6598 m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8); 6599 6600 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg); 6601 6602 m_temp_reg_44 = _mm_add_epi16(m_temp_reg_85, m_temp_reg_0); 6603 m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44); 6604 6605 _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20); 6606 6607 m_temp_reg_20 = _mm_loadu_si128((__m128i *)(pu1_pred + 16)); 6608 6609 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, all_zero_reg); 6610 6611 m_temp_reg_40 = _mm_add_epi16(m_temp_reg_87, m_temp_reg_0); 6612 m_temp_reg_0 = _mm_srli_si128(m_temp_reg_20, 8); 6613 6614 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, all_zero_reg); 6615 6616 m_temp_reg_44 = _mm_add_epi16(m_temp_reg_83, m_temp_reg_0); 6617 m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44); 6618 6619 _mm_storeu_si128((__m128i *)(pu1_dst + 16), m_temp_reg_20); 6620 pu1_dst += out_stride; 6621 pu1_pred += pred_strd; 6622 6623 } 6624 pi2_tmp += 4; 6625 } 6626 } 6627 6628 6629