1 /****************************************************************************** 2 * 3 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore 4 * 5 * Licensed under the Apache License, Version 2.0 (the "License"); 6 * you may not use this file except in compliance with the License. 7 * You may obtain a copy of the License at: 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 * 17 ******************************************************************************/ 18 /** 19 ******************************************************************************* 20 * @file 21 * ihevc_16x16_itrans_recon_x86_intr.c 22 * 23 * @brief 24 * Contains function definitions for inverse 25 * transform and reconstruction for 16x16. 
26 * 27 * @author 28 * 100470 29 * 100592 (edited by) 30 * 31 * @par List of Functions: 32 * - ihevc_itrans_recon_16x16_sse42() 33 * 34 * @remarks 35 * None 36 * 37 ******************************************************************************* 38 */ 39 #include <stdio.h> 40 #include <string.h> 41 #include "ihevc_typedefs.h" 42 #include "ihevc_macros.h" 43 #include "ihevc_platform_macros.h" 44 #include "ihevc_defs.h" 45 #include "ihevc_trans_tables.h" 46 #include "ihevc_itrans_recon.h" 47 #include "ihevc_func_selector.h" 48 #include "ihevc_trans_macros.h" 49 50 #include <immintrin.h> 51 #include <emmintrin.h> 52 #include <smmintrin.h> 53 #include <tmmintrin.h> 54 55 /** 56 ******************************************************************************* 57 * 58 * @brief 59 * This function performs inverse transform and 60 * reconstruction for 16x16 input block 61 * 62 * @par Description: 63 * Performs inverse transform, adds the 64 * prediction data and clips output to 8 bit 65 * 66 * @param[in] pi2_src 67 * Input 16x16 coefficients 68 * 69 * @param[in] pi2_tmp 70 * Temporary 16x16 buffer for storing inverse 71 * transform 1st stage output 72 * 73 * @param[in] pu1_pred 74 * Prediction 16x16 block 75 * 76 * @param[out] pu1_dst 77 * Output 16x16 block 78 * 79 * @param[in] src_strd 80 * Input stride 81 * 82 * @param[in] pred_strd 83 * Prediction stride 84 * 85 * @param[in] dst_strd 86 * Output Stride 87 * 88 * @param[in] zero_cols 89 * Zero columns in pi2_src (bit mask; bits 8-15 all set 90 * means the last 8 columns are zero) 91 * 92 * @param[in] zero_rows 93 * Zero rows in pi2_src (bit mask; bits 8-15 all set means 94 * the last 8 rows are zero, bits 4-15 the last 12 rows) 95 * 96 * @returns Void 97 * 98 * @remarks 99 * This function takes no dequantization parameters. 100 * The first stage processes only the first 8 columns 101 * when the last 8 columns are flagged zero, and takes 102 * shorter paths when the last 8 or the last 12 rows 103 * are flagged zero. 104 * 105 ******************************************************************************* 106 */ 107 108 void ihevc_itrans_recon_16x16_sse42(WORD16 *pi2_src, 109 WORD16 *pi2_tmp, 110 UWORD8 *pu1_pred, 111 UWORD8 *pu1_dst, 112 WORD32
src_strd, 113 WORD32 pred_strd, 114 WORD32 dst_strd, 115 WORD32 zero_cols, 116 WORD32 zero_rows) 117 { 118 __m128i m_temp_reg_0; 119 __m128i m_temp_reg_1; 120 __m128i m_temp_reg_10; 121 __m128i m_temp_reg_11; 122 __m128i m_temp_reg_12; 123 __m128i m_temp_reg_13; 124 __m128i m_temp_reg_14; 125 __m128i m_temp_reg_20; 126 __m128i m_temp_reg_21; 127 __m128i m_temp_reg_22; 128 __m128i m_temp_reg_23; 129 __m128i m_temp_reg_24; 130 __m128i m_temp_reg_25; 131 __m128i m_temp_reg_26; 132 __m128i m_temp_reg_27; 133 __m128i m_temp_reg_30; 134 __m128i m_temp_reg_31; 135 __m128i m_temp_reg_32; 136 __m128i m_temp_reg_33; 137 __m128i m_temp_reg_34; 138 __m128i m_temp_reg_35; 139 __m128i m_temp_reg_36; 140 __m128i m_temp_reg_37; 141 __m128i m_temp_reg_40; 142 __m128i m_temp_reg_41; 143 __m128i m_temp_reg_42; 144 __m128i m_temp_reg_43; 145 __m128i m_temp_reg_44; 146 __m128i m_temp_reg_45; 147 __m128i m_temp_reg_46; 148 __m128i m_temp_reg_47; 149 150 __m128i m_temp_reg_70; 151 __m128i m_temp_reg_71; 152 __m128i m_temp_reg_72; 153 __m128i m_temp_reg_73; 154 __m128i m_temp_reg_74; 155 __m128i m_temp_reg_75; 156 __m128i m_temp_reg_76; 157 __m128i m_temp_reg_77; 158 __m128i m_rdng_factor; 159 __m128i m_count; 160 __m128i m_coeff1, m_coeff2, m_coeff3, m_coeff4; 161 __m128i m_coeff5, m_coeff6, m_coeff7, m_coeff8; 162 163 WORD32 i; 164 165 WORD32 zero_last8_cols_stg1; 166 WORD32 zero_last8_rows_stg1; 167 WORD32 zero_last12_rows_stg1; 168 WORD32 zero_last12_rows_stg2; 169 WORD32 zero_last8_rows_stg2; 170 171 WORD32 loop = 0; 172 173 WORD32 i4_shift = IT_SHIFT_STAGE_1; 174 WORD32 trans_size = TRANS_SIZE_16; 175 176 /* Following 3 instructions replicates the value in the */ 177 /* lower 16 bits of m_add_iq in the entire register */ 178 179 /* Last 8 cols of 16x16 block are skipped based on the below flag : Lokesh */ 180 181 zero_last8_cols_stg1 = ((zero_cols & 0xFF00) == 0xFF00) ? 1 : 0; 182 zero_last8_rows_stg1 = ((zero_rows & 0xFF00) == 0xFF00) ? 
1 : 0; 183 zero_last12_rows_stg1 = ((zero_rows & 0xFFF0) == 0xFFF0) ? 1 : 0; 184 185 zero_last12_rows_stg2 = ((zero_cols & 0xFFF0) == 0xFFF0) ? 1 : 0; 186 zero_last8_rows_stg2 = zero_last8_cols_stg1; 187 188 if(zero_last8_cols_stg1) 189 { 190 loop = 1; 191 } 192 else 193 loop = 2; 194 195 /* i = 0 => lower 8 samples */ 196 /* i = 1 => higher 8 samples */ 197 for(i = 0; i < loop; i++) 198 { 199 { 200 WORD32 sample_half_index = i << 3; 201 WORD16 *pi2_tmp_src = pi2_src + sample_half_index; 202 WORD16 *pi2_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp; 203 204 m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_tmp_src); 205 pi2_tmp_src += (src_strd << 1); 206 m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_tmp_src); 207 pi2_tmp_src += (src_strd << 1); 208 m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_tmp_src); 209 pi2_tmp_src += (src_strd << 1); 210 m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_tmp_src); 211 pi2_tmp_src += (src_strd << 1); 212 m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_tmp_src); 213 pi2_tmp_src += (src_strd << 1); 214 m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_tmp_src); 215 pi2_tmp_src += (src_strd << 1); 216 m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_tmp_src); 217 pi2_tmp_src += (src_strd << 1); 218 m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_tmp_src); 219 pi2_tmp_src += (src_strd << 1); 220 221 222 223 224 /* If last 12 rows are zero : Rishab */ 225 if(zero_last12_rows_stg1) 226 { 227 228 /* eee */ 229 /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */ 230 /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */ 231 { 232 /* Loading coeff and src for use in next block */ 233 234 m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_77, m_temp_reg_70); //to get sign 235 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0 236 237 m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6); 238 239 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77); 240 241 m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6); 242 243 
m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75 244 245 m_temp_reg_26 = m_temp_reg_24; 246 m_temp_reg_27 = m_temp_reg_25; 247 } 248 249 /* eo */ 250 251 /* eo0[0-3] */ 252 { 253 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); 254 m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73); 255 256 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 257 258 /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */ 259 260 /* e[0][0-3] stored in pi2_tmp[0][0-7] */ 261 /* e[7][0-3] stored in pi2_tmp[0][8-15] */ 262 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30); 263 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30); 264 265 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 266 pi2_scratch += 8; 267 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 268 pi2_scratch += 8; 269 270 } 271 272 273 /* eo0[4-7] */ 274 { 275 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1); 276 277 /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */ 278 279 /* e[0][4-7] stored in pi2_tmp[1][0-7] */ 280 /* e[7][4-7] stored in pi2_tmp[1][8-15] */ 281 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31); 282 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31); 283 284 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 285 pi2_scratch += 8; 286 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 287 pi2_scratch += 8; 288 289 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18 290 } 291 292 /* eo1[0-3] */ 293 { 294 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 295 296 /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */ 297 298 /* e[1][0-3] stored in pi2_tmp[2][0-7] */ 299 /* e[6][0-3] stored in pi2_tmp[2][8-15] */ 300 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30); 301 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30); 302 303 _mm_storeu_si128((__m128i *)pi2_scratch, 
m_temp_reg_34); 304 pi2_scratch += 8; 305 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 306 pi2_scratch += 8; 307 } 308 309 /* eo1[4-7] */ 310 { 311 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3); 312 313 /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */ 314 315 /* e[1][4-7] stored in pi2_tmp[3][0-7] */ 316 /* e[6][4-7] stored in pi2_tmp[3][8-15] */ 317 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31); 318 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31); 319 320 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 321 pi2_scratch += 8; 322 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 323 pi2_scratch += 8; 324 325 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89 326 327 } 328 329 /* eo2[0-3] */ 330 { 331 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 332 333 /* e[2][0-3] stored in pi2_tmp[4][0-7] */ 334 /* e[5][0-3] stored in pi2_tmp[4][8-15] */ 335 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30); 336 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30); 337 338 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 339 pi2_scratch += 8; 340 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 341 pi2_scratch += 8; 342 343 } 344 345 /* eo2[4-7] */ 346 { 347 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1); 348 349 /* e[2][4-7] stored in pi2_tmp[5][0-7] */ 350 /* e[5][4-7] stored in pi2_tmp[5][8-15] */ 351 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31); 352 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31); 353 354 355 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 356 pi2_scratch += 8; 357 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 358 pi2_scratch += 8; 359 360 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50 361 } 362 363 /* eo3[0-3] */ 364 { 365 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 366 367 /* e[3][0-3] stored in 
pi2_tmp[6][0-7] */ 368 /* e[4][0-3] stored in pi2_tmp[6][8-15] */ 369 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30); 370 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30); 371 372 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 373 pi2_scratch += 8; 374 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 375 pi2_scratch += 8; 376 } 377 378 /* eo3[4-7] */ 379 { 380 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3); 381 382 /* e[3][4-7] stored in pi2_tmp[7][0-7] */ 383 /* e[4][4-7] stored in pi2_tmp[7][8-15] */ 384 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31); 385 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31); 386 387 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 388 pi2_scratch += 8; 389 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 390 pi2_scratch += 8; 391 } 392 } 393 /* If last 8 rows are zero : Rishab */ 394 else if(zero_last8_rows_stg1) 395 { 396 /* eeo */ 397 /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */ 398 /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */ 399 { 400 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83 36 401 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83 402 403 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSB's 404 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSB's 405 406 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); 407 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); 408 409 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); 410 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2); 411 412 } 413 414 /* eee */ 415 /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */ 416 /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */ 417 { 418 /* Loading coeff and src for use in next block */ 419 m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_77, m_temp_reg_70); 
//to get signs 420 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0 421 422 m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6); 423 424 //m_temp_reg_70 = _mm_srli_si128(m_temp_reg_70, 8); 425 426 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77); 427 428 m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6); 429 430 m_temp_reg_26 = m_temp_reg_24; 431 m_temp_reg_27 = m_temp_reg_25; 432 433 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75 434 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18 435 } 436 437 /* eo0[0-3] */ 438 { 439 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); 440 m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73); 441 442 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 443 444 /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */ 445 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20); 446 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20); 447 448 /* e[0][0-3] stored in pi2_tmp[0][0-7] */ 449 /* e[7][0-3] stored in pi2_tmp[0][8-15] */ 450 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); 451 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); 452 453 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 454 pi2_scratch += 8; 455 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 456 pi2_scratch += 8; 457 458 } 459 460 /* eo0[4-7] */ 461 { 462 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1); 463 464 /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */ 465 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21); 466 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21); 467 468 /* e[0][4-7] stored in pi2_tmp[1][0-7] */ 469 /* e[7][4-7] stored in pi2_tmp[1][8-15] */ 470 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31); 471 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31); 472 473 _mm_storeu_si128((__m128i 
*)pi2_scratch, m_temp_reg_34); 474 pi2_scratch += 8; 475 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 476 pi2_scratch += 8; 477 478 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18 479 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50 480 481 } 482 483 /* eo1[0-3] */ 484 { 485 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 486 487 /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */ 488 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22); 489 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22); 490 491 /* e[1][0-3] stored in pi2_tmp[2][0-7] */ 492 /* e[6][0-3] stored in pi2_tmp[2][8-15] */ 493 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30); 494 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30); 495 496 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 497 pi2_scratch += 8; 498 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 499 pi2_scratch += 8; 500 501 } 502 503 /* eo1[4-7] */ 504 { 505 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3); 506 507 /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */ 508 m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23); 509 m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23); 510 511 /* e[1][4-7] stored in pi2_tmp[3][0-7] */ 512 /* e[6][4-7] stored in pi2_tmp[3][8-15] */ 513 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31); 514 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31); 515 516 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 517 pi2_scratch += 8; 518 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 519 pi2_scratch += 8; 520 521 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89 522 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[10][0]); //18 75 523 524 } 525 526 /* eo2[0-3] */ 527 { 528 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 529 530 
/* e[2][0-3] stored in pi2_tmp[4][0-7] */ 531 /* e[5][0-3] stored in pi2_tmp[4][8-15] */ 532 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30); 533 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30); 534 535 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 536 pi2_scratch += 8; 537 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 538 pi2_scratch += 8; 539 540 } 541 542 /* eo2[4-7] */ 543 { 544 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1); 545 546 /* e[2][4-7] stored in pi2_tmp[5][0-7] */ 547 /* e[5][4-7] stored in pi2_tmp[5][8-15] */ 548 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31); 549 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31); 550 551 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 552 pi2_scratch += 8; 553 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 554 pi2_scratch += 8; 555 556 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50 557 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89 558 559 } 560 561 /* eo3[0-3] */ 562 { 563 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 564 565 /* e[3][0-3] stored in pi2_tmp[6][0-7] */ 566 /* e[4][0-3] stored in pi2_tmp[6][8-15] */ 567 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30); 568 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30); 569 570 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 571 pi2_scratch += 8; 572 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 573 pi2_scratch += 8; 574 } 575 576 /* eo3[4-7] */ 577 { 578 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3); 579 580 /* e[3][4-7] stored in pi2_tmp[7][0-7] */ 581 /* e[4][4-7] stored in pi2_tmp[7][8-15] */ 582 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31); 583 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31); 584 585 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 586 pi2_scratch += 8; 587 _mm_storeu_si128((__m128i 
*)pi2_scratch, m_temp_reg_35); 588 pi2_scratch += 8; 589 } 590 } /* If all the rows are non-zero : Rishab */ 591 else 592 { 593 /* eeo */ 594 /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */ 595 /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */ 596 597 { 598 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83 36 599 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83 600 601 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSB's 602 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSB's 603 604 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); 605 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); 606 607 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); 608 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2); 609 } 610 611 /* eee */ 612 /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */ 613 /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */ 614 { 615 /* Loading coeff and src for use in next block */ 616 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[0][0]); //64 64 617 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[1][0]); //64 -64 618 619 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved LSB's 620 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved MSB's 621 622 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_0, m_coeff3); 623 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_0, m_coeff4); 624 625 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); 626 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_1, m_coeff4); 627 628 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75 629 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18 630 631 } 632 /* eo0[0-3] */ 633 { 634 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, 
m_temp_reg_73); 635 m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73); 636 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77); 637 m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_77); 638 639 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 640 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2); 641 642 643 /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */ 644 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20); 645 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20); 646 647 /* e[0][0-3] stored in pi2_tmp[0][0-7] */ 648 /* e[7][0-3] stored in pi2_tmp[0][8-15] */ 649 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); 650 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); 651 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32); 652 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32); 653 654 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 655 pi2_scratch += 8; 656 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 657 pi2_scratch += 8; 658 659 660 } 661 662 /* eo0[4-7] */ 663 { 664 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1); 665 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2); 666 667 /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */ 668 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21); 669 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21); 670 671 /* e[0][4-7] stored in pi2_tmp[1][0-7] */ 672 /* e[7][4-7] stored in pi2_tmp[1][8-15] */ 673 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31); 674 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31); 675 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33); 676 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33); 677 678 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 679 pi2_scratch += 8; 680 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 681 pi2_scratch += 8; 682 683 
m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18 684 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50 685 686 } 687 688 /* eo1[0-3] */ 689 { 690 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 691 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4); 692 693 /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */ 694 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22); 695 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22); 696 697 /* e[1][0-3] stored in pi2_tmp[2][0-7] */ 698 /* e[6][0-3] stored in pi2_tmp[2][8-15] */ 699 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30); 700 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30); 701 m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_32); 702 m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_32); 703 704 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 705 pi2_scratch += 8; 706 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 707 pi2_scratch += 8; 708 709 } 710 711 /* eo1[4-7] */ 712 { 713 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3); 714 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 715 716 /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */ 717 m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23); 718 m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23); 719 720 /* e[1][4-7] stored in pi2_tmp[3][0-7] */ 721 /* e[6][4-7] stored in pi2_tmp[3][8-15] */ 722 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31); 723 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31); 724 m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_33); 725 m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_33); 726 727 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 728 pi2_scratch += 8; 729 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 730 pi2_scratch += 8; 731 m_coeff1 = _mm_loadu_si128((__m128i 
*)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89 732 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[10][0]); //18 75 733 } 734 735 /* eo2[0-3] */ 736 { 737 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 738 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2); 739 740 /* e[2][0-3] stored in pi2_tmp[4][0-7] */ 741 /* e[5][0-3] stored in pi2_tmp[4][8-15] */ 742 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30); 743 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30); 744 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32); 745 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32); 746 747 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 748 pi2_scratch += 8; 749 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 750 pi2_scratch += 8; 751 } 752 753 /* eo2[4-7] */ 754 { 755 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1); 756 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2); 757 758 /* e[2][4-7] stored in pi2_tmp[5][0-7] */ 759 /* e[5][4-7] stored in pi2_tmp[5][8-15] */ 760 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31); 761 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31); 762 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33); 763 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33); 764 765 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 766 pi2_scratch += 8; 767 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 768 pi2_scratch += 8; 769 770 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50 771 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89 772 773 } 774 775 /* eo3[0-3] */ 776 { 777 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 778 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4); 779 780 /* e[3][0-3] stored in pi2_tmp[6][0-7] */ 781 /* e[4][0-3] stored in pi2_tmp[6][8-15] */ 782 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30); 
783 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30); 784 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32); 785 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32); 786 787 788 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 789 pi2_scratch += 8; 790 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 791 pi2_scratch += 8; 792 } 793 794 /* eo3[4-7] */ 795 { 796 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3); 797 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 798 799 /* e[3][4-7] stored in pi2_tmp[7][0-7] */ 800 /* e[4][4-7] stored in pi2_tmp[7][8-15] */ 801 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31); 802 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31); 803 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33); 804 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33); 805 806 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 807 pi2_scratch += 8; 808 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 809 pi2_scratch += 8; 810 } 811 812 } 813 } 814 815 { 816 WORD32 sample_half_index = i << 3; 817 WORD16 *pi2_tmp_src = pi2_src + sample_half_index + src_strd; 818 819 m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_tmp_src); 820 pi2_tmp_src += (src_strd << 1); 821 m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_tmp_src); 822 pi2_tmp_src += (src_strd << 1); 823 m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_tmp_src); 824 pi2_tmp_src += (src_strd << 1); 825 m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_tmp_src); 826 pi2_tmp_src += (src_strd << 1); 827 m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_tmp_src); 828 pi2_tmp_src += (src_strd << 1); 829 m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_tmp_src); 830 pi2_tmp_src += (src_strd << 1); 831 m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_tmp_src); 832 pi2_tmp_src += (src_strd << 1); 833 m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_tmp_src); 834 pi2_tmp_src += (src_strd << 1); 835 } 836 837 /* o & stage 1 
out */ 838 { 839 WORD32 j; 840 WORD16 *pi2_src_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp; 841 WORD16 *pi2_dst_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp; 842 WORD32 out_stride = (trans_size << 1); 843 WORD32 in_stride = trans_size << 1; 844 845 if(zero_last12_rows_stg1) 846 { 847 for(j = 0; j < 2; j++) 848 { 849 if(j) //H8B= higher 8 bytes L8B lower 8 bytes 850 { 851 m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B 852 } 853 else 854 { 855 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B 856 } 857 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87 858 859 860 /* o0[0-3] */ 861 { 862 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 863 864 865 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 866 pi2_src_scratch += in_stride; 867 868 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57 869 870 871 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 872 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 873 874 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 875 m_count = _mm_cvtsi32_si128(i4_shift); 876 m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00); 877 878 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 879 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 880 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 881 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 882 883 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 884 885 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 886 pi2_dst_scratch += out_stride; 887 } 888 889 /* o1[0-3] */ 890 { 891 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 892 893 894 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 895 pi2_src_scratch += in_stride; 896 897 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9 898 
899 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 900 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 901 902 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 903 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 904 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 905 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 906 907 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 908 909 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 910 pi2_dst_scratch += out_stride; 911 } 912 913 /* o2[0-3] */ 914 { 915 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 916 917 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 918 pi2_src_scratch += in_stride; 919 920 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43 921 922 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 923 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 924 925 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 926 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 927 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 928 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 929 930 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 931 932 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 933 pi2_dst_scratch += out_stride; 934 } 935 936 /* o3[0-3] */ 937 { 938 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 939 940 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 941 pi2_src_scratch += 8; 942 943 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80 944 945 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 946 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 947 948 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 949 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 950 m_temp_reg_31 = 
_mm_sra_epi32(m_temp_reg_31, m_count); 951 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 952 953 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 954 955 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 956 pi2_dst_scratch += 8; 957 } 958 959 /* o4[0-3] */ 960 { 961 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 962 963 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 964 pi2_src_scratch -= in_stride; 965 966 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90 967 968 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 969 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 970 971 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 972 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 973 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 974 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 975 976 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 977 978 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 979 pi2_dst_scratch -= out_stride; 980 } 981 982 /* o5[0-3] */ 983 { 984 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 985 986 987 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 988 pi2_src_scratch -= in_stride; 989 990 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70 991 992 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 993 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 994 995 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 996 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 997 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 998 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 999 1000 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 1001 1002 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 1003 pi2_dst_scratch -= out_stride; 1004 } 1005 1006 /* o6[0-3] 
*/ 1007 { 1008 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 1009 1010 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 1011 pi2_src_scratch -= in_stride; 1012 1013 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25 1014 1015 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 1016 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 1017 1018 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 1019 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 1020 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 1021 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 1022 1023 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 1024 1025 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 1026 pi2_dst_scratch -= out_stride; 1027 } 1028 1029 /* o7[0-3] */ 1030 { 1031 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 1032 1033 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 1034 pi2_src_scratch += 8; 1035 1036 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 1037 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 1038 1039 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 1040 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 1041 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 1042 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 1043 1044 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 1045 1046 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 1047 pi2_dst_scratch += 8; 1048 } 1049 } 1050 } 1051 else if(zero_last8_rows_stg1) 1052 { 1053 for(j = 0; j < 2; j++) 1054 { 1055 if(j) 1056 { 1057 m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B 1058 m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B 1059 } 1060 else 1061 { 1062 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, 
m_temp_reg_71); //row 1 and row 3 L8B 1063 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B 1064 } 1065 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87 1066 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70 1067 1068 /* o0[0-3] */ 1069 { 1070 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 1071 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 1072 1073 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 1074 pi2_src_scratch += in_stride; 1075 1076 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57 1077 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43 1078 1079 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 1080 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 1081 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 1082 1083 1084 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 1085 m_count = _mm_cvtsi32_si128(i4_shift); 1086 1087 m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00); 1088 1089 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 1090 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 1091 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 1092 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 1093 1094 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 1095 1096 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 1097 pi2_dst_scratch += out_stride; 1098 } 1099 1100 /* o1[0-3] */ 1101 { 1102 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 1103 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); 1104 1105 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 1106 pi2_src_scratch += in_stride; 1107 1108 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9 1109 m_coeff2 = _mm_loadu_si128((__m128i 
*)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87 1110 1111 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25); 1112 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 1113 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 1114 1115 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 1116 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 1117 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 1118 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 1119 1120 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 1121 1122 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 1123 pi2_dst_scratch += out_stride; 1124 } 1125 1126 /* o2[0-3] */ 1127 { 1128 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 1129 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 1130 1131 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 1132 pi2_src_scratch += in_stride; 1133 1134 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43 1135 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9 1136 1137 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21); 1138 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 1139 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 1140 1141 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 1142 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 1143 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 1144 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 1145 1146 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 1147 1148 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 1149 pi2_dst_scratch += out_stride; 1150 } 1151 1152 /* o3[0-3] */ 1153 { 1154 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 1155 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); 1156 1157 m_temp_reg_30 = _mm_loadu_si128((__m128i 
*)pi2_src_scratch); 1158 pi2_src_scratch += 8; 1159 1160 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80 1161 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90 1162 1163 m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25); 1164 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 1165 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 1166 1167 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 1168 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 1169 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 1170 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 1171 1172 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 1173 1174 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 1175 pi2_dst_scratch += 8; 1176 } 1177 1178 /* o4[0-3] */ 1179 { 1180 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 1181 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 1182 1183 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 1184 pi2_src_scratch -= in_stride; 1185 1186 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90 1187 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25 1188 1189 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21); 1190 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 1191 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 1192 1193 1194 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 1195 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 1196 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 1197 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 1198 1199 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 1200 1201 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 1202 pi2_dst_scratch -= out_stride; 1203 } 1204 1205 /* o5[0-3] */ 1206 { 1207 
m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 1208 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); 1209 1210 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 1211 pi2_src_scratch -= in_stride; 1212 1213 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70 1214 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80 1215 1216 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25); 1217 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 1218 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 1219 1220 1221 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 1222 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 1223 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 1224 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 1225 1226 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 1227 1228 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 1229 pi2_dst_scratch -= out_stride; 1230 } 1231 1232 /* o6[0-3] */ 1233 { 1234 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 1235 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 1236 1237 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 1238 pi2_src_scratch -= in_stride; 1239 1240 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25 1241 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57 1242 1243 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 1244 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 1245 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 1246 1247 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 1248 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 1249 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 1250 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 1251 1252 m_temp_reg_30 = 
_mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 1253 1254 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 1255 pi2_dst_scratch -= out_stride; 1256 } 1257 1258 /* o7[0-3] */ 1259 { 1260 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 1261 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); 1262 1263 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 1264 pi2_src_scratch += 8; 1265 1266 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25); 1267 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 1268 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 1269 1270 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 1271 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 1272 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 1273 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 1274 1275 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 1276 1277 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 1278 pi2_dst_scratch += 8; 1279 } 1280 } 1281 1282 } 1283 else 1284 { 1285 1286 1287 1288 for(j = 0; j < 2; j++) 1289 { 1290 if(j) //H8B= higher 8 bytes L8B lower 8 bytes 1291 { 1292 m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B 1293 m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B 1294 m_temp_reg_12 = _mm_unpackhi_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 H8B 1295 m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 H8B 1296 } 1297 else 1298 { 1299 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B 1300 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B 1301 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 L8B 1302 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 L8B 1303 } 1304 m_coeff1 = 
_mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87 1305 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70 1306 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[2][0]); //57 43 1307 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[3][0]); //25 9 1308 1309 1310 /* o0[0-3] */ 1311 { 1312 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 1313 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 1314 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 1315 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 1316 1317 1318 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 1319 pi2_src_scratch += in_stride; 1320 1321 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57 1322 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43 1323 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[6][0]); //80 90 1324 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[7][0]); //70 25 1325 1326 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 1327 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 1328 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 1329 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 1330 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 1331 1332 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 1333 m_count = _mm_cvtsi32_si128(i4_shift); 1334 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 1335 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 1336 1337 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 1338 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 1339 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 1340 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 1341 1342 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 1343 1344 
_mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 1345 pi2_dst_scratch += out_stride; 1346 } 1347 1348 /* o1[0-3] */ 1349 { 1350 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 1351 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); 1352 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7); 1353 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8); 1354 1355 1356 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 1357 pi2_src_scratch += in_stride; 1358 1359 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9 1360 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87 1361 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[10][0]); //25 -57 1362 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[11][0]); //90 43 1363 1364 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25); 1365 m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27); 1366 m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26); 1367 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 1368 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 1369 1370 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 1371 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 1372 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 1373 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 1374 1375 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 1376 1377 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 1378 pi2_dst_scratch += out_stride; 1379 } 1380 1381 /* o2[0-3] */ 1382 { 1383 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 1384 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 1385 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 1386 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 1387 1388 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 1389 pi2_src_scratch += 
in_stride; 1390 1391 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43 1392 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9 1393 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[14][0]); //90 25 1394 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[15][0]); //80 57 1395 1396 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21); 1397 m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23); 1398 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22); 1399 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 1400 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 1401 1402 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 1403 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 1404 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 1405 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 1406 1407 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 1408 1409 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 1410 pi2_dst_scratch += out_stride; 1411 } 1412 1413 /* o3[0-3] */ 1414 { 1415 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 1416 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); 1417 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7); 1418 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8); 1419 1420 1421 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 1422 pi2_src_scratch += 8; 1423 1424 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80 1425 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90 1426 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[18][0]); //9 87 1427 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[19][0]); //43 70 1428 1429 m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25); 1430 m_temp_reg_26 = _mm_sub_epi32(m_temp_reg_26, 
m_temp_reg_27); 1431 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26); 1432 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 1433 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 1434 1435 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 1436 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 1437 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 1438 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 1439 1440 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 1441 1442 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 1443 pi2_dst_scratch += 8; 1444 } 1445 1446 /* o4[0-3] */ 1447 { 1448 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 1449 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 1450 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 1451 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 1452 1453 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 1454 pi2_src_scratch -= in_stride; 1455 1456 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90 1457 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25 1458 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[22][0]); //87 -70 1459 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[23][0]); //9 -80 1460 1461 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21); 1462 m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23); 1463 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22); 1464 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 1465 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 1466 1467 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 1468 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 1469 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 1470 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 1471 1472 
m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 1473 1474 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 1475 pi2_dst_scratch -= out_stride; 1476 } 1477 1478 /* o5[0-3] */ 1479 { 1480 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 1481 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); 1482 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7); 1483 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8); 1484 1485 1486 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 1487 pi2_src_scratch -= in_stride; 1488 1489 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70 1490 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80 1491 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[26][0]); //43 9 1492 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[27][0]); //57 -87 1493 1494 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25); 1495 m_temp_reg_26 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_27); 1496 m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26); 1497 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 1498 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 1499 1500 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 1501 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 1502 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 1503 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 1504 1505 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 1506 1507 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 1508 pi2_dst_scratch -= out_stride; 1509 } 1510 1511 /* o6[0-3] */ 1512 { 1513 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 1514 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 1515 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 1516 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 1517 1518 1519 
m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 1520 pi2_src_scratch -= in_stride; 1521 1522 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25 1523 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57 1524 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[30][0]); //70 -80 1525 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[31][0]); //87 -90 1526 1527 1528 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 1529 m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23); 1530 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 1531 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 1532 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 1533 1534 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 1535 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 1536 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 1537 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 1538 1539 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 1540 1541 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 1542 pi2_dst_scratch -= out_stride; 1543 } 1544 1545 /* o7[0-3] */ 1546 { 1547 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 1548 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); 1549 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7); 1550 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8); 1551 1552 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 1553 pi2_src_scratch += 8; 1554 1555 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25); 1556 m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27); 1557 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26); 1558 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 1559 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 1560 1561 1562 m_temp_reg_31 = 
_mm_add_epi32(m_temp_reg_31, m_rdng_factor); 1563 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 1564 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 1565 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 1566 1567 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 1568 1569 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 1570 pi2_dst_scratch += 8; 1571 } 1572 } 1573 } 1574 } 1575 1576 /* Transpose */ 1577 { 1578 WORD16 *pi2_src_scratch = (i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp; 1579 WORD16 *pi2_dst_scratch = ((i) ? (pi2_tmp + 8 * trans_size) : pi2_tmp); 1580 WORD32 out_stride = (trans_size << 1); 1581 WORD32 in_stride = (trans_size << 1); 1582 WORD32 j; 1583 1584 for(j = 0; j < 2; j++) 1585 { 1586 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //b, a 1587 pi2_src_scratch += in_stride; 1588 m_temp_reg_31 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //d, c 1589 pi2_src_scratch += in_stride; 1590 m_temp_reg_32 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //f, e 1591 pi2_src_scratch += in_stride; 1592 m_temp_reg_33 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //h, g 1593 pi2_src_scratch += 8; 1594 m_temp_reg_34 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //j, i 1595 pi2_src_scratch -= in_stride; 1596 m_temp_reg_35 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //l, k 1597 pi2_src_scratch -= in_stride; 1598 m_temp_reg_36 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //n, m 1599 pi2_src_scratch -= in_stride; 1600 m_temp_reg_37 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //p, o 1601 pi2_src_scratch += 8; 1602 1603 m_temp_reg_40 = _mm_unpacklo_epi16(m_temp_reg_30, m_temp_reg_31); //ca3ca2ca1ca0 1604 m_temp_reg_41 = _mm_unpackhi_epi16(m_temp_reg_31, m_temp_reg_30); //bd3bd2bd1bd0 1605 1606 m_temp_reg_42 = _mm_unpacklo_epi16(m_temp_reg_32, m_temp_reg_33); //ge3ge2ge1ge0 1607 m_temp_reg_43 = _mm_unpackhi_epi16(m_temp_reg_33, m_temp_reg_32); //fh3fh2fh1fh0 1608 1609 m_temp_reg_44 = 
_mm_unpacklo_epi16(m_temp_reg_34, m_temp_reg_35); //ki3ki2ki1ki0 1610 m_temp_reg_45 = _mm_unpackhi_epi16(m_temp_reg_35, m_temp_reg_34); //jl3jl2jl1jl0 1611 1612 m_temp_reg_46 = _mm_unpacklo_epi16(m_temp_reg_36, m_temp_reg_37); //om3om2om1om0 1613 m_temp_reg_47 = _mm_unpackhi_epi16(m_temp_reg_37, m_temp_reg_36); //np3np2np1np0 1614 1615 1616 m_temp_reg_30 = _mm_unpacklo_epi32(m_temp_reg_40, m_temp_reg_42); //ge1ca1ge0ca0 1617 m_temp_reg_31 = _mm_unpackhi_epi32(m_temp_reg_40, m_temp_reg_42); //ge3ca3ge2ca2 1618 1619 m_temp_reg_32 = _mm_unpacklo_epi32(m_temp_reg_44, m_temp_reg_46); //om1ki1om0ki0 1620 m_temp_reg_33 = _mm_unpackhi_epi32(m_temp_reg_44, m_temp_reg_46); //om3ki3om2ki2 1621 1622 m_temp_reg_34 = _mm_unpacklo_epi32(m_temp_reg_43, m_temp_reg_41); //bd1fh1bd0fh0 1623 m_temp_reg_35 = _mm_unpackhi_epi32(m_temp_reg_43, m_temp_reg_41); //bd3fh3bd2fh2 1624 1625 m_temp_reg_36 = _mm_unpacklo_epi32(m_temp_reg_47, m_temp_reg_45); //jl1np1jl0np0 1626 m_temp_reg_37 = _mm_unpackhi_epi32(m_temp_reg_47, m_temp_reg_45); //jl3np3jl2np2 1627 1628 1629 m_temp_reg_40 = _mm_unpacklo_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca0 1630 m_temp_reg_41 = _mm_unpackhi_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca1 1631 1632 m_temp_reg_42 = _mm_unpacklo_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca2 1633 m_temp_reg_43 = _mm_unpackhi_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca3 1634 1635 m_temp_reg_44 = _mm_unpacklo_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp0 1636 m_temp_reg_45 = _mm_unpackhi_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp1 1637 1638 m_temp_reg_46 = _mm_unpacklo_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp2 1639 m_temp_reg_47 = _mm_unpackhi_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp3 1640 1641 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40); 1642 pi2_dst_scratch += out_stride; 1643 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_44); 1644 pi2_dst_scratch += out_stride; 1645 _mm_storeu_si128((__m128i *)pi2_dst_scratch, 
m_temp_reg_41); 1646 pi2_dst_scratch += out_stride; 1647 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_45); 1648 pi2_dst_scratch += 8; 1649 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_42); 1650 pi2_dst_scratch -= out_stride; 1651 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_46); 1652 pi2_dst_scratch -= out_stride; 1653 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_43); 1654 pi2_dst_scratch -= out_stride; 1655 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_47); 1656 pi2_dst_scratch += 8; 1657 } 1658 } 1659 } 1660 1661 if(zero_last8_cols_stg1) 1662 { 1663 WORD16 *pi2_dst_scratch = (pi2_tmp + 8 * trans_size); 1664 WORD32 out_stride = (trans_size << 1); 1665 WORD32 j; 1666 1667 m_temp_reg_40 = _mm_setzero_si128(); 1668 for(j = 0; j < 2; j++) 1669 { 1670 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40); 1671 pi2_dst_scratch += out_stride; 1672 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40); 1673 pi2_dst_scratch += out_stride; 1674 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40); 1675 pi2_dst_scratch += out_stride; 1676 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40); 1677 pi2_dst_scratch += 8; 1678 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40); 1679 pi2_dst_scratch -= out_stride; 1680 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40); 1681 pi2_dst_scratch -= out_stride; 1682 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40); 1683 pi2_dst_scratch -= out_stride; 1684 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_40); 1685 pi2_dst_scratch += 8; 1686 } 1687 } 1688 1689 1690 1691 1692 /* Stage 2 */ 1693 for(i = 0; i < 2; i++) 1694 { 1695 //__m128i m_temp_reg_15,m_temp_reg_16; 1696 WORD16 *pi2_src_temp = (i) ? 
(pi2_tmp + 2 * trans_size) : (WORD16 *)(pi2_tmp); 1697 WORD32 stride = (trans_size); 1698 WORD16 temp_array[256]; 1699 1700 i4_shift = IT_SHIFT_STAGE_2; 1701 1702 if(zero_last12_rows_stg2) 1703 { 1704 /* eeo */ 1705 /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */ 1706 /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */ 1707 { 1708 m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //0 1709 1710 pi2_src_temp += (stride * 9); 1711 1712 if(!i) 1713 { 1714 pi2_src_temp += (stride * 6 + 8); 1715 } 1716 else 1717 { 1718 pi2_src_temp += (stride * 2 + 8); 1719 } 1720 1721 pi2_src_temp -= (stride * 9); 1722 1723 m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //2 1724 1725 m_temp_reg_20 = _mm_setzero_si128(); 1726 m_temp_reg_22 = _mm_setzero_si128(); 1727 1728 m_temp_reg_21 = _mm_setzero_si128(); 1729 m_temp_reg_23 = _mm_setzero_si128(); 1730 } 1731 1732 /* eee */ 1733 /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */ 1734 /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */ 1735 { 1736 /* Loading coeff and src for use in next block */ 1737 1738 /* Loading coeff and src for use in next block */ 1739 m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_20, m_temp_reg_70); 1740 1741 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0 1742 1743 m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6); 1744 1745 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77); 1746 m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6); 1747 1748 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75 1749 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18 1750 1751 m_temp_reg_26 = m_temp_reg_24; 1752 m_temp_reg_27 = m_temp_reg_25; 1753 /* */ 1754 1755 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_20); 1756 m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_20); 1757 } 1758 1759 /* eo */ 1760 { 1761 WORD16 *pi2_scratch = temp_array; 1762 WORD32 out_stride = 8; 1763 1764 
1765 /* eo0[0-3] */ 1766 { 1767 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 1768 1769 /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */ 1770 1771 /* e[0][0-3] stored in pu1_dst[0] */ 1772 /* e[7][0-3] stored in pu1_dst[1] */ 1773 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30); 1774 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30); 1775 1776 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 1777 pi2_scratch += out_stride; 1778 _mm_storeu_si128((__m128i *)(pi2_scratch), m_temp_reg_35); 1779 pi2_scratch += out_stride; 1780 } 1781 1782 /* eo0[4-7] */ 1783 { 1784 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 1785 1786 /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */ 1787 1788 /* e[0][4-7] stored in pu1_dst[2] */ 1789 /* e[7][4-7] stored in pu1_dst[3] */ 1790 1791 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31); 1792 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31); 1793 1794 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 1795 pi2_scratch += out_stride; 1796 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 1797 pi2_scratch += out_stride; 1798 1799 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18 1800 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50 1801 1802 } 1803 1804 /* eo1[0-3] */ 1805 { 1806 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 1807 1808 /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */ 1809 1810 /* e[1][0-3] stored in pu1_dst[4] */ 1811 /* e[6][0-3] stored in pu1_dst[5] */ 1812 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30); 1813 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30); 1814 1815 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 1816 pi2_scratch += out_stride; 1817 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 1818 pi2_scratch += out_stride; 1819 } 1820 1821 /* eo1[4-7] */ 1822 { 1823 
m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3); 1824 1825 /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */ 1826 1827 /* e[1][4-7] stored in pu1_dst[6]*/ 1828 /* e[6][4-7] stored in pu1_dst[7] */ 1829 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31); 1830 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31); 1831 1832 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 1833 pi2_scratch += out_stride; 1834 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 1835 pi2_scratch += out_stride; 1836 1837 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89 1838 1839 } 1840 1841 /* eo2[0-3] */ 1842 { 1843 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 1844 1845 /* e[2][0-3] stored in pu1_dst[8]*/ 1846 /* e[5][0-3] stored in pu1_dst[9] */ 1847 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_30); 1848 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_30); 1849 1850 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 1851 pi2_scratch += out_stride; 1852 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 1853 pi2_scratch += out_stride; 1854 } 1855 1856 /* eo2[4-7] */ 1857 { 1858 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 1859 1860 /* e[2][4-7] stored in pu1_dst[10]*/ 1861 /* e[5][4-7] stored in pu1_dst[11] */ 1862 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_31); 1863 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_31); 1864 1865 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 1866 pi2_scratch += out_stride; 1867 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 1868 pi2_scratch += out_stride; 1869 1870 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50 1871 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89 1872 1873 } 1874 1875 /* eo3[0-3] */ 1876 { 1877 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 1878 1879 /* e[3][0-3] stored in 
pu1_dst[12]*/ 1880 /* e[4][0-3] stored in pu1_dst[13] */ 1881 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_30); 1882 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_30); 1883 1884 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 1885 pi2_scratch += out_stride; 1886 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 1887 pi2_scratch += out_stride; 1888 } 1889 1890 /* eo3[4-7] */ 1891 { 1892 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3); 1893 1894 /* e[3][4-7] stored in pu1_dst[14]*/ 1895 /* e[4][4-7] stored in pu1_dst[15] */ 1896 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_31); 1897 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_31); 1898 1899 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 1900 pi2_scratch += out_stride; 1901 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 1902 pi2_scratch += out_stride; 1903 } 1904 1905 } 1906 } 1907 else if(zero_last8_rows_stg2) 1908 { 1909 /* eeo */ 1910 /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */ 1911 /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */ 1912 { 1913 1914 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai4_ihevc_trans_16_even[3][0]); //83 1915 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai4_ihevc_trans_16_even[4][0]); //36 1916 1917 m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //0 1918 pi2_src_temp += (stride); 1919 m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //4 1920 pi2_src_temp += (stride * 8); 1921 1922 if(!i) 1923 { 1924 pi2_src_temp += (stride * 6 + 8); 1925 } 1926 else 1927 { 1928 pi2_src_temp += (stride * 2 + 8); 1929 } 1930 1931 pi2_src_temp -= (stride * 8); 1932 m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //6 1933 pi2_src_temp -= (stride); 1934 m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //2 1935 1936 1937 m_temp_reg_76 = _mm_setzero_si128(); 1938 1939 1940 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83 36 1941 m_coeff2 = 
_mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83 1942 1943 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSB's 1944 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSB's 1945 1946 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); 1947 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); 1948 1949 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); 1950 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2); 1951 } 1952 1953 /* eee */ 1954 /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */ 1955 /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */ 1956 { 1957 /* Loading coeff and src for use in next block */ 1958 1959 1960 m_temp_reg_77 = _mm_cmpgt_epi16(m_temp_reg_76, m_temp_reg_70); 1961 1962 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_77); //row 0 1963 1964 m_temp_reg_24 = _mm_slli_epi32(m_temp_reg_0, 6); 1965 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_77); 1966 m_temp_reg_25 = _mm_slli_epi32(m_temp_reg_1, 6); 1967 1968 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75 1969 1970 m_temp_reg_26 = m_temp_reg_24; 1971 m_temp_reg_27 = m_temp_reg_25; 1972 1973 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); 1974 m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73); 1975 } 1976 1977 /* eo */ 1978 { 1979 WORD16 *pi2_scratch = temp_array; 1980 WORD32 out_stride = 8; 1981 1982 1983 /* eo0[0-3] */ 1984 { 1985 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 1986 1987 /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */ 1988 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20); 1989 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20); 1990 1991 /* e[0][0-3] stored in pu1_dst[0] */ 1992 /* e[7][0-3] stored in pu1_dst[1] */ 1993 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); 1994 m_temp_reg_35 = 
_mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); 1995 1996 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 1997 pi2_scratch += out_stride; 1998 _mm_storeu_si128((__m128i *)(pi2_scratch), m_temp_reg_35); 1999 pi2_scratch += out_stride; 2000 } 2001 2002 /* eo0[4-7] */ 2003 { 2004 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 2005 2006 /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */ 2007 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21); 2008 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21); 2009 2010 /* e[0][4-7] stored in pu1_dst[2] */ 2011 /* e[7][4-7] stored in pu1_dst[3] */ 2012 2013 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31); 2014 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31); 2015 2016 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 2017 pi2_scratch += out_stride; 2018 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 2019 pi2_scratch += out_stride; 2020 2021 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18 2022 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50 2023 2024 } 2025 2026 /* eo1[0-3] */ 2027 { 2028 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 2029 2030 /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */ 2031 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22); 2032 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22); 2033 2034 /* e[1][0-3] stored in pu1_dst[4] */ 2035 /* e[6][0-3] stored in pu1_dst[5] */ 2036 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30); 2037 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30); 2038 2039 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 2040 pi2_scratch += out_stride; 2041 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 2042 pi2_scratch += out_stride; 2043 } 2044 2045 /* eo1[4-7] */ 2046 { 2047 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3); 2048 2049 /* ee[1] 
and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */ 2050 m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23); 2051 m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23); 2052 2053 /* e[1][4-7] stored in pu1_dst[6]*/ 2054 /* e[6][4-7] stored in pu1_dst[7] */ 2055 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31); 2056 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31); 2057 2058 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 2059 pi2_scratch += out_stride; 2060 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 2061 pi2_scratch += out_stride; 2062 2063 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89 2064 2065 } 2066 2067 /* eo2[0-3] */ 2068 { 2069 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2070 2071 /* e[2][0-3] stored in pu1_dst[8]*/ 2072 /* e[5][0-3] stored in pu1_dst[9] */ 2073 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30); 2074 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30); 2075 2076 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 2077 pi2_scratch += out_stride; 2078 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 2079 pi2_scratch += out_stride; 2080 } 2081 2082 /* eo2[4-7] */ 2083 { 2084 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff1); 2085 2086 /* e[2][4-7] stored in pu1_dst[10]*/ 2087 /* e[5][4-7] stored in pu1_dst[11] */ 2088 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31); 2089 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31); 2090 2091 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 2092 pi2_scratch += out_stride; 2093 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 2094 pi2_scratch += out_stride; 2095 2096 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50 2097 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89 2098 2099 } 2100 2101 /* eo3[0-3] */ 2102 { 2103 m_temp_reg_30 = 
_mm_madd_epi16(m_temp_reg_10, m_coeff3); 2104 2105 /* e[3][0-3] stored in pu1_dst[12]*/ 2106 /* e[4][0-3] stored in pu1_dst[13] */ 2107 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30); 2108 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30); 2109 2110 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 2111 pi2_scratch += out_stride; 2112 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 2113 pi2_scratch += out_stride; 2114 } 2115 2116 /* eo3[4-7] */ 2117 { 2118 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_14, m_coeff3); 2119 2120 /* e[3][4-7] stored in pu1_dst[14]*/ 2121 /* e[4][4-7] stored in pu1_dst[15] */ 2122 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31); 2123 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31); 2124 2125 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 2126 pi2_scratch += out_stride; 2127 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 2128 pi2_scratch += out_stride; 2129 } 2130 } 2131 } 2132 2133 else 2134 { 2135 /* eeo */ 2136 /* eeo[0] stored in m_temp_reg_20 and m_temp_reg_21 */ 2137 /* eeo[1] stored in m_temp_reg_22 and m_temp_reg_23 */ 2138 { 2139 2140 2141 m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //0 2142 pi2_src_temp += (stride); 2143 m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //4 2144 pi2_src_temp += (stride * 7); 2145 m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_src_temp); //8 2146 pi2_src_temp += (stride); 2147 m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_src_temp); //12 2148 if(!i) 2149 { 2150 pi2_src_temp += (stride * 6 + 8); 2151 } 2152 else 2153 { 2154 pi2_src_temp += (stride * 2 + 8); 2155 } 2156 m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_src_temp); //14 2157 pi2_src_temp -= (stride); 2158 m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_src_temp); //10 2159 pi2_src_temp -= (stride * 7); 2160 m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //6 2161 pi2_src_temp -= (stride); 2162 m_temp_reg_71 = 
_mm_loadu_si128((__m128i *)pi2_src_temp); //2 2163 2164 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[6][0]); //83 36 2165 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[7][0]); //36 -83 2166 2167 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved LSB's 2168 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); //row 4 and row 12 interleaved MSB's 2169 2170 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); 2171 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); 2172 2173 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); 2174 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff2); 2175 2176 2177 } 2178 2179 /* eee */ 2180 /* eee[0] stored in m_temp_reg_24 and m_temp_reg_25 */ 2181 /* eee[1] stored in m_temp_reg_26 and m_temp_reg_27 */ 2182 { 2183 /* Loading coeff and src for use in next block */ 2184 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[0][0]); //64 64 2185 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[1][0]); //64 -64 2186 2187 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved LSB's 2188 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74); //row 0 and row 8 interleaved MSB's 2189 2190 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_0, m_coeff3); 2191 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_0, m_coeff4); 2192 2193 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); 2194 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_1, m_coeff4); 2195 2196 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[2][0]); //89 75 2197 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[4][0]); //50 18 2198 2199 } 2200 2201 /* eo */ 2202 { 2203 WORD16 *pi2_scratch = temp_array; 2204 WORD32 out_stride = 8; 2205 2206 2207 2208 /* eo0[0-3] */ 2209 { 2210 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); 2211 m_temp_reg_11 = 
_mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73); 2212 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77); 2213 m_temp_reg_13 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_77); 2214 2215 2216 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2217 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2); 2218 2219 2220 /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */ 2221 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_20); 2222 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_20); 2223 2224 2225 /* e[0][0-3] stored in pi2_tmp[0][0-7] */ 2226 /* e[7][0-3] stored in pi2_tmp[0][8-15] */ 2227 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); 2228 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); 2229 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32); 2230 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32); 2231 2232 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 2233 pi2_scratch += out_stride; 2234 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 2235 pi2_scratch += out_stride; 2236 2237 2238 } 2239 2240 /* eo0[4-7] */ 2241 { 2242 2243 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1); 2244 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2); 2245 2246 /* ee[0] and ee[3] stored in m_temp_reg_40-41 & m_temp_reg_46-47 */ 2247 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_25, m_temp_reg_21); 2248 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_25, m_temp_reg_21); 2249 2250 /* e[0][4-7] stored in pi2_tmp[1][0-7] */ 2251 /* e[7][4-7] stored in pi2_tmp[1][8-15] */ 2252 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31); 2253 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31); 2254 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33); 2255 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33); 2256 2257 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 2258 pi2_scratch += out_stride; 2259 _mm_storeu_si128((__m128i *)pi2_scratch, 
m_temp_reg_35); 2260 pi2_scratch += out_stride; 2261 2262 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[3][0]); //75 -18 2263 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[5][0]); //89 50 2264 2265 } 2266 2267 /* eo1[0-3] */ 2268 { 2269 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 2270 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4); 2271 2272 /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */ 2273 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_22); 2274 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_22); 2275 2276 /* e[1][0-3] stored in pi2_tmp[2][0-7] */ 2277 /* e[6][0-3] stored in pi2_tmp[2][8-15] */ 2278 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_30); 2279 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_30); 2280 m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_32); 2281 m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_32); 2282 2283 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 2284 pi2_scratch += out_stride; 2285 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 2286 pi2_scratch += out_stride; 2287 2288 } 2289 2290 /* eo1[4-7] */ 2291 { 2292 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3); 2293 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 2294 2295 /* ee[1] and ee[2] stored in m_temp_reg_4-43 & m_temp_reg_44-45 */ 2296 m_temp_reg_43 = _mm_add_epi32(m_temp_reg_27, m_temp_reg_23); 2297 m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_27, m_temp_reg_23); 2298 2299 /* e[1][4-7] stored in pi2_tmp[3][0-7] */ 2300 /* e[6][4-7] stored in pi2_tmp[3][8-15] */ 2301 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_31); 2302 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_31); 2303 m_temp_reg_34 = _mm_sub_epi32(m_temp_reg_34, m_temp_reg_33); 2304 m_temp_reg_35 = _mm_add_epi32(m_temp_reg_35, m_temp_reg_33); 2305 2306 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 2307 pi2_scratch += out_stride; 
2308 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 2309 pi2_scratch += out_stride; 2310 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[8][0]); //50 -89 2311 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[10][0]); //18 75 2312 } 2313 2314 /* eo2[0-3] */ 2315 { 2316 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2317 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff2); 2318 2319 /* e[2][0-3] stored in pi2_tmp[4][0-7] */ 2320 /* e[5][0-3] stored in pi2_tmp[4][8-15] */ 2321 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_30); 2322 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_30); 2323 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32); 2324 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32); 2325 2326 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 2327 pi2_scratch += out_stride; 2328 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 2329 pi2_scratch += out_stride; 2330 } 2331 2332 /* eo2[4-7] */ 2333 { 2334 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff1); 2335 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff2); 2336 2337 /* e[2][4-7] stored in pi2_tmp[5][0-7] */ 2338 /* e[5][4-7] stored in pi2_tmp[5][8-15] */ 2339 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_31); 2340 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_31); 2341 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33); 2342 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33); 2343 2344 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 2345 pi2_scratch += out_stride; 2346 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 2347 pi2_scratch += out_stride; 2348 2349 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[9][0]); //18 -50 2350 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_even[11][0]); //75 -89 2351 2352 } 2353 2354 /* eo3[0-3] */ 2355 { 2356 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_10, m_coeff3); 2357 
m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_12, m_coeff4); 2358 2359 /* e[3][0-3] stored in pi2_tmp[6][0-7] */ 2360 /* e[4][0-3] stored in pi2_tmp[6][8-15] */ 2361 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_30); 2362 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_30); 2363 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_32); 2364 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_32); 2365 2366 2367 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 2368 pi2_scratch += out_stride; 2369 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 2370 pi2_scratch += out_stride; 2371 } 2372 2373 /* eo3[4-7] */ 2374 { 2375 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_11, m_coeff3); 2376 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 2377 2378 /* e[3][4-7] stored in pi2_tmp[7][0-7] */ 2379 /* e[4][4-7] stored in pi2_tmp[7][8-15] */ 2380 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_31); 2381 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_31); 2382 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_33); 2383 m_temp_reg_35 = _mm_sub_epi32(m_temp_reg_35, m_temp_reg_33); 2384 2385 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_34); 2386 pi2_scratch += out_stride; 2387 _mm_storeu_si128((__m128i *)pi2_scratch, m_temp_reg_35); 2388 pi2_scratch += out_stride; 2389 } 2390 } 2391 } 2392 2393 if(zero_last12_rows_stg2) 2394 { 2395 /* o & stage 2 pre-transposed out */ 2396 { 2397 WORD32 j; 2398 WORD16 *pi2_src_scratch = temp_array; 2399 WORD16 *pi2_dst_scratch = (i) ? 
(pi2_tmp + 8) : (pi2_tmp); 2400 WORD32 out_stride = (trans_size); 2401 WORD32 in_stride = (8) * 4; 2402 2403 pi2_src_temp = pi2_tmp + (stride * 4) + i * (stride * 2); 2404 2405 m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //1 2406 2407 pi2_src_temp += (stride * 9); 2408 2409 if(0 == i) 2410 { 2411 pi2_src_temp -= (stride * 2 - 8); 2412 } 2413 else 2414 { 2415 pi2_src_temp -= (stride * 6 - 8); 2416 } 2417 pi2_src_temp -= (stride * 9); 2418 2419 m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //3 2420 2421 2422 for(j = 0; j < 2; j++) 2423 { 2424 if(j) 2425 { 2426 m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B 2427 } 2428 else 2429 { 2430 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B 2431 } 2432 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87 2433 2434 /* o0[0-3] */ 2435 { 2436 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2437 2438 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 2439 pi2_src_scratch += in_stride; 2440 2441 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57 2442 2443 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2444 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2445 2446 2447 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2448 m_count = _mm_cvtsi32_si128(i4_shift); 2449 m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00); 2450 2451 2452 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2453 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2454 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2455 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2456 2457 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2458 2459 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2460 pi2_dst_scratch += out_stride; 2461 } 2462 2463 /* o1[0-3] */ 2464 { 2465 m_temp_reg_24 = 
_mm_madd_epi16(m_temp_reg_10, m_coeff5); 2466 2467 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 2468 pi2_src_scratch += in_stride; 2469 2470 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9 2471 2472 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 2473 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 2474 2475 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2476 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2477 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2478 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2479 2480 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2481 2482 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2483 pi2_dst_scratch += ((!i) * out_stride + 8); 2484 } 2485 2486 /* o2[0-3] */ 2487 { 2488 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2489 2490 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 2491 pi2_src_scratch += in_stride; 2492 2493 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43 2494 2495 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2496 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2497 2498 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2499 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2500 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2501 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2502 2503 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2504 2505 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2506 pi2_dst_scratch += out_stride; 2507 } 2508 2509 /* o3[0-3] */ 2510 { 2511 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 2512 2513 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 2514 pi2_src_scratch += 8; 2515 2516 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80 2517 2518 
m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 2519 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 2520 2521 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2522 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2523 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2524 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2525 2526 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2527 2528 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2529 pi2_dst_scratch += (i * out_stride + 8); 2530 } 2531 2532 /* o4[0-3] */ 2533 { 2534 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2535 2536 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 2537 pi2_src_scratch -= in_stride; 2538 2539 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90 2540 2541 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2542 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2543 2544 2545 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2546 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2547 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2548 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2549 2550 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2551 2552 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2553 pi2_dst_scratch += out_stride; 2554 } 2555 2556 /* o5[0-3] */ 2557 { 2558 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 2559 2560 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 2561 pi2_src_scratch -= in_stride; 2562 2563 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70 2564 2565 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 2566 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 2567 2568 2569 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2570 m_temp_reg_30 = 
_mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2571 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2572 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2573 2574 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2575 2576 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2577 pi2_dst_scratch += ((!i) * out_stride + 8); 2578 } 2579 2580 /* o6[0-3] */ 2581 { 2582 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2583 2584 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 2585 pi2_src_scratch -= in_stride; 2586 2587 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25 2588 2589 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2590 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2591 2592 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2593 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2594 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2595 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2596 2597 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2598 2599 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2600 pi2_dst_scratch += out_stride; 2601 } 2602 2603 /* o7[0-3] */ 2604 { 2605 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 2606 2607 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 2608 pi2_src_scratch += 8; 2609 2610 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 2611 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 2612 2613 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2614 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2615 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2616 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2617 2618 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2619 2620 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2621 pi2_dst_scratch += (i * 
out_stride + 8); 2622 } 2623 2624 2625 } 2626 } 2627 } 2628 else if(zero_last8_rows_stg2) 2629 { 2630 /* o & stage 2 pre-transposed out */ 2631 { 2632 WORD32 j; 2633 WORD16 *pi2_src_scratch = temp_array; 2634 WORD16 *pi2_dst_scratch = (i) ? (pi2_tmp + 8) : (pi2_tmp); 2635 WORD32 out_stride = (trans_size); 2636 WORD32 in_stride = (8) * 4; 2637 2638 pi2_src_temp = pi2_tmp + (stride * 4) + i * (stride * 2); 2639 2640 2641 m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //1 2642 pi2_src_temp += (stride); 2643 m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //5 2644 pi2_src_temp += (stride * 8); 2645 2646 if(0 == i) 2647 { 2648 pi2_src_temp -= (stride * 2 - 8); 2649 } 2650 else 2651 { 2652 pi2_src_temp -= (stride * 6 - 8); 2653 } 2654 2655 pi2_src_temp -= (stride * 8); 2656 m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //7 2657 pi2_src_temp -= (stride); 2658 m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //3 2659 2660 2661 for(j = 0; j < 2; j++) 2662 { 2663 if(j) 2664 { 2665 m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B 2666 m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B 2667 } 2668 else 2669 { 2670 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B 2671 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B 2672 } 2673 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87 2674 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70 2675 2676 /* o0[0-3] */ 2677 { 2678 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2679 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 2680 2681 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 2682 pi2_src_scratch += in_stride; 2683 2684 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57 2685 m_coeff6 = _mm_loadu_si128((__m128i 
*)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43 2686 2687 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 2688 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2689 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2690 2691 2692 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2693 m_count = _mm_cvtsi32_si128(i4_shift); 2694 2695 m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00); 2696 2697 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2698 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2699 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2700 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2701 2702 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2703 2704 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2705 pi2_dst_scratch += out_stride; 2706 } 2707 2708 /* o1[0-3] */ 2709 { 2710 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 2711 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); 2712 2713 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 2714 pi2_src_scratch += in_stride; 2715 2716 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9 2717 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87 2718 2719 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25); 2720 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 2721 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 2722 2723 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2724 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2725 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2726 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2727 2728 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2729 2730 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2731 pi2_dst_scratch += ((!i) * out_stride + 8); 2732 } 2733 2734 /* o2[0-3] */ 2735 { 
2736 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2737 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 2738 2739 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 2740 pi2_src_scratch += in_stride; 2741 2742 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43 2743 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9 2744 2745 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21); 2746 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2747 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2748 2749 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2750 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2751 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2752 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2753 2754 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2755 2756 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2757 pi2_dst_scratch += out_stride; 2758 } 2759 2760 /* o3[0-3] */ 2761 { 2762 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 2763 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); 2764 2765 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 2766 pi2_src_scratch += 8; 2767 2768 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80 2769 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90 2770 2771 m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25); 2772 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 2773 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 2774 2775 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2776 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2777 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2778 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2779 2780 m_temp_reg_30 = 
_mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2781 2782 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2783 pi2_dst_scratch += (i * out_stride + 8); 2784 } 2785 2786 /* o4[0-3] */ 2787 { 2788 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2789 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 2790 2791 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 2792 pi2_src_scratch -= in_stride; 2793 2794 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90 2795 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25 2796 2797 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21); 2798 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2799 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2800 2801 2802 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2803 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2804 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2805 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2806 2807 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2808 2809 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2810 pi2_dst_scratch += out_stride; 2811 } 2812 2813 /* o5[0-3] */ 2814 { 2815 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 2816 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); 2817 2818 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 2819 pi2_src_scratch -= in_stride; 2820 2821 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70 2822 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80 2823 2824 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25); 2825 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 2826 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 2827 2828 2829 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2830 m_temp_reg_30 
= _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2831 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2832 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2833 2834 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2835 2836 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2837 pi2_dst_scratch += ((!i) * out_stride + 8); 2838 } 2839 2840 /* o6[0-3] */ 2841 { 2842 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2843 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 2844 2845 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 2846 pi2_src_scratch -= in_stride; 2847 2848 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25 2849 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57 2850 2851 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 2852 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2853 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2854 2855 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2856 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2857 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2858 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2859 2860 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2861 2862 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2863 pi2_dst_scratch += out_stride; 2864 } 2865 2866 /* o7[0-3] */ 2867 { 2868 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 2869 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); 2870 2871 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 2872 pi2_src_scratch += 8; 2873 2874 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25); 2875 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 2876 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 2877 2878 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2879 m_temp_reg_30 
= _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2880 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2881 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2882 2883 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2884 2885 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2886 pi2_dst_scratch += (i * out_stride + 8); 2887 } 2888 } 2889 } 2890 } 2891 else 2892 { 2893 /* o & stage 2 pre-transposed out */ 2894 { 2895 WORD32 j; 2896 WORD16 *pi2_src_scratch = temp_array; 2897 WORD16 *pi2_dst_scratch = (i) ? (pi2_tmp + 8) : (pi2_tmp); 2898 WORD32 out_stride = (trans_size); 2899 WORD32 in_stride = (8) * 4; 2900 2901 pi2_src_temp = pi2_tmp + (stride * 4) + i * (stride * 2); 2902 2903 2904 m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src_temp); //1 2905 pi2_src_temp += (stride); 2906 m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src_temp); //5 2907 pi2_src_temp += (stride * 7); 2908 m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_src_temp); //9 2909 pi2_src_temp += (stride); 2910 m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_src_temp); //13 2911 if(0 == i) 2912 { 2913 pi2_src_temp -= (stride * 2 - 8); 2914 } 2915 else 2916 { 2917 pi2_src_temp -= (stride * 6 - 8); 2918 } 2919 m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_src_temp); //15 2920 pi2_src_temp -= (stride); 2921 m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_src_temp); //11 2922 pi2_src_temp -= (stride * 7); 2923 m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src_temp); //7 2924 pi2_src_temp -= (stride); 2925 m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src_temp); //3 2926 2927 2928 for(j = 0; j < 2; j++) 2929 { 2930 2931 if(j) //H8B= higher 8 bytes L8B lower 8 bytes 2932 { 2933 m_temp_reg_10 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 H8B 2934 m_temp_reg_11 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 H8B 2935 m_temp_reg_12 = _mm_unpackhi_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 H8B 2936 m_temp_reg_13 = 
_mm_unpackhi_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 H8B 2937 } 2938 else 2939 { 2940 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_71); //row 1 and row 3 L8B 2941 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_73); //row 5 and row 7 L8B 2942 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_74, m_temp_reg_75); //row 9 and row 11 L8B 2943 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_76, m_temp_reg_77); //row 13 and row 15 L8B 2944 } 2945 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[0][0]); //90 87 2946 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[1][0]); //80 70 2947 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[2][0]); //57 43 2948 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[3][0]); //25 9 2949 2950 2951 /* o0[0-3] */ 2952 { 2953 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 2954 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 2955 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 2956 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 2957 2958 2959 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 2960 pi2_src_scratch += in_stride; 2961 2962 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[4][0]); //87 57 2963 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[5][0]); //9 -43 2964 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[6][0]); //80 90 2965 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[7][0]); //70 25 2966 2967 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 2968 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_23); 2969 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 2970 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 2971 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 2972 2973 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2974 m_count = 
_mm_cvtsi32_si128(i4_shift); 2975 m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x00); 2976 2977 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 2978 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 2979 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 2980 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 2981 2982 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 2983 2984 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 2985 pi2_dst_scratch += out_stride; 2986 } 2987 2988 /* o1[0-3] */ 2989 { 2990 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 2991 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); 2992 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7); 2993 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8); 2994 2995 2996 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 2997 pi2_src_scratch += in_stride; 2998 2999 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[8][0]); //80 9 3000 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[9][0]); //70 87 3001 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[10][0]); //25 -57 3002 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[11][0]); //90 43 3003 3004 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25); 3005 m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27); 3006 m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26); 3007 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 3008 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 3009 3010 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 3011 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 3012 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 3013 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 3014 3015 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 3016 3017 _mm_storeu_si128((__m128i *)pi2_dst_scratch, 
m_temp_reg_30); 3018 pi2_dst_scratch += ((!i) * out_stride + 8); 3019 } 3020 3021 /* o2[0-3] */ 3022 { 3023 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 3024 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 3025 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 3026 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 3027 3028 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 3029 pi2_src_scratch += in_stride; 3030 3031 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[12][0]); //70 -43 3032 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[13][0]); //87 -9 3033 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[14][0]); //90 25 3034 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[15][0]); //80 57 3035 3036 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21); 3037 m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23); 3038 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22); 3039 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 3040 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 3041 3042 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 3043 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 3044 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 3045 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 3046 3047 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 3048 3049 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 3050 pi2_dst_scratch += out_stride; 3051 } 3052 3053 /* o3[0-3] */ 3054 { 3055 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 3056 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); 3057 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7); 3058 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8); 3059 3060 3061 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 3062 pi2_src_scratch += 8; 3063 3064 m_coeff1 = 
_mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[16][0]); //57 -80 3065 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[17][0]); //25 -90 3066 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[18][0]); //9 87 3067 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[19][0]); //43 70 3068 3069 m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_25); 3070 m_temp_reg_26 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_27); 3071 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26); 3072 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 3073 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 3074 3075 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 3076 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 3077 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 3078 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 3079 3080 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 3081 3082 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 3083 pi2_dst_scratch += (i * out_stride + 8); 3084 } 3085 3086 /* o4[0-3] */ 3087 { 3088 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 3089 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 3090 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 3091 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 3092 3093 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 3094 pi2_src_scratch -= in_stride; 3095 3096 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[20][0]); //43 -90 3097 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[21][0]); //57 25 3098 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[22][0]); //87 -70 3099 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[23][0]); //9 -80 3100 3101 m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21); 3102 m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23); 3103 
m_temp_reg_20 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_22); 3104 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 3105 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 3106 3107 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 3108 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 3109 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 3110 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 3111 3112 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 3113 3114 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 3115 pi2_dst_scratch += out_stride; 3116 } 3117 3118 /* o5[0-3] */ 3119 { 3120 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 3121 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); 3122 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7); 3123 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8); 3124 3125 3126 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 3127 pi2_src_scratch -= in_stride; 3128 3129 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[24][0]); //25 -70 3130 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[25][0]); //90 -80 3131 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[26][0]); //43 9 3132 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[27][0]); //57 -87 3133 3134 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25); 3135 m_temp_reg_26 = _mm_sub_epi32(m_temp_reg_26, m_temp_reg_27); 3136 m_temp_reg_24 = _mm_sub_epi32(m_temp_reg_24, m_temp_reg_26); 3137 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 3138 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 3139 3140 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 3141 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 3142 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 3143 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 3144 3145 m_temp_reg_30 = 
_mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 3146 3147 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 3148 pi2_dst_scratch += ((!i) * out_stride + 8); 3149 } 3150 3151 /* o6[0-3] */ 3152 { 3153 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_10, m_coeff1); 3154 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_11, m_coeff2); 3155 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_12, m_coeff3); 3156 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_13, m_coeff4); 3157 3158 3159 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 3160 pi2_src_scratch -= in_stride; 3161 3162 m_coeff5 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[28][0]); //9 -25 3163 m_coeff6 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[29][0]); //43 -57 3164 m_coeff7 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[30][0]); //70 -80 3165 m_coeff8 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_16_odd[31][0]); //87 -90 3166 3167 3168 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 3169 m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_23); 3170 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_22); 3171 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_20); 3172 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_20); 3173 3174 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 3175 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 3176 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 3177 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 3178 3179 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 3180 3181 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 3182 pi2_dst_scratch += out_stride; 3183 } 3184 3185 /* o7[0-3] */ 3186 { 3187 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_10, m_coeff5); 3188 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_11, m_coeff6); 3189 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_12, m_coeff7); 3190 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_13, m_coeff8); 3191 3192 
m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); 3193 pi2_src_scratch += 8; 3194 3195 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_25); 3196 m_temp_reg_26 = _mm_add_epi32(m_temp_reg_26, m_temp_reg_27); 3197 m_temp_reg_24 = _mm_add_epi32(m_temp_reg_24, m_temp_reg_26); 3198 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_24); 3199 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_24); 3200 3201 3202 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 3203 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 3204 m_temp_reg_31 = _mm_sra_epi32(m_temp_reg_31, m_count); 3205 m_temp_reg_30 = _mm_sra_epi32(m_temp_reg_30, m_count); 3206 3207 m_temp_reg_30 = _mm_packs_epi32(m_temp_reg_30, m_temp_reg_31); 3208 3209 _mm_storeu_si128((__m128i *)pi2_dst_scratch, m_temp_reg_30); 3210 pi2_dst_scratch += (i * out_stride + 8); 3211 } 3212 3213 } 3214 } 3215 } 3216 } 3217 3218 /* Transpose */ 3219 { 3220 WORD16 *pi2_src_scratch; 3221 UWORD8 *pu1_pred_temp = pu1_pred; 3222 WORD32 out_stride = dst_strd; 3223 WORD32 in_stride = trans_size; 3224 WORD32 j; 3225 m_temp_reg_1 = _mm_setzero_si128(); 3226 for(i = 0; i < 2; i++) 3227 { 3228 pi2_src_scratch = (i) ? 
(pi2_tmp + 8) : pi2_tmp; 3229 3230 for(j = 0; j < 2; j++) 3231 { 3232 m_temp_reg_30 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //b, a 3233 pi2_src_scratch += in_stride; 3234 m_temp_reg_31 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //d, c 3235 pi2_src_scratch += ((!i) * in_stride + 8); 3236 m_temp_reg_32 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //f, e 3237 pi2_src_scratch += (in_stride); 3238 m_temp_reg_33 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //h, g 3239 pi2_src_scratch += (i * in_stride + 8); 3240 m_temp_reg_34 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //j, i 3241 pi2_src_scratch += in_stride; 3242 m_temp_reg_35 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //l, k 3243 pi2_src_scratch += ((!i) * in_stride + 8); 3244 m_temp_reg_36 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //n, m 3245 pi2_src_scratch += in_stride; 3246 m_temp_reg_37 = _mm_loadu_si128((__m128i *)pi2_src_scratch); //p, o 3247 pi2_src_scratch += (i * in_stride + 8); 3248 3249 m_temp_reg_40 = _mm_unpacklo_epi16(m_temp_reg_30, m_temp_reg_31); //ca3ca2ca1ca0 3250 m_temp_reg_41 = _mm_unpackhi_epi16(m_temp_reg_31, m_temp_reg_30); //bd3bd2bd1bd0 3251 3252 m_temp_reg_42 = _mm_unpacklo_epi16(m_temp_reg_32, m_temp_reg_33); //ge3ge2ge1ge0 3253 m_temp_reg_43 = _mm_unpackhi_epi16(m_temp_reg_33, m_temp_reg_32); //fh3fh2fh1fh0 3254 3255 m_temp_reg_44 = _mm_unpacklo_epi16(m_temp_reg_34, m_temp_reg_35); //ki3ki2ki1ki0 3256 m_temp_reg_45 = _mm_unpackhi_epi16(m_temp_reg_35, m_temp_reg_34); //jl3jl2jl1jl0 3257 3258 m_temp_reg_46 = _mm_unpacklo_epi16(m_temp_reg_36, m_temp_reg_37); //om3om2om1om0 3259 m_temp_reg_47 = _mm_unpackhi_epi16(m_temp_reg_37, m_temp_reg_36); //np3np2np1np0 3260 3261 3262 m_temp_reg_30 = _mm_unpacklo_epi32(m_temp_reg_40, m_temp_reg_42); //ge1ca1ge0ca0 3263 m_temp_reg_31 = _mm_unpackhi_epi32(m_temp_reg_40, m_temp_reg_42); //ge3ca3ge2ca2 3264 3265 m_temp_reg_32 = _mm_unpacklo_epi32(m_temp_reg_44, m_temp_reg_46); //om1ki1om0ki0 3266 m_temp_reg_33 = 
_mm_unpackhi_epi32(m_temp_reg_44, m_temp_reg_46); //om3ki3om2ki2 3267 3268 m_temp_reg_34 = _mm_unpacklo_epi32(m_temp_reg_43, m_temp_reg_41); //bd1fh1bd0fh0 3269 m_temp_reg_35 = _mm_unpackhi_epi32(m_temp_reg_43, m_temp_reg_41); //bd3fh3bd2fh2 3270 3271 m_temp_reg_36 = _mm_unpacklo_epi32(m_temp_reg_47, m_temp_reg_45); //jl1np1jl0np0 3272 m_temp_reg_37 = _mm_unpackhi_epi32(m_temp_reg_47, m_temp_reg_45); //jl3np3jl2np2 3273 3274 3275 m_temp_reg_40 = _mm_unpacklo_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca0 3276 m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred_temp); 3277 3278 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, m_temp_reg_1); 3279 m_temp_reg_12 = _mm_unpackhi_epi8(m_temp_reg_20, m_temp_reg_1); 3280 3281 m_temp_reg_44 = _mm_unpacklo_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp0 3282 m_temp_reg_40 = _mm_add_epi16(m_temp_reg_40, m_temp_reg_0); 3283 m_temp_reg_44 = _mm_add_epi16(m_temp_reg_44, m_temp_reg_12); 3284 3285 m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_40, m_temp_reg_44); 3286 _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20); 3287 pu1_dst += out_stride; 3288 pu1_pred_temp += pred_strd; 3289 3290 m_temp_reg_41 = _mm_unpackhi_epi64(m_temp_reg_30, m_temp_reg_32); //omkigeca1 3291 m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred_temp); 3292 3293 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, m_temp_reg_1); 3294 m_temp_reg_12 = _mm_unpackhi_epi8(m_temp_reg_20, m_temp_reg_1); 3295 3296 m_temp_reg_45 = _mm_unpackhi_epi64(m_temp_reg_36, m_temp_reg_34); //bdfhjlnp0 3297 m_temp_reg_41 = _mm_add_epi16(m_temp_reg_41, m_temp_reg_0); 3298 m_temp_reg_45 = _mm_add_epi16(m_temp_reg_45, m_temp_reg_12); 3299 3300 m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_41, m_temp_reg_45); 3301 _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20); 3302 pu1_dst += out_stride; 3303 pu1_pred_temp += pred_strd; 3304 3305 m_temp_reg_42 = _mm_unpacklo_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca2 3306 m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred_temp); 3307 3308 
m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, m_temp_reg_1); 3309 m_temp_reg_12 = _mm_unpackhi_epi8(m_temp_reg_20, m_temp_reg_1); 3310 3311 m_temp_reg_46 = _mm_unpacklo_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp0 3312 m_temp_reg_42 = _mm_add_epi16(m_temp_reg_42, m_temp_reg_0); 3313 m_temp_reg_46 = _mm_add_epi16(m_temp_reg_46, m_temp_reg_12); 3314 3315 m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_42, m_temp_reg_46); 3316 _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20); 3317 pu1_dst += out_stride; 3318 pu1_pred_temp += pred_strd; 3319 3320 m_temp_reg_43 = _mm_unpackhi_epi64(m_temp_reg_31, m_temp_reg_33); //omkigeca3 3321 m_temp_reg_20 = _mm_loadu_si128((__m128i *)pu1_pred_temp); 3322 3323 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_20, m_temp_reg_1); 3324 m_temp_reg_12 = _mm_unpackhi_epi8(m_temp_reg_20, m_temp_reg_1); 3325 3326 m_temp_reg_47 = _mm_unpackhi_epi64(m_temp_reg_37, m_temp_reg_35); //bdfhjlnp0 3327 m_temp_reg_43 = _mm_add_epi16(m_temp_reg_43, m_temp_reg_0); 3328 m_temp_reg_47 = _mm_add_epi16(m_temp_reg_47, m_temp_reg_12); 3329 3330 m_temp_reg_20 = _mm_packus_epi16(m_temp_reg_43, m_temp_reg_47); 3331 _mm_storeu_si128((__m128i *)pu1_dst, m_temp_reg_20); 3332 pu1_dst += out_stride; 3333 pu1_pred_temp += pred_strd; 3334 } 3335 } 3336 } 3337 } 3338