1 /****************************************************************************** 2 * 3 * Copyright (C) 2012 Ittiam Systems Pvt Ltd, Bangalore 4 * 5 * Licensed under the Apache License, Version 2.0 (the "License"); 6 * you may not use this file except in compliance with the License. 7 * You may obtain a copy of the License at: 8 * 9 * http://www.apache.org/licenses/LICENSE-2.0 10 * 11 * Unless required by applicable law or agreed to in writing, software 12 * distributed under the License is distributed on an "AS IS" BASIS, 13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 * See the License for the specific language governing permissions and 15 * limitations under the License. 16 * 17 ******************************************************************************/ 18 /** 19 ******************************************************************************* 20 * @file 21 * ihevc_itrans_recon_x86_intr.c 22 * 23 * @brief 24 * Contains function definitions for inverse quantization, inverse 25 * transform and reconstruction 26 * 27 * @author 28 * 100470 29 * 100592 (edited by) 30 * 31 * @par List of Functions: 32 * - ihevc_itrans_recon_4x4_ttype1_sse42() 33 * - ihevc_itrans_recon_4x4_sse42() 34 * - ihevc_itrans_recon_8x8_sse42() 35 * 36 * @remarks 37 * None 38 * 39 ******************************************************************************* 40 */ 41 #include <stdio.h> 42 #include <string.h> 43 #include "ihevc_typedefs.h" 44 #include "ihevc_macros.h" 45 #include "ihevc_platform_macros.h" 46 #include "ihevc_defs.h" 47 #include "ihevc_trans_tables.h" 48 #include "ihevc_iquant_itrans_recon.h" 49 #include "ihevc_func_selector.h" 50 #include "ihevc_trans_macros.h" 51 52 #include <immintrin.h> 53 #include <emmintrin.h> 54 #include <smmintrin.h> 55 #include <tmmintrin.h> 56 57 /** 58 ******************************************************************************* 59 * 60 * @brief 61 * This function performs inverse quantization, inverse transform 62 * type1(DST) and reconstruction for 4x4 input block 63 * 64 * @par Description: 65 * Performs inverse quantization , inverse transform type 1 and adds 66 * prediction data and clips output to 8 bit 67 * 68 * @param[in] pi2_src 69 * Input 4x4 coefficients 70 * 71 * @param[in] pi2_tmp 72 * Temporary 4x4 buffer for storing inverse 73 * transform 1st stage output 74 * 75 * @param[in] pu1_pred 76 * Prediction 4x4 block 77 * 78 * @param[in] pi2_dequant_coeff 79 * Dequant Coeffs 80 * 81 * @param[out] pu1_dst 82 * Output 4x4 block 83 * 84 * @param[in] qp_div 85 * Quantization parameter / 6 86 * 87 * @param[in] qp_rem 88 * Quantization parameter % 6 89 * 90 * @param[in] src_strd 91 * Input stride 92 * 93 * @param[in] pred_strd 94 * Prediction stride 95 * 96 * @param[in] dst_strd 97 * Output Stride 98 * 99 * @param[in] zero_cols 100 * Zero columns in pi2_src 101 * 102 * @returns Void 103 * 104 * @remarks 105 * None 106 * 107 ******************************************************************************* 108 */ 109 110 111 void ihevc_itrans_recon_4x4_ttype1_sse42(WORD16 *pi2_src, 112 WORD16 *pi2_tmp, 113 UWORD8 *pu1_pred, 114 UWORD8 *pu1_dst, 115 WORD32 src_strd, 116 WORD32 pred_strd, 117 WORD32 dst_strd, 118 WORD32 zero_cols, 119 WORD32 zero_rows) 120 { 121 __m128i m_temp_reg_0; 122 __m128i m_temp_reg_1; 123 __m128i m_temp_reg_2; 124 __m128i m_temp_reg_3; 125 __m128i m_temp_reg_4; 126 __m128i m_temp_reg_10; 127 __m128i m_temp_reg_11; 128 __m128i m_temp_reg_12; 129 __m128i m_temp_reg_13; 130 __m128i m_temp_reg_14; 131 __m128i m_temp_reg_20; 132 __m128i m_temp_reg_21; 133 __m128i m_temp_reg_22; 134 __m128i m_temp_reg_23; 135 __m128i m_temp_reg_24; 136 __m128i m_temp_reg_25; 137 __m128i m_temp_reg_30; 138 __m128i m_temp_reg_31; 139 __m128i m_temp_reg_32; 140 __m128i m_temp_reg_33; 141 __m128i m_temp_reg_34; 142 __m128i m_temp_reg_35; 143 __m128i m_temp_reg_36; 144 __m128i m_coeff1, m_coeff2, m_coeff3; 145 __m128i m_rdng_factor; 146 __m128i m_count; 147 148 WORD32 i4_shift = IT_SHIFT_STAGE_1; 149 UNUSED(zero_rows); 150 UNUSED(zero_cols); 151 UNUSED(pi2_tmp); 152 153 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai4_ihevc_trans_4_ttype1[2][0]); //74 154 155 m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pi2_src); 156 pi2_src += src_strd; 157 m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pi2_src); 158 pi2_src += src_strd; 159 m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pi2_src); 160 pi2_src += src_strd; 161 m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pi2_src); 162 163 m_temp_reg_0 = _mm_cvtepi16_epi32(m_temp_reg_0); 164 m_temp_reg_2 = _mm_cvtepi16_epi32(m_temp_reg_2); 165 166 m_temp_reg_1 = _mm_cvtepi16_epi32(m_temp_reg_1); 167 m_temp_reg_3 = _mm_cvtepi16_epi32(m_temp_reg_3); 168 169 /* c[4] in m_temp_reg_14 */ 170 /* c[4] = src[0] - src[2] + src[3] */ 171 { 172 m_temp_reg_14 = _mm_sub_epi32(m_temp_reg_0, m_temp_reg_2); 173 } 174 175 /* c[3] in m_temp_reg_13 */ 176 { 177 m_temp_reg_13 = _mm_mullo_epi32(m_temp_reg_1, m_coeff3); 178 } 179 180 /* c[0] in m_temp_reg_10 */ 181 { 182 m_temp_reg_10 = _mm_add_epi32(m_temp_reg_0, m_temp_reg_2); 183 } 184 185 /* c[1] in m_temp_reg_11 */ 186 { 187 m_temp_reg_11 = _mm_add_epi32(m_temp_reg_2, m_temp_reg_3); 188 } 189 190 /* c[2] in m_temp_reg_12 */ 191 { 192 m_temp_reg_12 = _mm_sub_epi32(m_temp_reg_0, m_temp_reg_3); 193 } 194 195 /* c[4] in m_temp_reg_14 */ 196 /* c[4] = src[0] - src[2] + src[3] */ 197 { 198 m_temp_reg_14 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_3); 199 } 200 201 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai4_ihevc_trans_4_ttype1[1][0]); //29 202 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai4_ihevc_trans_4_ttype1[0][0]); //55 203 204 /* Stage 1 outputs stored in m_temp_reg_20-23 */ 205 { 206 m_temp_reg_30 = _mm_mullo_epi32(m_temp_reg_10, m_coeff1); //29*c0 207 m_temp_reg_31 = _mm_mullo_epi32(m_temp_reg_11, m_coeff2); //55*c1 208 209 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 210 211 m_temp_reg_32 = _mm_mullo_epi32(m_temp_reg_11, m_coeff1); //29*c1 212 m_temp_reg_33 = _mm_mullo_epi32(m_temp_reg_12, m_coeff2); //55*c2 213 214 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 215 216 m_temp_reg_34 = _mm_mullo_epi32(m_temp_reg_10, m_coeff2); //55*c0 217 m_temp_reg_35 = _mm_mullo_epi32(m_temp_reg_12, m_coeff1); //29*c2 218 m_temp_reg_36 = _mm_mullo_epi32(m_temp_reg_14, m_coeff3); //74*c4 219 220 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 221 m_count = _mm_cvtsi32_si128(i4_shift); 222 223 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 224 m_temp_reg_4 = _mm_add_epi32(m_rdng_factor, m_temp_reg_13); 225 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_4); 226 227 m_temp_reg_21 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32); 228 m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_4); 229 230 m_temp_reg_23 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_35); 231 m_temp_reg_4 = _mm_sub_epi32(m_rdng_factor, m_temp_reg_13); 232 m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_4); 233 234 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_36, m_rdng_factor); 235 236 m_temp_reg_20 = _mm_sra_epi32(m_temp_reg_20, m_count); 237 m_temp_reg_21 = _mm_sra_epi32(m_temp_reg_21, m_count); 238 m_temp_reg_23 = _mm_sra_epi32(m_temp_reg_23, m_count); 239 m_temp_reg_22 = _mm_sra_epi32(m_temp_reg_22, m_count); 240 241 m_temp_reg_20 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21); 242 m_temp_reg_21 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23); 243 m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8); 244 m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8); 245 246 m_temp_reg_24 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_22); 247 m_temp_reg_25 = _mm_unpacklo_epi16(m_temp_reg_21, m_temp_reg_23); 248 249 m_temp_reg_20 = _mm_unpacklo_epi32(m_temp_reg_24, m_temp_reg_25); 250 m_temp_reg_21 = _mm_unpackhi_epi32(m_temp_reg_24, m_temp_reg_25); 251 252 } 253 254 /* Stage 2 */ 255 { 256 i4_shift = IT_SHIFT_STAGE_2; 257 258 m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8); 259 m_temp_reg_20 = _mm_cvtepi16_epi32(m_temp_reg_20); 260 m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8); 261 m_temp_reg_21 = _mm_cvtepi16_epi32(m_temp_reg_21); 262 m_temp_reg_22 = _mm_cvtepi16_epi32(m_temp_reg_22); 263 m_temp_reg_23 = _mm_cvtepi16_epi32(m_temp_reg_23); 264 265 /* c[4] stored in m_temp_reg_4 */ 266 { 267 m_temp_reg_4 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_21); 268 } 269 270 /* c[3] stored in m_temp_reg_3 */ 271 { 272 m_temp_reg_3 = _mm_mullo_epi32(m_temp_reg_22, m_coeff3); 273 } 274 275 /* c[0] stored in m_temp_reg_0 */ 276 { 277 m_temp_reg_0 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_21); 278 } 279 280 /* c[1] stored in m_temp_reg_1 */ 281 { 282 m_temp_reg_1 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_21); 283 } 284 285 /* c[2] stored in m_temp_reg_2 */ 286 { 287 m_temp_reg_2 = _mm_sub_epi32(m_temp_reg_20, m_temp_reg_23); 288 } 289 290 /* c[4] stored in m_temp_reg_4 */ 291 { 292 m_temp_reg_4 = _mm_add_epi32(m_temp_reg_4, m_temp_reg_23); 293 } 294 295 /* Stage 2 output generation */ 296 { 297 m_temp_reg_30 = _mm_mullo_epi32(m_temp_reg_0, m_coeff1); //29*c0 298 m_temp_reg_31 = _mm_mullo_epi32(m_temp_reg_1, m_coeff2); //55*c1 299 300 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 301 302 m_temp_reg_32 = _mm_mullo_epi32(m_temp_reg_1, m_coeff1); //29*c1 303 m_temp_reg_33 = _mm_mullo_epi32(m_temp_reg_2, m_coeff2); //55*c2 304 305 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 306 307 m_temp_reg_34 = _mm_mullo_epi32(m_temp_reg_0, m_coeff2); //55*c0 308 m_temp_reg_35 = _mm_mullo_epi32(m_temp_reg_2, m_coeff1); //29*c2 309 m_temp_reg_36 = _mm_mullo_epi32(m_temp_reg_4, m_coeff3); //74*c4 310 311 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 312 m_count = _mm_cvtsi32_si128(i4_shift); 313 314 m_temp_reg_4 = _mm_add_epi32(m_rdng_factor, m_temp_reg_3); 315 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_31); 316 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_4); 317 318 m_temp_reg_21 = _mm_sub_epi32(m_temp_reg_33, m_temp_reg_32); 319 m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_4); 320 321 m_temp_reg_4 = _mm_sub_epi32(m_rdng_factor, m_temp_reg_3); 322 m_temp_reg_23 = _mm_add_epi32(m_temp_reg_34, m_temp_reg_35); 323 m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_4); 324 325 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_36, m_rdng_factor); 326 327 m_temp_reg_20 = _mm_sra_epi32(m_temp_reg_20, m_count); 328 m_temp_reg_21 = _mm_sra_epi32(m_temp_reg_21, m_count); 329 m_temp_reg_23 = _mm_sra_epi32(m_temp_reg_23, m_count); 330 m_temp_reg_22 = _mm_sra_epi32(m_temp_reg_22, m_count); 331 332 m_temp_reg_20 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21); 333 m_temp_reg_21 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23); 334 m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8); 335 m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8); 336 337 m_temp_reg_24 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_22); 338 m_temp_reg_25 = _mm_unpacklo_epi16(m_temp_reg_21, m_temp_reg_23); 339 340 m_temp_reg_20 = _mm_unpacklo_epi32(m_temp_reg_24, m_temp_reg_25); 341 m_temp_reg_21 = _mm_unpackhi_epi32(m_temp_reg_24, m_temp_reg_25); 342 } 343 344 /* Recon and store */ 345 { 346 WORD32 *pi4_dst = (WORD32 *)pu1_dst; 347 348 m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred); 349 pu1_pred += pred_strd; 350 m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred); 351 pu1_pred += pred_strd; 352 m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred); 353 pu1_pred += pred_strd; 354 m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred); 355 356 m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0); 357 m_temp_reg_1 = _mm_cvtepu8_epi16(m_temp_reg_1); 358 m_temp_reg_2 = _mm_cvtepu8_epi16(m_temp_reg_2); 359 m_temp_reg_3 = _mm_cvtepu8_epi16(m_temp_reg_3); 360 m_temp_reg_0 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_1); 361 m_temp_reg_1 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_3); 362 363 m_temp_reg_20 = _mm_add_epi16(m_temp_reg_20, m_temp_reg_0); 364 m_temp_reg_21 = _mm_add_epi16(m_temp_reg_21, m_temp_reg_1); 365 366 m_temp_reg_0 = _mm_packus_epi16(m_temp_reg_20, m_temp_reg_21); 367 368 *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_0); 369 m_temp_reg_1 = _mm_srli_si128(m_temp_reg_0, 4); 370 m_temp_reg_2 = _mm_srli_si128(m_temp_reg_0, 8); 371 m_temp_reg_3 = _mm_srli_si128(m_temp_reg_0, 12); 372 pu1_dst += dst_strd; 373 pi4_dst = (WORD32 *)(pu1_dst); 374 375 *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_1); 376 pu1_dst += dst_strd; 377 pi4_dst = (WORD32 *)(pu1_dst); 378 379 *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_2); 380 pu1_dst += dst_strd; 381 pi4_dst = (WORD32 *)(pu1_dst); 382 383 *pi4_dst = _mm_cvtsi128_si32(m_temp_reg_3); 384 } 385 } 386 } 387 388 /** 389 ******************************************************************************* 390 * 391 * @brief 392 * This function performs inverse quantization, inverse transform 393 * (DCT) and reconstruction for 4x4 input block 394 * 395 * @par Description: 396 * Performs inverse quantization , inverse transform and adds 397 * prediction data and clips output to 8 bit 398 * 399 * @param[in] pi2_src 400 * Input 4x4 coefficients 401 * 402 * @param[in] pi2_tmp 403 * Temporary 4x4 buffer for storing inverse 404 * transform 1st stage output 405 * 406 * @param[in] pu1_pred 407 * Prediction 4x4 block 408 * 409 * @param[in] pi2_dequant_coeff 410 * Dequant Coeffs 411 * 412 * @param[out] pu1_dst 413 * Output 4x4 block 414 * 415 * @param[in] qp_div 416 * Quantization parameter / 6 417 * 418 * @param[in] qp_rem 419 * Quantization parameter % 6 420 * 421 * @param[in] src_strd 422 * Input stride 423 * 424 * @param[in] pred_strd 425 * Prediction stride 426 * 427 * @param[in] dst_strd 428 * Output Stride 429 * 430 * @param[in] zero_cols 431 * Zero columns in pi2_src 432 * 433 * @returns Void 434 * 435 * @remarks 436 * None 437 * 438 ******************************************************************************* 439 */ 440 441 void ihevc_itrans_recon_4x4_sse42(WORD16 *pi2_src, 442 WORD16 *pi2_tmp, 443 UWORD8 *pu1_pred, 444 UWORD8 *pu1_dst, 445 WORD32 src_strd, 446 WORD32 pred_strd, 447 WORD32 dst_strd, 448 WORD32 zero_cols, 449 WORD32 zero_rows) 450 { 451 452 453 __m128i m_temp_reg_0; 454 __m128i m_temp_reg_1; 455 __m128i m_temp_reg_2; 456 __m128i m_temp_reg_3; 457 __m128i m_temp_reg_10; 458 __m128i m_temp_reg_11; 459 __m128i m_temp_reg_12; 460 __m128i m_temp_reg_13; 461 __m128i m_temp_reg_14; 462 __m128i m_temp_reg_15; 463 __m128i m_temp_reg_20; 464 __m128i m_temp_reg_21; 465 __m128i m_temp_reg_22; 466 __m128i m_temp_reg_23; 467 __m128i m_temp_reg_24; 468 __m128i m_temp_reg_25; 469 __m128i m_temp_reg_30; 470 __m128i m_temp_reg_31; 471 __m128i m_temp_reg_33; 472 __m128i m_temp_reg_34; 473 __m128i m_coeff1, m_coeff3; 474 __m128i m_rdng_factor; 475 __m128i m_count; 476 477 478 WORD32 i4_shift = IT_SHIFT_STAGE_1; 479 UNUSED(zero_rows); 480 UNUSED(zero_cols); 481 UNUSED(pi2_tmp); 482 483 484 m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pi2_src); 485 pi2_src += src_strd; 486 m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pi2_src); 487 pi2_src += src_strd; 488 m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pi2_src); 489 pi2_src += src_strd; 490 m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pi2_src); 491 492 m_temp_reg_0 = _mm_cvtepi16_epi32(m_temp_reg_0); 493 m_temp_reg_2 = _mm_cvtepi16_epi32(m_temp_reg_2); 494 495 m_temp_reg_1 = _mm_cvtepi16_epi32(m_temp_reg_1); 496 m_temp_reg_3 = _mm_cvtepi16_epi32(m_temp_reg_3); 497 498 499 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai4_ihevc_trans_4_ttype0[0][0]); //36 500 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai4_ihevc_trans_4_ttype0[2][0]); //83 501 502 /* e */ 503 { 504 m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_0, 6); 505 m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_2, 6); 506 } 507 508 /* o */ 509 { 510 m_temp_reg_12 = _mm_mullo_epi32(m_temp_reg_1, m_coeff1); //src[1]*36 511 m_temp_reg_13 = _mm_mullo_epi32(m_temp_reg_3, m_coeff3); //src[3]*83 512 m_temp_reg_14 = _mm_mullo_epi32(m_temp_reg_1, m_coeff3); //src[1]*83 513 m_temp_reg_15 = _mm_mullo_epi32(m_temp_reg_3, m_coeff1); //src[3]*36 514 } 515 516 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 517 518 /* e1 stored in m_temp_reg_31 */ 519 { 520 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_11); 521 } 522 523 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 524 525 /* e0 stored in m_temp_reg_30 */ 526 { 527 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_11); 528 } 529 530 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 531 m_count = _mm_cvtsi32_si128(i4_shift); 532 533 /* o1 stored in m_temp_reg_33 */ 534 { 535 m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_13); 536 } 537 538 /* e1 + add */ 539 { 540 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 541 } 542 543 /* e0 + add */ 544 { 545 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 546 } 547 548 /* o0 stored in m_temp_reg_34 */ 549 { 550 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_15); 551 } 552 553 /* Stage 1 outputs */ 554 { 555 m_temp_reg_21 = _mm_add_epi32(m_temp_reg_31, m_temp_reg_33); 556 m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_31, m_temp_reg_33); 557 558 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_34); 559 m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_34); 560 561 562 m_temp_reg_21 = _mm_sra_epi32(m_temp_reg_21, m_count); 563 m_temp_reg_20 = _mm_sra_epi32(m_temp_reg_20, m_count); 564 m_temp_reg_22 = _mm_sra_epi32(m_temp_reg_22, m_count); 565 m_temp_reg_23 = _mm_sra_epi32(m_temp_reg_23, m_count); 566 567 m_temp_reg_20 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21); 568 m_temp_reg_21 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23); 569 m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8); 570 m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8); 571 572 m_temp_reg_24 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_22); 573 m_temp_reg_25 = _mm_unpacklo_epi16(m_temp_reg_21, m_temp_reg_23); 574 575 m_temp_reg_20 = _mm_unpacklo_epi32(m_temp_reg_24, m_temp_reg_25); 576 m_temp_reg_21 = _mm_unpackhi_epi32(m_temp_reg_24, m_temp_reg_25); 577 } 578 579 /* Stage 2 */ 580 { 581 i4_shift = IT_SHIFT_STAGE_2; 582 583 m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8); 584 m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8); 585 586 m_temp_reg_20 = _mm_cvtepi16_epi32(m_temp_reg_20); 587 m_temp_reg_21 = _mm_cvtepi16_epi32(m_temp_reg_21); 588 589 m_temp_reg_22 = _mm_cvtepi16_epi32(m_temp_reg_22); 590 m_temp_reg_23 = _mm_cvtepi16_epi32(m_temp_reg_23); 591 592 /* e */ 593 { 594 m_temp_reg_10 = _mm_slli_epi32(m_temp_reg_20, 6); 595 } 596 597 /* o */ 598 { 599 m_temp_reg_12 = _mm_mullo_epi32(m_temp_reg_22, m_coeff1); //src[1]*36 600 m_temp_reg_14 = _mm_mullo_epi32(m_temp_reg_22, m_coeff3); //src[1]*83 601 m_temp_reg_13 = _mm_mullo_epi32(m_temp_reg_23, m_coeff3); //src[3]*83 602 m_temp_reg_15 = _mm_mullo_epi32(m_temp_reg_23, m_coeff1); //src[3]*36 603 } 604 605 /* e */ 606 { 607 m_temp_reg_11 = _mm_slli_epi32(m_temp_reg_21, 6); 608 } 609 610 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 611 612 /* e1 stored in m_temp_reg_31 */ 613 { 614 m_temp_reg_31 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_11); 615 } 616 617 m_rdng_factor = _mm_unpacklo_epi32(m_rdng_factor, m_rdng_factor); 618 619 /* e0 stored in m_temp_reg_30 */ 620 { 621 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_11); 622 } 623 624 m_rdng_factor = _mm_unpacklo_epi64(m_rdng_factor, m_rdng_factor); 625 m_count = _mm_cvtsi32_si128(i4_shift); 626 627 /* o1 stored in m_temp_reg_33 */ 628 { 629 m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_13); 630 } 631 632 /* e1 + add */ 633 { 634 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_31, m_rdng_factor); 635 } 636 637 /* e0 + add */ 638 { 639 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_30, m_rdng_factor); 640 } 641 642 /* o0 stored in m_temp_reg_34 */ 643 { 644 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_14, m_temp_reg_15); 645 } 646 647 /* Stage 2 outputs */ 648 { 649 m_temp_reg_21 = _mm_add_epi32(m_temp_reg_31, m_temp_reg_33); 650 m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_31, m_temp_reg_33); 651 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_30, m_temp_reg_34); 652 m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_30, m_temp_reg_34); 653 654 m_temp_reg_21 = _mm_sra_epi32(m_temp_reg_21, m_count); 655 m_temp_reg_22 = _mm_sra_epi32(m_temp_reg_22, m_count); 656 m_temp_reg_20 = _mm_sra_epi32(m_temp_reg_20, m_count); 657 m_temp_reg_23 = _mm_sra_epi32(m_temp_reg_23, m_count); 658 659 m_temp_reg_20 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21); 660 m_temp_reg_21 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23); 661 m_temp_reg_22 = _mm_srli_si128(m_temp_reg_20, 8); 662 m_temp_reg_23 = _mm_srli_si128(m_temp_reg_21, 8); 663 664 m_temp_reg_24 = _mm_unpacklo_epi16(m_temp_reg_20, m_temp_reg_22); 665 m_temp_reg_25 = _mm_unpacklo_epi16(m_temp_reg_21, m_temp_reg_23); 666 667 m_temp_reg_20 = _mm_unpacklo_epi32(m_temp_reg_24, m_temp_reg_25); 668 m_temp_reg_21 = _mm_unpackhi_epi32(m_temp_reg_24, m_temp_reg_25); 669 } 670 671 /* Recon and store */ 672 { 673 UWORD32 *pu4_dst = (UWORD32 *)pu1_dst; 674 675 m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred); 676 pu1_pred += pred_strd; 677 m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred); 678 pu1_pred += pred_strd; 679 m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred); 680 pu1_pred += pred_strd; 681 m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred); 682 683 m_temp_reg_0 = _mm_cvtepu8_epi16(m_temp_reg_0); 684 m_temp_reg_1 = _mm_cvtepu8_epi16(m_temp_reg_1); 685 m_temp_reg_0 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_1); 686 m_temp_reg_2 = _mm_cvtepu8_epi16(m_temp_reg_2); 687 m_temp_reg_3 = _mm_cvtepu8_epi16(m_temp_reg_3); 688 m_temp_reg_1 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_3); 689 690 m_temp_reg_20 = _mm_add_epi16(m_temp_reg_20, m_temp_reg_0); 691 m_temp_reg_21 = _mm_add_epi16(m_temp_reg_21, m_temp_reg_1); 692 693 m_temp_reg_0 = _mm_packus_epi16(m_temp_reg_20, m_temp_reg_21); 694 695 *pu4_dst = _mm_cvtsi128_si32(m_temp_reg_0); 696 m_temp_reg_1 = _mm_srli_si128(m_temp_reg_0, 4); 697 m_temp_reg_2 = _mm_srli_si128(m_temp_reg_0, 8); 698 m_temp_reg_3 = _mm_srli_si128(m_temp_reg_0, 12); 699 pu1_dst += dst_strd; 700 pu4_dst = (UWORD32 *)(pu1_dst); 701 702 *pu4_dst = _mm_cvtsi128_si32(m_temp_reg_1); 703 pu1_dst += dst_strd; 704 pu4_dst = (UWORD32 *)(pu1_dst); 705 706 *pu4_dst = _mm_cvtsi128_si32(m_temp_reg_2); 707 pu1_dst += dst_strd; 708 pu4_dst = (UWORD32 *)(pu1_dst); 709 710 *pu4_dst = _mm_cvtsi128_si32(m_temp_reg_3); 711 } 712 } 713 } 714 715 716 717 /** 718 ******************************************************************************* 719 * 720 * @brief 721 * This function performs inverse quantization, inverse transform and 722 * reconstruction for 8c8 input block 723 * 724 * @par Description: 725 * Performs inverse quantization , inverse transform and adds the 726 * prediction data and clips output to 8 bit 727 * 728 * @param[in] pi2_src 729 * Input 8x8 coefficients 730 * 731 * @param[in] pi2_tmp 732 * Temporary 8x8 buffer for storing inverse 733 * transform 1st stage output 734 * 735 * @param[in] pu1_pred 736 * Prediction 8x8 block 737 * 738 * @param[in] pi2_dequant_coeff 739 * Dequant Coeffs 740 * 741 * @param[out] pu1_dst 742 * Output 8x8 block 743 * 744 * @param[in] src_strd 745 * Input stride 746 * 747 * @param[in] qp_div 748 * Quantization parameter / 6 749 * 750 * @param[in] qp_rem 751 * Quantization parameter % 6 752 * 753 * @param[in] pred_strd 754 * Prediction stride 755 * 756 * @param[in] dst_strd 757 * Output Stride 758 * 759 * @param[in] zero_cols 760 * Zero columns in pi2_src 761 * 762 * @returns Void 763 * 764 * @remarks 765 * None 766 * 767 ******************************************************************************* 768 */ 769 770 771 void ihevc_itrans_recon_8x8_sse42(WORD16 *pi2_src, 772 WORD16 *pi2_tmp, 773 UWORD8 *pu1_pred, 774 UWORD8 *pu1_dst, 775 WORD32 src_strd, 776 WORD32 pred_strd, 777 WORD32 dst_strd, 778 WORD32 zero_cols, 779 WORD32 zero_rows) 780 { 781 __m128i m_temp_reg_0; 782 __m128i m_temp_reg_1; 783 __m128i m_temp_reg_2; 784 __m128i m_temp_reg_3; 785 __m128i m_temp_reg_5; 786 __m128i m_temp_reg_6; 787 __m128i m_temp_reg_7; 788 __m128i m_temp_reg_4; 789 __m128i m_temp_reg_10; 790 __m128i m_temp_reg_11; 791 __m128i m_temp_reg_12; 792 __m128i m_temp_reg_13; 793 __m128i m_temp_reg_14; 794 __m128i m_temp_reg_15; 795 __m128i m_temp_reg_16; 796 __m128i m_temp_reg_17; 797 __m128i m_temp_reg_20; 798 __m128i m_temp_reg_21; 799 __m128i m_temp_reg_22; 800 __m128i m_temp_reg_23; 801 __m128i m_temp_reg_24; 802 __m128i m_temp_reg_25; 803 __m128i m_temp_reg_26; 804 __m128i m_temp_reg_27; 805 __m128i m_temp_reg_30; 806 __m128i m_temp_reg_31; 807 __m128i m_temp_reg_32; 808 __m128i m_temp_reg_33; 809 __m128i m_temp_reg_34; 810 __m128i m_temp_reg_35; 811 __m128i m_temp_reg_36; 812 __m128i m_temp_reg_37; 813 __m128i m_temp_reg_40; 814 __m128i m_temp_reg_41; 815 __m128i m_temp_reg_42; 816 __m128i m_temp_reg_43; 817 __m128i m_temp_reg_44; 818 __m128i m_temp_reg_45; 819 __m128i m_temp_reg_46; 820 __m128i m_temp_reg_47; 821 __m128i m_temp_reg_50; 822 __m128i m_temp_reg_51; 823 __m128i m_temp_reg_52; 824 __m128i m_temp_reg_53; 825 __m128i m_temp_reg_54; 826 __m128i m_temp_reg_55; 827 __m128i m_temp_reg_56; 828 __m128i m_temp_reg_57; 829 __m128i m_temp_reg_60; 830 __m128i m_temp_reg_61; 831 __m128i m_temp_reg_62; 832 __m128i m_temp_reg_63; 833 __m128i m_temp_reg_64; 834 __m128i m_temp_reg_65; 835 __m128i m_temp_reg_66; 836 __m128i m_temp_reg_67; 837 __m128i m_temp_reg_70; 838 __m128i m_temp_reg_71; 839 __m128i m_temp_reg_72; 840 __m128i m_temp_reg_73; 841 __m128i m_temp_reg_74; 842 __m128i m_temp_reg_75; 843 __m128i m_temp_reg_76; 844 __m128i m_temp_reg_77; 845 __m128i m_coeff1, m_coeff2, m_coeff3, m_coeff4; 846 847 WORD32 check_row_stage_1; /* Lokesh */ 848 WORD32 check_row_stage_2; /* Lokesh */ 849 850 __m128i m_rdng_factor; 851 WORD32 i4_shift = IT_SHIFT_STAGE_1; 852 UNUSED(pi2_tmp); 853 check_row_stage_1 = ((zero_rows & 0xF0) != 0xF0) ? 1 : 0; 854 check_row_stage_2 = ((zero_cols & 0xF0) != 0xF0) ? 1 : 0; 855 856 m_temp_reg_70 = _mm_loadu_si128((__m128i *)pi2_src); 857 pi2_src += src_strd; 858 m_temp_reg_71 = _mm_loadu_si128((__m128i *)pi2_src); 859 pi2_src += src_strd; 860 m_temp_reg_72 = _mm_loadu_si128((__m128i *)pi2_src); 861 pi2_src += src_strd; 862 m_temp_reg_73 = _mm_loadu_si128((__m128i *)pi2_src); 863 pi2_src += src_strd; 864 865 m_temp_reg_74 = _mm_loadu_si128((__m128i *)pi2_src); 866 pi2_src += src_strd; 867 m_temp_reg_75 = _mm_loadu_si128((__m128i *)pi2_src); 868 pi2_src += src_strd; 869 m_temp_reg_76 = _mm_loadu_si128((__m128i *)pi2_src); 870 pi2_src += src_strd; 871 m_temp_reg_77 = _mm_loadu_si128((__m128i *)pi2_src); 872 873 if(!check_row_stage_2) 874 { 875 if(!check_row_stage_1) 876 { 877 /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */ 878 /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */ 879 { 880 //Interleaving 0,4 row in 0 , 1 Rishab 881 /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/ 882 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]); 883 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]); 884 885 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74); 886 887 m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); 888 m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); 889 890 } 891 892 893 /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */ 894 /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */ 895 /* as upper 8 bytes are zeros so m_temp_reg_15 and m_temp_reg_17 are not used*/ 896 { 897 898 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83 899 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36 900 901 /* Combining instructions to eliminate them based on zero_rows : Lokesh */ 902 //Interleaving 2,6 row in 4, 5 Rishab 903 m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); 904 905 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1); 906 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2); 907 908 909 /* Loading coeff for computing o0, o1, o2 and o3 in the next block */ 910 911 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]); 912 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[3][0]); 913 914 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]); 915 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[1][0]); 916 917 918 919 /* e */ 920 921 /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */ 922 /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */ 923 /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */ 924 /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */ 925 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16); 926 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16); 927 928 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14); 929 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14); 930 931 } 932 933 /* o */ 934 { 935 936 /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */ 937 { 938 939 m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); 940 //o0:1B*89+3B*75,5B*50+7B*18 941 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_60, m_coeff1); 942 943 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 944 m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000); 945 946 947 948 /* Column 0 of destination computed here */ 949 /* It is stored in m_temp_reg_50 */ 950 /* Column 7 of destination computed here */ 951 /* It is stored in m_temp_reg_57 */ 952 /* Upper 8 bytes of both registers are zero due to zero_cols*/ 953 954 955 956 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); 957 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); 958 959 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 960 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 961 962 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 963 m_temp_reg_63 = _mm_setzero_si128(); 964 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 965 966 //o1:1B*75-3B*18,5B*89+7B*50 967 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_60, m_coeff3); 968 969 m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 970 m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63); 971 972 /* Loading coeff for computing o2 in the next block */ 973 974 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]); 975 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[5][0]); 976 977 /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */ 978 979 980 981 /* Column 1 of destination computed here */ 982 /* It is stored in m_temp_reg_51 */ 983 /* Column 6 of destination computed here */ 984 /* It is stored in m_temp_reg_56 */ 985 986 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32); 987 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32); 988 989 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 990 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 991 992 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 993 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 994 995 //o2:1B*50-3B*89,5B*18+7B*75 996 m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_60, m_coeff1); 997 998 m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 999 m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63); 1000 1001 1002 /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */ 1003 1004 /* Loading coeff for computing o3 in the next block */ 1005 1006 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]); 1007 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[7][0]); 1008 1009 1010 1011 /* Column 2 of destination computed here */ 1012 /* It is stored in m_temp_reg_52 */ 1013 /* Column 5 of destination computed here */ 1014 /* It is stored in m_temp_reg_55 */ 1015 1016 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34); 1017 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34); 1018 1019 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 1020 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 1021 1022 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 1023 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 1024 1025 //o3:1B*18-3B*50,5B*75-7B*89 1026 m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_60, m_coeff3); 1027 1028 m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 1029 m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63); 1030 1031 1032 1033 /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */ 1034 1035 1036 1037 /* Column 3 of destination computed here */ 1038 /* It is stored in m_temp_reg_53 */ 1039 /* Column 4 of destination computed here */ 1040 /* It is stored in m_temp_reg_54 */ 1041 1042 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36); 1043 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36); 1044 1045 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 1046 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 1047 1048 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 1049 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 1050 1051 1052 m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 1053 m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63); 1054 } 1055 } 1056 1057 /* Transpose of the destination 8x8 matrix done here */ 1058 /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */ 1059 /* respectively */ 1060 { 1061 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51); 1062 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53); 1063 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11); 1064 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11); 1065 1066 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55); 1067 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57); 1068 1069 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13); 1070 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13); 1071 1072 m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4); 1073 m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4); 1074 m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5); 1075 m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5); 1076 1077 m_temp_reg_54 = _mm_setzero_si128(); 1078 m_temp_reg_55 = _mm_setzero_si128(); 1079 m_temp_reg_56 = _mm_setzero_si128(); 1080 m_temp_reg_57 = _mm_setzero_si128(); 1081 } 1082 } 1083 else 1084 { 1085 /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */ 1086 /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */ 1087 { 1088 //Interleaving 0,4 row in 0 , 1 Rishab 1089 /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/ 1090 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]); 1091 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]); 1092 1093 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74); 1094 1095 m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); 1096 m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); 1097 1098 } 1099 1100 1101 /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */ 1102 /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */ 1103 /* as upper 8 bytes are zeros so m_temp_reg_15 and m_temp_reg_17 are not used*/ 1104 { 1105 1106 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83 1107 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36 1108 1109 /* Combining instructions to eliminate them based on zero_rows : Lokesh */ 1110 //Interleaving 2,6 row in 4, 5 Rishab 1111 m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); 1112 1113 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1); 1114 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2); 1115 1116 1117 /* Loading coeff for computing o0, o1, o2 and o3 in the next block */ 1118 1119 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]); 1120 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[3][0]); 1121 1122 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]); 1123 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[1][0]); 1124 1125 1126 1127 /* e */ 1128 1129 /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */ 1130 /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */ 1131 /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */ 1132 /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */ 1133 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16); 1134 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16); 1135 1136 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14); 1137 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14); 1138 1139 } 1140 1141 /* o */ 1142 { 1143 1144 /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */ 1145 { 1146 1147 m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); 1148 m_temp_reg_64 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77); 1149 //o0:1B*89+3B*75,5B*50+7B*18 1150 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1); 1151 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2); 1152 1153 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 1154 m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000); 1155 1156 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24); 1157 1158 1159 1160 /* Column 0 of destination computed here */ 1161 /* It is stored in m_temp_reg_50 */ 1162 /* Column 7 of destination computed here */ 1163 /* It is stored in m_temp_reg_57 */ 1164 /* Upper 8 bytes of both registers are zero due to zero_cols*/ 1165 1166 1167 1168 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); 1169 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); 1170 1171 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 1172 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 1173 1174 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 1175 m_temp_reg_63 = _mm_setzero_si128(); 1176 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 1177 1178 //o1:1B*75-3B*18,5B*89+7B*50 1179 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3); 1180 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4); 1181 1182 m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 1183 m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63); 1184 1185 /* Loading coeff for computing o2 in the next block */ 1186 1187 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]); 1188 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[5][0]); 1189 1190 /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */ 1191 m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26); 1192 1193 1194 1195 /* Column 1 of destination computed here */ 1196 /* It is stored in m_temp_reg_51 */ 1197 /* Column 6 of destination computed here */ 1198 /* It is stored in m_temp_reg_56 */ 1199 1200 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32); 1201 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32); 1202 1203 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 1204 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 1205 1206 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 1207 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 1208 1209 //o2:1B*50-3B*89,5B*18+7B*75 1210 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1); 1211 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2); 1212 1213 m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 1214 m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63); 1215 1216 1217 /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */ 1218 1219 /* Loading coeff for computing o3 in the next block */ 1220 1221 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]); 1222 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[7][0]); 1223 1224 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24); 1225 1226 1227 /* Column 2 of destination computed here */ 1228 /* It is stored in m_temp_reg_52 */ 1229 /* Column 5 of destination computed here */ 1230 /* It is stored in m_temp_reg_55 */ 1231 1232 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34); 1233 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34); 1234 1235 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 1236 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 1237 1238 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 1239 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 1240 1241 //o3:1B*18-3B*50,5B*75-7B*89 1242 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3); 1243 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4); 1244 1245 m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 1246 m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63); 1247 1248 1249 1250 /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */ 1251 1252 m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26); 1253 1254 1255 /* Column 3 of destination computed here */ 1256 /* It is stored in m_temp_reg_53 */ 1257 /* Column 4 of destination computed here */ 1258 /* It is stored in m_temp_reg_54 */ 1259 1260 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36); 1261 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36); 1262 1263 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 1264 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 1265 1266 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 1267 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 1268 1269 1270 m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 1271 m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_63); 1272 } 1273 } 1274 1275 /* Transpose of the destination 8x8 matrix done here */ 1276 /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */ 1277 /* respectively */ 1278 { 1279 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51); 1280 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53); 1281 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11); 1282 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11); 1283 1284 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55); 1285 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57); 1286 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13); 1287 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13); 1288 1289 m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4); 1290 m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4); 1291 m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5); 1292 m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5); 1293 1294 m_temp_reg_54 = _mm_setzero_si128(); 1295 m_temp_reg_55 = _mm_setzero_si128(); 1296 m_temp_reg_56 = _mm_setzero_si128(); 1297 m_temp_reg_57 = _mm_setzero_si128(); 1298 } 1299 } 1300 1301 /* Stage 2 */ 1302 i4_shift = IT_SHIFT_STAGE_2; 1303 { 1304 /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */ 1305 /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */ 1306 { 1307 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]); //add 1308 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]); //sub 1309 1310 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_54); 1311 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_54); 1312 1313 m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); 1314 m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); 1315 m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); 1316 m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2); 1317 1318 1319 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]); 1320 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]); 1321 } 1322 1323 1324 /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */ 1325 /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */ 1326 { 1327 1328 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_56); 1329 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_56); 1330 1331 1332 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); 1333 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); 1334 m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); 1335 m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_1, m_coeff2); 1336 1337 /* Loading coeff for computing o0 in the next block */ 1338 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]); 1339 1340 1341 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_51, m_temp_reg_53); 1342 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_51, m_temp_reg_53); 1343 1344 1345 1346 /* e */ 1347 1348 /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */ 1349 /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */ 1350 /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */ 1351 /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */ 1352 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16); 1353 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16); 1354 1355 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14); 1356 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14); 1357 1358 m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17); 1359 m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17); 1360 1361 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15); 1362 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15); 1363 1364 } 1365 1366 /* o */ 1367 { 1368 1369 /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */ 1370 { 1371 //o0:1B*89+3B*75,1T*89+3T*75 1372 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); 1373 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); 1374 1375 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 1376 m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000); 1377 /* Loading coeff for computing o1 in the next block */ 1378 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]); 1379 1380 1381 1382 /* Column 0 of destination computed here */ 1383 /* It is stored in m_temp_reg_50 */ 1384 /* Column 7 of destination computed here */ 1385 /* It is stored in m_temp_reg_57 */ 1386 1387 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); 1388 m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); 1389 1390 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31); 1391 m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31); 1392 1393 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor); 1394 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor); 1395 m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor); 1396 m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor); 1397 1398 //o1:1B*75-3B*18,1T*75-3T*18 1399 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_0, m_coeff3); 1400 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); 1401 1402 m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift); 1403 m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift); 1404 m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift); 1405 m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift); 1406 1407 m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3); 1408 m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7); 1409 1410 1411 /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */ 1412 1413 1414 /* Loading coeff for computing o2 in the next block */ 1415 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]); 1416 1417 1418 1419 /* Column 1 of destination computed here */ 1420 /* It is stored in m_temp_reg_51 */ 1421 /* Column 6 of destination computed here */ 1422 /* It is stored in m_temp_reg_56 */ 1423 1424 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32); 1425 m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32); 1426 1427 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33); 1428 m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33); 1429 1430 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor); 1431 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor); 1432 m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor); 1433 m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor); 1434 1435 //o2:1B*50-3B*89,5T*18+7T*75. 1436 m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); 1437 m_temp_reg_35 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); 1438 1439 m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift); 1440 m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift); 1441 m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift); 1442 m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift); 1443 1444 m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3); 1445 m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7); 1446 1447 1448 /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */ 1449 1450 /* Loading coeff for computing o3 in the next block */ 1451 1452 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]); 1453 1454 1455 /* Column 2 of destination computed here */ 1456 /* It is stored in m_temp_reg_52 */ 1457 /* Column 5 of destination computed here */ 1458 /* It is stored in m_temp_reg_55 */ 1459 1460 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34); 1461 m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34); 1462 1463 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35); 1464 m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35); 1465 1466 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor); 1467 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor); 1468 m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor); 1469 m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor); 1470 1471 //o3:1B*18-3B*50,1T*18-3T*50 1472 m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_0, m_coeff3); 1473 m_temp_reg_37 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); 1474 1475 m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift); 1476 m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift); 1477 m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift); 1478 m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift); 1479 1480 1481 m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3); 1482 m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7); 1483 1484 1485 1486 /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */ 1487 1488 1489 /* Column 3 of destination computed here */ 1490 /* It is stored in m_temp_reg_53 */ 1491 /* Column 4 of destination computed here */ 1492 /* It is stored in m_temp_reg_54 */ 1493 1494 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36); 1495 m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36); 1496 1497 m_temp_reg_21 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37); 1498 m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37); 1499 1500 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_rdng_factor); 1501 m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_rdng_factor); 1502 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_rdng_factor); 1503 m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_rdng_factor); 1504 1505 m_temp_reg_20 = _mm_srai_epi32(m_temp_reg_20, i4_shift); 1506 m_temp_reg_21 = _mm_srai_epi32(m_temp_reg_21, i4_shift); 1507 m_temp_reg_22 = _mm_srai_epi32(m_temp_reg_22, i4_shift); 1508 m_temp_reg_23 = _mm_srai_epi32(m_temp_reg_23, i4_shift); 1509 1510 m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21); 1511 m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23); 1512 } 1513 } 1514 1515 /* Transpose of the destination 8x8 matrix done here */ 1516 /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */ 1517 /* respectively */ 1518 { 1519 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51); 1520 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53); 1521 m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51); 1522 m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53); 1523 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11); 1524 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11); 1525 m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15); 1526 m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15); 1527 1528 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55); 1529 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57); 1530 m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55); 1531 m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57); 1532 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13); 1533 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13); 1534 m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17); 1535 m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17); 1536 m_temp_reg_10 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4); 1537 m_temp_reg_11 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4); 1538 m_temp_reg_12 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5); 1539 m_temp_reg_13 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5); 1540 1541 m_temp_reg_14 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6); 1542 m_temp_reg_15 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6); 1543 m_temp_reg_16 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7); 1544 m_temp_reg_17 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7); 1545 } 1546 1547 /* Recon and store */ 1548 { 1549 m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred); 1550 pu1_pred += pred_strd; 1551 m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred); 1552 pu1_pred += pred_strd; 1553 m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred); 1554 pu1_pred += pred_strd; 1555 m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred); 1556 pu1_pred += pred_strd; 1557 m_temp_reg_4 = _mm_loadl_epi64((__m128i *)pu1_pred); 1558 pu1_pred += pred_strd; 1559 m_temp_reg_5 = _mm_loadl_epi64((__m128i *)pu1_pred); 1560 pu1_pred += pred_strd; 1561 m_temp_reg_6 = _mm_loadl_epi64((__m128i *)pu1_pred); 1562 pu1_pred += pred_strd; 1563 m_temp_reg_7 = _mm_loadl_epi64((__m128i *)pu1_pred); 1564 1565 m_temp_reg_50 = _mm_setzero_si128(); 1566 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, m_temp_reg_50); 1567 m_temp_reg_1 = _mm_unpacklo_epi8(m_temp_reg_1, m_temp_reg_50); 1568 m_temp_reg_2 = _mm_unpacklo_epi8(m_temp_reg_2, m_temp_reg_50); 1569 m_temp_reg_3 = _mm_unpacklo_epi8(m_temp_reg_3, m_temp_reg_50); 1570 m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_50); 1571 m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_50); 1572 m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_6, m_temp_reg_50); 1573 m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_7, m_temp_reg_50); 1574 1575 m_temp_reg_50 = _mm_add_epi16(m_temp_reg_10, m_temp_reg_0); 1576 m_temp_reg_51 = _mm_add_epi16(m_temp_reg_11, m_temp_reg_1); 1577 m_temp_reg_52 = _mm_add_epi16(m_temp_reg_12, m_temp_reg_2); 1578 m_temp_reg_53 = _mm_add_epi16(m_temp_reg_13, m_temp_reg_3); 1579 m_temp_reg_54 = _mm_add_epi16(m_temp_reg_14, m_temp_reg_4); 1580 m_temp_reg_55 = _mm_add_epi16(m_temp_reg_15, m_temp_reg_5); 1581 m_temp_reg_56 = _mm_add_epi16(m_temp_reg_16, m_temp_reg_6); 1582 m_temp_reg_57 = _mm_add_epi16(m_temp_reg_17, m_temp_reg_7); 1583 1584 m_temp_reg_50 = _mm_packus_epi16(m_temp_reg_50, m_temp_reg_50); 1585 m_temp_reg_51 = _mm_packus_epi16(m_temp_reg_51, m_temp_reg_51); 1586 m_temp_reg_52 = _mm_packus_epi16(m_temp_reg_52, m_temp_reg_52); 1587 m_temp_reg_53 = _mm_packus_epi16(m_temp_reg_53, m_temp_reg_53); 1588 m_temp_reg_54 = _mm_packus_epi16(m_temp_reg_54, m_temp_reg_54); 1589 m_temp_reg_55 = _mm_packus_epi16(m_temp_reg_55, m_temp_reg_55); 1590 m_temp_reg_56 = _mm_packus_epi16(m_temp_reg_56, m_temp_reg_56); 1591 m_temp_reg_57 = _mm_packus_epi16(m_temp_reg_57, m_temp_reg_57); 1592 1593 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_50); 1594 pu1_dst += dst_strd; 1595 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_51); 1596 pu1_dst += dst_strd; 1597 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_52); 1598 pu1_dst += dst_strd; 1599 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_53); 1600 pu1_dst += dst_strd; 1601 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_54); 1602 pu1_dst += dst_strd; 1603 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_55); 1604 pu1_dst += dst_strd; 1605 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_56); 1606 pu1_dst += dst_strd; 1607 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_57); 1608 pu1_dst += dst_strd; 1609 } 1610 } 1611 } 1612 else 1613 1614 { 1615 1616 /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */ 1617 /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */ 1618 if(!check_row_stage_1) 1619 { 1620 /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */ 1621 /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */ 1622 { 1623 //Interleaving 0,4 row in 0 , 1 Rishab 1624 /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/ 1625 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]); 1626 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]); 1627 1628 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74); 1629 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74); 1630 1631 m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); 1632 m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); 1633 1634 1635 m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); 1636 m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2); 1637 } 1638 1639 1640 /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */ 1641 /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */ 1642 { 1643 1644 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83 1645 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36 1646 1647 /* Combining instructions to eliminate them based on zero_rows : Lokesh */ 1648 //Interleaving 2,6 row in 4, 5 Rishab 1649 m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); 1650 m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); 1651 1652 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1); 1653 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2); 1654 1655 m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_5, m_coeff1); 1656 m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_5, m_coeff2); 1657 1658 1659 1660 /* Loading coeff for computing o0, o1, o2 and o3 in the next block */ 1661 1662 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]); 1663 //m_coeff4 = _mm_loadu_si128((__m128i *) &g_ai2_ihevc_trans_intr_odd_8[3][0]); 1664 1665 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]); 1666 //m_coeff2 = _mm_loadu_si128((__m128i *) &g_ai2_ihevc_trans_intr_odd_8[1][0]); 1667 1668 } 1669 1670 /* e */ 1671 { 1672 /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */ 1673 /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */ 1674 /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */ 1675 /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */ 1676 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16); 1677 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16); 1678 1679 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14); 1680 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14); 1681 1682 m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17); 1683 m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17); 1684 1685 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15); 1686 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15); 1687 1688 } 1689 1690 /* o */ 1691 { 1692 1693 /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */ 1694 { 1695 1696 m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); 1697 m_temp_reg_61 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73); 1698 //o0:1B*89+3B*75,1T*89+3T*75 1699 m_temp_reg_30 = _mm_madd_epi16(m_temp_reg_60, m_coeff1); 1700 m_temp_reg_31 = _mm_madd_epi16(m_temp_reg_61, m_coeff1); 1701 1702 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 1703 m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000); 1704 1705 } 1706 1707 /* Column 0 of destination computed here */ 1708 /* It is stored in m_temp_reg_50 */ 1709 /* Column 7 of destination computed here */ 1710 /* It is stored in m_temp_reg_57 */ 1711 { 1712 1713 1714 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); 1715 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); 1716 1717 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31); 1718 m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31); 1719 1720 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 1721 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor); 1722 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 1723 m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor); 1724 1725 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 1726 m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift); 1727 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 1728 m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift); 1729 1730 //o1:1B*75-3B*18,1T*75-3T*18,5B*89+7B*50,5T*89+7T*50 1731 m_temp_reg_32 = _mm_madd_epi16(m_temp_reg_60, m_coeff3); 1732 m_temp_reg_33 = _mm_madd_epi16(m_temp_reg_61, m_coeff3); 1733 1734 m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 1735 m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67); 1736 1737 /* Loading coeff for computing o2 in the next block */ 1738 1739 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]); 1740 1741 } 1742 1743 /* Column 1 of destination computed here */ 1744 /* It is stored in m_temp_reg_51 */ 1745 /* Column 6 of destination computed here */ 1746 /* It is stored in m_temp_reg_56 */ 1747 { 1748 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32); 1749 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32); 1750 1751 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33); 1752 m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33); 1753 1754 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 1755 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 1756 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor); 1757 m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor); 1758 1759 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 1760 m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift); 1761 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 1762 m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift); 1763 1764 //o2:1B*50-3B*89,1T*50-3T*89 1765 m_temp_reg_34 = _mm_madd_epi16(m_temp_reg_60, m_coeff1); 1766 m_temp_reg_35 = _mm_madd_epi16(m_temp_reg_61, m_coeff1); 1767 1768 m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 1769 m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67); 1770 1771 1772 /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */ 1773 1774 1775 /* Loading coeff for computing o3 in the next block */ 1776 1777 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]); 1778 1779 } 1780 1781 /* Column 2 of destination computed here */ 1782 /* It is stored in m_temp_reg_52 */ 1783 /* Column 5 of destination computed here */ 1784 /* It is stored in m_temp_reg_55 */ 1785 { 1786 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34); 1787 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34); 1788 1789 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35); 1790 m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35); 1791 1792 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 1793 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor); 1794 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 1795 m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor); 1796 1797 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 1798 m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift); 1799 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 1800 m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift); 1801 1802 //o3:1B*18-3B*50,1T*18-3T*50 1803 m_temp_reg_36 = _mm_madd_epi16(m_temp_reg_60, m_coeff3); 1804 m_temp_reg_37 = _mm_madd_epi16(m_temp_reg_61, m_coeff3); 1805 1806 m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 1807 m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67); 1808 1809 1810 1811 /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */ 1812 1813 1814 } 1815 1816 /* Column 3 of destination computed here */ 1817 /* It is stored in m_temp_reg_53 */ 1818 /* Column 4 of destination computed here */ 1819 /* It is stored in m_temp_reg_54 */ 1820 { 1821 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36); 1822 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36); 1823 1824 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37); 1825 m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37); 1826 1827 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 1828 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor); 1829 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 1830 m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor); 1831 1832 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 1833 m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift); 1834 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 1835 m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift); 1836 1837 m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 1838 m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67); 1839 } 1840 } 1841 1842 /* Transpose of the destination 8x8 matrix done here */ 1843 /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */ 1844 /* respectively */ 1845 { 1846 1847 1848 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51); 1849 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53); 1850 m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51); 1851 m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53); 1852 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11); 1853 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11); 1854 m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15); 1855 m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15); 1856 1857 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55); 1858 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57); 1859 m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55); 1860 m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57); 1861 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13); 1862 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13); 1863 m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17); 1864 m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17); 1865 1866 m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4); 1867 m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4); 1868 m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5); 1869 m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5); 1870 1871 m_temp_reg_54 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6); 1872 m_temp_reg_55 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6); 1873 m_temp_reg_56 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7); 1874 m_temp_reg_57 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7); 1875 } 1876 } 1877 else 1878 { 1879 1880 /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */ 1881 /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */ 1882 { 1883 //Interleaving 0,4 row in 0 , 1 Rishab 1884 /*coef2 for m_temp_reg_12 and m_temp_reg_13 , coef1 for m_temp_reg_10 and m_temp_reg_11*/ 1885 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]); 1886 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]); 1887 1888 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_70, m_temp_reg_74); 1889 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_70, m_temp_reg_74); 1890 1891 m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); 1892 m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); 1893 1894 1895 m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); 1896 m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2); 1897 } 1898 1899 1900 /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */ 1901 /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */ 1902 { 1903 1904 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]); //sub 2B*36-6B*83 ,2T*36-6T*83 1905 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]); //add 2B*83+6B*36 ,2T*83+6T*36 1906 1907 /* Combining instructions to eliminate them based on zero_rows : Lokesh */ 1908 //Interleaving 2,6 row in 4, 5 Rishab 1909 m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_72, m_temp_reg_76); 1910 m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_72, m_temp_reg_76); 1911 1912 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_4, m_coeff1); 1913 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_4, m_coeff2); 1914 1915 m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_5, m_coeff1); 1916 m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_5, m_coeff2); 1917 1918 1919 1920 /* Loading coeff for computing o0, o1, o2 and o3 in the next block */ 1921 1922 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]); 1923 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[3][0]); 1924 1925 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]); 1926 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[1][0]); 1927 1928 } 1929 1930 /* e */ 1931 { 1932 /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */ 1933 /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */ 1934 /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */ 1935 /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */ 1936 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16); 1937 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16); 1938 1939 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14); 1940 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14); 1941 1942 m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17); 1943 m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17); 1944 1945 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15); 1946 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15); 1947 1948 } 1949 1950 /* o */ 1951 { 1952 1953 /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */ 1954 { 1955 1956 m_temp_reg_60 = _mm_unpacklo_epi16(m_temp_reg_71, m_temp_reg_73); 1957 m_temp_reg_61 = _mm_unpackhi_epi16(m_temp_reg_71, m_temp_reg_73); 1958 m_temp_reg_64 = _mm_unpacklo_epi16(m_temp_reg_75, m_temp_reg_77); 1959 m_temp_reg_65 = _mm_unpackhi_epi16(m_temp_reg_75, m_temp_reg_77); 1960 //o0:1B*89+3B*75,1T*89+3T*75,5B*50+7B*18,5T*50+7T*18 1961 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1); 1962 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_61, m_coeff1); 1963 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2); 1964 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_65, m_coeff2); 1965 1966 1967 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 1968 m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000); 1969 1970 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24); 1971 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25); 1972 } 1973 1974 /* Column 0 of destination computed here */ 1975 /* It is stored in m_temp_reg_50 */ 1976 /* Column 7 of destination computed here */ 1977 /* It is stored in m_temp_reg_57 */ 1978 { 1979 1980 1981 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); 1982 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); 1983 1984 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31); 1985 m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31); 1986 1987 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 1988 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor); 1989 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 1990 m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor); 1991 1992 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 1993 m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift); 1994 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 1995 m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift); 1996 1997 //o1:1B*75-3B*18,1T*75-3T*18,5B*89+7B*50,5T*89+7T*50 1998 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3); 1999 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4); 2000 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_61, m_coeff3); 2001 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_65, m_coeff4); 2002 2003 m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 2004 m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67); 2005 2006 /* Loading coeff for computing o2 in the next block */ 2007 2008 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]); 2009 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[5][0]); 2010 2011 /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */ 2012 m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26); 2013 m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_27); 2014 } 2015 2016 /* Column 1 of destination computed here */ 2017 /* It is stored in m_temp_reg_51 */ 2018 /* Column 6 of destination computed here */ 2019 /* It is stored in m_temp_reg_56 */ 2020 { 2021 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32); 2022 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32); 2023 2024 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33); 2025 m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33); 2026 2027 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 2028 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 2029 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor); 2030 m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor); 2031 2032 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 2033 m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift); 2034 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 2035 m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift); 2036 2037 //o2:1B*50-3B*89,1T*50-3T*89,5B*18+7B*75,5T*18+7T*75 2038 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_60, m_coeff1); 2039 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_64, m_coeff2); 2040 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_61, m_coeff1); 2041 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_65, m_coeff2); 2042 2043 m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 2044 m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67); 2045 2046 2047 /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */ 2048 2049 2050 /* Loading coeff for computing o3 in the next block */ 2051 2052 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]); 2053 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[7][0]); 2054 2055 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24); 2056 m_temp_reg_35 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25); 2057 } 2058 2059 /* Column 2 of destination computed here */ 2060 /* It is stored in m_temp_reg_52 */ 2061 /* Column 5 of destination computed here */ 2062 /* It is stored in m_temp_reg_55 */ 2063 { 2064 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34); 2065 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34); 2066 2067 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35); 2068 m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35); 2069 2070 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 2071 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor); 2072 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 2073 m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor); 2074 2075 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 2076 m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift); 2077 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 2078 m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift); 2079 2080 //o3:1B*18-3B*50,1T*18-3T*50,5B*75-7B*89,5T*75-7T*89 2081 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_60, m_coeff3); 2082 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_64, m_coeff4); 2083 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_61, m_coeff3); 2084 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_65, m_coeff4); 2085 2086 m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 2087 m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67); 2088 2089 2090 2091 /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */ 2092 2093 2094 m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26); 2095 m_temp_reg_37 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_27); 2096 } 2097 2098 /* Column 3 of destination computed here */ 2099 /* It is stored in m_temp_reg_53 */ 2100 /* Column 4 of destination computed here */ 2101 /* It is stored in m_temp_reg_54 */ 2102 { 2103 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36); 2104 m_temp_reg_66 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36); 2105 2106 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37); 2107 m_temp_reg_67 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37); 2108 2109 m_temp_reg_62 = _mm_add_epi32(m_temp_reg_62, m_rdng_factor); 2110 m_temp_reg_63 = _mm_add_epi32(m_temp_reg_63, m_rdng_factor); 2111 m_temp_reg_66 = _mm_add_epi32(m_temp_reg_66, m_rdng_factor); 2112 m_temp_reg_67 = _mm_add_epi32(m_temp_reg_67, m_rdng_factor); 2113 2114 m_temp_reg_62 = _mm_srai_epi32(m_temp_reg_62, i4_shift); 2115 m_temp_reg_63 = _mm_srai_epi32(m_temp_reg_63, i4_shift); 2116 m_temp_reg_66 = _mm_srai_epi32(m_temp_reg_66, i4_shift); 2117 m_temp_reg_67 = _mm_srai_epi32(m_temp_reg_67, i4_shift); 2118 2119 m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_62, m_temp_reg_63); 2120 m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_66, m_temp_reg_67); 2121 } 2122 } 2123 2124 /* Transpose of the destination 8x8 matrix done here */ 2125 /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */ 2126 /* respectively */ 2127 { 2128 2129 2130 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51); 2131 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53); 2132 m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51); 2133 m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53); 2134 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11); 2135 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11); 2136 m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15); 2137 m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15); 2138 2139 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55); 2140 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57); 2141 m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55); 2142 m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57); 2143 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13); 2144 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13); 2145 m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17); 2146 m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17); 2147 2148 m_temp_reg_50 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4); 2149 m_temp_reg_51 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4); 2150 m_temp_reg_52 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5); 2151 m_temp_reg_53 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5); 2152 2153 m_temp_reg_54 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6); 2154 m_temp_reg_55 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6); 2155 m_temp_reg_56 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7); 2156 m_temp_reg_57 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7); 2157 } 2158 } 2159 /* Stage 2 */ 2160 2161 i4_shift = IT_SHIFT_STAGE_2; 2162 2163 { 2164 2165 /* ee0 is present in the registers m_temp_reg_10 and m_temp_reg_11 */ 2166 /* ee1 is present in the registers m_temp_reg_12 and m_temp_reg_13 */ 2167 { 2168 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[0][0]); //add 2169 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[3][0]); //sub 2170 2171 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_54); 2172 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_54); 2173 2174 m_temp_reg_10 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); 2175 m_temp_reg_12 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); 2176 m_temp_reg_11 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); 2177 m_temp_reg_13 = _mm_madd_epi16(m_temp_reg_1, m_coeff2); 2178 2179 2180 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[1][0]); 2181 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_even_8[2][0]); 2182 } 2183 2184 2185 /* eo0 is present in the registers m_temp_reg_14 and m_temp_reg_15 */ 2186 /* eo1 is present in the registers m_temp_reg_16 and m_temp_reg_17 */ 2187 { 2188 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_56); 2189 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_56); 2190 2191 2192 m_temp_reg_16 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); 2193 m_temp_reg_14 = _mm_madd_epi16(m_temp_reg_0, m_coeff2); 2194 m_temp_reg_17 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); 2195 m_temp_reg_15 = _mm_madd_epi16(m_temp_reg_1, m_coeff2); 2196 2197 /* Loading coeff for computing o0 in the next block */ 2198 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[0][0]); 2199 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[1][0]); 2200 2201 2202 m_temp_reg_0 = _mm_unpacklo_epi16(m_temp_reg_51, m_temp_reg_53); 2203 m_temp_reg_1 = _mm_unpackhi_epi16(m_temp_reg_51, m_temp_reg_53); 2204 } 2205 2206 /* e */ 2207 { 2208 /* e0 stored in m_temp_reg_40 and m_temp_reg_41 */ 2209 /* e1 stored in m_temp_reg_42 and m_temp_reg_43 */ 2210 /* e3 stored in m_temp_reg_46 and m_temp_reg_47 */ 2211 /* e2 stored in m_temp_reg_44 and m_temp_reg_45 */ 2212 m_temp_reg_42 = _mm_add_epi32(m_temp_reg_12, m_temp_reg_16); 2213 m_temp_reg_44 = _mm_sub_epi32(m_temp_reg_12, m_temp_reg_16); 2214 2215 m_temp_reg_40 = _mm_add_epi32(m_temp_reg_10, m_temp_reg_14); 2216 m_temp_reg_46 = _mm_sub_epi32(m_temp_reg_10, m_temp_reg_14); 2217 2218 m_temp_reg_43 = _mm_add_epi32(m_temp_reg_13, m_temp_reg_17); 2219 m_temp_reg_45 = _mm_sub_epi32(m_temp_reg_13, m_temp_reg_17); 2220 2221 m_temp_reg_41 = _mm_add_epi32(m_temp_reg_11, m_temp_reg_15); 2222 m_temp_reg_47 = _mm_sub_epi32(m_temp_reg_11, m_temp_reg_15); 2223 2224 } 2225 2226 /* o */ 2227 { 2228 m_temp_reg_4 = _mm_unpacklo_epi16(m_temp_reg_55, m_temp_reg_57); 2229 m_temp_reg_5 = _mm_unpackhi_epi16(m_temp_reg_55, m_temp_reg_57); 2230 2231 /* o0 stored in m_temp_reg_30 and m_temp_reg_31 */ 2232 { 2233 //o0:1B*89+3B*75,1T*89+3T*75,5B*50+7B*18,5T*50+7T*18 2234 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); 2235 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); 2236 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_4, m_coeff2); 2237 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_5, m_coeff2); 2238 2239 m_rdng_factor = _mm_cvtsi32_si128((1 << (i4_shift - 1))); 2240 m_rdng_factor = _mm_shuffle_epi32(m_rdng_factor, 0x0000); 2241 /* Loading coeff for computing o1 in the next block */ 2242 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[2][0]); 2243 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[3][0]); 2244 2245 m_temp_reg_30 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24); 2246 m_temp_reg_31 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25); 2247 } 2248 2249 /* Column 0 of destination computed here */ 2250 /* It is stored in m_temp_reg_50 */ 2251 /* Column 7 of destination computed here */ 2252 /* It is stored in m_temp_reg_57 */ 2253 { 2254 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_40, m_temp_reg_30); 2255 m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_40, m_temp_reg_30); 2256 2257 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_41, m_temp_reg_31); 2258 m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_41, m_temp_reg_31); 2259 2260 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor); 2261 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor); 2262 m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor); 2263 m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor); 2264 2265 m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift); 2266 m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift); 2267 m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift); 2268 m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift); 2269 2270 //o1:1B*75-3B*18,1T*75-3T*18,5B*89+7B*50,5T*89+7T*50 2271 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff3); 2272 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_4, m_coeff4); 2273 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); 2274 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_5, m_coeff4); 2275 2276 m_temp_reg_50 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3); 2277 m_temp_reg_57 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7); 2278 2279 2280 /* o1 stored in m_temp_reg_32 and m_temp_reg_33 */ 2281 2282 2283 /* Loading coeff for computing o2 in the next block */ 2284 m_coeff1 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[4][0]); 2285 m_coeff2 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[5][0]); 2286 2287 m_temp_reg_32 = _mm_sub_epi32(m_temp_reg_22, m_temp_reg_26); 2288 m_temp_reg_33 = _mm_sub_epi32(m_temp_reg_23, m_temp_reg_27); 2289 } 2290 2291 /* Column 1 of destination computed here */ 2292 /* It is stored in m_temp_reg_51 */ 2293 /* Column 6 of destination computed here */ 2294 /* It is stored in m_temp_reg_56 */ 2295 { 2296 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_42, m_temp_reg_32); 2297 m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_42, m_temp_reg_32); 2298 2299 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_43, m_temp_reg_33); 2300 m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_43, m_temp_reg_33); 2301 2302 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor); 2303 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor); 2304 m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor); 2305 m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor); 2306 2307 m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift); 2308 m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift); 2309 m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift); 2310 m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift); 2311 2312 //o2:1B*50-3B*89,1T*50-3T*89,5B*18+7B*75,5T*18+7T*75 2313 m_temp_reg_20 = _mm_madd_epi16(m_temp_reg_0, m_coeff1); 2314 m_temp_reg_24 = _mm_madd_epi16(m_temp_reg_4, m_coeff2); 2315 m_temp_reg_21 = _mm_madd_epi16(m_temp_reg_1, m_coeff1); 2316 m_temp_reg_25 = _mm_madd_epi16(m_temp_reg_5, m_coeff2); 2317 2318 m_temp_reg_51 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3); 2319 m_temp_reg_56 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7); 2320 2321 2322 /* o2 stored in m_temp_reg_34 and m_temp_reg_35 */ 2323 2324 /* Loading coeff for computing o3 in the next block */ 2325 2326 m_coeff3 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[6][0]); 2327 m_coeff4 = _mm_loadu_si128((__m128i *)&g_ai2_ihevc_trans_intr_odd_8[7][0]); 2328 2329 m_temp_reg_34 = _mm_add_epi32(m_temp_reg_20, m_temp_reg_24); 2330 m_temp_reg_35 = _mm_add_epi32(m_temp_reg_21, m_temp_reg_25); 2331 } 2332 2333 /* Column 2 of destination computed here */ 2334 /* It is stored in m_temp_reg_52 */ 2335 /* Column 5 of destination computed here */ 2336 /* It is stored in m_temp_reg_55 */ 2337 { 2338 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_44, m_temp_reg_34); 2339 m_temp_reg_6 = _mm_sub_epi32(m_temp_reg_44, m_temp_reg_34); 2340 2341 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_45, m_temp_reg_35); 2342 m_temp_reg_7 = _mm_sub_epi32(m_temp_reg_45, m_temp_reg_35); 2343 2344 m_temp_reg_2 = _mm_add_epi32(m_temp_reg_2, m_rdng_factor); 2345 m_temp_reg_3 = _mm_add_epi32(m_temp_reg_3, m_rdng_factor); 2346 m_temp_reg_6 = _mm_add_epi32(m_temp_reg_6, m_rdng_factor); 2347 m_temp_reg_7 = _mm_add_epi32(m_temp_reg_7, m_rdng_factor); 2348 2349 m_temp_reg_2 = _mm_srai_epi32(m_temp_reg_2, i4_shift); 2350 m_temp_reg_3 = _mm_srai_epi32(m_temp_reg_3, i4_shift); 2351 m_temp_reg_6 = _mm_srai_epi32(m_temp_reg_6, i4_shift); 2352 m_temp_reg_7 = _mm_srai_epi32(m_temp_reg_7, i4_shift); 2353 2354 //o3:1B*18-3B*50,1T*18-3T*50,5B*75-7B*89,5T*75-7T*89 2355 m_temp_reg_22 = _mm_madd_epi16(m_temp_reg_0, m_coeff3); 2356 m_temp_reg_26 = _mm_madd_epi16(m_temp_reg_4, m_coeff4); 2357 m_temp_reg_23 = _mm_madd_epi16(m_temp_reg_1, m_coeff3); 2358 m_temp_reg_27 = _mm_madd_epi16(m_temp_reg_5, m_coeff4); 2359 2360 m_temp_reg_52 = _mm_packs_epi32(m_temp_reg_2, m_temp_reg_3); 2361 m_temp_reg_55 = _mm_packs_epi32(m_temp_reg_6, m_temp_reg_7); 2362 2363 2364 2365 /* o3 stored in m_temp_reg_36 and m_temp_reg_37 */ 2366 2367 2368 m_temp_reg_36 = _mm_add_epi32(m_temp_reg_22, m_temp_reg_26); 2369 m_temp_reg_37 = _mm_add_epi32(m_temp_reg_23, m_temp_reg_27); 2370 } 2371 2372 /* Column 3 of destination computed here */ 2373 /* It is stored in m_temp_reg_53 */ 2374 /* Column 4 of destination computed here */ 2375 /* It is stored in m_temp_reg_54 */ 2376 { 2377 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_46, m_temp_reg_36); 2378 m_temp_reg_22 = _mm_sub_epi32(m_temp_reg_46, m_temp_reg_36); 2379 2380 m_temp_reg_21 = _mm_add_epi32(m_temp_reg_47, m_temp_reg_37); 2381 m_temp_reg_23 = _mm_sub_epi32(m_temp_reg_47, m_temp_reg_37); 2382 2383 m_temp_reg_20 = _mm_add_epi32(m_temp_reg_20, m_rdng_factor); 2384 m_temp_reg_21 = _mm_add_epi32(m_temp_reg_21, m_rdng_factor); 2385 m_temp_reg_22 = _mm_add_epi32(m_temp_reg_22, m_rdng_factor); 2386 m_temp_reg_23 = _mm_add_epi32(m_temp_reg_23, m_rdng_factor); 2387 2388 m_temp_reg_20 = _mm_srai_epi32(m_temp_reg_20, i4_shift); 2389 m_temp_reg_21 = _mm_srai_epi32(m_temp_reg_21, i4_shift); 2390 m_temp_reg_22 = _mm_srai_epi32(m_temp_reg_22, i4_shift); 2391 m_temp_reg_23 = _mm_srai_epi32(m_temp_reg_23, i4_shift); 2392 2393 m_temp_reg_53 = _mm_packs_epi32(m_temp_reg_20, m_temp_reg_21); 2394 m_temp_reg_54 = _mm_packs_epi32(m_temp_reg_22, m_temp_reg_23); 2395 } 2396 } 2397 2398 /* Transpose of the destination 8x8 matrix done here */ 2399 /* and ultimately stored in registers m_temp_reg_50 to m_temp_reg_57 */ 2400 /* respectively */ 2401 { 2402 m_temp_reg_10 = _mm_unpacklo_epi16(m_temp_reg_50, m_temp_reg_51); 2403 m_temp_reg_11 = _mm_unpacklo_epi16(m_temp_reg_52, m_temp_reg_53); 2404 m_temp_reg_14 = _mm_unpackhi_epi16(m_temp_reg_50, m_temp_reg_51); 2405 m_temp_reg_15 = _mm_unpackhi_epi16(m_temp_reg_52, m_temp_reg_53); 2406 m_temp_reg_0 = _mm_unpacklo_epi32(m_temp_reg_10, m_temp_reg_11); 2407 m_temp_reg_1 = _mm_unpackhi_epi32(m_temp_reg_10, m_temp_reg_11); 2408 m_temp_reg_2 = _mm_unpacklo_epi32(m_temp_reg_14, m_temp_reg_15); 2409 m_temp_reg_3 = _mm_unpackhi_epi32(m_temp_reg_14, m_temp_reg_15); 2410 2411 m_temp_reg_12 = _mm_unpacklo_epi16(m_temp_reg_54, m_temp_reg_55); 2412 m_temp_reg_13 = _mm_unpacklo_epi16(m_temp_reg_56, m_temp_reg_57); 2413 m_temp_reg_16 = _mm_unpackhi_epi16(m_temp_reg_54, m_temp_reg_55); 2414 m_temp_reg_17 = _mm_unpackhi_epi16(m_temp_reg_56, m_temp_reg_57); 2415 m_temp_reg_4 = _mm_unpacklo_epi32(m_temp_reg_12, m_temp_reg_13); 2416 m_temp_reg_5 = _mm_unpackhi_epi32(m_temp_reg_12, m_temp_reg_13); 2417 m_temp_reg_6 = _mm_unpacklo_epi32(m_temp_reg_16, m_temp_reg_17); 2418 m_temp_reg_7 = _mm_unpackhi_epi32(m_temp_reg_16, m_temp_reg_17); 2419 m_temp_reg_10 = _mm_unpacklo_epi64(m_temp_reg_0, m_temp_reg_4); 2420 m_temp_reg_11 = _mm_unpackhi_epi64(m_temp_reg_0, m_temp_reg_4); 2421 m_temp_reg_12 = _mm_unpacklo_epi64(m_temp_reg_1, m_temp_reg_5); 2422 m_temp_reg_13 = _mm_unpackhi_epi64(m_temp_reg_1, m_temp_reg_5); 2423 2424 m_temp_reg_14 = _mm_unpacklo_epi64(m_temp_reg_2, m_temp_reg_6); 2425 m_temp_reg_15 = _mm_unpackhi_epi64(m_temp_reg_2, m_temp_reg_6); 2426 m_temp_reg_16 = _mm_unpacklo_epi64(m_temp_reg_3, m_temp_reg_7); 2427 m_temp_reg_17 = _mm_unpackhi_epi64(m_temp_reg_3, m_temp_reg_7); 2428 } 2429 2430 /* Recon and store */ 2431 { 2432 m_temp_reg_0 = _mm_loadl_epi64((__m128i *)pu1_pred); 2433 pu1_pred += pred_strd; 2434 m_temp_reg_1 = _mm_loadl_epi64((__m128i *)pu1_pred); 2435 pu1_pred += pred_strd; 2436 m_temp_reg_2 = _mm_loadl_epi64((__m128i *)pu1_pred); 2437 pu1_pred += pred_strd; 2438 m_temp_reg_3 = _mm_loadl_epi64((__m128i *)pu1_pred); 2439 pu1_pred += pred_strd; 2440 m_temp_reg_4 = _mm_loadl_epi64((__m128i *)pu1_pred); 2441 pu1_pred += pred_strd; 2442 m_temp_reg_5 = _mm_loadl_epi64((__m128i *)pu1_pred); 2443 pu1_pred += pred_strd; 2444 m_temp_reg_6 = _mm_loadl_epi64((__m128i *)pu1_pred); 2445 pu1_pred += pred_strd; 2446 m_temp_reg_7 = _mm_loadl_epi64((__m128i *)pu1_pred); 2447 2448 2449 m_temp_reg_50 = _mm_setzero_si128(); 2450 m_temp_reg_0 = _mm_unpacklo_epi8(m_temp_reg_0, m_temp_reg_50); 2451 m_temp_reg_1 = _mm_unpacklo_epi8(m_temp_reg_1, m_temp_reg_50); 2452 m_temp_reg_2 = _mm_unpacklo_epi8(m_temp_reg_2, m_temp_reg_50); 2453 m_temp_reg_3 = _mm_unpacklo_epi8(m_temp_reg_3, m_temp_reg_50); 2454 m_temp_reg_4 = _mm_unpacklo_epi8(m_temp_reg_4, m_temp_reg_50); 2455 m_temp_reg_5 = _mm_unpacklo_epi8(m_temp_reg_5, m_temp_reg_50); 2456 m_temp_reg_6 = _mm_unpacklo_epi8(m_temp_reg_6, m_temp_reg_50); 2457 m_temp_reg_7 = _mm_unpacklo_epi8(m_temp_reg_7, m_temp_reg_50); 2458 2459 m_temp_reg_50 = _mm_add_epi16(m_temp_reg_10, m_temp_reg_0); 2460 m_temp_reg_51 = _mm_add_epi16(m_temp_reg_11, m_temp_reg_1); 2461 m_temp_reg_52 = _mm_add_epi16(m_temp_reg_12, m_temp_reg_2); 2462 m_temp_reg_53 = _mm_add_epi16(m_temp_reg_13, m_temp_reg_3); 2463 m_temp_reg_54 = _mm_add_epi16(m_temp_reg_14, m_temp_reg_4); 2464 m_temp_reg_55 = _mm_add_epi16(m_temp_reg_15, m_temp_reg_5); 2465 m_temp_reg_56 = _mm_add_epi16(m_temp_reg_16, m_temp_reg_6); 2466 m_temp_reg_57 = _mm_add_epi16(m_temp_reg_17, m_temp_reg_7); 2467 2468 m_temp_reg_50 = _mm_packus_epi16(m_temp_reg_50, m_temp_reg_50); 2469 m_temp_reg_51 = _mm_packus_epi16(m_temp_reg_51, m_temp_reg_51); 2470 m_temp_reg_52 = _mm_packus_epi16(m_temp_reg_52, m_temp_reg_52); 2471 m_temp_reg_53 = _mm_packus_epi16(m_temp_reg_53, m_temp_reg_53); 2472 m_temp_reg_54 = _mm_packus_epi16(m_temp_reg_54, m_temp_reg_54); 2473 m_temp_reg_55 = _mm_packus_epi16(m_temp_reg_55, m_temp_reg_55); 2474 m_temp_reg_56 = _mm_packus_epi16(m_temp_reg_56, m_temp_reg_56); 2475 m_temp_reg_57 = _mm_packus_epi16(m_temp_reg_57, m_temp_reg_57); 2476 2477 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_50); 2478 pu1_dst += dst_strd; 2479 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_51); 2480 pu1_dst += dst_strd; 2481 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_52); 2482 pu1_dst += dst_strd; 2483 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_53); 2484 pu1_dst += dst_strd; 2485 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_54); 2486 pu1_dst += dst_strd; 2487 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_55); 2488 pu1_dst += dst_strd; 2489 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_56); 2490 pu1_dst += dst_strd; 2491 _mm_storel_epi64((__m128i *)pu1_dst, m_temp_reg_57); 2492 pu1_dst += dst_strd; 2493 2494 } 2495 2496 2497 } 2498 2499 2500 } 2501 } 2502