;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
; void vp8_idct_dequant_0_2x_sse2
;  (
;    short *qcoeff       - 0
;    short *dequant      - 1
;    unsigned char *dst  - 2
;    int dst_stride      - 3
;  )
;
; DC-only path for two blocks at once.
;   * coefficient 0 of block 0 is read from qcoeff[0], coefficient 0 of
;     block 1 from byte offset 32 (qcoeff[16]); both are multiplied by
;     dequant[0]
;   * the first 4 bytes at qcoeff+0 and at qcoeff+32 are then zeroed
;   * each product is rounded as (x + 4) >> 3 and added to four rows of
;     dst: bytes 0-3 of a row receive the block 0 value, bytes 4-7 the
;     block 1 value; results saturate to unsigned bytes (packuswb)
;
; Registers: rax = qcoeff, later dst; rdx = dequant, later dst_stride;
;            rcx = 3 * dst_stride; xmm0-xmm5 scratch.
; Arguments are accessed through the arg() / SHADOW_ARGS_TO_STACK macros
; of the included ABI support file.
;-----------------------------------------------------------------------

SECTION .text

global sym(vp8_idct_dequant_0_2x_sse2) PRIVATE
sym(vp8_idct_dequant_0_2x_sse2):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 4
    GET_GOT     rbx
    ; end prolog

    mov         rax,        arg(0)                  ; qcoeff
    mov         rdx,        arg(1)                  ; dequant

    ; word 0 <- block 0 coeff 0, word 4 <- block 1 coeff 0
    movd        xmm4,       [rax]
    pinsrw      xmm4,       [rax+32],   4

    ; dequant[0] into the same two word lanes
    movd        xmm5,       [rdx]
    pinsrw      xmm5,       [rdx],      4

    pmullw      xmm4,       xmm5                    ; dequantized DC values

    ; xmm5 becomes the zero register: used to clear the coefficients
    ; here and for byte<->word unpack/pack further down
    pxor        xmm5,       xmm5
    movd        [rax],      xmm5
    movd        [rax+32],   xmm5

    mov         rax,        arg(2)                  ; dst
    movsxd      rdx,        dword ptr arg(3)        ; dst_stride
    lea         rcx,        [rdx + rdx*2]           ; dst_stride * 3

    ; broadcast word 0 over words 0-3 and word 4 over words 4-7,
    ; then round and downshift
    pshuflw     xmm4,       xmm4,       00000000b
    pshufhw     xmm4,       xmm4,       00000000b
    paddw       xmm4,       [GLOBAL(fours)]
    psraw       xmm4,       3

    ; fetch the four 8-byte rows and widen the bytes to words
    movq        xmm0,       [rax]
    punpcklbw   xmm0,       xmm5
    movq        xmm1,       [rax+rdx]
    punpcklbw   xmm1,       xmm5
    movq        xmm2,       [rax+2*rdx]
    punpcklbw   xmm2,       xmm5
    movq        xmm3,       [rax+rcx]
    punpcklbw   xmm3,       xmm5

    ; add the DC term to every row
    paddw       xmm0,       xmm4
    paddw       xmm1,       xmm4
    paddw       xmm2,       xmm4
    paddw       xmm3,       xmm4

    ; narrow back to bytes with unsigned saturation
    packuswb    xmm0,       xmm5
    packuswb    xmm1,       xmm5
    packuswb    xmm2,       xmm5
    packuswb    xmm3,       xmm5

    ; write rows 0 and 1, advance two rows, write rows 2 and 3
    movq        [rax],      xmm0
    movq        [rax + rdx], xmm1

    lea         rax,        [rax + 2*rdx]

    movq        [rax],      xmm2
    movq        [rax + rdx], xmm3

    ; begin epilog
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
; Helper macros for the two-block ("2x") dequant + inverse DCT routines
; in this file. Every macro expands to straight-line SSE2 code, takes no
; arguments and works on fixed registers, listed per macro.
;-----------------------------------------------------------------------

; Load 32 coefficients (two blocks of 16 words) from [rax], zero those
; 64 bytes in memory, and multiply by the 16 dequant words at [rdx]
; (the same factors are applied to both blocks).
; The movdqa / pmullw memory operands require 16-byte aligned pointers.
;   in:    rax = qcoeff, rdx = dequant
;   out:   xmm0 = block 0 coeffs 0-7,  xmm2 = block 0 coeffs 8-15
;          xmm1 = block 1 coeffs 0-7,  xmm3 = block 1 coeffs 8-15
;          xmm7 = 0
%macro IDCT2X_LOAD_DEQUANT 0
    pxor        xmm7,       xmm7

    ; xmm1 and xmm2 are deliberately crossed (xmm2 takes the second half
    ; of block 0, xmm1 the first half of block 1) so that the row
    ; pairing that follows yields rows in register order
    movdqa      xmm0,       [rax]
    movdqa      xmm2,       [rax+16]
    movdqa      xmm1,       [rax+32]
    movdqa      xmm3,       [rax+48]

    ; clear the coefficient buffer
    movdqa      [rax],      xmm7
    movdqa      [rax+16],   xmm7
    movdqa      [rax+32],   xmm7
    movdqa      [rax+48],   xmm7

    ; dequantize
    pmullw      xmm0,       [rdx]
    pmullw      xmm2,       [rdx+16]
    pmullw      xmm1,       [rdx]
    pmullw      xmm3,       [rdx+16]
%endmacro

; Regroup the registers produced by IDCT2X_LOAD_DEQUANT so that row x of
; block 0 and row x of block 1 share a register.
;   in:    xmm0-xmm3 as left by IDCT2X_LOAD_DEQUANT
;   out:   xmmN (N = 0..3) = row N, block 0 in words 0-3, block 1 in
;          words 4-7
;   clobb: xmm4
%macro IDCT2X_PAIR_ROWS 0
    movdqa      xmm4,       xmm0
    punpckldq   xmm0,       xmm1
    punpckhdq   xmm4,       xmm1

    pshufd      xmm0,       xmm0,       11011000b   ; swap the two middle dwords
    pshufd      xmm1,       xmm4,       11011000b

    movdqa      xmm4,       xmm2
    punpckldq   xmm2,       xmm3
    punpckhdq   xmm4,       xmm3

    pshufd      xmm2,       xmm2,       11011000b
    pshufd      xmm3,       xmm4,       11011000b
%endmacro

; Compute the four intermediate terms of one 1-D pass.
; Below, "x * K" for a constant K stands for pmulhw(x, K) + x.
;   in:    xmm0 = ip0, xmm1 = ip1, xmm2 = ip2, xmm3 = ip3
;   out:   xmm2 = a1 = ip0 + ip2
;          xmm0 = b1 = ip0 - ip2
;          xmm7 = ip3 * x_c1sqr2less1 - ip1 * x_s1sqr2
;          xmm3 = d1 = ip1 * x_c1sqr2less1 + ip3 * x_s1sqr2
;   clobb: xmm4, xmm5 (xmm1 is left unchanged)
%macro IDCT2X_BUTTERFLY_TERMS 0
    psubw       xmm0,       xmm2            ; b1 = 0-2
    paddw       xmm2,       xmm2            ; 2 * ip2

    movdqa      xmm5,       xmm1
    paddw       xmm2,       xmm0            ; a1 = 0+2

    pmulhw      xmm5,       [GLOBAL(x_s1sqr2)]
    paddw       xmm5,       xmm1            ; ip1 * sin(pi/8) * sqrt(2)

    movdqa      xmm7,       xmm3
    pmulhw      xmm7,       [GLOBAL(x_c1sqr2less1)]

    paddw       xmm7,       xmm3            ; ip3 * cos(pi/8) * sqrt(2)
    psubw       xmm7,       xmm5            ; ip3 * cos - ip1 * sin

    movdqa      xmm5,       xmm1
    movdqa      xmm4,       xmm3

    pmulhw      xmm5,       [GLOBAL(x_c1sqr2less1)]
    paddw       xmm5,       xmm1            ; ip1 * cos(pi/8) * sqrt(2)

    pmulhw      xmm3,       [GLOBAL(x_s1sqr2)]
    paddw       xmm3,       xmm4            ; ip3 * sin(pi/8) * sqrt(2)

    paddw       xmm3,       xmm5            ; d1
%endmacro

; Combine the terms from IDCT2X_BUTTERFLY_TERMS into the four outputs of
; the pass. IDCT2X_TRANSPOSE treats xmm2 / xmm0 / xmm4 / xmm6 as
; elements 0-3 / 4-7 / 8-11 / 12-15 of each block, in that order.
;   in:    xmm2 = a1, xmm0 = b1, xmm7, xmm3 = d1
;   out:   xmm2 = a1 + d1
;          xmm0 = b1 - xmm7
;          xmm4 = b1 + xmm7
;          xmm6 = a1 - d1
%macro IDCT2X_COMBINE 0
    movdqa      xmm6,       xmm2            ; a1
    movdqa      xmm4,       xmm0            ; b1

    paddw       xmm2,       xmm3            ; a1 + d1
    paddw       xmm4,       xmm7            ; b1 + xmm7
    psubw       xmm0,       xmm7            ; b1 - xmm7
    psubw       xmm6,       xmm3            ; a1 - d1
%endmacro

; Transpose both 4x4 word blocks at once.
; Lane comments list words from high to low; the hundreds digit is the
; block number, the remaining digits the element index within the block.
;   in:    xmm2 = elements 0-3, xmm0 = 4-7, xmm4 = 8-11, xmm6 = 12-15
;          (block 0 in words 0-3, block 1 in words 4-7)
;   out:   xmm0 = elements 0,4,8,12    xmm1 = 1,5,9,13
;          xmm2 = elements 2,6,10,14   xmm3 = 3,7,11,15
;          (block 0 in words 0-3, block 1 in words 4-7)
;   clobb: xmm4, xmm5, xmm6, xmm7
%macro IDCT2X_TRANSPOSE 0
    movdqa      xmm7,       xmm2            ; 103 102 101 100 003 002 001 000
    punpcklwd   xmm2,       xmm0            ; 007 003 006 002 005 001 004 000
    punpckhwd   xmm7,       xmm0            ; 107 103 106 102 105 101 104 100

    movdqa      xmm5,       xmm4            ; 111 110 109 108 011 010 009 008
    punpcklwd   xmm4,       xmm6            ; 015 011 014 010 013 009 012 008
    punpckhwd   xmm5,       xmm6            ; 115 111 114 110 113 109 112 108


    movdqa      xmm1,       xmm2            ; 007 003 006 002 005 001 004 000
    punpckldq   xmm2,       xmm4            ; 013 009 005 001 012 008 004 000
    punpckhdq   xmm1,       xmm4            ; 015 011 007 003 014 010 006 002

    movdqa      xmm6,       xmm7            ; 107 103 106 102 105 101 104 100
    punpckldq   xmm7,       xmm5            ; 113 109 105 101 112 108 104 100
    punpckhdq   xmm6,       xmm5            ; 115 111 107 103 114 110 106 102


    movdqa      xmm5,       xmm2            ; 013 009 005 001 012 008 004 000
    punpckldq   xmm2,       xmm7            ; 112 108 012 008 104 100 004 000
    punpckhdq   xmm5,       xmm7            ; 113 109 013 009 105 101 005 001

    movdqa      xmm7,       xmm1            ; 015 011 007 003 014 010 006 002
    punpckldq   xmm1,       xmm6            ; 114 110 014 010 106 102 006 002
    punpckhdq   xmm7,       xmm6            ; 115 111 015 011 107 103 007 003

    ; swap the two middle dwords so each block's four words are adjacent
    pshufd      xmm0,       xmm2,       11011000b
    pshufd      xmm2,       xmm1,       11011000b

    pshufd      xmm1,       xmm5,       11011000b
    pshufd      xmm3,       xmm7,       11011000b
%endmacro

; Add the residual rows in xmm0-xmm3 to four 8-byte rows of dst and pack
; the sums back to bytes with unsigned saturation. Nothing is stored.
;   in:    rdi = dst, rdx = dst_stride, rcx = 3 * dst_stride
;          xmm0-xmm3 = residual rows 0-3 (8 words each)
;   out:   low 8 bytes of xmm0-xmm3 = reconstructed rows 0-3, xmm7 = 0
;   clobb: xmm4, xmm5
%macro IDCT2X_ADD_PREDICT_PACK 0
    pxor        xmm7,       xmm7

    ; rows 0 and 1
    movq        xmm4,       [rdi]
    movq        xmm5,       [rdi+rdx]

    punpcklbw   xmm4,       xmm7
    punpcklbw   xmm5,       xmm7

    paddw       xmm0,       xmm4
    paddw       xmm1,       xmm5

    ; rows 2 and 3
    movq        xmm4,       [rdi+2*rdx]
    movq        xmm5,       [rdi+rcx]

    punpcklbw   xmm4,       xmm7
    punpcklbw   xmm5,       xmm7

    paddw       xmm2,       xmm4
    paddw       xmm3,       xmm5

    ; pack up before storing
    packuswb    xmm0,       xmm7
    packuswb    xmm1,       xmm7
    packuswb    xmm2,       xmm7
    packuswb    xmm3,       xmm7
%endmacro

;-----------------------------------------------------------------------
; void vp8_idct_dequant_full_2x_sse2
;  (
;    short *qcoeff       - 0
;    short *dequant      - 1
;    unsigned char *dst  - 2
;    int dst_stride      - 3
;  )
;
; Dequantizes two blocks of 16 coefficients each (32 words at qcoeff),
; zeroes those coefficients in memory, runs a two-pass 4x4 inverse
; transform on both blocks in parallel, and adds the result (rounded
; with +4, >> 3) to four 8-byte rows of dst, saturating to unsigned
; bytes. Block 0 lands in bytes 0-3 of each row, block 1 in bytes 4-7.
;
; Registers: rax = qcoeff; rdx = dequant, then dst_stride;
;            rdi = dst; rcx = 3 * dst_stride; xmm0-xmm7 all used.
;-----------------------------------------------------------------------
global sym(vp8_idct_dequant_full_2x_sse2) PRIVATE
sym(vp8_idct_dequant_full_2x_sse2):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 4
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi                         ; saved and restored; not otherwise used here
    push        rdi
    ; end prolog

    mov         rax,        arg(0)                  ; qcoeff
    mov         rdx,        arg(1)                  ; dequant
    mov         rdi,        arg(2)                  ; dst

    IDCT2X_LOAD_DEQUANT

    ; the dequant pointer is no longer needed: rdx now carries the stride
    movsxd      rdx,        dword ptr arg(3)        ; dst_stride
    lea         rcx,        [rdx + rdx*2]           ; dst_stride * 3

    ; repack so block 0 row x and block 1 row x are together
    IDCT2X_PAIR_ROWS

    ; first pass
    IDCT2X_BUTTERFLY_TERMS
    IDCT2X_COMBINE

    ; transpose for the second pass
    IDCT2X_TRANSPOSE

    ; second pass: +4 is folded into a1 and b1 so that all four outputs
    ; are rounded before the >> 3
    IDCT2X_BUTTERFLY_TERMS
    paddw       xmm0,       [GLOBAL(fours)]
    paddw       xmm2,       [GLOBAL(fours)]
    IDCT2X_COMBINE

    psraw       xmm2,       3
    psraw       xmm0,       3
    psraw       xmm4,       3
    psraw       xmm6,       3

    ; transpose to save
    IDCT2X_TRANSPOSE

    ; load up predict blocks, add, and pack
    IDCT2X_ADD_PREDICT_PACK

    ; store blocks back out
    movq        [rdi],          xmm0
    movq        [rdi + rdx],    xmm1
    movq        [rdi + rdx*2],  xmm2
    movq        [rdi + rcx],    xmm3

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
; void vp8_idct_dequant_dc_0_2x_sse2
;  (
;    short *qcoeff       - 0
;    short *dequant      - 1
;    unsigned char *dst  - 2
;    int dst_stride      - 3
;    short *dc           - 4
;  )
;
; DC-only path for two blocks whose DC values are supplied through dc:
; dc[0] (block 0) and dc[1] (block 1) are each rounded as (x + 4) >> 3
; and added to four 8-byte rows of dst, block 0 on bytes 0-3 and block 1
; on bytes 4-7, saturating to unsigned bytes.
; This routine neither reads nor writes through qcoeff, and does not
; load dequant at all.
;
; Registers: rdi = dst; rdx = dc, then dst_stride; rcx = 3 * dst_stride;
;            xmm0-xmm5 scratch.
;-----------------------------------------------------------------------
global sym(vp8_idct_dequant_dc_0_2x_sse2) PRIVATE
sym(vp8_idct_dequant_dc_0_2x_sse2):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rdi
    ; end prolog

    mov         rax,        arg(0)                  ; qcoeff (loaded, never dereferenced)
    mov         rdi,        arg(2)                  ; dst
    mov         rdx,        arg(4)                  ; dc

    ; zero register for the byte<->word unpack/pack below
    pxor        xmm5,       xmm5

    ; one dword load picks up both 16-bit dc values
    movd        xmm4,       [rdx]

    ; duplicate dc[0] over words 0-3 and dc[1] over words 4-7
    punpcklwd   xmm4,       xmm4
    punpckldq   xmm4,       xmm4

    ; round and downshift
    paddw       xmm4,       [GLOBAL(fours)]
    psraw       xmm4,       3

    movsxd      rdx,        dword ptr arg(3)        ; dst_stride
    lea         rcx,        [rdx + rdx*2]           ; dst_stride * 3

    ; fetch the four 8-byte rows and widen the bytes to words
    movq        xmm0,       [rdi]
    punpcklbw   xmm0,       xmm5
    movq        xmm1,       [rdi+rdx*1]
    punpcklbw   xmm1,       xmm5
    movq        xmm2,       [rdi+rdx*2]
    punpcklbw   xmm2,       xmm5
    movq        xmm3,       [rdi+rcx]
    punpcklbw   xmm3,       xmm5

    ; add the dc term to every row
    paddw       xmm0,       xmm4
    paddw       xmm1,       xmm4
    paddw       xmm2,       xmm4
    paddw       xmm3,       xmm4

    ; narrow back to bytes with unsigned saturation
    packuswb    xmm0,       xmm5
    packuswb    xmm1,       xmm5
    packuswb    xmm2,       xmm5
    packuswb    xmm3,       xmm5

    ; store blocks back out
    movq        [rdi],          xmm0
    movq        [rdi + rdx],    xmm1
    movq        [rdi + rdx*2],  xmm2
    movq        [rdi + rcx],    xmm3

    ; begin epilog
    pop         rdi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
; void vp8_idct_dequant_dc_full_2x_sse2
;  (
;    short *qcoeff       - 0
;    short *dequant      - 1
;    unsigned char *dst  - 2
;    int dst_stride      - 3
;    short *dc           - 4
;  )
;
; Same processing as vp8_idct_dequant_full_2x_sse2, except that after
; dequantization coefficient 0 of block 0 is replaced by dc[0] and
; coefficient 0 of block 1 by dc[1] before the transform runs.
;
; Registers: rax = qcoeff; rdx = dequant, then dc, then dst_stride;
;            rdi = dst (advanced by two rows while storing);
;            rcx = 3 * dst_stride; xmm0-xmm7 all used.
;-----------------------------------------------------------------------
global sym(vp8_idct_dequant_dc_full_2x_sse2) PRIVATE
sym(vp8_idct_dequant_dc_full_2x_sse2):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    GET_GOT     rbx
    push        rdi
    ; end prolog

    mov         rax,        arg(0)                  ; qcoeff
    mov         rdx,        arg(1)                  ; dequant
    mov         rdi,        arg(2)                  ; dst

    IDCT2X_LOAD_DEQUANT

    ; repack so block 0 row x and block 1 row x are together
    IDCT2X_PAIR_ROWS

    ; insert DC component: xmm0 is row 0, word 0 belongs to block 0 and
    ; word 4 to block 1
    mov         rdx,        arg(4)                  ; dc
    pinsrw      xmm0,       [rdx],      0
    pinsrw      xmm0,       [rdx+2],    4

    ; first pass
    IDCT2X_BUTTERFLY_TERMS
    IDCT2X_COMBINE

    ; transpose for the second pass
    IDCT2X_TRANSPOSE

    ; second pass: +4 is folded into a1 and b1 so that all four outputs
    ; are rounded before the >> 3
    IDCT2X_BUTTERFLY_TERMS
    paddw       xmm0,       [GLOBAL(fours)]
    paddw       xmm2,       [GLOBAL(fours)]
    IDCT2X_COMBINE

    psraw       xmm2,       3
    psraw       xmm0,       3
    psraw       xmm4,       3
    psraw       xmm6,       3

    ; transpose to save
    IDCT2X_TRANSPOSE

    ; load up predict blocks, add, and pack
    movsxd      rdx,        dword ptr arg(3)        ; dst_stride
    lea         rcx,        [rdx + rdx*2]           ; dst_stride * 3
    IDCT2X_ADD_PREDICT_PACK

    ; the stride is read from its argument slot once more before the
    ; stores (rdx already holds the same value at this point)
    movsxd      rdx,        dword ptr arg(3)        ; dst_stride

    ; store rows 0 and 1, advance dst by two rows, store rows 2 and 3
    movq        [rdi],          xmm0
    movq        [rdi + rdx],    xmm1

    lea         rdi,        [rdi + 2*rdx]

    movq        [rdi],          xmm2
    movq        [rdi + rdx],    xmm3


    ; begin epilog
    pop         rdi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
; Constants, each replicated across all eight word lanes.
;   fours          rounding term added before the final >> 3
;   x_s1sqr2       0x8A8C = 35468, about sin(pi/8)*sqrt(2) * 65536.
;                  As a signed word this is 35468 - 65536, which is why
;                  the code adds the input back after pmulhw.
;   x_c1sqr2less1  0x4E7B = 20091, about (cos(pi/8)*sqrt(2) - 1) * 65536;
;                  adding the input back after pmulhw supplies the "+1".
;-----------------------------------------------------------------------
SECTION_RODATA
align 16
fours:
    times 8 dw 0x0004
align 16
x_s1sqr2:
    times 8 dw 0x8A8C
align 16
x_c1sqr2less1:
    times 8 dw 0x4E7B