;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------------
; Shared building blocks.
;
; Every routine in this file processes TWO 4x4 blocks at once:
;   - qcoeff holds block 0 at byte offset 0 and block 1 at byte offset 32
;     (16 shorts each),
;   - dst rows are 8 bytes wide: bytes 0-3 belong to block 0, bytes 4-7 to
;     block 1.
;
; Only rax, rcx and rdx are used as general-purpose scratch, so no
; callee-saved general-purpose register has to be preserved by hand.
;-----------------------------------------------------------------------------

; IDCT2X_DEQUANT_PAIR_ROWS
;   In:    rax = qcoeff (read with movdqa, so it must be 16-byte aligned)
;          rdx = dequant (used as an SSE memory operand, 16-byte aligned)
;   Out:   xmm0..xmm3 = dequantized rows 0..3; in each register words 0-3
;          come from block 0 and words 4-7 from block 1.
;          All 64 bytes at [rax] are zeroed.
;   Clobb: xmm4, xmm7
%macro IDCT2X_DEQUANT_PAIR_ROWS 0
    pxor        xmm7,       xmm7            ; zero, used to clear qcoeff

    ; note xmm1 and xmm2 are loaded "crossed" (xmm2 = second half of
    ; block 0, xmm1 = first half of block 1); the unpack/shuffle below
    ; relies on this to produce sensible data
    movdqa      xmm0,       [rax]
    movdqa      xmm2,       [rax+16]
    movdqa      xmm1,       [rax+32]
    movdqa      xmm3,       [rax+48]

    ; clear out coeffs
    movdqa      [rax],      xmm7
    movdqa      [rax+16],   xmm7
    movdqa      [rax+32],   xmm7
    movdqa      [rax+48],   xmm7

    ; dequantize: the same 16 dequant factors are applied to both blocks
    pmullw      xmm0,       [rdx]
    pmullw      xmm2,       [rdx+16]
    pmullw      xmm1,       [rdx]
    pmullw      xmm3,       [rdx+16]

    ; repack so block 0 row x and block 1 row x are together
    movdqa      xmm4,       xmm0
    punpckldq   xmm0,       xmm1
    punpckhdq   xmm4,       xmm1

    pshufd      xmm0,       xmm0,       11011000b
    pshufd      xmm1,       xmm4,       11011000b

    movdqa      xmm4,       xmm2
    punpckldq   xmm2,       xmm3
    punpckhdq   xmm4,       xmm3

    pshufd      xmm2,       xmm2,       11011000b
    pshufd      xmm3,       xmm4,       11011000b
%endmacro

; IDCT2X_PASS round
;   One 1-D pass over both blocks.
;   In:    xmm0..xmm3 = inputs ip0..ip3 (8 words each)
;          round      = 0: plain pass
;                       1: add 4 before the final butterflies and shift
;                          every output right by 3 (arithmetic)
;   Out:   xmm2, xmm0, xmm4, xmm6 -- in that order these are the four
;          rows IDCT2X_TRANSPOSE expects.
;   Clobb: xmm1, xmm3, xmm5, xmm7
;
;   pmulhw keeps the high 16 bits of a signed 16x16 product, so each
;   "pmulhw x, K / paddw x_orig" pair adds the input back to finish the
;   scaling (x_s1sqr2 has its top bit set, x_c1sqr2less1 stores the
;   factor "less 1").
%macro IDCT2X_PASS 1
    psubw       xmm0,       xmm2            ; b1 = ip0 - ip2
    paddw       xmm2,       xmm2            ;

    movdqa      xmm5,       xmm1
    paddw       xmm2,       xmm0            ; a1 = ip0 + ip2

    pmulhw      xmm5,       [GLOBAL(x_s1sqr2)]
    paddw       xmm5,       xmm1            ; ip1 * sin(pi/8) * sqrt(2)

    movdqa      xmm7,       xmm3
    pmulhw      xmm7,       [GLOBAL(x_c1sqr2less1)]

    paddw       xmm7,       xmm3            ; ip3 * cos(pi/8) * sqrt(2)
    psubw       xmm7,       xmm5            ; c1 = (ip3 * cos term) - (ip1 * sin term)

    movdqa      xmm5,       xmm1
    movdqa      xmm4,       xmm3

    pmulhw      xmm5,       [GLOBAL(x_c1sqr2less1)]
    paddw       xmm5,       xmm1            ; ip1 * cos(pi/8) * sqrt(2)

    pmulhw      xmm3,       [GLOBAL(x_s1sqr2)]
    paddw       xmm3,       xmm4            ; ip3 * sin(pi/8) * sqrt(2)

    paddw       xmm3,       xmm5            ; d1 = sum of the two terms above
%if %1 != 0
    ; rounding: adding 4 to a1 and b1 rounds all four outputs
    paddw       xmm0,       [GLOBAL(fours)]
    paddw       xmm2,       [GLOBAL(fours)]
%endif
    movdqa      xmm6,       xmm2            ; a1
    movdqa      xmm4,       xmm0            ; b1

    paddw       xmm2,       xmm3            ; a1 + d1
    paddw       xmm4,       xmm7            ; b1 + c1
    psubw       xmm0,       xmm7            ; b1 - c1
    psubw       xmm6,       xmm3            ; a1 - d1
%if %1 != 0
    psraw       xmm2,       3
    psraw       xmm0,       3
    psraw       xmm4,       3
    psraw       xmm6,       3
%endif
%endmacro

; IDCT2X_TRANSPOSE
;   Transposes both 4x4 blocks independently.
;   In:    rows in xmm2, xmm0, xmm4, xmm6 (output order of IDCT2X_PASS)
;   Out:   xmm0..xmm3 = transposed rows 0..3, block 0 in words 0-3 and
;          block 1 in words 4-7
;   Clobb: xmm4..xmm7
;   The lane comments list elements high word first; the leading digit is
;   the block number, the last two digits the element index.
%macro IDCT2X_TRANSPOSE 0
    movdqa      xmm7,       xmm2            ; 103 102 101 100 003 002 001 000
    punpcklwd   xmm2,       xmm0            ; 007 003 006 002 005 001 004 000
    punpckhwd   xmm7,       xmm0            ; 107 103 106 102 105 101 104 100

    movdqa      xmm5,       xmm4            ; 111 110 109 108 011 010 009 008
    punpcklwd   xmm4,       xmm6            ; 015 011 014 010 013 009 012 008
    punpckhwd   xmm5,       xmm6            ; 115 111 114 110 113 109 112 108


    movdqa      xmm1,       xmm2            ; 007 003 006 002 005 001 004 000
    punpckldq   xmm2,       xmm4            ; 013 009 005 001 012 008 004 000
    punpckhdq   xmm1,       xmm4            ; 015 011 007 003 014 010 006 002

    movdqa      xmm6,       xmm7            ; 107 103 106 102 105 101 104 100
    punpckldq   xmm7,       xmm5            ; 113 109 105 101 112 108 104 100
    punpckhdq   xmm6,       xmm5            ; 115 111 107 103 114 110 106 102


    movdqa      xmm5,       xmm2            ; 013 009 005 001 012 008 004 000
    punpckldq   xmm2,       xmm7            ; 112 108 012 008 104 100 004 000
    punpckhdq   xmm5,       xmm7            ; 113 109 013 009 105 101 005 001

    movdqa      xmm7,       xmm1            ; 015 011 007 003 014 010 006 002
    punpckldq   xmm1,       xmm6            ; 114 110 014 010 106 102 006 002
    punpckhdq   xmm7,       xmm6            ; 115 111 015 011 107 103 007 003

    ; move each block's four words next to each other again
    pshufd      xmm0,       xmm2,       11011000b
    pshufd      xmm2,       xmm1,       11011000b

    pshufd      xmm1,       xmm5,       11011000b
    pshufd      xmm3,       xmm7,       11011000b
%endmacro

; IDCT2X_ADD_ROWS_AND_STORE
;   Adds four rows of 16-bit residual to the prediction already in dst and
;   writes the result back with unsigned saturation to bytes.
;   In:    rax = dst, rdx = dst_stride, rcx = 3 * dst_stride
;          xmm0..xmm3 = residual rows 0..3
;   Clobb: xmm0..xmm5, xmm7
%macro IDCT2X_ADD_ROWS_AND_STORE 0
    pxor        xmm7,       xmm7            ; zero, for unpacking/packing

    ; load up predict blocks and widen bytes to words
    movq        xmm4,       [rax]
    movq        xmm5,       [rax+rdx]

    punpcklbw   xmm4,       xmm7
    punpcklbw   xmm5,       xmm7

    paddw       xmm0,       xmm4
    paddw       xmm1,       xmm5

    movq        xmm4,       [rax+2*rdx]
    movq        xmm5,       [rax+rcx]

    punpcklbw   xmm4,       xmm7
    punpcklbw   xmm5,       xmm7

    paddw       xmm2,       xmm4
    paddw       xmm3,       xmm5

    ; pack up before storing
    packuswb    xmm0,       xmm7
    packuswb    xmm1,       xmm7
    packuswb    xmm2,       xmm7
    packuswb    xmm3,       xmm7

    ; store blocks back out
    movq        [rax],          xmm0
    movq        [rax + rdx],    xmm1
    movq        [rax + 2*rdx],  xmm2
    movq        [rax + rcx],    xmm3
%endmacro

; IDCT2X_ROUND_DC_ADD_AND_STORE
;   DC-only reconstruction: computes (dc + 4) >> 3 per word, adds that to
;   all four prediction rows in dst and writes the result back with
;   unsigned saturation to bytes.
;   In:    rax = dst, rdx = dst_stride, rcx = 3 * dst_stride
;          xmm4 = DC value, block 0's in words 0-3, block 1's in words 4-7
;          xmm5 = zero
;   Clobb: xmm0..xmm4
%macro IDCT2X_ROUND_DC_ADD_AND_STORE 0
    ; rounding and downshift
    paddw       xmm4,       [GLOBAL(fours)]
    psraw       xmm4,       3

    ; load up predict blocks
    movq        xmm0,       [rax]
    movq        xmm1,       [rax+rdx]
    movq        xmm2,       [rax+2*rdx]
    movq        xmm3,       [rax+rcx]

    ; predict buffer needs to be expanded from bytes to words
    punpcklbw   xmm0,       xmm5
    punpcklbw   xmm1,       xmm5
    punpcklbw   xmm2,       xmm5
    punpcklbw   xmm3,       xmm5

    ; add to predict buffer
    paddw       xmm0,       xmm4
    paddw       xmm1,       xmm4
    paddw       xmm2,       xmm4
    paddw       xmm3,       xmm4

    ; pack up before storing
    packuswb    xmm0,       xmm5
    packuswb    xmm1,       xmm5
    packuswb    xmm2,       xmm5
    packuswb    xmm3,       xmm5

    ; store blocks back out
    movq        [rax],          xmm0
    movq        [rax + rdx],    xmm1
    movq        [rax + 2*rdx],  xmm2
    movq        [rax + rcx],    xmm3
%endmacro

;void vp8_idct_dequant_0_2x_sse2
; (
;   short *qcoeff       - 0
;   short *dequant      - 1
;   unsigned char *dst  - 2
;   int dst_stride      - 3
; )
;
; DC-only path for two blocks: dequantizes coefficient 0 of each block
; with dequant[0], rounds it with (x + 4) >> 3 and adds it to the 4 rows
; of 8 bytes at dst, in place. The first two coefficients (4 bytes) of
; each block in qcoeff are cleared.
; Clobbers rax, rcx, rdx, xmm0-xmm5.
global sym(vp8_idct_dequant_0_2x_sse2) PRIVATE
sym(vp8_idct_dequant_0_2x_sse2):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 4
    GET_GOT     rbx
    ; end prolog

    mov         rdx,        arg(1)          ; dequant
    mov         rax,        arg(0)          ; qcoeff

    ; word 0 = block 0 coeff 0, word 4 = block 1 coeff 0
    ; (word 1 also gets loaded and multiplied; the shuffles below drop it)
    movd        xmm4,       [rax]
    movd        xmm5,       [rdx]

    pinsrw      xmm4,       [rax+32],   4
    pinsrw      xmm5,       [rdx],      4

    pmullw      xmm4,       xmm5

    ; zero out xmm5, for use unpacking
    pxor        xmm5,       xmm5

    ; clear coeffs
    movd        [rax],      xmm5
    movd        [rax+32],   xmm5

    mov         rax,        arg(2)          ; dst
    movsxd      rdx,        dword ptr arg(3) ; dst_stride
    lea         rcx,        [rdx + rdx*2]   ; dst_stride * 3

    ; broadcast word 0 over words 0-3 and word 4 over words 4-7
    ; NOTE(review): the original carried a bare ";pshufb" remark here,
    ; presumably a reminder that one byte shuffle could replace this pair.
    pshuflw     xmm4,       xmm4,       00000000b
    pshufhw     xmm4,       xmm4,       00000000b

    IDCT2X_ROUND_DC_ADD_AND_STORE

    ; begin epilog
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_idct_dequant_full_2x_sse2
; (
;   short *qcoeff       - 0
;   short *dequant      - 1
;   unsigned char *dst  - 2
;   int dst_stride      - 3
; )
;
; Full path for two blocks: dequantizes all 16 coefficients of each
; block, runs the two-pass 4x4 inverse transform (second pass rounds with
; (x + 4) >> 3), and adds the result to the 4 rows of 8 bytes at dst, in
; place. All 32 coefficients in qcoeff are cleared.
; Clobbers rax, rcx, rdx; uses xmm0-xmm7 (SAVE_XMM 7 / RESTORE_XMM come
; from x86_abi_support.asm -- presumably they preserve xmm6/xmm7 where
; the ABI requires it).
global sym(vp8_idct_dequant_full_2x_sse2) PRIVATE
sym(vp8_idct_dequant_full_2x_sse2):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 4
    SAVE_XMM 7
    GET_GOT     rbx
    ; end prolog

    mov         rax,        arg(0)          ; qcoeff
    mov         rdx,        arg(1)          ; dequant

    IDCT2X_DEQUANT_PAIR_ROWS

    ; qcoeff/dequant are no longer needed: reuse the registers
    mov         rax,        arg(2)          ; dst
    movsxd      rdx,        dword ptr arg(3) ; dst_stride
    lea         rcx,        [rdx + rdx*2]   ; dst_stride * 3

    ; first pass
    IDCT2X_PASS 0

    ; transpose for the second pass
    IDCT2X_TRANSPOSE

    ; second pass, with rounding and downshift
    IDCT2X_PASS 1

    ; transpose to save
    IDCT2X_TRANSPOSE

    IDCT2X_ADD_ROWS_AND_STORE

    ; begin epilog
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_idct_dequant_dc_0_2x_sse2
; (
;   short *qcoeff       - 0
;   short *dequant      - 1
;   unsigned char *dst  - 2
;   int dst_stride      - 3
;   short *dc           - 4
; )
;
; Special case when the 2 blocks have 0 or 1 coeffs: the DC of each block
; is taken from dc[0] / dc[1], so qcoeff and dequant are not read at all.
; Each DC is rounded with (x + 4) >> 3 and added to the 4 rows of 8 bytes
; at dst, in place.
; Clobbers rax, rcx, rdx, xmm0-xmm5.
global sym(vp8_idct_dequant_dc_0_2x_sse2) PRIVATE
sym(vp8_idct_dequant_dc_0_2x_sse2):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    ; end prolog

    mov         rdx,        arg(4)          ; dc

    ; zero out xmm5, for use unpacking
    pxor        xmm5,       xmm5

    ; load up 2 dc words here == 2*16 = doubleword
    movd        xmm4,       [rdx]

    mov         rax,        arg(2)          ; dst
    movsxd      rdx,        dword ptr arg(3) ; dst_stride
    lea         rcx,        [rdx + rdx*2]   ; dst_stride * 3

    ; duplicate and expand dc across: dc[0] -> words 0-3, dc[1] -> words 4-7
    punpcklwd   xmm4,       xmm4
    punpckldq   xmm4,       xmm4

    IDCT2X_ROUND_DC_ADD_AND_STORE

    ; begin epilog
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_idct_dequant_dc_full_2x_sse2
; (
;   short *qcoeff       - 0
;   short *dequant      - 1
;   unsigned char *dst  - 2
;   int dst_stride      - 3
;   short *dc           - 4
; )
;
; Same as vp8_idct_dequant_full_2x_sse2, except that after dequantization
; coefficient 0 of block 0 / block 1 is overwritten with dc[0] / dc[1]
; (the dc values are NOT multiplied by dequant).
; Clobbers rax, rcx, rdx; uses xmm0-xmm7 (SAVE_XMM 7 / RESTORE_XMM come
; from x86_abi_support.asm -- presumably they preserve xmm6/xmm7 where
; the ABI requires it).
global sym(vp8_idct_dequant_dc_full_2x_sse2) PRIVATE
sym(vp8_idct_dequant_dc_full_2x_sse2):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    GET_GOT     rbx
    ; end prolog

    mov         rax,        arg(0)          ; qcoeff
    mov         rdx,        arg(1)          ; dequant

    IDCT2X_DEQUANT_PAIR_ROWS

    ; insert DC component: xmm0 is row 0, word 0 = block 0, word 4 = block 1
    mov         rdx,        arg(4)          ; dc
    pinsrw      xmm0,       [rdx],      0
    pinsrw      xmm0,       [rdx+2],    4

    mov         rax,        arg(2)          ; dst
    movsxd      rdx,        dword ptr arg(3) ; dst_stride
    lea         rcx,        [rdx + rdx*2]   ; dst_stride * 3

    ; first pass
    IDCT2X_PASS 0

    ; transpose for the second pass
    IDCT2X_TRANSPOSE

    ; second pass, with rounding and downshift
    IDCT2X_PASS 1

    ; transpose to save
    IDCT2X_TRANSPOSE

    IDCT2X_ADD_ROWS_AND_STORE

    ; begin epilog
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
align 16
fours:
    times 8 dw 0x0004
align 16
x_s1sqr2:
    times 8 dw 0x8A8C
align 16
x_c1sqr2less1:
    times 8 dw 0x4E7B