;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
;unsigned int vp9_get_mb_ss_sse2
;(
;    short *src_ptr
;)
;
; Returns (in the low 32 bits of rax) the sum of squares of the 256
; consecutive signed 16-bit values starting at src_ptr: 8 iterations,
; 64 bytes per iteration.
; src_ptr must be 16-byte aligned because the data is read with movdqa.
;-----------------------------------------------------------------------
global sym(vp9_get_mb_ss_sse2) PRIVATE
sym(vp9_get_mb_ss_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 1
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 16                 ; reserved, never referenced below
    ; end prolog


    mov         rax, arg(0)             ;[src_ptr]
    mov         rcx, 8                  ; 8 iterations * 32 words = 256 words
    pxor        xmm4, xmm4              ; xmm4 = four 32-bit partial sums

.NEXTROW:
    movdqa      xmm0, [rax]
    movdqa      xmm1, [rax+16]
    movdqa      xmm2, [rax+32]
    movdqa      xmm3, [rax+48]
    pmaddwd     xmm0, xmm0              ; square words, add adjacent pairs
    pmaddwd     xmm1, xmm1
    pmaddwd     xmm2, xmm2
    pmaddwd     xmm3, xmm3

    paddd       xmm0, xmm1
    paddd       xmm2, xmm3
    paddd       xmm4, xmm0
    paddd       xmm4, xmm2

    add         rax, 0x40               ; advance 64 bytes
    dec         rcx
    jnz         .NEXTROW                ; dec leaves CF untouched, so test
                                        ; ZF only (was "ja", which also
                                        ; depended on a stale CF)

    ; fold the four partial sums into the low dword of xmm4
    movdqa      xmm3, xmm4
    psrldq      xmm4, 8
    paddd       xmm4, xmm3
    movdqa      xmm3, xmm4
    psrldq      xmm4, 4
    paddd       xmm4, xmm3
    movq        rax, xmm4               ; only the low dword is the total


    ; begin epilog
    add         rsp, 16
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------
;unsigned int vp9_get16x16var_sse2
;(
;    unsigned char   *  src_ptr,
;    int             source_stride,
;    unsigned char   *  ref_ptr,
;    int             recon_stride,
;    unsigned int    *  SSE,
;    int             *  Sum
;)
;
; Walks 16 rows of 16 bytes from src_ptr and ref_ptr (unaligned loads)
; and stores:
;    *Sum = sum of (src - ref) over all 256 byte positions
;    *SSE = sum of (src - ref)^2 over all 256 byte positions
;-----------------------------------------------------------------------
global sym(vp9_get16x16var_sse2) PRIVATE
sym(vp9_get16x16var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)             ;[src_ptr]
    mov         rdi, arg(2)             ;[ref_ptr]

    movsxd      rax, DWORD PTR arg(1)   ;[source_stride]
    movsxd      rdx, DWORD PTR arg(3)   ;[recon_stride]

    ; Prefetch the first eight rows of the source block
    lea         rcx, [rax+rax*2]        ; rcx = 3 * source_stride
    prefetcht0  [rsi]
    prefetcht0  [rsi+rax]
    prefetcht0  [rsi+rax*2]
    prefetcht0  [rsi+rcx]
    lea         rbx, [rsi+rax*4]
    prefetcht0  [rbx]
    prefetcht0  [rbx+rax]
    prefetcht0  [rbx+rax*2]
    prefetcht0  [rbx+rcx]

    ; ... and the first eight rows of the reference block
    lea         rcx, [rdx+rdx*2]        ; rcx = 3 * recon_stride
    prefetcht0  [rdi]
    prefetcht0  [rdi+rdx]
    prefetcht0  [rdi+rdx*2]
    prefetcht0  [rdi+rcx]
    lea         rbx, [rdi+rdx*4]
    prefetcht0  [rbx]
    prefetcht0  [rbx+rdx]
    prefetcht0  [rbx+rdx*2]
    prefetcht0  [rbx+rcx]

    pxor        xmm0, xmm0              ; zero, used to widen bytes to words
    pxor        xmm7, xmm7              ; eight 16-bit difference sums

    pxor        xmm6, xmm6              ; four 32-bit squared-difference sums
    mov         rcx, 16                 ; row counter

.var16loop:
    movdqu      xmm1, XMMWORD PTR [rsi]
    movdqu      xmm2, XMMWORD PTR [rdi]

    prefetcht0  [rsi+rax*8]             ; stay eight rows ahead
    prefetcht0  [rdi+rdx*8]

    movdqa      xmm3, xmm1
    movdqa      xmm4, xmm2


    punpcklbw   xmm1, xmm0              ; src bytes 0-7  -> words
    punpckhbw   xmm3, xmm0              ; src bytes 8-15 -> words

    punpcklbw   xmm2, xmm0              ; ref bytes 0-7  -> words
    punpckhbw   xmm4, xmm0              ; ref bytes 8-15 -> words


    psubw       xmm1, xmm2              ; differences, low half
    psubw       xmm3, xmm4              ; differences, high half

    paddw       xmm7, xmm1
    pmaddwd     xmm1, xmm1

    paddw       xmm7, xmm3
    pmaddwd     xmm3, xmm3

    paddd       xmm6, xmm1
    paddd       xmm6, xmm3

    add         rsi, rax
    add         rdi, rdx

    sub         rcx, 1
    jnz         .var16loop


    movdqa      xmm1, xmm6              ; xmm1 = squared-difference sums
    pxor        xmm6, xmm6

    pxor        xmm5, xmm5
    punpcklwd   xmm6, xmm7              ; sum words 0-3 into upper dword halves

    punpckhwd   xmm5, xmm7              ; sum words 4-7 likewise
    psrad       xmm5, 16                ; arithmetic shift = sign-extend

    psrad       xmm6, 16
    paddd       xmm6, xmm5              ; four 32-bit difference sums

    movdqa      xmm2, xmm1
    punpckldq   xmm1, xmm0

    punpckhdq   xmm2, xmm0
    movdqa      xmm7, xmm6

    paddd       xmm1, xmm2
    punpckldq   xmm6, xmm0

    punpckhdq   xmm7, xmm0
    paddd       xmm6, xmm7

    movdqa      xmm2, xmm1
    movdqa      xmm7, xmm6

    psrldq      xmm1, 8
    psrldq      xmm6, 8

    paddd       xmm7, xmm6              ; low dword = total of differences
    paddd       xmm1, xmm2              ; low dword = total of squares

    mov         rax, arg(5)             ;[Sum]
    mov         rdi, arg(4)             ;[SSE]

    movd        DWORD PTR [rax], xmm7
    movd        DWORD PTR [rdi], xmm1


    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret




;-----------------------------------------------------------------------
;unsigned int vp9_get8x8var_sse2
;(
;    unsigned char   *  src_ptr,
;    int             source_stride,
;    unsigned char   *  ref_ptr,
;    int             recon_stride,
;    unsigned int    *  SSE,
;    int             *  Sum
;)
;
; Fully unrolled over 8 rows of 8 bytes.  Stores:
;    *Sum = sum of (src - ref), reduced in 16-bit lanes and then
;           sign-extended from 16 bits
;    *SSE = sum of (src - ref)^2
;-----------------------------------------------------------------------
global sym(vp9_get8x8var_sse2) PRIVATE
sym(vp9_get8x8var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 16                 ; reserved, never referenced below
    ; end prolog

    mov         rsi, arg(0)             ;[src_ptr]
    mov         rdi, arg(2)             ;[ref_ptr]

    movsxd      rax, DWORD PTR arg(1)   ;[source_stride]
    movsxd      rdx, DWORD PTR arg(3)   ;[recon_stride]

    pxor        xmm0, xmm0              ; zero, used to widen bytes to words
    pxor        xmm7, xmm7              ; eight 16-bit difference sums

    ; row 0 -- xmm1 becomes the squared-difference accumulator
    movq        xmm1, QWORD PTR [rsi]
    movq        xmm2, QWORD PTR [rdi]

    punpcklbw   xmm1, xmm0
    punpcklbw   xmm2, xmm0

    psubsw      xmm1, xmm2
    paddw       xmm7, xmm1

    pmaddwd     xmm1, xmm1

    ; row 1
    movq        xmm2, QWORD PTR [rsi + rax]
    movq        xmm3, QWORD PTR [rdi + rdx]

    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2

    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2


    ; row 2
    movq        xmm2, QWORD PTR [rsi + rax * 2]
    movq        xmm3, QWORD PTR [rdi + rdx * 2]

    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2

    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2


    ; row 3 (both pointers advanced by two rows)
    lea         rsi, [rsi + rax * 2]
    lea         rdi, [rdi + rdx * 2]
    movq        xmm2, QWORD PTR [rsi + rax]
    movq        xmm3, QWORD PTR [rdi + rdx]

    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2

    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2

    ; row 4
    movq        xmm2, QWORD PTR [rsi + rax * 2]
    movq        xmm3, QWORD PTR [rdi + rdx * 2]

    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2

    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2


    ; row 5 (both pointers advanced by two rows)
    lea         rsi, [rsi + rax * 2]
    lea         rdi, [rdi + rdx * 2]


    movq        xmm2, QWORD PTR [rsi + rax]
    movq        xmm3, QWORD PTR [rdi + rdx]

    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2

    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2

    ; row 6
    movq        xmm2, QWORD PTR [rsi + rax * 2]
    movq        xmm3, QWORD PTR [rdi + rdx * 2]

    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2

    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2


    ; row 7 (both pointers advanced by two rows)
    lea         rsi, [rsi + rax * 2]
    lea         rdi, [rdi + rdx * 2]

    movq        xmm2, QWORD PTR [rsi + rax]
    movq        xmm3, QWORD PTR [rdi + rdx]

    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2

    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2


    ; Reduce.  The difference sums are folded with 16-bit adds, so the
    ; total lives in the low word and is sign-extended by movsx below.
    movdqa      xmm6, xmm7
    punpcklwd   xmm6, xmm0

    punpckhwd   xmm7, xmm0
    movdqa      xmm2, xmm1

    paddw       xmm6, xmm7
    punpckldq   xmm1, xmm0

    punpckhdq   xmm2, xmm0
    movdqa      xmm7, xmm6

    paddd       xmm1, xmm2
    punpckldq   xmm6, xmm0

    punpckhdq   xmm7, xmm0
    paddw       xmm6, xmm7

    movdqa      xmm2, xmm1
    movdqa      xmm7, xmm6

    psrldq      xmm1, 8
    psrldq      xmm6, 8

    paddw       xmm7, xmm6              ; low word  = total of differences
    paddd       xmm1, xmm2              ; low dword = total of squares

    mov         rax, arg(5)             ;[Sum]
    mov         rdi, arg(4)             ;[SSE]

    movq        rdx, xmm7
    movsx       rcx, dx                 ; sign-extend the 16-bit total

    mov         dword ptr [rax], ecx
    movd        DWORD PTR [rdi], xmm1

    ; begin epilog
    add         rsp, 16
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
;void vp9_half_horiz_vert_variance8x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; For each of Height rows, 8 pixels wide: the prediction is the pavgb
; average of two consecutive ref rows, each of which is first pavgb
; averaged with itself shifted by one byte.  Stores:
;    *sum        = sum of (prediction - src)
;    *sumsquared = sum of (prediction - src)^2
; Reads Height+1 rows of 9 bytes from ref_ptr.  Height must be at least
; 1: the loop body runs before the counter is tested.
; The final reduction uses SSE2 only, so no MMX state is left behind.
;-----------------------------------------------------------------------
global sym(vp9_half_horiz_vert_variance8x_h_sse2) PRIVATE
sym(vp9_half_horiz_vert_variance8x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(1)    ;ref_pixels_per_line
    movsxd      r9, dword ptr arg(3)    ;src_pixels_per_line
%endif

    pxor        xmm6, xmm6              ; eight 16-bit difference sums
    pxor        xmm7, xmm7              ; four 32-bit squared-difference sums
    mov         rsi, arg(0)             ;ref_ptr

    mov         rdi, arg(2)             ;src_ptr
    movsxd      rcx, dword ptr arg(4)   ;Height

    pxor        xmm0, xmm0              ; zero, used to widen bytes to words

    movq        xmm5, QWORD PTR [rsi]   ; xmm5 = ref row 0, bytes 0..7
    movq        xmm3, QWORD PTR [rsi+1] ; xmm3 = ref row 0, bytes 1..8
    pavgb       xmm5, xmm3              ; xmm5 = horizontal average of row 0

%if ABI_IS_32BIT
    add         rsi, dword ptr arg(1)   ;ref_pixels_per_line ; next ref row
%else
    add         rsi, r8
%endif

.half_horiz_vert_variance8x_h_1:

    movq        xmm1, QWORD PTR [rsi]   ; next ref row, bytes 0..7
    movq        xmm2, QWORD PTR [rsi+1] ; next ref row, bytes 1..8
    pavgb       xmm1, xmm2              ; xmm1 = horizontal average of row i+1

    pavgb       xmm5, xmm1              ; vertical average of rows i and i+1
    punpcklbw   xmm5, xmm0              ; widen to words

    movq        xmm3, QWORD PTR [rdi]   ; xmm3 = src row, bytes 0..7
    punpcklbw   xmm3, xmm0              ; widen to words

    psubw       xmm5, xmm3              ; xmm5 = prediction - src
    paddw       xmm6, xmm5              ; accumulate differences
    pmaddwd     xmm5, xmm5              ; square and add adjacent pairs
    paddd       xmm7, xmm5              ; accumulate squared differences

    movdqa      xmm5, xmm1              ; row i+1 becomes row i next time

%if ABI_IS_32BIT
    add         esi, dword ptr arg(1)   ;ref_pixels_per_line ; next ref row
    add         edi, dword ptr arg(3)   ;src_pixels_per_line ; next src row
%else
    add         rsi, r8
    add         rdi, r9
%endif

    sub         rcx, 1
    jnz         .half_horiz_vert_variance8x_h_1

    ; sign-extend the eight difference sums to 32 bits and add them up
    pxor        xmm1, xmm1
    pxor        xmm2, xmm2
    punpcklwd   xmm1, xmm6              ; words 0-3 into upper dword halves
    punpckhwd   xmm2, xmm6              ; words 4-7 likewise
    psrad       xmm1, 16                ; arithmetic shift = sign-extend
    psrad       xmm2, 16
    paddd       xmm1, xmm2              ; four 32-bit partial sums

    movdqa      xmm2, xmm1
    psrldq      xmm2, 8
    paddd       xmm1, xmm2
    movdqa      xmm2, xmm1
    psrldq      xmm2, 4
    paddd       xmm1, xmm2              ; low dword = total of differences

    ; add up the four squared-difference sums
    movdqa      xmm3, xmm7
    psrldq      xmm3, 8
    paddd       xmm7, xmm3
    movdqa      xmm3, xmm7
    psrldq      xmm3, 4
    paddd       xmm7, xmm3              ; low dword = total of squares

    mov         rsi, arg(5)             ; sum
    mov         rdi, arg(6)             ; sumsquared

    movd        [rsi], xmm1
    movd        [rdi], xmm7


    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
;void vp9_half_vert_variance8x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; For each of Height rows, 8 pixels wide: the prediction is the pavgb
; average of a ref row and the ref row below it.  Stores:
;    *sum        = sum of (prediction - src)
;    *sumsquared = sum of (prediction - src)^2
; Reads Height+1 rows of 8 bytes from ref_ptr.  Height must be at least
; 1: the loop body runs before the counter is tested.
; The final reduction uses SSE2 only, so no MMX state is left behind.
;-----------------------------------------------------------------------
global sym(vp9_half_vert_variance8x_h_sse2) PRIVATE
sym(vp9_half_vert_variance8x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(1)    ;ref_pixels_per_line
    movsxd      r9, dword ptr arg(3)    ;src_pixels_per_line
%endif

    pxor        xmm6, xmm6              ; eight 16-bit difference sums
    pxor        xmm7, xmm7              ; four 32-bit squared-difference sums
    mov         rsi, arg(0)             ;ref_ptr

    mov         rdi, arg(2)             ;src_ptr
    movsxd      rcx, dword ptr arg(4)   ;Height
    movsxd      rax, dword ptr arg(1)   ;ref_pixels_per_line

    pxor        xmm0, xmm0              ; zero, used to widen bytes to words
.half_vert_variance8x_h_1:
    movq        xmm5, QWORD PTR [rsi]     ; ref row i,   bytes 0..7
    movq        xmm3, QWORD PTR [rsi+rax] ; ref row i+1, bytes 0..7

    pavgb       xmm5, xmm3              ; vertical average of the two rows
    punpcklbw   xmm5, xmm0              ; widen to words

    movq        xmm3, QWORD PTR [rdi]   ; xmm3 = src row, bytes 0..7
    punpcklbw   xmm3, xmm0              ; widen to words

    psubw       xmm5, xmm3              ; xmm5 = prediction - src
    paddw       xmm6, xmm5              ; accumulate differences
    pmaddwd     xmm5, xmm5              ; square and add adjacent pairs
    paddd       xmm7, xmm5              ; accumulate squared differences

%if ABI_IS_32BIT
    add         esi, dword ptr arg(1)   ;ref_pixels_per_line ; next ref row
    add         edi, dword ptr arg(3)   ;src_pixels_per_line ; next src row
%else
    add         rsi, r8
    add         rdi, r9
%endif

    sub         rcx, 1
    jnz         .half_vert_variance8x_h_1

    ; sign-extend the eight difference sums to 32 bits and add them up
    pxor        xmm1, xmm1
    pxor        xmm2, xmm2
    punpcklwd   xmm1, xmm6              ; words 0-3 into upper dword halves
    punpckhwd   xmm2, xmm6              ; words 4-7 likewise
    psrad       xmm1, 16                ; arithmetic shift = sign-extend
    psrad       xmm2, 16
    paddd       xmm1, xmm2              ; four 32-bit partial sums

    movdqa      xmm2, xmm1
    psrldq      xmm2, 8
    paddd       xmm1, xmm2
    movdqa      xmm2, xmm1
    psrldq      xmm2, 4
    paddd       xmm1, xmm2              ; low dword = total of differences

    ; add up the four squared-difference sums
    movdqa      xmm3, xmm7
    psrldq      xmm3, 8
    paddd       xmm7, xmm3
    movdqa      xmm3, xmm7
    psrldq      xmm3, 4
    paddd       xmm7, xmm3              ; low dword = total of squares

    mov         rsi, arg(5)             ; sum
    mov         rdi, arg(6)             ; sumsquared

    movd        [rsi], xmm1
    movd        [rdi], xmm7


    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------
;void vp9_half_horiz_variance8x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; For each of Height rows, 8 pixels wide: the prediction is the pavgb
; average of a ref row and the same row shifted by one byte.  Stores:
;    *sum        = sum of (prediction - src)
;    *sumsquared = sum of (prediction - src)^2
; Reads Height rows of 9 bytes from ref_ptr.  Height must be at least
; 1: the loop body runs before the counter is tested.
; The final reduction uses SSE2 only, so no MMX state is left behind.
;-----------------------------------------------------------------------
global sym(vp9_half_horiz_variance8x_h_sse2) PRIVATE
sym(vp9_half_horiz_variance8x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(1)    ;ref_pixels_per_line
    movsxd      r9, dword ptr arg(3)    ;src_pixels_per_line
%endif

    pxor        xmm6, xmm6              ; eight 16-bit difference sums
    pxor        xmm7, xmm7              ; four 32-bit squared-difference sums
    mov         rsi, arg(0)             ;ref_ptr

    mov         rdi, arg(2)             ;src_ptr
    movsxd      rcx, dword ptr arg(4)   ;Height

    pxor        xmm0, xmm0              ; zero, used to widen bytes to words
.half_horiz_variance8x_h_1:
    movq        xmm5, QWORD PTR [rsi]   ; ref row, bytes 0..7
    movq        xmm3, QWORD PTR [rsi+1] ; ref row, bytes 1..8

    pavgb       xmm5, xmm3              ; horizontal average
    punpcklbw   xmm5, xmm0              ; widen to words

    movq        xmm3, QWORD PTR [rdi]   ; xmm3 = src row, bytes 0..7
    punpcklbw   xmm3, xmm0              ; widen to words

    psubw       xmm5, xmm3              ; xmm5 = prediction - src
    paddw       xmm6, xmm5              ; accumulate differences
    pmaddwd     xmm5, xmm5              ; square and add adjacent pairs
    paddd       xmm7, xmm5              ; accumulate squared differences

%if ABI_IS_32BIT
    add         esi, dword ptr arg(1)   ;ref_pixels_per_line ; next ref row
    add         edi, dword ptr arg(3)   ;src_pixels_per_line ; next src row
%else
    add         rsi, r8
    add         rdi, r9
%endif
    sub         rcx, 1
    jnz         .half_horiz_variance8x_h_1

    ; sign-extend the eight difference sums to 32 bits and add them up
    pxor        xmm1, xmm1
    pxor        xmm2, xmm2
    punpcklwd   xmm1, xmm6              ; words 0-3 into upper dword halves
    punpckhwd   xmm2, xmm6              ; words 4-7 likewise
    psrad       xmm1, 16                ; arithmetic shift = sign-extend
    psrad       xmm2, 16
    paddd       xmm1, xmm2              ; four 32-bit partial sums

    movdqa      xmm2, xmm1
    psrldq      xmm2, 8
    paddd       xmm1, xmm2
    movdqa      xmm2, xmm1
    psrldq      xmm2, 4
    paddd       xmm1, xmm2              ; low dword = total of differences

    ; add up the four squared-difference sums
    movdqa      xmm3, xmm7
    psrldq      xmm3, 8
    paddd       xmm7, xmm3
    movdqa      xmm3, xmm7
    psrldq      xmm3, 4
    paddd       xmm7, xmm3              ; low dword = total of squares

    mov         rsi, arg(5)             ; sum
    mov         rdi, arg(6)             ; sumsquared

    movd        [rsi], xmm1
    movd        [rdi], xmm7


    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret