;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

; MMX kernels for sum / sum-of-squares statistics over small pixel blocks.
;
; NOTE(review): none of the routines below executes emms before returning;
; presumably the callers clear the MMX state themselves - confirm.

%include "vpx_ports/x86_abi_support.asm"

; Right shift applied after each 2-tap filter pass (rounding term is 64).
%define mmx_filter_shift            7

; Accumulate one 8-pixel row of (src - ref) into the running totals.
;   in : rax = src row, rbx = ref row, rcx = src stride, rdx = ref stride,
;        mm6 = 0
;   out: mm5 += four word lanes of differences
;        mm7 += two dword lanes of squared differences
;        rax, rbx advanced by one row
;   clobbers mm0-mm3
%macro VAR8_ACCUM_ROW 0
        movq        mm0,        [rax]           ; eight src bytes
        movq        mm1,        [rbx]           ; eight ref bytes
        movq        mm2,        mm0             ; copies for the high halves
        movq        mm3,        mm1

        punpcklbw   mm0,        mm6             ; widen low four bytes to words
        punpcklbw   mm1,        mm6
        punpckhbw   mm2,        mm6             ; widen high four bytes to words
        punpckhbw   mm3,        mm6
        psubsw      mm0,        mm1             ; src - ref (low four pixels)
        psubsw      mm2,        mm3             ; src - ref (high four pixels)

        paddw       mm5,        mm0             ; accumulate differences
        paddw       mm5,        mm2

        pmaddwd     mm0,        mm0             ; square and pair-wise add
        pmaddwd     mm2,        mm2
        add         rbx,        rdx             ; next ref row
        add         rax,        rcx             ; next src row
        paddd       mm7,        mm0             ; accumulate squares
        paddd       mm7,        mm2
%endmacro

; Accumulate one 4-pixel row of (src - ref) into the running totals.
; Same register contract as VAR8_ACCUM_ROW, but only four bytes are read
; from each row and only mm0-mm1 are clobbered.
%macro VAR4_ACCUM_ROW 0
        movd        mm0,        [rax]           ; four src bytes
        movd        mm1,        [rbx]           ; four ref bytes
        punpcklbw   mm0,        mm6             ; widen to words
        punpcklbw   mm1,        mm6
        psubsw      mm0,        mm1             ; src - ref
        paddw       mm5,        mm0             ; accumulate differences
        pmaddwd     mm0,        mm0             ; square and pair-wise add
        add         rbx,        rdx             ; next ref row
        add         rax,        rcx             ; next src row
        paddd       mm7,        mm0             ; accumulate squares
%endmacro

;unsigned int vpx_get_mb_ss_mmx( short *src_ptr )
;
; Returns the sum of squares of the 256 consecutive 16-bit values at
; src_ptr (16 iterations x 16 values per iteration).
global sym(vpx_get_mb_ss_mmx) PRIVATE
sym(vpx_get_mb_ss_mmx):
    push        rbp
    mov         rbp, rsp
    ; NOTE(review): only arg(0) is read below; an argument count of 7 looks
    ; larger than needed - confirm against x86_abi_support.asm before changing.
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push rsi
    push rdi
    sub         rsp, 8                  ; scratch qword for the final reduction
    ; end prolog

        mov         rax, arg(0) ;src_ptr
        mov         rcx, 16             ; iteration counter
        pxor        mm4, mm4            ; running sum of squares (two dwords)

.NEXTROW:
        movq        mm0, [rax]
        movq        mm1, [rax+8]
        movq        mm2, [rax+16]
        movq        mm3, [rax+24]
        pmaddwd     mm0, mm0            ; square and pair-wise add
        pmaddwd     mm1, mm1
        pmaddwd     mm2, mm2
        pmaddwd     mm3, mm3

        paddd       mm4, mm0
        paddd       mm4, mm1
        paddd       mm4, mm2
        paddd       mm4, mm3

        add         rax, 32
        dec         rcx
        ; dec does not write CF, so a `ja` here would test a carry left over
        ; from the pointer add above; jnz depends on the counter alone.
        jnz         .NEXTROW
        movq        QWORD PTR [rsp], mm4

        ;return sum[0]+sum[1];
        movsxd      rax, dword ptr [rsp]
        movsxd      rcx, dword ptr [rsp+4]
        add         rax, rcx

    ; begin epilog
    add rsp, 8
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vpx_get8x8var_mmx
;(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride,
;    unsigned int *SSE,
;    int *Sum
;)
;
; Over an 8x8 block, writes the sum of (src - ref) to *Sum and the sum of
; (src - ref)^2 to *SSE. rax is zeroed before returning.
global sym(vpx_get8x8var_mmx) PRIVATE
sym(vpx_get8x8var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push rsi
    push rdi
    push rbx
    sub         rsp, 16                 ; scratch: [rsp] squares, [rsp+8] sums
    ; end prolog

        pxor        mm5, mm5            ; clear difference accumulator
        pxor        mm6, mm6            ; constant zero for unpacking
        pxor        mm7, mm7            ; clear squared-difference accumulator

        mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
        mov         rbx, arg(2) ;[ref_ptr]
        movsxd      rcx, dword ptr arg(1) ;[source_stride]
        movsxd      rdx, dword ptr arg(3) ;[recon_stride]

        ; Rows 1-8
%rep 8
        VAR8_ACCUM_ROW
%endrep

        ; Now accumulate the final results.
        movq        QWORD PTR [rsp+8], mm5      ; spill the four word sums
        movq        QWORD PTR [rsp], mm7        ; spill the two dword squares
        movsx       rdx, WORD PTR [rsp+8]
        movsx       rcx, WORD PTR [rsp+10]
        movsx       rbx, WORD PTR [rsp+12]
        movsx       rax, WORD PTR [rsp+14]
        add         rdx, rcx
        add         rbx, rax
        add         rdx, rbx    ;XSum
        movsxd      rax, DWORD PTR [rsp]
        movsxd      rcx, DWORD PTR [rsp+4]
        add         rax, rcx    ;XXSum
        mov         rsi, arg(4) ;SSE
        mov         rdi, arg(5) ;Sum
        mov         dword ptr [rsi], eax
        mov         dword ptr [rdi], edx
        xor         rax, rax    ; return 0

    ; begin epilog
    add rsp, 16
    pop rbx
    pop rdi
    pop rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

;void
;vpx_get4x4var_mmx
;(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride,
;    unsigned int *SSE,
;    int *Sum
;)
;
; Over a 4x4 block, writes the sum of (src - ref) to *Sum and the sum of
; (src - ref)^2 to *SSE. rax is zeroed before returning.
global sym(vpx_get4x4var_mmx) PRIVATE
sym(vpx_get4x4var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push rsi
    push rdi
    push rbx
    sub         rsp, 16                 ; scratch: [rsp] squares, [rsp+8] sums
    ; end prolog

        pxor        mm5, mm5            ; clear difference accumulator
        pxor        mm6, mm6            ; constant zero for unpacking
        pxor        mm7, mm7            ; clear squared-difference accumulator

        mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
        mov         rbx, arg(2) ;[ref_ptr]
        movsxd      rcx, dword ptr arg(1) ;[source_stride]
        movsxd      rdx, dword ptr arg(3) ;[recon_stride]

        ; Rows 1-4
%rep 4
        VAR4_ACCUM_ROW
%endrep

        ; Now accumulate the final results.
        movq        QWORD PTR [rsp+8], mm5      ; spill the four word sums
        movq        QWORD PTR [rsp], mm7        ; spill the two dword squares
        movsx       rdx, WORD PTR [rsp+8]
        movsx       rcx, WORD PTR [rsp+10]
        movsx       rbx, WORD PTR [rsp+12]
        movsx       rax, WORD PTR [rsp+14]
        add         rdx, rcx
        add         rbx, rax
        add         rdx, rbx    ;XSum
        movsxd      rax, DWORD PTR [rsp]
        movsxd      rcx, DWORD PTR [rsp+4]
        add         rax, rcx    ;XXSum
        mov         rsi, arg(4) ;SSE
        mov         rdi, arg(5) ;Sum
        mov         dword ptr [rsi], eax
        mov         dword ptr [rdi], edx
        xor         rax, rax    ; return 0

    ; begin epilog
    add rsp, 16
    pop rbx
    pop rdi
    pop rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vpx_filter_block2d_bil4x4_var_mmx
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned short *HFilter,
;    unsigned short *VFilter,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; Applies a 2-tap horizontal filter then a 2-tap vertical filter to ref_ptr
; (four pixels wide, four output rows, five ref rows read), subtracts the
; matching src_ptr row, and writes the sum of the differences to *sum and
; the sum of their squares to *sumsquared.
; HFilter / VFilter are each read as two groups of four words at offsets
; 0 and 8: the first group scales the current pixel/row, the second the
; next pixel/row. Every pass adds mmx_bi_rd and shifts right by
; mmx_filter_shift.
global sym(vpx_filter_block2d_bil4x4_var_mmx) PRIVATE
sym(vpx_filter_block2d_bil4x4_var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push rsi
    push rdi
    sub         rsp, 16
    ; end prolog

        pxor            mm6,            mm6                 ; difference accumulator (words)
        pxor            mm7,            mm7                 ; squared-difference accumulator (dwords)

        mov             rax,            arg(4) ;HFilter
        mov             rdx,            arg(5) ;VFilter

        mov             rsi,            arg(0) ;ref_ptr
        mov             rdi,            arg(2) ;src_ptr

        mov             rcx,            4                   ; output row counter
        pxor            mm0,            mm0                 ; constant zero for unpacking

        ; Horizontal pass on the first ref row; result kept in mm5.
        movd            mm1,            [rsi]
        movd            mm3,            [rsi+1]

        punpcklbw       mm1,            mm0
        pmullw          mm1,            [rax]

        punpcklbw       mm3,            mm0
        pmullw          mm3,            [rax+8]

        paddw           mm1,            mm3
        paddw           mm1,            [GLOBAL(mmx_bi_rd)]

        psraw           mm1,            mmx_filter_shift
        movq            mm5,            mm1

%if ABI_IS_32BIT
        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line
%else
        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line
        add             rsi,            r8
%endif

.filter_block2d_bil4x4_var_mmx_loop:

        ; Horizontal pass on the next ref row.
        movd            mm1,            [rsi]
        movd            mm3,            [rsi+1]

        punpcklbw       mm1,            mm0
        pmullw          mm1,            [rax]

        punpcklbw       mm3,            mm0
        pmullw          mm3,            [rax+8]

        paddw           mm1,            mm3
        paddw           mm1,            [GLOBAL(mmx_bi_rd)]

        psraw           mm1,            mmx_filter_shift
        movq            mm3,            mm5                 ; previous filtered row

        movq            mm5,            mm1                 ; keep this row for the next iteration
        pmullw          mm3,            [rdx]

        ; Vertical pass across the previous and current filtered rows.
        pmullw          mm1,            [rdx+8]
        paddw           mm1,            mm3

        paddw           mm1,            [GLOBAL(mmx_bi_rd)]
        psraw           mm1,            mmx_filter_shift

        movd            mm3,            [rdi]
        punpcklbw       mm3,            mm0

        psubw           mm1,            mm3                 ; filtered ref - src
        paddw           mm6,            mm1

        pmaddwd         mm1,            mm1
        paddd           mm7,            mm1

%if ABI_IS_32BIT
        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line
        add             rdi,            dword ptr arg(3) ;src_pixels_per_line
%else
        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line
        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
        add             rsi,            r8
        add             rdi,            r9
%endif
        sub             rcx,            1
        jnz             .filter_block2d_bil4x4_var_mmx_loop

        ; Reduce the four word sums: place each word in the upper half of a
        ; dword, add, then shift right arithmetically by 16 to sign-extend.
        pxor            mm3,            mm3
        pxor            mm2,            mm2

        punpcklwd       mm2,            mm6
        punpckhwd       mm3,            mm6

        paddd           mm2,            mm3
        movq            mm6,            mm2

        psrlq           mm6,            32
        paddd           mm2,            mm6

        psrad           mm2,            16
        movq            mm4,            mm7

        ; Fold the two dword square sums together.
        psrlq           mm4,            32
        paddd           mm4,            mm7

        mov             rdi,            arg(6) ;sum
        mov             rsi,            arg(7) ;sumsquared

        movd            dword ptr [rdi],          mm2
        movd            dword ptr [rsi],          mm4

    ; begin epilog
    add rsp, 16
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vpx_filter_block2d_bil_var_mmx
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    unsigned short *HFilter,
;    unsigned short *VFilter,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; Eight-pixel-wide variant of the routine above: Height output rows are
; produced from Height + 1 ref rows (nine bytes are read from each ref row),
; and the difference against src_ptr is accumulated into *sum and
; *sumsquared. The filter tables are read the same way (four words at
; offset 0, four at offset 8).
; NOTE(review): the loop counter is decremented before it is tested, so
; Height is assumed to be non-zero - confirm against callers.
global sym(vpx_filter_block2d_bil_var_mmx) PRIVATE
sym(vpx_filter_block2d_bil_var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    GET_GOT     rbx
    push rsi
    push rdi
    sub         rsp, 16
    ; end prolog

        pxor            mm6,            mm6                 ; difference accumulator (words)
        pxor            mm7,            mm7                 ; squared-difference accumulator (dwords)
        mov             rax,            arg(5) ;HFilter

        mov             rdx,            arg(6) ;VFilter
        mov             rsi,            arg(0) ;ref_ptr

        mov             rdi,            arg(2) ;src_ptr
        movsxd          rcx,            dword ptr arg(4) ;Height

        ; Horizontal pass on the first ref row (low half in mm1/mm3, high
        ; half in mm2/mm4); the result is packed back to bytes in mm5.
        pxor            mm0,            mm0                 ; constant zero for unpacking
        movq            mm1,            [rsi]

        movq            mm3,            [rsi+1]
        movq            mm2,            mm1

        movq            mm4,            mm3
        punpcklbw       mm1,            mm0

        punpckhbw       mm2,            mm0
        pmullw          mm1,            [rax]

        pmullw          mm2,            [rax]
        punpcklbw       mm3,            mm0

        punpckhbw       mm4,            mm0
        pmullw          mm3,            [rax+8]

        pmullw          mm4,            [rax+8]
        paddw           mm1,            mm3

        paddw           mm2,            mm4
        paddw           mm1,            [GLOBAL(mmx_bi_rd)]

        psraw           mm1,            mmx_filter_shift
        paddw           mm2,            [GLOBAL(mmx_bi_rd)]

        psraw           mm2,            mmx_filter_shift
        movq            mm5,            mm1

        packuswb        mm5,            mm2
%if ABI_IS_32BIT
        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line
%else
        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line
        add             rsi,            r8
%endif

.filter_block2d_bil_var_mmx_loop:

        ; Horizontal pass on the next ref row.
        movq            mm1,            [rsi]
        movq            mm3,            [rsi+1]

        movq            mm2,            mm1
        movq            mm4,            mm3

        punpcklbw       mm1,            mm0
        punpckhbw       mm2,            mm0

        pmullw          mm1,            [rax]
        pmullw          mm2,            [rax]

        punpcklbw       mm3,            mm0
        punpckhbw       mm4,            mm0

        pmullw          mm3,            [rax+8]
        pmullw          mm4,            [rax+8]

        paddw           mm1,            mm3
        paddw           mm2,            mm4

        paddw           mm1,            [GLOBAL(mmx_bi_rd)]
        psraw           mm1,            mmx_filter_shift

        paddw           mm2,            [GLOBAL(mmx_bi_rd)]
        psraw           mm2,            mmx_filter_shift

        ; Unpack the previous filtered row and save the current one in mm5.
        movq            mm3,            mm5
        movq            mm4,            mm5

        punpcklbw       mm3,            mm0
        punpckhbw       mm4,            mm0

        movq            mm5,            mm1
        packuswb        mm5,            mm2

        ; Vertical pass across the previous and current filtered rows.
        pmullw          mm3,            [rdx]
        pmullw          mm4,            [rdx]

        pmullw          mm1,            [rdx+8]
        pmullw          mm2,            [rdx+8]

        paddw           mm1,            mm3
        paddw           mm2,            mm4

        paddw           mm1,            [GLOBAL(mmx_bi_rd)]
        paddw           mm2,            [GLOBAL(mmx_bi_rd)]

        psraw           mm1,            mmx_filter_shift
        psraw           mm2,            mmx_filter_shift

        movq            mm3,            [rdi]
        movq            mm4,            mm3

        punpcklbw       mm3,            mm0
        punpckhbw       mm4,            mm0

        psubw           mm1,            mm3                 ; filtered ref - src (low half)
        psubw           mm2,            mm4                 ; filtered ref - src (high half)

        paddw           mm6,            mm1
        pmaddwd         mm1,            mm1

        paddw           mm6,            mm2
        pmaddwd         mm2,            mm2

        paddd           mm7,            mm1
        paddd           mm7,            mm2

%if ABI_IS_32BIT
        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line
        add             rdi,            dword ptr arg(3) ;src_pixels_per_line
%else
        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line
        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
        add             rsi,            r8
        add             rdi,            r9
%endif
        sub             rcx,            1
        jnz             .filter_block2d_bil_var_mmx_loop

        ; Reduce the four word sums: place each word in the upper half of a
        ; dword, add, then shift right arithmetically by 16 to sign-extend.
        pxor            mm3,            mm3
        pxor            mm2,            mm2

        punpcklwd       mm2,            mm6
        punpckhwd       mm3,            mm6

        paddd           mm2,            mm3
        movq            mm6,            mm2

        psrlq           mm6,            32
        paddd           mm2,            mm6

        psrad           mm2,            16
        movq            mm4,            mm7

        ; Fold the two dword square sums together.
        psrlq           mm4,            32
        paddd           mm4,            mm7

        mov             rdi,            arg(7) ;sum
        mov             rsi,            arg(8) ;sumsquared

        movd            dword ptr [rdi],          mm2
        movd            dword ptr [rsi],          mm4

    ; begin epilog
    add rsp, 16
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
;short mmx_bi_rd[4] = { 64, 64, 64, 64};
align 16
mmx_bi_rd:
    times 4 dw 64