;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"
extern sym(vp8_bilinear_filters_x86_8)


%define BLOCK_HEIGHT_WIDTH 4
%define vp8_filter_weight 128
%define VP8_FILTER_SHIFT  7

; Conventions shared by every routine in this file:
;   * Arguments are read through arg(n) after SHADOW_ARGS_TO_STACK; the
;     macros come from x86_abi_support.asm.
;   * mm0 is kept at zero and used to widen bytes to words (punpck*bw) and
;     to narrow words back to bytes (packuswb).
;   * Every filter result is rounded with rd (4 x 0x40) and shifted right by
;     VP8_FILTER_SHIFT.
;   * No routine here executes emms; MMX state is left as-is on return.


;void vp8_filter_block1d_h6_mmx
;(
;    unsigned char   *src_ptr,
;    unsigned short  *output_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned int    pixel_step,
;    unsigned int    output_height,
;    unsigned int    output_width,
;    short           * vp8_filter
;)
;
; Horizontal 6-tap filter, 4 output samples per row.
;
; For each of output_height rows, reads source bytes src_ptr[-2 .. +6],
; computes sum(tap[k] * p[x + k - 2]), k = 0..5, with saturating word adds,
; rounds, shifts, clamps to 0..255 and stores the 4 results as 16-bit words
; at output_ptr.
;
; vp8_filter is read as six taps spaced 16 bytes apart (4 words used from
; each). After each row, src advances by src_pixels_per_line and the
; destination by output_width, added to the byte address unscaled.
; pixel_step (arg 3) is not read.
;
; Register roles inside the loop:
;   rsi = source row, rdi = destination row, rcx = rows remaining,
;   rax = destination step, rdx = filter, r8 = source stride (64-bit only),
;   mm1/mm2/mm6/mm7 = taps 1/2/3/4, mm3 = accumulator.
global sym(vp8_filter_block1d_h6_mmx) PRIVATE
sym(vp8_filter_block1d_h6_mmx):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rdx,        arg(6)                  ; vp8_filter

    movq        mm1,        [rdx + 16]              ; tap 1 (negative inner tap)
    movq        mm2,        [rdx + 32]              ; tap 2
    movq        mm6,        [rdx + 48]              ; tap 3
    movq        mm7,        [rdx + 64]              ; tap 4 (negative inner tap)

    mov         rdi,        arg(1)                  ; output_ptr
    mov         rsi,        arg(0)                  ; src_ptr
    movsxd      rcx,        dword ptr arg(4)        ; output_height
    movsxd      rax,        dword ptr arg(5)        ; output_width, used as destination step
    pxor        mm0,        mm0                     ; mm0 = 0

%if ABI_IS_32BIT=0
    ; The source stride never changes, so load it once instead of per row.
    movsxd      r8,         dword ptr arg(2)        ; src_pixels_per_line
%endif

.nextrow:
    movq        mm3,        [rsi-2]                 ; mm3 = bytes p-2..p5
    movq        mm4,        mm3                     ; mm4 = p-2..p5 (raw copy)
    psrlq       mm3,        8                       ; mm3 = p-1..p5
    punpcklbw   mm3,        mm0                     ; mm3 = words p-1..p2
    pmullw      mm3,        mm1                     ; mm3 *= tap 1

    movq        mm5,        mm4                     ; mm5 = p-2..p5 (raw copy)
    punpckhbw   mm4,        mm0                     ; mm4 = words p2..p5
    pmullw      mm4,        mm7                     ; mm4 *= tap 4
    paddsw      mm3,        mm4                     ; mm3 += mm4

    movq        mm4,        mm5                     ; mm4 = p-2..p5 (raw copy)
    psrlq       mm5,        16                      ; mm5 = p0..p5
    punpcklbw   mm5,        mm0                     ; mm5 = words p0..p3
    pmullw      mm5,        mm2                     ; mm5 *= tap 2
    paddsw      mm3,        mm5                     ; mm3 += mm5

    movq        mm5,        mm4                     ; mm5 = p-2..p5 (raw copy)
    psrlq       mm4,        24                      ; mm4 = p1..p5
    punpcklbw   mm4,        mm0                     ; mm4 = words p1..p4
    pmullw      mm4,        mm6                     ; mm4 *= tap 3
    paddsw      mm3,        mm4                     ; mm3 += mm4

    ; outer taps are multiplied straight from memory (no spare registers)
    movd        mm4,        [rsi+3]                 ; mm4 = bytes p3..p6
    punpcklbw   mm4,        mm0                     ; mm4 = words p3..p6
    pmullw      mm4,        [rdx+80]                ; mm4 *= tap 5
    paddsw      mm3,        mm4                     ; mm3 += mm4

    punpcklbw   mm5,        mm0                     ; mm5 = words p-2..p1
    pmullw      mm5,        [rdx]                   ; mm5 *= tap 0
    paddsw      mm3,        mm5                     ; mm3 += mm5

    paddsw      mm3,        [GLOBAL(rd)]            ; mm3 += round value
    psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 >>= 7 (arithmetic)
    packuswb    mm3,        mm0                     ; pack and unpack to clamp to 0..255
    punpcklbw   mm3,        mm0                     ;

    movq        [rdi],      mm3                     ; store 4 words to the destination

%if ABI_IS_32BIT
    add         rsi,        dword ptr arg(2)        ; src_pixels_per_line ; next line
%else
    add         rsi,        r8                      ; next line
%endif
    add         rdi,        rax                     ; next destination row

    dec         rcx                                 ; decrement count
    jnz         .nextrow                            ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_filter_block1dc_v6_mmx
;(
;   short *src_ptr,
;   unsigned char *output_ptr,
;   int output_pitch,
;   unsigned int pixels_per_line,
;   unsigned int pixel_step,
;   unsigned int output_height,
;   unsigned int output_width,
;   short * vp8_filter
;)
;
; Vertical 6-tap filter, 4 output samples per row.
;
; Each output row combines six rows of 4 x 16-bit samples (rows -2..+3
; relative to the current one), each multiplied by its tap and summed with
; saturating word adds, then rounded, shifted and packed to 4 unsigned bytes.
;
; pixels_per_line is added to the source address unscaled, i.e. it is used
; as the distance in bytes between rows. vp8_filter is read as six taps
; spaced 16 bytes apart. pixel_step (arg 4) and output_width (arg 6) are
; not read.
;
; Register roles inside the loop:
;   rsi = address of row -2, rdx = row stride, rdi = destination,
;   rax = output_pitch, rcx = rows remaining, rbx = filter,
;   mm1/mm2/mm6/mm7 = taps 1/2/3/4, mm5 = round value, mm3 = accumulator.
global sym(vp8_filter_block1dc_v6_mmx) PRIVATE
sym(vp8_filter_block1dc_v6_mmx):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; Read rd through GLOBAL() before rbx (named in GET_GOT above) is
    ; repurposed as the filter pointer.
    movq        mm5,        [GLOBAL(rd)]
    push        rbx                                 ; restored before RESTORE_GOT
    mov         rbx,        arg(7)                  ; vp8_filter
    movq        mm1,        [rbx + 16]              ; tap 1
    movq        mm2,        [rbx + 32]              ; tap 2
    movq        mm6,        [rbx + 48]              ; tap 3
    movq        mm7,        [rbx + 64]              ; tap 4

    movsxd      rdx,        dword ptr arg(3)        ; pixels_per_line
    mov         rdi,        arg(1)                  ; output_ptr
    mov         rsi,        arg(0)                  ; src_ptr
    sub         rsi,        rdx                     ; back up two rows so that
    sub         rsi,        rdx                     ; rsi addresses row -2
    movsxd      rcx,        DWORD PTR arg(5)        ; output_height
    movsxd      rax,        DWORD PTR arg(2)        ; output_pitch
    pxor        mm0,        mm0                     ; mm0 = 0


.nextrow_cv:
    movq        mm3,        [rsi+rdx]               ; mm3 = p0..p3 = row -1
    pmullw      mm3,        mm1                     ; mm3 *= tap 1


    movq        mm4,        [rsi + 4*rdx]           ; mm4 = p0..p3 = row  2
    pmullw      mm4,        mm7                     ; mm4 *= tap 4
    paddsw      mm3,        mm4                     ; mm3 += mm4

    movq        mm4,        [rsi + 2*rdx]           ; mm4 = p0..p3 = row  0
    pmullw      mm4,        mm2                     ; mm4 *= tap 2
    paddsw      mm3,        mm4                     ; mm3 += mm4

    movq        mm4,        [rsi]                   ; mm4 = p0..p3 = row -2
    pmullw      mm4,        [rbx]                   ; mm4 *= tap 0
    paddsw      mm3,        mm4                     ; mm3 += mm4


    ; Step one row forward so rows 1 and 3 are reachable with *2 and *4
    ; scaling (avoids 3 * pitch). This is also the per-iteration advance.
    add         rsi,        rdx
    movq        mm4,        [rsi + 2*rdx]           ; mm4 = p0..p3 = row  1
    pmullw      mm4,        mm6                     ; mm4 *= tap 3
    paddsw      mm3,        mm4                     ; mm3 += mm4

    movq        mm4,        [rsi + 4*rdx]           ; mm4 = p0..p3 = row  3
    pmullw      mm4,        [rbx +80]               ; mm4 *= tap 5
    paddsw      mm3,        mm4                     ; mm3 += mm4


    paddsw      mm3,        mm5                     ; mm3 += round value
    psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 >>= 7 (arithmetic)
    packuswb    mm3,        mm0                     ; pack and saturate to bytes

    movd        [rdi],      mm3                     ; store 4 bytes to the destination
    ; The next iteration re-reads five of the six rows read here. Since the
    ; block should be in cache this shouldn't cost much, but it is avoidable.
    lea         rdi,        [rdi+rax]               ; next destination row
    dec         rcx                                 ; decrement count
    jnz         .nextrow_cv                         ; next row

    pop         rbx

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;void bilinear_predict8x8_mmx
;(
;    unsigned char  *src_ptr,
;    int   src_pixels_per_line,
;    int  xoffset,
;    int  yoffset,
;   unsigned char *dst_ptr,
;    int dst_pitch
;)
;
; Two-pass bilinear prediction of an 8-wide, 8-row block.
;
; Horizontal pass: h[x] = (src[x]*HFilter[0] + src[x+1]*HFilter[1] + rd) >> 7.
; Vertical pass:   dst  = (h_prev*VFilter[0] + h_cur*VFilter[1]    + rd) >> 7.
; The horizontal result of each row is packed to bytes in mm7 and reused as
; h_prev on the next iteration, so 9 source rows of 9 bytes are read.
;
; Filters are 32-byte entries of vp8_bilinear_filters_x86_8 indexed by
; xoffset / yoffset; each tap occupies 16 bytes, of which 4 words are loaded.
; The loop ends when the destination pointer equals dst_ptr + 8*dst_pitch.
;
; Register roles inside the loop:
;   rsi = source row, rdx = src_pixels_per_line, rdi = destination row,
;   rcx = end-of-block destination, rax = VFilter,
;   r8 = dst_pitch (64-bit only), mm1/mm2 = HFilter taps, mm7 = previous row.
global sym(vp8_bilinear_predict8x8_mmx) PRIVATE
sym(vp8_bilinear_predict8x8_mmx):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
    ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];

    movsxd      rax,        dword ptr arg(2)        ; xoffset
    mov         rdi,        arg(4)                  ; dst_ptr

    lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
    shl         rax,        5                       ; offset * 32

    add         rax,        rcx                     ; HFilter
    mov         rsi,        arg(0)                  ; src_ptr

    movsxd      rdx,        dword ptr arg(5)        ; dst_pitch
    movq        mm1,        [rax]                   ; HFilter tap 0

    movq        mm2,        [rax+16]                ; HFilter tap 1
    movsxd      rax,        dword ptr arg(3)        ; yoffset

    pxor        mm0,        mm0                     ; mm0 = 0
    shl         rax,        5                       ; offset * 32

    add         rax,        rcx                     ; VFilter
    lea         rcx,        [rdi+rdx*8]             ; destination address after 8 rows

    movsxd      rdx,        dword ptr arg(1)        ; src_pixels_per_line

    ; get the first horizontal line done
    movq        mm3,        [rsi]                   ; 8 source bytes at x
    movq        mm4,        mm3                     ; make a copy of current line

    punpcklbw   mm3,        mm0                     ; low 4 pixels as words
    punpckhbw   mm4,        mm0                     ; high 4 pixels as words

    pmullw      mm3,        mm1                     ; *= HFilter tap 0
    pmullw      mm4,        mm1                     ;

    movq        mm5,        [rsi+1]                 ; 8 source bytes at x+1
    movq        mm6,        mm5                     ;

    punpcklbw   mm5,        mm0                     ;
    punpckhbw   mm6,        mm0                     ;

    pmullw      mm5,        mm2                     ; *= HFilter tap 1
    pmullw      mm6,        mm2                     ;

    paddw       mm3,        mm5                     ;
    paddw       mm4,        mm6                     ;

    paddw       mm3,        [GLOBAL(rd)]            ; mm3 += round value
    psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 >>= 7

    paddw       mm4,        [GLOBAL(rd)]            ;
    psraw       mm4,        VP8_FILTER_SHIFT        ;

    movq        mm7,        mm3                     ;
    packuswb    mm7,        mm4                     ; mm7 = first row, horizontally filtered

    add         rsi,        rdx                     ; next line

%if ABI_IS_32BIT=0
    ; dst_pitch never changes, so load it once instead of per row.
    movsxd      r8,         dword ptr arg(5)        ; dst_pitch
%endif

.next_row_8x8:
    movq        mm3,        [rsi]                   ; 8 source bytes at x
    movq        mm4,        mm3                     ; make a copy of current line

    punpcklbw   mm3,        mm0                     ; low 4 pixels as words
    punpckhbw   mm4,        mm0                     ; high 4 pixels as words

    pmullw      mm3,        mm1                     ; *= HFilter tap 0
    pmullw      mm4,        mm1                     ;

    movq        mm5,        [rsi+1]                 ; 8 source bytes at x+1
    movq        mm6,        mm5                     ;

    punpcklbw   mm5,        mm0                     ;
    punpckhbw   mm6,        mm0                     ;

    pmullw      mm5,        mm2                     ; *= HFilter tap 1
    pmullw      mm6,        mm2                     ;

    paddw       mm3,        mm5                     ;
    paddw       mm4,        mm6                     ;

    movq        mm5,        mm7                     ; previous row (packed bytes)
    movq        mm6,        mm7                     ;

    punpcklbw   mm5,        mm0                     ;
    punpckhbw   mm6,        mm0

    pmullw      mm5,        [rax]                   ; previous row *= VFilter tap 0
    pmullw      mm6,        [rax]                   ;

    paddw       mm3,        [GLOBAL(rd)]            ; mm3 += round value
    psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 >>= 7

    paddw       mm4,        [GLOBAL(rd)]            ;
    psraw       mm4,        VP8_FILTER_SHIFT        ;

    movq        mm7,        mm3                     ;
    packuswb    mm7,        mm4                     ; keep this row for the next iteration


    pmullw      mm3,        [rax+16]                ; current row *= VFilter tap 1
    pmullw      mm4,        [rax+16]                ;

    paddw       mm3,        mm5                     ;
    paddw       mm4,        mm6                     ;


    paddw       mm3,        [GLOBAL(rd)]            ; mm3 += round value
    psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 >>= 7

    paddw       mm4,        [GLOBAL(rd)]            ;
    psraw       mm4,        VP8_FILTER_SHIFT        ;

    packuswb    mm3,        mm4

    movq        [rdi],      mm3                     ; store 8 bytes to the destination

    add         rsi,        rdx                     ; next line
%if ABI_IS_32BIT
    add         rdi,        dword ptr arg(5)        ; dst_pitch
%else
    add         rdi,        r8                      ; dst_pitch
%endif
    cmp         rdi,        rcx                     ; reached the end of the block?
    jne         .next_row_8x8

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;void bilinear_predict8x4_mmx
;(
;    unsigned char  *src_ptr,
;    int   src_pixels_per_line,
;    int  xoffset,
;    int  yoffset,
;    unsigned char *dst_ptr,
;    int dst_pitch
;)
;
; Two-pass bilinear prediction of an 8-wide, 4-row block.
;
; Same computation as vp8_bilinear_predict8x8_mmx; the only difference is
; the end pointer, dst_ptr + 4*dst_pitch, so 5 source rows of 9 bytes are
; read and 4 rows of 8 bytes are written.
;
; Register roles inside the loop:
;   rsi = source row, rdx = src_pixels_per_line, rdi = destination row,
;   rcx = end-of-block destination, rax = VFilter,
;   r8 = dst_pitch (64-bit only), mm1/mm2 = HFilter taps, mm7 = previous row.
global sym(vp8_bilinear_predict8x4_mmx) PRIVATE
sym(vp8_bilinear_predict8x4_mmx):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
    ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];

    movsxd      rax,        dword ptr arg(2)        ; xoffset
    mov         rdi,        arg(4)                  ; dst_ptr

    lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
    shl         rax,        5                       ; offset * 32

    add         rax,        rcx                     ; HFilter
    mov         rsi,        arg(0)                  ; src_ptr

    movsxd      rdx,        dword ptr arg(5)        ; dst_pitch
    movq        mm1,        [rax]                   ; HFilter tap 0

    movq        mm2,        [rax+16]                ; HFilter tap 1
    movsxd      rax,        dword ptr arg(3)        ; yoffset

    pxor        mm0,        mm0                     ; mm0 = 0
    shl         rax,        5                       ; offset * 32

    add         rax,        rcx                     ; VFilter
    lea         rcx,        [rdi+rdx*4]             ; destination address after 4 rows

    movsxd      rdx,        dword ptr arg(1)        ; src_pixels_per_line

    ; get the first horizontal line done
    movq        mm3,        [rsi]                   ; 8 source bytes at x
    movq        mm4,        mm3                     ; make a copy of current line

    punpcklbw   mm3,        mm0                     ; low 4 pixels as words
    punpckhbw   mm4,        mm0                     ; high 4 pixels as words

    pmullw      mm3,        mm1                     ; *= HFilter tap 0
    pmullw      mm4,        mm1                     ;

    movq        mm5,        [rsi+1]                 ; 8 source bytes at x+1
    movq        mm6,        mm5                     ;

    punpcklbw   mm5,        mm0                     ;
    punpckhbw   mm6,        mm0                     ;

    pmullw      mm5,        mm2                     ; *= HFilter tap 1
    pmullw      mm6,        mm2                     ;

    paddw       mm3,        mm5                     ;
    paddw       mm4,        mm6                     ;

    paddw       mm3,        [GLOBAL(rd)]            ; mm3 += round value
    psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 >>= 7

    paddw       mm4,        [GLOBAL(rd)]            ;
    psraw       mm4,        VP8_FILTER_SHIFT        ;

    movq        mm7,        mm3                     ;
    packuswb    mm7,        mm4                     ; mm7 = first row, horizontally filtered

    add         rsi,        rdx                     ; next line

%if ABI_IS_32BIT=0
    ; dst_pitch never changes, so load it once instead of per row.
    movsxd      r8,         dword ptr arg(5)        ; dst_pitch
%endif

.next_row_8x4:
    movq        mm3,        [rsi]                   ; 8 source bytes at x
    movq        mm4,        mm3                     ; make a copy of current line

    punpcklbw   mm3,        mm0                     ; low 4 pixels as words
    punpckhbw   mm4,        mm0                     ; high 4 pixels as words

    pmullw      mm3,        mm1                     ; *= HFilter tap 0
    pmullw      mm4,        mm1                     ;

    movq        mm5,        [rsi+1]                 ; 8 source bytes at x+1
    movq        mm6,        mm5                     ;

    punpcklbw   mm5,        mm0                     ;
    punpckhbw   mm6,        mm0                     ;

    pmullw      mm5,        mm2                     ; *= HFilter tap 1
    pmullw      mm6,        mm2                     ;

    paddw       mm3,        mm5                     ;
    paddw       mm4,        mm6                     ;

    movq        mm5,        mm7                     ; previous row (packed bytes)
    movq        mm6,        mm7                     ;

    punpcklbw   mm5,        mm0                     ;
    punpckhbw   mm6,        mm0

    pmullw      mm5,        [rax]                   ; previous row *= VFilter tap 0
    pmullw      mm6,        [rax]                   ;

    paddw       mm3,        [GLOBAL(rd)]            ; mm3 += round value
    psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 >>= 7

    paddw       mm4,        [GLOBAL(rd)]            ;
    psraw       mm4,        VP8_FILTER_SHIFT        ;

    movq        mm7,        mm3                     ;
    packuswb    mm7,        mm4                     ; keep this row for the next iteration


    pmullw      mm3,        [rax+16]                ; current row *= VFilter tap 1
    pmullw      mm4,        [rax+16]                ;

    paddw       mm3,        mm5                     ;
    paddw       mm4,        mm6                     ;


    paddw       mm3,        [GLOBAL(rd)]            ; mm3 += round value
    psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 >>= 7

    paddw       mm4,        [GLOBAL(rd)]            ;
    psraw       mm4,        VP8_FILTER_SHIFT        ;

    packuswb    mm3,        mm4

    movq        [rdi],      mm3                     ; store 8 bytes to the destination

    add         rsi,        rdx                     ; next line
%if ABI_IS_32BIT
    add         rdi,        dword ptr arg(5)        ; dst_pitch
%else
    add         rdi,        r8                      ; dst_pitch
%endif
    cmp         rdi,        rcx                     ; reached the end of the block?
    jne         .next_row_8x4

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;void bilinear_predict4x4_mmx
;(
;    unsigned char  *src_ptr,
;    int   src_pixels_per_line,
;    int  xoffset,
;    int  yoffset,
;    unsigned char *dst_ptr,
;    int dst_pitch
;)
;
; Two-pass bilinear prediction of a 4-wide, 4-row block.
;
; Same scheme as the 8-wide variants, but each row is loaded with movd
; (4 bytes at x and 4 bytes at x+1), so one MMX register holds a whole row.
; 5 source rows of 5 bytes are read; 4 rows of 4 bytes are written. The loop
; ends when the destination pointer equals dst_ptr + 4*dst_pitch.
;
; Register roles inside the loop:
;   rsi = source row, rdx = src_pixels_per_line, rdi = destination row,
;   rcx = end-of-block destination, rax = VFilter,
;   r8 = dst_pitch (64-bit only), mm1/mm2 = HFilter taps, mm7 = previous row.
global sym(vp8_bilinear_predict4x4_mmx) PRIVATE
sym(vp8_bilinear_predict4x4_mmx):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
    ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];

    movsxd      rax,        dword ptr arg(2)        ; xoffset
    mov         rdi,        arg(4)                  ; dst_ptr

    lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
    shl         rax,        5                       ; offset * 32

    add         rax,        rcx                     ; HFilter
    mov         rsi,        arg(0)                  ; src_ptr

    movsxd      rdx,        dword ptr arg(5)        ; dst_pitch
    movq        mm1,        [rax]                   ; HFilter tap 0

    movq        mm2,        [rax+16]                ; HFilter tap 1
    movsxd      rax,        dword ptr arg(3)        ; yoffset

    pxor        mm0,        mm0                     ; mm0 = 0
    shl         rax,        5                       ; offset * 32

    add         rax,        rcx                     ; VFilter
    lea         rcx,        [rdi+rdx*4]             ; destination address after 4 rows

    movsxd      rdx,        dword ptr arg(1)        ; src_pixels_per_line

    ; get the first horizontal line done
    movd        mm3,        [rsi]                   ; 4 source bytes at x
    punpcklbw   mm3,        mm0                     ; as words

    pmullw      mm3,        mm1                     ; *= HFilter tap 0
    movd        mm5,        [rsi+1]                 ; 4 source bytes at x+1

    punpcklbw   mm5,        mm0                     ;
    pmullw      mm5,        mm2                     ; *= HFilter tap 1

    paddw       mm3,        mm5                     ;
    paddw       mm3,        [GLOBAL(rd)]            ; mm3 += round value

    psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 >>= 7

    movq        mm7,        mm3                     ;
    packuswb    mm7,        mm0                     ; mm7 = first row, horizontally filtered

    add         rsi,        rdx                     ; next line

%if ABI_IS_32BIT=0
    ; dst_pitch never changes, so load it once instead of per row.
    movsxd      r8,         dword ptr arg(5)        ; dst_pitch
%endif

.next_row_4x4:
    movd        mm3,        [rsi]                   ; 4 source bytes at x
    punpcklbw   mm3,        mm0                     ; as words

    pmullw      mm3,        mm1                     ; *= HFilter tap 0
    movd        mm5,        [rsi+1]                 ; 4 source bytes at x+1

    punpcklbw   mm5,        mm0                     ;
    pmullw      mm5,        mm2                     ; *= HFilter tap 1

    paddw       mm3,        mm5                     ;

    movq        mm5,        mm7                     ; previous row (packed bytes)
    punpcklbw   mm5,        mm0                     ;

    pmullw      mm5,        [rax]                   ; previous row *= VFilter tap 0
    paddw       mm3,        [GLOBAL(rd)]            ; mm3 += round value

    psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 >>= 7
    movq        mm7,        mm3                     ;

    packuswb    mm7,        mm0                     ; keep this row for the next iteration

    pmullw      mm3,        [rax+16]                ; current row *= VFilter tap 1
    paddw       mm3,        mm5                     ;


    paddw       mm3,        [GLOBAL(rd)]            ; mm3 += round value
    psraw       mm3,        VP8_FILTER_SHIFT        ; mm3 >>= 7

    packuswb    mm3,        mm0
    movd        [rdi],      mm3                     ; store 4 bytes to the destination

    add         rsi,        rdx                     ; next line
%if ABI_IS_32BIT
    add         rdi,        dword ptr arg(5)        ; dst_pitch
%else
    add         rdi,        r8                      ; dst_pitch
%endif

    cmp         rdi,        rcx                     ; reached the end of the block?
    jne         .next_row_4x4

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret



SECTION_RODATA
; Rounding constant added before the VP8_FILTER_SHIFT right shift:
; 0x40 in each of the four word lanes.
align 16
rd:
    times 4 dw 0x40

; Six-tap filter table: 8 filters, 6 taps each. Every tap is replicated
; across 8 words (16 bytes), so one filter occupies 96 bytes and tap k of a
; filter sits at byte offset 16*k.
align 16
global HIDDEN_DATA(sym(vp8_six_tap_mmx))
sym(vp8_six_tap_mmx):
    times 8 dw 0
    times 8 dw 0
    times 8 dw 128
    times 8 dw 0
    times 8 dw 0
    times 8 dw 0

    times 8 dw 0
    times 8 dw -6
    times 8 dw 123
    times 8 dw 12
    times 8 dw -1
    times 8 dw 0

    times 8 dw 2
    times 8 dw -11
    times 8 dw 108
    times 8 dw 36
    times 8 dw -8
    times 8 dw 1

    times 8 dw 0
    times 8 dw -9
    times 8 dw 93
    times 8 dw 50
    times 8 dw -6
    times 8 dw 0

    times 8 dw 3
    times 8 dw -16
    times 8 dw 77
    times 8 dw 77
    times 8 dw -16
    times 8 dw 3

    times 8 dw 0
    times 8 dw -6
    times 8 dw 50
    times 8 dw 93
    times 8 dw -9
    times 8 dw 0

    times 8 dw 1
    times 8 dw -8
    times 8 dw 36
    times 8 dw 108
    times 8 dw -11
    times 8 dw 2

    times 8 dw 0
    times 8 dw -1
    times 8 dw 12
    times 8 dw 123
    times 8 dw -6
    times 8 dw 0

