;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------------
; VERTx4 avg
;   8-tap vertical filter over a 4-pixel-wide column.
;     %1 = 0 : store the filtered pixels
;     %1 = 1 : pavgb the filtered pixels with the bytes already at output_ptr
;   The expanding function must %define k0k1, k2k3, k4k5, k6k7 and krd as
;   16-byte-aligned scratch slots (they are written with movdqa).
;   Every output row is built from the 8 source rows A..H located at
;   src_ptr + n * pixels_per_line, n = 0..7.
;   The filter is loaded with movdqa, so arg(5) must be 16-byte aligned.
;   Uses rax, rbx, rcx, rdx, rsi, rdi (plus r8 when ABI_IS_32BIT=0) and
;   xmm0-xmm7.  The row loop is dec/jnz, so output_height must be >= 1.
;-----------------------------------------------------------------------------
%macro VERTx4 1
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, 0x0400040              ;two words of 64 (rounding term)

    movdqa      xmm4, [rdx]                 ;load filters
    movd        xmm5, rcx
    packsswb    xmm4, xmm4                  ;narrow the 16-bit taps to bytes
    pshuflw     xmm0, xmm4, 0b              ;k0_k1
    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7

    punpcklqdq  xmm0, xmm0                  ;replicate each pair to 128 bits
    punpcklqdq  xmm1, xmm1
    punpcklqdq  xmm2, xmm2
    punpcklqdq  xmm3, xmm3

    movdqa      k0k1, xmm0
    movdqa      k2k3, xmm1
    pshufd      xmm5, xmm5, 0               ;64 in every word
    movdqa      k4k5, xmm2
    movdqa      k6k7, xmm3
    movdqa      krd, xmm5

    movsxd      rdx, DWORD PTR arg(1)       ;pixels_per_line

%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)        ;out_pitch
%endif
    mov         rax, rsi
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    add         rax, rdx                    ;rax = src_ptr + pitch

    lea         rbx, [rdx + rdx*4]
    add         rbx, rdx                    ;pitch * 6

.loop:
    movd        xmm0, [rsi]                 ;A
    movd        xmm1, [rsi + rdx]           ;B
    movd        xmm2, [rsi + rdx * 2]       ;C
    movd        xmm3, [rax + rdx * 2]       ;D
    movd        xmm4, [rsi + rdx * 4]       ;E
    movd        xmm5, [rax + rdx * 4]       ;F

    punpcklbw   xmm0, xmm1                  ;A B
    punpcklbw   xmm2, xmm3                  ;C D
    punpcklbw   xmm4, xmm5                  ;E F

    movd        xmm6, [rsi + rbx]           ;G
    movd        xmm7, [rax + rbx]           ;H

    pmaddubsw   xmm0, k0k1
    pmaddubsw   xmm2, k2k3
    punpcklbw   xmm6, xmm7                  ;G H
    pmaddubsw   xmm4, k4k5
    pmaddubsw   xmm6, k6k7

    ; paddsw saturates, so the order of the additions matters: add the
    ; outer pair first, then the smaller middle sum, then the larger one.
    movdqa      xmm1, xmm2
    paddsw      xmm0, xmm6
    pmaxsw      xmm2, xmm4                  ;max(CD, EF)
    pminsw      xmm4, xmm1                  ;min(CD, EF)
    paddsw      xmm0, xmm4
    paddsw      xmm0, xmm2

    paddsw      xmm0, krd                   ;+64
    psraw       xmm0, 7                     ;>> 7
    packuswb    xmm0, xmm0                  ;clamp to 0..255

    add         rsi,  rdx
    add         rax,  rdx
%if %1
    movd        xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif
    movd        [rdi], xmm0

%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)       ;out_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .loop
%endm

;-----------------------------------------------------------------------------
; VERTx8 avg
;   Same as VERTx4 but 8 pixels wide: rows are loaded and stored with movq.
;-----------------------------------------------------------------------------
%macro VERTx8 1
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, 0x0400040              ;two words of 64 (rounding term)

    movdqa      xmm4, [rdx]                 ;load filters
    movq        xmm5, rcx
    packsswb    xmm4, xmm4                  ;narrow the 16-bit taps to bytes
    pshuflw     xmm0, xmm4, 0b              ;k0_k1
    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7

    punpcklqdq  xmm0, xmm0                  ;replicate each pair to 128 bits
    punpcklqdq  xmm1, xmm1
    punpcklqdq  xmm2, xmm2
    punpcklqdq  xmm3, xmm3

    movdqa      k0k1, xmm0
    movdqa      k2k3, xmm1
    pshufd      xmm5, xmm5, 0               ;64 in every word
    movdqa      k4k5, xmm2
    movdqa      k6k7, xmm3
    movdqa      krd, xmm5

    movsxd      rdx, DWORD PTR arg(1)       ;pixels_per_line

%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)        ;out_pitch
%endif
    mov         rax, rsi
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    add         rax, rdx                    ;rax = src_ptr + pitch

    lea         rbx, [rdx + rdx*4]
    add         rbx, rdx                    ;pitch * 6

.loop:
    movq        xmm0, [rsi]                 ;A
    movq        xmm1, [rsi + rdx]           ;B
    movq        xmm2, [rsi + rdx * 2]       ;C
    movq        xmm3, [rax + rdx * 2]       ;D
    movq        xmm4, [rsi + rdx * 4]       ;E
    movq        xmm5, [rax + rdx * 4]       ;F

    punpcklbw   xmm0, xmm1                  ;A B
    punpcklbw   xmm2, xmm3                  ;C D
    punpcklbw   xmm4, xmm5                  ;E F

    movq        xmm6, [rsi + rbx]           ;G
    movq        xmm7, [rax + rbx]           ;H

    pmaddubsw   xmm0, k0k1
    pmaddubsw   xmm2, k2k3
    punpcklbw   xmm6, xmm7                  ;G H
    pmaddubsw   xmm4, k4k5
    pmaddubsw   xmm6, k6k7

    ; Saturating adds: outer pair, then min(CD, EF), then max(CD, EF).
    paddsw      xmm0, xmm6
    movdqa      xmm1, xmm2
    pmaxsw      xmm2, xmm4                  ;max(CD, EF)
    pminsw      xmm4, xmm1                  ;min(CD, EF)
    paddsw      xmm0, xmm4
    paddsw      xmm0, xmm2

    paddsw      xmm0, krd                   ;+64
    psraw       xmm0, 7                     ;>> 7
    packuswb    xmm0, xmm0                  ;clamp to 0..255

    add         rsi,  rdx
    add         rax,  rdx
%if %1
    movq        xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif
    movq        [rdi], xmm0

%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)       ;out_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .loop
%endm

;-----------------------------------------------------------------------------
; VERTx16 avg
;   Same as VERTx8 but 16 pixels wide.  Each row is produced as two
;   8-pixel halves (columns 0..7, then columns 8..15 at byte offset +8);
;   both halves use the same accumulation order so that they saturate
;   identically.
;-----------------------------------------------------------------------------
%macro VERTx16 1
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, 0x0400040              ;two words of 64 (rounding term)

    movdqa      xmm4, [rdx]                 ;load filters
    movq        xmm5, rcx
    packsswb    xmm4, xmm4                  ;narrow the 16-bit taps to bytes
    pshuflw     xmm0, xmm4, 0b              ;k0_k1
    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7

    punpcklqdq  xmm0, xmm0                  ;replicate each pair to 128 bits
    punpcklqdq  xmm1, xmm1
    punpcklqdq  xmm2, xmm2
    punpcklqdq  xmm3, xmm3

    movdqa      k0k1, xmm0
    movdqa      k2k3, xmm1
    pshufd      xmm5, xmm5, 0               ;64 in every word
    movdqa      k4k5, xmm2
    movdqa      k6k7, xmm3
    movdqa      krd, xmm5

    movsxd      rdx, DWORD PTR arg(1)       ;pixels_per_line

%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)        ;out_pitch
%endif
    mov         rax, rsi
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    add         rax, rdx                    ;rax = src_ptr + pitch

    lea         rbx, [rdx + rdx*4]
    add         rbx, rdx                    ;pitch * 6

.loop:
    ; ---- columns 0..7 ----
    movq        xmm0, [rsi]                 ;A
    movq        xmm1, [rsi + rdx]           ;B
    movq        xmm2, [rsi + rdx * 2]       ;C
    movq        xmm3, [rax + rdx * 2]       ;D
    movq        xmm4, [rsi + rdx * 4]       ;E
    movq        xmm5, [rax + rdx * 4]       ;F

    punpcklbw   xmm0, xmm1                  ;A B
    punpcklbw   xmm2, xmm3                  ;C D
    punpcklbw   xmm4, xmm5                  ;E F

    movq        xmm6, [rsi + rbx]           ;G
    movq        xmm7, [rax + rbx]           ;H

    pmaddubsw   xmm0, k0k1
    pmaddubsw   xmm2, k2k3
    punpcklbw   xmm6, xmm7                  ;G H
    pmaddubsw   xmm4, k4k5
    pmaddubsw   xmm6, k6k7

    ; Saturating adds: outer pair, then min(CD, EF), then max(CD, EF).
    paddsw      xmm0, xmm6
    movdqa      xmm1, xmm2
    pmaxsw      xmm2, xmm4                  ;max(CD, EF)
    pminsw      xmm4, xmm1                  ;min(CD, EF)
    paddsw      xmm0, xmm4
    paddsw      xmm0, xmm2

    paddsw      xmm0, krd                   ;+64
    psraw       xmm0, 7                     ;>> 7
    packuswb    xmm0, xmm0                  ;clamp to 0..255
%if %1
    movq        xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif
    movq        [rdi], xmm0

    ; ---- columns 8..15 ----
    movq        xmm0, [rsi + 8]             ;A
    movq        xmm1, [rsi + rdx + 8]       ;B
    movq        xmm2, [rsi + rdx * 2 + 8]   ;C
    movq        xmm3, [rax + rdx * 2 + 8]   ;D
    movq        xmm4, [rsi + rdx * 4 + 8]   ;E
    movq        xmm5, [rax + rdx * 4 + 8]   ;F

    punpcklbw   xmm0, xmm1                  ;A B
    punpcklbw   xmm2, xmm3                  ;C D
    punpcklbw   xmm4, xmm5                  ;E F

    movq        xmm6, [rsi + rbx + 8]       ;G
    movq        xmm7, [rax + rbx + 8]       ;H
    punpcklbw   xmm6, xmm7                  ;G H

    pmaddubsw   xmm0, k0k1
    pmaddubsw   xmm2, k2k3
    pmaddubsw   xmm4, k4k5
    pmaddubsw   xmm6, k6k7

    ; Same order as columns 0..7 (xmm1 is free once A B has been formed).
    paddsw      xmm0, xmm6
    movdqa      xmm1, xmm2
    pmaxsw      xmm2, xmm4                  ;max(CD, EF)
    pminsw      xmm4, xmm1                  ;min(CD, EF)
    paddsw      xmm0, xmm4
    paddsw      xmm0, xmm2

    paddsw      xmm0, krd                   ;+64
    psraw       xmm0, 7                     ;>> 7
    packuswb    xmm0, xmm0                  ;clamp to 0..255

    add         rsi,  rdx
    add         rax,  rdx
%if %1
    movq        xmm1, [rdi+8]
    pavgb       xmm0, xmm1
%endif

    movq        [rdi+8], xmm0

%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)       ;out_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .loop
%endm

;void vp9_filter_block1d4_v8_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    short *filter
;)
global sym(vp9_filter_block1d4_v8_ssse3) PRIVATE
sym(vp9_filter_block1d4_v8_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ; Five aligned 16-byte slots for the tap pairs and the rounding term.
    ALIGN_STACK 16, rax
    sub         rsp, 16*5
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd [rsp + 16*4]

    VERTx4 0

    add rsp, 16*5
    pop rsp
    pop rbx
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp9_filter_block1d8_v8_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    short *filter
;)
global sym(vp9_filter_block1d8_v8_ssse3) PRIVATE
sym(vp9_filter_block1d8_v8_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16*5
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd [rsp + 16*4]

    VERTx8 0

    add rsp, 16*5
    pop rsp
    pop rbx
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp9_filter_block1d16_v8_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    short *filter
;)
global sym(vp9_filter_block1d16_v8_ssse3) PRIVATE
sym(vp9_filter_block1d16_v8_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16*5
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd [rsp + 16*4]

    VERTx16 0

    add rsp, 16*5
    pop rsp
    pop rbx
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
; The _avg entry points below take the same six arguments as the functions
; above; they expand the filter macro with %1 = 1, which averages (pavgb)
; the filtered pixels with the bytes already stored at output_ptr.

global sym(vp9_filter_block1d4_v8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d4_v8_avg_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16*5
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd [rsp + 16*4]

    VERTx4 1

    add rsp, 16*5
    pop rsp
    pop rbx
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

global sym(vp9_filter_block1d8_v8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d8_v8_avg_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16*5
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd [rsp + 16*4]

    VERTx8 1

    add rsp, 16*5
    pop rsp
    pop rbx
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

global sym(vp9_filter_block1d16_v8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d16_v8_avg_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16*5
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd [rsp + 16*4]

    VERTx16 1

    add rsp, 16*5
    pop rsp
    pop rbx
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

;-----------------------------------------------------------------------------
; HORIZx4_ROW src, tmp
;   Filters one row of 4 pixels.
;     %1 : in  = 16 source bytes starting at src - 3
;          out = 4 filtered pixels in the low 4 bytes
;     %2 : scratch
;   Also overwrites xmm4, xmm5 and xmm6.
;   Needs k0k1k4k5, k2k3k6k7 and krd to be defined by the expanding function.
;-----------------------------------------------------------------------------
%macro HORIZx4_ROW 2
    movdqa      %2,   %1
    pshufb      %1,   [GLOBAL(shuf_t0t1)]
    pshufb      %2,   [GLOBAL(shuf_t2t3)]
    pmaddubsw   %1,   k0k1k4k5              ;low: k0k1 sums, high: k4k5 sums
    pmaddubsw   %2,   k2k3k6k7              ;low: k2k3 sums, high: k6k7 sums

    movdqa      xmm4, %1
    movdqa      xmm5, %2
    psrldq      %1,   8                     ;k4k5 sums down to the low half
    psrldq      %2,   8                     ;k6k7 sums down to the low half
    movdqa      xmm6, xmm5

    ; Saturating adds: outer pair, then min(k2k3, k4k5), then max of them.
    paddsw      xmm4, %2
    pmaxsw      xmm5, %1
    pminsw      %1, xmm6
    paddsw      %1, xmm4
    paddsw      %1, xmm5

    paddsw      %1,   krd                   ;+64
    psraw       %1,   7                     ;>> 7
    packuswb    %1,   %1                    ;clamp to 0..255
%endm

;-----------------------------------------------------------------------------
; HORIZx4 avg
;   8-tap horizontal filter, 4 pixels wide.  Rows are processed two at a
;   time; a trailing single row is handled when output_height is odd.
;     %1 = 0 : store the filtered pixels
;     %1 = 1 : pavgb them with the bytes already at output_ptr
;   Source bytes are read from src_ptr - 3 onwards (16 bytes per row).
;   Uses rax, rcx, rdx, rsi, rdi and xmm0-xmm7.
;-----------------------------------------------------------------------------
%macro HORIZx4 1
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, 0x0400040              ;two words of 64 (rounding term)

    movdqa      xmm4, [rdx]                 ;load filters
    movq        xmm5, rcx
    packsswb    xmm4, xmm4                  ;narrow the 16-bit taps to bytes
    pshuflw     xmm6, xmm4, 0b              ;k0_k1
    pshufhw     xmm6, xmm6, 10101010b       ;k0_k1_k4_k5
    pshuflw     xmm7, xmm4, 01010101b       ;k2_k3
    pshufhw     xmm7, xmm7, 11111111b       ;k2_k3_k6_k7
    pshufd      xmm5, xmm5, 0               ;rounding

    movdqa      k0k1k4k5, xmm6
    movdqa      k2k3k6k7, xmm7
    movdqa      krd, xmm5

    movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
    movsxd      rdx, dword ptr arg(3)       ;output_pitch
    movsxd      rcx, dword ptr arg(4)       ;output_height
    shr         rcx, 1                      ;number of row pairs
    jz          .last_row                   ;no full pair: skip the pair loop
.loop:
    ;Do two rows once
    movq        xmm0, [rsi - 3]             ;load src
    movq        xmm1, [rsi + 5]
    movq        xmm2, [rsi + rax - 3]
    movq        xmm3, [rsi + rax + 5]
    punpcklqdq  xmm0, xmm1
    punpcklqdq  xmm2, xmm3

    HORIZx4_ROW xmm0, xmm1
    HORIZx4_ROW xmm2, xmm3
%if %1
    movd        xmm1, [rdi]
    pavgb       xmm0, xmm1
    movd        xmm3, [rdi + rdx]
    pavgb       xmm2, xmm3
%endif
    movd        [rdi], xmm0
    movd        [rdi +rdx], xmm2

    lea         rsi, [rsi + rax]
    prefetcht0  [rsi + 4 * rax - 3]
    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + 2 * rdx]
    prefetcht0  [rsi + 2 * rax - 3]

    dec         rcx
    jnz         .loop

.last_row:
    ; Do last row if output_height is odd
    movsxd      rcx, dword ptr arg(4)       ;output_height
    and         rcx, 1
    je          .done

    movq        xmm0, [rsi - 3]             ; load src
    movq        xmm1, [rsi + 5]
    punpcklqdq  xmm0, xmm1

    HORIZx4_ROW xmm0, xmm1
%if %1
    movd        xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif
    movd        [rdi], xmm0
.done:
%endm

;-----------------------------------------------------------------------------
; HORIZx8_ROW src, tmp1, tmp2, tmp3
;   Filters one row of 8 pixels.
;     %1      : in  = 16 source bytes starting at src - 3
;               out = 8 filtered pixels in the low 8 bytes
;     %2..%4  : scratch
;   Needs k0k1, k2k3, k4k5, k6k7 and krd to be defined by the expanding
;   function.
;-----------------------------------------------------------------------------
%macro HORIZx8_ROW 4
    movdqa      %2,   %1
    movdqa      %3,   %1
    movdqa      %4,   %1

    pshufb      %1,   [GLOBAL(shuf_t0t1)]
    pshufb      %2,   [GLOBAL(shuf_t2t3)]
    pshufb      %3,   [GLOBAL(shuf_t4t5)]
    pshufb      %4,   [GLOBAL(shuf_t6t7)]

    pmaddubsw   %1,   k0k1
    pmaddubsw   %2,   k2k3
    pmaddubsw   %3,   k4k5
    pmaddubsw   %4,   k6k7

    ; Saturating adds: outer pair, then min(k2k3, k4k5), then max of them.
    paddsw      %1,   %4
    movdqa      %4,   %2
    pmaxsw      %2,   %3
    pminsw      %3,   %4
    paddsw      %1,   %3
    paddsw      %1,   %2

    paddsw      %1,   krd                   ;+64
    psraw       %1,   7                     ;>> 7
    packuswb    %1,   %1                    ;clamp to 0..255
%endm

;-----------------------------------------------------------------------------
; HORIZx8 avg
;   8-tap horizontal filter, 8 pixels wide.  Rows are processed two at a
;   time; a trailing single row is handled when output_height is odd.
;     %1 = 0 : store the filtered pixels
;     %1 = 1 : pavgb them with the bytes already at output_ptr
;   Source bytes are read from src_ptr - 3 onwards (16 bytes per row).
;   Uses rax, rcx, rdx, rsi, rdi and xmm0-xmm7.
;-----------------------------------------------------------------------------
%macro HORIZx8 1
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, 0x0400040              ;two words of 64 (rounding term)

    movdqa      xmm4, [rdx]                 ;load filters
    movd        xmm5, rcx
    packsswb    xmm4, xmm4                  ;narrow the 16-bit taps to bytes
    pshuflw     xmm0, xmm4, 0b              ;k0_k1
    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7

    punpcklqdq  xmm0, xmm0                  ;replicate each pair to 128 bits
    punpcklqdq  xmm1, xmm1
    punpcklqdq  xmm2, xmm2
    punpcklqdq  xmm3, xmm3

    movdqa      k0k1, xmm0
    movdqa      k2k3, xmm1
    pshufd      xmm5, xmm5, 0               ;64 in every word
    movdqa      k4k5, xmm2
    movdqa      k6k7, xmm3
    movdqa      krd, xmm5

    movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
    movsxd      rdx, dword ptr arg(3)       ;output_pitch
    movsxd      rcx, dword ptr arg(4)       ;output_height
    shr         rcx, 1                      ;number of row pairs
    jz          .last_row                   ;no full pair: skip the pair loop

.loop:
    movq        xmm0, [rsi - 3]             ;load src
    movq        xmm3, [rsi + 5]
    movq        xmm4, [rsi + rax - 3]
    movq        xmm7, [rsi + rax + 5]
    punpcklqdq  xmm0, xmm3
    punpcklqdq  xmm4, xmm7

    HORIZx8_ROW xmm0, xmm1, xmm2, xmm3
    HORIZx8_ROW xmm4, xmm5, xmm6, xmm7
%if %1
    movq        xmm1, [rdi]
    movq        xmm2, [rdi + rdx]
    pavgb       xmm0, xmm1
    pavgb       xmm4, xmm2
%endif
    movq        [rdi], xmm0
    movq        [rdi + rdx], xmm4

    lea         rsi, [rsi + rax]
    prefetcht0  [rsi + 4 * rax - 3]
    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + 2 * rdx]
    prefetcht0  [rsi + 2 * rax - 3]
    dec         rcx
    jnz         .loop

.last_row:
    ;Do last row if output_height is odd
    movsxd      rcx, dword ptr arg(4)       ;output_height
    and         rcx, 1
    je          .done

    movq        xmm0, [rsi - 3]
    movq        xmm3, [rsi + 5]
    punpcklqdq  xmm0, xmm3

    HORIZx8_ROW xmm0, xmm1, xmm2, xmm3
%if %1
    movq        xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif
    movq        [rdi], xmm0
.done:
%endm

;-----------------------------------------------------------------------------
; HORIZx16 avg
;   8-tap horizontal filter, 16 pixels wide, one row per iteration.
;     %1 = 0 : store the filtered pixels
;     %1 = 1 : pavgb them with the bytes already at output_ptr
;   Source bytes are read from src_ptr - 3 up to src_ptr + 20 for each row.
;   The destination is accessed with movdqa, so every output row address
;   must be 16-byte aligned.
;   Uses rax, rcx, rdx, rsi, rdi and xmm0-xmm7.
;   The row loop is dec/jnz, so output_height must be >= 1.
;-----------------------------------------------------------------------------
%macro HORIZx16 1
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, 0x0400040              ;two words of 64 (rounding term)

    movdqa      xmm4, [rdx]                 ;load filters
    movq        xmm5, rcx
    packsswb    xmm4, xmm4                  ;narrow the 16-bit taps to bytes
    pshuflw     xmm0, xmm4, 0b              ;k0_k1
    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7

    punpcklqdq  xmm0, xmm0                  ;replicate each pair to 128 bits
    punpcklqdq  xmm1, xmm1
    punpcklqdq  xmm2, xmm2
    punpcklqdq  xmm3, xmm3

    movdqa      k0k1, xmm0
    movdqa      k2k3, xmm1
    pshufd      xmm5, xmm5, 0               ;64 in every word
    movdqa      k4k5, xmm2
    movdqa      k6k7, xmm3
    movdqa      krd, xmm5

    movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
    movsxd      rdx, dword ptr arg(3)       ;output_pitch
    movsxd      rcx, dword ptr arg(4)       ;output_height

.loop:
    prefetcht0  [rsi + 2 * rax -3]

    movq        xmm0,   [rsi - 3]           ;load src data
    movq        xmm4,   [rsi + 5]
    movq        xmm7,   [rsi + 13]
    punpcklqdq  xmm0,   xmm4                ;bytes -3..12  (pixels 0..7)
    punpcklqdq  xmm4,   xmm7                ;bytes  5..20  (pixels 8..15)

    movdqa      xmm1,   xmm0
    movdqa      xmm2,   xmm0
    movdqa      xmm3,   xmm0
    movdqa      xmm5,   xmm4
    movdqa      xmm6,   xmm4
    movdqa      xmm7,   xmm4

    pshufb      xmm0,   [GLOBAL(shuf_t0t1)]
    pshufb      xmm1,   [GLOBAL(shuf_t2t3)]
    pshufb      xmm2,   [GLOBAL(shuf_t4t5)]
    pshufb      xmm3,   [GLOBAL(shuf_t6t7)]
    pshufb      xmm4,   [GLOBAL(shuf_t0t1)]
    pshufb      xmm5,   [GLOBAL(shuf_t2t3)]
    pshufb      xmm6,   [GLOBAL(shuf_t4t5)]
    pshufb      xmm7,   [GLOBAL(shuf_t6t7)]

    pmaddubsw   xmm0,   k0k1
    pmaddubsw   xmm1,   k2k3
    pmaddubsw   xmm2,   k4k5
    pmaddubsw   xmm3,   k6k7
    pmaddubsw   xmm4,   k0k1
    pmaddubsw   xmm5,   k2k3
    pmaddubsw   xmm6,   k4k5
    pmaddubsw   xmm7,   k6k7

    ; Saturating adds: outer pair, then min(k2k3, k4k5), then max of them.
    paddsw      xmm0,   xmm3
    movdqa      xmm3,   xmm1
    pmaxsw      xmm1,   xmm2
    pminsw      xmm2,   xmm3
    paddsw      xmm0,   xmm2
    paddsw      xmm0,   xmm1

    paddsw      xmm4,   xmm7
    movdqa      xmm7,   xmm5
    pmaxsw      xmm5,   xmm6
    pminsw      xmm6,   xmm7
    paddsw      xmm4,   xmm6
    paddsw      xmm4,   xmm5

    paddsw      xmm0,   krd                 ;+64
    paddsw      xmm4,   krd
    psraw       xmm0,   7                   ;>> 7
    psraw       xmm4,   7
    packuswb    xmm0,   xmm0                ;clamp to 0..255
    packuswb    xmm4,   xmm4
    punpcklqdq  xmm0,   xmm4                ;pixels 0..7 | pixels 8..15
%if %1
    movdqa      xmm1,   [rdi]
    pavgb       xmm0,   xmm1
%endif

    lea         rsi,    [rsi + rax]
    movdqa      [rdi],  xmm0

    lea         rdi,    [rdi + rdx]
    dec         rcx
    jnz         .loop
%endm

;void vp9_filter_block1d4_h8_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    short *filter
;)
global sym(vp9_filter_block1d4_h8_ssse3) PRIVATE
sym(vp9_filter_block1d4_h8_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; Three aligned 16-byte slots: two packed tap vectors and the rounding term.
    ALIGN_STACK 16, rax
    sub         rsp, 16 * 3
    %define k0k1k4k5 [rsp + 16 * 0]
    %define k2k3k6k7 [rsp + 16 * 1]
    %define krd      [rsp + 16 * 2]

    HORIZx4 0

    add rsp, 16 * 3
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp9_filter_block1d8_h8_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    short *filter
;)
global sym(vp9_filter_block1d8_h8_ssse3) PRIVATE
sym(vp9_filter_block1d8_h8_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16*5
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd [rsp + 16*4]

    HORIZx8 0

    add rsp, 16*5
    pop rsp

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp9_filter_block1d16_h8_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    short *filter
;)
global sym(vp9_filter_block1d16_h8_ssse3) PRIVATE
sym(vp9_filter_block1d16_h8_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16*5
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd [rsp + 16*4]

    HORIZx16 0

    add rsp, 16*5
    pop rsp

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; The _avg entry points below take the same six arguments as the h8
; functions above; they expand the filter macro with %1 = 1, which averages
; (pavgb) the filtered pixels with the bytes already stored at output_ptr.

global sym(vp9_filter_block1d4_h8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d4_h8_avg_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 3
    %define k0k1k4k5 [rsp + 16 * 0]
    %define k2k3k6k7 [rsp + 16 * 1]
    %define krd      [rsp + 16 * 2]

    HORIZx4 1

    add rsp, 16 * 3
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

global sym(vp9_filter_block1d8_h8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d8_h8_avg_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16*5
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd [rsp + 16*4]

    HORIZx8 1

    add rsp, 16*5
    pop rsp

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

global sym(vp9_filter_block1d16_h8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d16_h8_avg_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16*5
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd [rsp + 16*4]

    HORIZx16 1

    add rsp, 16*5
    pop rsp

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; pshufb masks: each one turns 16 consecutive source bytes into eight
; adjacent byte pairs (i, i+1), starting at byte 0, 2, 4 or 6, ready to be
; multiplied by the matching tap pair with pmaddubsw.
SECTION_RODATA
align 16
shuf_t0t1:
    db  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
align 16
shuf_t2t3:
    db  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
align 16
shuf_t4t5:
    db  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
align 16
shuf_t6t7:
    db  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14