;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;Note: tap3 and tap4 have to be applied and added after other taps to avoid
;overflow.

;-----------------------------------------------------------------------------
; Overview
;
; SSE2 8-tap filters for 4-, 8- and 16-pixel wide blocks, in vertical (v8)
; and horizontal (h8) flavours, each with a plain and an "avg" variant.
;
; Per output pixel every kernel evaluates, on 16-bit lanes,
;     (k0*p0 + k1*p1 + ... + k7*p7 + 64) >> 7
; using pmullw / paddsw (signed saturating adds), an arithmetic shift
; (psraw) and an unsigned-saturating pack to bytes (packuswb).  The "avg"
; variants additionally pavgb the result with the bytes already present
; in the destination before storing.
;
; All kernels read their arguments through arg(0)..arg(5):
;     arg(0) src_ptr, arg(1) src pitch, arg(2) output_ptr,
;     arg(3) output pitch, arg(4) output_height, arg(5) filter (8 shorts)
; The filter is fetched with movdqa, so it has to be 16-byte aligned.
;
; sym(), arg(), PRIVATE, SHADOW_ARGS_TO_STACK, UNSHADOW_ARGS, SAVE_XMM,
; RESTORE_XMM and ALIGN_STACK come from vpx_ports/x86_abi_support.asm.
;-----------------------------------------------------------------------------

;Bytes of 16-byte aligned stack locals used by the 4-wide / 8(16)-wide kernels.
%define LOCALS_SIZE_4   (16 * 6)
%define LOCALS_SIZE_8   (16 * 10)

;Names the stack slots used by GET_FILTERS_4 / APPLY_FILTER_4.
;Each kXkY slot holds tap X in its low 4 words and tap Y in its high 4 words.
;krd and zero are re-pointed here because the 8-wide layout places them
;at different offsets.
%macro DEFINE_LOCALS_4 0
    %define k0k1 [rsp + 16 * 0]
    %define k2k3 [rsp + 16 * 1]
    %define k5k4 [rsp + 16 * 2]
    %define k6k7 [rsp + 16 * 3]
    %define krd  [rsp + 16 * 4]
    %define zero [rsp + 16 * 5]
%endm

;Names the stack slots used by GET_FILTERS / APPLY_FILTER_8.
;Each kN slot holds tap N replicated into all 8 words.
%macro DEFINE_LOCALS_8 0
    %define k0   [rsp + 16 * 0]
    %define k1   [rsp + 16 * 1]
    %define k2   [rsp + 16 * 2]
    %define k3   [rsp + 16 * 3]
    %define k4   [rsp + 16 * 4]
    %define k5   [rsp + 16 * 5]
    %define k6   [rsp + 16 * 6]
    %define k7   [rsp + 16 * 7]
    %define krd  [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]
%endm

;Common function entry.
;  %1 - non-zero to also preserve rbx (the vertical kernels keep out_pitch
;       in rbx)
;  %2 - number of bytes of stack locals to reserve after aligning rsp
;Clobbers rax (scratch register handed to ALIGN_STACK).
%macro FILTER_PROLOG 2
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
%if %1
    push        rbx
%endif
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, %2
%endm

;Common function exit; arguments must match the FILTER_PROLOG invocation.
;"pop rsp" undoes ALIGN_STACK (presumably it reloads the rsp value that
;ALIGN_STACK pushed -- see x86_abi_support.asm).
%macro FILTER_EPILOG 2
    add         rsp, %2
    pop         rsp
%if %1
    pop         rbx
%endif
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%endm

;Loads the 8 taps from arg(5) and stores them, paired two per register,
;into k0k1 / k2k3 / k5k4 / k6k7; also initialises krd and zero.
;Requires DEFINE_LOCALS_4 and the matching stack reservation.
;Clobbers rdx, rcx, xmm0-xmm7.
%macro GET_FILTERS_4 0
    mov         rdx, arg(5)                 ;filter ptr
    mov         rcx, 0x0400040              ;two words of 0x0040 (= 64)

    movdqa      xmm7, [rdx]                 ;load filters
    pshuflw     xmm0, xmm7, 0b              ;k0
    pshuflw     xmm1, xmm7, 01010101b       ;k1
    pshuflw     xmm2, xmm7, 10101010b       ;k2
    pshuflw     xmm3, xmm7, 11111111b       ;k3
    psrldq      xmm7, 8                     ;move taps 4..7 into the low half
    pshuflw     xmm4, xmm7, 0b              ;k4
    pshuflw     xmm5, xmm7, 01010101b       ;k5
    pshuflw     xmm6, xmm7, 10101010b       ;k6
    pshuflw     xmm7, xmm7, 11111111b       ;k7

    punpcklqdq  xmm0, xmm1                  ;low qword = k0, high qword = k1
    punpcklqdq  xmm2, xmm3                  ;k2 | k3
    punpcklqdq  xmm5, xmm4                  ;k5 | k4
    punpcklqdq  xmm6, xmm7                  ;k6 | k7

    movdqa      k0k1, xmm0
    movdqa      k2k3, xmm2
    movdqa      k5k4, xmm5
    movdqa      k6k7, xmm6

    movq        xmm6, rcx
    pshufd      xmm6, xmm6, 0               ;64 in every word
    movdqa      krd, xmm6                   ;rounding term for the >> 7

    pxor        xmm7, xmm7
    movdqa      zero, xmm7
%endm

;Filters 4 pixels and stores 4 bytes at [rdi].
;  %1 - non-zero: average the result with the 4 bytes already at [rdi]
;In:  xmmN holds the 4 source bytes for tap N in its low dword (N = 0..7).
;Clobbers xmm0, xmm1, xmm2, xmm5, xmm6.
;Taps are paired so that one pmullw handles two taps; tap3 and tap4 are
;accumulated last (see the note at the top of the file).
%macro APPLY_FILTER_4 1
    punpckldq   xmm0, xmm1                  ;two row in one register
    punpckldq   xmm6, xmm7
    punpckldq   xmm2, xmm3
    punpckldq   xmm5, xmm4

    punpcklbw   xmm0, zero                  ;unpack to word
    punpcklbw   xmm6, zero
    punpcklbw   xmm2, zero
    punpcklbw   xmm5, zero

    pmullw      xmm0, k0k1                  ;multiply the filter factors
    pmullw      xmm6, k6k7
    pmullw      xmm2, k2k3
    pmullw      xmm5, k5k4

    paddsw      xmm0, xmm6                  ;sum: low = t0 + t6, high = t1 + t7
    movdqa      xmm1, xmm0
    psrldq      xmm1, 8
    paddsw      xmm0, xmm1                  ;low = t0 + t1 + t6 + t7
    paddsw      xmm0, xmm2                  ;+ t2
    psrldq      xmm2, 8                     ;bring t3 into the low half
    paddsw      xmm0, xmm5                  ;+ t5
    psrldq      xmm5, 8                     ;bring t4 into the low half
    paddsw      xmm0, xmm2                  ;+ t3
    paddsw      xmm0, xmm5                  ;+ t4

    paddsw      xmm0, krd                   ;rounding
    psraw       xmm0, 7                     ;shift
    packuswb    xmm0, xmm0                  ;pack to byte

%if %1
    movd        xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif
    movd        [rdi], xmm0                 ;only the low 4 bytes are stored
%endm

;Loads src/dst pointers (rsi = arg(0), rdi = arg(2)) and the 8 taps from
;arg(5); stores each tap replicated across 8 words into k0..k7 and
;initialises krd and zero.
;Requires DEFINE_LOCALS_8 and the matching stack reservation.
;Clobbers rdx, rcx, rsi, rdi, xmm0-xmm7.
%macro GET_FILTERS 0
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, 0x0400040              ;two words of 0x0040 (= 64)

    movdqa      xmm7, [rdx]                 ;load filters
    pshuflw     xmm0, xmm7, 0b              ;k0
    pshuflw     xmm1, xmm7, 01010101b       ;k1
    pshuflw     xmm2, xmm7, 10101010b       ;k2
    pshuflw     xmm3, xmm7, 11111111b       ;k3
    pshufhw     xmm4, xmm7, 0b              ;k4
    pshufhw     xmm5, xmm7, 01010101b       ;k5
    pshufhw     xmm6, xmm7, 10101010b       ;k6
    pshufhw     xmm7, xmm7, 11111111b       ;k7

    punpcklwd   xmm0, xmm0                  ;spread the low-half copies ...
    punpcklwd   xmm1, xmm1
    punpcklwd   xmm2, xmm2
    punpcklwd   xmm3, xmm3
    punpckhwd   xmm4, xmm4                  ;... and the high-half copies
    punpckhwd   xmm5, xmm5                  ;    over all 8 words
    punpckhwd   xmm6, xmm6
    punpckhwd   xmm7, xmm7

    movdqa      k0, xmm0                    ;store filter factors on stack
    movdqa      k1, xmm1
    movdqa      k2, xmm2
    movdqa      k3, xmm3
    movdqa      k4, xmm4
    movdqa      k5, xmm5
    movdqa      k6, xmm6
    movdqa      k7, xmm7

    movq        xmm6, rcx
    pshufd      xmm6, xmm6, 0               ;64 in every word
    movdqa      krd, xmm6                   ;rounding

    pxor        xmm7, xmm7
    movdqa      zero, xmm7
%endm

;Loads 4 bytes from each of 8 consecutive rows into xmm0..xmm7 (xmmN = row N).
;In:  rsi = row 0, rax = src pitch, rdx = 3 * src pitch.
;Out: rsi advanced by one row, i.e. already positioned for the next output row.
%macro LOAD_VERT_4 0
    movd        xmm0, [rsi]                 ;load src: row 0
    movd        xmm1, [rsi + rax]           ;1
    movd        xmm6, [rsi + rdx * 2]       ;6
    lea         rsi,  [rsi + rax]           ;rsi -> row 1
    movd        xmm7, [rsi + rdx * 2]       ;7
    movd        xmm2, [rsi + rax]           ;2
    movd        xmm3, [rsi + rax * 2]       ;3
    movd        xmm4, [rsi + rdx]           ;4
    movd        xmm5, [rsi + rax * 4]       ;5
%endm

;Loads 8 bytes from each of 8 consecutive rows into xmm0..xmm7 (xmmN = row N).
;  %1 - byte offset of the column group within the row
;In:  rsi = row 0, rax = src pitch, rdx = 3 * src pitch.
;Out: rsi advanced by one row.
%macro LOAD_VERT_8 1
    movq        xmm0, [rsi + %1]            ;0
    movq        xmm1, [rsi + rax + %1]      ;1
    movq        xmm6, [rsi + rdx * 2 + %1]  ;6
    lea         rsi,  [rsi + rax]           ;rsi -> row 1
    movq        xmm7, [rsi + rdx * 2 + %1]  ;7
    movq        xmm2, [rsi + rax + %1]      ;2
    movq        xmm3, [rsi + rax * 2 + %1]  ;3
    movq        xmm4, [rsi + rdx + %1]      ;4
    movq        xmm5, [rsi + rax * 4 + %1]  ;5
%endm

;Prepares the horizontal taps: after this macro the low bytes of xmmN hold
;the source pixels seen by tap N (N = 0..7).
;  %1 - byte offset of the column group within the row
;Reads 16 bytes starting at rsi + %1 - 3 with one unaligned load, so the
;source row must be readable over that whole range.
%macro LOAD_HORIZ_TAPS 1
    movdqu      xmm0, [rsi + %1 - 3]        ;load src

    movdqa      xmm1, xmm0
    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm0
    movdqa      xmm4, xmm0
    movdqa      xmm5, xmm0
    movdqa      xmm6, xmm0
    movdqa      xmm7, xmm0

    psrldq      xmm1, 1
    psrldq      xmm2, 2
    psrldq      xmm3, 3
    psrldq      xmm4, 4
    psrldq      xmm5, 5
    psrldq      xmm6, 6
    psrldq      xmm7, 7
%endm

;Filters 8 pixels and stores 8 bytes at [rdi + %2].
;  %1 - non-zero: average the result with the 8 bytes already at [rdi + %2]
;  %2 - byte offset into the destination row
;In:  xmmN holds the 8 source bytes for tap N in its low qword (N = 0..7).
;Clobbers xmm0-xmm7.
;The accumulation order (tap3 and tap4 last) is deliberate -- see the note
;at the top of the file.
%macro APPLY_FILTER_8 2
    punpcklbw   xmm0, zero                  ;unpack to word
    punpcklbw   xmm1, zero
    punpcklbw   xmm6, zero
    punpcklbw   xmm7, zero
    punpcklbw   xmm2, zero
    punpcklbw   xmm5, zero
    punpcklbw   xmm3, zero
    punpcklbw   xmm4, zero

    pmullw      xmm0, k0                    ;multiply the filter factors
    pmullw      xmm1, k1
    pmullw      xmm6, k6
    pmullw      xmm7, k7
    pmullw      xmm2, k2
    pmullw      xmm5, k5
    pmullw      xmm3, k3
    pmullw      xmm4, k4

    paddsw      xmm0, xmm1                  ;sum, outer taps first
    paddsw      xmm0, xmm6
    paddsw      xmm0, xmm7
    paddsw      xmm0, xmm2
    paddsw      xmm0, xmm5
    paddsw      xmm0, xmm3                  ;tap3 ...
    paddsw      xmm0, xmm4                  ;... and tap4 go last

    paddsw      xmm0, krd                   ;rounding
    psraw       xmm0, 7                     ;shift
    packuswb    xmm0, xmm0                  ;pack back to byte
%if %1
    movq        xmm1, [rdi + %2]
    pavgb       xmm0, xmm1
%endif
    movq        [rdi + %2], xmm0
%endm

;Complete body (prolog through ret) of a 4-wide vertical filter.
;  %1 - non-zero for the avg variant
;Loop registers: rsi = src row 0 of the current output row, rdi = dst,
;rax = src pitch, rdx = 3 * src pitch, rbx = dst pitch, rcx = rows left.
%macro VERT_KERNEL_4 1
    FILTER_PROLOG 1, LOCALS_SIZE_4
    DEFINE_LOCALS_4

    GET_FILTERS_4

    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rdx, [rax + rax * 2]        ;3 * pixels_per_line
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

    ;The loop below tests its counter only after the body has run, so a
    ;non-positive row count has to be rejected up front.
    test        rcx, rcx
    jle         .done

.loop:
    LOAD_VERT_4
    APPLY_FILTER_4 %1

    lea         rdi, [rdi + rbx]
    dec         rcx
    jnz         .loop

.done:
    FILTER_EPILOG 1, LOCALS_SIZE_4
%endm

;Complete body (prolog through ret) of an 8- or 16-wide vertical filter.
;  %1 - non-zero for the avg variant
;  %2 - block width in pixels: 8 or 16
;Loop registers: as for VERT_KERNEL_4 (rsi/rdi are set by GET_FILTERS).
%macro VERT_KERNEL_8 2
    FILTER_PROLOG 1, LOCALS_SIZE_8
    DEFINE_LOCALS_8

    GET_FILTERS

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rdx, [rax + rax * 2]        ;3 * pixels_per_line
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

    ;The loop below tests its counter only after the body has run, so a
    ;non-positive row count has to be rejected up front.
    test        rcx, rcx
    jle         .done

.loop:
    LOAD_VERT_8 0
    APPLY_FILTER_8 %1, 0
%if %2 == 16
    sub         rsi, rax                    ;undo LOAD_VERT_8's row advance

    LOAD_VERT_8 8                           ;right 8 columns
    APPLY_FILTER_8 %1, 8
%endif

    lea         rdi, [rdi + rbx]
    dec         rcx
    jnz         .loop

.done:
    FILTER_EPILOG 1, LOCALS_SIZE_8
%endm

;Complete body (prolog through ret) of a 4-wide horizontal filter.
;  %1 - non-zero for the avg variant
;Loop registers: rsi = src, rdi = dst, rax = src pitch, rdx = dst pitch,
;rcx = rows left.
%macro HORIZ_KERNEL_4 1
    FILTER_PROLOG 0, LOCALS_SIZE_4
    DEFINE_LOCALS_4

    GET_FILTERS_4

    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

    ;The loop below tests its counter only after the body has run, so a
    ;non-positive row count has to be rejected up front.
    test        rcx, rcx
    jle         .done

.loop:
    LOAD_HORIZ_TAPS 0
    APPLY_FILTER_4 %1

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

.done:
    FILTER_EPILOG 0, LOCALS_SIZE_4
%endm

;Complete body (prolog through ret) of an 8- or 16-wide horizontal filter.
;  %1 - non-zero for the avg variant
;  %2 - block width in pixels: 8 or 16
;Loop registers: as for HORIZ_KERNEL_4 (rsi/rdi are set by GET_FILTERS).
%macro HORIZ_KERNEL_8 2
    FILTER_PROLOG 0, LOCALS_SIZE_8
    DEFINE_LOCALS_8

    GET_FILTERS

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

    ;The loop below tests its counter only after the body has run, so a
    ;non-positive row count has to be rejected up front.
    test        rcx, rcx
    jle         .done

.loop:
    LOAD_HORIZ_TAPS 0
    APPLY_FILTER_8 %1, 0
%if %2 == 16
    LOAD_HORIZ_TAPS 8                       ;right 8 columns
    APPLY_FILTER_8 %1, 8
%endif

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

.done:
    FILTER_EPILOG 0, LOCALS_SIZE_8
%endm

;void vp9_filter_block1d4_v8_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    short *filter
;)
;src_ptr addresses the row used by tap 0; rows 0..7 from there are read.
global sym(vp9_filter_block1d4_v8_sse2) PRIVATE
sym(vp9_filter_block1d4_v8_sse2):
    VERT_KERNEL_4 0

;void vp9_filter_block1d8_v8_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    short *filter
;)
global sym(vp9_filter_block1d8_v8_sse2) PRIVATE
sym(vp9_filter_block1d8_v8_sse2):
    VERT_KERNEL_8 0, 8

;void vp9_filter_block1d16_v8_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    short *filter
;)
global sym(vp9_filter_block1d16_v8_sse2) PRIVATE
sym(vp9_filter_block1d16_v8_sse2):
    VERT_KERNEL_8 0, 16

;vp9_filter_block1d4_v8_avg_sse2
;Reads the same six argument slots as vp9_filter_block1d4_v8_sse2; the
;filtered pixels are averaged (pavgb) with the existing output bytes.
global sym(vp9_filter_block1d4_v8_avg_sse2) PRIVATE
sym(vp9_filter_block1d4_v8_avg_sse2):
    VERT_KERNEL_4 1

;vp9_filter_block1d8_v8_avg_sse2
;Reads the same six argument slots as vp9_filter_block1d8_v8_sse2; the
;filtered pixels are averaged (pavgb) with the existing output bytes.
global sym(vp9_filter_block1d8_v8_avg_sse2) PRIVATE
sym(vp9_filter_block1d8_v8_avg_sse2):
    VERT_KERNEL_8 1, 8

;vp9_filter_block1d16_v8_avg_sse2
;Reads the same six argument slots as vp9_filter_block1d16_v8_sse2; the
;filtered pixels are averaged (pavgb) with the existing output bytes.
global sym(vp9_filter_block1d16_v8_avg_sse2) PRIVATE
sym(vp9_filter_block1d16_v8_avg_sse2):
    VERT_KERNEL_8 1, 16

;void vp9_filter_block1d4_h8_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    short *filter
;)
;Each row is fetched as 16 bytes starting at src_ptr - 3.
global sym(vp9_filter_block1d4_h8_sse2) PRIVATE
sym(vp9_filter_block1d4_h8_sse2):
    HORIZ_KERNEL_4 0

;void vp9_filter_block1d8_h8_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    short *filter
;)
;Each row is fetched as 16 bytes starting at src_ptr - 3.
global sym(vp9_filter_block1d8_h8_sse2) PRIVATE
sym(vp9_filter_block1d8_h8_sse2):
    HORIZ_KERNEL_8 0, 8

;void vp9_filter_block1d16_h8_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    short *filter
;)
;Each row is fetched as two 16-byte loads, at src_ptr - 3 and src_ptr + 5.
global sym(vp9_filter_block1d16_h8_sse2) PRIVATE
sym(vp9_filter_block1d16_h8_sse2):
    HORIZ_KERNEL_8 0, 16

;vp9_filter_block1d4_h8_avg_sse2
;Reads the same six argument slots as vp9_filter_block1d4_h8_sse2; the
;filtered pixels are averaged (pavgb) with the existing output bytes.
global sym(vp9_filter_block1d4_h8_avg_sse2) PRIVATE
sym(vp9_filter_block1d4_h8_avg_sse2):
    HORIZ_KERNEL_4 1

;vp9_filter_block1d8_h8_avg_sse2
;Reads the same six argument slots as vp9_filter_block1d8_h8_sse2; the
;filtered pixels are averaged (pavgb) with the existing output bytes.
global sym(vp9_filter_block1d8_h8_avg_sse2) PRIVATE
sym(vp9_filter_block1d8_h8_avg_sse2):
    HORIZ_KERNEL_8 1, 8

;vp9_filter_block1d16_h8_avg_sse2
;Reads the same six argument slots as vp9_filter_block1d16_h8_sse2; the
;filtered pixels are averaged (pavgb) with the existing output bytes.
global sym(vp9_filter_block1d16_h8_avg_sse2) PRIVATE
sym(vp9_filter_block1d16_h8_avg_sse2):
    HORIZ_KERNEL_8 1, 16