;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
; Deblocking helper macros.
;
; Both macros expect the text macro `flimit` to name a 16-byte memory
; operand holding one threshold byte per pixel.
;-----------------------------------------------------------------------

; FIRST_2_ROWS
;   In:    xmm0 = 16 centre pixels
;          xmm1, xmm3 = first pair of neighbouring pixels
;   Out:   xmm5 = pavgb(xmm1, xmm3)
;          xmm7 = byte mask, 0xff where |xmm0 - xmm1| >= flimit or
;                 |xmm0 - xmm3| >= flimit
;   Keeps: xmm0
;   Clobb: xmm1 (left zeroed), xmm2, xmm3, xmm4, xmm6
%macro FIRST_2_ROWS 0
        movdqa      xmm4,       xmm0
        movdqa      xmm6,       xmm0
        movdqa      xmm5,       xmm1
        pavgb       xmm5,       xmm3

        ;calculate absolute value
        psubusb     xmm4,       xmm1
        psubusb     xmm1,       xmm0
        psubusb     xmm6,       xmm3
        psubusb     xmm3,       xmm0
        paddusb     xmm4,       xmm1
        paddusb     xmm6,       xmm3

        ;get threshold
        movdqa      xmm2,       flimit
        pxor        xmm1,       xmm1
        movdqa      xmm7,       xmm2

        ;get mask: (flimit -sat- absdiff) == 0  <=>  absdiff >= flimit
        psubusb     xmm2,       xmm4
        psubusb     xmm7,       xmm6
        pcmpeqb     xmm2,       xmm1
        pcmpeqb     xmm7,       xmm1
        por         xmm7,       xmm2
%endmacro

; SECOND_2_ROWS
;   In:    xmm0 = 16 centre pixels
;          xmm1, xmm3 = second pair of neighbouring pixels
;          xmm5, xmm7 = average / mask produced by FIRST_2_ROWS
;   Out:   xmm0 = centre pixel where the mask is set, otherwise
;                 pavgb(pavgb(xmm5_in, pavgb(xmm1, xmm3)), xmm0)
;   Clobb: xmm1 - xmm7
%macro SECOND_2_ROWS 0
        movdqa      xmm6,       xmm0
        movdqa      xmm4,       xmm0
        movdqa      xmm2,       xmm1
        pavgb       xmm1,       xmm3

        ;calculate absolute value
        psubusb     xmm6,       xmm2
        psubusb     xmm2,       xmm0
        psubusb     xmm4,       xmm3
        psubusb     xmm3,       xmm0
        paddusb     xmm6,       xmm2
        paddusb     xmm4,       xmm3

        pavgb       xmm5,       xmm1

        ;get threshold
        movdqa      xmm2,       flimit
        pxor        xmm1,       xmm1
        movdqa      xmm3,       xmm2

        ;get mask
        psubusb     xmm2,       xmm6
        psubusb     xmm3,       xmm4
        pcmpeqb     xmm2,       xmm1
        pcmpeqb     xmm3,       xmm1

        por         xmm7,       xmm2
        por         xmm7,       xmm3

        pavgb       xmm5,       xmm0

        ;decide if or not to use filtered value
        ;(the two operands of paddusb are disjoint per byte, so it acts as OR)
        pand        xmm0,       xmm7
        pandn       xmm7,       xmm5
        paddusb     xmm0,       xmm7
%endmacro

; UPDATE_FLIMIT
;   Copies the next 16 threshold bytes from [rbx] to [rsp] and advances
;   rbx by 16.  Both accesses use movdqa, so [rbx] and rsp must be
;   16-byte aligned.
;   Clobb: xmm2, rbx
%macro UPDATE_FLIMIT 0
        movdqa      xmm2,       XMMWORD PTR [rbx]
        movdqa      [rsp],      xmm2
        add         rbx,        16
%endmacro

;-----------------------------------------------------------------------
;void vp8_post_proc_down_and_across_mb_row_sse2
;(
;    unsigned char *src_ptr,
;    unsigned char *dst_ptr,
;    int src_pixels_per_line,
;    int dst_pixels_per_line,
;    int cols,
;    int *flimits,
;    int size
;)
;
; For each of `size` rows:
;   1. vertical pass: reads src rows -2..+2 in 16-byte groups, writes the
;      filtered row to dst;
;   2. horizontal pass, in place on dst, using the pixels at -2..+2.
;      Results are written back 16 bytes late (held in mm0/mm1) so that
;      neighbours are always read unfiltered.
;
; The threshold block at [rsp] is refreshed from `flimits` every 16
; columns (UPDATE_FLIMIT) and rewound to the start of `flimits` for each
; pass and each row.
;
; Memory touched outside [dst, dst+cols): the horizontal pass writes
; 8 bytes at dst-8 and 8 bytes at dst+cols (border replication) and
; reads/re-stores dst-16..dst-1.
;
; NOTE(review): MMX registers are used and no emms is executed here;
; presumably the caller resets the x87/MMX state -- confirm.
;-----------------------------------------------------------------------
global sym(vp8_post_proc_down_and_across_mb_row_sse2) PRIVATE
sym(vp8_post_proc_down_and_across_mb_row_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rbx
    push        rsi
    push        rdi
    ; end prolog
    ALIGN_STACK 16, rax
    sub         rsp, 16

        ; put flimit on stack
        mov         rbx,        arg(5)           ;flimits ptr
        UPDATE_FLIMIT

%define flimit [rsp]

        mov         rsi,        arg(0)           ;src_ptr
        mov         rdi,        arg(1)           ;dst_ptr

        movsxd      rax,        DWORD PTR arg(2) ;src_pixels_per_line
        movsxd      rcx,        DWORD PTR arg(6) ;rows in a macroblock
.nextrow:
        xor         rdx,        rdx              ;col
.nextcol:
        ;load current and next 2 rows
        movdqu      xmm0,       XMMWORD PTR [rsi]
        movdqu      xmm1,       XMMWORD PTR [rsi + rax]
        movdqu      xmm3,       XMMWORD PTR [rsi + 2*rax]

        FIRST_2_ROWS

        ;load above 2 rows
        neg         rax
        movdqu      xmm1,       XMMWORD PTR [rsi + 2*rax]
        movdqu      xmm3,       XMMWORD PTR [rsi + rax]

        SECOND_2_ROWS

        movdqu      XMMWORD PTR [rdi], xmm0

        neg         rax                          ; positive stride
        add         rsi,        16
        add         rdi,        16

        add         rdx,        16
        cmp         edx,        dword arg(4)     ;cols
        jge         .downdone
        UPDATE_FLIMIT
        jmp         .nextcol

.downdone:
        ; done with all the cols, start the across filtering in place
        sub         rsi,        rdx              ; rewind to start of row
        sub         rdi,        rdx

        mov         rbx,        arg(5)           ; flimits
        UPDATE_FLIMIT

        ; dup the first byte into the left border 8 times
        movq        mm1,        [rdi]
        punpcklbw   mm1,        mm1
        punpcklwd   mm1,        mm1
        punpckldq   mm1,        mm1
        mov         rdx,        -8
        movq        [rdi+rdx],  mm1

        ; dup the last byte into the right border
        movsxd      rdx,        dword arg(4)
        movq        mm1,        [rdi + rdx + -1]
        punpcklbw   mm1,        mm1
        punpcklwd   mm1,        mm1
        punpckldq   mm1,        mm1
        movq        [rdi+rdx],  mm1

        ; prime the delayed-store registers with the 16 bytes left of dst
        xor         rdx,        rdx
        movq        mm0,        QWORD PTR [rdi-16];
        movq        mm1,        QWORD PTR [rdi-8];

.acrossnextcol:
        movdqu      xmm0,       XMMWORD PTR [rdi + rdx]
        movdqu      xmm1,       XMMWORD PTR [rdi + rdx -2]
        movdqu      xmm3,       XMMWORD PTR [rdi + rdx -1]

        FIRST_2_ROWS

        movdqu      xmm1,       XMMWORD PTR [rdi + rdx +1]
        movdqu      xmm3,       XMMWORD PTR [rdi + rdx +2]

        SECOND_2_ROWS

        movq        QWORD PTR [rdi+rdx-16], mm0  ; store previous 8 bytes
        movq        QWORD PTR [rdi+rdx-8], mm1   ; store previous 8 bytes
        movdq2q     mm0,        xmm0             ; keep low 8 results
        psrldq      xmm0,       8
        movdq2q     mm1,        xmm0             ; keep high 8 results

        add         rdx,        16
        cmp         edx,        dword arg(4)     ;cols
        jge         .acrossdone
        UPDATE_FLIMIT
        jmp         .acrossnextcol

.acrossdone:
        ; last 16 pixels
        movq        QWORD PTR [rdi+rdx-16], mm0

        ; the upper 8 results are only stored when rdx landed exactly
        ; on cols
        cmp         edx,        dword arg(4)
        jne         .throw_last_8
        movq        QWORD PTR [rdi+rdx-8], mm1
.throw_last_8:
        ; done with this row
        ; Strides are sign-extended here exactly as before the first row,
        ; so every row uses the same 64-bit stride value.
        add         rsi,        rax              ;next src line
        movsxd      rax,        dword arg(3)     ;dst_pixels_per_line
        add         rdi,        rax              ;next destination
        movsxd      rax,        dword arg(2)     ;src_pixels_per_line

        mov         rbx,        arg(5)           ;flimits
        UPDATE_FLIMIT

        dec         rcx                          ;decrement count
        jnz         .nextrow                     ;next row

    add         rsp, 16
    pop         rsp
    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef flimit

;-----------------------------------------------------------------------
;void vp8_mbpost_proc_down_xmm(unsigned char *dst,
;                            int pitch, int rows, int cols,int flimit)
;
; Vertical pass over columns in strips of 8 pixels.  For every pixel a
; running sum (xmm5, 8 words) and sum of squares (xmm6/xmm7, 2x4 dwords)
; over a 15-row window is maintained.  Where
;       sumsq*15 - sum*sum < flimit
; the pixel is replaced by (pixel + sum + vp8_rv[...]) >> 4, otherwise it
; is kept.  Results are buffered in d[16][8] on the stack and written
; back 8 rows late so the window always reads unfiltered rows.
;
; Side effects visible in the code: the 8 rows above dst and the 8 rows
; below the last row are overwritten with copies of the first / last
; row of each strip.
;
; NOTE(review): mm0 is used and no emms is executed here; presumably the
; caller resets the x87/MMX state -- confirm.
;-----------------------------------------------------------------------
extern sym(vp8_rv)
global sym(vp8_mbpost_proc_down_xmm) PRIVATE
sym(vp8_mbpost_proc_down_xmm):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 128+16

    ; unsigned char d[16][8] at [rsp]
    ; create flimit4 (flimit replicated into 4 dwords) at [rsp+128]
    mov         eax, dword ptr arg(4) ;flimit
    mov         [rsp+128], eax
    mov         [rsp+128+4], eax
    mov         [rsp+128+8], eax
    mov         [rsp+128+12], eax
%define flimit4 [rsp+128]

%if ABI_IS_32BIT=0
    lea         r8,         [GLOBAL(sym(vp8_rv))]
%endif

    ;rows +=8;
    add         dword arg(2), 8

    ;for(c=0; c<cols; c+=8)
.loop_col:
            mov         rsi,        arg(0)              ; s
            pxor        xmm0,       xmm0                ; zero, used for unpacking

            movsxd      rax,        dword ptr arg(1)    ; pitch

            ; this copies the last row down into the border 8 rows
            ; Only the low dword of the rows slot is maintained (see the
            ; 32-bit add above), so load it as a sign-extended dword.
            mov         rdi,        rsi
            movsxd      rdx,        dword arg(2)        ; rows + 8
            sub         rdx,        9                   ; index of last real row
            imul        rdx,        rax
            lea         rdi,        [rdi+rdx]
            movq        xmm1,       QWORD ptr[rdi]      ; last row
            mov         rcx,        8
.init_borderd:                                          ; initialize borders
            lea         rdi,        [rdi + rax]
            movq        [rdi],      xmm1

            dec         rcx
            jne         .init_borderd

            neg         rax                             ; rax = -pitch

            ; this copies the first row up into the border 8 rows
            mov         rdi,        rsi
            movq        xmm1,       QWORD ptr[rdi]      ; first row
            mov         rcx,        8
.init_border:                                           ; initialize borders
            lea         rdi,        [rdi + rax]
            movq        [rdi],      xmm1

            dec         rcx
            jne         .init_border



            lea         rsi,        [rsi + rax*8]       ; rsi = &s[-pitch*8]
            neg         rax                             ; rax = +pitch

            pxor        xmm5,       xmm5                ; sum   (8 words)
            pxor        xmm6,       xmm6                ; sumsq (low 4 dwords)

            pxor        xmm7,       xmm7                ; sumsq (high 4 dwords)
            mov         rdi,        rsi

            mov         rcx,        15                  ; window rows to pre-accumulate

.loop_initvar:
            movq        xmm1,       QWORD PTR [rdi]
            punpcklbw   xmm1,       xmm0

            paddw       xmm5,       xmm1
            pmullw      xmm1,       xmm1

            movdqa      xmm2,       xmm1
            punpcklwd   xmm1,       xmm0

            punpckhwd   xmm2,       xmm0
            paddd       xmm6,       xmm1

            paddd       xmm7,       xmm2
            lea         rdi,        [rdi+rax]

            dec         rcx
            jne         .loop_initvar
            ; here rsi = &s[-pitch*8], rdi = &s[pitch*7]
            xor         rdx,        rdx                 ; r = 0
.loop_row:
            movq        xmm1,       QWORD PTR [rsi]     ; [s-pitch*8]  (leaves window)
            movq        xmm2,       QWORD PTR [rdi]     ; [s+pitch*7]  (enters window)

            punpcklbw   xmm1,       xmm0
            punpcklbw   xmm2,       xmm0

            ; sum += entering - leaving
            paddw       xmm5,       xmm2
            psubw       xmm5,       xmm1

            ; sumsq += entering^2
            pmullw      xmm2,       xmm2
            movdqa      xmm4,       xmm2

            punpcklwd   xmm2,       xmm0
            punpckhwd   xmm4,       xmm0

            paddd       xmm6,       xmm2
            paddd       xmm7,       xmm4

            ; sumsq -= leaving^2
            pmullw      xmm1,       xmm1
            movdqa      xmm2,       xmm1

            punpcklwd   xmm1,       xmm0
            psubd       xmm6,       xmm1

            punpckhwd   xmm2,       xmm0
            psubd       xmm7,       xmm2


            ; xmm3 = sumsq_lo * 15
            movdqa      xmm3,       xmm6
            pslld       xmm3,       4

            psubd       xmm3,       xmm6
            movdqa      xmm1,       xmm5

            ; xmm1:xmm2 = sum * sum as dwords (pmullw / pmulhw interleaved)
            movdqa      xmm4,       xmm5
            pmullw      xmm1,       xmm1

            pmulhw      xmm4,       xmm4
            movdqa      xmm2,       xmm1

            punpcklwd   xmm1,       xmm4
            punpckhwd   xmm2,       xmm4

            ; xmm4 = sumsq_hi * 15
            movdqa      xmm4,       xmm7
            pslld       xmm4,       4

            psubd       xmm4,       xmm7

            psubd       xmm3,       xmm1
            psubd       xmm4,       xmm2

            psubd       xmm3,       flimit4
            psubd       xmm4,       flimit4

            ; sign bit -> all-ones where sumsq*15 - sum*sum < flimit
            psrad       xmm3,       31
            psrad       xmm4,       31

            packssdw    xmm3,       xmm4
            packsswb    xmm3,       xmm0                ; xmm3 = 8-byte select mask

            movq        xmm1,       QWORD PTR [rsi+rax*8] ; current row s[r*pitch]

            movq        xmm2,       xmm1                ; keep unfiltered copy
            punpcklbw   xmm1,       xmm0

            paddw       xmm1,       xmm5
            mov         rcx,        rdx

            and         rcx,        127
%if ABI_IS_32BIT=1 && CONFIG_PIC=1
            push        rax
            lea         rax,        [GLOBAL(sym(vp8_rv))]
            movdqu      xmm4,       [rax + rcx*2] ;vp8_rv[rcx*2]
            pop         rax
%elif ABI_IS_32BIT=0
            movdqu      xmm4,       [r8 + rcx*2] ;vp8_rv[rcx*2]
%else
            movdqu      xmm4,       [sym(vp8_rv) + rcx*2]
%endif

            paddw       xmm1,       xmm4
            ;paddw     xmm1,       eight8s
            psraw       xmm1,       4

            packuswb    xmm1,       xmm0
            pand        xmm1,       xmm3                ; filtered where mask set

            pandn       xmm3,       xmm2                ; original elsewhere
            por         xmm1,       xmm3

            and         rcx,        15
            movq        QWORD PTR   [rsp + rcx*8], xmm1 ;d[rcx*8]

            ; write back the row computed 8 iterations ago
            cmp         edx,        8
            jl          .skip_assignment

            mov         rcx,        rdx
            sub         rcx,        8
            and         rcx,        15
            movq        mm0,        [rsp + rcx*8] ;d[rcx*8]
            movq        [rsi],      mm0

.skip_assignment:
            lea         rsi,        [rsi+rax]

            lea         rdi,        [rdi+rax]
            add         rdx,        1

            cmp         edx,        dword arg(2) ;rows
            jl          .loop_row

        ; s += 8, done at full pointer width: a dword add on the saved
        ; pointer would drop the carry into the upper half in 64-bit
        ; builds.  rax is free here (it is reloaded at .loop_col).
        mov         rax,        arg(0)
        add         rax,        8
        mov         arg(0),     rax
        sub         dword arg(3), 8 ; cols -= 8
        cmp         dword arg(3), 0
        jg          .loop_col

    add         rsp, 128+16
    pop         rsp

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef flimit4


;-----------------------------------------------------------------------
;void vp8_mbpost_proc_across_ip_xmm(unsigned char *src,
;                                int pitch, int rows, int cols,int flimit)
;
; Horizontal, in-place counterpart of the filter above.  For each row a
; running sum / sum of squares over a 15-pixel window is kept; four
; pixels are processed per iteration.  Where
;       sumsq*15 - sum*sum < flimit
; the pixel is replaced by (pixel + sum + 8) >> 4.  Stores are delayed
; by two iterations (mm0/mm1) so the window reads unfiltered pixels.
;
; Side effects visible in the code: 8 bytes at src-8 and 8 bytes at
; src+cols are overwritten on every row (border replication / delayed
; store).
;
; NOTE(review): MMX registers are used and no emms is executed here;
; presumably the caller resets the x87/MMX state -- confirm.
;-----------------------------------------------------------------------
global sym(vp8_mbpost_proc_across_ip_xmm) PRIVATE
sym(vp8_mbpost_proc_across_ip_xmm):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16

    ; create flimit4 at [rsp]
    mov         eax, dword ptr arg(4) ;flimit
    mov         [rsp], eax
    mov         [rsp+4], eax
    mov         [rsp+8], eax
    mov         [rsp+12], eax
%define flimit4 [rsp]


    ;for(r=0;r<rows;r++)
.ip_row_loop:

        mov         rsi,    arg(0); s


        ; dup the first byte into the left border 8 times
        movq        mm1,   [rsi]
        punpcklbw   mm1,   mm1
        punpcklwd   mm1,   mm1
        punpckldq   mm1,   mm1

        mov         rdi,    -8
        movq        [rsi+rdi], mm1

        ; dup the last byte into the right border
        movsxd      rdx,    dword arg(3)
        movq        mm1,   [rsi + rdx + -1]
        punpcklbw   mm1,   mm1
        punpcklwd   mm1,   mm1
        punpckldq   mm1,   mm1
        movq        [rsi+rdx], mm1

        ; Clear the accumulators only now: rdx was just used to hold cols,
        ; so zeroing it earlier would start sumsq at cols instead of 0.
        xor         rdx,    rdx ;sumsq=0;
        xor         rcx,    rcx ;sum=0;

.ip_var_loop:
        ;for(i=-8;i<=6;i++)
        ;{
        ;   sumsq += s[i]*s[i];
        ;   sum   += s[i];
        ;}
        movzx       eax, byte [rsi+rdi]
        add         ecx, eax
        mul         al                  ; ax = al*al (8-bit mul leaves edx alone)
        add         edx, eax
        add         rdi, 1
        cmp         rdi, 6
        jle         .ip_var_loop


            ;mov         rax,    sumsq
            ;movd        xmm7,   rax
            movd        xmm7,   edx

            ;mov         rax,    sum
            ;movd        xmm6,   rax
            movd        xmm6,   ecx

            mov         rsi,    arg(0) ;s
            xor         rcx,    rcx     ; c = 0

            movsxd      rdx,    dword arg(3) ;cols
            add         rdx,    8       ; run 8 past cols to flush delayed stores
            pxor        mm0,    mm0
            pxor        mm1,    mm1

            pxor        xmm0,   xmm0
.nextcol4:

            movd        xmm1,   DWORD PTR [rsi+rcx-8]   ; -8 -7 -6 -5  (leaving)
            movd        xmm2,   DWORD PTR [rsi+rcx+7]   ; +7 +8 +9 +10 (entering)

            punpcklbw   xmm1,   xmm0                    ; expanding
            punpcklbw   xmm2,   xmm0                    ; expanding

            punpcklwd   xmm1,   xmm0                    ; expanding to dwords
            punpcklwd   xmm2,   xmm0                    ; expanding to dwords

            psubd       xmm2,   xmm1                    ; d[i]  = entering - leaving
            paddd       xmm1,   xmm1                    ; 2*leaving

            paddd       xmm1,   xmm2                    ; entering + leaving
            pmaddwd     xmm1,   xmm2                    ; dq[i] = entering^2 - leaving^2

            ; lane 0 now holds the totals for pixel c
            paddd       xmm6,   xmm2
            paddd       xmm7,   xmm1

            pshufd      xmm6,   xmm6,   0               ; broadcast lane 0
            pshufd      xmm7,   xmm7,   0               ; broadcast lane 0

            psrldq      xmm1,       4                   ; dq[1] dq[2] dq[3] 0
            psrldq      xmm2,       4                   ; d[1]  d[2]  d[3]  0

            ; build per-lane prefix sums: lanes 1..3 += step 1
            pshufd      xmm3,   xmm1,   3
            pshufd      xmm4,   xmm2,   3

            paddd       xmm6,   xmm4
            paddd       xmm7,   xmm3

            ; lanes 2..3 += step 2
            pshufd      xmm3,   xmm1,   01011111b
            pshufd      xmm4,   xmm2,   01011111b

            paddd       xmm7,   xmm3
            paddd       xmm6,   xmm4

            ; lane 3 += step 3
            pshufd      xmm3,   xmm1,   10111111b
            pshufd      xmm4,   xmm2,   10111111b

            paddd       xmm7,   xmm3
            paddd       xmm6,   xmm4

            movdqa      xmm3,   xmm6
            pmaddwd     xmm3,   xmm3                    ; sum*sum

            movdqa      xmm5,   xmm7
            pslld       xmm5,   4

            psubd       xmm5,   xmm7                    ; sumsq*15
            psubd       xmm5,   xmm3

            psubd       xmm5,   flimit4
            psrad       xmm5,   31                      ; all-ones where below flimit

            packssdw    xmm5,   xmm0
            packsswb    xmm5,   xmm0

            movd        xmm1,   DWORD PTR [rsi+rcx]
            movq        xmm2,   xmm1                    ; keep unfiltered copy

            punpcklbw   xmm1,   xmm0
            punpcklwd   xmm1,   xmm0

            paddd       xmm1,   xmm6
            paddd       xmm1,   [GLOBAL(four8s)]

            psrad       xmm1,   4
            packssdw    xmm1,   xmm0

            packuswb    xmm1,   xmm0
            pand        xmm1,   xmm5                    ; filtered where mask set

            pandn       xmm5,   xmm2                    ; original elsewhere
            por         xmm5,   xmm1

            ; delayed store: write the result from two iterations ago
            movd        [rsi+rcx-8],  mm0
            movq        mm0,    mm1

            movdq2q     mm1,    xmm5
            psrldq      xmm7,   12                      ; carry lane 3 totals to lane 0

            psrldq      xmm6,   12
            add         rcx,    4

            cmp         rcx,    rdx
            jl          .nextcol4

        ;s+=pitch;
        movsxd      rax,    dword arg(1)
        add         arg(0), rax

        sub         dword arg(2), 1 ;rows-=1
        cmp         dword arg(2), 0
        jg          .ip_row_loop

    add         rsp, 16
    pop         rsp

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef flimit4


;-----------------------------------------------------------------------
;void vp8_plane_add_noise_wmt (unsigned char *Start, unsigned char *noise,
;                            unsigned char blackclamp[16],
;                            unsigned char whiteclamp[16],
;                            unsigned char bothclamp[16],
;                            unsigned int Width, unsigned int Height, int Pitch)
;
; For each of Height rows: picks a noise offset (low 8 bits of the value
; returned by LIBVPX_RAND), clamps the source pixels and adds the noise
; bytes, 16 pixels at a time.
;
; Only the blackclamp pointer is used; whiteclamp and bothclamp are read
; at blackclamp+16 and blackclamp+32.  The three clamp vectors are used
; as SSE memory operands, which requires them to be 16-byte aligned.
; The loop advances in steps of 16, so up to 15 bytes past Width may be
; read and written on each row.
;-----------------------------------------------------------------------
global sym(vp8_plane_add_noise_wmt) PRIVATE
sym(vp8_plane_add_noise_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

.addnoise_loop:
    call sym(LIBVPX_RAND) WRT_PLT
    mov     rcx, arg(1) ;noise
    and     rax, 0xff
    add     rcx, rax

    ; we rely on the fact that the clamping vectors are stored contiguously
    ; in black/white/both order. Note that we have to reload this here because
    ; rdx could be trashed by rand()
    mov     rdx, arg(2) ; blackclamp


            mov     rdi, rcx                    ; rdi = noise + random offset
            movsxd  rcx, dword arg(5) ;[Width]
            mov     rsi, arg(0) ;Pos
            xor         rax,rax

.addnoise_nextset:
            movdqu      xmm1,[rsi+rax]         ; get the source

            psubusb     xmm1, [rdx]    ;blackclamp ; clamp both sides so we don't outrange adding noise
            paddusb     xmm1, [rdx+32] ;bothclamp
            psubusb     xmm1, [rdx+16] ;whiteclamp

            movdqu      xmm2,[rdi+rax]         ; get the noise for this line
            paddb       xmm1,xmm2              ; add it in
            movdqu      [rsi+rax],xmm1         ; store the result

            add         rax,16                 ; move to the next 16 pixels

            cmp         rax, rcx
            jl          .addnoise_nextset

    movsxd  rax, dword arg(7) ; Pitch
    add     arg(0), rax ; Start += Pitch
    sub     dword arg(6), 1   ; Height -= 1
    jg      .addnoise_loop

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


SECTION_RODATA
align 16
; rounding constant (8 in each dword lane) used by
; vp8_mbpost_proc_across_ip_xmm
four8s:
    times 4 dd 8