;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

; Syntax: NASM/yasm (Intel operand order).
; sym(), arg(), GLOBAL(), SHADOW_ARGS_TO_STACK, SAVE_XMM/RESTORE_XMM,
; GET_GOT/RESTORE_GOT, ALIGN_STACK, UNSHADOW_ARGS, SECTION_RODATA and
; ABI_IS_32BIT are provided by the include below (not visible in this file).

%include "vpx_ports/x86_abi_support.asm"

%define BLOCK_HEIGHT_WIDTH 4
%define VP8_FILTER_WEIGHT 128
%define VP8_FILTER_SHIFT  7


;/************************************************************************************
; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
; input pixel array has output_height rows. Every loop in this file is a
; "dec rcx / jnz" loop that produces ONE row per iteration, so output_height must be
; at least 1. The 8-pixel variants use the 128 bit operations to compute 8 outputs
; per row.
;
; Filter tables (see the end of the file): vp8_filter_index selects a 16-byte entry
; in k0_k5; the matching (k1,k3) and (k2,k4) entries sit 128 and 256 bytes further on.
; When the first dword of the k0_k5 entry is zero the outer taps are skipped and a
; 4-tap path is used instead.
;
; NOTE(review): entry 0 of k2_k4 holds 128, which pmaddubsw interprets as the signed
; byte -128, so filter index 0 does not produce a pass-through copy. Presumably
; callers never pass index 0 - confirm against the C callers.
;
; This is an implementation of some of the SSE optimizations first seen in ffvp8
;
;*************************************************************************************/
;void vp8_filter_block1d8_h6_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    unsigned int    vp8_filter_index
;)
; Horizontal pass, 8 output pixels per row. Reads src_ptr[-2 .. 10] on each row.
; Register roles in the loops: rsi = source row, rdi = output row, rax = source
; stride, rdx = output pitch, rcx = rows remaining, xmm7 = rounding constant.
global sym(vp8_filter_block1d8_h6_ssse3) PRIVATE
sym(vp8_filter_block1d8_h6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)   ;table index
    xor         rsi, rsi                ; esi = 0, compared against the outer taps below
    shl         rdx, 4                  ; 16 bytes per table entry

    movdqa      xmm7, [GLOBAL(rd)]

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx
    mov         rdi, arg(2)             ;output_ptr

    cmp         esi, DWORD PTR [rax]    ; outer taps zero -> 4-tap path
    je          vp8_filter_block1d8_h4_ssse3

    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3

    mov         rsi, arg(0)             ;src_ptr
    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)   ;output_height

    movsxd      rdx, dword ptr arg(3)   ;output_pitch

    sub         rdi, rdx                ; rdi is advanced before the store in the loop
;xmm3 free
.filter_block1d8_h6_rowloop_ssse3:
    movq        xmm0, MMWORD PTR [rsi - 2]      ; -2 -1  0  1  2  3  4  5
    movq        xmm2, MMWORD PTR [rsi + 3]      ;  3  4  5  6  7  8  9 10

    punpcklbw   xmm0, xmm2                      ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10

    movdqa      xmm1, xmm0
    pmaddubsw   xmm0, xmm4                      ; k0*p[i-2] + k5*p[i+3]

    movdqa      xmm2, xmm1
    pshufb      xmm1, [GLOBAL(shuf2bfrom1)]     ; pairs (p[i], p[i+2])

    pshufb      xmm2, [GLOBAL(shuf3bfrom1)]     ; pairs (p[i-1], p[i+1])
    pmaddubsw   xmm1, xmm5                      ; k2*p[i] + k4*p[i+2]

    lea         rdi, [rdi + rdx]
    pmaddubsw   xmm2, xmm6                      ; k1*p[i-1] + k3*p[i+1]

    lea         rsi, [rsi + rax]
    dec         rcx                             ; sets ZF for the jnz below; nothing
                                                ; in between modifies the flags
    paddsw      xmm0, xmm1
    paddsw      xmm2, xmm7                      ; + rounding

    paddsw      xmm0, xmm2

    psraw       xmm0, VP8_FILTER_SHIFT

    packuswb    xmm0, xmm0

    movq        MMWORD Ptr [rdi], xmm0
    jnz         .filter_block1d8_h6_rowloop_ssse3

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; 4-tap tail of vp8_filter_block1d8_h6_ssse3 (entered by the je above with
; rax = &k0_k5[index], rdi = output_ptr, xmm7 = rounding constant).
; This is a non-local label, so the local labels below are scoped to it.
vp8_filter_block1d8_h4_ssse3:
    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3

    movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf2bfrom1)]
    movdqa      xmm4, XMMWORD PTR [GLOBAL(shuf3bfrom1)]

    mov         rsi, arg(0)             ;src_ptr

    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)   ;output_height

    movsxd      rdx, dword ptr arg(3)   ;output_pitch

    sub         rdi, rdx

.filter_block1d8_h4_rowloop_ssse3:
    movq        xmm0, MMWORD PTR [rsi - 2]      ; -2 -1  0  1  2  3  4  5
    movq        xmm1, MMWORD PTR [rsi + 3]      ;  3  4  5  6  7  8  9 10

    punpcklbw   xmm0, xmm1                      ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10

    movdqa      xmm2, xmm0
    pshufb      xmm0, xmm3

    pshufb      xmm2, xmm4
    pmaddubsw   xmm0, xmm5

    lea         rdi, [rdi + rdx]
    pmaddubsw   xmm2, xmm6

    lea         rsi, [rsi + rax]
    dec         rcx                             ; flags consumed by the jnz below

    paddsw      xmm0, xmm7

    paddsw      xmm0, xmm2

    psraw       xmm0, VP8_FILTER_SHIFT

    packuswb    xmm0, xmm0

    movq        MMWORD Ptr [rdi], xmm0

    jnz         .filter_block1d8_h4_rowloop_ssse3

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_filter_block1d16_h6_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    unsigned int    vp8_filter_index
;)
; Horizontal pass, 16 output pixels per row, always using all six taps (there is
; no 4-tap shortcut here). Reads src_ptr[-2 .. 18] on each row.
; Each row is stored with movdqa, so output_ptr and output_pitch must keep every
; output row 16-byte aligned.
global sym(vp8_filter_block1d16_h6_ssse3) PRIVATE
sym(vp8_filter_block1d16_h6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)   ;table index
    shl         rdx, 4                  ; 16 bytes per table entry

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx

    mov         rdi, arg(2)             ;output_ptr

    mov         rsi, arg(0)             ;src_ptr

    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3

    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)   ;output_height
    movsxd      rdx, dword ptr arg(3)   ;output_pitch

.filter_block1d16_h6_rowloop_ssse3:
    ; left 8 outputs
    movq        xmm0, MMWORD PTR [rsi - 2]      ; -2 -1  0  1  2  3  4  5
    movq        xmm3, MMWORD PTR [rsi + 3]      ;  3  4  5  6  7  8  9 10

    punpcklbw   xmm0, xmm3                      ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10

    movdqa      xmm1, xmm0
    pmaddubsw   xmm0, xmm4

    movdqa      xmm2, xmm1
    pshufb      xmm1, [GLOBAL(shuf2bfrom1)]

    pshufb      xmm2, [GLOBAL(shuf3bfrom1)]
    movq        xmm3, MMWORD PTR [rsi + 6]      ; right 8 outputs: pixels 6..13

    pmaddubsw   xmm1, xmm5
    movq        xmm7, MMWORD PTR [rsi + 11]     ; pixels 11..18

    pmaddubsw   xmm2, xmm6
    punpcklbw   xmm3, xmm7

    paddsw      xmm0, xmm1
    movdqa      xmm1, xmm3

    pmaddubsw   xmm3, xmm4
    paddsw      xmm0, xmm2

    movdqa      xmm2, xmm1
    paddsw      xmm0, [GLOBAL(rd)]

    pshufb      xmm1, [GLOBAL(shuf2bfrom1)]
    pshufb      xmm2, [GLOBAL(shuf3bfrom1)]

    psraw       xmm0, VP8_FILTER_SHIFT
    pmaddubsw   xmm1, xmm5

    pmaddubsw   xmm2, xmm6
    packuswb    xmm0, xmm0

    lea         rsi, [rsi + rax]
    paddsw      xmm3, xmm1

    paddsw      xmm3, xmm2

    paddsw      xmm3, [GLOBAL(rd)]

    psraw       xmm3, VP8_FILTER_SHIFT

    packuswb    xmm3, xmm3

    punpcklqdq  xmm0, xmm3                      ; low qword = left 8, high qword = right 8

    movdqa      XMMWORD Ptr [rdi], xmm0

    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .filter_block1d16_h6_rowloop_ssse3

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_filter_block1d4_h6_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    unsigned int    vp8_filter_index
;)
; Horizontal pass, 4 output pixels per row. Each row is fetched with one unaligned
; 16-byte load from src_ptr - 2, so src_ptr[-2 .. 13] must be readable even though
; only the low bytes contribute to the 4 stored pixels.
global sym(vp8_filter_block1d4_h6_ssse3) PRIVATE
sym(vp8_filter_block1d4_h6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)   ;table index
    xor         rsi, rsi                ; esi = 0, compared against the outer taps below
    shl         rdx, 4                  ; 16 bytes per table entry

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx
    movdqa      xmm7, [GLOBAL(rd)]

    cmp         esi, DWORD PTR [rax]    ; outer taps zero -> 4-tap path
    je          .vp8_filter_block1d4_h4_ssse3

    movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3

    mov         rsi, arg(0)             ;src_ptr
    mov         rdi, arg(2)             ;output_ptr
    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)   ;output_height

    movsxd      rdx, dword ptr arg(3)   ;output_pitch

;xmm3 free
.filter_block1d4_h6_rowloop_ssse3:
    movdqu      xmm0, XMMWORD PTR [rsi - 2]

    movdqa      xmm1, xmm0
    pshufb      xmm0, [GLOBAL(shuf1b)]          ; pairs (p[i-2], p[i+3])

    movdqa      xmm2, xmm1
    pshufb      xmm1, [GLOBAL(shuf2b)]          ; pairs (p[i], p[i+2])
    pmaddubsw   xmm0, xmm4
    pshufb      xmm2, [GLOBAL(shuf3b)]          ; pairs (p[i-1], p[i+1])
    pmaddubsw   xmm1, xmm5

;--
    pmaddubsw   xmm2, xmm6

    lea         rsi, [rsi + rax]
;--
    paddsw      xmm0, xmm1
    paddsw      xmm0, xmm7
    paddsw      xmm0, xmm2
    psraw       xmm0, VP8_FILTER_SHIFT
    packuswb    xmm0, xmm0

    movd        DWORD PTR [rdi], xmm0

    add         rdi, rdx
    dec         rcx
    jnz         .filter_block1d4_h6_rowloop_ssse3

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; 4-tap path: only (k1,k3) and (k2,k4) are applied; shuffle masks are kept in
; xmm0 / xmm3 instead of being re-read from memory each row.
.vp8_filter_block1d4_h4_ssse3:
    movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
    movdqa      xmm0, XMMWORD PTR [GLOBAL(shuf2b)]
    movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf3b)]

    mov         rsi, arg(0)             ;src_ptr
    mov         rdi, arg(2)             ;output_ptr
    movsxd      rax, dword ptr arg(1)   ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)   ;output_height

    movsxd      rdx, dword ptr arg(3)   ;output_pitch

.filter_block1d4_h4_rowloop_ssse3:
    movdqu      xmm1, XMMWORD PTR [rsi - 2]

    movdqa      xmm2, xmm1
    pshufb      xmm1, xmm0 ;;[GLOBAL(shuf2b)]
    pshufb      xmm2, xmm3 ;;[GLOBAL(shuf3b)]
    pmaddubsw   xmm1, xmm5

;--
    pmaddubsw   xmm2, xmm6

    lea         rsi, [rsi + rax]
;--
    paddsw      xmm1, xmm7
    paddsw      xmm1, xmm2
    psraw       xmm1, VP8_FILTER_SHIFT
    packuswb    xmm1, xmm1

    movd        DWORD PTR [rdi], xmm1

    add         rdi, rdx
    dec         rcx
    jnz         .filter_block1d4_h4_rowloop_ssse3

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret



;void vp8_filter_block1d16_v6_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    unsigned int   vp8_filter_index
;)
; Vertical pass, 16 pixels wide. For each output row the six source rows
; A..F = src_ptr + {0..5} * src_pitch are combined with taps k0..k5; the 4-tap
; path uses only rows B..E. rsi = row A, rax = rsi + src_pitch (so that
; [rax + rdx*2] is row D and [rax + rdx*4] is row F), rdx = src_pitch.
; On 64-bit builds out_pitch is kept in r8; on 32-bit it is re-read from arg(3).
; The 4-tap path stores with movdqa and therefore needs 16-byte aligned output
; rows; the 6-tap path stores two 8-byte halves.
global sym(vp8_filter_block1d16_v6_ssse3) PRIVATE
sym(vp8_filter_block1d16_v6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)   ;table index
    xor         rsi, rsi                ; esi = 0, compared against the outer taps below
    shl         rdx, 4                  ; 16 bytes per table entry

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx

    cmp         esi, DWORD PTR [rax]    ; outer taps zero -> 4-tap path
    je          .vp8_filter_block1d16_v4_ssse3

    movdqa      xmm5, XMMWORD PTR [rax]         ;k0_k5
    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3

    mov         rsi, arg(0)             ;src_ptr
    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
    mov         rdi, arg(2)             ;output_ptr

%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)    ;out_pitch
%endif
    mov         rax, rsi
    movsxd      rcx, DWORD PTR arg(4)   ;output_height
    add         rax, rdx


.vp8_filter_block1d16_v6_ssse3_loop:
    ; left 8 columns
    movq        xmm1, MMWORD PTR [rsi]                  ;A
    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E

    punpcklbw   xmm2, xmm4                  ;B D
    punpcklbw   xmm3, xmm0                  ;C E

    movq        xmm0, MMWORD PTR [rax + rdx * 4]        ;F

    pmaddubsw   xmm3, xmm6
    punpcklbw   xmm1, xmm0                  ;A F
    pmaddubsw   xmm2, xmm7
    pmaddubsw   xmm1, xmm5

    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm1
    paddsw      xmm2, [GLOBAL(rd)]
    psraw       xmm2, VP8_FILTER_SHIFT
    packuswb    xmm2, xmm2

    movq        MMWORD PTR [rdi], xmm2      ;store the results

    ; right 8 columns
    movq        xmm1, MMWORD PTR [rsi + 8]              ;A
    movq        xmm2, MMWORD PTR [rsi + rdx + 8]        ;B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2 + 8]    ;C
    movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]    ;D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]    ;E

    punpcklbw   xmm2, xmm4                  ;B D
    punpcklbw   xmm3, xmm0                  ;C E

    movq        xmm0, MMWORD PTR [rax + rdx * 4 + 8]    ;F
    pmaddubsw   xmm3, xmm6
    punpcklbw   xmm1, xmm0                  ;A F
    pmaddubsw   xmm2, xmm7
    pmaddubsw   xmm1, xmm5

    add         rsi, rdx
    add         rax, rdx
;--
;--
    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm1
    paddsw      xmm2, [GLOBAL(rd)]
    psraw       xmm2, VP8_FILTER_SHIFT
    packuswb    xmm2, xmm2

    movq        MMWORD PTR [rdi+8], xmm2

%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)   ;out_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .vp8_filter_block1d16_v6_ssse3_loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

.vp8_filter_block1d16_v4_ssse3:
    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3

    mov         rsi, arg(0)             ;src_ptr
    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
    mov         rdi, arg(2)             ;output_ptr

%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)    ;out_pitch
%endif
    mov         rax, rsi
    movsxd      rcx, DWORD PTR arg(4)   ;output_height
    add         rax, rdx

.vp8_filter_block1d16_v4_ssse3_loop:
    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E

    punpcklbw   xmm2, xmm4                  ;B D
    punpcklbw   xmm3, xmm0                  ;C E

    pmaddubsw   xmm3, xmm6
    pmaddubsw   xmm2, xmm7
    movq        xmm5, MMWORD PTR [rsi + rdx + 8]        ;B
    movq        xmm1, MMWORD PTR [rsi + rdx * 2 + 8]    ;C
    movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8]    ;D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8]    ;E

    paddsw      xmm2, [GLOBAL(rd)]
    paddsw      xmm2, xmm3
    psraw       xmm2, VP8_FILTER_SHIFT
    packuswb    xmm2, xmm2

    punpcklbw   xmm5, xmm4                  ;B D
    punpcklbw   xmm1, xmm0                  ;C E

    pmaddubsw   xmm1, xmm6
    pmaddubsw   xmm5, xmm7

    movdqa      xmm4, [GLOBAL(rd)]
    add         rsi, rdx
    add         rax, rdx
;--
;--
    paddsw      xmm5, xmm1
    paddsw      xmm5, xmm4
    psraw       xmm5, VP8_FILTER_SHIFT
    packuswb    xmm5, xmm5

    punpcklqdq  xmm2, xmm5                  ; low qword = left 8, high qword = right 8

    movdqa      XMMWORD PTR [rdi], xmm2

%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)   ;out_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .vp8_filter_block1d16_v4_ssse3_loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_filter_block1d8_v6_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    unsigned int   vp8_filter_index
;)
; Vertical pass, 8 pixels wide. Same row naming as the 16-wide version:
; A..F = src_ptr + {0..5} * src_pitch, the 4-tap path uses B..E only.
global sym(vp8_filter_block1d8_v6_ssse3) PRIVATE
sym(vp8_filter_block1d8_v6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)   ;table index
    xor         rsi, rsi                ; esi = 0, compared against the outer taps below
    shl         rdx, 4                  ; 16 bytes per table entry

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx

    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
    mov         rdi, arg(2)             ;output_ptr
%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)    ; out_pitch
%endif
    movsxd      rcx, DWORD PTR arg(4)   ;[output_height]

    cmp         esi, DWORD PTR [rax]    ; outer taps zero -> 4-tap path
    je          .vp8_filter_block1d8_v4_ssse3

    movdqa      xmm5, XMMWORD PTR [rax]         ;k0_k5
    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3

    mov         rsi, arg(0)             ;src_ptr

    mov         rax, rsi
    add         rax, rdx

.vp8_filter_block1d8_v6_ssse3_loop:
    movq        xmm1, MMWORD PTR [rsi]                  ;A
    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E

    punpcklbw   xmm2, xmm4                  ;B D
    punpcklbw   xmm3, xmm0                  ;C E

    movq        xmm0, MMWORD PTR [rax + rdx * 4]        ;F
    movdqa      xmm4, [GLOBAL(rd)]          ; xmm4 is free again after the B/D merge

    pmaddubsw   xmm3, xmm6
    punpcklbw   xmm1, xmm0                  ;A F
    pmaddubsw   xmm2, xmm7
    pmaddubsw   xmm1, xmm5
    add         rsi, rdx
    add         rax, rdx
;--
;--
    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm1
    paddsw      xmm2, xmm4
    psraw       xmm2, VP8_FILTER_SHIFT
    packuswb    xmm2, xmm2

    movq        MMWORD PTR [rdi], xmm2

%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)   ;[out_pitch]
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .vp8_filter_block1d8_v6_ssse3_loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

.vp8_filter_block1d8_v4_ssse3:
    movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
    movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
    movdqa      xmm5, [GLOBAL(rd)]

    mov         rsi, arg(0)             ;src_ptr

    mov         rax, rsi
    add         rax, rdx

.vp8_filter_block1d8_v4_ssse3_loop:
    movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
    movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4]        ;E

    punpcklbw   xmm2, xmm4                  ;B D
    punpcklbw   xmm3, xmm0                  ;C E

    pmaddubsw   xmm3, xmm6
    pmaddubsw   xmm2, xmm7
    add         rsi, rdx
    add         rax, rdx
;--
;--
    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm5
    psraw       xmm2, VP8_FILTER_SHIFT
    packuswb    xmm2, xmm2

    movq        MMWORD PTR [rdi], xmm2

%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)   ;[out_pitch]
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .vp8_filter_block1d8_v4_ssse3_loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_filter_block1d4_v6_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    unsigned int   vp8_filter_index
;)
; Vertical pass, 4 pixels wide, implemented on the 64-bit MMX registers
; (mm0-mm7), which is why this routine has no SAVE_XMM / RESTORE_XMM.
; NOTE(review): the routine returns without executing emms, so the x87/MMX
; state is left in MMX mode. Presumably the caller clears it before any
; floating-point code runs - confirm.
global sym(vp8_filter_block1d4_v6_ssse3) PRIVATE
sym(vp8_filter_block1d4_v6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)   ;table index
    xor         rsi, rsi                ; esi = 0, compared against the outer taps below
    shl         rdx, 4                  ; 16 bytes per table entry

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx

    movsxd      rdx, DWORD PTR arg(1)   ;pixels_per_line
    mov         rdi, arg(2)             ;output_ptr
%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)    ; out_pitch
%endif
    movsxd      rcx, DWORD PTR arg(4)   ;[output_height]

    cmp         esi, DWORD PTR [rax]    ; outer taps zero -> 4-tap path
    je          .vp8_filter_block1d4_v4_ssse3

    movq        mm5, MMWORD PTR [rax]           ;k0_k5
    movq        mm6, MMWORD PTR [rax+256]       ;k2_k4
    movq        mm7, MMWORD PTR [rax+128]       ;k1_k3

    mov         rsi, arg(0)             ;src_ptr

    mov         rax, rsi
    add         rax, rdx

.vp8_filter_block1d4_v6_ssse3_loop:
    movd        mm1, DWORD PTR [rsi]                    ;A
    movd        mm2, DWORD PTR [rsi + rdx]              ;B
    movd        mm3, DWORD PTR [rsi + rdx * 2]          ;C
    movd        mm4, DWORD PTR [rax + rdx * 2]          ;D
    movd        mm0, DWORD PTR [rsi + rdx * 4]          ;E

    punpcklbw   mm2, mm4                    ;B D
    punpcklbw   mm3, mm0                    ;C E

    movd        mm0, DWORD PTR [rax + rdx * 4]          ;F

    movq        mm4, [GLOBAL(rd)]           ; all eight mm registers are in use, so the
                                            ; rounding constant is reloaded every row

    pmaddubsw   mm3, mm6
    punpcklbw   mm1, mm0                    ;A F
    pmaddubsw   mm2, mm7
    pmaddubsw   mm1, mm5
    add         rsi, rdx
    add         rax, rdx
;--
;--
    paddsw      mm2, mm3
    paddsw      mm2, mm1
    paddsw      mm2, mm4
    psraw       mm2, VP8_FILTER_SHIFT
    packuswb    mm2, mm2

    movd        DWORD PTR [rdi], mm2

%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)   ;[out_pitch]
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .vp8_filter_block1d4_v6_ssse3_loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

.vp8_filter_block1d4_v4_ssse3:
    movq        mm6, MMWORD PTR [rax+256]       ;k2_k4
    movq        mm7, MMWORD PTR [rax+128]       ;k1_k3
    movq        mm5, MMWORD PTR [GLOBAL(rd)]

    mov         rsi, arg(0)             ;src_ptr

    mov         rax, rsi
    add         rax, rdx

.vp8_filter_block1d4_v4_ssse3_loop:
    movd        mm2, DWORD PTR [rsi + rdx]              ;B
    movd        mm3, DWORD PTR [rsi + rdx * 2]          ;C
    movd        mm4, DWORD PTR [rax + rdx * 2]          ;D
    movd        mm0, DWORD PTR [rsi + rdx * 4]          ;E

    punpcklbw   mm2, mm4                    ;B D
    punpcklbw   mm3, mm0                    ;C E

    pmaddubsw   mm3, mm6
    pmaddubsw   mm2, mm7
    add         rsi, rdx
    add         rax, rdx
;--
;--
    paddsw      mm2, mm3
    paddsw      mm2, mm5
    psraw       mm2, VP8_FILTER_SHIFT
    packuswb    mm2, mm2

    movd        DWORD PTR [rdi], mm2

%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)   ;[out_pitch]
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .vp8_filter_block1d4_v4_ssse3_loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_bilinear_predict16x16_ssse3
;(
;    unsigned char  *src_ptr,
;    int   src_pixels_per_line,
;    int  xoffset,
;    int  yoffset,
;    unsigned char *dst_ptr,
;    int dst_pitch
;)
; 16x16 bilinear prediction. xoffset / yoffset index 16-byte entries of
; vp8_bilinear_filters_ssse3. Three paths:
;   xoffset != 0, yoffset != 0 : horizontal pass then vertical pass (17 source rows)
;   xoffset == 0               : vertical pass only  (.b16x16_sp_only)
;   yoffset == 0               : horizontal pass only (.b16x16_fp_only)
; The loops end when rdi reaches rcx = dst_ptr + 16 * dst_pitch.
; Every output row is stored with movdqa, so dst rows must be 16-byte aligned.
; NOTE(review): with xoffset == 0 and yoffset == 0 the vertical-only path would
; use table entry 0 (128, 0); pmaddubsw reads 128 as the signed byte -128, so
; that combination does not produce a copy. Presumably callers never pass it -
; confirm.
global sym(vp8_bilinear_predict16x16_ssse3) PRIVATE
sym(vp8_bilinear_predict16x16_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    lea         rcx, [GLOBAL(vp8_bilinear_filters_ssse3)]
    movsxd      rax, dword ptr arg(2)   ; xoffset

    test        rax, rax                ; skip first_pass filter if xoffset=0
    je          .b16x16_sp_only

    shl         rax, 4
    lea         rax, [rax + rcx]        ; HFilter

    mov         rdi, arg(4)             ; dst_ptr
    mov         rsi, arg(0)             ; src_ptr
    movsxd      rdx, dword ptr arg(5)   ; dst_pitch

    movdqa      xmm1, [rax]

    movsxd      rax, dword ptr arg(3)   ; yoffset

    test        rax, rax                ; skip second_pass filter if yoffset=0
    je          .b16x16_fp_only

    shl         rax, 4
    lea         rax, [rax + rcx]        ; VFilter

    lea         rcx, [rdi+rdx*8]
    lea         rcx, [rcx+rdx*8]        ; rcx = dst_ptr + 16 * dst_pitch (end marker)
    movsxd      rdx, dword ptr arg(1)   ; src_pixels_per_line

    movdqa      xmm2, [rax]

%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(5)    ; dst_pitch
%endif
    ; horizontally filter the first source row into xmm7
    movq        xmm3, [rsi]             ; 00 01 02 03 04 05 06 07
    movq        xmm5, [rsi+1]           ; 01 02 03 04 05 06 07 08

    punpcklbw   xmm3, xmm5              ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
    movq        xmm4, [rsi+8]           ; 08 09 10 11 12 13 14 15

    movq        xmm5, [rsi+9]           ; 09 10 11 12 13 14 15 16

    lea         rsi, [rsi + rdx]        ; next line

    pmaddubsw   xmm3, xmm1              ; 00 02 04 06 08 10 12 14

    punpcklbw   xmm4, xmm5              ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16
    pmaddubsw   xmm4, xmm1              ; 01 03 05 07 09 11 13 15

    paddw       xmm3, [GLOBAL(rd)]      ; xmm3 += round value
    psraw       xmm3, VP8_FILTER_SHIFT  ; xmm3 /= 128

    paddw       xmm4, [GLOBAL(rd)]      ; xmm4 += round value
    psraw       xmm4, VP8_FILTER_SHIFT  ; xmm4 /= 128

    movdqa      xmm7, xmm3
    packuswb    xmm7, xmm4              ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15

.next_row:
    ; xmm7 = previous horizontally filtered row, xmm6 = current one
    movq        xmm6, [rsi]             ; 00 01 02 03 04 05 06 07
    movq        xmm5, [rsi+1]           ; 01 02 03 04 05 06 07 08

    punpcklbw   xmm6, xmm5
    movq        xmm4, [rsi+8]           ; 08 09 10 11 12 13 14 15

    movq        xmm5, [rsi+9]           ; 09 10 11 12 13 14 15 16
    lea         rsi, [rsi + rdx]        ; next line

    pmaddubsw   xmm6, xmm1

    punpcklbw   xmm4, xmm5
    pmaddubsw   xmm4, xmm1

    paddw       xmm6, [GLOBAL(rd)]      ; xmm6 += round value
    psraw       xmm6, VP8_FILTER_SHIFT  ; xmm6 /= 128

    paddw       xmm4, [GLOBAL(rd)]      ; xmm4 += round value
    psraw       xmm4, VP8_FILTER_SHIFT  ; xmm4 /= 128

    packuswb    xmm6, xmm4
    movdqa      xmm5, xmm7

    punpcklbw   xmm5, xmm6              ; interleave previous/current rows, low half
    pmaddubsw   xmm5, xmm2

    punpckhbw   xmm7, xmm6              ; interleave previous/current rows, high half
    pmaddubsw   xmm7, xmm2

    paddw       xmm5, [GLOBAL(rd)]      ; xmm5 += round value
    psraw       xmm5, VP8_FILTER_SHIFT  ; xmm5 /= 128

    paddw       xmm7, [GLOBAL(rd)]      ; xmm7 += round value
    psraw       xmm7, VP8_FILTER_SHIFT  ; xmm7 /= 128

    packuswb    xmm5, xmm7
    movdqa      xmm7, xmm6              ; current row becomes the previous row

    movdqa      [rdi], xmm5             ; store the results in the destination
%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(5)   ; dst_pitch
%else
    add         rdi, r8
%endif

    cmp         rdi, rcx
    jne         .next_row

    jmp         .done

.b16x16_sp_only:
    movsxd      rax, dword ptr arg(3)   ; yoffset
    shl         rax, 4
    lea         rax, [rax + rcx]        ; VFilter

    mov         rdi, arg(4)             ; dst_ptr
    mov         rsi, arg(0)             ; src_ptr
    movsxd      rdx, dword ptr arg(5)   ; dst_pitch

    movdqa      xmm1, [rax]             ; VFilter

    lea         rcx, [rdi+rdx*8]
    lea         rcx, [rcx+rdx*8]        ; rcx = dst_ptr + 16 * dst_pitch (end marker)
    movsxd      rax, dword ptr arg(1)   ; src_pixels_per_line

    ; get the first horizontal line done
    movq        xmm4, [rsi]             ; load row 0
    movq        xmm2, [rsi + 8]         ; load row 0

    lea         rsi, [rsi + rax]        ; next line
.next_row_sp:
    ; two output rows per iteration; xmm4/xmm2 carry the last row loaded
    movq        xmm3, [rsi]             ; load row + 1
    movq        xmm5, [rsi + 8]         ; load row + 1

    punpcklbw   xmm4, xmm3
    punpcklbw   xmm2, xmm5

    pmaddubsw   xmm4, xmm1
    movq        xmm7, [rsi + rax]       ; load row + 2

    pmaddubsw   xmm2, xmm1
    movq        xmm6, [rsi + rax + 8]   ; load row + 2

    punpcklbw   xmm3, xmm7
    punpcklbw   xmm5, xmm6

    pmaddubsw   xmm3, xmm1
    paddw       xmm4, [GLOBAL(rd)]

    pmaddubsw   xmm5, xmm1
    paddw       xmm2, [GLOBAL(rd)]

    psraw       xmm4, VP8_FILTER_SHIFT
    psraw       xmm2, VP8_FILTER_SHIFT

    packuswb    xmm4, xmm2
    paddw       xmm3, [GLOBAL(rd)]

    movdqa      [rdi], xmm4             ; store row 0
    paddw       xmm5, [GLOBAL(rd)]

    psraw       xmm3, VP8_FILTER_SHIFT
    psraw       xmm5, VP8_FILTER_SHIFT

    packuswb    xmm3, xmm5
    movdqa      xmm4, xmm7

    movdqa      [rdi + rdx], xmm3       ; store row 1
    lea         rsi, [rsi + 2*rax]

    movdqa      xmm2, xmm6
    lea         rdi, [rdi + 2*rdx]

    cmp         rdi, rcx
    jne         .next_row_sp

    jmp         .done

.b16x16_fp_only:
    ; entered with rdi = dst_ptr, rsi = src_ptr, rdx = dst_pitch, xmm1 = HFilter
    lea         rcx, [rdi+rdx*8]
    lea         rcx, [rcx+rdx*8]        ; rcx = dst_ptr + 16 * dst_pitch (end marker)
    movsxd      rax, dword ptr arg(1)   ; src_pixels_per_line

.next_row_fp:
    ; two output rows per iteration
    movq        xmm2, [rsi]             ; 00 01 02 03 04 05 06 07
    movq        xmm4, [rsi+1]           ; 01 02 03 04 05 06 07 08

    punpcklbw   xmm2, xmm4
    movq        xmm3, [rsi+8]           ; 08 09 10 11 12 13 14 15

    pmaddubsw   xmm2, xmm1
    movq        xmm4, [rsi+9]           ; 09 10 11 12 13 14 15 16

    lea         rsi, [rsi + rax]        ; next line
    punpcklbw   xmm3, xmm4

    pmaddubsw   xmm3, xmm1
    movq        xmm5, [rsi]

    paddw       xmm2, [GLOBAL(rd)]
    movq        xmm7, [rsi+1]

    movq        xmm6, [rsi+8]
    psraw       xmm2, VP8_FILTER_SHIFT

    punpcklbw   xmm5, xmm7
    movq        xmm7, [rsi+9]

    paddw       xmm3, [GLOBAL(rd)]
    pmaddubsw   xmm5, xmm1

    psraw       xmm3, VP8_FILTER_SHIFT
    punpcklbw   xmm6, xmm7

    packuswb    xmm2, xmm3
    pmaddubsw   xmm6, xmm1

    movdqa      [rdi], xmm2             ; store the results in the destination
    paddw       xmm5, [GLOBAL(rd)]

    lea         rdi, [rdi + rdx]        ; dst_pitch
    psraw       xmm5, VP8_FILTER_SHIFT

    paddw       xmm6, [GLOBAL(rd)]
    psraw       xmm6, VP8_FILTER_SHIFT

    packuswb    xmm5, xmm6
    lea         rsi, [rsi + rax]        ; next line

    movdqa      [rdi], xmm5             ; store the results in the destination
    lea         rdi, [rdi + rdx]        ; dst_pitch

    cmp         rdi, rcx

    jne         .next_row_fp

.done:
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_bilinear_predict8x8_ssse3
;(
;    unsigned char  *src_ptr,
;    int   src_pixels_per_line,
;    int  xoffset,
;    int  yoffset,
;    unsigned char *dst_ptr,
;    int dst_pitch
;)
; 8x8 bilinear prediction. Nine source rows are first copied (16 bytes each,
; unaligned loads) into a 144-byte, 16-byte aligned stack buffer, so
; src_ptr[0 .. 15] must be readable on each of the 9 rows.
; rsp itself is then used as the read cursor over that buffer: every path
; advances rsp by exactly 144 bytes in total before reaching .done8x8, where
; "pop rsp" reloads the stack pointer (presumably the value ALIGN_STACK saved -
; the macro is defined outside this file).
; NOTE(review): as in the 16x16 routine, xoffset == 0 with yoffset == 0 would
; use table entry 0 (128, 0), which pmaddubsw reads as -128.
global sym(vp8_bilinear_predict8x8_ssse3) PRIVATE
sym(vp8_bilinear_predict8x8_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 144                ; reserve 144 bytes

    lea         rcx, [GLOBAL(vp8_bilinear_filters_ssse3)]

    mov         rsi, arg(0)             ;src_ptr
    movsxd      rdx, dword ptr arg(1)   ;src_pixels_per_line

    ;Read 9-line unaligned data in and put them on stack. This gives a big
    ;performance boost.
    movdqu      xmm0, [rsi]
    lea         rax, [rdx + rdx*2]      ; rax = 3 * src_pixels_per_line
    movdqu      xmm1, [rsi+rdx]
    movdqu      xmm2, [rsi+rdx*2]
    add         rsi, rax
    movdqu      xmm3, [rsi]
    movdqu      xmm4, [rsi+rdx]
    movdqu      xmm5, [rsi+rdx*2]
    add         rsi, rax
    movdqu      xmm6, [rsi]
    movdqu      xmm7, [rsi+rdx]

    movdqa      XMMWORD PTR [rsp], xmm0

    movdqu      xmm0, [rsi+rdx*2]       ; ninth row, reusing xmm0

    movdqa      XMMWORD PTR [rsp+16], xmm1
    movdqa      XMMWORD PTR [rsp+32], xmm2
    movdqa      XMMWORD PTR [rsp+48], xmm3
    movdqa      XMMWORD PTR [rsp+64], xmm4
    movdqa      XMMWORD PTR [rsp+80], xmm5
    movdqa      XMMWORD PTR [rsp+96], xmm6
    movdqa      XMMWORD PTR [rsp+112], xmm7
    movdqa      XMMWORD PTR [rsp+128], xmm0

    movsxd      rax, dword ptr arg(2)   ; xoffset
    test        rax, rax                ; skip first_pass filter if xoffset=0
    je          .b8x8_sp_only

    shl         rax, 4
    add         rax, rcx                ; HFilter

    mov         rdi, arg(4)             ; dst_ptr
    movsxd      rdx, dword ptr arg(5)   ; dst_pitch

    movdqa      xmm0, [rax]

    movsxd      rax, dword ptr arg(3)   ; yoffset
    test        rax, rax                ; skip second_pass filter if yoffset=0
    je          .b8x8_fp_only

    shl         rax, 4
    lea         rax, [rax + rcx]        ; VFilter

    lea         rcx, [rdi+rdx*8]        ; rcx = dst_ptr + 8 * dst_pitch (end marker)

    movdqa      xmm1, [rax]

    ; get the first horizontal line done
    movdqa      xmm3, [rsp]             ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
    movdqa      xmm5, xmm3              ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 xx

    psrldq      xmm5, 1
    lea         rsp, [rsp + 16]         ; next line

    punpcklbw   xmm3, xmm5              ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
    pmaddubsw   xmm3, xmm0              ; 00 02 04 06 08 10 12 14

    paddw       xmm3, [GLOBAL(rd)]      ; xmm3 += round value
    psraw       xmm3, VP8_FILTER_SHIFT  ; xmm3 /= 128

    movdqa      xmm7, xmm3
    packuswb    xmm7, xmm7              ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15

.next_row:
    ; xmm7 = previous horizontally filtered row, xmm6 = current one
    movdqa      xmm6, [rsp]             ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
    lea         rsp, [rsp + 16]         ; next line

    movdqa      xmm5, xmm6

    psrldq      xmm5, 1

    punpcklbw   xmm6, xmm5
    pmaddubsw   xmm6, xmm0

    paddw       xmm6, [GLOBAL(rd)]      ; xmm6 += round value
    psraw       xmm6, VP8_FILTER_SHIFT  ; xmm6 /= 128

    packuswb    xmm6, xmm6

    punpcklbw   xmm7, xmm6
    pmaddubsw   xmm7, xmm1

    paddw       xmm7, [GLOBAL(rd)]      ; xmm7 += round value
    psraw       xmm7, VP8_FILTER_SHIFT  ; xmm7 /= 128

    packuswb    xmm7, xmm7

    movq        [rdi], xmm7             ; store the results in the destination
    lea         rdi, [rdi + rdx]

    movdqa      xmm7, xmm6              ; current row becomes the previous row

    cmp         rdi, rcx
    jne         .next_row

    jmp         .done8x8

.b8x8_sp_only:
    movsxd      rax, dword ptr arg(3)   ; yoffset
    shl         rax, 4
    lea         rax, [rax + rcx]        ; VFilter

    mov         rdi, arg(4)             ;dst_ptr
    movsxd      rdx, dword ptr arg(5)   ; dst_pitch

    movdqa      xmm0, [rax]             ; VFilter

    ; fully unrolled: rows 0..8 of the stack buffer give output rows 0..7
    movq        xmm1, XMMWORD PTR [rsp]
    movq        xmm2, XMMWORD PTR [rsp+16]

    movq        xmm3, XMMWORD PTR [rsp+32]
    punpcklbw   xmm1, xmm2

    movq        xmm4, XMMWORD PTR [rsp+48]
    punpcklbw   xmm2, xmm3

    movq        xmm5, XMMWORD PTR [rsp+64]
    punpcklbw   xmm3, xmm4

    movq        xmm6, XMMWORD PTR [rsp+80]
    punpcklbw   xmm4, xmm5

    movq        xmm7, XMMWORD PTR [rsp+96]
    punpcklbw   xmm5, xmm6

    pmaddubsw   xmm1, xmm0
    pmaddubsw   xmm2, xmm0

    pmaddubsw   xmm3, xmm0
    pmaddubsw   xmm4, xmm0

    pmaddubsw   xmm5, xmm0
    punpcklbw   xmm6, xmm7

    pmaddubsw   xmm6, xmm0
    paddw       xmm1, [GLOBAL(rd)]

    paddw       xmm2, [GLOBAL(rd)]
    psraw       xmm1, VP8_FILTER_SHIFT

    paddw       xmm3, [GLOBAL(rd)]
    psraw       xmm2, VP8_FILTER_SHIFT

    paddw       xmm4, [GLOBAL(rd)]
    psraw       xmm3, VP8_FILTER_SHIFT

    paddw       xmm5, [GLOBAL(rd)]
    psraw       xmm4, VP8_FILTER_SHIFT

    paddw       xmm6, [GLOBAL(rd)]
    psraw       xmm5, VP8_FILTER_SHIFT

    psraw       xmm6, VP8_FILTER_SHIFT
    packuswb    xmm1, xmm1

    packuswb    xmm2, xmm2
    movq        [rdi], xmm1

    packuswb    xmm3, xmm3
    movq        [rdi+rdx], xmm2

    packuswb    xmm4, xmm4
    movq        xmm1, XMMWORD PTR [rsp+112]

    lea         rdi, [rdi + 2*rdx]
    movq        xmm2, XMMWORD PTR [rsp+128]

    packuswb    xmm5, xmm5
    movq        [rdi], xmm3

    packuswb    xmm6, xmm6
    movq        [rdi+rdx], xmm4

    lea         rdi, [rdi + 2*rdx]
    punpcklbw   xmm7, xmm1

    movq        [rdi], xmm5
    pmaddubsw   xmm7, xmm0

    movq        [rdi+rdx], xmm6
    punpcklbw   xmm1, xmm2

    pmaddubsw   xmm1, xmm0
    paddw       xmm7, [GLOBAL(rd)]

    psraw       xmm7, VP8_FILTER_SHIFT
    paddw       xmm1, [GLOBAL(rd)]

    psraw       xmm1, VP8_FILTER_SHIFT
    packuswb    xmm7, xmm7

    packuswb    xmm1, xmm1
    lea         rdi, [rdi + 2*rdx]

    movq        [rdi], xmm7

    movq        [rdi+rdx], xmm1
    lea         rsp, [rsp + 144]        ; skip the whole buffer before .done8x8

    jmp         .done8x8

.b8x8_fp_only:
    ; entered with rdi = dst_ptr, rdx = dst_pitch, xmm0 = HFilter
    lea         rcx, [rdi+rdx*8]        ; rcx = dst_ptr + 8 * dst_pitch (end marker)

.next_row_fp:
    ; four output rows per iteration (two iterations in total)
    movdqa      xmm1, XMMWORD PTR [rsp]
    movdqa      xmm3, XMMWORD PTR [rsp+16]

    movdqa      xmm2, xmm1
    movdqa      xmm5, XMMWORD PTR [rsp+32]

    psrldq      xmm2, 1
    movdqa      xmm7, XMMWORD PTR [rsp+48]

    movdqa      xmm4, xmm3
    psrldq      xmm4, 1

    movdqa      xmm6, xmm5
    psrldq      xmm6, 1

    punpcklbw   xmm1, xmm2
    pmaddubsw   xmm1, xmm0

    punpcklbw   xmm3, xmm4
    pmaddubsw   xmm3, xmm0

    punpcklbw   xmm5, xmm6
    pmaddubsw   xmm5, xmm0

    movdqa      xmm2, xmm7
    psrldq      xmm2, 1

    punpcklbw   xmm7, xmm2
    pmaddubsw   xmm7, xmm0

    paddw       xmm1, [GLOBAL(rd)]
    psraw       xmm1, VP8_FILTER_SHIFT

    paddw       xmm3, [GLOBAL(rd)]
    psraw       xmm3, VP8_FILTER_SHIFT

    paddw       xmm5, [GLOBAL(rd)]
    psraw       xmm5, VP8_FILTER_SHIFT

    paddw       xmm7, [GLOBAL(rd)]
    psraw       xmm7, VP8_FILTER_SHIFT

    packuswb    xmm1, xmm1
    packuswb    xmm3, xmm3

    packuswb    xmm5, xmm5
    movq        [rdi], xmm1

    packuswb    xmm7, xmm7
    movq        [rdi+rdx], xmm3

    lea         rdi, [rdi + 2*rdx]
    movq        [rdi], xmm5

    lea         rsp, [rsp + 4*16]
    movq        [rdi+rdx], xmm7

    lea         rdi, [rdi + 2*rdx]
    cmp         rdi, rcx

    jne         .next_row_fp

    lea         rsp, [rsp + 16]         ; skip the unused ninth row: 2*64 + 16 = 144

.done8x8:
    ;add rsp, 144
    pop         rsp
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
; pshufb masks for the 4-wide horizontal filters. Indices are byte offsets into
; the 16 bytes loaded from src_ptr - 2, so index 2 is the pixel being filtered.
align 16
shuf1b:                                 ; pairs (p[i-2], p[i+3]) for k0/k5
    db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
shuf2b:                                 ; pairs (p[i], p[i+2]) for k2/k4
    db 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11
shuf3b:                                 ; pairs (p[i-1], p[i+1]) for k1/k3
    db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10

; pshufb masks for the 8/16-wide horizontal filters. They re-order the
; already interleaved "-2 3 -1 4 0 5 ..." register rather than the raw pixels.
align 16
shuf2bfrom1:                            ; pairs (p[i], p[i+2]) for k2/k4
    db  4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11, 9,13
align 16
shuf3bfrom1:                            ; pairs (p[i-1], p[i+1]) for k1/k3
    db  2, 6, 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11

; rounding constant: 0x40 = 64 = half of 1 << VP8_FILTER_SHIFT
align 16
rd:
    times 8 dw 0x40

; Six-tap coefficients as byte pairs, one 16-byte row per filter index.
; The code addresses k1_k3 as k0_k5 + 128 and k2_k4 as k0_k5 + 256, so the
; three tables must stay contiguous and keep 8 rows each.
align 16
k0_k5:
    times 8 db 0, 0             ;placeholder
    times 8 db 0, 0
    times 8 db 2, 1
    times 8 db 0, 0
    times 8 db 3, 3
    times 8 db 0, 0
    times 8 db 1, 2
    times 8 db 0, 0
k1_k3:
    times 8 db  0,    0         ;placeholder
    times 8 db  -6,  12
    times 8 db -11,  36
    times 8 db  -9,  50
    times 8 db -16,  77
    times 8 db  -6,  93
    times 8 db  -8, 108
    times 8 db  -1, 123
k2_k4:
    times 8 db 128,    0        ;placeholder
    times 8 db 123,   -1
    times 8 db 108,   -8
    times 8 db  93,   -6
    times 8 db  77,  -16
    times 8 db  50,   -9
    times 8 db  36,  -11
    times 8 db  12,   -6
; Bilinear weights as (128 - 16*n, 16*n) byte pairs, indexed by x/y offset.
align 16
vp8_bilinear_filters_ssse3:
    times 8 db 128, 0
    times 8 db 112, 16
    times 8 db 96,  32
    times 8 db 80,  48
    times 8 db 64,  64
    times 8 db 48,  80
    times 8 db 32,  96
    times 8 db 16,  112
