;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"
extern sym(vp8_bilinear_filters_x86_8)

%define BLOCK_HEIGHT_WIDTH 4
%define VP8_FILTER_WEIGHT 128
%define VP8_FILTER_SHIFT  7


;-----------------------------------------------------------------------------
; VP8_SIXTAP_H_8 src
;
; Horizontal 6-tap filter for 8 output pixels.
;
; In:    %1    = 16 source bytes; byte 0 is pixel x[-2], byte 13 is x[11]
;        xmm3  = register whose low 8 bytes equal the low 8 bytes of %1
;        xmm0  = all zero (used to widen bytes to words)
;        rdx   = filter: six 16-byte vectors of 8 words each, at rdx+0..rdx+80,
;                used as pmullw memory operands
;                (NOTE(review): so presumably 16-byte aligned - confirm)
; Out:   xmm4  = 8 words: (saturating sum of the six products + rd) >> 7
; Clobb: %1, xmm3, xmm5, xmm6, xmm7
;
; The order of the paddsw instructions is significant because the adds
; saturate; do not reorder them.
;-----------------------------------------------------------------------------
%macro VP8_SIXTAP_H_8 1
    movdqa      xmm4,       %1
    movdqa      xmm5,       %1

    movdqa      xmm6,       %1
    movdqa      xmm7,       %1

    punpcklbw   xmm3,       xmm0                ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2
    psrldq      xmm4,       1                   ; drop x[-2]

    pmullw      xmm3,       XMMWORD PTR [rdx]   ; x[-2] * H[-2]; Tap 1
    punpcklbw   xmm4,       xmm0                ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1

    psrldq      xmm5,       2                   ; drop x[-2], x[-1]
    pmullw      xmm4,       XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2

    punpcklbw   xmm5,       xmm0                ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
    psrldq      xmm6,       3

    pmullw      xmm5,       [rdx+32]            ; x[ 0] * H[ 0]; Tap 3

    punpcklbw   xmm6,       xmm0                ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
    psrldq      xmm7,       4

    pmullw      xmm6,       [rdx+48]            ; x[ 1] * H[ 1]; Tap 4

    punpcklbw   xmm7,       xmm0                ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
    psrldq      %1,         5

    pmullw      xmm7,       [rdx+64]            ; x[ 2] * H[ 2]; Tap 5

    punpcklbw   %1,         xmm0                ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
    pmullw      %1,         [rdx+80]            ; x[ 3] * H[ 3]; Tap 6

    paddsw      xmm4,       xmm7
    paddsw      xmm4,       xmm5

    paddsw      xmm4,       xmm3
    paddsw      xmm4,       xmm6

    paddsw      xmm4,       %1
    paddsw      xmm4,       [GLOBAL(rd)]        ; + rounding constant

    psraw       xmm4,       VP8_FILTER_SHIFT
%endmacro


;-----------------------------------------------------------------------------
; VP8_BILINEAR_H_16
;
; Horizontal 2-tap (bilinear) filter for one row of 16 pixels.
;
; In:    rsi   = source row; bytes [rsi] .. [rsi+16] are read (unaligned)
;        xmm1  = 8 words multiplied with the pixels at [rsi]
;        xmm2  = 8 words multiplied with the pixels at [rsi+1]
;        xmm0  = all zero
; Out:   xmm3  = pixels 0..7  as words, rounded and shifted
;        xmm4  = pixels 8..15 as words, rounded and shifted
; Clobb: xmm5, xmm6
;-----------------------------------------------------------------------------
%macro VP8_BILINEAR_H_16 0
    movdqu      xmm3,       [rsi]               ; pixels 0..15
    movdqa      xmm4,       xmm3

    punpcklbw   xmm3,       xmm0                ; pixels 0..7  -> words
    punpckhbw   xmm4,       xmm0                ; pixels 8..15 -> words

    pmullw      xmm3,       xmm1
    pmullw      xmm4,       xmm1

    movdqu      xmm5,       [rsi+1]             ; pixels 1..16
    movdqa      xmm6,       xmm5

    punpcklbw   xmm5,       xmm0
    punpckhbw   xmm6,       xmm0

    pmullw      xmm5,       xmm2
    pmullw      xmm6,       xmm2

    paddw       xmm3,       xmm5
    paddw       xmm4,       xmm6

    paddw       xmm3,       [GLOBAL(rd)]        ; += round value
    psraw       xmm3,       VP8_FILTER_SHIFT    ; /= 128

    paddw       xmm4,       [GLOBAL(rd)]
    psraw       xmm4,       VP8_FILTER_SHIFT
%endmacro


;/************************************************************************************
; Notes: filter_block1d8_h6 applies a 6 tap filter horizontally to the input pixels.
; Each loop iteration produces ONE row of 8 pixels. Every result is clamped to
; 0..255 and stored as a 16-bit word (8 words = 16 bytes per row, aligned store).
; output_width is added to output_ptr as a byte offset after each row.
;*************************************************************************************/
;void vp8_filter_block1d8_h6_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned short *output_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned int    pixel_step,
;    unsigned int    output_height,
;    unsigned int    output_width,
;    short           *vp8_filter
;)
global sym(vp8_filter_block1d8_h6_sse2) PRIVATE
sym(vp8_filter_block1d8_h6_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rdx,        arg(6) ;vp8_filter
    mov         rsi,        arg(0) ;src_ptr

    mov         rdi,        arg(1) ;output_ptr

    movsxd      rcx,        dword ptr arg(4) ;output_height
    movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line ; pitch for source
%if ABI_IS_32BIT=0
    movsxd      r8,         dword ptr arg(5) ;output_width
%endif
    pxor        xmm0,       xmm0                ; zero, for unpacking

.filter_block1d8_h6_rowloop:
    movq        xmm3,       MMWORD PTR [rsi - 2] ; x[-2] .. x[5]
    movq        xmm1,       MMWORD PTR [rsi + 6] ; x[ 6] .. x[13]

    prefetcht2  [rsi+rax-2]                     ; next source row

    pslldq      xmm1,       8
    por         xmm1,       xmm3                ; xmm1 = x[-2] .. x[13]

    VP8_SIXTAP_H_8 xmm1

    packuswb    xmm4,       xmm0                ; clamp to 0..255
    punpcklbw   xmm4,       xmm0                ; back to words

    movdqa      XMMWORD PTR [rdi], xmm4
    lea         rsi,        [rsi + rax]

%if ABI_IS_32BIT
    add         rdi,        DWORD PTR arg(5) ;[output_width]
%else
    add         rdi,        r8
%endif
    dec         rcx

    jnz         .filter_block1d8_h6_rowloop     ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_filter_block1d16_h6_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned short *output_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned int    pixel_step,
;    unsigned int    output_height,
;    unsigned int    output_width,
;    short           *vp8_filter
;)
;/************************************************************************************
; Notes: filter_block1d16_h6 applies a 6 tap filter horizontally to the input pixels.
; Each loop iteration produces ONE row of 16 pixels, as two groups of 8. Every
; result is clamped to 0..255 and stored as a 16-bit word (32 bytes per row,
; aligned stores). output_width is added to output_ptr as a byte offset per row.
;*************************************************************************************/
global sym(vp8_filter_block1d16_h6_sse2) PRIVATE
sym(vp8_filter_block1d16_h6_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rdx,        arg(6) ;vp8_filter
    mov         rsi,        arg(0) ;src_ptr

    mov         rdi,        arg(1) ;output_ptr

    movsxd      rcx,        dword ptr arg(4) ;output_height
    movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line ; pitch for source
%if ABI_IS_32BIT=0
    movsxd      r8,         dword ptr arg(5) ;output_width
%endif

    pxor        xmm0,       xmm0                ; zero, for unpacking

.filter_block1d16_h6_sse2_rowloop:
    movq        xmm3,       MMWORD PTR [rsi - 2] ; x[-2] .. x[5]
    movq        xmm1,       MMWORD PTR [rsi + 6] ; x[ 6] .. x[13]

    movq        xmm2,       MMWORD PTR [rsi +14] ; x[14] .. x[21]
    pslldq      xmm2,       8

    por         xmm2,       xmm1                ; xmm2 = x[6] .. x[21] (xmm1 not yet shifted)
    prefetcht2  [rsi+rax-2]                     ; next source row

    pslldq      xmm1,       8
    por         xmm1,       xmm3                ; xmm1 = x[-2] .. x[13]

    ; pixels 0..7
    VP8_SIXTAP_H_8 xmm1

    packuswb    xmm4,       xmm0                ; clamp to 0..255
    punpcklbw   xmm4,       xmm0                ; back to words

    movdqa      XMMWORD PTR [rdi], xmm4

    ; pixels 8..15
    movdqa      xmm3,       xmm2
    VP8_SIXTAP_H_8 xmm2

    packuswb    xmm4,       xmm0
    punpcklbw   xmm4,       xmm0

    movdqa      XMMWORD PTR [rdi+16], xmm4

    lea         rsi,        [rsi + rax]
%if ABI_IS_32BIT
    add         rdi,        DWORD PTR arg(5) ;[output_width]
%else
    add         rdi,        r8
%endif

    dec         rcx
    jnz         .filter_block1d16_h6_sse2_rowloop ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_filter_block1d8_v6_sse2
;(
;    short *src_ptr,
;    unsigned char *output_ptr,
;    int dst_pitch,
;    unsigned int pixels_per_line,
;    unsigned int pixel_step,
;    unsigned int output_height,
;    unsigned int output_width,
;    short * vp8_filter
;)
;/************************************************************************************
; Notes: filter_block1d8_v6 applies a 6 tap filter vertically to the input pixels.
; The input is rows of 8 words read with aligned 16-byte loads; pixels_per_line
; is used as the byte offset between input rows. The first tap reads from
; src_ptr - 2 * pixels_per_line. Each iteration writes 8 output bytes.
;*************************************************************************************/
global sym(vp8_filter_block1d8_v6_sse2) PRIVATE
sym(vp8_filter_block1d8_v6_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rax,        arg(7) ;vp8_filter
    movsxd      rdx,        dword ptr arg(3) ;pixels_per_line

    mov         rdi,        arg(1) ;output_ptr
    mov         rsi,        arg(0) ;src_ptr

    sub         rsi,        rdx                 ; back up two rows so that
    sub         rsi,        rdx                 ; [rsi] is the first tap

    movsxd      rcx,        DWORD PTR arg(5) ;[output_height]
    pxor        xmm0,       xmm0                ; zero, for packing

    movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)] ; rounding constant
%if ABI_IS_32BIT=0
    movsxd      r8,         dword ptr arg(2) ;dst_pitch
%endif

.vp8_filter_block1d8_v6_sse2_loop:
    movdqa      xmm1,       XMMWORD PTR [rsi]           ; row 1
    pmullw      xmm1,       [rax]

    movdqa      xmm2,       XMMWORD PTR [rsi + rdx]     ; row 2
    pmullw      xmm2,       [rax + 16]

    movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 2] ; row 3
    pmullw      xmm3,       [rax + 32]

    movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 4] ; row 5
    pmullw      xmm5,       [rax + 64]

    add         rsi,        rdx                 ; advance one row; also the loop step
    movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 2] ; row 4

    pmullw      xmm4,       [rax + 48]
    movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 4] ; row 6

    pmullw      xmm6,       [rax + 80]

    ; saturating adds: keep this order (2 + 5 + 3 + 1 + 4 + 6 + round)
    paddsw      xmm2,       xmm5
    paddsw      xmm2,       xmm3

    paddsw      xmm2,       xmm1
    paddsw      xmm2,       xmm4

    paddsw      xmm2,       xmm6
    paddsw      xmm2,       xmm7

    psraw       xmm2,       VP8_FILTER_SHIFT
    packuswb    xmm2,       xmm0                ; pack and saturate

    movq        QWORD PTR [rdi], xmm2           ; store the results in the destination
%if ABI_IS_32BIT
    add         rdi,        DWORD PTR arg(2) ;[dst_pitch]
%else
    add         rdi,        r8
%endif
    dec         rcx                             ; decrement count
    jnz         .vp8_filter_block1d8_v6_sse2_loop ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_filter_block1d16_v6_sse2
;(
;    unsigned short *src_ptr,
;    unsigned char *output_ptr,
;    int dst_pitch,
;    unsigned int pixels_per_line,
;    unsigned int pixel_step,
;    unsigned int output_height,
;    unsigned int output_width,
;    const short    *vp8_filter
;)
;/************************************************************************************
; Notes: filter_block1d16_v6 applies a 6 tap filter vertically to the input pixels.
; The input is rows of 16 words (two aligned 16-byte loads per row);
; pixels_per_line is used as the byte offset between input rows. The first tap
; reads from src_ptr - 2 * pixels_per_line. Each iteration writes 16 output
; bytes with an aligned store.
;*************************************************************************************/
global sym(vp8_filter_block1d16_v6_sse2) PRIVATE
sym(vp8_filter_block1d16_v6_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rax,        arg(7) ;vp8_filter
    movsxd      rdx,        dword ptr arg(3) ;pixels_per_line

    mov         rdi,        arg(1) ;output_ptr
    mov         rsi,        arg(0) ;src_ptr

    sub         rsi,        rdx                 ; back up two rows so that
    sub         rsi,        rdx                 ; [rsi] is the first tap

    movsxd      rcx,        DWORD PTR arg(5) ;[output_height]
%if ABI_IS_32BIT=0
    movsxd      r8,         dword ptr arg(2) ;dst_pitch
%endif

.vp8_filter_block1d16_v6_sse2_loop:
; The order for adding 6-tap is 2 5 3 1 4 6. Read in data in that order.
; xmm1 accumulates pixels 0..7, xmm2 accumulates pixels 8..15.
    movdqa      xmm1,       XMMWORD PTR [rsi + rdx]       ; line 2
    movdqa      xmm2,       XMMWORD PTR [rsi + rdx + 16]
    pmullw      xmm1,       [rax + 16]
    pmullw      xmm2,       [rax + 16]

    movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 4]   ; line 5
    movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 4 + 16]
    pmullw      xmm3,       [rax + 64]
    pmullw      xmm4,       [rax + 64]

    movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 2]   ; line 3
    movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 2 + 16]
    pmullw      xmm5,       [rax + 32]
    pmullw      xmm6,       [rax + 32]

    movdqa      xmm7,       XMMWORD PTR [rsi]             ; line 1
    movdqa      xmm0,       XMMWORD PTR [rsi + 16]
    pmullw      xmm7,       [rax]
    pmullw      xmm0,       [rax]

    paddsw      xmm1,       xmm3
    paddsw      xmm2,       xmm4
    paddsw      xmm1,       xmm5
    paddsw      xmm2,       xmm6
    paddsw      xmm1,       xmm7
    paddsw      xmm2,       xmm0

    add         rsi,        rdx                 ; advance one row; also the loop step

    movdqa      xmm3,       XMMWORD PTR [rsi + rdx * 2]   ; line 4
    movdqa      xmm4,       XMMWORD PTR [rsi + rdx * 2 + 16]
    pmullw      xmm3,       [rax + 48]
    pmullw      xmm4,       [rax + 48]

    movdqa      xmm5,       XMMWORD PTR [rsi + rdx * 4]   ; line 6
    movdqa      xmm6,       XMMWORD PTR [rsi + rdx * 4 + 16]
    pmullw      xmm5,       [rax + 80]
    pmullw      xmm6,       [rax + 80]

    movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)]      ; rounding constant

    paddsw      xmm1,       xmm3
    paddsw      xmm2,       xmm4
    paddsw      xmm1,       xmm5
    paddsw      xmm2,       xmm6

    paddsw      xmm1,       xmm7
    paddsw      xmm2,       xmm7

    psraw       xmm1,       VP8_FILTER_SHIFT
    psraw       xmm2,       VP8_FILTER_SHIFT

    packuswb    xmm1,       xmm2                ; pack and saturate
    movdqa      XMMWORD PTR [rdi], xmm1         ; store the results in the destination
%if ABI_IS_32BIT
    add         rdi,        DWORD PTR arg(2) ;[dst_pitch]
%else
    add         rdi,        r8
%endif
    dec         rcx                             ; decrement count
    jnz         .vp8_filter_block1d16_v6_sse2_loop ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_filter_block1d8_h6_only_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    int dst_pitch,
;    unsigned int    output_height,
;    const short    *vp8_filter
;)
; First-pass filter only when yoffset==0
; Same horizontal 6-tap filter as vp8_filter_block1d8_h6_sse2, but each row is
; written directly as 8 clamped bytes.
global sym(vp8_filter_block1d8_h6_only_sse2) PRIVATE
sym(vp8_filter_block1d8_h6_only_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rdx,        arg(5) ;vp8_filter
    mov         rsi,        arg(0) ;src_ptr

    mov         rdi,        arg(2) ;output_ptr

    movsxd      rcx,        dword ptr arg(4) ;output_height
    movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line ; pitch for source
%if ABI_IS_32BIT=0
    movsxd      r8,         dword ptr arg(3) ;dst_pitch
%endif
    pxor        xmm0,       xmm0                ; zero, for unpacking

.filter_block1d8_h6_only_rowloop:
    movq        xmm3,       MMWORD PTR [rsi - 2] ; x[-2] .. x[5]
    movq        xmm1,       MMWORD PTR [rsi + 6] ; x[ 6] .. x[13]

    prefetcht2  [rsi+rax-2]                     ; next source row

    pslldq      xmm1,       8
    por         xmm1,       xmm3                ; xmm1 = x[-2] .. x[13]

    VP8_SIXTAP_H_8 xmm1

    packuswb    xmm4,       xmm0                ; clamp to 0..255

    movq        QWORD PTR [rdi], xmm4           ; store the results in the destination
    lea         rsi,        [rsi + rax]

%if ABI_IS_32BIT
    add         rdi,        DWORD PTR arg(3) ;dst_pitch
%else
    add         rdi,        r8
%endif
    dec         rcx

    jnz         .filter_block1d8_h6_only_rowloop ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_filter_block1d16_h6_only_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    int dst_pitch,
;    unsigned int    output_height,
;    const short    *vp8_filter
;)
; First-pass filter only when yoffset==0
; Same horizontal 6-tap filter as vp8_filter_block1d16_h6_sse2, but each row is
; written directly as 16 clamped bytes (two 8-byte stores).
global sym(vp8_filter_block1d16_h6_only_sse2) PRIVATE
sym(vp8_filter_block1d16_h6_only_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rdx,        arg(5) ;vp8_filter
    mov         rsi,        arg(0) ;src_ptr

    mov         rdi,        arg(2) ;output_ptr

    movsxd      rcx,        dword ptr arg(4) ;output_height
    movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line ; pitch for source
%if ABI_IS_32BIT=0
    movsxd      r8,         dword ptr arg(3) ;dst_pitch
%endif

    pxor        xmm0,       xmm0                ; zero, for unpacking

.filter_block1d16_h6_only_sse2_rowloop:
    movq        xmm3,       MMWORD PTR [rsi - 2] ; x[-2] .. x[5]
    movq        xmm1,       MMWORD PTR [rsi + 6] ; x[ 6] .. x[13]

    movq        xmm2,       MMWORD PTR [rsi +14] ; x[14] .. x[21]
    pslldq      xmm2,       8

    por         xmm2,       xmm1                ; xmm2 = x[6] .. x[21] (xmm1 not yet shifted)
    prefetcht2  [rsi+rax-2]                     ; next source row

    pslldq      xmm1,       8
    por         xmm1,       xmm3                ; xmm1 = x[-2] .. x[13]

    ; pixels 0..7
    VP8_SIXTAP_H_8 xmm1

    packuswb    xmm4,       xmm0                ; lower 8 bytes

    movq        QWORD PTR [rdi], xmm4           ; store the results in the destination

    ; pixels 8..15
    movdqa      xmm3,       xmm2
    VP8_SIXTAP_H_8 xmm2

    packuswb    xmm4,       xmm0                ; higher 8 bytes

    movq        QWORD PTR [rdi+8], xmm4         ; store the results in the destination

    lea         rsi,        [rsi + rax]
%if ABI_IS_32BIT
    add         rdi,        DWORD PTR arg(3) ;dst_pitch
%else
    add         rdi,        r8
%endif

    dec         rcx
    jnz         .filter_block1d16_h6_only_sse2_rowloop ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_filter_block1d8_v6_only_sse2
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pixels_per_line,
;    unsigned char *output_ptr,
;    int dst_pitch,
;    unsigned int   output_height,
;    const short   *vp8_filter
;)
; Second-pass filter only when xoffset==0
; Vertical 6-tap filter over byte input, 8 pixels per row. Unlike
; vp8_filter_block1d8_v6_sse2, src_ptr is NOT moved back two rows here: the
; first tap is read from src_ptr itself.
global sym(vp8_filter_block1d8_v6_only_sse2) PRIVATE
sym(vp8_filter_block1d8_v6_only_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0) ;src_ptr
    mov         rdi,        arg(2) ;output_ptr

    movsxd      rcx,        dword ptr arg(4) ;output_height
    movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line

    mov         rax,        arg(5) ;vp8_filter

    pxor        xmm0,       xmm0                ; zero, for unpacking

    movdqa      xmm7,       XMMWORD PTR [GLOBAL(rd)] ; rounding constant
%if ABI_IS_32BIT=0
    movsxd      r8,         dword ptr arg(3) ;dst_pitch
%endif

.vp8_filter_block1d8_v6_only_sse2_loop:
    movq        xmm1,       MMWORD PTR [rsi]            ; row 1
    movq        xmm2,       MMWORD PTR [rsi + rdx]      ; row 2
    movq        xmm3,       MMWORD PTR [rsi + rdx * 2]  ; row 3
    movq        xmm5,       MMWORD PTR [rsi + rdx * 4]  ; row 5
    add         rsi,        rdx                 ; advance one row; also the loop step
    movq        xmm4,       MMWORD PTR [rsi + rdx * 2]  ; row 4
    movq        xmm6,       MMWORD PTR [rsi + rdx * 4]  ; row 6

    punpcklbw   xmm1,       xmm0
    pmullw      xmm1,       [rax]

    punpcklbw   xmm2,       xmm0
    pmullw      xmm2,       [rax + 16]

    punpcklbw   xmm3,       xmm0
    pmullw      xmm3,       [rax + 32]

    punpcklbw   xmm5,       xmm0
    pmullw      xmm5,       [rax + 64]

    punpcklbw   xmm4,       xmm0
    pmullw      xmm4,       [rax + 48]

    punpcklbw   xmm6,       xmm0
    pmullw      xmm6,       [rax + 80]

    ; saturating adds: keep this order (2 + 5 + 3 + 1 + 4 + 6 + round)
    paddsw      xmm2,       xmm5
    paddsw      xmm2,       xmm3

    paddsw      xmm2,       xmm1
    paddsw      xmm2,       xmm4

    paddsw      xmm2,       xmm6
    paddsw      xmm2,       xmm7

    psraw       xmm2,       VP8_FILTER_SHIFT
    packuswb    xmm2,       xmm0                ; pack and saturate

    movq        QWORD PTR [rdi], xmm2           ; store the results in the destination
%if ABI_IS_32BIT
    add         rdi,        DWORD PTR arg(3) ;[dst_pitch]
%else
    add         rdi,        r8
%endif
    dec         rcx                             ; decrement count
    jnz         .vp8_filter_block1d8_v6_only_sse2_loop ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_unpack_block1d16_h6_sse2
;(
;    unsigned char  *src_ptr,
;    unsigned short *output_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned int    output_height,
;    unsigned int    output_width
;)
; Widens 16 source bytes per row to 16 words (no filtering) and stores them
; with two aligned 16-byte stores. output_width is added to output_ptr as a
; byte offset after each row. Only xmm0, xmm1 and xmm3 are used, so no XMM
; save/restore is done here.
global sym(vp8_unpack_block1d16_h6_sse2) PRIVATE
sym(vp8_unpack_block1d16_h6_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0) ;src_ptr
    mov         rdi,        arg(1) ;output_ptr

    movsxd      rcx,        dword ptr arg(3) ;output_height
    movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line ; pitch for source

    pxor        xmm0,       xmm0                ; zero, for unpacking
%if ABI_IS_32BIT=0
    movsxd      r8,         dword ptr arg(4) ;output_width ; pitch for destination
%endif

.unpack_block1d16_h6_sse2_rowloop:
    movq        xmm1,       MMWORD PTR [rsi]    ; pixels 0..7
    movq        xmm3,       MMWORD PTR [rsi+8]  ; pixels 8..15

    punpcklbw   xmm3,       xmm0                ; bytes -> words
    punpcklbw   xmm1,       xmm0

    movdqa      XMMWORD PTR [rdi],      xmm1
    movdqa      XMMWORD PTR [rdi + 16], xmm3

    lea         rsi,        [rsi + rax]
%if ABI_IS_32BIT
    add         rdi,        DWORD PTR arg(4) ;[output_width]
%else
    add         rdi,        r8
%endif
    dec         rcx
    jnz         .unpack_block1d16_h6_sse2_rowloop ; next row

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_bilinear_predict16x16_sse2
;(
;    unsigned char  *src_ptr,
;    int   src_pixels_per_line,
;    int   xoffset,
;    int   yoffset,
;    unsigned char *dst_ptr,
;    int dst_pitch
;)
; Bilinear prediction of a 16x16 block (16 rows are written; dst_ptr is stored
; to with aligned 16-byte stores). Three code paths:
;   xoffset != 0, yoffset != 0 : horizontal pass then vertical pass
;   xoffset == 0               : vertical pass only   (.b16x16_sp_only)
;   yoffset == 0               : horizontal pass only (.b16x16_fp_only)
; Each entry of vp8_bilinear_filters_x86_8 is 32 bytes (offset << 5): two
; vectors of 8 words, one per tap.
global sym(vp8_bilinear_predict16x16_sse2) PRIVATE
sym(vp8_bilinear_predict16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]
    ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]

    lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
    movsxd      rax,        dword ptr arg(2) ;xoffset

    test        rax,        rax                 ;skip first_pass filter if xoffset=0
    je          .b16x16_sp_only

    shl         rax,        5
    add         rax,        rcx                 ;HFilter

    mov         rdi,        arg(4) ;dst_ptr
    mov         rsi,        arg(0) ;src_ptr
    movsxd      rdx,        dword ptr arg(5) ;dst_pitch

    movdqa      xmm1,       [rax]               ; horizontal tap 0
    movdqa      xmm2,       [rax+16]            ; horizontal tap 1

    movsxd      rax,        dword ptr arg(3) ;yoffset

    test        rax,        rax                 ;skip second_pass filter if yoffset=0
    je          .b16x16_fp_only

    shl         rax,        5
    add         rax,        rcx                 ;VFilter

    lea         rcx,        [rdi+rdx*8]
    lea         rcx,        [rcx+rdx*8]         ; rcx = dst_ptr + 16 * dst_pitch (loop end)
    movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line

    pxor        xmm0,       xmm0

%if ABI_IS_32BIT=0
    movsxd      r8,         dword ptr arg(5) ;dst_pitch
%endif
    ; get the first horizontal line done
    VP8_BILINEAR_H_16

    movdqa      xmm7,       xmm3
    packuswb    xmm7,       xmm4                ; xmm7 = previous filtered row, as bytes

    add         rsi,        rdx                 ; next line
.next_row:
    ; horizontal pass on the current row -> xmm3 / xmm4
    VP8_BILINEAR_H_16

    ; previous row * vertical tap 0
    movdqa      xmm5,       xmm7
    movdqa      xmm6,       xmm7

    punpcklbw   xmm5,       xmm0
    punpckhbw   xmm6,       xmm0

    pmullw      xmm5,       [rax]
    pmullw      xmm6,       [rax]

    ; remember the current row for the next iteration
    movdqa      xmm7,       xmm3
    packuswb    xmm7,       xmm4

    ; current row * vertical tap 1
    pmullw      xmm3,       [rax+16]
    pmullw      xmm4,       [rax+16]

    paddw       xmm3,       xmm5
    paddw       xmm4,       xmm6

    paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
    psraw       xmm3,       VP8_FILTER_SHIFT    ; xmm3 /= 128

    paddw       xmm4,       [GLOBAL(rd)]
    psraw       xmm4,       VP8_FILTER_SHIFT

    packuswb    xmm3,       xmm4
    movdqa      [rdi],      xmm3                ; store the results in the destination

    add         rsi,        rdx                 ; next line
%if ABI_IS_32BIT
    add         rdi,        DWORD PTR arg(5) ;dst_pitch
%else
    add         rdi,        r8
%endif

    cmp         rdi,        rcx
    jne         .next_row

    jmp         .done

.b16x16_sp_only:
    movsxd      rax,        dword ptr arg(3) ;yoffset
    shl         rax,        5
    add         rax,        rcx                 ;VFilter

    mov         rdi,        arg(4) ;dst_ptr
    mov         rsi,        arg(0) ;src_ptr
    movsxd      rdx,        dword ptr arg(5) ;dst_pitch

    movdqa      xmm1,       [rax]               ; vertical tap 0
    movdqa      xmm2,       [rax+16]            ; vertical tap 1

    lea         rcx,        [rdi+rdx*8]
    lea         rcx,        [rcx+rdx*8]         ; rcx = dst_ptr + 16 * dst_pitch (loop end)
    movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line

    pxor        xmm0,       xmm0

    ; load the first row; it is the "previous row" of the first iteration
    movdqu      xmm7,       [rsi]

    add         rsi,        rax                 ; next line
.next_row_spo:
    movdqu      xmm3,       [rsi]               ; current row, pixels 0..15

    movdqa      xmm5,       xmm7
    movdqa      xmm6,       xmm7

    movdqa      xmm4,       xmm3                ; make a copy of current line
    movdqa      xmm7,       xmm3                ; becomes the previous row next time

    punpcklbw   xmm5,       xmm0
    punpckhbw   xmm6,       xmm0
    punpcklbw   xmm3,       xmm0
    punpckhbw   xmm4,       xmm0

    pmullw      xmm5,       xmm1
    pmullw      xmm6,       xmm1
    pmullw      xmm3,       xmm2
    pmullw      xmm4,       xmm2

    paddw       xmm3,       xmm5
    paddw       xmm4,       xmm6

    paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
    psraw       xmm3,       VP8_FILTER_SHIFT    ; xmm3 /= 128

    paddw       xmm4,       [GLOBAL(rd)]
    psraw       xmm4,       VP8_FILTER_SHIFT

    packuswb    xmm3,       xmm4
    movdqa      [rdi],      xmm3                ; store the results in the destination

    add         rsi,        rax                 ; next line
    add         rdi,        rdx                 ;dst_pitch
    cmp         rdi,        rcx
    jne         .next_row_spo

    jmp         .done

.b16x16_fp_only:
    ; rdi, rsi, rdx (dst_pitch), xmm1 and xmm2 were set up before the branch here
    lea         rcx,        [rdi+rdx*8]
    lea         rcx,        [rcx+rdx*8]         ; rcx = dst_ptr + 16 * dst_pitch (loop end)
    movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line
    pxor        xmm0,       xmm0

.next_row_fpo:
    VP8_BILINEAR_H_16

    packuswb    xmm3,       xmm4
    movdqa      [rdi],      xmm3                ; store the results in the destination

    add         rsi,        rax                 ; next line
    add         rdi,        rdx                 ; dst_pitch
    cmp         rdi,        rcx
    jne         .next_row_fpo

.done:
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_bilinear_predict8x8_sse2
;(
;    unsigned char  *src_ptr,
;    int   src_pixels_per_line,
;    int   xoffset,
;    int   yoffset,
;    unsigned char *dst_ptr,
;    int dst_pitch
;)
; Bilinear prediction of an 8x8 block (8 rows of 8 bytes are written). Both
; passes always run; there is no shortcut for xoffset == 0 or yoffset == 0.
; Nine source rows are first copied to a 16-byte aligned, 144-byte stack
; buffer. Each row copy reads 16 bytes although only bytes 0..8 are used.
global sym(vp8_bilinear_predict8x8_sse2) PRIVATE
sym(vp8_bilinear_predict8x8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp,        144                 ; reserve 144 bytes (9 rows * 16)

    ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]
    ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]
    lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]

    mov         rsi,        arg(0) ;src_ptr
    movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line

    ;Read 9-line unaligned data in and put them on stack. This gives a big
    ;performance boost.
    movdqu      xmm0,       [rsi]               ; row 0
    lea         rax,        [rdx + rdx*2]       ; rax = 3 * pitch
    movdqu      xmm1,       [rsi+rdx]           ; row 1
    movdqu      xmm2,       [rsi+rdx*2]         ; row 2
    add         rsi,        rax
    movdqu      xmm3,       [rsi]               ; row 3
    movdqu      xmm4,       [rsi+rdx]           ; row 4
    movdqu      xmm5,       [rsi+rdx*2]         ; row 5
    add         rsi,        rax
    movdqu      xmm6,       [rsi]               ; row 6
    movdqu      xmm7,       [rsi+rdx]           ; row 7

    movdqa      XMMWORD PTR [rsp],            xmm0

    movdqu      xmm0,       [rsi+rdx*2]         ; row 8 (xmm0 was just spilled)

    movdqa      XMMWORD PTR [rsp+16],         xmm1
    movdqa      XMMWORD PTR [rsp+32],         xmm2
    movdqa      XMMWORD PTR [rsp+48],         xmm3
    movdqa      XMMWORD PTR [rsp+64],         xmm4
    movdqa      XMMWORD PTR [rsp+80],         xmm5
    movdqa      XMMWORD PTR [rsp+96],         xmm6
    movdqa      XMMWORD PTR [rsp+112],        xmm7
    movdqa      XMMWORD PTR [rsp+128],        xmm0

    movsxd      rax,        dword ptr arg(2) ;xoffset
    shl         rax,        5
    add         rax,        rcx                 ;HFilter

    mov         rdi,        arg(4) ;dst_ptr
    movsxd      rdx,        dword ptr arg(5) ;dst_pitch

    movdqa      xmm1,       [rax]               ; horizontal tap 0
    movdqa      xmm2,       [rax+16]            ; horizontal tap 1

    movsxd      rax,        dword ptr arg(3) ;yoffset
    shl         rax,        5
    add         rax,        rcx                 ;VFilter

    lea         rcx,        [rdi+rdx*8]         ; rcx = dst_ptr + 8 * dst_pitch (loop end)

    movdqa      xmm5,       [rax]               ; vertical tap 0
    movdqa      xmm6,       [rax+16]            ; vertical tap 1

    pxor        xmm0,       xmm0

    ; rsi walks the buffered rows; rsp stays fixed until the buffer is released
    mov         rsi,        rsp

    ; get the first horizontal line done
    movdqa      xmm3,       XMMWORD PTR [rsi]
    movdqa      xmm4,       xmm3                ; make a copy of current line
    psrldq      xmm4,       1

    punpcklbw   xmm3,       xmm0                ; 00 01 02 03 04 05 06 07
    punpcklbw   xmm4,       xmm0                ; 01 02 03 04 05 06 07 08

    pmullw      xmm3,       xmm1
    pmullw      xmm4,       xmm2

    paddw       xmm3,       xmm4

    paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
    psraw       xmm3,       VP8_FILTER_SHIFT    ; xmm3 /= 128

    movdqa      xmm7,       xmm3                ; xmm7 = previous filtered row (words)
    add         rsi,        16                  ; next line
.next_row8x8:
    movdqa      xmm3,       XMMWORD PTR [rsi]   ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
    movdqa      xmm4,       xmm3                ; make a copy of current line
    psrldq      xmm4,       1

    punpcklbw   xmm3,       xmm0                ; 00 01 02 03 04 05 06 07
    punpcklbw   xmm4,       xmm0                ; 01 02 03 04 05 06 07 08

    pmullw      xmm3,       xmm1
    pmullw      xmm4,       xmm2

    paddw       xmm3,       xmm4
    pmullw      xmm7,       xmm5                ; previous row * vertical tap 0

    paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
    psraw       xmm3,       VP8_FILTER_SHIFT    ; xmm3 /= 128

    movdqa      xmm4,       xmm3                ; keep the current filtered row

    pmullw      xmm3,       xmm6                ; current row * vertical tap 1
    paddw       xmm3,       xmm7

    movdqa      xmm7,       xmm4                ; becomes the previous row next time

    paddw       xmm3,       [GLOBAL(rd)]        ; xmm3 += round value
    psraw       xmm3,       VP8_FILTER_SHIFT    ; xmm3 /= 128

    packuswb    xmm3,       xmm0
    movq        [rdi],      xmm3                ; store the results in the destination

    add         rsi,        16                  ; next line
    add         rdi,        rdx

    cmp         rdi,        rcx
    jne         .next_row8x8

    add         rsp,        144                 ; release the row buffer
    pop         rsp                             ; undo ALIGN_STACK
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


SECTION_RODATA
align 16
; Rounding constant added before every ">> VP8_FILTER_SHIFT": 8 words of 0x40.
rd:
    times 8 dw 0x40