;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA
pw_64:          times 8 dw 64           ; rounding term added before ">> 7"
even_byte_mask: times 8 dw 0x00ff       ; keeps the even byte of every word

; %define USE_PMULHRSW
; NOTE: pmulhrsw has a latency of 5 cycles. Tests showed a performance loss
; when using this instruction.
;
; The add order below (based on ffav1) must be followed to prevent outranges.
; x = k0k1 + k4k5
; y = k2k3 + k6k7
; z = signed SAT(x + y)

SECTION .text
%define LOCAL_VARS_SIZE 16*6

;-------------------------------------------------------------------------------
; SETUP_LOCAL_VARS
;
; In:   m4 = 16 bytes loaded by the caller from [filterq]; treated as eight
;            16-bit taps (they are packed to signed bytes below).
; Out:  k0k1, k2k3, k4k5, k6k7 = stack slots, each holding one adjacent pair
;            of byte taps replicated across all 16 bytes, i.e. ready to be
;            used as the second operand of pmaddubsw.
;       krd  = eight words of 64 (m12 on x86-64, a stack slot on x86-32).
;       tmp0, tmp1 = two extra scratch stack slots (x86-64 only).
; Clobbers: m0-m4, and m12 (x86-64) or m6 (x86-32).
; The enclosing cglobal must reserve LOCAL_VARS_SIZE bytes of stack.
;-------------------------------------------------------------------------------
%macro SETUP_LOCAL_VARS 0
    ; TODO(slavarnway): using xmm registers for these on ARCH_X86_64 +
    ; pmaddubsw has a higher latency on some platforms, this might be eased by
    ; interleaving the instructions.
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    packsswb    m4, m4                  ; words -> signed bytes: k0..k7,k0..k7
    ; TODO(slavarnway): multiple pshufb instructions had a higher latency on
    ; some platforms.
    pshuflw     m0, m4, 0b              ;k0_k1
    pshuflw     m1, m4, 01010101b       ;k2_k3
    pshuflw     m2, m4, 10101010b       ;k4_k5
    pshuflw     m3, m4, 11111111b       ;k6_k7
    punpcklqdq  m0, m0                  ; copy the low 8 bytes to the high 8
    punpcklqdq  m1, m1
    punpcklqdq  m2, m2
    punpcklqdq  m3, m3
    mova        k0k1, m0
    mova        k2k3, m1
    mova        k4k5, m2
    mova        k6k7, m3
%if ARCH_X86_64
    %define krd  m12
    %define tmp0 [rsp + 16*4]
    %define tmp1 [rsp + 16*5]
    mova        krd, [GLOBAL(pw_64)]
%else
    %define krd  [rsp + 16*4]
%if CONFIG_PIC=0
    mova        m6, [GLOBAL(pw_64)]
%else
    ; build constants without accessing global memory
    pcmpeqb     m6, m6                  ;all ones
    psrlw       m6, 15
    psllw       m6, 6                   ;aka pw_64
%endif
    mova        krd, m6
%endif
%endm

;-------------------------------------------------------------------------------
%if ARCH_X86_64
    %define LOCAL_VARS_SIZE_H4 0
%else
    %define LOCAL_VARS_SIZE_H4 16*4
%endif

;-------------------------------------------------------------------------------
; SUBPIX_HFILTER4 variant
;
; Emits filter_block1d4_<variant>(src, sstride, dst, dstride, height, filter):
; an 8-tap horizontal filter producing 4 output bytes per row. Each output is
; SAT(sum of taps * src[x - 3 .. x + 4] + 64) >> 7, packed with unsigned
; saturation.
;
; %1 = h8          plain filter
;      h8_avg      result is averaged (pavgb) with the bytes already in dst
;      h8_add_src  the source pixel src[x] is added before packing
;
; Two rows are produced per loop iteration; a trailing odd row is handled
; after the loop. heightd is pre-decremented so that after "sub heightd, 2"
; the flags tell the three cases apart: > 0 another pair, == 0 exactly one
; row left, < 0 finished.
; NOTE(review): the loop body always runs once, so height == 1 would still
; write two rows - presumably callers never pass that; confirm.
;-------------------------------------------------------------------------------
%macro SUBPIX_HFILTER4 1
cglobal filter_block1d4_%1, 6, 6, 11, LOCAL_VARS_SIZE_H4, \
                            src, sstride, dst, dstride, height, filter
    mova        m4, [filterq]
    packsswb    m4, m4
    ; For a 4-wide row only the low four words of a tap-pair product are
    ; needed, so two tap pairs share one register: low half / high half.
%if ARCH_X86_64
    %define k0k1k4k5 m8
    %define k2k3k6k7 m9
    %define krd      m10
    mova        krd, [GLOBAL(pw_64)]
    pshuflw     k0k1k4k5, m4, 0b                ;k0_k1
    pshufhw     k0k1k4k5, k0k1k4k5, 10101010b   ;k0_k1_k4_k5
    pshuflw     k2k3k6k7, m4, 01010101b         ;k2_k3
    pshufhw     k2k3k6k7, k2k3k6k7, 11111111b   ;k2_k3_k6_k7
%else
    %define k0k1k4k5 [rsp + 16*0]
    %define k2k3k6k7 [rsp + 16*1]
    %define krd      [rsp + 16*2]
    pshuflw     m6, m4, 0b                      ;k0_k1
    pshufhw     m6, m6, 10101010b               ;k0_k1_k4_k5
    pshuflw     m7, m4, 01010101b               ;k2_k3
    pshufhw     m7, m7, 11111111b               ;k2_k3_k6_k7
%if CONFIG_PIC=0
    mova        m1, [GLOBAL(pw_64)]
%else
    ; build constants without accessing global memory
    pcmpeqb     m1, m1                          ;all ones
    psrlw       m1, 15
    psllw       m1, 6                           ;aka pw_64
%endif
    mova        k0k1k4k5, m6
    mova        k2k3k6k7, m7
    mova        krd, m1
%endif
    dec         heightd

.loop:
    ;Do two rows at once
    movu        m4, [srcq - 3]
    movu        m5, [srcq + sstrideq - 3]
    ; Duplicate every byte, then shift by an odd byte count so that each word
    ; holds two neighbouring source pixels.
    punpckhbw   m1, m4, m4
    punpcklbw   m4, m4
    punpckhbw   m3, m5, m5
    punpcklbw   m5, m5
    palignr     m0, m1, m4, 1
    pmaddubsw   m0, k0k1k4k5
    palignr     m1, m4, 5
    pmaddubsw   m1, k2k3k6k7
    palignr     m2, m3, m5, 1
    pmaddubsw   m2, k0k1k4k5
    palignr     m3, m5, 5
    pmaddubsw   m3, k2k3k6k7
    ; Gather both rows: low qword = row 0, high qword = row 1.
    punpckhqdq  m4, m0, m2                      ; k4k5 terms
    punpcklqdq  m0, m2                          ; k0k1 terms
    punpckhqdq  m5, m1, m3                      ; k6k7 terms
    punpcklqdq  m1, m3                          ; k2k3 terms
    paddsw      m0, m4                          ; x = k0k1 + k4k5
    paddsw      m1, m5                          ; y = k2k3 + k6k7
%ifidn %1, h8_avg
    movd        m4, [dstq]
    movd        m5, [dstq + dstrideq]
%endif
    paddsw      m0, m1
    paddsw      m0, krd
    psraw       m0, 7
%ifidn %1, h8_add_src
    pxor        m3, m3
    movu        m4, [srcq]
    movu        m5, [srcq + sstrideq]
    punpckldq   m4, m5 ; Bytes 0,1,2,3 from row 0, then 0,1,2,3 from row 1
    punpcklbw   m4, m3
    paddsw      m0, m4
%endif
    packuswb    m0, m0
    psrldq      m1, m0, 4                       ; row 1 bytes -> low dword

%ifidn %1, h8_avg
    pavgb       m0, m4
    pavgb       m1, m5
%endif
    movd        [dstq], m0
    movd        [dstq + dstrideq], m1

    lea         srcq, [srcq + sstrideq]
    prefetcht0  [srcq + 4 * sstrideq - 3]
    lea         srcq, [srcq + sstrideq]
    lea         dstq, [dstq + 2 * dstrideq]
    prefetcht0  [srcq + 2 * sstrideq - 3]

    sub         heightd, 2
    jg          .loop

    ; Do last row if output_height is odd
    jne         .done

    movu        m4, [srcq - 3]
    punpckhbw   m1, m4, m4
    punpcklbw   m4, m4
    palignr     m0, m1, m4, 1
    palignr     m1, m4, 5
    pmaddubsw   m0, k0k1k4k5
    pmaddubsw   m1, k2k3k6k7
    psrldq      m2, m0, 8                       ; k4k5 terms
    psrldq      m3, m1, 8                       ; k6k7 terms
    paddsw      m0, m2
    paddsw      m1, m3
    paddsw      m0, m1
    paddsw      m0, krd
    psraw       m0, 7
%ifidn %1, h8_add_src
    pxor        m3, m3
    movu        m4, [srcq]
    punpcklbw   m4, m3
    paddsw      m0, m4
%endif
    packuswb    m0, m0
%ifidn %1, h8_avg
    movd        m4, [dstq]
    pavgb       m0, m4
%endif
    movd        [dstq], m0
.done:
    REP_RET
%endm

;-------------------------------------------------------------------------------
; SUBPIX_HFILTER8 variant
;
; Emits filter_block1d8_<variant>(src, sstride, dst, dstride, height, filter):
; the 8-tap horizontal filter for rows of 8 output bytes. %1 takes the same
; values as in SUBPIX_HFILTER4 (h8, h8_avg, h8_add_src).
;
; Two rows per iteration: row 0 ends up in the low half and row 1 in the high
; half of m1, stored with movh / movhps. The height handling (pre-decrement,
; odd trailing row) is the same as in SUBPIX_HFILTER4.
;-------------------------------------------------------------------------------
%macro SUBPIX_HFILTER8 1
cglobal filter_block1d8_%1, 6, 6, 14, LOCAL_VARS_SIZE, \
                            src, sstride, dst, dstride, height, filter
    mova        m4, [filterq]
    SETUP_LOCAL_VARS
    dec         heightd

.loop:
    ;Do two rows at once
    movu        m0, [srcq - 3]
    movu        m4, [srcq + sstrideq - 3]
    punpckhbw   m1, m0, m0
    punpcklbw   m0, m0
    palignr     m5, m1, m0, 13
    pmaddubsw   m5, k6k7
    palignr     m2, m1, m0, 5
    palignr     m3, m1, m0, 9
    palignr     m1, m0, 1
    pmaddubsw   m1, k0k1
    punpckhbw   m6, m4, m4
    punpcklbw   m4, m4
    pmaddubsw   m2, k2k3
    pmaddubsw   m3, k4k5

    palignr     m7, m6, m4, 13
    palignr     m0, m6, m4, 5
    pmaddubsw   m7, k6k7
    paddsw      m1, m3                          ; row 0: k0k1 + k4k5
    paddsw      m2, m5                          ; row 0: k2k3 + k6k7
    paddsw      m1, m2
%ifidn %1, h8_avg
    movh        m2, [dstq]
    movhps      m2, [dstq + dstrideq]
%endif
    palignr     m5, m6, m4, 9
    palignr     m6, m4, 1
    pmaddubsw   m0, k2k3
    pmaddubsw   m6, k0k1
    paddsw      m1, krd
    pmaddubsw   m5, k4k5
    psraw       m1, 7
    paddsw      m0, m7                          ; row 1: k2k3 + k6k7
    paddsw      m6, m5                          ; row 1: k0k1 + k4k5
    paddsw      m6, m0
    paddsw      m6, krd
    psraw       m6, 7
%ifidn %1, h8_add_src
    pxor        m3, m3
    movu        m4, [srcq]
    movu        m5, [srcq + sstrideq]
    punpcklbw   m4, m3
    punpcklbw   m5, m3
    paddsw      m1, m4
    paddsw      m6, m5
%endif
    packuswb    m1, m6
%ifidn %1, h8_avg
    pavgb       m1, m2
%endif
    movh        [dstq], m1
    movhps      [dstq + dstrideq], m1

    lea         srcq, [srcq + sstrideq]
    prefetcht0  [srcq + 4 * sstrideq - 3]
    lea         srcq, [srcq + sstrideq]
    lea         dstq, [dstq + 2 * dstrideq]
    prefetcht0  [srcq + 2 * sstrideq - 3]
    sub         heightd, 2
    jg          .loop

    ; Do last row if output_height is odd
    jne         .done

    movu        m0, [srcq - 3]
    punpckhbw   m3, m0, m0
    punpcklbw   m0, m0
    palignr     m1, m3, m0, 1
    palignr     m2, m3, m0, 5
    palignr     m4, m3, m0, 13
    palignr     m3, m0, 9
    pmaddubsw   m1, k0k1
    pmaddubsw   m2, k2k3
    pmaddubsw   m3, k4k5
    pmaddubsw   m4, k6k7
    paddsw      m1, m3
    paddsw      m4, m2
    paddsw      m1, m4
    paddsw      m1, krd
    psraw       m1, 7
%ifidn %1, h8_add_src
    pxor        m6, m6
    movu        m5, [srcq]
    punpcklbw   m5, m6
    paddsw      m1, m5
%endif
    packuswb    m1, m1
%ifidn %1, h8_avg
    movh        m0, [dstq]
    pavgb       m1, m0
%endif
    movh        [dstq], m1
.done:
    REP_RET
%endm

;-------------------------------------------------------------------------------
; SUBPIX_HFILTER16 variant
;
; Emits filter_block1d16_<variant>(src, sstride, dst, dstride, height, filter):
; the 8-tap horizontal filter for rows of 16 output bytes, one row per loop
; iteration. %1 takes the same values as in SUBPIX_HFILTER4.
;
; Loads at srcq - 3, - 1, + 1, + 3 give the tap pairs for the even output
; pixels (m0); loads at srcq - 2, 0, + 2, + 4 give the odd ones (m4). The two
; halves are interleaved again with punpcklbw before the store.
; NOTE(review): dst is accessed with mova / a pavgb memory operand, so it is
; presumably required to be 16-byte aligned - confirm against callers.
;-------------------------------------------------------------------------------
%macro SUBPIX_HFILTER16 1
cglobal filter_block1d16_%1, 6, 6, 14, LOCAL_VARS_SIZE, \
                             src, sstride, dst, dstride, height, filter
    mova        m4, [filterq]
    SETUP_LOCAL_VARS

.loop:
    prefetcht0  [srcq + 2 * sstrideq -3]

    movu        m0, [srcq - 3]
    movu        m4, [srcq - 2]
    pmaddubsw   m0, k0k1
    pmaddubsw   m4, k0k1
    movu        m1, [srcq - 1]
    movu        m5, [srcq + 0]
    pmaddubsw   m1, k2k3
    pmaddubsw   m5, k2k3
    movu        m2, [srcq + 1]
    movu        m6, [srcq + 2]
    pmaddubsw   m2, k4k5
    pmaddubsw   m6, k4k5
    movu        m3, [srcq + 3]
    movu        m7, [srcq + 4]
    pmaddubsw   m3, k6k7
    pmaddubsw   m7, k6k7
    paddsw      m0, m2
    paddsw      m1, m3
    paddsw      m0, m1
    paddsw      m4, m6
    paddsw      m5, m7
    paddsw      m4, m5
    paddsw      m0, krd
    paddsw      m4, krd
    psraw       m0, 7
    psraw       m4, 7
%ifidn %1, h8_add_src
%if ARCH_X86=1 && CONFIG_PIC=1
    pcmpeqb     m2, m2                  ;all ones
    psrlw       m2, 8                   ;even_byte_mask
%else
    mova        m2, [GLOBAL(even_byte_mask)]
%endif
    movu        m5, [srcq]
    mova        m7, m5
    pand        m5, m2                  ; even source pixels as words
    psrlw       m7, 8                   ; odd source pixels as words
    paddsw      m0, m5
    paddsw      m4, m7
%endif
    packuswb    m0, m0
    packuswb    m4, m4
    punpcklbw   m0, m4                  ; re-interleave even/odd pixels
%ifidn %1, h8_avg
    pavgb       m0, [dstq]
%endif
    lea         srcq, [srcq + sstrideq]
    mova        [dstq], m0
    lea         dstq, [dstq + dstrideq]
    dec         heightd
    jnz         .loop
    REP_RET
%endm

INIT_XMM ssse3
SUBPIX_HFILTER16 h8
SUBPIX_HFILTER8  h8
SUBPIX_HFILTER4  h8

;-------------------------------------------------------------------------------

; TODO(Linfeng): Detect cpu type and choose the code with better performance.
; When non-zero, x86-64 builds also use the vertical filter variant that
; reloads its source rows on every iteration (the variant selected by
; "%if ARCH_X86 || ..." below). That variant needs two extra general-purpose
; registers on x86-64 (r7, r8), hence NUM_GENERAL_REG_USED = 9.
%define X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON 1

%if ARCH_X86_64 && X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
    %define NUM_GENERAL_REG_USED 9
%else
    %define NUM_GENERAL_REG_USED 6
%endif

;-------------------------------------------------------------------------------
; SUBPIX_VFILTER variant, width
;
; Emits filter_block1d<width>_<variant>(src, sstride, dst, dstride, height,
; filter): an 8-tap vertical filter over source rows A..H (srcq + 0..7 *
; sstride), producing <width> output bytes per row. Each output is
; SAT(sum of taps * rows + 64) >> 7, packed with unsigned saturation.
;
; %1 = v8          plain filter
;      v8_avg      result is averaged (pavgb) with the bytes already in dst
;      v8_add_src  a source row is added before packing (first code path only)
; %2 = 8 or 4      output width; selects movh or movd for loads/stores (movx)
;
; Two rows are produced per loop iteration and a trailing odd row is handled
; after the loop; heightd is pre-decremented so that after "sub heightd, 2"
; the flags give: > 0 another pair, == 0 exactly one row left, < 0 finished.
;
; NOTE(review): the v8_add_src branches here add rows 0 and 1 ([srcq] and
; [src1q]), whereas SUBPIX_VFILTER16 adds the row 3 rows down
; ([src1q + 2 * sstrideq]); the second (x86-64 only) code path has no add_src
; handling at all. Only "v8" is instantiated below, so this is currently
; dead code - confirm the intended row before instantiating v8_add_src.
;-------------------------------------------------------------------------------
%macro SUBPIX_VFILTER 2
cglobal filter_block1d%2_%1, 6, NUM_GENERAL_REG_USED, 15, LOCAL_VARS_SIZE, \
                             src, sstride, dst, dstride, height, filter
    mova        m4, [filterq]
    SETUP_LOCAL_VARS

%ifidn %2, 8
    %define movx movh
%else
    %define movx movd
%endif

    dec         heightd

%if ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON

%if ARCH_X86_64
    %define src1q      r7
    %define sstride6q  r8
    %define dst_stride dstrideq
%else
    ; No spare registers: filterq is free once the taps are on the stack, and
    ; dstrideq is reused for sstride * 6, so the dst stride is taken from
    ; dstridemp instead.
    %define src1q      filterq
    %define sstride6q  dstrideq
    %define dst_stride dstridemp
%endif
    mov         src1q, srcq
    add         src1q, sstrideq                 ; src1q = srcq + pitch
    lea         sstride6q, [sstrideq + sstrideq * 4]
    add         sstride6q, sstrideq             ;pitch * 6

.loop:
    ;Do two rows at once
    movx        m0, [srcq]                      ;A
    movx        m1, [src1q]                     ;B
    punpcklbw   m0, m1                          ;A B
    movx        m2, [srcq + sstrideq * 2]       ;C
    pmaddubsw   m0, k0k1
    mova        m6, m2
    movx        m3, [src1q + sstrideq * 2]      ;D
    punpcklbw   m2, m3                          ;C D
    pmaddubsw   m2, k2k3
    movx        m4, [srcq + sstrideq * 4]       ;E
    mova        m7, m4
    movx        m5, [src1q + sstrideq * 4]      ;F
    punpcklbw   m4, m5                          ;E F
    pmaddubsw   m4, k4k5
    punpcklbw   m1, m6                          ;A B next iter
    movx        m6, [srcq + sstride6q]          ;G
    punpcklbw   m5, m6                          ;E F next iter
    punpcklbw   m3, m7                          ;C D next iter
    pmaddubsw   m5, k4k5
    movx        m7, [src1q + sstride6q]         ;H
    punpcklbw   m6, m7                          ;G H
    pmaddubsw   m6, k6k7
    pmaddubsw   m3, k2k3
    pmaddubsw   m1, k0k1
    paddsw      m0, m4
    paddsw      m2, m6
    movx        m6, [srcq + sstrideq * 8]       ;H next iter
    punpcklbw   m7, m6
    pmaddubsw   m7, k6k7
    paddsw      m0, m2
    paddsw      m0, krd
    psraw       m0, 7
    paddsw      m1, m5
%ifidn %1, v8_add_src
    pxor        m6, m6
    movu        m4, [srcq]
    punpcklbw   m4, m6
    paddsw      m0, m4
%endif
    packuswb    m0, m0

    paddsw      m3, m7
    paddsw      m1, m3
    paddsw      m1, krd
    psraw       m1, 7
%ifidn %1, v8_add_src
    movu        m4, [src1q]
    punpcklbw   m4, m6
    paddsw      m1, m4
%endif
    lea         srcq, [srcq + sstrideq * 2]
    lea         src1q, [src1q + sstrideq * 2]
    packuswb    m1, m1

%ifidn %1, v8_avg
    movx        m2, [dstq]
    pavgb       m0, m2
%endif
    movx        [dstq], m0
    add         dstq, dst_stride
%ifidn %1, v8_avg
    movx        m3, [dstq]
    pavgb       m1, m3
%endif
    movx        [dstq], m1
    add         dstq, dst_stride
    sub         heightd, 2
    jg          .loop

    ; Do last row if output_height is odd
    jne         .done

    movx        m0, [srcq]                      ;A
    movx        m1, [srcq + sstrideq]           ;B
    movx        m6, [srcq + sstride6q]          ;G
    punpcklbw   m0, m1                          ;A B
    movx        m7, [src1q + sstride6q]         ;H
    pmaddubsw   m0, k0k1
    movx        m2, [srcq + sstrideq * 2]       ;C
    punpcklbw   m6, m7                          ;G H
    movx        m3, [src1q + sstrideq * 2]      ;D
    pmaddubsw   m6, k6k7
    movx        m4, [srcq + sstrideq * 4]       ;E
    punpcklbw   m2, m3                          ;C D
    movx        m5, [src1q + sstrideq * 4]      ;F
    punpcklbw   m4, m5                          ;E F
    pmaddubsw   m2, k2k3
    pmaddubsw   m4, k4k5
    paddsw      m2, m6
    paddsw      m0, m4
    paddsw      m0, m2
    paddsw      m0, krd
    psraw       m0, 7
%ifidn %1, v8_add_src
    pxor        m6, m6
    movu        m4, [srcq]
    punpcklbw   m4, m6
    paddsw      m0, m4
%endif
    packuswb    m0, m0
%ifidn %1, v8_avg
    movx        m1, [dstq]
    pavgb       m0, m1
%endif
    movx        [dstq], m0

%else
    ; ARCH_X86_64
    ; Keeps the interleaved row pairs live in registers across iterations, so
    ; each iteration only loads the two new rows.

    movx        m0, [srcq]                      ;A
    movx        m1, [srcq + sstrideq]           ;B
    lea         srcq, [srcq + sstrideq * 2]
    movx        m2, [srcq]                      ;C
    movx        m3, [srcq + sstrideq]           ;D
    lea         srcq, [srcq + sstrideq * 2]
    movx        m4, [srcq]                      ;E
    movx        m5, [srcq + sstrideq]           ;F
    lea         srcq, [srcq + sstrideq * 2]
    movx        m6, [srcq]                      ;G
    punpcklbw   m0, m1                          ;A B
    punpcklbw   m1, m2                          ;A B next iter
    punpcklbw   m2, m3                          ;C D
    punpcklbw   m3, m4                          ;C D next iter
    punpcklbw   m4, m5                          ;E F
    punpcklbw   m5, m6                          ;E F next iter

.loop:
    ;Do two rows at once
    movx        m7, [srcq + sstrideq]           ;H
    lea         srcq, [srcq + sstrideq * 2]
    movx        m14, [srcq]                     ;H next iter
    punpcklbw   m6, m7                          ;G H
    punpcklbw   m7, m14                         ;G H next iter
    pmaddubsw   m8, m0, k0k1
    pmaddubsw   m9, m1, k0k1
    mova        m0, m2                          ; slide the row window down
    mova        m1, m3
    pmaddubsw   m10, m2, k2k3
    pmaddubsw   m11, m3, k2k3
    mova        m2, m4
    mova        m3, m5
    pmaddubsw   m4, k4k5
    pmaddubsw   m5, k4k5
    paddsw      m8, m4
    paddsw      m9, m5
    mova        m4, m6
    mova        m5, m7
    pmaddubsw   m6, k6k7
    pmaddubsw   m7, k6k7
    paddsw      m10, m6
    paddsw      m11, m7
    paddsw      m8, m10
    paddsw      m9, m11
    mova        m6, m14
    paddsw      m8, krd
    paddsw      m9, krd
    psraw       m8, 7
    psraw       m9, 7
%ifidn %2, 4
    packuswb    m8, m8
    packuswb    m9, m9
%else
    packuswb    m8, m9                          ; row 0 low half, row 1 high
%endif

%ifidn %1, v8_avg
    movx        m7, [dstq]
%ifidn %2, 4
    movx        m10, [dstq + dstrideq]
    pavgb       m9, m10
%else
    movhpd      m7, [dstq + dstrideq]
%endif
    pavgb       m8, m7
%endif
    movx        [dstq], m8
%ifidn %2, 4
    movx        [dstq + dstrideq], m9
%else
    movhpd      [dstq + dstrideq], m8
%endif

    lea         dstq, [dstq + dstrideq * 2]
    sub         heightd, 2
    jg          .loop

    ; Do last row if output_height is odd
    jne         .done

    movx        m7, [srcq + sstrideq]           ;H
    punpcklbw   m6, m7                          ;G H
    pmaddubsw   m0, k0k1
    pmaddubsw   m2, k2k3
    pmaddubsw   m4, k4k5
    pmaddubsw   m6, k6k7
    paddsw      m0, m4
    paddsw      m2, m6
    paddsw      m0, m2
    paddsw      m0, krd
    psraw       m0, 7
    packuswb    m0, m0
%ifidn %1, v8_avg
    movx        m1, [dstq]
    pavgb       m0, m1
%endif
    movx        [dstq], m0

%endif ; ARCH_X86_64

.done:
    REP_RET

%endm

;-------------------------------------------------------------------------------
; SUBPIX_VFILTER16 variant
;
; Emits filter_block1d16_<variant>(src, sstride, dst, dstride, height,
; filter): the 8-tap vertical filter for rows of 16 output bytes. %1 takes the
; same values as in SUBPIX_VFILTER (v8, v8_avg, v8_add_src; add_src is only
; handled by the first code path).
;
; First code path: one output row per iteration, computed as two 8-byte
; halves (columns 0-7 in m0, columns 8-15 in m3) from freshly loaded rows.
; Second code path (x86-64 only): two output rows per iteration with the
; interleaved row pairs kept in m0-m11 and tmp0/tmp1.
; NOTE(review): dst is accessed with mova / a pavgb memory operand, so it is
; presumably required to be 16-byte aligned - confirm against callers.
;-------------------------------------------------------------------------------
%macro SUBPIX_VFILTER16 1
cglobal filter_block1d16_%1, 6, NUM_GENERAL_REG_USED, 16, LOCAL_VARS_SIZE, \
                             src, sstride, dst, dstride, height, filter
    mova        m4, [filterq]
    SETUP_LOCAL_VARS

%if ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON

%if ARCH_X86_64
    %define src1q      r7
    %define sstride6q  r8
    %define dst_stride dstrideq
%else
    ; Same register reuse as in SUBPIX_VFILTER: filterq and dstrideq are
    ; recycled, and the dst stride is taken from dstridemp.
    %define src1q      filterq
    %define sstride6q  dstrideq
    %define dst_stride dstridemp
%endif
    lea         src1q, [srcq + sstrideq]
    lea         sstride6q, [sstrideq + sstrideq * 4]
    add         sstride6q, sstrideq             ;pitch * 6

.loop:
    movh        m0, [srcq]                      ;A
    movh        m1, [src1q]                     ;B
    movh        m2, [srcq + sstrideq * 2]       ;C
    movh        m3, [src1q + sstrideq * 2]      ;D
    movh        m4, [srcq + sstrideq * 4]       ;E
    movh        m5, [src1q + sstrideq * 4]      ;F

    punpcklbw   m0, m1                          ;A B
    movh        m6, [srcq + sstride6q]          ;G
    punpcklbw   m2, m3                          ;C D
    movh        m7, [src1q + sstride6q]         ;H
    punpcklbw   m4, m5                          ;E F
    pmaddubsw   m0, k0k1
    movh        m3, [srcq + 8]                  ;A
    pmaddubsw   m2, k2k3
    punpcklbw   m6, m7                          ;G H
    movh        m5, [srcq + sstrideq + 8]       ;B
    pmaddubsw   m4, k4k5
    punpcklbw   m3, m5                          ;A B
    movh        m7, [srcq + sstrideq * 2 + 8]   ;C
    pmaddubsw   m6, k6k7
    movh        m5, [src1q + sstrideq * 2 + 8]  ;D
    punpcklbw   m7, m5                          ;C D
    paddsw      m2, m6
    pmaddubsw   m3, k0k1
    movh        m1, [srcq + sstrideq * 4 + 8]   ;E
    paddsw      m0, m4
    pmaddubsw   m7, k2k3
    movh        m6, [src1q + sstrideq * 4 + 8]  ;F
    punpcklbw   m1, m6                          ;E F
    paddsw      m0, m2
    paddsw      m0, krd
    movh        m2, [srcq + sstride6q + 8]      ;G
    pmaddubsw   m1, k4k5
    movh        m5, [src1q + sstride6q + 8]     ;H
    psraw       m0, 7
    punpcklbw   m2, m5                          ;G H
    pmaddubsw   m2, k6k7
    paddsw      m7, m2
    paddsw      m3, m1
    paddsw      m3, m7
    paddsw      m3, krd
    psraw       m3, 7
%ifidn %1, v8_add_src
    pxor        m6, m6
    movu        m4, [src1q + 2 * sstrideq] ; Fetch from 3 rows down
    mova        m5, m4
    punpcklbw   m4, m6
    punpckhbw   m5, m6
    paddsw      m0, m4
    paddsw      m3, m5
%endif
    packuswb    m0, m3

    add         srcq, sstrideq
    add         src1q, sstrideq
%ifidn %1, v8_avg
    pavgb       m0, [dstq]
%endif
    mova        [dstq], m0
    add         dstq, dst_stride
    dec         heightd
    jnz         .loop
    REP_RET

%else
    ; ARCH_X86_64
    dec         heightd

    movu        m1, [srcq]                      ;A
    movu        m3, [srcq + sstrideq]           ;B
    lea         srcq, [srcq + sstrideq * 2]
    punpcklbw   m0, m1, m3                      ;A B
    punpckhbw   m1, m3                          ;A B
    movu        m5, [srcq]                      ;C
    punpcklbw   m2, m3, m5                      ;A B next iter
    punpckhbw   m3, m5                          ;A B next iter
    mova        tmp0, m2                        ;store to stack
    mova        tmp1, m3                        ;store to stack
    movu        m7, [srcq + sstrideq]           ;D
    lea         srcq, [srcq + sstrideq * 2]
    punpcklbw   m4, m5, m7                      ;C D
    punpckhbw   m5, m7                          ;C D
    movu        m9, [srcq]                      ;E
    punpcklbw   m6, m7, m9                      ;C D next iter
    punpckhbw   m7, m9                          ;C D next iter
    movu        m11, [srcq + sstrideq]          ;F
    lea         srcq, [srcq + sstrideq * 2]
    punpcklbw   m8, m9, m11                     ;E F
    punpckhbw   m9, m11                         ;E F
    movu        m2, [srcq]                      ;G
    punpcklbw   m10, m11, m2                    ;E F next iter
    punpckhbw   m11, m2                         ;E F next iter

.loop:
    ;Do two rows at once
    ; First row, columns 0-7
    pmaddubsw   m13, m0, k0k1
    mova        m0, m4
    pmaddubsw   m14, m8, k4k5
    pmaddubsw   m15, m4, k2k3
    mova        m4, m8
    paddsw      m13, m14
    movu        m3, [srcq + sstrideq]           ;H
    lea         srcq, [srcq + sstrideq * 2]
    punpcklbw   m14, m2, m3                     ;G H
    mova        m8, m14
    pmaddubsw   m14, k6k7
    paddsw      m15, m14
    paddsw      m13, m15
    paddsw      m13, krd
    psraw       m13, 7

    ; First row, columns 8-15
    pmaddubsw   m14, m1, k0k1
    pmaddubsw   m1, m9, k4k5
    pmaddubsw   m15, m5, k2k3
    paddsw      m14, m1
    mova        m1, m5
    mova        m5, m9
    punpckhbw   m2, m3                          ;G H
    mova        m9, m2
    pmaddubsw   m2, k6k7
    paddsw      m15, m2
    paddsw      m14, m15
    paddsw      m14, krd
    psraw       m14, 7
    packuswb    m13, m14
%ifidn %1, v8_avg
    pavgb       m13, [dstq]
%endif
    mova        [dstq], m13

    ; next iter
    ; Second row, columns 0-7
    pmaddubsw   m15, tmp0, k0k1
    pmaddubsw   m14, m10, k4k5
    pmaddubsw   m13, m6, k2k3
    paddsw      m15, m14
    mova        tmp0, m6
    mova        m6, m10
    movu        m2, [srcq]                      ;G next iter
    punpcklbw   m14, m3, m2                     ;G H next iter
    mova        m10, m14
    pmaddubsw   m14, k6k7
    paddsw      m13, m14
    paddsw      m15, m13
    paddsw      m15, krd
    psraw       m15, 7

    ; Second row, columns 8-15
    pmaddubsw   m14, tmp1, k0k1
    mova        tmp1, m7
    pmaddubsw   m13, m7, k2k3
    mova        m7, m11
    pmaddubsw   m11, k4k5
    paddsw      m14, m11
    punpckhbw   m3, m2                          ;G H next iter
    mova        m11, m3
    pmaddubsw   m3, k6k7
    paddsw      m13, m3
    paddsw      m14, m13
    paddsw      m14, krd
    psraw       m14, 7
    packuswb    m15, m14
%ifidn %1, v8_avg
    pavgb       m15, [dstq + dstrideq]
%endif
    mova        [dstq + dstrideq], m15
    lea         dstq, [dstq + dstrideq * 2]
    sub         heightd, 2
    jg          .loop

    ; Do last row if output_height is odd
    jne         .done

    movu        m3, [srcq + sstrideq]           ;H
    punpcklbw   m6, m2, m3                      ;G H
    punpckhbw   m2, m3                          ;G H
    pmaddubsw   m0, k0k1
    pmaddubsw   m1, k0k1
    pmaddubsw   m4, k2k3
    pmaddubsw   m5, k2k3
    pmaddubsw   m8, k4k5
    pmaddubsw   m9, k4k5
    pmaddubsw   m6, k6k7
    pmaddubsw   m2, k6k7
    paddsw      m0, m8
    paddsw      m1, m9
    paddsw      m4, m6
    paddsw      m5, m2
    paddsw      m0, m4
    paddsw      m1, m5
    paddsw      m0, krd
    paddsw      m1, krd
    psraw       m0, 7
    psraw       m1, 7
    packuswb    m0, m1
%ifidn %1, v8_avg
    pavgb       m0, [dstq]
%endif
    mova        [dstq], m0

.done:
    REP_RET

%endif ; ARCH_X86_64

%endm

INIT_XMM ssse3
SUBPIX_VFILTER16 v8
SUBPIX_VFILTER   v8, 8
SUBPIX_VFILTER   v8, 4