;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

; Syntax: NASM/yasm (Intel operand order).  Argument access, GOT handling and
; symbol decoration all go through the macros of x86_abi_support.asm
; (sym, arg, SHADOW_ARGS_TO_STACK, GET_GOT, GLOBAL, ...), so the same source
; serves both the 32-bit and 64-bit ABIs selected by ABI_IS_32BIT.
;
; NOTE(review): none of the routines below executes emms; presumably the
; caller is responsible for clearing the MMX state - confirm against callers.

%include "vpx_ports/x86_abi_support.asm"
extern sym(vp8_bilinear_filters_x86_8)


%define BLOCK_HEIGHT_WIDTH 4
%define vp8_filter_weight 128
%define VP8_FILTER_SHIFT  7


;-----------------------------------------------------------------------------
;void vp8_filter_block1d_h6_mmx
;(
;    unsigned char   *src_ptr,
;    unsigned short  *output_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned int    pixel_step,
;    unsigned int    output_height,
;    unsigned int    output_width,
;    short           *vp8_filter
;)
;
; Horizontal 6-tap filter, 4 output samples per row.
;
; For every row the six byte windows starting at src[-2] .. src[+3] are
; widened to words, multiplied by the tap words found at vp8_filter + 0, 16,
; 32, 48, 64 and 80 bytes, summed with signed saturation, rounded with rd,
; shifted right by VP8_FILTER_SHIFT and clamped to 0..255.  The four results
; are stored as 16-bit words (8 bytes) at output_ptr.
;
; Per row: src_ptr advances by src_pixels_per_line bytes and output_ptr by
; output_width BYTES (the value is added to the pointer unscaled).
; pixel_step (arg 3) is never read.
;
; Register roles inside the loop:
;   rsi = source row          rdi = destination row
;   rcx = rows remaining      rax = destination advance in bytes
;   rdx = vp8_filter          mm0 = zero
;   mm1/mm2/mm6/mm7 = taps at byte offsets 16/32/48/64
;-----------------------------------------------------------------------------
global sym(vp8_filter_block1d_h6_mmx) PRIVATE
sym(vp8_filter_block1d_h6_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        mov         rdx,    arg(6)              ; vp8_filter

        movq        mm1,    [rdx + 16]          ; tap 1 - the four inner taps are
        movq        mm2,    [rdx + 32]          ; tap 2   kept in registers; taps 0
        movq        mm6,    [rdx + 48]          ; tap 3   and 5 are read from memory
        movq        mm7,    [rdx + 64]          ; tap 4   inside the loop

        mov         rdi,    arg(1)              ; output_ptr
        mov         rsi,    arg(0)              ; src_ptr
        movsxd      rcx,    dword ptr arg(4)    ; output_height = row counter
        movsxd      rax,    dword ptr arg(5)    ; output_width, used as the destination
                                                ; advance per row (in bytes)
        pxor        mm0,    mm0                 ; mm0 = 0, used for byte->word unpacking

.nextrow:
        movq        mm3,    [rsi-2]             ; mm3 = p-2..p5
        movq        mm4,    mm3                 ; mm4 = p-2..p5
        psrlq       mm3,    8                   ; mm3 = p-1..p5
        punpcklbw   mm3,    mm0                 ; mm3 = p-1..p2 as words
        pmullw      mm3,    mm1                 ; mm3 *= tap 1

        movq        mm5,    mm4                 ; mm5 = p-2..p5
        punpckhbw   mm4,    mm0                 ; mm4 = p2..p5 as words
        pmullw      mm4,    mm7                 ; mm4 *= tap 4
        paddsw      mm3,    mm4                 ; mm3 += mm4

        movq        mm4,    mm5                 ; mm4 = p-2..p5
        psrlq       mm5,    16                  ; mm5 = p0..p5
        punpcklbw   mm5,    mm0                 ; mm5 = p0..p3 as words
        pmullw      mm5,    mm2                 ; mm5 *= tap 2
        paddsw      mm3,    mm5                 ; mm3 += mm5

        movq        mm5,    mm4                 ; mm5 = p-2..p5
        psrlq       mm4,    24                  ; mm4 = p1..p5
        punpcklbw   mm4,    mm0                 ; mm4 = p1..p4 as words
        pmullw      mm4,    mm6                 ; mm4 *= tap 3
        paddsw      mm3,    mm4                 ; mm3 += mm4

        ; outer taps, added last
        movd        mm4,    [rsi+3]             ; mm4 = p3..p6 (bytes)
        punpcklbw   mm4,    mm0                 ; mm4 = p3..p6 as words
        pmullw      mm4,    [rdx+80]            ; mm4 *= tap 5
        paddsw      mm3,    mm4                 ; mm3 += mm4

        punpcklbw   mm5,    mm0                 ; mm5 = p-2..p1 as words
        pmullw      mm5,    [rdx]               ; mm5 *= tap 0
        paddsw      mm3,    mm5                 ; mm3 += mm5

        paddsw      mm3,    [GLOBAL(rd)]        ; mm3 += rounding constant (64)
        psraw       mm3,    VP8_FILTER_SHIFT    ; mm3 >>= 7 (arithmetic)
        packuswb    mm3,    mm0                 ; pack to bytes and unpack again:
        punpcklbw   mm3,    mm0                 ; clamps every word to 0..255

        movq        [rdi],  mm3                 ; store 4 words

%if ABI_IS_32BIT
        add         rsi,    dword ptr arg(2)    ; src_ptr += src_pixels_per_line
        add         rdi,    rax                 ; output_ptr += output_width
%else
        movsxd      r8,     dword ptr arg(2)    ; src_pixels_per_line
        add         rdi,    rax                 ; output_ptr += output_width

        add         rsi,    r8                  ; src_ptr += src_pixels_per_line
%endif

        dec         rcx                         ; one row done
        jnz         .nextrow

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------------
;void vp8_filter_block1dc_v6_mmx
;(
;   short *src_ptr,
;   unsigned char *output_ptr,
;   int output_pitch,
;   unsigned int pixels_per_line,
;   unsigned int pixel_step,
;   unsigned int output_height,
;   unsigned int output_width,
;   short * vp8_filter
;)
;
; Vertical 6-tap filter over rows of four 16-bit samples, 4 output bytes per
; row.
;
; For every output row the six source rows -2 .. +3 (relative to the current
; row, each pixels_per_line BYTES apart) are multiplied by the tap words at
; vp8_filter + 0, 16, 32, 48, 64 and 80 bytes, summed with signed saturation,
; rounded with rd, shifted right by VP8_FILTER_SHIFT, clamped to 0..255 and
; stored as 4 bytes at output_ptr.  output_ptr then advances by output_pitch.
; pixel_step (arg 4) and output_width (arg 6) are never read.
;
; Register roles inside the loop:
;   rsi = address of row -2   rdi = destination row
;   rdx = source row stride   rax = destination pitch
;   rcx = rows remaining      rbx = vp8_filter
;   mm0 = zero                mm5 = rounding constant
;   mm1/mm2/mm6/mm7 = taps at byte offsets 16/32/48/64
;-----------------------------------------------------------------------------
global sym(vp8_filter_block1dc_v6_mmx) PRIVATE
sym(vp8_filter_block1dc_v6_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        ; NOTE(review): rd is fetched before rbx is repurposed, presumably
        ; because GLOBAL() may address through the register handed to
        ; GET_GOT - confirm against x86_abi_support.asm.
        movq        mm5,    [GLOBAL(rd)]        ; mm5 = rounding constant (64)
        push        rbx                         ; preserved until just before RESTORE_GOT
        mov         rbx,    arg(7)              ; vp8_filter
        movq        mm1,    [rbx + 16]          ; tap 1
        movq        mm2,    [rbx + 32]          ; tap 2
        movq        mm6,    [rbx + 48]          ; tap 3
        movq        mm7,    [rbx + 64]          ; tap 4

        movsxd      rdx,    dword ptr arg(3)    ; pixels_per_line (row stride in bytes)
        mov         rdi,    arg(1)              ; output_ptr
        mov         rsi,    arg(0)              ; src_ptr
        sub         rsi,    rdx                 ; rsi = src_ptr - 2 * stride,
        sub         rsi,    rdx                 ; i.e. the address of row -2
        movsxd      rcx,    DWORD PTR arg(5)    ; output_height = row counter
        movsxd      rax,    DWORD PTR arg(2)    ; output_pitch
        pxor        mm0,    mm0                 ; mm0 = 0


.nextrow_cv:
        movq        mm3,    [rsi+rdx]           ; mm3 = row -1
        pmullw      mm3,    mm1                 ; mm3 *= tap 1


        movq        mm4,    [rsi + 4*rdx]       ; mm4 = row 2
        pmullw      mm4,    mm7                 ; mm4 *= tap 4
        paddsw      mm3,    mm4                 ; mm3 += mm4

        movq        mm4,    [rsi + 2*rdx]       ; mm4 = row 0
        pmullw      mm4,    mm2                 ; mm4 *= tap 2
        paddsw      mm3,    mm4                 ; mm3 += mm4

        movq        mm4,    [rsi]               ; mm4 = row -2
        pmullw      mm4,    [rbx]               ; mm4 *= tap 0
        paddsw      mm3,    mm4                 ; mm3 += mm4


        add         rsi,    rdx                 ; step one row down: avoids a 3*stride
                                                ; address and is also the loop advance
        movq        mm4,    [rsi + 2*rdx]       ; mm4 = row 1
        pmullw      mm4,    mm6                 ; mm4 *= tap 3
        paddsw      mm3,    mm4                 ; mm3 += mm4

        movq        mm4,    [rsi + 4*rdx]       ; mm4 = row 3
        pmullw      mm4,    [rbx +80]           ; mm4 *= tap 5
        paddsw      mm3,    mm4                 ; mm3 += mm4


        paddsw      mm3,    mm5                 ; mm3 += rounding constant
        psraw       mm3,    VP8_FILTER_SHIFT    ; mm3 >>= 7 (arithmetic)
        packuswb    mm3,    mm0                 ; clamp to 0..255 and pack to bytes

        movd        [rdi],  mm3                 ; store 4 bytes
        ; Each iteration re-reads five of the six rows loaded by the previous
        ; one.  Since the block should be in cache this shouldn't cost much,
        ; but it is avoidable.
        lea         rdi,    [rdi+rax]           ; output_ptr += output_pitch
        dec         rcx                         ; one row done
        jnz         .nextrow_cv

        pop         rbx

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------------
;void vp8_bilinear_predict8x4_mmx
;(
;    unsigned char  *src_ptr,
;    int   src_pixels_per_line,
;    int  xoffset,
;    int  yoffset,
;    unsigned char *dst_ptr,
;    int dst_pitch
;)
;
; Two-pass bilinear prediction producing 8 bytes per row for 4 rows.
;
; xoffset and yoffset each select a 32-byte entry of
; vp8_bilinear_filters_x86_8; the words at +0 and +16 of an entry are the
; two taps.  Every source row is first filtered horizontally
; (src[x]*H0 + src[x+1]*H1, rounded, >> VP8_FILTER_SHIFT); each output row
; is then previous_row*V0 + current_row*V1, rounded and shifted the same way.
; Five source rows are read in total (one before the loop, one per output
; row).
;
; Register roles inside the loop:
;   rsi = source row               rdi = destination row
;   rdx = src_pixels_per_line      rcx = dst_ptr + 4*dst_pitch (end marker)
;   rax = vertical filter entry    mm0 = zero
;   mm1/mm2 = horizontal taps      mm7 = previous filtered row (8 bytes)
;-----------------------------------------------------------------------------
global sym(vp8_bilinear_predict8x4_mmx) PRIVATE
sym(vp8_bilinear_predict8x4_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
    ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];

        movsxd      rax,    dword ptr arg(2)    ; xoffset
        mov         rdi,    arg(4)              ; dst_ptr

        lea         rcx,    [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
        shl         rax,    5                   ; xoffset * 32 bytes per entry

        mov         rsi,    arg(0)              ; src_ptr
        add         rax,    rcx                 ; rax = HFilter

        movsxd      rdx,    dword ptr arg(5)    ; dst_pitch
        movq        mm1,    [rax]               ; mm1 = HFilter tap 0

        movq        mm2,    [rax+16]            ; mm2 = HFilter tap 1
        movsxd      rax,    dword ptr arg(3)    ; yoffset

        pxor        mm0,    mm0                 ; mm0 = 0
        shl         rax,    5                   ; yoffset * 32 bytes per entry

        add         rax,    rcx                 ; rax = VFilter
        lea         rcx,    [rdi+rdx*4]         ; rcx = dst_ptr + 4 * dst_pitch

        movsxd      rdx,    dword ptr arg(1)    ; src_pixels_per_line

        ; horizontal pass on the first source row
        movq        mm3,    [rsi]               ; mm3 = src[0..7]
        movq        mm4,    mm3                 ; copy of the current row

        punpcklbw   mm3,    mm0                 ; mm3 = src[0..3] as words
        punpckhbw   mm4,    mm0                 ; mm4 = src[4..7] as words

        pmullw      mm3,    mm1                 ; *= HFilter tap 0
        pmullw      mm4,    mm1                 ;

        movq        mm5,    [rsi+1]             ; mm5 = src[1..8]
        movq        mm6,    mm5                 ;

        punpcklbw   mm5,    mm0                 ; mm5 = src[1..4] as words
        punpckhbw   mm6,    mm0                 ; mm6 = src[5..8] as words

        pmullw      mm5,    mm2                 ; *= HFilter tap 1
        pmullw      mm6,    mm2                 ;

        paddw       mm3,    mm5                 ; sum of both taps
        paddw       mm4,    mm6                 ;

        paddw       mm3,    [GLOBAL(rd)]        ; mm3 += round value
        psraw       mm3,    VP8_FILTER_SHIFT    ; mm3 >>= 7

        paddw       mm4,    [GLOBAL(rd)]        ; mm4 += round value
        psraw       mm4,    VP8_FILTER_SHIFT    ; mm4 >>= 7

        movq        mm7,    mm3                 ; keep the filtered row, packed to
        packuswb    mm7,    mm4                 ; 8 bytes, for the vertical pass

        add         rsi,    rdx                 ; next source row
.next_row_8x4:
        ; horizontal pass on the current source row
        movq        mm3,    [rsi]               ; mm3 = src[0..7]
        movq        mm4,    mm3                 ; copy of the current row

        punpcklbw   mm3,    mm0                 ; mm3 = src[0..3] as words
        punpckhbw   mm4,    mm0                 ; mm4 = src[4..7] as words

        pmullw      mm3,    mm1                 ; *= HFilter tap 0
        pmullw      mm4,    mm1                 ;

        movq        mm5,    [rsi+1]             ; mm5 = src[1..8]
        movq        mm6,    mm5                 ;

        punpcklbw   mm5,    mm0                 ; mm5 = src[1..4] as words
        punpckhbw   mm6,    mm0                 ; mm6 = src[5..8] as words

        pmullw      mm5,    mm2                 ; *= HFilter tap 1
        pmullw      mm6,    mm2                 ;

        paddw       mm3,    mm5                 ; sum of both taps
        paddw       mm4,    mm6                 ;

        ; previous filtered row * VFilter tap 0
        movq        mm5,    mm7                 ;
        movq        mm6,    mm7                 ;

        punpcklbw   mm5,    mm0                 ; low 4 samples as words
        punpckhbw   mm6,    mm0                 ; high 4 samples as words

        pmullw      mm5,    [rax]               ;
        pmullw      mm6,    [rax]               ;

        ; finish the horizontal pass of the current row
        paddw       mm3,    [GLOBAL(rd)]        ; mm3 += round value
        psraw       mm3,    VP8_FILTER_SHIFT    ; mm3 >>= 7

        paddw       mm4,    [GLOBAL(rd)]        ; mm4 += round value
        psraw       mm4,    VP8_FILTER_SHIFT    ; mm4 >>= 7

        movq        mm7,    mm3                 ; current row becomes the "previous"
        packuswb    mm7,    mm4                 ; row of the next iteration


        ; current filtered row * VFilter tap 1, plus the previous-row product
        pmullw      mm3,    [rax+16]            ;
        pmullw      mm4,    [rax+16]            ;

        paddw       mm3,    mm5                 ;
        paddw       mm4,    mm6                 ;


        paddw       mm3,    [GLOBAL(rd)]        ; mm3 += round value
        psraw       mm3,    VP8_FILTER_SHIFT    ; mm3 >>= 7

        paddw       mm4,    [GLOBAL(rd)]        ; mm4 += round value
        psraw       mm4,    VP8_FILTER_SHIFT    ; mm4 >>= 7

        packuswb    mm3,    mm4                 ; clamp to 0..255, 8 bytes

        movq        [rdi],  mm3                 ; store 8 bytes

%if ABI_IS_32BIT
        add         rsi,    rdx                 ; next source row
        add         rdi,    dword ptr arg(5)    ; dst_ptr += dst_pitch
%else
        movsxd      r8,     dword ptr arg(5)    ; dst_pitch
        add         rsi,    rdx                 ; next source row
        add         rdi,    r8                  ; dst_ptr += dst_pitch
%endif
        cmp         rdi,    rcx                 ; stop once dst_ptr reaches the
        jne         .next_row_8x4               ; end marker (after 4 rows)

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------------
;void vp8_bilinear_predict4x4_mmx
;(
;    unsigned char  *src_ptr,
;    int   src_pixels_per_line,
;    int  xoffset,
;    int  yoffset,
;    unsigned char *dst_ptr,
;    int dst_pitch
;)
;
; Two-pass bilinear prediction producing 4 bytes per row for 4 rows.
;
; Same scheme as vp8_bilinear_predict8x4_mmx but on 4 samples per row:
; xoffset / yoffset select 32-byte entries of vp8_bilinear_filters_x86_8
; (taps at +0 and +16); each source row is filtered horizontally, and each
; output row combines the previous and current filtered rows with the
; vertical taps.  Five source rows are read in total.
;
; Register roles inside the loop:
;   rsi = source row               rdi = destination row
;   rdx = src_pixels_per_line      rcx = dst_ptr + 4*dst_pitch (end marker)
;   rax = vertical filter entry    mm0 = zero
;   mm1/mm2 = horizontal taps      mm7 = previous filtered row (4 bytes)
;-----------------------------------------------------------------------------
global sym(vp8_bilinear_predict4x4_mmx) PRIVATE
sym(vp8_bilinear_predict4x4_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
    ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];

        movsxd      rax,    dword ptr arg(2)    ; xoffset
        mov         rdi,    arg(4)              ; dst_ptr

        lea         rcx,    [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
        shl         rax,    5                   ; xoffset * 32 bytes per entry

        add         rax,    rcx                 ; rax = HFilter
        mov         rsi,    arg(0)              ; src_ptr

        movsxd      rdx,    dword ptr arg(5)    ; dst_pitch
        movq        mm1,    [rax]               ; mm1 = HFilter tap 0

        movq        mm2,    [rax+16]            ; mm2 = HFilter tap 1
        movsxd      rax,    dword ptr arg(3)    ; yoffset

        pxor        mm0,    mm0                 ; mm0 = 0
        shl         rax,    5                   ; yoffset * 32 bytes per entry

        add         rax,    rcx                 ; rax = VFilter
        lea         rcx,    [rdi+rdx*4]         ; rcx = dst_ptr + 4 * dst_pitch

        movsxd      rdx,    dword ptr arg(1)    ; src_pixels_per_line

        ; horizontal pass on the first source row
        movd        mm3,    [rsi]               ; mm3 = src[0..3]
        punpcklbw   mm3,    mm0                 ; widen to words

        pmullw      mm3,    mm1                 ; *= HFilter tap 0
        movd        mm5,    [rsi+1]             ; mm5 = src[1..4]

        punpcklbw   mm5,    mm0                 ; widen to words
        pmullw      mm5,    mm2                 ; *= HFilter tap 1

        paddw       mm3,    mm5                 ; sum of both taps
        paddw       mm3,    [GLOBAL(rd)]        ; mm3 += round value

        psraw       mm3,    VP8_FILTER_SHIFT    ; mm3 >>= 7

        movq        mm7,    mm3                 ; keep the filtered row, packed to
        packuswb    mm7,    mm0                 ; bytes, for the vertical pass

        add         rsi,    rdx                 ; next source row
.next_row_4x4:
        ; horizontal pass on the current source row
        movd        mm3,    [rsi]               ; mm3 = src[0..3]
        punpcklbw   mm3,    mm0                 ; widen to words

        pmullw      mm3,    mm1                 ; *= HFilter tap 0
        movd        mm5,    [rsi+1]             ; mm5 = src[1..4]

        punpcklbw   mm5,    mm0                 ; widen to words
        pmullw      mm5,    mm2                 ; *= HFilter tap 1

        paddw       mm3,    mm5                 ; sum of both taps

        ; previous filtered row * VFilter tap 0
        movq        mm5,    mm7                 ;
        punpcklbw   mm5,    mm0                 ; widen to words

        pmullw      mm5,    [rax]               ;
        paddw       mm3,    [GLOBAL(rd)]        ; mm3 += round value (horizontal pass)

        psraw       mm3,    VP8_FILTER_SHIFT    ; mm3 >>= 7
        movq        mm7,    mm3                 ; current row becomes the "previous"

        packuswb    mm7,    mm0                 ; row of the next iteration

        ; current filtered row * VFilter tap 1, plus the previous-row product
        pmullw      mm3,    [rax+16]            ;
        paddw       mm3,    mm5                 ;


        paddw       mm3,    [GLOBAL(rd)]        ; mm3 += round value
        psraw       mm3,    VP8_FILTER_SHIFT    ; mm3 >>= 7

        packuswb    mm3,    mm0                 ; clamp to 0..255 and pack to bytes
        movd        [rdi],  mm3                 ; store 4 bytes

%if ABI_IS_32BIT
        add         rsi,    rdx                 ; next source row
        add         rdi,    dword ptr arg(5)    ; dst_ptr += dst_pitch
%else
        movsxd      r8,     dword ptr arg(5)    ; dst_pitch
        add         rsi,    rdx                 ; next source row
        add         rdi,    r8                  ; dst_ptr += dst_pitch
%endif

        cmp         rdi,    rcx                 ; stop once dst_ptr reaches the
        jne         .next_row_4x4               ; end marker (after 4 rows)

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret



SECTION_RODATA
; Rounding constant added before the >> VP8_FILTER_SHIFT:
; 0x40 = 1 << (VP8_FILTER_SHIFT - 1), replicated over 4 words.
align 16
rd:
    times 4 dw 0x40

; Six-tap filter table: 8 filters, 6 taps each, every tap replicated over
; 8 words (16 bytes), so one filter occupies 96 bytes and tap k sits at byte
; offset 16*k.  The taps of each filter sum to 128 (= 1 << VP8_FILTER_SHIFT).
align 16
global HIDDEN_DATA(sym(vp8_six_tap_x86))
sym(vp8_six_tap_x86):
    ; filter 0
    times 8 dw 0
    times 8 dw 0
    times 8 dw 128
    times 8 dw 0
    times 8 dw 0
    times 8 dw 0

    ; filter 1
    times 8 dw 0
    times 8 dw -6
    times 8 dw 123
    times 8 dw 12
    times 8 dw -1
    times 8 dw 0

    ; filter 2
    times 8 dw 2
    times 8 dw -11
    times 8 dw 108
    times 8 dw 36
    times 8 dw -8
    times 8 dw 1

    ; filter 3
    times 8 dw 0
    times 8 dw -9
    times 8 dw 93
    times 8 dw 50
    times 8 dw -6
    times 8 dw 0

    ; filter 4
    times 8 dw 3
    times 8 dw -16
    times 8 dw 77
    times 8 dw 77
    times 8 dw -16
    times 8 dw 3

    ; filter 5
    times 8 dw 0
    times 8 dw -6
    times 8 dw 50
    times 8 dw 93
    times 8 dw -9
    times 8 dw 0

    ; filter 6
    times 8 dw 1
    times 8 dw -8
    times 8 dw 36
    times 8 dw 108
    times 8 dw -11
    times 8 dw 2

    ; filter 7
    times 8 dw 0
    times 8 dw -1
    times 8 dw 12
    times 8 dw 123
    times 8 dw -6
    times 8 dw 0