;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"
%define _t0 0
%define _t1 _t0 + 16
%define _p3 _t1 + 16
%define _p2 _p3 + 16
%define _p1 _p2 + 16
%define _p0 _p1 + 16
%define _q0 _p0 + 16
%define _q1 _q0 + 16
%define _q2 _q1 + 16
%define _q3 _q2 + 16
%define lf_var_size 160

; Use of pmaxub instead of psubusb to compute filter mask was seen
; in ffvp8

%macro LFH_FILTER_AND_HEV_MASK 1
%if %1
    movdqa      xmm2, [rdi+2*rax]       ; q3
    movdqa      xmm1, [rsi+2*rax]       ; q2
    movdqa      xmm4, [rsi+rax]         ; q1
    movdqa      xmm5, [rsi]             ; q0
    neg         rax                     ; negate pitch to deal with above border
%else
    movlps      xmm2, [rsi + rcx*2]     ; q3
    movlps      xmm1, [rsi + rcx]       ; q2
    movlps      xmm4, [rsi]             ; q1
    movlps      xmm5, [rsi + rax]       ; q0

    movhps      xmm2, [rdi + rcx*2]
    movhps      xmm1, [rdi + rcx]
    movhps      xmm4, [rdi]
    movhps      xmm5, [rdi + rax]

    lea         rsi, [rsi + rax*4]
    lea         rdi, [rdi + rax*4]

    movdqa      [rsp+_q2], xmm1         ; store q2
    movdqa      [rsp+_q1], xmm4         ; store q1
%endif
    movdqa      xmm7, [rdx]             ; limit

    movdqa      xmm6, xmm1              ; q2
    movdqa      xmm3, xmm4              ; q1

    psubusb     xmm1, xmm2              ; q2-=q3
    psubusb     xmm2, xmm6              ; q3-=q2

    psubusb     xmm4, xmm6              ; q1-=q2
    psubusb     xmm6, xmm3              ; q2-=q1

    por         xmm4, xmm6              ; abs(q2-q1)
    por         xmm1, xmm2              ; abs(q3-q2)

    movdqa      xmm0, xmm5              ; q0
    pmaxub      xmm1, xmm4

    psubusb     xmm5, xmm3              ; q0-=q1
    psubusb     xmm3, xmm0              ; q1-=q0

    por         xmm5, xmm3              ; abs(q0-q1)
    movdqa      [rsp+_t0], xmm5         ; save to t0

    pmaxub      xmm1, xmm5

%if %1
    movdqa      xmm2, [rsi+4*rax]       ; p3
    movdqa      xmm4, [rdi+4*rax]       ; p2
    movdqa      xmm6, [rsi+2*rax]       ; p1
%else
    movlps      xmm2, [rsi + rax]       ; p3
    movlps      xmm4, [rsi]             ; p2
    movlps      xmm6, [rsi + rcx]       ; p1

    movhps      xmm2, [rdi + rax]
    movhps      xmm4, [rdi]
    movhps      xmm6, [rdi + rcx]

    movdqa      [rsp+_p2], xmm4         ; store p2
    movdqa      [rsp+_p1], xmm6         ; store p1
%endif

    movdqa      xmm5, xmm4              ; p2
    movdqa      xmm3, xmm6              ; p1

    psubusb     xmm4, xmm2              ; p2-=p3
    psubusb     xmm2, xmm5              ; p3-=p2

    psubusb     xmm3, xmm5              ; p1-=p2
    pmaxub      xmm1, xmm4              ; abs(p3 - p2)

    psubusb     xmm5, xmm6              ; p2-=p1
    pmaxub      xmm1, xmm2              ; abs(p3 - p2)

    pmaxub      xmm1, xmm5              ; abs(p2 - p1)
    movdqa      xmm2, xmm6              ; p1

    pmaxub      xmm1, xmm3              ; abs(p2 - p1)
%if %1
    movdqa      xmm4, [rsi+rax]         ; p0
    movdqa      xmm3, [rdi]             ; q1
%else
    movlps      xmm4, [rsi + rcx*2]     ; p0
    movhps      xmm4, [rdi + rcx*2]
    movdqa      xmm3, [rsp+_q1]         ; q1
%endif

    movdqa      xmm5, xmm4              ; p0
    psubusb     xmm4, xmm6              ; p0-=p1

    psubusb     xmm6, xmm5              ; p1-=p0

    por         xmm6, xmm4              ; abs(p1 - p0)
    mov         rdx, arg(2)             ; get blimit

    movdqa      [rsp+_t1], xmm6         ; save to t1

    movdqa      xmm4, xmm3              ; q1
    pmaxub      xmm1, xmm6

    psubusb     xmm3, xmm2              ; q1-=p1
    psubusb     xmm2, xmm4              ; p1-=q1

    psubusb     xmm1, xmm7
    por         xmm2, xmm3              ; abs(p1-q1)

    movdqa      xmm7, [rdx]             ; blimit
    mov         rdx, arg(4)             ; hev get thresh

    movdqa      xmm3, xmm0              ; q0
    pand        xmm2, [GLOBAL(tfe)]     ; set lsb of each byte to zero

    movdqa      xmm6, xmm5              ; p0
    psrlw       xmm2, 1                 ; abs(p1-q1)/2

    psubusb     xmm5, xmm3              ; p0-=q0
    psubusb     xmm3, xmm6              ; q0-=p0
    por         xmm5, xmm3              ; abs(p0 - q0)

    paddusb     xmm5, xmm5              ; abs(p0-q0)*2

    movdqa      xmm4, [rsp+_t0]         ; hev get abs (q1 - q0)
    movdqa      xmm3, [rsp+_t1]         ; get abs (p1 - p0)

    paddusb     xmm5, xmm2              ; abs (p0 - q0) *2 + abs(p1-q1)/2

    movdqa      xmm2, [rdx]             ; hev

    psubusb     xmm5, xmm7              ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
    psubusb     xmm4, xmm2              ; hev

    psubusb     xmm3, xmm2              ; hev
    por         xmm1, xmm5

    pxor        xmm7, xmm7
    paddb       xmm4, xmm3              ; hev abs(q1 - q0) > thresh || abs(p1 - p0) > thresh

    pcmpeqb     xmm4, xmm5              ; hev
    pcmpeqb     xmm3, xmm3              ; hev

    pcmpeqb     xmm1, xmm7              ; mask xmm1
    pxor        xmm4, xmm3              ; hev
%endmacro
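
; On exit from LFH_FILTER_AND_HEV_MASK:
;   xmm1 = filter mask: 0xff in each byte lane where all of abs(p3-p2),
;          abs(p2-p1), abs(p1-p0), abs(q1-q0), abs(q2-q1), abs(q3-q2)
;          <= limit and abs(p0-q0)*2 + abs(p1-q1)/2 <= blimit
;   xmm4 = high edge variance (hev) mask: 0xff where abs(p1-p0) > thresh
;          or abs(q1-q0) > thresh
; B_FILTER below consumes both masks; it also expects p0/q0 still live
; in xmm6/xmm0, except in its stack-workspace mode (%1 == 2), which
; reloads them.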
%macro B_FILTER 1
    movdqa      xmm3, [GLOBAL(t80)]
%if %1 == 0
    movdqa      xmm2, [rsp+_p1]         ; p1
    movdqa      xmm7, [rsp+_q1]         ; q1
%elif %1 == 1
    movdqa      xmm2, [rsi+2*rax]       ; p1
    movdqa      xmm7, [rdi]             ; q1
%elif %1 == 2
    movdqa      xmm2, [rsp+_p1]         ; p1
    movdqa      xmm6, [rsp+_p0]         ; p0
    movdqa      xmm0, [rsp+_q0]         ; q0
    movdqa      xmm7, [rsp+_q1]         ; q1
%endif

    pxor        xmm2, xmm3              ; p1 offset to convert to signed values
    pxor        xmm7, xmm3              ; q1 offset to convert to signed values

    psubsb      xmm2, xmm7              ; p1 - q1
    pxor        xmm6, xmm3              ; offset to convert to signed values

    pand        xmm2, xmm4              ; high var mask (hvm)(p1 - q1)
    pxor        xmm0, xmm3              ; offset to convert to signed values

    movdqa      xmm3, xmm0              ; q0
    psubsb      xmm0, xmm6              ; q0 - p0
    paddsb      xmm2, xmm0              ; 1 * (q0 - p0) + hvm(p1 - q1)
    paddsb      xmm2, xmm0              ; 2 * (q0 - p0) + hvm(p1 - q1)
    paddsb      xmm2, xmm0              ; 3 * (q0 - p0) + hvm(p1 - q1)
    pand        xmm1, xmm2              ; mask filter values we don't care about

    movdqa      xmm2, xmm1
    paddsb      xmm1, [GLOBAL(t4)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 4
    paddsb      xmm2, [GLOBAL(t3)]      ; 3* (q0 - p0) + hvm(p1 - q1) + 3

    punpckhbw   xmm5, xmm2              ; axbxcxdx
    punpcklbw   xmm2, xmm2              ; exfxgxhx

    punpcklbw   xmm0, xmm1              ; exfxgxhx
    psraw       xmm5, 11                ; sign extended shift right by 3

    punpckhbw   xmm1, xmm1              ; axbxcxdx
    psraw       xmm2, 11                ; sign extended shift right by 3

    packsswb    xmm2, xmm5              ; (3* (q0 - p0) + hvm(p1 - q1) + 3) >> 3;
    psraw       xmm0, 11                ; sign extended shift right by 3

    psraw       xmm1, 11                ; sign extended shift right by 3
    movdqa      xmm5, xmm0              ; save results

    packsswb    xmm0, xmm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3

    paddsb      xmm6, xmm2              ; p0+= p0 add

    movdqa      xmm2, [GLOBAL(ones)]
    paddsw      xmm5, xmm2
    paddsw      xmm1, xmm2
    psraw       xmm5, 1                 ; partial shifted one more time for 2nd tap
    psraw       xmm1, 1                 ; partial shifted one more time for 2nd tap
    packsswb    xmm5, xmm1              ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
    movdqa      xmm2, [GLOBAL(t80)]

%if %1 == 0
    movdqa      xmm1, [rsp+_p1]         ; p1
    lea         rsi, [rsi + rcx*2]
    lea         rdi, [rdi + rcx*2]
%elif %1 == 1
    movdqa      xmm1, [rsi+2*rax]       ; p1
%elif %1 == 2
    movdqa      xmm1, [rsp+_p1]         ; p1
%endif

    pandn       xmm4, xmm5              ; high edge variance additive
    pxor        xmm6, xmm2              ; unoffset

    pxor        xmm1, xmm2              ; reoffset
    psubsb      xmm3, xmm0              ; q0-= q0 add

    paddsb      xmm1, xmm4              ; p1+= p1 add
    pxor        xmm3, xmm2              ; unoffset

    pxor        xmm1, xmm2              ; unoffset
    psubsb      xmm7, xmm4              ; q1-= q1 add

    pxor        xmm7, xmm2              ; unoffset
%if %1 == 0
    movq        [rsi], xmm6             ; p0
    movhps      [rdi], xmm6
    movq        [rsi + rax], xmm1       ; p1
    movhps      [rdi + rax], xmm1
    movq        [rsi + rcx], xmm3       ; q0
    movhps      [rdi + rcx], xmm3
    movq        [rsi + rcx*2], xmm7     ; q1
    movhps      [rdi + rcx*2], xmm7
%elif %1 == 1
    movdqa      [rsi+rax], xmm6         ; write back
    movdqa      [rsi+2*rax], xmm1       ; write back
    movdqa      [rsi], xmm3             ; write back
    movdqa      [rdi], xmm7             ; write back
%endif

%endmacro
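
; The wrappers below apply the mask and filter macros to a 16-pixel-wide
; edge. Macro argument 1 selects the horizontal luma layout (16 pixels
; per row, read in place around rsi/rdi); argument 0 selects the chroma
; layout, where 8 pixels of u (via rsi) and 8 pixels of v (via rdi) are
; packed into one xmm register and the intermediate rows are kept in
; the stack workspace.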
%if ABI_IS_32BIT

;void vp8_loop_filter_horizontal_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh
;)
global sym(vp8_loop_filter_horizontal_edge_sse2) PRIVATE
sym(vp8_loop_filter_horizontal_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, lf_var_size

    mov         rsi, arg(0)             ; src_ptr
    movsxd      rax, dword ptr arg(1)   ; src_pixel_step

    mov         rdx, arg(3)             ; limit

    lea         rdi, [rsi+rax]          ; rdi points to row +1 for indirect addressing

    ; calculate breakout conditions and high edge variance
    LFH_FILTER_AND_HEV_MASK 1
    ; filter and write back the result
    B_FILTER 1

    add         rsp, lf_var_size
    pop         rsp
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

%endif

;void vp8_loop_filter_horizontal_edge_uv_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v
;)
global sym(vp8_loop_filter_horizontal_edge_uv_sse2) PRIVATE
sym(vp8_loop_filter_horizontal_edge_uv_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, lf_var_size

    mov         rsi, arg(0)             ; u
    mov         rdi, arg(5)             ; v
    movsxd      rax, dword ptr arg(1)   ; src_pixel_step
    mov         rcx, rax
    neg         rax                     ; negate pitch to deal with above border

    mov         rdx, arg(3)             ; limit

    lea         rsi, [rsi + rcx]
    lea         rdi, [rdi + rcx]

    ; calculate breakout conditions and high edge variance
    LFH_FILTER_AND_HEV_MASK 0
    ; filter and write back the result
    B_FILTER 0

    add         rsp, lf_var_size
    pop         rsp
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
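
; MB_FILTER_AND_WRITEBACK applies the wide (macroblock-edge) filter:
;   filter = clamp(clamp(p1 - q1) + 3*(q0 - p0)) & mask
;   the hev part (Filter2 = filter & hev) updates p0/q0 through the
;   same clamp(+4)>>3 / clamp(+3)>>3 pair used in B_FILTER
;   the remainder F = filter & ~hev is spread over three taps:
;     u1 = clamp((63 + F *  9) >> 7)   p2 += u1, q2 -= u1
;     u2 = clamp((63 + F * 18) >> 7)   p1 += u2, q1 -= u2
;     u3 = clamp((63 + F * 27) >> 7)   p0 += u3, q0 -= u3
; The 9*F products use pmulhw with s9 = 0x0900: F sits in the high
; byte of each word, so ((F << 8) * (9 << 8)) >> 16 recovers 9*F.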
%macro MB_FILTER_AND_WRITEBACK 1
    movdqa      xmm3, [GLOBAL(t80)]
%if %1 == 0
    movdqa      xmm2, [rsp+_p1]         ; p1
    movdqa      xmm7, [rsp+_q1]         ; q1
%elif %1 == 1
    movdqa      xmm2, [rsi+2*rax]       ; p1
    movdqa      xmm7, [rdi]             ; q1

    mov         rcx, rax
    neg         rcx
%elif %1 == 2
    movdqa      xmm2, [rsp+_p1]         ; p1
    movdqa      xmm6, [rsp+_p0]         ; p0
    movdqa      xmm0, [rsp+_q0]         ; q0
    movdqa      xmm7, [rsp+_q1]         ; q1
%endif

    pxor        xmm2, xmm3              ; p1 offset to convert to signed values
    pxor        xmm7, xmm3              ; q1 offset to convert to signed values
    pxor        xmm6, xmm3              ; offset to convert to signed values
    pxor        xmm0, xmm3              ; offset to convert to signed values

    psubsb      xmm2, xmm7              ; p1 - q1

    movdqa      xmm3, xmm0              ; q0
    psubsb      xmm0, xmm6              ; q0 - p0
    paddsb      xmm2, xmm0              ; 1 * (q0 - p0) + (p1 - q1)
    paddsb      xmm2, xmm0              ; 2 * (q0 - p0)
    paddsb      xmm2, xmm0              ; 3 * (q0 - p0) + (p1 - q1)
    pand        xmm1, xmm2              ; mask filter values we don't care about

    movdqa      xmm2, xmm1              ; vp8_filter

    pand        xmm2, xmm4              ; Filter2 = vp8_filter & hev
    pxor        xmm0, xmm0

    pandn       xmm4, xmm1              ; vp8_filter&=~hev
    pxor        xmm1, xmm1

    punpcklbw   xmm0, xmm4              ; Filter 2 (hi)
    punpckhbw   xmm1, xmm4              ; Filter 2 (lo)

    movdqa      xmm5, xmm2

    movdqa      xmm4, [GLOBAL(s9)]
    paddsb      xmm5, [GLOBAL(t3)]      ; vp8_signed_char_clamp(Filter2 + 3)
    paddsb      xmm2, [GLOBAL(t4)]      ; vp8_signed_char_clamp(Filter2 + 4)

    pmulhw      xmm1, xmm4              ; Filter 2 (lo) * 9
    pmulhw      xmm0, xmm4              ; Filter 2 (hi) * 9

    punpckhbw   xmm7, xmm5              ; axbxcxdx
    punpcklbw   xmm5, xmm5              ; exfxgxhx

    psraw       xmm7, 11                ; sign extended shift right by 3

    psraw       xmm5, 11                ; sign extended shift right by 3
    punpckhbw   xmm4, xmm2              ; axbxcxdx

    punpcklbw   xmm2, xmm2              ; exfxgxhx
    psraw       xmm4, 11                ; sign extended shift right by 3

    packsswb    xmm5, xmm7              ; Filter2 >>=3;
    psraw       xmm2, 11                ; sign extended shift right by 3

    packsswb    xmm2, xmm4              ; Filter1 >>=3;

    paddsb      xmm6, xmm5              ; ps0 = ps0 + Filter2

    psubsb      xmm3, xmm2              ; qs0 = qs0 - Filter1
    movdqa      xmm7, xmm1

    movdqa      xmm4, [GLOBAL(s63)]
    movdqa      xmm5, xmm0
    movdqa      xmm2, xmm5
    paddw       xmm0, xmm4              ; Filter 2 (hi) * 9 + 63
    paddw       xmm1, xmm4              ; Filter 2 (lo) * 9 + 63
    movdqa      xmm4, xmm7

    paddw       xmm5, xmm5              ; Filter 2 (hi) * 18

    paddw       xmm7, xmm7              ; Filter 2 (lo) * 18
    paddw       xmm5, xmm0              ; Filter 2 (hi) * 27 + 63

    paddw       xmm7, xmm1              ; Filter 2 (lo) * 27 + 63
    paddw       xmm2, xmm0              ; Filter 2 (hi) * 18 + 63
    psraw       xmm0, 7                 ; (Filter 2 (hi) * 9 + 63) >> 7

    paddw       xmm4, xmm1              ; Filter 2 (lo) * 18 + 63
    psraw       xmm1, 7                 ; (Filter 2 (lo) * 9 + 63) >> 7
    psraw       xmm2, 7                 ; (Filter 2 (hi) * 18 + 63) >> 7

    packsswb    xmm0, xmm1              ; u1 = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)

    psraw       xmm4, 7                 ; (Filter 2 (lo) * 18 + 63) >> 7
    psraw       xmm5, 7                 ; (Filter 2 (hi) * 27 + 63) >> 7
    psraw       xmm7, 7                 ; (Filter 2 (lo) * 27 + 63) >> 7

    packsswb    xmm5, xmm7              ; u3 = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)
    packsswb    xmm2, xmm4              ; u2 = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)
    movdqa      xmm7, [GLOBAL(t80)]

%if %1 == 0
    movdqa      xmm1, [rsp+_q1]         ; q1
    movdqa      xmm4, [rsp+_p1]         ; p1
    lea         rsi, [rsi+rcx*2]
    lea         rdi, [rdi+rcx*2]

%elif %1 == 1
    movdqa      xmm1, [rdi]             ; q1
    movdqa      xmm4, [rsi+rax*2]       ; p1
%elif %1 == 2
    movdqa      xmm4, [rsp+_p1]         ; p1
    movdqa      xmm1, [rsp+_q1]         ; q1
%endif

    pxor        xmm1, xmm7
    pxor        xmm4, xmm7

    psubsb      xmm3, xmm5              ; sq = vp8_signed_char_clamp(qs0 - u3)
    paddsb      xmm6, xmm5              ; sp = vp8_signed_char_clamp(ps0 + u3)
    psubsb      xmm1, xmm2              ; sq = vp8_signed_char_clamp(qs1 - u2)
    paddsb      xmm4, xmm2              ; sp = vp8_signed_char_clamp(ps1 + u2)

%if %1 == 1
    movdqa      xmm2, [rdi+rax*4]       ; p2
    movdqa      xmm5, [rdi+rcx]         ; q2
%else
    movdqa      xmm2, [rsp+_p2]         ; p2
    movdqa      xmm5, [rsp+_q2]         ; q2
%endif

    pxor        xmm1, xmm7              ; *oq1 = sq^0x80;
    pxor        xmm4, xmm7              ; *op1 = sp^0x80;
    pxor        xmm2, xmm7
    pxor        xmm5, xmm7
    paddsb      xmm2, xmm0              ; sp = vp8_signed_char_clamp(ps2 + u1)
    psubsb      xmm5, xmm0              ; sq = vp8_signed_char_clamp(qs2 - u1)
    pxor        xmm2, xmm7              ; *op2 = sp^0x80;
    pxor        xmm5, xmm7              ; *oq2 = sq^0x80;
    pxor        xmm3, xmm7              ; *oq0 = sq^0x80
    pxor        xmm6, xmm7              ; *op0 = sp^0x80
%if %1 == 0
    movq        [rsi], xmm6             ; p0
    movhps      [rdi], xmm6
    movq        [rsi + rcx], xmm3       ; q0
    movhps      [rdi + rcx], xmm3
    lea         rdx, [rcx + rcx*2]
    movq        [rsi+rcx*2], xmm1       ; q1
    movhps      [rdi+rcx*2], xmm1

    movq        [rsi + rax], xmm4       ; p1
    movhps      [rdi + rax], xmm4

    movq        [rsi+rax*2], xmm2       ; p2
    movhps      [rdi+rax*2], xmm2

    movq        [rsi+rdx], xmm5         ; q2
    movhps      [rdi+rdx], xmm5
%elif %1 == 1
    movdqa      [rdi+rcx], xmm5         ; q2
    movdqa      [rdi], xmm1             ; q1
    movdqa      [rsi], xmm3             ; q0
    movdqa      [rsi+rax], xmm6         ; p0
    movdqa      [rsi+rax*2], xmm4       ; p1
    movdqa      [rdi+rax*4], xmm2       ; p2
%elif %1 == 2
    movdqa      [rsp+_p1], xmm4         ; p1
    movdqa      [rsp+_p0], xmm6         ; p0
    movdqa      [rsp+_q0], xmm3         ; q0
    movdqa      [rsp+_q1], xmm1         ; q1
%endif

%endmacro

;void vp8_mbloop_filter_horizontal_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh
;)
global sym(vp8_mbloop_filter_horizontal_edge_sse2) PRIVATE
sym(vp8_mbloop_filter_horizontal_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, lf_var_size

    mov         rsi, arg(0)             ; src_ptr
    movsxd      rax, dword ptr arg(1)   ; src_pixel_step
    mov         rdx, arg(3)             ; limit

    lea         rdi, [rsi+rax]          ; rdi points to row +1 for indirect addressing

    ; calculate breakout conditions and high edge variance
    LFH_FILTER_AND_HEV_MASK 1
    ; filter and write back the results
    MB_FILTER_AND_WRITEBACK 1

    add         rsp, lf_var_size
    pop         rsp
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_mbloop_filter_horizontal_edge_uv_sse2
;(
;    unsigned char *u,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v
;)
global sym(vp8_mbloop_filter_horizontal_edge_uv_sse2) PRIVATE
sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, lf_var_size

    mov         rsi, arg(0)             ; u
    mov         rdi, arg(5)             ; v
    movsxd      rax, dword ptr arg(1)   ; src_pixel_step
    mov         rcx, rax
    neg         rax                     ; negate pitch to deal with above border
    mov         rdx, arg(3)             ; limit

    lea         rsi, [rsi + rcx]
    lea         rdi, [rdi + rcx]

    ; calculate breakout conditions and high edge variance
    LFH_FILTER_AND_HEV_MASK 0
    ; filter and write back the results
    MB_FILTER_AND_WRITEBACK 0

    add         rsp, lf_var_size
    pop         rsp
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
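
; TRANSPOSE_16X8 reads a 16x8 block (16 rows of 8 pixels, fetched as
; movq pairs through rsi/rdi) and transposes it into eight 16-byte row
; vectors p3..q3. %1 = 1 keeps walking the same plane (luma); %1 = 0
; switches to the v plane (arg(5)) for the second 8 rows. p1, p0, q0
; and q1 are always stored to the stack workspace; %2 = 0 additionally
; spills p3, p2, q2 and q3 there for the wide filter.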
%macro TRANSPOSE_16X8 2
    movq        xmm4, [rsi]             ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00
    movq        xmm1, [rdi]             ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10
    movq        xmm0, [rsi+2*rax]       ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20
    movq        xmm7, [rdi+2*rax]       ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30
    movq        xmm5, [rsi+4*rax]       ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40
    movq        xmm2, [rdi+4*rax]       ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50

    punpcklbw   xmm4, xmm1              ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00

    movq        xmm1, [rdi+2*rcx]       ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70

    movdqa      xmm3, xmm4              ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
    punpcklbw   xmm0, xmm7              ; 37 27 36 26 35 25 34 24 33 23 32 22 31 21 30 20

    movq        xmm7, [rsi+2*rcx]       ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60

    punpcklbw   xmm5, xmm2              ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
%if %1
    lea         rsi, [rsi+rax*8]
    lea         rdi, [rdi+rax*8]
%else
    mov         rsi, arg(5)             ; v_ptr
%endif

    movdqa      xmm6, xmm5              ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
    punpcklbw   xmm7, xmm1              ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60
    punpcklwd   xmm5, xmm7              ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
    punpckhwd   xmm6, xmm7              ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44
    punpcklwd   xmm3, xmm0              ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00

%if %1 == 0
    lea         rdi, [rsi + rax - 4]    ; rdi points to row +1 for indirect addressing
    lea         rsi, [rsi - 4]
%endif

    movdqa      xmm2, xmm3              ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
    punpckhwd   xmm4, xmm0              ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04

    movdqa      xmm7, xmm4              ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
    punpckhdq   xmm3, xmm5              ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02

    punpckhdq   xmm7, xmm6              ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06

    punpckldq   xmm4, xmm6              ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04

    punpckldq   xmm2, xmm5              ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00

    movdqa      [rsp+_t0], xmm2         ; save to free XMM2

    movq        xmm2, [rsi]             ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80
    movq        xmm6, [rdi]             ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90
    movq        xmm0, [rsi+2*rax]       ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0
    movq        xmm5, [rdi+2*rax]       ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0
    movq        xmm1, [rsi+4*rax]       ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0

    punpcklbw   xmm2, xmm6              ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80

    movq        xmm6, [rdi+4*rax]       ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0

    punpcklbw   xmm0, xmm5              ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0

    movq        xmm5, [rsi+2*rcx]       ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0

    punpcklbw   xmm1, xmm6              ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 c1 d0 c0

    movq        xmm6, [rdi+2*rcx]       ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0

    punpcklbw   xmm5, xmm6              ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0

    movdqa      xmm6, xmm1              ;
    punpckhwd   xmm6, xmm5              ; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4

    punpcklwd   xmm1, xmm5              ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
    movdqa      xmm5, xmm2              ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80

    punpcklwd   xmm5, xmm0              ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80

    punpckhwd   xmm2, xmm0              ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84

    movdqa      xmm0, xmm5
    punpckldq   xmm0, xmm1              ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80

    punpckhdq   xmm5, xmm1              ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
    movdqa      xmm1, xmm2              ; b7 a7 97 87 b6 a6 96 86 b5 a5 95 85 b4 a4 94 84

    punpckldq   xmm1, xmm6              ; f5 e5 d5 c5 b5 a5 95 85 f4 e4 d4 c4 b4 a4 94 84

    punpckhdq   xmm2, xmm6              ; f7 e7 d7 c7 b7 a7 97 87 f6 e6 d6 c6 b6 a6 96 86
    movdqa      xmm6, xmm7              ; 77 67 57 47 37 27 17 07 76 66 56 46 36 26 16 06

    punpcklqdq  xmm6, xmm2              ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06

    punpckhqdq  xmm7, xmm2              ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07

%if %2 == 0
    movdqa      [rsp+_q3], xmm7         ; save 7
    movdqa      [rsp+_q2], xmm6         ; save 6
%endif
    movdqa      xmm2, xmm3              ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
    punpckhqdq  xmm3, xmm5              ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
    punpcklqdq  xmm2, xmm5              ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
    movdqa      [rsp+_p1], xmm2         ; save 2

    movdqa      xmm5, xmm4              ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
    punpcklqdq  xmm4, xmm1              ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
    movdqa      [rsp+_p0], xmm3         ; save 3

    punpckhqdq  xmm5, xmm1              ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05

    movdqa      [rsp+_q0], xmm4         ; save 4
    movdqa      [rsp+_q1], xmm5         ; save 5
    movdqa      xmm1, [rsp+_t0]

    movdqa      xmm2, xmm1              ;
    punpckhqdq  xmm1, xmm0              ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
    punpcklqdq  xmm2, xmm0              ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00

%if %2 == 0
    movdqa      [rsp+_p2], xmm1
    movdqa      [rsp+_p3], xmm2
%endif

%endmacro
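
; LFV_FILTER_MASK_HEV_MASK computes the same mask and hev terms as the
; horizontal version, but on transposed rows: p3/p2 arrive in xmm2/xmm1,
; p0 in xmm3 and q2/q3 in xmm6/xmm7 straight from TRANSPOSE_16X8, while
; p1, q0 and q1 are reloaded from the stack workspace.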
%macro LFV_FILTER_MASK_HEV_MASK 0
    movdqa      xmm0, xmm6              ; q2
    psubusb     xmm0, xmm7              ; q2-q3

    psubusb     xmm7, xmm6              ; q3-q2
    movdqa      xmm4, xmm5              ; q1

    por         xmm7, xmm0              ; abs (q3-q2)
    psubusb     xmm4, xmm6              ; q1-q2

    movdqa      xmm0, xmm1
    psubusb     xmm6, xmm5              ; q2-q1

    por         xmm6, xmm4              ; abs (q2-q1)
    psubusb     xmm0, xmm2              ; p2 - p3;

    psubusb     xmm2, xmm1              ; p3 - p2;
    por         xmm0, xmm2              ; abs(p2-p3)

    movdqa      xmm5, [rsp+_p1]         ; p1
    pmaxub      xmm0, xmm7

    movdqa      xmm2, xmm5              ; p1
    psubusb     xmm5, xmm1              ; p1-p2
    psubusb     xmm1, xmm2              ; p2-p1

    movdqa      xmm7, xmm3              ; p0
    psubusb     xmm7, xmm2              ; p0-p1

    por         xmm1, xmm5              ; abs(p2-p1)
    pmaxub      xmm0, xmm6

    pmaxub      xmm0, xmm1
    movdqa      xmm1, xmm2              ; p1

    psubusb     xmm2, xmm3              ; p1-p0

    por         xmm2, xmm7              ; abs(p1-p0)

    pmaxub      xmm0, xmm2

    movdqa      xmm5, [rsp+_q0]         ; q0
    movdqa      xmm7, [rsp+_q1]         ; q1

    mov         rdx, arg(3)             ; limit

    movdqa      xmm6, xmm5              ; q0
    movdqa      xmm4, xmm7              ; q1

    psubusb     xmm5, xmm7              ; q0-q1
    psubusb     xmm7, xmm6              ; q1-q0

    por         xmm7, xmm5              ; abs(q1-q0)

    pmaxub      xmm0, xmm7

    psubusb     xmm0, [rdx]             ; limit

    mov         rdx, arg(2)             ; blimit
    movdqa      xmm5, xmm4              ; q1

    psubusb     xmm5, xmm1              ; q1-=p1
    psubusb     xmm1, xmm4              ; p1-=q1

    por         xmm5, xmm1              ; abs(p1-q1)
    movdqa      xmm1, xmm3              ; p0

    pand        xmm5, [GLOBAL(tfe)]     ; set lsb of each byte to zero
    psubusb     xmm1, xmm6              ; p0-q0

    movdqa      xmm4, [rdx]             ; blimit
    mov         rdx, arg(4)             ; get thresh

    psrlw       xmm5, 1                 ; abs(p1-q1)/2
    psubusb     xmm6, xmm3              ; q0-p0

    por         xmm1, xmm6              ; abs(q0-p0)
    paddusb     xmm1, xmm1              ; abs(q0-p0)*2
    movdqa      xmm3, [rdx]

    paddusb     xmm1, xmm5              ; abs (p0 - q0) *2 + abs(p1-q1)/2
    psubusb     xmm2, xmm3              ; abs(p1 - p0) > thresh

    psubusb     xmm7, xmm3              ; abs(q1 - q0) > thresh

    psubusb     xmm1, xmm4              ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
    por         xmm2, xmm7              ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh

    por         xmm1, xmm0              ; mask
    pcmpeqb     xmm2, xmm0

    pxor        xmm0, xmm0
    pcmpeqb     xmm4, xmm4

    pcmpeqb     xmm1, xmm0
    pxor        xmm4, xmm2
%endmacro
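
; Only p1, p0, q0 and q1 are changed by the B filter, so BV_TRANSPOSE
; rebuilds just those four columns and BV_WRITEBACK stores them with
; 4-byte movd writes at byte offset +2 into each row, i.e. columns
; 2..5 of the 8 that were transposed in.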
%macro BV_TRANSPOSE 0
    ; xmm1 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
    ; xmm6 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
    ; xmm3 = f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
    ; xmm7 = f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
    movdqa      xmm2, xmm1              ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
    punpcklbw   xmm2, xmm6              ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02

    movdqa      xmm4, xmm3              ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
    punpckhbw   xmm1, xmm6              ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

    punpcklbw   xmm4, xmm7              ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04

    punpckhbw   xmm3, xmm7              ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84

    movdqa      xmm6, xmm2              ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
    punpcklwd   xmm2, xmm4              ; 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02

    punpckhwd   xmm6, xmm4              ; 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
    movdqa      xmm5, xmm1              ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

    punpcklwd   xmm1, xmm3              ; b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82

    punpckhwd   xmm5, xmm3              ; f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
    ; xmm2 = 35 34 33 32 25 24 23 22 15 14 13 12 05 04 03 02
    ; xmm6 = 75 74 73 72 65 64 63 62 55 54 53 52 45 44 43 42
    ; xmm1 = b5 b4 b3 b2 a5 a4 a3 a2 95 94 93 92 85 84 83 82
    ; xmm5 = f5 f4 f3 f2 e5 e4 e3 e2 d5 d4 d3 d2 c5 c4 c3 c2
%endmacro

%macro BV_WRITEBACK 2
    movd        [rsi+2], %1
    movd        [rsi+4*rax+2], %2
    psrldq      %1, 4
    psrldq      %2, 4
    movd        [rdi+2], %1
    movd        [rdi+4*rax+2], %2
    psrldq      %1, 4
    psrldq      %2, 4
    movd        [rsi+2*rax+2], %1
    movd        [rsi+2*rcx+2], %2
    psrldq      %1, 4
    psrldq      %2, 4
    movd        [rdi+2*rax+2], %1
    movd        [rdi+2*rcx+2], %2
%endmacro

%if ABI_IS_32BIT

;void vp8_loop_filter_vertical_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh
;)
global sym(vp8_loop_filter_vertical_edge_sse2) PRIVATE
sym(vp8_loop_filter_vertical_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, lf_var_size

    mov         rsi, arg(0)             ; src_ptr
    movsxd      rax, dword ptr arg(1)   ; src_pixel_step

    lea         rsi, [rsi - 4]
    lea         rdi, [rsi + rax]        ; rdi points to row +1 for indirect addressing
    lea         rcx, [rax*2+rax]

    ; transpose 16x8 to 8x16, and store the 8-line result on stack.
    TRANSPOSE_16X8 1, 1

    ; calculate filter mask and high edge variance
    LFV_FILTER_MASK_HEV_MASK

    ; start work on filters
    B_FILTER 2

    ; transpose and write back - only work on q1, q0, p0, p1
    BV_TRANSPOSE
    ; store 16-line result

    lea         rdx, [rax]
    neg         rdx

    BV_WRITEBACK xmm1, xmm5

    lea         rsi, [rsi+rdx*8]
    lea         rdi, [rdi+rdx*8]
    BV_WRITEBACK xmm2, xmm6

    add         rsp, lf_var_size
    pop         rsp
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

%endif

;void vp8_loop_filter_vertical_edge_uv_sse2
;(
;    unsigned char *u,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v
;)
global sym(vp8_loop_filter_vertical_edge_uv_sse2) PRIVATE
sym(vp8_loop_filter_vertical_edge_uv_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, lf_var_size

    mov         rsi, arg(0)             ; u_ptr
    movsxd      rax, dword ptr arg(1)   ; src_pixel_step

    lea         rsi, [rsi - 4]
    lea         rdi, [rsi + rax]        ; rdi points to row +1 for indirect addressing
    lea         rcx, [rax+2*rax]

    ; transpose 16x8 to 8x16, and store the 8-line result on stack.
    TRANSPOSE_16X8 0, 1

    ; calculate filter mask and high edge variance
    LFV_FILTER_MASK_HEV_MASK

    ; start work on filters
    B_FILTER 2

    ; transpose and write back - only work on q1, q0, p0, p1
    BV_TRANSPOSE

    lea         rdi, [rsi + rax]        ; rdi points to row +1 for indirect addressing

    ; store 16-line result
    BV_WRITEBACK xmm1, xmm5

    mov         rsi, arg(0)             ; u_ptr
    lea         rsi, [rsi - 4]
    lea         rdi, [rsi + rax]        ; rdi points to row +1 for indirect addressing
    BV_WRITEBACK xmm2, xmm6

    add         rsp, lf_var_size
    pop         rsp
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

%macro MBV_TRANSPOSE 0
    movdqa      xmm0, [rsp+_p3]         ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
    movdqa      xmm1, xmm0              ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00

    punpcklbw   xmm0, xmm2              ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
    punpckhbw   xmm1, xmm2              ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80

    movdqa      xmm7, [rsp+_p1]         ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
    movdqa      xmm6, xmm7              ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02

    punpcklbw   xmm7, [rsp+_p0]         ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
    punpckhbw   xmm6, [rsp+_p0]         ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

    movdqa      xmm3, xmm0              ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
    punpcklwd   xmm0, xmm7              ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00

    punpckhwd   xmm3, xmm7              ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
    movdqa      xmm4, xmm1              ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80

    punpcklwd   xmm1, xmm6              ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
    punpckhwd   xmm4, xmm6              ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0

    movdqa      xmm7, [rsp+_q0]         ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
    punpcklbw   xmm7, [rsp+_q1]         ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04

    movdqa      xmm6, xmm5              ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
    punpcklbw   xmm6, [rsp+_q3]         ; 77 76 67 66 57 56 47 46 37 36 27 26 17 16 07 06

    movdqa      xmm2, xmm7              ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
    punpcklwd   xmm7, xmm6              ; 37 36 35 34 27 26 25 24 17 16 15 14 07 06 05 04

    punpckhwd   xmm2, xmm6              ; 77 76 75 74 67 66 65 64 57 56 55 54 47 46 45 44
    movdqa      xmm6, xmm0              ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00

    punpckldq   xmm0, xmm7              ; 17 16 15 14 13 12 11 10 07 06 05 04 03 02 01 00
    punpckhdq   xmm6, xmm7              ; 37 36 35 34 33 32 31 30 27 26 25 24 23 22 21 20
%endmacro
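
; The wide filter rewrites six of the eight columns, so the writeback
; re-stores whole 8-byte rows (the p3 and q3 columns come back from
; their unmodified spills). MBV_WRITEBACK_1 covers rows 0-7 and
; MBV_WRITEBACK_2 rows 8-15, each as movq/movhps pairs through rsi/rdi.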
%macro MBV_WRITEBACK_1 0
    movq        [rsi], xmm0
    movhps      [rdi], xmm0

    movq        [rsi+2*rax], xmm6
    movhps      [rdi+2*rax], xmm6

    movdqa      xmm0, xmm3              ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
    punpckldq   xmm0, xmm2              ; 57 56 55 54 53 52 51 50 47 46 45 44 43 42 41 40
    punpckhdq   xmm3, xmm2              ; 77 76 75 74 73 72 71 70 67 66 65 64 63 62 61 60

    movq        [rsi+4*rax], xmm0
    movhps      [rdi+4*rax], xmm0

    movq        [rsi+2*rcx], xmm3
    movhps      [rdi+2*rcx], xmm3

    movdqa      xmm7, [rsp+_q0]         ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
    punpckhbw   xmm7, [rsp+_q1]         ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84
    punpckhbw   xmm5, [rsp+_q3]         ; f7 f6 e7 e6 d7 d6 c7 c6 b7 b6 a7 a6 97 96 87 86

    movdqa      xmm0, xmm7
    punpcklwd   xmm0, xmm5              ; b7 b6 b5 b4 a7 a6 a5 a4 97 96 95 94 87 86 85 84
    punpckhwd   xmm7, xmm5              ; f7 f6 f5 f4 e7 e6 e5 e4 d7 d6 d5 d4 c7 c6 c5 c4

    movdqa      xmm5, xmm1              ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
    punpckldq   xmm1, xmm0              ; 97 96 95 94 93 92 91 90 87 86 85 84 83 82 81 80
    punpckhdq   xmm5, xmm0              ; b7 b6 b5 b4 b3 b2 b1 b0 a7 a6 a5 a4 a3 a2 a1 a0
%endmacro

%macro MBV_WRITEBACK_2 0
    movq        [rsi], xmm1
    movhps      [rdi], xmm1

    movq        [rsi+2*rax], xmm5
    movhps      [rdi+2*rax], xmm5

    movdqa      xmm1, xmm4              ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
    punpckldq   xmm1, xmm7              ; d7 d6 d5 d4 d3 d2 d1 d0 c7 c6 c5 c4 c3 c2 c1 c0
    punpckhdq   xmm4, xmm7              ; f7 f6 f5 f4 f3 f2 f1 f0 e7 e6 e5 e4 e3 e2 e1 e0

    movq        [rsi+4*rax], xmm1
    movhps      [rdi+4*rax], xmm1

    movq        [rsi+2*rcx], xmm4
    movhps      [rdi+2*rcx], xmm4
%endmacro


;void vp8_mbloop_filter_vertical_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh
;)
global sym(vp8_mbloop_filter_vertical_edge_sse2) PRIVATE
sym(vp8_mbloop_filter_vertical_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, lf_var_size

    mov         rsi, arg(0)             ; src_ptr
    movsxd      rax, dword ptr arg(1)   ; src_pixel_step

    lea         rsi, [rsi - 4]
    lea         rdi, [rsi + rax]        ; rdi points to row +1 for indirect addressing
    lea         rcx, [rax*2+rax]

    ; Transpose
    TRANSPOSE_16X8 1, 0

    ; calculate filter mask and high edge variance
    LFV_FILTER_MASK_HEV_MASK

    neg         rax
    ; start work on filters
    MB_FILTER_AND_WRITEBACK 2

    lea         rsi, [rsi+rax*8]
    lea         rdi, [rdi+rax*8]

    ; transpose and write back
    MBV_TRANSPOSE

    neg         rax

    MBV_WRITEBACK_1


    lea         rsi, [rsi+rax*8]
    lea         rdi, [rdi+rax*8]
    MBV_WRITEBACK_2

    add         rsp, lf_var_size
    pop         rsp
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;void vp8_mbloop_filter_vertical_edge_uv_sse2
;(
;    unsigned char *u,
;    int            src_pixel_step,
;    const char    *blimit,
;    const char    *limit,
;    const char    *thresh,
;    unsigned char *v
;)
global sym(vp8_mbloop_filter_vertical_edge_uv_sse2) PRIVATE
sym(vp8_mbloop_filter_vertical_edge_uv_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, lf_var_size

    mov         rsi, arg(0)             ; u_ptr
    movsxd      rax, dword ptr arg(1)   ; src_pixel_step

    lea         rsi, [rsi - 4]
    lea         rdi, [rsi + rax]        ; rdi points to row +1 for indirect addressing
    lea         rcx, [rax+2*rax]

    ; Transpose
    TRANSPOSE_16X8 0, 0

    ; calculate filter mask and high edge variance
    LFV_FILTER_MASK_HEV_MASK

    ; start work on filters
    MB_FILTER_AND_WRITEBACK 2

    ; transpose and write back
    MBV_TRANSPOSE

    mov         rsi, arg(0)             ; u_ptr
    lea         rsi, [rsi - 4]
    lea         rdi, [rsi + rax]
    MBV_WRITEBACK_1
    mov         rsi, arg(5)             ; v_ptr
    lea         rsi, [rsi - 4]
    lea         rdi, [rsi + rax]
    MBV_WRITEBACK_2

    add         rsp, lf_var_size
    pop         rsp
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
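
; The "simple" loop filter takes only blimit: the breakout test is
; abs(p0-q0)*2 + abs(p1-q1)/2 <= blimit, with no limit/thresh or hev
; logic, and only p0 and q0 are modified. SSE2 has no per-byte
; arithmetic shift, so the final >> 3 is emulated below: psrlw shifts
; whole words, t1f masks off the bits shifted in from the neighbouring
; byte, and te0 restores the sign bits captured beforehand with pcmpgtb.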
;void vp8_loop_filter_simple_horizontal_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit
;)
global sym(vp8_loop_filter_simple_horizontal_edge_sse2) PRIVATE
sym(vp8_loop_filter_simple_horizontal_edge_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    SAVE_XMM 7
    GET_GOT     rbx
    ; end prolog

    mov         rcx, arg(0)             ; src_ptr
    movsxd      rax, dword ptr arg(1)   ; src_pixel_step ; destination pitch?
    movdqa      xmm6, [GLOBAL(tfe)]
    lea         rdx, [rcx + rax]
    neg         rax

    ; calculate mask
    movdqa      xmm0, [rdx]             ; q1
    mov         rdx, arg(2)             ; blimit
    movdqa      xmm1, [rcx+2*rax]       ; p1

    movdqa      xmm2, xmm1
    movdqa      xmm3, xmm0

    psubusb     xmm0, xmm1              ; q1-=p1
    psubusb     xmm1, xmm3              ; p1-=q1
    por         xmm1, xmm0              ; abs(p1-q1)
    pand        xmm1, xmm6              ; set lsb of each byte to zero
    psrlw       xmm1, 1                 ; abs(p1-q1)/2

    movdqa      xmm7, XMMWORD PTR [rdx]

    movdqa      xmm5, [rcx+rax]         ; p0
    movdqa      xmm4, [rcx]             ; q0
    movdqa      xmm0, xmm4              ; q0
    movdqa      xmm6, xmm5              ; p0
    psubusb     xmm5, xmm4              ; p0-=q0
    psubusb     xmm4, xmm6              ; q0-=p0
    por         xmm5, xmm4              ; abs(p0 - q0)

    movdqa      xmm4, [GLOBAL(t80)]

    paddusb     xmm5, xmm5              ; abs(p0-q0)*2
    paddusb     xmm5, xmm1              ; abs (p0 - q0) *2 + abs(p1-q1)/2
    psubusb     xmm5, xmm7              ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
    pxor        xmm7, xmm7
    pcmpeqb     xmm5, xmm7


    ; start work on filters
    pxor        xmm2, xmm4              ; p1 offset to convert to signed values
    pxor        xmm3, xmm4              ; q1 offset to convert to signed values
    psubsb      xmm2, xmm3              ; p1 - q1

    pxor        xmm6, xmm4              ; offset to convert to signed values
    pxor        xmm0, xmm4              ; offset to convert to signed values
    movdqa      xmm3, xmm0              ; q0
    psubsb      xmm0, xmm6              ; q0 - p0
    paddsb      xmm2, xmm0              ; p1 - q1 + 1 * (q0 - p0)
    paddsb      xmm2, xmm0              ; p1 - q1 + 2 * (q0 - p0)
    paddsb      xmm2, xmm0              ; p1 - q1 + 3 * (q0 - p0)
    pand        xmm5, xmm2              ; mask filter values we don't care about

    movdqa      xmm0, xmm5
    paddsb      xmm5, [GLOBAL(t3)]      ; 3* (q0 - p0) + (p1 - q1) + 3
    paddsb      xmm0, [GLOBAL(t4)]      ; 3* (q0 - p0) + (p1 - q1) + 4

    movdqa      xmm1, [GLOBAL(te0)]
    movdqa      xmm2, [GLOBAL(t1f)]

;   pxor        xmm7, xmm7
    pcmpgtb     xmm7, xmm0              ; save sign
    pand        xmm7, xmm1              ; preserve the upper 3 bits
    psrlw       xmm0, 3
    pand        xmm0, xmm2              ; clear out upper 3 bits
    por         xmm0, xmm7              ; add sign
    psubsb      xmm3, xmm0              ; q0-= q0 add

    pxor        xmm7, xmm7
    pcmpgtb     xmm7, xmm5              ; save sign
    pand        xmm7, xmm1              ; preserve the upper 3 bits
    psrlw       xmm5, 3
    pand        xmm5, xmm2              ; clear out upper 3 bits
    por         xmm5, xmm7              ; add sign
    paddsb      xmm6, xmm5              ; p0+= p0 add

    pxor        xmm3, xmm4              ; unoffset
    movdqa      [rcx], xmm3             ; write back

    pxor        xmm6, xmm4              ; unoffset
    movdqa      [rcx+rax], xmm6         ; write back

    ; begin epilog
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
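
; The vertical simple filter gathers sixteen 4-byte rows (p1 p0 q0 q1
; across the edge), transposes them into four 16-lane vectors, filters
; them as above, and transposes back for two 8-row writebacks.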
;void vp8_loop_filter_simple_vertical_edge_sse2
;(
;    unsigned char *src_ptr,
;    int            src_pixel_step,
;    const char    *blimit
;)
global sym(vp8_loop_filter_simple_vertical_edge_sse2) PRIVATE
sym(vp8_loop_filter_simple_vertical_edge_sse2):
    push        rbp                     ; save old base pointer value.
    mov         rbp, rsp                ; set new base pointer value.
    SHADOW_ARGS_TO_STACK 3
    SAVE_XMM 7
    GET_GOT     rbx                     ; save callee-saved reg
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 32                 ; reserve 32 bytes
    %define t0  [rsp + 0]               ;__declspec(align(16)) char t0[16];
    %define t1  [rsp + 16]              ;__declspec(align(16)) char t1[16];

    mov         rsi, arg(0)             ; src_ptr
    movsxd      rax, dword ptr arg(1)   ; src_pixel_step ; destination pitch?

    lea         rsi, [rsi - 2]
    lea         rdi, [rsi + rax]
    lea         rdx, [rsi + rax*4]
    lea         rcx, [rdx + rax]

    movd        xmm0, [rsi]             ; (high 96 bits unused) 03 02 01 00
    movd        xmm1, [rdx]             ; (high 96 bits unused) 43 42 41 40
    movd        xmm2, [rdi]             ; 13 12 11 10
    movd        xmm3, [rcx]             ; 53 52 51 50
    punpckldq   xmm0, xmm1              ; (high 64 bits unused) 43 42 41 40 03 02 01 00
    punpckldq   xmm2, xmm3              ; 53 52 51 50 13 12 11 10

    movd        xmm4, [rsi + rax*2]     ; 23 22 21 20
    movd        xmm5, [rdx + rax*2]     ; 63 62 61 60
    movd        xmm6, [rdi + rax*2]     ; 33 32 31 30
    movd        xmm7, [rcx + rax*2]     ; 73 72 71 70
    punpckldq   xmm4, xmm5              ; 63 62 61 60 23 22 21 20
    punpckldq   xmm6, xmm7              ; 73 72 71 70 33 32 31 30

    punpcklbw   xmm0, xmm2              ; 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00
    punpcklbw   xmm4, xmm6              ; 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20

    movdqa      xmm1, xmm0
    punpcklwd   xmm0, xmm4              ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
    punpckhwd   xmm1, xmm4              ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40

    movdqa      xmm2, xmm0
    punpckldq   xmm0, xmm1              ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
    punpckhdq   xmm2, xmm1              ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02

    lea         rsi, [rsi + rax*8]
    lea         rdi, [rsi + rax]
    lea         rdx, [rsi + rax*4]
    lea         rcx, [rdx + rax]

    movd        xmm4, [rsi]             ; 83 82 81 80
    movd        xmm1, [rdx]             ; c3 c2 c1 c0
    movd        xmm6, [rdi]             ; 93 92 91 90
    movd        xmm3, [rcx]             ; d3 d2 d1 d0
    punpckldq   xmm4, xmm1              ; c3 c2 c1 c0 83 82 81 80
    punpckldq   xmm6, xmm3              ; d3 d2 d1 d0 93 92 91 90

    movd        xmm1, [rsi + rax*2]     ; a3 a2 a1 a0
    movd        xmm5, [rdx + rax*2]     ; e3 e2 e1 e0
    movd        xmm3, [rdi + rax*2]     ; b3 b2 b1 b0
    movd        xmm7, [rcx + rax*2]     ; f3 f2 f1 f0
    punpckldq   xmm1, xmm5              ; e3 e2 e1 e0 a3 a2 a1 a0
    punpckldq   xmm3, xmm7              ; f3 f2 f1 f0 b3 b2 b1 b0

    punpcklbw   xmm4, xmm6              ; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80
    punpcklbw   xmm1, xmm3              ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0

    movdqa      xmm7, xmm4
    punpcklwd   xmm4, xmm1              ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
    punpckhwd   xmm7, xmm1              ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0

    movdqa      xmm6, xmm4
    punpckldq   xmm4, xmm7              ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
    punpckhdq   xmm6, xmm7              ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82

    movdqa      xmm1, xmm0
    movdqa      xmm3, xmm2

    punpcklqdq  xmm0, xmm4              ; p1  f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
    punpckhqdq  xmm1, xmm4              ; p0  f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
    punpcklqdq  xmm2, xmm6              ; q0  f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
    punpckhqdq  xmm3, xmm6              ; q1  f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
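
    ; xmm0..xmm3 now hold p1, p0, q0 and q1, one byte lane per row for
    ; all 16 rows; the filter below matches the horizontal simple case.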

    mov         rdx, arg(2)             ; blimit

    ; calculate mask
    movdqa      xmm6, xmm0              ; p1
    movdqa      xmm7, xmm3              ; q1
    psubusb     xmm7, xmm0              ; q1-=p1
    psubusb     xmm6, xmm3              ; p1-=q1
    por         xmm6, xmm7              ; abs(p1-q1)
    pand        xmm6, [GLOBAL(tfe)]     ; set lsb of each byte to zero
    psrlw       xmm6, 1                 ; abs(p1-q1)/2

    movdqa      xmm7, [rdx]

    movdqa      xmm5, xmm1              ; p0
    movdqa      xmm4, xmm2              ; q0
    psubusb     xmm5, xmm2              ; p0-=q0
    psubusb     xmm4, xmm1              ; q0-=p0
    por         xmm5, xmm4              ; abs(p0 - q0)
    paddusb     xmm5, xmm5              ; abs(p0-q0)*2
    paddusb     xmm5, xmm6              ; abs (p0 - q0) *2 + abs(p1-q1)/2

    movdqa      xmm4, [GLOBAL(t80)]

    psubusb     xmm5, xmm7              ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
    pxor        xmm7, xmm7
    pcmpeqb     xmm5, xmm7              ; xmm5 = mask

    ; start work on filters
    movdqa      t0, xmm0
    movdqa      t1, xmm3

    pxor        xmm0, xmm4              ; p1 offset to convert to signed values
    pxor        xmm3, xmm4              ; q1 offset to convert to signed values
    psubsb      xmm0, xmm3              ; p1 - q1

    pxor        xmm1, xmm4              ; offset to convert to signed values
    pxor        xmm2, xmm4              ; offset to convert to signed values

    movdqa      xmm3, xmm2              ; offseted ; q0
    psubsb      xmm2, xmm1              ; q0 - p0
    paddsb      xmm0, xmm2              ; p1 - q1 + 1 * (q0 - p0)
    paddsb      xmm0, xmm2              ; p1 - q1 + 2 * (q0 - p0)
    paddsb      xmm0, xmm2              ; p1 - q1 + 3 * (q0 - p0)
    pand        xmm5, xmm0              ; mask filter values we don't care about

    movdqa      xmm0, xmm5
    paddsb      xmm5, [GLOBAL(t3)]      ; 3* (q0 - p0) + (p1 - q1) + 3
    paddsb      xmm0, [GLOBAL(t4)]      ; 3* (q0 - p0) + (p1 - q1) + 4

    movdqa      xmm6, [GLOBAL(te0)]
    movdqa      xmm2, [GLOBAL(t1f)]

;   pxor        xmm7, xmm7
    pcmpgtb     xmm7, xmm0              ; save sign
    pand        xmm7, xmm6              ; preserve the upper 3 bits
    psrlw       xmm0, 3
    pand        xmm0, xmm2              ; clear out upper 3 bits
    por         xmm0, xmm7              ; add sign
    psubsb      xmm3, xmm0              ; q0-= q0 add

    pxor        xmm7, xmm7
    pcmpgtb     xmm7, xmm5              ; save sign
    pand        xmm7, xmm6              ; preserve the upper 3 bits
    psrlw       xmm5, 3
    pand        xmm5, xmm2              ; clear out upper 3 bits
    por         xmm5, xmm7              ; add sign
    paddsb      xmm1, xmm5              ; p0+= p0 add

    pxor        xmm3, xmm4              ; unoffset q0
    pxor        xmm1, xmm4              ; unoffset p0

    movdqa      xmm0, t0                ; p1
    movdqa      xmm4, t1                ; q1

    ; write out order: xmm0 xmm2 xmm1 xmm3
    lea         rdx, [rsi + rax*4]

    ; transpose back to write out
    ; p1  f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
    ; p0  f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
    ; q0  f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
    ; q1  f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
    movdqa      xmm6, xmm0
    punpcklbw   xmm0, xmm1              ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
    punpckhbw   xmm6, xmm1              ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80

    movdqa      xmm5, xmm3
    punpcklbw   xmm3, xmm4              ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
    punpckhbw   xmm5, xmm4              ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82

    movdqa      xmm2, xmm0
    punpcklwd   xmm0, xmm3              ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
    punpckhwd   xmm2, xmm3              ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40

    movdqa      xmm3, xmm6
    punpcklwd   xmm6, xmm5              ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
    punpckhwd   xmm3, xmm5              ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0

    movd        [rsi], xmm6             ; write the second 8-line result
    movd        [rdx], xmm3
    psrldq      xmm6, 4
    psrldq      xmm3, 4
    movd        [rdi], xmm6
    movd        [rcx], xmm3
    psrldq      xmm6, 4
    psrldq      xmm3, 4
    movd        [rsi + rax*2], xmm6
    movd        [rdx + rax*2], xmm3
    psrldq      xmm6, 4
    psrldq      xmm3, 4
    movd        [rdi + rax*2], xmm6
    movd        [rcx + rax*2], xmm3

    neg         rax
    lea         rsi, [rsi + rax*8]
    neg         rax
    lea         rdi, [rsi + rax]
    lea         rdx, [rsi + rax*4]
    lea         rcx, [rdx + rax]

    movd        [rsi], xmm0             ; write the first 8-line result
    movd        [rdx], xmm2
    psrldq      xmm0, 4
    psrldq      xmm2, 4
    movd        [rdi], xmm0
    movd        [rcx], xmm2
    psrldq      xmm0, 4
    psrldq      xmm2, 4
    movd        [rsi + rax*2], xmm0
    movd        [rdx + rax*2], xmm2
    psrldq      xmm0, 4
    psrldq      xmm2, 4
    movd        [rdi + rax*2], xmm0
    movd        [rcx + rax*2], xmm2

    add         rsp, 32
    pop         rsp
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
align 16
tfe:
    times 16 db 0xfe
align 16
t80:
    times 16 db 0x80
align 16
t1s:
    times 16 db 0x01
align 16
t3:
    times 16 db 0x03
align 16
t4:
    times 16 db 0x04
align 16
ones:
    times 8 dw 0x0001
align 16
s9:
    times 8 dw 0x0900
align 16
s63:
    times 8 dw 0x003f
align 16
te0:
    times 16 db 0xe0
align 16
t1f:
    times 16 db 0x1f