;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
; unsigned int vp9_get_mb_ss_mmx( short *src_ptr )
;
; Returns the sum of squares of the 256 consecutive signed 16-bit values
; starting at src_ptr (16 loop iterations x 32 bytes each).
;
; Registers:
;   rax = read pointer, later the return value
;   rcx = remaining iteration count, later a scratch for the second lane
;   mm4 = two 32-bit partial sums of squares
;
; No emms is issued in this function.
; NOTE(review): presumably callers reset the MMX/x87 state - confirm.
;-----------------------------------------------------------------------
global sym(vp9_get_mb_ss_mmx) PRIVATE
sym(vp9_get_mb_ss_mmx):
    push        rbp
    mov         rbp, rsp
    ; NOTE(review): only arg(0) is read below, so a count of 7 looks larger
    ; than needed; kept as-is because the macro is defined outside this file.
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 8                          ; 8-byte scratch slot for mm4
    ; end prolog

        mov         rax, arg(0)                 ; src_ptr
        mov         rcx, 16                     ; 16 iterations of 16 words each
        pxor        mm4, mm4                    ; clear the accumulator

.NEXTROW:
        movq        mm0, [rax]                  ; load 16 words (4 per register)
        movq        mm1, [rax+8]
        movq        mm2, [rax+16]
        movq        mm3, [rax+24]
        pmaddwd     mm0, mm0                    ; square each word, add adjacent pairs
        pmaddwd     mm1, mm1
        pmaddwd     mm2, mm2
        pmaddwd     mm3, mm3

        paddd       mm4, mm0                    ; accumulate into two dword lanes
        paddd       mm4, mm1
        paddd       mm4, mm2
        paddd       mm4, mm3

        add         rax, 32
        dec         rcx
        ; dec does not write CF, so branch on ZF alone instead of using ja,
        ; which would also depend on the carry left by the add above.
        jnz         .NEXTROW

        movq        QWORD PTR [rsp], mm4        ; spill both lanes to the stack

        ;return sum[0]+sum[1];
        movsxd      rax, dword ptr [rsp]
        movsxd      rcx, dword ptr [rsp+4]
        add         rax, rcx


    ; begin epilog
    add         rsp, 8
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------
; unsigned int vp9_get8x8var_mmx
; (
;     unsigned char *src_ptr,
;     int  source_stride,
;     unsigned char *ref_ptr,
;     int  recon_stride,
;     unsigned int *SSE,
;     int *Sum
; )
;
; Walks 8 rows of 8 bytes from src_ptr and ref_ptr and computes the
; per-byte differences (src - ref). Writes the sum of the differences to
; *Sum and the sum of the squared differences to *SSE (32-bit stores).
; Always returns 0.
;
; Registers:
;   rax = current src row, rbx = current ref row
;   rcx = source_stride,   rdx = recon_stride (both sign-extended)
;   mm5 = four 16-bit lanes of accumulated differences
;   mm6 = zero, used to widen bytes to words
;   mm7 = two 32-bit lanes of accumulated squared differences
;
; Each row preloads the next ref row into mm1 before finishing its own
; accumulation; the last row does not read past row 8.
;
; No emms is issued in this function.
; NOTE(review): presumably callers reset the MMX/x87 state - confirm.
;-----------------------------------------------------------------------
global sym(vp9_get8x8var_mmx) PRIVATE
sym(vp9_get8x8var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    push        rbx
    sub         rsp, 16                         ; [rsp] = mm7 spill, [rsp+8] = mm5 spill
    ; end prolog


        pxor        mm5, mm5                    ; clear difference accumulator
        pxor        mm6, mm6                    ; zero for byte->word unpacking
        pxor        mm7, mm7                    ; clear squared-difference accumulator

        mov         rax, arg(0)                 ; src_ptr
        mov         rbx, arg(2)                 ; ref_ptr
        movsxd      rcx, dword ptr arg(1)       ; source_stride
        movsxd      rdx, dword ptr arg(3)       ; recon_stride

        ; Row 1
        movq        mm0, [rax]                  ; eight src bytes
        movq        mm1, [rbx]                  ; eight ref bytes
        movq        mm2, mm0                    ; copies for the high halves
        movq        mm3, mm1

        punpcklbw   mm0, mm6                    ; low four bytes -> words
        punpcklbw   mm1, mm6
        punpckhbw   mm2, mm6                    ; high four bytes -> words
        punpckhbw   mm3, mm6
        psubsw      mm0, mm1                    ; src - ref (low four)
        psubsw      mm2, mm3                    ; src - ref (high four)

        paddw       mm5, mm0                    ; accumulate differences
        paddw       mm5, mm2

        pmaddwd     mm0, mm0                    ; square, add adjacent pairs
        pmaddwd     mm2, mm2
        add         rbx, rdx                    ; next ref row
        add         rax, rcx                    ; next src row
        movq        mm1, [rbx]                  ; preload next ref row
        paddd       mm7, mm0                    ; accumulate squares
        paddd       mm7, mm2


        ; Row 2
        movq        mm0, [rax]                  ; eight src bytes
        movq        mm2, mm0                    ; copies for the high halves
        movq        mm3, mm1

        punpcklbw   mm0, mm6                    ; low four bytes -> words
        punpcklbw   mm1, mm6
        punpckhbw   mm2, mm6                    ; high four bytes -> words
        punpckhbw   mm3, mm6
        psubsw      mm0, mm1                    ; src - ref (low four)
        psubsw      mm2, mm3                    ; src - ref (high four)

        paddw       mm5, mm0                    ; accumulate differences
        paddw       mm5, mm2

        pmaddwd     mm0, mm0                    ; square, add adjacent pairs
        pmaddwd     mm2, mm2
        add         rbx, rdx                    ; next ref row
        add         rax, rcx                    ; next src row
        movq        mm1, [rbx]                  ; preload next ref row
        paddd       mm7, mm0                    ; accumulate squares
        paddd       mm7, mm2

        ; Row 3
        movq        mm0, [rax]                  ; eight src bytes
        movq        mm2, mm0                    ; copies for the high halves
        movq        mm3, mm1

        punpcklbw   mm0, mm6                    ; low four bytes -> words
        punpcklbw   mm1, mm6
        punpckhbw   mm2, mm6                    ; high four bytes -> words
        punpckhbw   mm3, mm6
        psubsw      mm0, mm1                    ; src - ref (low four)
        psubsw      mm2, mm3                    ; src - ref (high four)

        paddw       mm5, mm0                    ; accumulate differences
        paddw       mm5, mm2

        pmaddwd     mm0, mm0                    ; square, add adjacent pairs
        pmaddwd     mm2, mm2
        add         rbx, rdx                    ; next ref row
        add         rax, rcx                    ; next src row
        movq        mm1, [rbx]                  ; preload next ref row
        paddd       mm7, mm0                    ; accumulate squares
        paddd       mm7, mm2

        ; Row 4
        movq        mm0, [rax]                  ; eight src bytes
        movq        mm2, mm0                    ; copies for the high halves
        movq        mm3, mm1

        punpcklbw   mm0, mm6                    ; low four bytes -> words
        punpcklbw   mm1, mm6
        punpckhbw   mm2, mm6                    ; high four bytes -> words
        punpckhbw   mm3, mm6
        psubsw      mm0, mm1                    ; src - ref (low four)
        psubsw      mm2, mm3                    ; src - ref (high four)

        paddw       mm5, mm0                    ; accumulate differences
        paddw       mm5, mm2

        pmaddwd     mm0, mm0                    ; square, add adjacent pairs
        pmaddwd     mm2, mm2
        add         rbx, rdx                    ; next ref row
        add         rax, rcx                    ; next src row
        movq        mm1, [rbx]                  ; preload next ref row
        paddd       mm7, mm0                    ; accumulate squares
        paddd       mm7, mm2

        ; Row 5
        movq        mm0, [rax]                  ; eight src bytes
        movq        mm2, mm0                    ; copies for the high halves
        movq        mm3, mm1

        punpcklbw   mm0, mm6                    ; low four bytes -> words
        punpcklbw   mm1, mm6
        punpckhbw   mm2, mm6                    ; high four bytes -> words
        punpckhbw   mm3, mm6
        psubsw      mm0, mm1                    ; src - ref (low four)
        psubsw      mm2, mm3                    ; src - ref (high four)

        paddw       mm5, mm0                    ; accumulate differences
        paddw       mm5, mm2

        pmaddwd     mm0, mm0                    ; square, add adjacent pairs
        pmaddwd     mm2, mm2
        add         rbx, rdx                    ; next ref row
        add         rax, rcx                    ; next src row
        movq        mm1, [rbx]                  ; preload next ref row
        paddd       mm7, mm0                    ; accumulate squares
        paddd       mm7, mm2

        ; Row 6
        movq        mm0, [rax]                  ; eight src bytes
        movq        mm2, mm0                    ; copies for the high halves
        movq        mm3, mm1

        punpcklbw   mm0, mm6                    ; low four bytes -> words
        punpcklbw   mm1, mm6
        punpckhbw   mm2, mm6                    ; high four bytes -> words
        punpckhbw   mm3, mm6
        psubsw      mm0, mm1                    ; src - ref (low four)
        psubsw      mm2, mm3                    ; src - ref (high four)

        paddw       mm5, mm0                    ; accumulate differences
        paddw       mm5, mm2

        pmaddwd     mm0, mm0                    ; square, add adjacent pairs
        pmaddwd     mm2, mm2
        add         rbx, rdx                    ; next ref row
        add         rax, rcx                    ; next src row
        movq        mm1, [rbx]                  ; preload next ref row
        paddd       mm7, mm0                    ; accumulate squares
        paddd       mm7, mm2

        ; Row 7
        movq        mm0, [rax]                  ; eight src bytes
        movq        mm2, mm0                    ; copies for the high halves
        movq        mm3, mm1

        punpcklbw   mm0, mm6                    ; low four bytes -> words
        punpcklbw   mm1, mm6
        punpckhbw   mm2, mm6                    ; high four bytes -> words
        punpckhbw   mm3, mm6
        psubsw      mm0, mm1                    ; src - ref (low four)
        psubsw      mm2, mm3                    ; src - ref (high four)

        paddw       mm5, mm0                    ; accumulate differences
        paddw       mm5, mm2

        pmaddwd     mm0, mm0                    ; square, add adjacent pairs
        pmaddwd     mm2, mm2
        add         rbx, rdx                    ; next ref row
        add         rax, rcx                    ; next src row
        movq        mm1, [rbx]                  ; preload last ref row
        paddd       mm7, mm0                    ; accumulate squares
        paddd       mm7, mm2

        ; Row 8 (last row: no pointer advance, rax/rbx are reused below)
        movq        mm0, [rax]                  ; eight src bytes
        movq        mm2, mm0                    ; copies for the high halves
        movq        mm3, mm1

        punpcklbw   mm0, mm6                    ; low four bytes -> words
        punpcklbw   mm1, mm6
        punpckhbw   mm2, mm6                    ; high four bytes -> words
        punpckhbw   mm3, mm6
        psubsw      mm0, mm1                    ; src - ref (low four)
        psubsw      mm2, mm3                    ; src - ref (high four)

        paddw       mm5, mm0                    ; accumulate differences
        paddw       mm5, mm2

        pmaddwd     mm0, mm0                    ; square, add adjacent pairs
        pmaddwd     mm2, mm2
        paddd       mm7, mm0                    ; accumulate squares
        paddd       mm7, mm2

        ; Now accumulate the final results.
        movq        QWORD PTR [rsp+8], mm5      ; spill the four 16-bit difference lanes
        movq        QWORD PTR [rsp], mm7        ; spill the two 32-bit square lanes
        movsx       rdx, WORD PTR [rsp+8]       ; sign-extend each difference lane
        movsx       rcx, WORD PTR [rsp+10]
        movsx       rbx, WORD PTR [rsp+12]
        movsx       rax, WORD PTR [rsp+14]
        add         rdx, rcx
        add         rbx, rax
        add         rdx, rbx                    ; XSum
        movsxd      rax, DWORD PTR [rsp]
        movsxd      rcx, DWORD PTR [rsp+4]
        add         rax, rcx                    ; XXSum
        mov         rsi, arg(4)                 ; SSE
        mov         rdi, arg(5)                 ; Sum
        mov         dword ptr [rsi], eax
        mov         dword ptr [rdi], edx
        xor         rax, rax                    ; return 0


    ; begin epilog
    add         rsp, 16
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret



;-----------------------------------------------------------------------
; unsigned int
; vp9_get4x4var_mmx
; (
;     unsigned char *src_ptr,
;     int  source_stride,
;     unsigned char *ref_ptr,
;     int  recon_stride,
;     unsigned int *SSE,
;     int *Sum
; )
;
; Walks 4 rows of 4 bytes from src_ptr and ref_ptr and computes the
; per-byte differences (src - ref). Writes the sum of the differences to
; *Sum and the sum of the squared differences to *SSE (32-bit stores).
; Always returns 0.
;
; Registers:
;   rax = current src row, rbx = current ref row
;   rcx = source_stride,   rdx = recon_stride (both sign-extended)
;   mm5 = four 16-bit lanes of accumulated differences
;   mm6 = zero, used to widen bytes to words
;   mm7 = two 32-bit lanes of accumulated squared differences
;
; No emms is issued in this function.
; NOTE(review): presumably callers reset the MMX/x87 state - confirm.
;-----------------------------------------------------------------------
global sym(vp9_get4x4var_mmx) PRIVATE
sym(vp9_get4x4var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    push        rbx
    sub         rsp, 16                         ; [rsp] = mm7 spill, [rsp+8] = mm5 spill
    ; end prolog


        pxor        mm5, mm5                    ; clear difference accumulator
        pxor        mm6, mm6                    ; zero for byte->word unpacking
        pxor        mm7, mm7                    ; clear squared-difference accumulator

        mov         rax, arg(0)                 ; src_ptr
        mov         rbx, arg(2)                 ; ref_ptr
        movsxd      rcx, dword ptr arg(1)       ; source_stride
        movsxd      rdx, dword ptr arg(3)       ; recon_stride

        ; Row 1
        movd        mm0, [rax]                  ; four src bytes
        movd        mm1, [rbx]                  ; four ref bytes
        punpcklbw   mm0, mm6                    ; bytes -> words
        punpcklbw   mm1, mm6
        psubsw      mm0, mm1                    ; src - ref
        paddw       mm5, mm0                    ; accumulate differences
        pmaddwd     mm0, mm0                    ; square, add adjacent pairs
        add         rbx, rdx                    ; next ref row
        add         rax, rcx                    ; next src row
        movd        mm1, [rbx]                  ; preload next ref row
        paddd       mm7, mm0                    ; accumulate squares


        ; Row 2
        movd        mm0, [rax]                  ; four src bytes
        punpcklbw   mm0, mm6                    ; bytes -> words
        punpcklbw   mm1, mm6
        psubsw      mm0, mm1                    ; src - ref
        paddw       mm5, mm0                    ; accumulate differences

        pmaddwd     mm0, mm0                    ; square, add adjacent pairs
        add         rbx, rdx                    ; next ref row
        add         rax, rcx                    ; next src row
        movd        mm1, [rbx]                  ; preload next ref row
        paddd       mm7, mm0                    ; accumulate squares

        ; Row 3
        movd        mm0, [rax]                  ; four src bytes
        punpcklbw   mm0, mm6                    ; bytes -> words
        punpcklbw   mm1, mm6
        psubsw      mm0, mm1                    ; src - ref
        paddw       mm5, mm0                    ; accumulate differences

        pmaddwd     mm0, mm0                    ; square, add adjacent pairs
        add         rbx, rdx                    ; next ref row
        add         rax, rcx                    ; next src row
        movd        mm1, [rbx]                  ; preload last ref row
        paddd       mm7, mm0                    ; accumulate squares

        ; Row 4
        movd        mm0, [rax]                  ; four src bytes

        punpcklbw   mm0, mm6                    ; bytes -> words
        punpcklbw   mm1, mm6
        psubsw      mm0, mm1                    ; src - ref

        paddw       mm5, mm0                    ; accumulate differences

        pmaddwd     mm0, mm0                    ; square, add adjacent pairs
        paddd       mm7, mm0                    ; accumulate squares


        ; Now accumulate the final results.
        movq        QWORD PTR [rsp+8], mm5      ; spill the four 16-bit difference lanes
        movq        QWORD PTR [rsp], mm7        ; spill the two 32-bit square lanes
        movsx       rdx, WORD PTR [rsp+8]       ; sign-extend each difference lane
        movsx       rcx, WORD PTR [rsp+10]
        movsx       rbx, WORD PTR [rsp+12]
        movsx       rax, WORD PTR [rsp+14]
        add         rdx, rcx
        add         rbx, rax
        add         rdx, rbx                    ; XSum
        movsxd      rax, DWORD PTR [rsp]
        movsxd      rcx, DWORD PTR [rsp+4]
        add         rax, rcx                    ; XXSum
        mov         rsi, arg(4)                 ; SSE
        mov         rdi, arg(5)                 ; Sum
        mov         dword ptr [rsi], eax
        mov         dword ptr [rdi], edx
        xor         rax, rax                    ; return 0


    ; begin epilog
    add         rsp, 16
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret



;-----------------------------------------------------------------------
; unsigned int
; vp9_get4x4sse_cs_mmx
; (
;     unsigned char *src_ptr,
;     int  source_stride,
;     unsigned char *ref_ptr,
;     int  recon_stride
; )
;
; Walks 4 rows of 4 bytes from src_ptr and ref_ptr and returns the sum of
; the squared per-byte differences (src - ref).
;
; Registers:
;   rax = current src row, later the return value
;   rbx = current ref row
;   rcx = source_stride, rdx = recon_stride (both sign-extended)
;   mm6 = zero, used to widen bytes to words
;   mm7 = two 32-bit lanes of accumulated squared differences
;
; No emms is issued in this function.
; NOTE(review): presumably callers reset the MMX/x87 state - confirm.
;-----------------------------------------------------------------------
global sym(vp9_get4x4sse_cs_mmx) PRIVATE
sym(vp9_get4x4sse_cs_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    push        rbx
    ; end prolog


        pxor        mm6, mm6                    ; zero for byte->word unpacking
        pxor        mm7, mm7                    ; clear squared-difference accumulator

        mov         rax, arg(0)                 ; src_ptr
        mov         rbx, arg(2)                 ; ref_ptr
        movsxd      rcx, dword ptr arg(1)       ; source_stride
        movsxd      rdx, dword ptr arg(3)       ; recon_stride

        ; Row 1
        movd        mm0, [rax]                  ; four src bytes
        movd        mm1, [rbx]                  ; four ref bytes
        punpcklbw   mm0, mm6                    ; bytes -> words
        punpcklbw   mm1, mm6
        psubsw      mm0, mm1                    ; src - ref
        pmaddwd     mm0, mm0                    ; square, add adjacent pairs
        add         rbx, rdx                    ; next ref row
        add         rax, rcx                    ; next src row
        movd        mm1, [rbx]                  ; preload next ref row
        paddd       mm7, mm0                    ; accumulate squares

        ; Row 2
        movd        mm0, [rax]                  ; four src bytes
        punpcklbw   mm0, mm6                    ; bytes -> words
        punpcklbw   mm1, mm6
        psubsw      mm0, mm1                    ; src - ref
        pmaddwd     mm0, mm0                    ; square, add adjacent pairs
        add         rbx, rdx                    ; next ref row
        add         rax, rcx                    ; next src row
        movd        mm1, [rbx]                  ; preload next ref row
        paddd       mm7, mm0                    ; accumulate squares

        ; Row 3
        movd        mm0, [rax]                  ; four src bytes
        punpcklbw   mm0, mm6                    ; bytes -> words
        punpcklbw   mm1, mm6
        psubsw      mm0, mm1                    ; src - ref

        pmaddwd     mm0, mm0                    ; square, add adjacent pairs
        add         rbx, rdx                    ; next ref row
        add         rax, rcx                    ; next src row
        movd        mm1, [rbx]                  ; preload last ref row
        paddd       mm7, mm0                    ; accumulate squares

        ; Row 4
        movd        mm0, [rax]                  ; four src bytes
        punpcklbw   mm0, mm6                    ; bytes -> words
        punpcklbw   mm1, mm6
        psubsw      mm0, mm1                    ; src - ref
        pmaddwd     mm0, mm0                    ; square, add adjacent pairs
        paddd       mm7, mm0                    ; accumulate squares

        ; Fold the two dword lanes: the low dword of mm0 becomes lane0 + lane1.
        movq        mm0, mm7
        psrlq       mm7, 32

        paddd       mm0, mm7
        ; The high dword of mm0 still holds lane1, so when this moves a full
        ; 64 bits only the low 32 bits of rax are the result; the declared
        ; return type is unsigned int.
        movq        rax, mm0


    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret