;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

; Every routine in this file computes a sum of squared differences between
; two arrays of 16-bit values: subtract word-wise (psubw), square and add
; adjacent pairs into dwords (pmaddwd), accumulate the dwords (paddd), then
; fold the dword lanes into one total that is written to rax.
; The MMX routines do not issue emms themselves.

;int vp8_block_error_xmm(short *coeff_ptr,  short *dcoef_ptr)
;
; Sum of squared differences over 16 coefficients (32 bytes per array).
; The 128-bit memory operands below require 16-byte aligned pointers.
global sym(vp8_block_error_xmm) PRIVATE
sym(vp8_block_error_xmm):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 2
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0)          ; coeff_ptr
    mov         rdi,        arg(1)          ; dcoef_ptr

    pxor        xmm4,       xmm4            ; zero, used to widen dwords below

    ; difference of coefficients 0-7 and 8-15
    movdqa      xmm0,       [rsi]
    movdqa      xmm1,       [rsi+16]
    psubw       xmm0,       [rdi]
    psubw       xmm1,       [rdi+16]

    ; square and pair-add, then merge into four dword partial sums
    pmaddwd     xmm0,       xmm0
    pmaddwd     xmm1,       xmm1
    paddd       xmm0,       xmm1            ; s0 s1 s2 s3

    ; horizontal add of the four dwords
    movdqa      xmm1,       xmm0
    punpckldq   xmm0,       xmm4            ; s0  0 s1  0
    punpckhdq   xmm1,       xmm4            ; s2  0 s3  0
    paddd       xmm0,       xmm1            ; s0+s2 0 s1+s3 0

    movdqa      xmm1,       xmm0
    psrldq      xmm1,       8               ; bring s1+s3 down to lane 0
    paddd       xmm0,       xmm1            ; lane 0 = s0+s1+s2+s3, lane 1 = 0

    movq        rax,        xmm0

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

;int vp8_block_error_mmx(short *coeff_ptr,  short *dcoef_ptr)
;
; MMX variant of the 16-coefficient block error. The first four differences
; pass through the same dc mask used by vp8_mbblock_error_mmx_impl, but with
; dc fixed at 0 the mask is all ones and nothing is dropped.
global sym(vp8_block_error_mmx) PRIVATE
sym(vp8_block_error_mmx):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 2
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0)          ; coeff_ptr
    mov         rdi,        arg(1)          ; dcoef_ptr

    ; build the mask for dc == 0
    pxor        mm7,        mm7             ; zero
    pxor        mm1,        mm1             ; stands in for "movd mm1, dc" with dc = 0
    por         mm1,        mm7
    pcmpeqw     mm1,        mm7             ; every word equals 0 -> mask = all ones

    ; coefficients 0-3 (masked)
    movq        mm0,        [rsi]
    psubw       mm0,        [rdi]
    pand        mm1,        mm0
    pmaddwd     mm1,        mm1             ; mm1 becomes the running total

    ; coefficients 4-7
    movq        mm2,        [rsi+8]
    psubw       mm2,        [rdi+8]
    pmaddwd     mm2,        mm2
    paddd       mm1,        mm2

    ; coefficients 8-11
    movq        mm2,        [rsi+16]
    psubw       mm2,        [rdi+16]
    pmaddwd     mm2,        mm2
    paddd       mm1,        mm2

    ; coefficients 12-15
    movq        mm2,        [rsi+24]
    psubw       mm2,        [rdi+24]
    pmaddwd     mm2,        mm2
    paddd       mm1,        mm2

    ; fold the two dword lanes: low dword of mm0 = lo + hi
    movq        mm0,        mm1
    psrlq       mm1,        32
    paddd       mm0,        mm1

    movq        rax,        mm0

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret


;int vp8_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
;
; Sum of squared differences over 16 consecutive blocks of 16 coefficients.
; dc is turned into a per-word mask by comparing each 16-bit word of the
; zero-extended dc value against 0; the mask is applied to the first four
; differences of every block, so a dc whose low word is non-zero removes
; coefficient 0 of each block from the sum.
global sym(vp8_mbblock_error_mmx_impl) PRIVATE
sym(vp8_mbblock_error_mmx_impl):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 3
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0)          ; coeff_ptr
    mov         rdi,        arg(1)          ; dcoef_ptr

    pxor        mm7,        mm7             ; zero
    pxor        mm0,        mm0             ; running total (two dword lanes)

    movd        mm6,        dword ptr arg(2) ; dc
    por         mm6,        mm7
    pcmpeqw     mm6,        mm7             ; word mask: 0xffff where dc word == 0

    mov         rcx,        16              ; one iteration per block

.mb_block_loop_mmx:
    ; coefficients 0-3 (masked)
    movq        mm1,        [rsi]
    psubw       mm1,        [rdi]
    pand        mm1,        mm6
    pmaddwd     mm1,        mm1
    paddd       mm0,        mm1

    ; coefficients 4-7
    movq        mm2,        [rsi+8]
    psubw       mm2,        [rdi+8]
    pmaddwd     mm2,        mm2
    paddd       mm0,        mm2

    ; coefficients 8-11
    movq        mm3,        [rsi+16]
    psubw       mm3,        [rdi+16]
    pmaddwd     mm3,        mm3
    paddd       mm0,        mm3

    ; coefficients 12-15
    movq        mm4,        [rsi+24]
    psubw       mm4,        [rdi+24]
    pmaddwd     mm4,        mm4
    paddd       mm0,        mm4

    ; advance both pointers by one block (16 shorts)
    add         rsi,        32
    add         rdi,        32

    sub         rcx,        1
    jnz         .mb_block_loop_mmx

    ; fold the two dword lanes: low dword of mm0 = lo + hi
    movq        mm1,        mm0
    psrlq       mm1,        32
    paddd       mm0,        mm1

    movq        rax,        mm0

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret


;int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
;
; SSE2 variant of the 16-block error with dc masking. The mask covers the
; first eight differences of each block; only the words overlapping the
; 32-bit dc value can ever be cleared, the remaining words are always kept.
; xmm6 is used as the zero register, hence SAVE_XMM 6 / RESTORE_XMM.
global sym(vp8_mbblock_error_xmm_impl) PRIVATE
sym(vp8_mbblock_error_xmm_impl):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 3
    SAVE_XMM 6
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0)          ; coeff_ptr
    mov         rdi,        arg(1)          ; dcoef_ptr

    pxor        xmm6,       xmm6            ; zero
    pxor        xmm4,       xmm4            ; running total (four dword lanes)

    movd        xmm5,       dword ptr arg(2) ; dc
    por         xmm5,       xmm6
    pcmpeqw     xmm5,       xmm6            ; word mask: 0xffff where dc word == 0

    mov         rcx,        16              ; one iteration per block

.mb_block_loop:
    ; differences for coefficients 0-7 (masked) and 8-15
    movdqa      xmm0,       [rsi]
    movdqa      xmm1,       [rsi+16]
    psubw       xmm0,       [rdi]
    psubw       xmm1,       [rdi+16]
    pand        xmm0,       xmm5

    pmaddwd     xmm0,       xmm0
    pmaddwd     xmm1,       xmm1

    paddd       xmm4,       xmm0
    paddd       xmm4,       xmm1

    ; advance both pointers by one block (16 shorts)
    add         rsi,        32
    add         rdi,        32

    sub         rcx,        1
    jnz         .mb_block_loop

    ; horizontal add of the four dword lanes
    movdqa      xmm0,       xmm4
    punpckldq   xmm0,       xmm6            ; s0  0 s1  0
    punpckhdq   xmm4,       xmm6            ; s2  0 s3  0
    paddd       xmm0,       xmm4            ; s0+s2 0 s1+s3 0

    movdqa      xmm1,       xmm0
    psrldq      xmm1,       8               ; bring s1+s3 down to lane 0
    paddd       xmm0,       xmm1            ; lane 0 = total, lane 1 = 0

    movq        rax,        xmm0

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;int vp8_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
;
; Sum of squared differences over 128 values: 16 iterations, each covering
; 8 shorts (16 bytes) from both arrays.
global sym(vp8_mbuverror_mmx_impl) PRIVATE
sym(vp8_mbuverror_mmx_impl):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 2
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0)          ; s_ptr
    mov         rdi,        arg(1)          ; d_ptr

    pxor        mm0,        mm0             ; running total (two dword lanes)
    mov         rcx,        16

.uv_loop_mmx:
    ; differences for values 0-3 and 4-7 of this group
    movq        mm1,        [rsi]
    movq        mm2,        [rsi+8]
    psubw       mm1,        [rdi]
    psubw       mm2,        [rdi+8]

    pmaddwd     mm1,        mm1
    pmaddwd     mm2,        mm2

    paddd       mm0,        mm1
    paddd       mm0,        mm2

    add         rsi,        16
    add         rdi,        16

    dec         rcx
    jnz         .uv_loop_mmx

    ; fold the two dword lanes: low dword of mm0 = lo + hi
    movq        mm1,        mm0
    psrlq       mm1,        32
    paddd       mm0,        mm1

    movq        rax,        mm0

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret


;int vp8_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
;
; SSE2 variant: 16 iterations of 8 shorts (16 bytes) each, 128 values total.
; The 128-bit memory operands below require 16-byte aligned pointers.
global sym(vp8_mbuverror_xmm_impl) PRIVATE
sym(vp8_mbuverror_xmm_impl):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 2
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0)          ; s_ptr
    mov         rdi,        arg(1)          ; d_ptr

    pxor        xmm0,       xmm0            ; running total (four dword lanes)
    mov         rcx,        16

.uv_loop:
    movdqa      xmm1,       [rsi]
    psubw       xmm1,       [rdi]
    pmaddwd     xmm1,       xmm1
    paddd       xmm0,       xmm1

    add         rsi,        16
    add         rdi,        16

    dec         rcx
    jnz         .uv_loop

    ; horizontal add of the four dword lanes
    pxor        xmm2,       xmm2            ; zero
    movdqa      xmm1,       xmm0
    punpckldq   xmm0,       xmm2            ; s0  0 s1  0
    punpckhdq   xmm1,       xmm2            ; s2  0 s3  0
    paddd       xmm0,       xmm1            ; s0+s2 0 s1+s3 0

    movdqa      xmm1,       xmm0
    psrldq      xmm1,       8               ; bring s1+s3 down to lane 0
    paddd       xmm0,       xmm1            ; lane 0 = total, lane 1 = 0

    movq        rax,        xmm0

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret