;
;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "vpx_ports/x86_abi_support.asm"

; Rounding term: the 16-bit value 64 (0x0040) in both halves of a dword.
; The GET_PARAM* macros broadcast it to all eight word lanes.
%define BILINEAR_ROUND_PAIR 0x0400040

; Arithmetic right shift applied after the rounding term is added, i.e.
; together with the +64 above a rounded divide by 128.
%define BILINEAR_SHIFT      7

;------------------------------------------------------------------------------
; GET_PARAM_4
;   Loads the arguments and filter constants for the 4-pixel-wide kernels.
;
;   On exit:
;     rsi  = src_ptr (arg 0)          rax = pixels_per_line (arg 1)
;     rdi  = output_ptr (arg 2)       rdx = out_pitch (arg 3)
;     rcx  = output_height (arg 4)
;     xmm2 = 0
;     xmm3 = rounding term (64) in every word
;     xmm4 = filter word 3 (k3) in words 0-3, filter word 4 (k4) in words 4-7
;
;   The filter (arg 5) is read with movdqa, so it must be 16-byte aligned.
;------------------------------------------------------------------------------
%macro GET_PARAM_4 0
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, BILINEAR_ROUND_PAIR

    movdqa      xmm3, [rdx]                 ;load filters
    pshuflw     xmm4, xmm3, 11111111b       ;k3 in the low four words
    psrldq      xmm3, 8                     ;bring words 4-7 down to 0-3
    pshuflw     xmm3, xmm3, 0b              ;k4 in the low four words
    punpcklqdq  xmm4, xmm3                  ;k3k4

    movq        xmm3, rcx                   ;rounding
    pshufd      xmm3, xmm3, 0               ;broadcast to every dword

    pxor        xmm2, xmm2                  ;zero, used to widen bytes

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
%endm

;------------------------------------------------------------------------------
; APPLY_FILTER_4 avg
;   Filters one row of 4 pixels and advances to the next row.
;
;   On entry (registers as left by GET_PARAM_4):
;     xmm0 low dword = 4 source pixels for the k3 tap
;     xmm1 low dword = 4 source pixels for the k4 tap
;
;   %1 = 0: store the filtered pixels.
;   %1 = 1: average (pavgb) the filtered pixels with the 4 bytes already at
;           [rdi], then store.
;
;   Clobbers xmm0, xmm1. Adds rax to rsi and rdx to rdi, then decrements rcx.
;   The dec is the last instruction, so the caller's jnz tests the row count.
;------------------------------------------------------------------------------
%macro APPLY_FILTER_4 1

    punpckldq   xmm0, xmm1                  ;two rows in one register
    punpcklbw   xmm0, xmm2                  ;unpack to word
    pmullw      xmm0, xmm4                  ;multiply the filter factors

    movdqa      xmm1, xmm0
    psrldq      xmm1, 8                     ;k4 products down to words 0-3
    paddsw      xmm0, xmm1                  ;k3 products + k4 products

    paddsw      xmm0, xmm3                  ;rounding
    psraw       xmm0, BILINEAR_SHIFT        ;shift
    packuswb    xmm0, xmm0                  ;pack to byte

%if %1
    movd        xmm1, [rdi]                 ;current output pixels
    pavgb       xmm0, xmm1
%endif

    movd        [rdi], xmm0
    lea         rsi, [rsi + rax]            ;next source row
    lea         rdi, [rdi + rdx]            ;next output row
    dec         rcx                         ;sets ZF for the caller's jnz
%endm

;------------------------------------------------------------------------------
; GET_PARAM
;   Loads the arguments and filter constants for the 8- and 16-pixel-wide
;   kernels.
;
;   On exit:
;     rsi  = src_ptr (arg 0)          rax = pixels_per_line (arg 1)
;     rdi  = output_ptr (arg 2)       rdx = out_pitch (arg 3)
;     rcx  = output_height (arg 4)
;     xmm4 = rounding term (64) in every word
;     xmm5 = 0
;     xmm6 = filter word 3 (k3) in every word
;     xmm7 = filter word 4 (k4) in every word
;
;   The filter (arg 5) is read with movdqa, so it must be 16-byte aligned.
;------------------------------------------------------------------------------
%macro GET_PARAM 0
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, BILINEAR_ROUND_PAIR

    movdqa      xmm7, [rdx]                 ;load filters

    pshuflw     xmm6, xmm7, 11111111b       ;k3 in the low four words
    pshufhw     xmm7, xmm7, 0b              ;k4 in the high four words
    punpcklwd   xmm6, xmm6                  ;k3 in all eight words
    punpckhwd   xmm7, xmm7                  ;k4 in all eight words

    movq        xmm4, rcx                   ;rounding
    pshufd      xmm4, xmm4, 0               ;broadcast to every dword

    pxor        xmm5, xmm5                  ;zero, used to widen bytes

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
%endm

;------------------------------------------------------------------------------
; APPLY_FILTER_8 avg
;   Filters one row of 8 pixels and advances to the next row.
;
;   On entry (registers as left by GET_PARAM):
;     xmm0 low 8 bytes = source pixels for the k3 tap
;     xmm1 low 8 bytes = source pixels for the k4 tap
;
;   %1 = 0: store the filtered pixels.
;   %1 = 1: average (pavgb) the filtered pixels with the 8 bytes already at
;           [rdi], then store.
;
;   Clobbers xmm0, xmm1. Adds rax to rsi and rdx to rdi, then decrements rcx.
;   The dec is the last instruction, so the caller's jnz tests the row count.
;------------------------------------------------------------------------------
%macro APPLY_FILTER_8 1
    punpcklbw   xmm0, xmm5                  ;unpack to word
    punpcklbw   xmm1, xmm5

    pmullw      xmm0, xmm6                  ;* k3
    pmullw      xmm1, xmm7                  ;* k4
    paddsw      xmm0, xmm1
    paddsw      xmm0, xmm4                  ;rounding
    psraw       xmm0, BILINEAR_SHIFT        ;shift
    packuswb    xmm0, xmm0                  ;pack back to byte
%if %1
    movq        xmm1, [rdi]                 ;current output pixels
    pavgb       xmm0, xmm1
%endif
    movq        [rdi], xmm0                 ;store the result

    lea         rsi, [rsi + rax]            ;next source row
    lea         rdi, [rdi + rdx]            ;next output row
    dec         rcx                         ;sets ZF for the caller's jnz
%endm

;------------------------------------------------------------------------------
; APPLY_FILTER_16 avg
;   Filters one row of 16 pixels and advances to the next row.
;
;   On entry (registers as left by GET_PARAM):
;     xmm0 and xmm2 = the same 16 source pixels for the k3 tap
;     xmm1 and xmm3 = the same 16 source pixels for the k4 tap
;   xmm0/xmm1 supply pixels 0-7 and xmm2/xmm3 supply pixels 8-15.
;
;   %1 = 0: store the filtered pixels.
;   %1 = 1: average (pavgb) the filtered pixels with the 16 bytes already at
;           [rdi], then store.
;
;   Clobbers xmm0-xmm3. Adds rax to rsi and rdx to rdi, then decrements rcx.
;   The dec is the last instruction, so the caller's jnz tests the row count.
;------------------------------------------------------------------------------
%macro APPLY_FILTER_16 1
    punpcklbw   xmm0, xmm5                  ;pixels 0-7 to word
    punpcklbw   xmm1, xmm5
    punpckhbw   xmm2, xmm5                  ;pixels 8-15 to word
    punpckhbw   xmm3, xmm5

    pmullw      xmm0, xmm6                  ;* k3
    pmullw      xmm1, xmm7                  ;* k4
    pmullw      xmm2, xmm6                  ;* k3
    pmullw      xmm3, xmm7                  ;* k4

    paddsw      xmm0, xmm1
    paddsw      xmm2, xmm3

    paddsw      xmm0, xmm4                  ;rounding
    paddsw      xmm2, xmm4
    psraw       xmm0, BILINEAR_SHIFT        ;shift
    psraw       xmm2, BILINEAR_SHIFT
    packuswb    xmm0, xmm2                  ;pack back to byte
%if %1
    movdqu      xmm1, [rdi]                 ;current output pixels
    pavgb       xmm0, xmm1
%endif
    movdqu      [rdi], xmm0                 ;store the result

    lea         rsi, [rsi + rax]            ;next source row
    lea         rdi, [rdi + rdx]            ;next output row
    dec         rcx                         ;sets ZF for the caller's jnz
%endm

SECTION .text

;------------------------------------------------------------------------------
; Entry points
;
; Every function below takes the same six arguments:
;   arg(0) src_ptr          arg(1) pixels_per_line    arg(2) output_ptr
;   arg(3) out_pitch        arg(4) output_height      arg(5) filter
;
; Naming:
;   block1d{4,8,16}  number of pixels produced per row
;   v2               taps are [src] and [src + pixels_per_line] (vertical)
;   h2               taps are [src] and [src + 1] (horizontal)
;   avg              result is averaged with the bytes already in the output
;
; The row loop is bottom-tested (one row is always processed before the
; count is checked), so output_height must be at least 1.
;------------------------------------------------------------------------------

; 2-tap vertical filter, 4 pixels per row.
global sym(vpx_filter_block1d4_v2_sse2) PRIVATE
sym(vpx_filter_block1d4_v2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM_4
.loop:
    movd        xmm0, [rsi]                 ;load src
    movd        xmm1, [rsi + rax]           ;same columns, next row

    APPLY_FILTER_4 0
    jnz         .loop

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

; 2-tap vertical filter, 8 pixels per row.
global sym(vpx_filter_block1d8_v2_sse2) PRIVATE
sym(vpx_filter_block1d8_v2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.loop:
    movq        xmm0, [rsi]                 ;0
    movq        xmm1, [rsi + rax]           ;1

    APPLY_FILTER_8 0
    jnz         .loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; 2-tap vertical filter, 16 pixels per row.
global sym(vpx_filter_block1d16_v2_sse2) PRIVATE
sym(vpx_filter_block1d16_v2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.loop:
    movdqu      xmm0, [rsi]                 ;0
    movdqu      xmm1, [rsi + rax]           ;1
    movdqa      xmm2, xmm0                  ;copies for the high 8 pixels
    movdqa      xmm3, xmm1

    APPLY_FILTER_16 0
    jnz         .loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; 2-tap vertical filter, 4 pixels per row, averaged with the output.
global sym(vpx_filter_block1d4_v2_avg_sse2) PRIVATE
sym(vpx_filter_block1d4_v2_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM_4
.loop:
    movd        xmm0, [rsi]                 ;load src
    movd        xmm1, [rsi + rax]           ;same columns, next row

    APPLY_FILTER_4 1
    jnz         .loop

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

; 2-tap vertical filter, 8 pixels per row, averaged with the output.
global sym(vpx_filter_block1d8_v2_avg_sse2) PRIVATE
sym(vpx_filter_block1d8_v2_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.loop:
    movq        xmm0, [rsi]                 ;0
    movq        xmm1, [rsi + rax]           ;1

    APPLY_FILTER_8 1
    jnz         .loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; 2-tap vertical filter, 16 pixels per row, averaged with the output.
global sym(vpx_filter_block1d16_v2_avg_sse2) PRIVATE
sym(vpx_filter_block1d16_v2_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.loop:
    movdqu      xmm0, [rsi]                 ;0
    movdqu      xmm1, [rsi + rax]           ;1
    movdqa      xmm2, xmm0                  ;copies for the high 8 pixels
    movdqa      xmm3, xmm1

    APPLY_FILTER_16 1
    jnz         .loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; 2-tap horizontal filter, 4 pixels per row.
; Loads 16 source bytes per row; only the first 5 contribute to the result.
global sym(vpx_filter_block1d4_h2_sse2) PRIVATE
sym(vpx_filter_block1d4_h2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM_4
.loop:
    movdqu      xmm0, [rsi]                 ;load src
    movdqa      xmm1, xmm0
    psrldq      xmm1, 1                     ;byte i = src[i + 1]

    APPLY_FILTER_4 0
    jnz         .loop

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

; 2-tap horizontal filter, 8 pixels per row.
; Loads 16 source bytes per row; only the first 9 contribute to the result.
global sym(vpx_filter_block1d8_h2_sse2) PRIVATE
sym(vpx_filter_block1d8_h2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.loop:
    movdqu      xmm0, [rsi]                 ;load src
    movdqa      xmm1, xmm0
    psrldq      xmm1, 1                     ;byte i = src[i + 1]

    APPLY_FILTER_8 0
    jnz         .loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; 2-tap horizontal filter, 16 pixels per row.
; Reads source bytes 0-16 of each row.
global sym(vpx_filter_block1d16_h2_sse2) PRIVATE
sym(vpx_filter_block1d16_h2_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.loop:
    movdqu      xmm0, [rsi]                 ;load src
    movdqu      xmm1, [rsi + 1]             ;one pixel to the right
    movdqa      xmm2, xmm0                  ;copies for the high 8 pixels
    movdqa      xmm3, xmm1

    APPLY_FILTER_16 0
    jnz         .loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; 2-tap horizontal filter, 4 pixels per row, averaged with the output.
; Loads 16 source bytes per row; only the first 5 contribute to the result.
global sym(vpx_filter_block1d4_h2_avg_sse2) PRIVATE
sym(vpx_filter_block1d4_h2_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM_4
.loop:
    movdqu      xmm0, [rsi]                 ;load src
    movdqa      xmm1, xmm0
    psrldq      xmm1, 1                     ;byte i = src[i + 1]

    APPLY_FILTER_4 1
    jnz         .loop

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

; 2-tap horizontal filter, 8 pixels per row, averaged with the output.
; Loads 16 source bytes per row; only the first 9 contribute to the result.
global sym(vpx_filter_block1d8_h2_avg_sse2) PRIVATE
sym(vpx_filter_block1d8_h2_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.loop:
    movdqu      xmm0, [rsi]                 ;load src
    movdqa      xmm1, xmm0
    psrldq      xmm1, 1                     ;byte i = src[i + 1]

    APPLY_FILTER_8 1
    jnz         .loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; 2-tap horizontal filter, 16 pixels per row, averaged with the output.
; Reads source bytes 0-16 of each row.
global sym(vpx_filter_block1d16_h2_avg_sse2) PRIVATE
sym(vpx_filter_block1d16_h2_avg_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    GET_PARAM
.loop:
    movdqu      xmm0, [rsi]                 ;load src
    movdqu      xmm1, [rsi + 1]             ;one pixel to the right
    movdqa      xmm2, xmm0                  ;copies for the high 8 pixels
    movdqa      xmm3, xmm1

    APPLY_FILTER_16 1
    jnz         .loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret