;
; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;

%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------------
; Two-tap (bilinear) 1-D filters, SSSE3.
;
; Syntax: NASM/Yasm (Intel operand order). Argument access, symbol decoration
; and register save/restore go through the macros supplied by
; x86_abi_support.asm (arg, sym, PRIVATE, SHADOW_ARGS_TO_STACK, SAVE_XMM, ...).
;
; Every exported routine takes the same six arguments:
;   arg(0)  src_ptr          first source byte of the first row
;   arg(1)  pixels_per_line  source row stride (32-bit, sign-extended)
;   arg(2)  output_ptr       first destination byte of the first row
;   arg(3)  out_pitch        destination row stride (32-bit, sign-extended)
;   arg(4)  output_height    row count (32-bit, sign-extended)
;   arg(5)  filter ptr       16 bytes of 16-bit taps, read with movdqa, so the
;                            pointer has to be 16-byte aligned
;
; Only the 16-bit taps at word index 3 and 4 (k3, k4) are used. Per output
; byte, with a = current pixel and b = its neighbour (next row for the "v2"
; routines, next column for the "h2" routines):
;
;   out = saturate_u8((a * k3 + b * k4 + 64) >> 7)
;
; The "_avg" routines then replace out with pavgb(out, existing destination
; byte), i.e. the rounded-up average of the two.
;
; Register roles inside the row loop:
;   rsi = source row        rax = source stride
;   rdi = destination row   rdx = destination stride
;   rcx = rows remaining
;
; The row counter is decremented and tested only after a row has been
; written, so output_height must be at least 1.
;-----------------------------------------------------------------------------

; Textual substitutions only, so the assembled operands are exactly the
; literals they replace.

; 0x0040 in each 16-bit half of a dword: the +64 rounding term, two lanes wide.
%define BILIN_ROUND_PAIR    0x0400040
; Arithmetic right shift applied after rounding.
%define BILIN_SHIFT         7
; Byte offset of k3 inside the tap array (three 16-bit taps are skipped).
%define BILIN_TAP_BYTE_OFS  6

;-----------------------------------------------------------------------------
; BILIN_ENTER / BILIN_LEAVE  uses_high_xmm
;   Common prologue and epilogue. Pass 1 when the body touches xmm6/xmm7 so
;   that SAVE_XMM 7 / RESTORE_XMM are emitted around it.
;   NOTE(review): SAVE_XMM and RESTORE_XMM live in x86_abi_support.asm;
;   presumably they preserve xmm6-xmm7 on ABIs that treat them as
;   callee-saved - confirm there.
;-----------------------------------------------------------------------------
%macro BILIN_ENTER 1
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
%if %1
    SAVE_XMM 7
%endif
    push        rsi
    push        rdi
    ; end prolog
%endm

%macro BILIN_LEAVE 1
    ; begin epilog
    pop         rdi
    pop         rsi
%if %1
    RESTORE_XMM
%endif
    UNSHADOW_ARGS
    pop         rbp
    ret
%endm

;-----------------------------------------------------------------------------
; BILIN_LOAD_ARGS  taps_reg, round_reg, widen
;   Loads the six arguments into the loop registers listed in the file header
;   and prepares the two vector constants:
;     taps_reg   k3,k4 packed to signed bytes and repeated as a 16-bit pair
;                across the low four words (widen = 0) or all eight words
;                (widen = 1)
;     round_reg  64 in every 16-bit lane
;   rdx first carries the filter pointer and rcx the rounding literal; both
;   are overwritten with their loop values at the end.
;-----------------------------------------------------------------------------
%macro BILIN_LOAD_ARGS 3
    mov         rdx, arg(5)                 ; filter ptr
    mov         rsi, arg(0)                 ; src_ptr
    mov         rdi, arg(2)                 ; output_ptr
    mov         rcx, BILIN_ROUND_PAIR

    movdqa      %1, [rdx]                   ; all eight 16-bit taps
    psrldq      %1, BILIN_TAP_BYTE_OFS      ; k3 now sits in word 0
    packsswb    %1, %1                      ; words -> signed bytes
    pshuflw     %1, %1, 0b                  ; k3_k4 in the low four words
%if %3
    punpcklwd   %1, %1                      ; ...and in the high four as well
%endif

    movq        %2, rcx                     ; rounding
    pshufd      %2, %2, 0                   ; copy low dword to all four dwords

    movsxd      rax, DWORD PTR arg(1)       ; pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ; out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ; output_height
%endm

;-----------------------------------------------------------------------------
; BILIN_FILTER_LOW  taps_reg, round_reg
;   In:  xmm0 = pixels a, xmm1 = pixels b (low eight bytes of each are used)
;   Out: xmm0 = filtered bytes; both qwords hold the same packed result
;-----------------------------------------------------------------------------
%macro BILIN_FILTER_LOW 2
    punpcklbw   xmm0, xmm1                  ; a0 b0 a1 b1 ...
    pmaddubsw   xmm0, %1                    ; a*k3 + b*k4 per 16-bit lane

    paddsw      xmm0, %2                    ; rounding
    psraw       xmm0, BILIN_SHIFT           ; shift
    packuswb    xmm0, xmm0                  ; saturate back to bytes
%endm

;-----------------------------------------------------------------------------
; BILIN_NEXT_ROW
;   Steps both row pointers and counts the row down. lea leaves the flags
;   alone and dec comes last, so ZF reflects rcx for the jnz that follows the
;   macro.
;-----------------------------------------------------------------------------
%macro BILIN_NEXT_ROW 0
    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
%endm

;-----------------------------------------------------------------------------
; BILIN_STORE_4 / BILIN_STORE_8 / BILIN_STORE_16  average
;   Filter one row from xmm0/xmm1 (BILIN_STORE_16 also needs xmm2 = copy of
;   xmm0), optionally average with the bytes already at [rdi], store 4, 8 or
;   16 bytes, then advance to the next row.
;-----------------------------------------------------------------------------
%macro BILIN_STORE_4 1
    BILIN_FILTER_LOW xmm3, xmm2

%if %1
    movd        xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif
    movd        [rdi], xmm0
    BILIN_NEXT_ROW
%endm

%macro BILIN_STORE_8 1
    BILIN_FILTER_LOW xmm7, xmm6

%if %1
    movq        xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif
    movq        [rdi], xmm0                 ; store the result

    BILIN_NEXT_ROW
%endm

%macro BILIN_STORE_16 1
    punpcklbw   xmm0, xmm1                  ; pixels 0..7 paired with b
    punpckhbw   xmm2, xmm1                  ; pixels 8..15 paired with b
    pmaddubsw   xmm0, xmm7
    pmaddubsw   xmm2, xmm7

    paddsw      xmm0, xmm6                  ; rounding
    paddsw      xmm2, xmm6
    psraw       xmm0, BILIN_SHIFT           ; shift
    psraw       xmm2, BILIN_SHIFT
    packuswb    xmm0, xmm2                  ; pack back to byte

%if %1
    movdqu      xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif
    movdqu      [rdi], xmm0                 ; store the result

    BILIN_NEXT_ROW
%endm

;-----------------------------------------------------------------------------
; BILIN_BODY_4 / BILIN_BODY_8 / BILIN_BODY_16  horizontal, average
;   Complete function body (prologue through ret) for one block width.
;     horizontal = 0  neighbour pixel is one source stride further on (v2)
;     horizontal = 1  neighbour pixel is one byte further on          (h2)
;     average    = 1  blend the result with the existing destination (_avg)
;   ".loop" is a local label, so it binds to the exported label that
;   immediately precedes each expansion.
;
;   The horizontal 4- and 8-wide paths fetch 16 source bytes per row with a
;   single movdqu although only the first 5 resp. 9 contribute to the stored
;   result; the horizontal 16-wide path reads 17 bytes per row.
;-----------------------------------------------------------------------------
%macro BILIN_BODY_4 2
    BILIN_ENTER 0

    BILIN_LOAD_ARGS xmm3, xmm2, 0
.loop:
%if %1
    movdqu      xmm0, [rsi]                 ; load src
    movdqa      xmm1, xmm0
    psrldq      xmm1, 1                     ; same row, one pixel to the right
%else
    movd        xmm0, [rsi]                 ; load src
    movd        xmm1, [rsi + rax]           ; row below
%endif

    BILIN_STORE_4 %2
    jnz         .loop

    BILIN_LEAVE 0
%endm

%macro BILIN_BODY_8 2
    BILIN_ENTER 1

    BILIN_LOAD_ARGS xmm7, xmm6, 1
.loop:
%if %1
    movdqu      xmm0, [rsi]                 ; load src
    movdqa      xmm1, xmm0
    psrldq      xmm1, 1                     ; same row, one pixel to the right
%else
    movq        xmm0, [rsi]                 ; row 0
    movq        xmm1, [rsi + rax]           ; row 1
%endif

    BILIN_STORE_8 %2
    jnz         .loop

    BILIN_LEAVE 1
%endm

%macro BILIN_BODY_16 2
    BILIN_ENTER 1

    BILIN_LOAD_ARGS xmm7, xmm6, 1
.loop:
%if %1
    movdqu      xmm0, [rsi]                 ; load src
    movdqu      xmm1, [rsi + 1]             ; same row, one pixel to the right
%else
    movdqu      xmm0, [rsi]                 ; row 0
    movdqu      xmm1, [rsi + rax]           ; row 1
%endif
    movdqa      xmm2, xmm0                  ; second copy feeds the high half

    BILIN_STORE_16 %2
    jnz         .loop

    BILIN_LEAVE 1
%endm

;-----------------------------------------------------------------------------
; Vertical filters: blend each row with the row below it.
;-----------------------------------------------------------------------------
global sym(vp9_filter_block1d4_v2_ssse3) PRIVATE
sym(vp9_filter_block1d4_v2_ssse3):
    BILIN_BODY_4 0, 0

global sym(vp9_filter_block1d8_v2_ssse3) PRIVATE
sym(vp9_filter_block1d8_v2_ssse3):
    BILIN_BODY_8 0, 0

global sym(vp9_filter_block1d16_v2_ssse3) PRIVATE
sym(vp9_filter_block1d16_v2_ssse3):
    BILIN_BODY_16 0, 0

;-----------------------------------------------------------------------------
; Vertical filters, averaged with the existing destination.
;-----------------------------------------------------------------------------
global sym(vp9_filter_block1d4_v2_avg_ssse3) PRIVATE
sym(vp9_filter_block1d4_v2_avg_ssse3):
    BILIN_BODY_4 0, 1

global sym(vp9_filter_block1d8_v2_avg_ssse3) PRIVATE
sym(vp9_filter_block1d8_v2_avg_ssse3):
    BILIN_BODY_8 0, 1

global sym(vp9_filter_block1d16_v2_avg_ssse3) PRIVATE
sym(vp9_filter_block1d16_v2_avg_ssse3):
    BILIN_BODY_16 0, 1

;-----------------------------------------------------------------------------
; Horizontal filters: blend each pixel with its right-hand neighbour.
;-----------------------------------------------------------------------------
global sym(vp9_filter_block1d4_h2_ssse3) PRIVATE
sym(vp9_filter_block1d4_h2_ssse3):
    BILIN_BODY_4 1, 0

global sym(vp9_filter_block1d8_h2_ssse3) PRIVATE
sym(vp9_filter_block1d8_h2_ssse3):
    BILIN_BODY_8 1, 0

global sym(vp9_filter_block1d16_h2_ssse3) PRIVATE
sym(vp9_filter_block1d16_h2_ssse3):
    BILIN_BODY_16 1, 0

;-----------------------------------------------------------------------------
; Horizontal filters, averaged with the existing destination.
;-----------------------------------------------------------------------------
global sym(vp9_filter_block1d4_h2_avg_ssse3) PRIVATE
sym(vp9_filter_block1d4_h2_avg_ssse3):
    BILIN_BODY_4 1, 1

global sym(vp9_filter_block1d8_h2_avg_ssse3) PRIVATE
sym(vp9_filter_block1d8_h2_avg_ssse3):
    BILIN_BODY_8 1, 1

global sym(vp9_filter_block1d16_h2_avg_ssse3) PRIVATE
sym(vp9_filter_block1d16_h2_avg_ssse3):
    BILIN_BODY_16 1, 1