;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;------------------------------------------------------------------------------
; PROCESS_16X2X3 first
;
; Accumulate the SAD of two 16-byte-wide source rows against the reference
; rows at horizontal offsets 0, +1 and +2.  One accumulator per offset:
;   xmm5 = SAD vs ref+0, xmm6 = SAD vs ref+1, xmm7 = SAD vs ref+2
; %1 == 1: first pair of rows -- psadbw results initialize the accumulators.
; %1 == 0: later rows -- results are computed in xmm1-xmm3 and added in.
; In:  rsi = src_ptr, rdi = ref_ptr, rax = src_stride, rdx = ref_stride.
; Side effects: advances rsi/rdi by two rows; clobbers xmm0-xmm3.
; The source row is loaded with movdqa (requires 16-byte alignment); the
; reference rows use lddqu, which tolerates any alignment.
;------------------------------------------------------------------------------
%macro PROCESS_16X2X3 1
%if %1
        movdqa          xmm0,       XMMWORD PTR [rsi]        ; src row 0
        lddqu           xmm5,       XMMWORD PTR [rdi]        ; ref row 0, offset 0
        lddqu           xmm6,       XMMWORD PTR [rdi+1]      ; ref row 0, offset +1
        lddqu           xmm7,       XMMWORD PTR [rdi+2]      ; ref row 0, offset +2

        psadbw          xmm5,       xmm0                     ; initialize accumulators
        psadbw          xmm6,       xmm0
        psadbw          xmm7,       xmm0
%else
        movdqa          xmm0,       XMMWORD PTR [rsi]
        lddqu           xmm1,       XMMWORD PTR [rdi]
        lddqu           xmm2,       XMMWORD PTR [rdi+1]
        lddqu           xmm3,       XMMWORD PTR [rdi+2]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1                     ; add into accumulators
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endif
        ; second row of the pair (src+stride / ref+stride)
        movdqa          xmm0,       XMMWORD PTR [rsi+rax]
        lddqu           xmm1,       XMMWORD PTR [rdi+rdx]
        lddqu           xmm2,       XMMWORD PTR [rdi+rdx+1]
        lddqu           xmm3,       XMMWORD PTR [rdi+rdx+2]

        lea             rsi,        [rsi+rax*2]              ; advance both pointers
        lea             rdi,        [rdi+rdx*2]              ; two rows (lea keeps flags)

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endmacro

;------------------------------------------------------------------------------
; PROCESS_16X2X3_OFFSET first, align
;
; Same contract and accumulators as PROCESS_16X2X3, but for the case where
; rdi has already been rounded DOWN to a 16-byte boundary (rdi -= align, done
; by the caller macro).  Two aligned movdqa loads fetch [rdi] and [rdi+16],
; and palignr with shifts of align, align+1 and align+2 reconstructs the
; three unaligned reference windows -- avoiding unaligned loads entirely.
; %1 == 1: first pair of rows (initialize); %1 == 0: accumulate.
; %2 = byte misalignment of the original ref_ptr (0..14).
; Clobbers xmm0-xmm4; advances rsi/rdi by two rows.
;------------------------------------------------------------------------------
%macro PROCESS_16X2X3_OFFSET 2
%if %1
        movdqa          xmm0,       XMMWORD PTR [rsi]        ; src row 0
        movdqa          xmm4,       XMMWORD PTR [rdi]        ; aligned ref low half
        movdqa          xmm7,       XMMWORD PTR [rdi+16]     ; aligned ref high half

        movdqa          xmm5,       xmm7
        palignr         xmm5,       xmm4, %2                 ; ref window at offset 0

        movdqa          xmm6,       xmm7
        palignr         xmm6,       xmm4, (%2+1)             ; ref window at offset +1

        palignr         xmm7,       xmm4, (%2+2)             ; ref window at offset +2

        psadbw          xmm5,       xmm0                     ; initialize accumulators
        psadbw          xmm6,       xmm0
        psadbw          xmm7,       xmm0
%else
        movdqa          xmm0,       XMMWORD PTR [rsi]
        movdqa          xmm4,       XMMWORD PTR [rdi]
        movdqa          xmm3,       XMMWORD PTR [rdi+16]

        movdqa          xmm1,       xmm3
        palignr         xmm1,       xmm4, %2

        movdqa          xmm2,       xmm3
        palignr         xmm2,       xmm4, (%2+1)

        palignr         xmm3,       xmm4, (%2+2)

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1                     ; add into accumulators
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endif
        ; second row of the pair
        movdqa          xmm0,       XMMWORD PTR [rsi+rax]
        movdqa          xmm4,       XMMWORD PTR [rdi+rdx]
        movdqa          xmm3,       XMMWORD PTR [rdi+rdx+16]

        movdqa          xmm1,       xmm3
        palignr         xmm1,       xmm4, %2

        movdqa          xmm2,       xmm3
        palignr         xmm2,       xmm4, (%2+1)

        palignr         xmm3,       xmm4, (%2+2)

        lea             rsi,        [rsi+rax*2]              ; advance two rows
        lea             rdi,        [rdi+rdx*2]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endmacro

;------------------------------------------------------------------------------
; PROCESS_16X16X3_OFFSET align, prefix
;
; Emits one complete 16x16 alignment case: the jump-table target label
; %2_aligned_by_%1, rdi rounded down to the 16-byte boundary, eight
; two-row iterations (16 rows), then a jump to the shared store code.
;------------------------------------------------------------------------------
%macro PROCESS_16X16X3_OFFSET 2
%2_aligned_by_%1:

        sub             rdi,        %1                       ; align rdi; palignr %1 compensates

        PROCESS_16X2X3_OFFSET 1, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1

        jmp             %2_store_off

%endmacro

;------------------------------------------------------------------------------
; PROCESS_16X8X3_OFFSET align, prefix
;
; As PROCESS_16X16X3_OFFSET but for an 8-row block: four two-row iterations.
;------------------------------------------------------------------------------
%macro PROCESS_16X8X3_OFFSET 2
%2_aligned_by_%1:

        sub             rdi,        %1                       ; align rdi; palignr %1 compensates

        PROCESS_16X2X3_OFFSET 1, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1

        jmp             %2_store_off

%endmacro

;------------------------------------------------------------------------------
;void vpx_sad16x16x3_ssse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Computes three 16x16 SADs between src and the reference at horizontal
; offsets 0, +1 and +2, storing the three sums in results[0..2].
; Dispatches on (ref_ptr & 15) through a position-independent jump table so
; that cases 0..14 can use aligned loads + palignr; case 15 falls back to
; the lddqu path (PROCESS_16X2X3), since palignr would need a shift > 16.
; Assumes src_ptr is 16-byte aligned (movdqa loads) -- standard for the
; libvpx source blocks.
;------------------------------------------------------------------------------
global sym(vpx_sad16x16x3_ssse3) PRIVATE
sym(vpx_sad16x16x3_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rcx
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        mov             rdx,        0xf                      ; rdx = ref_ptr & 15
        and             rdx,        rdi                      ; (jump-table index)

        jmp .vpx_sad16x16x3_ssse3_skiptable
.vpx_sad16x16x3_ssse3_jumptable:
        ; 32-bit offsets relative to the do_jump label, one per alignment
        dd .vpx_sad16x16x3_ssse3_aligned_by_0  - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_1  - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_2  - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_3  - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_4  - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_5  - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_6  - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_7  - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_8  - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_9  - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_10 - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_11 - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_12 - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_13 - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_14 - .vpx_sad16x16x3_ssse3_do_jump
        dd .vpx_sad16x16x3_ssse3_aligned_by_15 - .vpx_sad16x16x3_ssse3_do_jump
.vpx_sad16x16x3_ssse3_skiptable:

        ; call/pop idiom: materialize the address of do_jump so the table
        ; offsets can be resolved position-independently (PIC-safe).
        call .vpx_sad16x16x3_ssse3_do_jump
.vpx_sad16x16x3_ssse3_do_jump:
        pop             rcx                         ; get the address of do_jump
        mov             rax,  .vpx_sad16x16x3_ssse3_jumptable - .vpx_sad16x16x3_ssse3_do_jump
        add             rax,  rcx  ; get the absolute address of vpx_sad16x16x3_ssse3_jumptable

        movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
        add             rcx,        rax             ; rcx = absolute target address

        ; load strides only now -- rdx held the alignment index until here
        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        jmp             rcx                         ; dispatch to the alignment case

        PROCESS_16X16X3_OFFSET 0,  .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 1,  .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 2,  .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 3,  .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 4,  .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 5,  .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 6,  .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 7,  .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 8,  .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 9,  .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 10, .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 11, .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 12, .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 13, .vpx_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 14, .vpx_sad16x16x3_ssse3

.vpx_sad16x16x3_ssse3_aligned_by_15:
        ; misalignment 15: palignr shifts of 16/17 are unusable, so use the
        ; unaligned-load (lddqu) path instead.
        PROCESS_16X2X3 1
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0

.vpx_sad16x16x3_ssse3_store_off:
        mov             rdi,        arg(4) ;Results

        ; each accumulator holds two partial sums (one per 64-bit lane);
        ; fold high lane into low and store the 32-bit total
        movq            xmm0,       xmm5
        psrldq          xmm5,       8

        paddw           xmm0,       xmm5
        movd            [rdi],      xmm0                     ; results[0]: offset 0
;-
        movq            xmm0,       xmm6
        psrldq          xmm6,       8

        paddw           xmm0,       xmm6
        movd            [rdi+4],    xmm0                     ; results[1]: offset +1
;-
        movq            xmm0,       xmm7
        psrldq          xmm7,       8

        paddw           xmm0,       xmm7
        movd            [rdi+8],    xmm0                     ; results[2]: offset +2

    ; begin epilog
    pop         rcx
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;------------------------------------------------------------------------------
;void vpx_sad16x8x3_ssse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; 16x8 variant of vpx_sad16x16x3_ssse3: three SADs at horizontal offsets
; 0, +1, +2 into results[0..2], with identical jump-table dispatch on
; (ref_ptr & 15); case 15 uses the lddqu fallback.
;------------------------------------------------------------------------------
global sym(vpx_sad16x8x3_ssse3) PRIVATE
sym(vpx_sad16x8x3_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rcx
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        mov             rdx,        0xf                      ; rdx = ref_ptr & 15
        and             rdx,        rdi                      ; (jump-table index)

        jmp .vpx_sad16x8x3_ssse3_skiptable
.vpx_sad16x8x3_ssse3_jumptable:
        ; 32-bit offsets relative to the do_jump label, one per alignment
        dd .vpx_sad16x8x3_ssse3_aligned_by_0  - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_1  - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_2  - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_3  - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_4  - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_5  - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_6  - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_7  - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_8  - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_9  - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_10 - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_11 - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_12 - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_13 - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_14 - .vpx_sad16x8x3_ssse3_do_jump
        dd .vpx_sad16x8x3_ssse3_aligned_by_15 - .vpx_sad16x8x3_ssse3_do_jump
.vpx_sad16x8x3_ssse3_skiptable:

        ; call/pop idiom: materialize the address of do_jump so the table
        ; offsets can be resolved position-independently (PIC-safe).
        call .vpx_sad16x8x3_ssse3_do_jump
.vpx_sad16x8x3_ssse3_do_jump:
        pop             rcx                         ; get the address of do_jump
        mov             rax,  .vpx_sad16x8x3_ssse3_jumptable - .vpx_sad16x8x3_ssse3_do_jump
        add             rax,  rcx  ; get the absolute address of vpx_sad16x8x3_ssse3_jumptable

        movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
        add             rcx,        rax             ; rcx = absolute target address

        ; load strides only now -- rdx held the alignment index until here
        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        jmp             rcx                         ; dispatch to the alignment case

        PROCESS_16X8X3_OFFSET 0,  .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 1,  .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 2,  .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 3,  .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 4,  .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 5,  .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 6,  .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 7,  .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 8,  .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 9,  .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 10, .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 11, .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 12, .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 13, .vpx_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 14, .vpx_sad16x8x3_ssse3

.vpx_sad16x8x3_ssse3_aligned_by_15:
        ; misalignment 15: palignr shifts of 16/17 are unusable, so use the
        ; unaligned-load (lddqu) path instead.

        PROCESS_16X2X3 1
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0

.vpx_sad16x8x3_ssse3_store_off:
        mov             rdi,        arg(4) ;Results

        ; fold the two 64-bit-lane partial sums of each accumulator and
        ; store the three 32-bit SAD totals
        movq            xmm0,       xmm5
        psrldq          xmm5,       8

        paddw           xmm0,       xmm5
        movd            [rdi],      xmm0                     ; results[0]: offset 0
;-
        movq            xmm0,       xmm6
        psrldq          xmm6,       8

        paddw           xmm0,       xmm6
        movd            [rdi+4],    xmm0                     ; results[1]: offset +1
;-
        movq            xmm0,       xmm7
        psrldq          xmm7,       8

        paddw           xmm0,       xmm7
        movd            [rdi+8],    xmm0                     ; results[2]: offset +2

    ; begin epilog
    pop         rcx
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret