;
;  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;void vp8_filter_by_weight16x16_sse2
;(
;    unsigned char *src,
;    int            src_stride,
;    unsigned char *dst,
;    int            dst_stride,
;    int            src_weight
;)
global sym(vp8_filter_by_weight16x16_sse2) PRIVATE
sym(vp8_filter_by_weight16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movd        xmm0, arg(4)                ; src_weight
    pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
    punpcklqdq  xmm0, xmm0                  ; replicate to all high words

    movdqa      xmm1, [GLOBAL(tMFQE)]
    psubw       xmm1, xmm0                  ; dst_weight = 16 - src_weight

    mov         rax, arg(0)                 ; src
    mov         rsi, arg(1)                 ; src_stride
    mov         rdx, arg(2)                 ; dst
    mov         rdi, arg(3)                 ; dst_stride

    mov         rcx, 16                     ; loop count (block height)
    pxor        xmm6, xmm6                  ; zero, for byte -> word unpacks

.combine
    movdqa      xmm2, [rax]
    movdqa      xmm4, [rdx]
    add         rax, rsi

    ; src * src_weight
    movdqa      xmm3, xmm2
    punpcklbw   xmm2, xmm6
    punpckhbw   xmm3, xmm6
    pmullw      xmm2, xmm0
    pmullw      xmm3, xmm0

    ; dst * dst_weight
    movdqa      xmm5, xmm4
    punpcklbw   xmm4, xmm6
    punpckhbw   xmm5, xmm6
    pmullw      xmm4, xmm1
    pmullw      xmm5, xmm1

    ; sum, round and shift
    paddw       xmm2, xmm4
    paddw       xmm3, xmm5
    paddw       xmm2, [GLOBAL(tMFQE_round)]
    paddw       xmm3, [GLOBAL(tMFQE_round)]
    psrlw       xmm2, 4
    psrlw       xmm3, 4

    packuswb    xmm2, xmm3
    movdqa      [rdx], xmm2
    add         rdx, rdi

    dec         rcx
    jnz         .combine

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp

    ret

;void vp8_filter_by_weight8x8_sse2
;(
;    unsigned char *src,
;    int            src_stride,
;    unsigned char *dst,
;    int            dst_stride,
;    int            src_weight
;)
global sym(vp8_filter_by_weight8x8_sse2) PRIVATE
sym(vp8_filter_by_weight8x8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movd        xmm0, arg(4)                ; src_weight
    pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
    punpcklqdq  xmm0, xmm0                  ; replicate to all high words

    movdqa      xmm1, [GLOBAL(tMFQE)]
    psubw       xmm1, xmm0                  ; dst_weight = 16 - src_weight

    mov         rax, arg(0)                 ; src
    mov         rsi, arg(1)                 ; src_stride
    mov         rdx, arg(2)                 ; dst
    mov         rdi, arg(3)                 ; dst_stride

    mov         rcx, 8                      ; loop count (block height)
    pxor        xmm4, xmm4                  ; zero, for byte -> word unpacks

.combine
    movq        xmm2, [rax]
    movq        xmm3, [rdx]
    add         rax, rsi

    ; src * src_weight
    punpcklbw   xmm2, xmm4
    pmullw      xmm2, xmm0

    ; dst * dst_weight
    punpcklbw   xmm3, xmm4
    pmullw      xmm3, xmm1

    ; sum, round and shift
    paddw       xmm2, xmm3
    paddw       xmm2, [GLOBAL(tMFQE_round)]
    psrlw       xmm2, 4

    packuswb    xmm2, xmm4
    movq        [rdx], xmm2
    add         rdx, rdi

    dec         rcx
    jnz         .combine

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp

    ret
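
; Reference model (a sketch for documentation only, not assembled into the
; build) of the weighted blend both filter_by_weight kernels above compute,
; assuming MFQE_PRECISION == 4 to match the tMFQE/tMFQE_round tables below:
;
;   dst_weight = (1 << MFQE_PRECISION) - src_weight    ; 16 - src_weight
;   dst[i]     = (src[i] * src_weight
;               + dst[i] * dst_weight
;               + (1 << (MFQE_PRECISION - 1))) >> MFQE_PRECISION
;
; With src_weight in [0, 16], the blend interpolates each output pixel
; between src and dst with rounding; src_weight == 16 copies src verbatim.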
;void vp8_variance_and_sad_16x16_sse2 | arg
;(
;    unsigned char *src1,          0
;    int            stride1,       1
;    unsigned char *src2,          2
;    int            stride2,       3
;    unsigned int  *variance,      4
;    unsigned int  *sad            5
;)
global sym(vp8_variance_and_sad_16x16_sse2) PRIVATE
sym(vp8_variance_and_sad_16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rax, arg(0)                 ; src1
    mov         rcx, arg(1)                 ; stride1
    mov         rdx, arg(2)                 ; src2
    mov         rdi, arg(3)                 ; stride2

    mov         rsi, 16                     ; block height

    ; Prep accumulator registers
    pxor        xmm3, xmm3                  ; SAD
    pxor        xmm4, xmm4                  ; sum of src2
    pxor        xmm5, xmm5                  ; sum of src2^2

    ; Because we're working with the actual output frames
    ; we can't depend on any kind of data alignment, so use
    ; unaligned loads.
.accumulate
    movdqu      xmm0, [rax]                 ; src1
    movdqu      xmm1, [rdx]                 ; src2
    add         rax, rcx                    ; src1 + stride1
    add         rdx, rdi                    ; src2 + stride2

    ; SAD(src1, src2)
    psadbw      xmm0, xmm1
    paddusw     xmm3, xmm0

    ; SUM(src2): sum the bytes by misusing SAD against 0
    pxor        xmm2, xmm2
    psadbw      xmm2, xmm1
    paddusw     xmm4, xmm2

    ; pmaddubsw would be ideal if it took two unsigned values.
    ; Instead it expects one signed and one unsigned value, so
    ; zero-extend the bytes and square on words with pmaddwd.
    pxor        xmm2, xmm2
    movdqa      xmm0, xmm1
    punpcklbw   xmm0, xmm2
    punpckhbw   xmm1, xmm2
    pmaddwd     xmm0, xmm0
    pmaddwd     xmm1, xmm1
    paddd       xmm5, xmm0
    paddd       xmm5, xmm1

    sub         rsi, 1
    jnz         .accumulate

    ; phaddd only operates on adjacent double words, so fold the
    ; high qword down manually instead.
    ; Finalize SAD and store (SAD + 128) >> 8.
    movdqa      xmm0, xmm3
    psrldq      xmm0, 8
    paddusw     xmm0, xmm3
    paddd       xmm0, [GLOBAL(t128)]
    psrld       xmm0, 8

    mov         rax, arg(5)                 ; sad
    movd        [rax], xmm0

    ; Accumulate sum of src2: fold the high qword down
    movdqa      xmm0, xmm4
    psrldq      xmm0, 8
    paddusw     xmm0, xmm4
    ; Square the sum. SUM(src2) is at most 16*16*255 = 65280, so the
    ; square still fits in 32 bits and the product's high double word
    ; can be ignored. Keep (SUM(src2)^2) >> 8.
    pmuludq     xmm0, xmm0
    psrld       xmm0, 8

    ; phaddd could be used to sum adjacent double words, but we want
    ; all four values summed. Promote the double words to quad words,
    ; accumulate, shift and sum.
    pxor        xmm2, xmm2
    movdqa      xmm1, xmm5
    punpckldq   xmm1, xmm2
    punpckhdq   xmm5, xmm2
    paddd       xmm1, xmm5
    movdqa      xmm2, xmm1
    psrldq      xmm1, 8
    paddd       xmm1, xmm2

    ; SSE - (SUM^2 >> 8) ...
    psubd       xmm1, xmm0

    ; ... then store (variance + 128) >> 8
    paddd       xmm1, [GLOBAL(t128)]
    psrld       xmm1, 8

    mov         rax, arg(4)                 ; variance
    movd        [rax], xmm1

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
align 16
t128:
%ifndef __NASM_VER__
    ddq 128
%elif CONFIG_BIG_ENDIAN
    dq  0, 128
%else
    dq  128, 0
%endif
align 16
tMFQE: ; 1 << MFQE_PRECISION
    times 8 dw 0x10
align 16
tMFQE_round: ; 1 << (MFQE_PRECISION - 1)
    times 8 dw 0x08
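
; Reference math (a sketch for documentation only, not assembled) for
; vp8_variance_and_sad_16x16_sse2, where the block holds 16*16 = 256
; pixels. Both outputs are rounded per-block averages:
;
;   *sad      = (SUM(|src1 - src2|) + 128) >> 8
;   *variance = (SUM(src2^2) - (SUM(src2)^2 >> 8) + 128) >> 8
;
; The variance line is the usual identity var = (SUM(x^2) - SUM(x)^2 / N) / N
; with N = 256, where both divisions are carried out as shifts by 8.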