;
;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

; This file is a duplicate of mfqe_sse2.asm in VP8.
; TODO(jackychen): Find a way to fix the duplicate.
%include "vpx_ports/x86_abi_support.asm"

;void vp9_filter_by_weight16x16_sse2
;(
;    unsigned char *src,
;    int            src_stride,
;    unsigned char *dst,
;    int            dst_stride,
;    int            src_weight
;)
global sym(vp9_filter_by_weight16x16_sse2) PRIVATE
sym(vp9_filter_by_weight16x16_sse2):
  push        rbp
  mov         rbp, rsp
  SHADOW_ARGS_TO_STACK 5
  SAVE_XMM 6
  GET_GOT     rbx
  push        rsi
  push        rdi
  ; end prolog

  movd        xmm0, arg(4)              ; src_weight
  pshuflw     xmm0, xmm0, 0x0           ; replicate to all low words
  punpcklqdq  xmm0, xmm0                ; replicate to all hi words

  movdqa      xmm1, [GLOBAL(tMFQE)]
  psubw       xmm1, xmm0                ; dst_weight

  mov         rax, arg(0)               ; src
  mov         rsi, arg(1)               ; src_stride
  mov         rdx, arg(2)               ; dst
  mov         rdi, arg(3)               ; dst_stride

  mov         rcx, 16                   ; loop count
  pxor        xmm6, xmm6

.combine:
  movdqa      xmm2, [rax]
  movdqa      xmm4, [rdx]
  add         rax, rsi

  ; src * src_weight
  movdqa      xmm3, xmm2
  punpcklbw   xmm2, xmm6
  punpckhbw   xmm3, xmm6
  pmullw      xmm2, xmm0
  pmullw      xmm3, xmm0

  ; dst * dst_weight
  movdqa      xmm5, xmm4
  punpcklbw   xmm4, xmm6
  punpckhbw   xmm5, xmm6
  pmullw      xmm4, xmm1
  pmullw      xmm5, xmm1

  ; sum, round and shift
  paddw       xmm2, xmm4
  paddw       xmm3, xmm5
  paddw       xmm2, [GLOBAL(tMFQE_round)]
  paddw       xmm3, [GLOBAL(tMFQE_round)]
  psrlw       xmm2, 4
  psrlw       xmm3, 4

  packuswb    xmm2, xmm3
  movdqa      [rdx], xmm2
  add         rdx, rdi

  dec         rcx
  jnz         .combine

  ; begin epilog
  pop         rdi
  pop         rsi
  RESTORE_GOT
  RESTORE_XMM
  UNSHADOW_ARGS
  pop         rbp

  ret

;void vp9_filter_by_weight8x8_sse2
;(
;    unsigned char *src,
;    int            src_stride,
;    unsigned char *dst,
;    int            dst_stride,
;    int            src_weight
;)
global sym(vp9_filter_by_weight8x8_sse2) PRIVATE
sym(vp9_filter_by_weight8x8_sse2):
  push        rbp
  mov         rbp, rsp
  SHADOW_ARGS_TO_STACK 5
  GET_GOT     rbx
  push        rsi
  push        rdi
  ; end prolog

  movd        xmm0, arg(4)              ; src_weight
  pshuflw     xmm0, xmm0, 0x0           ; replicate to all low words
  punpcklqdq  xmm0, xmm0                ; replicate to all hi words

  movdqa      xmm1, [GLOBAL(tMFQE)]
  psubw       xmm1, xmm0                ; dst_weight

  mov         rax, arg(0)               ; src
  mov         rsi, arg(1)               ; src_stride
  mov         rdx, arg(2)               ; dst
  mov         rdi, arg(3)               ; dst_stride

  mov         rcx, 8                    ; loop count
  pxor        xmm4, xmm4

.combine:
  movq        xmm2, [rax]
  movq        xmm3, [rdx]
  add         rax, rsi

  ; src * src_weight
  punpcklbw   xmm2, xmm4
  pmullw      xmm2, xmm0

  ; dst * dst_weight
  punpcklbw   xmm3, xmm4
  pmullw      xmm3, xmm1

  ; sum, round and shift
  paddw       xmm2, xmm3
  paddw       xmm2, [GLOBAL(tMFQE_round)]
  psrlw       xmm2, 4

  packuswb    xmm2, xmm4
  movq        [rdx], xmm2
  add         rdx, rdi

  dec         rcx
  jnz         .combine

  ; begin epilog
  pop         rdi
  pop         rsi
  RESTORE_GOT
  UNSHADOW_ARGS
  pop         rbp

  ret
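
; For reference, the two kernels above compute the MFQE weighted blend
; shown in the C sketch below. The function name and the MFQE_PRECISION
; macro are illustrative only; the formula itself follows directly from
; the tMFQE (1 << 4) and tMFQE_round (1 << 3) constants used above.
;
;   #define MFQE_PRECISION 4  /* assumed from the constants in this file */
;
;   static void filter_by_weight_c(const unsigned char *src, int src_stride,
;                                  unsigned char *dst, int dst_stride,
;                                  int block_size, int src_weight) {
;     const int dst_weight = (1 << MFQE_PRECISION) - src_weight;
;     const int rounding = 1 << (MFQE_PRECISION - 1);
;     int r, c;
;     for (r = 0; r < block_size; ++r) {
;       for (c = 0; c < block_size; ++c) {
;         dst[c] = (src[c] * src_weight + dst[c] * dst_weight + rounding)
;                  >> MFQE_PRECISION;
;       }
;       src += src_stride;
;       dst += dst_stride;
;     }
;   }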

;void vp9_variance_and_sad_16x16_sse2 | arg
;(
;    unsigned char *src1,          0
;    int stride1,                  1
;    unsigned char *src2,          2
;    int stride2,                  3
;    unsigned int *variance,       4
;    unsigned int *sad             5
;)
global sym(vp9_variance_and_sad_16x16_sse2) PRIVATE
sym(vp9_variance_and_sad_16x16_sse2):
  push        rbp
  mov         rbp, rsp
  SHADOW_ARGS_TO_STACK 6
  GET_GOT     rbx
  push        rsi
  push        rdi
  ; end prolog

  mov         rax, arg(0)               ; src1
  mov         rcx, arg(1)               ; stride1
  mov         rdx, arg(2)               ; src2
  mov         rdi, arg(3)               ; stride2

  mov         rsi, 16                   ; block height

  ; Prep accumulator registers
  pxor        xmm3, xmm3                ; SAD
  pxor        xmm4, xmm4                ; sum of src2
  pxor        xmm5, xmm5                ; sum of src2^2

  ; Because we're working with the actual output frames
  ; we can't depend on any kind of data alignment, so use
  ; unaligned loads here.
.accumulate:
  movdqu      xmm0, [rax]               ; src1
  movdqu      xmm1, [rdx]               ; src2
  add         rax, rcx                  ; src1 + stride1
  add         rdx, rdi                  ; src2 + stride2

  ; SAD(src1, src2)
  psadbw      xmm0, xmm1
  paddusw     xmm3, xmm0

  ; SUM(src2)
  pxor        xmm2, xmm2
  psadbw      xmm2, xmm1                ; sum src2 by misusing SAD against 0
  paddusw     xmm4, xmm2

  ; pmaddubsw would be ideal if it took two unsigned values. Instead,
  ; it expects one signed and one unsigned value, so we zero-extend
  ; and operate on words.
  pxor        xmm2, xmm2
  movdqa      xmm0, xmm1
  punpcklbw   xmm0, xmm2
  punpckhbw   xmm1, xmm2
  pmaddwd     xmm0, xmm0
  pmaddwd     xmm1, xmm1
  paddd       xmm5, xmm0
  paddd       xmm5, xmm1

  sub         rsi, 1
  jnz         .accumulate

  ; phaddd only operates on adjacent double words.
  ; Finalize SAD and store
  movdqa      xmm0, xmm3
  psrldq      xmm0, 8
  paddusw     xmm0, xmm3
  paddd       xmm0, [GLOBAL(t128)]
  psrld       xmm0, 8

  mov         rax, arg(5)
  movd        [rax], xmm0

  ; Accumulate sum of src2
  movdqa      xmm0, xmm4
  psrldq      xmm0, 8
  paddusw     xmm0, xmm4
  ; Square the sum of src2. The result fits in 32 bits, so the high
  ; dword of the pmuludq product can be ignored.
  pmuludq     xmm0, xmm0
  psrld       xmm0, 8

  ; phaddw could be used to sum adjacent values but we want
  ; all the values summed. Promote to doublewords, accumulate,
  ; shift and sum.
  pxor        xmm2, xmm2
  movdqa      xmm1, xmm5
  punpckldq   xmm1, xmm2
  punpckhdq   xmm5, xmm2
  paddd       xmm1, xmm5
  movdqa      xmm2, xmm1
  psrldq      xmm1, 8
  paddd       xmm1, xmm2

  psubd       xmm1, xmm0

  ; (variance + 128) >> 8
  paddd       xmm1, [GLOBAL(t128)]
  psrld       xmm1, 8
  mov         rax, arg(4)

  movd        [rax], xmm1

  ; begin epilog
  pop         rdi
  pop         rsi
  RESTORE_GOT
  UNSHADOW_ARGS
  pop         rbp
  ret

SECTION_RODATA
align 16
t128:
%ifndef __NASM_VER__
    ddq 128
%elif CONFIG_BIG_ENDIAN
    dq  0, 128
%else
    dq  128, 0
%endif
align 16
tMFQE:  ; 1 << MFQE_PRECISION
    times 8 dw 0x10
align 16
tMFQE_round:  ; 1 << (MFQE_PRECISION - 1)
    times 8 dw 0x08
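
; For reference, a C model of vp9_variance_and_sad_16x16_sse2 above. The
; function name is illustrative only; the formulas follow the assembly:
; both outputs are rounded per-block averages ((x + 128) >> 8 over the
; 256 pixels), and the variance of src2 is computed as SSE - SUM^2 / 256.
;
;   static void variance_and_sad_16x16_c(const unsigned char *src1,
;                                        int stride1,
;                                        const unsigned char *src2,
;                                        int stride2,
;                                        unsigned int *variance,
;                                        unsigned int *sad) {
;     unsigned int sum = 0, sse = 0, sad_acc = 0;
;     int r, c;
;     for (r = 0; r < 16; ++r) {
;       for (c = 0; c < 16; ++c) {
;         const int a = src1[c];
;         const int b = src2[c];
;         sad_acc += (a > b) ? (a - b) : (b - a);
;         sum += b;
;         sse += b * b;
;       }
;       src1 += stride1;
;       src2 += stride2;
;     }
;     *sad = (sad_acc + 128) >> 8;
;     *variance = (sse - ((sum * sum) >> 8) + 128) >> 8;
;   }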