;
;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

; This file is a duplicate of mfqe_sse2.asm in VP8.
; TODO(jackychen): Find a way to fix the duplicate.
%include "vpx_ports/x86_abi_support.asm"

SECTION .text

;void vp9_filter_by_weight16x16_sse2
;(
;    unsigned char *src,
;    int            src_stride,
;    unsigned char *dst,
;    int            dst_stride,
;    int            src_weight
;)
global sym(vp9_filter_by_weight16x16_sse2) PRIVATE
sym(vp9_filter_by_weight16x16_sse2):
  push        rbp
  mov         rbp, rsp
  SHADOW_ARGS_TO_STACK 5
  SAVE_XMM 6
  GET_GOT     rbx
  push        rsi
  push        rdi
  ; end prolog

  movd        xmm0, arg(4)              ; src_weight
  pshuflw     xmm0, xmm0, 0x0           ; replicate to all low words
  punpcklqdq  xmm0, xmm0                ; replicate to all hi words

  movdqa      xmm1, [GLOBAL(tMFQE)]
  psubw       xmm1, xmm0                ; dst_weight

  mov         rax, arg(0)               ; src
  mov         rsi, arg(1)               ; src_stride
  mov         rdx, arg(2)               ; dst
  mov         rdi, arg(3)               ; dst_stride

  mov         rcx, 16                   ; loop count
  pxor        xmm6, xmm6

.combine:
  movdqa      xmm2, [rax]
  movdqa      xmm4, [rdx]
  add         rax, rsi

  ; src * src_weight
  movdqa      xmm3, xmm2
  punpcklbw   xmm2, xmm6
  punpckhbw   xmm3, xmm6
  pmullw      xmm2, xmm0
  pmullw      xmm3, xmm0

  ; dst * dst_weight
  movdqa      xmm5, xmm4
  punpcklbw   xmm4, xmm6
  punpckhbw   xmm5, xmm6
  pmullw      xmm4, xmm1
  pmullw      xmm5, xmm1

  ; sum, round and shift
  paddw       xmm2, xmm4
  paddw       xmm3, xmm5
  paddw       xmm2, [GLOBAL(tMFQE_round)]
  paddw       xmm3, [GLOBAL(tMFQE_round)]
  psrlw       xmm2, 4
  psrlw       xmm3, 4

  packuswb    xmm2, xmm3
  movdqa      [rdx], xmm2
  add         rdx, rdi

  dec         rcx
  jnz         .combine

  ; begin epilog
  pop         rdi
  pop         rsi
  RESTORE_GOT
  RESTORE_XMM
  UNSHADOW_ARGS
  pop         rbp

  ret

;void vp9_filter_by_weight8x8_sse2
;(
;    unsigned char *src,
;    int            src_stride,
;    unsigned char *dst,
;    int            dst_stride,
;    int            src_weight
;)
global sym(vp9_filter_by_weight8x8_sse2) PRIVATE
sym(vp9_filter_by_weight8x8_sse2):
  push        rbp
  mov         rbp, rsp
  SHADOW_ARGS_TO_STACK 5
  GET_GOT     rbx
  push        rsi
  push        rdi
  ; end prolog

  movd        xmm0, arg(4)              ; src_weight
  pshuflw     xmm0, xmm0, 0x0           ; replicate to all low words
  punpcklqdq  xmm0, xmm0                ; replicate to all hi words

  movdqa      xmm1, [GLOBAL(tMFQE)]
  psubw       xmm1, xmm0                ; dst_weight

  mov         rax, arg(0)               ; src
  mov         rsi, arg(1)               ; src_stride
  mov         rdx, arg(2)               ; dst
  mov         rdi, arg(3)               ; dst_stride

  mov         rcx, 8                    ; loop count
  pxor        xmm4, xmm4

.combine:
  movq        xmm2, [rax]
  movq        xmm3, [rdx]
  add         rax, rsi

  ; src * src_weight
  punpcklbw   xmm2, xmm4
  pmullw      xmm2, xmm0

  ; dst * dst_weight
  punpcklbw   xmm3, xmm4
  pmullw      xmm3, xmm1

  ; sum, round and shift
  paddw       xmm2, xmm3
  paddw       xmm2, [GLOBAL(tMFQE_round)]
  psrlw       xmm2, 4

  packuswb    xmm2, xmm4
  movq        [rdx], xmm2
  add         rdx, rdi

  dec         rcx
  jnz         .combine

  ; begin epilog
  pop         rdi
  pop         rsi
  RESTORE_GOT
  UNSHADOW_ARGS
  pop         rbp

  ret
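; A rough scalar sketch of what the two filter_by_weight kernels above
; compute per pixel, assuming MFQE_PRECISION == 4 (matching the tMFQE and
; tMFQE_round constants below):
;
;   dst[i] = (src[i] * src_weight + dst[i] * (16 - src_weight) + 8) >> 4
;
; so src_weight is expected to lie in the range [0, 16].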
;void vp9_variance_and_sad_16x16_sse2 | arg
;(
;    unsigned char *src1,          0
;    int            stride1,       1
;    unsigned char *src2,          2
;    int            stride2,       3
;    unsigned int  *variance,      4
;    unsigned int  *sad,           5
;)
global sym(vp9_variance_and_sad_16x16_sse2) PRIVATE
sym(vp9_variance_and_sad_16x16_sse2):
  push        rbp
  mov         rbp, rsp
  SHADOW_ARGS_TO_STACK 6
  GET_GOT     rbx
  push        rsi
  push        rdi
  ; end prolog

  mov         rax, arg(0)               ; src1
  mov         rcx, arg(1)               ; stride1
  mov         rdx, arg(2)               ; src2
  mov         rdi, arg(3)               ; stride2

  mov         rsi, 16                   ; block height

  ; Prep accumulator registers
  pxor        xmm3, xmm3                ; SAD
  pxor        xmm4, xmm4                ; sum of src2
  pxor        xmm5, xmm5                ; sum of src2^2

  ; Because we're working with the actual output frames
  ; we can't depend on any kind of data alignment.
.accumulate:
  movdqu      xmm0, [rax]               ; src1
  movdqu      xmm1, [rdx]               ; src2
  add         rax, rcx                  ; src1 + stride1
  add         rdx, rdi                  ; src2 + stride2

  ; SAD(src1, src2)
  psadbw      xmm0, xmm1
  paddusw     xmm3, xmm0

  ; SUM(src2)
  pxor        xmm2, xmm2
  psadbw      xmm2, xmm1                ; sum src2 by misusing SAD against 0
  paddusw     xmm4, xmm2

  ; pmaddubsw would be ideal if it took two unsigned values. instead,
  ; it expects a signed and an unsigned value. so instead we zero extend
  ; and operate on words.
  pxor        xmm2, xmm2
  movdqa      xmm0, xmm1
  punpcklbw   xmm0, xmm2
  punpckhbw   xmm1, xmm2
  pmaddwd     xmm0, xmm0
  pmaddwd     xmm1, xmm1
  paddd       xmm5, xmm0
  paddd       xmm5, xmm1

  sub         rsi, 1
  jnz         .accumulate

  ; phaddd only operates on adjacent double words.
  ; Finalize SAD and store
  movdqa      xmm0, xmm3
  psrldq      xmm0, 8
  paddusw     xmm0, xmm3
  paddd       xmm0, [GLOBAL(t128)]
  psrld       xmm0, 8

  mov         rax, arg(5)
  movd        [rax], xmm0

  ; Accumulate sum of src2
  movdqa      xmm0, xmm4
  psrldq      xmm0, 8
  paddusw     xmm0, xmm4
  ; Square src2. Ignore high value
  pmuludq     xmm0, xmm0
  psrld       xmm0, 8

  ; phaddw could be used to sum adjacent values but we want
  ; all the values summed. promote to doubles, accumulate,
  ; shift and sum
  pxor        xmm2, xmm2
  movdqa      xmm1, xmm5
  punpckldq   xmm1, xmm2
  punpckhdq   xmm5, xmm2
  paddd       xmm1, xmm5
  movdqa      xmm2, xmm1
  psrldq      xmm1, 8
  paddd       xmm1, xmm2

  psubd       xmm1, xmm0

  ; (variance + 128) >> 8
  paddd       xmm1, [GLOBAL(t128)]
  psrld       xmm1, 8
  mov         rax, arg(4)

  movd        [rax], xmm1

  ; begin epilog
  pop         rdi
  pop         rsi
  RESTORE_GOT
  UNSHADOW_ARGS
  pop         rbp
  ret

SECTION_RODATA
align 16
t128:
%ifndef __NASM_VER__
    ddq 128
%elif CONFIG_BIG_ENDIAN
    dq  0, 128
%else
    dq  128, 0
%endif
align 16
tMFQE: ; 1 << MFQE_PRECISION
    times 8 dw 0x10
align 16
tMFQE_round: ; 1 << (MFQE_PRECISION - 1)
    times 8 dw 0x08
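; A rough scalar sketch of the values vp9_variance_and_sad_16x16_sse2 above
; stores for a 16x16 block of 8-bit pixels (256 samples):
;
;   *sad      = (sum(|src1[i] - src2[i]|) + 128) >> 8
;   *variance = (sum(src2[i]^2) - sum(src2[i])^2 / 256 + 128) >> 8
;
; i.e. both results are rounded per-pixel averages over the block.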