;
;  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;void vp8_filter_by_weight16x16_sse2
;(
;    unsigned char *src,
;    int            src_stride,
;    unsigned char *dst,
;    int            dst_stride,
;    int            src_weight
;)
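;
; A sketch of the per-pixel arithmetic implemented below (the constants at
; the end of this file imply MFQE_PRECISION == 4, so the weights sum to 16):
;
;    dst[i] = (src[i] * src_weight + dst[i] * (16 - src_weight) + 8) >> 4
;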
global sym(vp8_filter_by_weight16x16_sse2) PRIVATE
sym(vp8_filter_by_weight16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movd        xmm0, arg(4)                ; src_weight
    pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
    punpcklqdq  xmm0, xmm0                  ; replicate to all hi words

    movdqa      xmm1, [GLOBAL(tMFQE)]
    psubw       xmm1, xmm0                  ; dst_weight

    mov         rax, arg(0)                 ; src
    mov         rsi, arg(1)                 ; src_stride
    mov         rdx, arg(2)                 ; dst
    mov         rdi, arg(3)                 ; dst_stride

    mov         rcx, 16                     ; loop count
    pxor        xmm6, xmm6

.combine:
    movdqa      xmm2, [rax]
    movdqa      xmm4, [rdx]
    add         rax, rsi

    ; src * src_weight
    movdqa      xmm3, xmm2
    punpcklbw   xmm2, xmm6
    punpckhbw   xmm3, xmm6
    pmullw      xmm2, xmm0
    pmullw      xmm3, xmm0

    ; dst * dst_weight
    movdqa      xmm5, xmm4
    punpcklbw   xmm4, xmm6
    punpckhbw   xmm5, xmm6
    pmullw      xmm4, xmm1
    pmullw      xmm5, xmm1

    ; sum, round and shift
    paddw       xmm2, xmm4
    paddw       xmm3, xmm5
    paddw       xmm2, [GLOBAL(tMFQE_round)]
    paddw       xmm3, [GLOBAL(tMFQE_round)]
    psrlw       xmm2, 4
    psrlw       xmm3, 4

    packuswb    xmm2, xmm3
    movdqa      [rdx], xmm2
    add         rdx, rdi

    dec         rcx
    jnz         .combine

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp

    ret

;void vp8_filter_by_weight8x8_sse2
;(
;    unsigned char *src,
;    int            src_stride,
;    unsigned char *dst,
;    int            dst_stride,
;    int            src_weight
;)
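;
; Same weighted blend as the 16x16 routine above, applied to one 8-pixel row
; per iteration over 8 rows.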
global sym(vp8_filter_by_weight8x8_sse2) PRIVATE
sym(vp8_filter_by_weight8x8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movd        xmm0, arg(4)                ; src_weight
    pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
    punpcklqdq  xmm0, xmm0                  ; replicate to all hi words

    movdqa      xmm1, [GLOBAL(tMFQE)]
    psubw       xmm1, xmm0                  ; dst_weight

    mov         rax, arg(0)                 ; src
    mov         rsi, arg(1)                 ; src_stride
    mov         rdx, arg(2)                 ; dst
    mov         rdi, arg(3)                 ; dst_stride

    mov         rcx, 8                      ; loop count
    pxor        xmm4, xmm4

.combine:
    movq        xmm2, [rax]
    movq        xmm3, [rdx]
    add         rax, rsi

    ; src * src_weight
    punpcklbw   xmm2, xmm4
    pmullw      xmm2, xmm0

    ; dst * dst_weight
    punpcklbw   xmm3, xmm4
    pmullw      xmm3, xmm1

    ; sum, round and shift
    paddw       xmm2, xmm3
    paddw       xmm2, [GLOBAL(tMFQE_round)]
    psrlw       xmm2, 4

    packuswb    xmm2, xmm4
    movq        [rdx], xmm2
    add         rdx, rdi

    dec         rcx
    jnz         .combine

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp

    ret

;void vp8_variance_and_sad_16x16_sse2 | arg
;(
;    unsigned char *src1,          0
;    int            stride1,       1
;    unsigned char *src2,          2
;    int            stride2,       3
;    unsigned int  *variance,      4
;    unsigned int  *sad            5
;)
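;
; A sketch of what this routine stores, read off the code below:
;
;    *sad      = (sum(|src1 - src2|)                + 128) >> 8
;    *variance = (sum(src2^2) - (sum(src2))^2 / 256 + 128) >> 8
;
; i.e. both results are averages over the 256 pixels of the 16x16 block,
; with rounding.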
global sym(vp8_variance_and_sad_16x16_sse2) PRIVATE
sym(vp8_variance_and_sad_16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rax,        arg(0)          ; src1
    mov         rcx,        arg(1)          ; stride1
    mov         rdx,        arg(2)          ; src2
    mov         rdi,        arg(3)          ; stride2

    mov         rsi,        16              ; block height

    ; Prep accumulator registers
    pxor        xmm3, xmm3                  ; SAD
    pxor        xmm4, xmm4                  ; sum of src2
    pxor        xmm5, xmm5                  ; sum of src2^2

    ; Because we're working with the actual output frames
    ; we can't depend on any kind of data alignment.
.accumulate:
    movdqu      xmm0, [rax]                 ; src1 (unaligned load)
    movdqu      xmm1, [rdx]                 ; src2 (unaligned load)
    add         rax, rcx                    ; src1 + stride1
    add         rdx, rdi                    ; src2 + stride2

    ; SAD(src1, src2)
    psadbw      xmm0, xmm1
    paddusw     xmm3, xmm0

    ; SUM(src2)
    pxor        xmm2, xmm2
    psadbw      xmm2, xmm1                  ; sum src2 by misusing SAD against 0
    paddusw     xmm4, xmm2

    ; pmaddubsw would be ideal if it took two unsigned values. Instead, it
    ; expects one signed and one unsigned value, so we zero-extend the bytes
    ; and operate on words.
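    ; With the bytes zero-extended to words, pmaddwd squares each word and
    ; sums adjacent pairs, leaving four doubleword partial sums of src2^2.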
    pxor        xmm2, xmm2
    movdqa      xmm0, xmm1
    punpcklbw   xmm0, xmm2
    punpckhbw   xmm1, xmm2
    pmaddwd     xmm0, xmm0
    pmaddwd     xmm1, xmm1
    paddd       xmm5, xmm0
    paddd       xmm5, xmm1

    sub         rsi,        1
    jnz         .accumulate

    ; phaddd only sums adjacent double words (and is SSSE3, not SSE2), so
    ; combine the two psadbw partial sums with a shift and add instead.
    ; Finalize SAD and store
    movdqa      xmm0, xmm3
    psrldq      xmm0, 8
    paddusw     xmm0, xmm3
    paddd       xmm0, [GLOBAL(t128)]
    psrld       xmm0, 8
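    ; xmm0 now holds (SAD + 128) >> 8: the absolute difference averaged over
    ; the 256 pixels of the block, with rounding.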

    mov         rax,  arg(5)
    movd        [rax], xmm0

    ; Accumulate sum of src2
    movdqa      xmm0, xmm4
    psrldq      xmm0, 8
    paddusw     xmm0, xmm4
    ; Square the sum of src2 and divide by 256 (the pixel count). The sum
    ; fits in 16 bits, so its square fits in 32 and the high dword of the
    ; pmuludq result can be ignored.
    pmuludq     xmm0, xmm0
    psrld       xmm0, 8

    ; phaddd could sum adjacent doubleword pairs, but we want all four lanes
    ; of the sum of squares combined. Widen to quadwords, accumulate, then
    ; shift and add.
    pxor        xmm2, xmm2
    movdqa      xmm1, xmm5
    punpckldq   xmm1, xmm2
    punpckhdq   xmm5, xmm2
    paddd       xmm1, xmm5
    movdqa      xmm2, xmm1
    psrldq      xmm1, 8
    paddd       xmm1, xmm2

    psubd       xmm1, xmm0
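    ; xmm1 = sum(src2^2) - (sum(src2))^2 / 256, i.e. 256 times the variance
    ; of src2 over the block.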

    ; (256 * variance + 128) >> 8 -> rounded per-pixel variance
    paddd       xmm1, [GLOBAL(t128)]
    psrld       xmm1, 8
    mov         rax,  arg(4)

    movd        [rax], xmm1

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
align 16
t128:
%ifndef __NASM_VER__
    ddq 128
%elif CONFIG_BIG_ENDIAN
    dq  0, 128
%else
    dq  128, 0
%endif
align 16
tMFQE: ; 1 << MFQE_PRECISION
    times 8 dw 0x10
align 16
tMFQE_round: ; 1 << (MFQE_PRECISION - 1)
    times 8 dw 0x08
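; These constants correspond to MFQE_PRECISION == 4: the weights sum to
; 1 << 4 = 16 and results are rounded with 1 << 3 = 8 before the psrlw 4
; shifts above.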