;
;  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

SECTION .text

;void vp8_filter_by_weight16x16_sse2
;(
;    unsigned char *src,
;    int            src_stride,
;    unsigned char *dst,
;    int            dst_stride,
;    int            src_weight
;)
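;
; Reference sketch (not part of the build), assuming the blend implied by the
; constants below: each pixel becomes a weighted average of src and dst, with
; the weights summing to 16 (1 << MFQE_PRECISION) and 8 as the rounding term:
;
;     for (r = 0; r < 16; ++r) {
;         for (c = 0; c < 16; ++c)
;             dst[c] = (src[c] * src_weight +
;                       dst[c] * (16 - src_weight) + 8) >> 4;
;         src += src_stride;
;         dst += dst_stride;
;     }
;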
global sym(vp8_filter_by_weight16x16_sse2) PRIVATE
sym(vp8_filter_by_weight16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movd        xmm0, arg(4)                ; src_weight
    pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
    punpcklqdq  xmm0, xmm0                  ; replicate to all hi words

    movdqa      xmm1, [GLOBAL(tMFQE)]
    psubw       xmm1, xmm0                  ; dst_weight

    mov         rax, arg(0)                 ; src
    mov         rsi, arg(1)                 ; src_stride
    mov         rdx, arg(2)                 ; dst
    mov         rdi, arg(3)                 ; dst_stride

    mov         rcx, 16                     ; loop count
    pxor        xmm6, xmm6                  ; zero register for byte->word unpacks

.combine:
    movdqa      xmm2, [rax]
    movdqa      xmm4, [rdx]
    add         rax, rsi

    ; src * src_weight
    movdqa      xmm3, xmm2
    punpcklbw   xmm2, xmm6
    punpckhbw   xmm3, xmm6
    pmullw      xmm2, xmm0
    pmullw      xmm3, xmm0

    ; dst * dst_weight
    movdqa      xmm5, xmm4
    punpcklbw   xmm4, xmm6
    punpckhbw   xmm5, xmm6
    pmullw      xmm4, xmm1
    pmullw      xmm5, xmm1

    ; sum, round and shift
    paddw       xmm2, xmm4
    paddw       xmm3, xmm5
    paddw       xmm2, [GLOBAL(tMFQE_round)]
    paddw       xmm3, [GLOBAL(tMFQE_round)]
    psrlw       xmm2, 4
    psrlw       xmm3, 4

    packuswb    xmm2, xmm3
    movdqa      [rdx], xmm2
    add         rdx, rdi

    dec         rcx
    jnz         .combine

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp

    ret

;void vp8_filter_by_weight8x8_sse2
;(
;    unsigned char *src,
;    int            src_stride,
;    unsigned char *dst,
;    int            dst_stride,
;    int            src_weight
;)
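;
; Same weighted blend as vp8_filter_by_weight16x16_sse2 above, applied to
; 8 rows of 8 pixels. Only xmm0-xmm4 are touched, so no SAVE_XMM is needed
; in the prolog.
;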
global sym(vp8_filter_by_weight8x8_sse2) PRIVATE
sym(vp8_filter_by_weight8x8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movd        xmm0, arg(4)                ; src_weight
    pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
    punpcklqdq  xmm0, xmm0                  ; replicate to all hi words

    movdqa      xmm1, [GLOBAL(tMFQE)]
    psubw       xmm1, xmm0                  ; dst_weight

    mov         rax, arg(0)                 ; src
    mov         rsi, arg(1)                 ; src_stride
    mov         rdx, arg(2)                 ; dst
    mov         rdi, arg(3)                 ; dst_stride

    mov         rcx, 8                      ; loop count
    pxor        xmm4, xmm4                  ; zero register for unpack and pack

.combine:
    movq        xmm2, [rax]
    movq        xmm3, [rdx]
    add         rax, rsi

    ; src * src_weight
    punpcklbw   xmm2, xmm4
    pmullw      xmm2, xmm0

    ; dst * dst_weight
    punpcklbw   xmm3, xmm4
    pmullw      xmm3, xmm1

    ; sum, round and shift
    paddw       xmm2, xmm3
    paddw       xmm2, [GLOBAL(tMFQE_round)]
    psrlw       xmm2, 4

    packuswb    xmm2, xmm4
    movq        [rdx], xmm2
    add         rdx, rdi

    dec         rcx
    jnz         .combine

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp

    ret

;void vp8_variance_and_sad_16x16_sse2 | arg
;(
;    unsigned char *src1,          0
;    int            stride1,       1
;    unsigned char *src2,          2
;    int            stride2,       3
;    unsigned int  *variance,      4
;    unsigned int  *sad,           5
;)
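;
; Reference sketch (not part of the build), assuming the normalization used
; below: both results are divided by the block size (256 pixels) with
; rounding before being stored:
;
;     sad = sum over the 16x16 block of |src1 - src2|
;     sum = sum of src2,  sse = sum of src2 * src2
;     *sad      = (sad + 128) >> 8;
;     *variance = (sse - ((sum * sum) >> 8) + 128) >> 8;
;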
global sym(vp8_variance_and_sad_16x16_sse2) PRIVATE
sym(vp8_variance_and_sad_16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rax,        arg(0)          ; src1
    mov         rcx,        arg(1)          ; stride1
    mov         rdx,        arg(2)          ; src2
    mov         rdi,        arg(3)          ; stride2

    mov         rsi,        16              ; block height

    ; Prep accumulator registers
    pxor        xmm3, xmm3                  ; SAD
    pxor        xmm4, xmm4                  ; sum of src2
    pxor        xmm5, xmm5                  ; sum of src2^2

    ; Because we're working with the actual output frames
    ; we can't depend on any kind of data alignment.
.accumulate:
    movdqu      xmm0, [rax]                 ; src1
    movdqu      xmm1, [rdx]                 ; src2
    add         rax, rcx                    ; src1 + stride1
    add         rdx, rdi                    ; src2 + stride2

    ; SAD(src1, src2)
    psadbw      xmm0, xmm1
    paddusw     xmm3, xmm0

    ; SUM(src2)
    pxor        xmm2, xmm2
    psadbw      xmm2, xmm1                  ; sum src2 by misusing SAD against 0
    paddusw     xmm4, xmm2

    ; pmaddubsw would be ideal if it took two unsigned values, but it
    ; expects one unsigned and one signed operand. So zero-extend the
    ; bytes to words and use pmaddwd instead.
    pxor        xmm2, xmm2
    movdqa      xmm0, xmm1
    punpcklbw   xmm0, xmm2
    punpckhbw   xmm1, xmm2
    pmaddwd     xmm0, xmm0
    pmaddwd     xmm1, xmm1
    paddd       xmm5, xmm0
    paddd       xmm5, xmm1

    sub         rsi,        1
    jnz         .accumulate

    ; phaddd only operates on adjacent double words.
    ; Finalize SAD and store
    movdqa      xmm0, xmm3
    psrldq      xmm0, 8
    paddusw     xmm0, xmm3
    paddd       xmm0, [GLOBAL(t128)]
    psrld       xmm0, 8
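    ; xmm0 now holds (SAD + 128) >> 8, the rounded mean absolute
    ; difference over the 256 pixels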

    mov         rax,  arg(5)
    movd        [rax], xmm0

    ; Accumulate sum of src2
    movdqa      xmm0, xmm4
    psrldq      xmm0, 8
    paddusw     xmm0, xmm4
    ; Square the sum of src2 and divide by the pixel count (256).
    ; The 64-bit product fits in 32 bits, so the high dword is ignored.
    pmuludq     xmm0, xmm0
    psrld       xmm0, 8

    ; phaddd could be used to sum adjacent double words, but we want all
    ; four summed. Promote the double words to quad words, accumulate,
    ; shift and sum.
    pxor        xmm2, xmm2
    movdqa      xmm1, xmm5
    punpckldq   xmm1, xmm2
    punpckhdq   xmm5, xmm2
    paddd       xmm1, xmm5
    movdqa      xmm2, xmm1
    psrldq      xmm1, 8
    paddd       xmm1, xmm2

    psubd       xmm1, xmm0
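    ; xmm1 = sum(src2^2) - sum(src2)^2 / 256, i.e. 256 times the variance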

    ; (variance + 128) >> 8
    paddd       xmm1, [GLOBAL(t128)]
    psrld       xmm1, 8
    mov         rax,  arg(4)

    movd        [rax], xmm1

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
align 16
t128:
%ifndef __NASM_VER__
    ddq 128
%elif CONFIG_BIG_ENDIAN
    dq  0, 128
%else
    dq  128, 0
%endif
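; 128 as a 128-bit constant: the rounding term added before the >> 8
; normalizations above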
align 16
tMFQE: ; 1 << MFQE_PRECISION
    times 8 dw 0x10
align 16
tMFQE_round: ; 1 << (MFQE_PRECISION - 1)
    times 8 dw 0x08
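
; MFQE_PRECISION is 4 here: tMFQE = 16 and tMFQE_round = 8, matching the
; psrlw-by-4 in the filter loops above.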