1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
12%include "vpx_ports/x86_abi_support.asm"
13
14SECTION .text
15
;------------------------------------------------------------------------
; void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q)
;
; Element-wise multiply of 16 consecutive 16-bit words:
;     dq[i] = low 16 bits of (sq[i] * q[i]),   i = 0 .. 15
; processed as four 8-byte groups (load from sq, pmullw with q, store to dq).
;
; Register use inside the body:
;     rsi = sq, rdi = dq   (both pushed in the prolog and popped in the epilog)
;     rax = q              (overwritten, not restored)
;     mm1 = scratch        (overwritten; no EMMS is executed in this function)
;------------------------------------------------------------------------
global sym(vp8_dequantize_b_impl_mmx) PRIVATE
sym(vp8_dequantize_b_impl_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    push        rsi
    push        rdi
    ; end prolog

        mov       rsi, arg(0)                   ; rsi = sq
        mov       rdi, arg(1)                   ; rdi = dq
        mov       rax, arg(2)                   ; rax = q

        ; Four groups of four words each, at byte offsets 0, 8, 16 and 24.
        ; Each group is fully stored before the next one is loaded.
%assign DEQ_B_OFS 0
%rep 4
        movq      mm1, [rsi + DEQ_B_OFS]        ; four words of sq
        pmullw    mm1, [rax + DEQ_B_OFS]        ; times four words of q, low halves kept
        movq      [rdi + DEQ_B_OFS], mm1        ; four words of dq
%assign DEQ_B_OFS DEQ_B_OFS + 8
%endrep

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
52
53
;------------------------------------------------------------------------
; void vp8_dequant_idct_add_mmx(
;     short         *input,     arg 0
;     short         *dq,        arg 1
;     unsigned char *dest,      arg 2
;     int            stride)    arg 3
;
; What the code does, in order:
;   1. Multiplies the 16 words at input by the 16 words at dq (pmullw,
;      low 16 bits of each product kept) into mm0..mm3, one row of four
;      words per register.
;   2. Overwrites the 32 bytes at input with zeros.
;   3. Runs a butterfly pass across the four registers, transposes the
;      4x4 block of words, then runs the same butterfly again; the second
;      pass adds 4 to every output and shifts right arithmetically by 3.
;      A second transpose puts the results back in row order.
;   4. For each of the four rows: loads 4 bytes from dest + row*stride,
;      widens them to words, adds the result row with signed saturation,
;      packs back to bytes with unsigned saturation and stores 4 bytes.
;
; Register use inside the body:
;     rax = input
;     rdx = dq, later dest (advanced by one stride before the last row)
;     rdi = stride, sign-extended (pushed in the prolog, popped in the epilog)
;     rbx is handed to GET_GOT / RESTORE_GOT (their effect is defined in
;         the included x86_abi_support.asm, not in this file)
;     mm0..mm7 are all overwritten; no EMMS is executed in this function.
;
; In the comments below, S is the net factor produced by
; "pmulhw x_s1sqr2 then add the original" and C the net factor produced by
; "pmulhw x_c1sqr2less1 then add the original".
;------------------------------------------------------------------------

; First half of one butterfly pass.
;   in      : mm0, mm1, mm2, mm3 = ip0, ip1, ip2, ip3 (four lanes each)
;   out     : mm0 = b1 = ip0 - ip2
;             mm2 = a1 = ip0 + ip2
;             mm7 = ip3*C - ip1*S
;             mm3 = d1 = ip1*C + ip3*S
;   scratch : mm4, mm5            (mm1 is left unchanged)
%macro VP8_DQIA_BUTTERFLY_TERMS 0
        psubw       mm0,            mm2                     ; b1 = ip0 - ip2
        paddw       mm2,            mm2                     ; 2 * ip2
        movq        mm5,            mm1
        paddw       mm2,            mm0                     ; a1 = ip0 + ip2
        pmulhw      mm5,            [GLOBAL(x_s1sqr2)]
        paddw       mm5,            mm1                     ; mm5 = ip1 * S
        movq        mm7,            mm3
        pmulhw      mm7,            [GLOBAL(x_c1sqr2less1)]
        paddw       mm7,            mm3                     ; mm7 = ip3 * C
        psubw       mm7,            mm5                     ; mm7 = ip3*C - ip1*S
        movq        mm5,            mm1
        movq        mm4,            mm3                     ; keep ip3 for the add-back
        pmulhw      mm5,            [GLOBAL(x_c1sqr2less1)]
        paddw       mm5,            mm1                     ; mm5 = ip1 * C
        pmulhw      mm3,            [GLOBAL(x_s1sqr2)]
        paddw       mm3,            mm4                     ; mm3 = ip3 * S
        paddw       mm3,            mm5                     ; d1 = ip1*C + ip3*S
%endmacro

; Second half of one butterfly pass.
;   in  : mm2 = a1, mm0 = b1, mm3 = d1, mm7 = ip3*C - ip1*S
;   out : mm2 = a1 + d1, mm0 = b1 - mm7, mm4 = b1 + mm7, mm6 = a1 - d1
; The transpose that follows consumes these as rows 0, 1, 2, 3 in the
; order mm2, mm0, mm4, mm6.
%macro VP8_DQIA_BUTTERFLY_OUTPUTS 0
        movq        mm6,            mm2                     ; copy of a1
        movq        mm4,            mm0                     ; copy of b1
        paddw       mm2,            mm3                     ; a1 + d1
        paddw       mm4,            mm7                     ; b1 + mm7
        psubw       mm0,            mm7                     ; b1 - mm7
        psubw       mm6,            mm3                     ; a1 - d1
%endmacro

; 4x4 transpose of 16-bit words.
;   in      : rows 0..3 in mm2, mm0, mm4, mm6
;   out     : transposed rows 0..3 in mm0, mm1, mm2, mm5
;   scratch : mm3
; Lane comments read high word first; "rc" means row r, column c of the input.
%macro VP8_DQIA_TRANSPOSE_4X4 0
        movq        mm1,            mm2                     ; 03 02 01 00
        movq        mm3,            mm4                     ; 23 22 21 20
        punpcklwd   mm1,            mm0                     ; 11 01 10 00
        punpckhwd   mm2,            mm0                     ; 13 03 12 02
        punpcklwd   mm3,            mm6                     ; 31 21 30 20
        punpckhwd   mm4,            mm6                     ; 33 23 32 22
        movq        mm0,            mm1                     ; 11 01 10 00
        movq        mm5,            mm2                     ; 13 03 12 02
        punpckldq   mm0,            mm3                     ; 30 20 10 00
        punpckhdq   mm1,            mm3                     ; 31 21 11 01
        punpckldq   mm2,            mm4                     ; 32 22 12 02
        punpckhdq   mm5,            mm4                     ; 33 23 13 03
%endmacro

; Add one row of four words to four bytes in memory and store the bytes back.
;   arg 1 : MMX register holding the four words (overwritten)
;   arg 2 : address expression of the 4-byte row
;   needs : mm7 = 0
;   scratch : mm4
%macro VP8_DQIA_ADD_ROW 2
        movd        mm4,            [%2]
        punpcklbw   mm4,            mm7                     ; bytes to words
        paddsw      %1,             mm4                     ; signed saturating add
        packuswb    %1,             mm7                     ; words to bytes, clamped to 0..255
        movd        [%2],           %1
%endmacro

global sym(vp8_dequant_idct_add_mmx) PRIVATE
sym(vp8_dequant_idct_add_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    GET_GOT     rbx
    push        rdi
    ; end prolog

        mov         rax,            arg(0)                  ; rax = input
        mov         rdx,            arg(1)                  ; rdx = dq

        ; Dequantize: row k of input times row k of dq, into mm0..mm3.
        movq        mm0,            [rax]
        pmullw      mm0,            [rdx]
        movq        mm1,            [rax+8]
        pmullw      mm1,            [rdx+8]
        movq        mm2,            [rax+16]
        pmullw      mm2,            [rdx+16]
        movq        mm3,            [rax+24]
        pmullw      mm3,            [rdx+24]

        mov         rdx,            arg(2)                  ; rdx = dest from here on

        ; Clear the 16 input words now that they are held in registers.
        pxor        mm7,            mm7
        movq        [rax],          mm7
        movq        [rax+8],        mm7
        movq        [rax+16],       mm7
        movq        [rax+24],       mm7

        movsxd      rdi,            dword ptr arg(3)        ; rdi = stride

        ; ---- first pass: no rounding, no shift ----
        VP8_DQIA_BUTTERFLY_TERMS
        VP8_DQIA_BUTTERFLY_OUTPUTS
        VP8_DQIA_TRANSPOSE_4X4
        movq        mm3,            mm5                     ; inputs now in mm0..mm3 again

        ; ---- second pass: +4 on a1 and b1 reaches all four outputs, then >> 3 ----
        VP8_DQIA_BUTTERFLY_TERMS
        paddw       mm0,            [GLOBAL(fours)]         ; b1 + 4
        paddw       mm2,            [GLOBAL(fours)]         ; a1 + 4
        VP8_DQIA_BUTTERFLY_OUTPUTS
        psraw       mm2,            3
        psraw       mm0,            3
        psraw       mm4,            3
        psraw       mm6,            3
        VP8_DQIA_TRANSPOSE_4X4                              ; rows 0..3 in mm0, mm1, mm2, mm5

        ; ---- add the four result rows to dest ----
        pxor        mm7,            mm7                     ; zero for unpack / pack

        VP8_DQIA_ADD_ROW mm0, rdx                           ; row 0: dest
        VP8_DQIA_ADD_ROW mm1, rdx+rdi                       ; row 1: dest + stride
        VP8_DQIA_ADD_ROW mm2, rdx+2*rdi                     ; row 2: dest + 2*stride

        add         rdx,            rdi                     ; rdx = dest + stride

        VP8_DQIA_ADD_ROW mm5, rdx+2*rdi                     ; row 3: dest + 3*stride

    ; begin epilog
    pop         rdi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
249
SECTION_RODATA
; Constants read by vp8_dequant_idct_add_mmx.  Each label is 16-byte aligned
; and holds one 16-bit value repeated four times (one MMX quadword).

; 0x8A8C = 35468.  Used with pmulhw, which treats it as a signed word, and
; the multiplicand is added back afterwards, so the net factor applied is
; 35468 / 65536.
align 16
x_s1sqr2:
    dw 0x8A8C, 0x8A8C, 0x8A8C, 0x8A8C

; 0x4E7B = 20091.  Used with pmulhw followed by adding the multiplicand
; back, so the net factor applied is 1 + 20091 / 65536.
align 16
x_c1sqr2less1:
    dw 0x4E7B, 0x4E7B, 0x4E7B, 0x4E7B

; The value 4 in every lane; added before the arithmetic shift right by 3.
align 16
fours:
    dw 0x0004, 0x0004, 0x0004, 0x0004
260