1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
12%include "vpx_ports/x86_abi_support.asm"
13
; /****************************************************************************
; * Notes:
; *
; * This implementation uses 16-bit fixed-point versions of two multiply
; * constants:
; *        1.   sqrt(2) * cos (pi/8)
; *        2.   sqrt(2) * sin (pi/8)
; * Because the first constant is bigger than 1, to maintain the same 16-bit
; * fixed-point precision as the second one, we use the identity
; *        x * a = x + x * (a - 1)
; * so
; *        x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) * cos (pi/8) - 1).
; *
; * The 16-bit version of the second constant is 35468, which is bigger than
; * 32768, so in a signed 16-bit multiply it is treated as a negative number
; * (35468 - 65536). Adding x back to the result compensates for this:
; *        (x * (unsigned)35468) >> 16 = ((x * (signed)35468) >> 16) + x
; *
; **************************************************************************/
33
34SECTION .text
35
;-----------------------------------------------------------------------------
; Helper macros for vp8_short_idct4x4llm_mmx. All of them operate on fixed
; MMX registers; each register holds four signed 16-bit lanes.
;
; Shorthand used in the comments:
;   S = sqrt(2) * sin(pi/8)   (x_s1sqr2;      x*S = pmulhw(x, S16) + x)
;   C = sqrt(2) * cos(pi/8)   (x_c1sqr2less1; x*C = pmulhw(x, C16 - 1) + x)
;-----------------------------------------------------------------------------

; IDCT4_MMX_1D_TERMS
;   In:    mm0 = ip0, mm1 = ip1, mm2 = ip2, mm3 = ip3
;   Out:   mm2 = a1 = ip0 + ip2
;          mm0 = b1 = ip0 - ip2
;          mm7 = t  = ip3*C - ip1*S
;          mm3 = d1 = ip1*C + ip3*S
;   Clobb: mm4, mm5 (mm1 is left unchanged)
%macro IDCT4_MMX_1D_TERMS 0
    psubw       mm0,            mm2             ; b1 = ip0 - ip2
    paddw       mm2,            mm2             ; 2 * ip2

    movq        mm5,            mm1
    paddw       mm2,            mm0             ; a1 = b1 + 2*ip2 = ip0 + ip2

    pmulhw      mm5,            [GLOBAL(x_s1sqr2)]
    paddw       mm5,            mm1             ; ip1 * S

    movq        mm7,            mm3
    pmulhw      mm7,            [GLOBAL(x_c1sqr2less1)]

    paddw       mm7,            mm3             ; ip3 * C
    psubw       mm7,            mm5             ; t = ip3*C - ip1*S

    movq        mm5,            mm1
    movq        mm4,            mm3             ; keep ip3 for the "+ x" step

    pmulhw      mm5,            [GLOBAL(x_c1sqr2less1)]
    paddw       mm5,            mm1             ; ip1 * C

    pmulhw      mm3,            [GLOBAL(x_s1sqr2)]
    paddw       mm3,            mm4             ; ip3 * S

    paddw       mm3,            mm5             ; d1 = ip1*C + ip3*S
%endmacro

; IDCT4_MMX_1D_OUTPUTS
;   In:    mm2 = a1, mm0 = b1, mm7 = t, mm3 = d1
;   Out:   mm2 = out0 = a1 + d1
;          mm0 = out1 = b1 - t
;          mm4 = out2 = b1 + t
;          mm6 = out3 = a1 - d1
%macro IDCT4_MMX_1D_OUTPUTS 0
    movq        mm6,            mm2             ; a1
    movq        mm4,            mm0             ; b1

    paddw       mm2,            mm3             ; out0 = a1 + d1
    paddw       mm4,            mm7             ; out2 = b1 + t

    psubw       mm0,            mm7             ; out1 = b1 - t
    psubw       mm6,            mm3             ; out3 = a1 - d1
%endmacro

; IDCT4_MMX_TRANSPOSE
;   4x4 transpose of 16-bit lanes. Lane comments read high -> low as "rc"
;   (r = source vector index, c = lane index).
;   In:    mm2 = vector 0, mm0 = vector 1, mm4 = vector 2, mm6 = vector 3
;   Out:   mm0, mm1, mm2, mm5 = transposed vectors 0..3
;   Clobb: mm3, mm4
%macro IDCT4_MMX_TRANSPOSE 0
    movq        mm1,            mm2             ; 03 02 01 00
    movq        mm3,            mm4             ; 23 22 21 20

    punpcklwd   mm1,            mm0             ; 11 01 10 00
    punpckhwd   mm2,            mm0             ; 13 03 12 02

    punpcklwd   mm3,            mm6             ; 31 21 30 20
    punpckhwd   mm4,            mm6             ; 33 23 32 22

    movq        mm0,            mm1             ; 11 01 10 00
    movq        mm5,            mm2             ; 13 03 12 02

    punpckldq   mm0,            mm3             ; 30 20 10 00
    punpckhdq   mm1,            mm3             ; 31 21 11 01

    punpckldq   mm2,            mm4             ; 32 22 12 02
    punpckhdq   mm5,            mm4             ; 33 23 13 03
%endmacro

; IDCT4_MMX_RECON_ROW residual, pred_mem, dest_mem
;   Loads 4 prediction bytes from %2, widens them to words, adds the residual
;   words in %1 with signed saturation, packs back to bytes with unsigned
;   saturation and stores 4 bytes to %3.
;   Requires mm7 == 0. Clobbers mm4 and %1.
%macro IDCT4_MMX_RECON_ROW 3
    movd        mm4,            %2
    punpcklbw   mm4,            mm7             ; bytes -> words (zero-extend)
    paddsw      %1,             mm4             ; pred + residual
    packuswb    %1,             mm7             ; saturate to unsigned bytes
    movd        %3,             %1
%endmacro

;-----------------------------------------------------------------------------
; void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred,
;                               int pitch, unsigned char *dest, int stride)
;
; Inverse 4x4 transform of the 16 coefficients at input, added to a 4x4 block
; of prediction bytes and written to dest.
;
;   input   16 shorts (32 bytes are read; the buffer is not modified)
;   pred    4 rows of 4 bytes, consecutive rows pitch bytes apart
;   dest    4 rows of 4 bytes, consecutive rows stride bytes apart
;
; Steps: 1-D pass -> transpose -> 1-D pass with (+4) >> 3 rounding ->
; transpose -> per row: dest = saturate_u8(pred + residual).
;
; Register roles after argument loading:
;   rsi = pred, rax = pitch, rdx = dest, rdi = stride, mm0..mm7 = scratch
; rsi and rdi are saved and restored. Arguments are reached through the
; arg(n) macro provided by x86_abi_support.asm.
;
; NOTE(review): no emms is executed here, so MMX state is left active on
; return; callers are presumably expected to clear it -- confirm.
;-----------------------------------------------------------------------------
global sym(vp8_short_idct4x4llm_mmx) PRIVATE
sym(vp8_short_idct4x4llm_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rax,    arg(0)              ; input
    mov         rsi,    arg(1)              ; pred

    movq        mm0,    [rax   ]            ; input[ 0.. 3]
    movq        mm1,    [rax+ 8]            ; input[ 4.. 7]
    movq        mm2,    [rax+16]            ; input[ 8..11]
    movq        mm3,    [rax+24]            ; input[12..15]

    movsxd      rax,    dword ptr arg(2)    ; pitch  (rax no longer = input)
    mov         rdx,    arg(3)              ; dest
    movsxd      rdi,    dword ptr arg(4)    ; stride

    ; ---- first 1-D pass: lane i combines input[i], [4+i], [8+i], [12+i] ----
    IDCT4_MMX_1D_TERMS
    IDCT4_MMX_1D_OUTPUTS

    IDCT4_MMX_TRANSPOSE                         ; -> mm0, mm1, mm2, mm5
    movq        mm3,            mm5             ; ip3 for the second pass

    ; ---- second 1-D pass, with rounding ----
    IDCT4_MMX_1D_TERMS

    ; Adding 4 to a1 and b1 biases all four outputs by 4 ahead of the >> 3.
    paddw       mm0,            [GLOBAL(fours)]
    paddw       mm2,            [GLOBAL(fours)]

    IDCT4_MMX_1D_OUTPUTS

    psraw       mm2,            3
    psraw       mm0,            3
    psraw       mm4,            3
    psraw       mm6,            3

    IDCT4_MMX_TRANSPOSE                         ; residual rows 0..3 in
                                                ; mm0, mm1, mm2, mm5

    ; ---- add prediction and store, one 4-pixel row at a time ----
    pxor        mm7,            mm7             ; zero, for unpack/pack

    IDCT4_MMX_RECON_ROW mm0, [rsi],       [rdx]
    IDCT4_MMX_RECON_ROW mm1, [rsi+rax],   [rdx+rdi]
    IDCT4_MMX_RECON_ROW mm2, [rsi+2*rax], [rdx+rdi*2]

    ; Advance both pointers by one row so that "+ 2*stride" addresses row 3.
    add         rdx,            rdi
    add         rsi,            rax

    IDCT4_MMX_RECON_ROW mm5, [rsi+2*rax], [rdx+rdi*2]

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
221
;-----------------------------------------------------------------------------
; void vp8_dc_only_idct_add_mmx(short input_dc,
;                               unsigned char *pred_ptr, int pred_stride,
;                               unsigned char *dst_ptr,  int stride)
;
; Adds the single rounded value (input_dc + 4) >> 3 to every pixel of a 4x4
; block of prediction bytes and stores the result, clamped to 0..255, as a
; 4x4 block at dst_ptr.
;
;   pred_ptr  4 rows of 4 bytes, consecutive rows pred_stride bytes apart
;   dst_ptr   4 rows of 4 bytes, consecutive rows stride bytes apart
;
; All four prediction rows are loaded before any destination row is written.
; Uses rax, rcx, rdx and mm0..mm5 as scratch.
;
; NOTE(review): no emms is executed here, so MMX state is left active on
; return; callers are presumably expected to clear it -- confirm.
;-----------------------------------------------------------------------------
global sym(vp8_dc_only_idct_add_mmx) PRIVATE
sym(vp8_dc_only_idct_add_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    ; end prolog

    ; mm5 = (input_dc + 4) >> 3, replicated into all four 16-bit lanes.
    movd        mm5,            arg(0)              ; input_dc in the low word
    paddw       mm5,            [GLOBAL(fours)]
    psraw       mm5,            3
    punpcklwd   mm5,            mm5                 ; low word -> low dword
    punpckldq   mm5,            mm5                 ; low dword -> whole register

    pxor        mm0,            mm0                 ; zero, for unpack/pack

    ; Fetch the four prediction rows (4 bytes each).
    mov         rax,            arg(1)              ; pred_ptr
    movsxd      rdx,            dword ptr arg(2)    ; pred_stride
    lea         rcx,            [rdx + rdx*2]       ; 3 * pred_stride

    movd        mm1,            [rax]
    movd        mm2,            [rax+rdx]
    movd        mm3,            [rax+2*rdx]
    movd        mm4,            [rax+rcx]

    ; Re-point rax/rdx/rcx at the destination block.
    mov         rax,            arg(3)              ; dst_ptr
    movsxd      rdx,            dword ptr arg(4)    ; stride
    lea         rcx,            [rdx + rdx*2]       ; 3 * stride

    ; Widen each row's bytes to words.
    punpcklbw   mm1,            mm0
    punpcklbw   mm2,            mm0
    punpcklbw   mm3,            mm0
    punpcklbw   mm4,            mm0

    ; Add the DC term with signed saturation.
    paddsw      mm1,            mm5
    paddsw      mm2,            mm5
    paddsw      mm3,            mm5
    paddsw      mm4,            mm5

    ; Pack back to bytes; the unsigned saturation clamps to 0..255.
    packuswb    mm1,            mm0
    packuswb    mm2,            mm0
    packuswb    mm3,            mm0
    packuswb    mm4,            mm0

    ; Write the four reconstructed rows.
    movd        [rax],          mm1
    movd        [rax+rdx],      mm2
    movd        [rax+2*rdx],    mm3
    movd        [rax+rcx],      mm4

    ; begin epilog
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
286
SECTION_RODATA

; Fixed-point multiplier tables. Each value is repeated in all four 16-bit
; lanes so it can be used directly as a 64-bit MMX memory operand.

; sqrt(2) * sin(pi/8) in 16.16 fixed point: 0x8A8C = 35468.
; pmulhw reads it as a negative signed word, so users add x back afterwards.
align 16
x_s1sqr2:
    dw 0x8A8C, 0x8A8C, 0x8A8C, 0x8A8C

; (sqrt(2) * cos(pi/8)) - 1 in 16.16 fixed point: 0x4E7B = 20091.
; Users add x back afterwards to obtain x * sqrt(2) * cos(pi/8).
align 16
x_c1sqr2less1:
    dw 0x4E7B, 0x4E7B, 0x4E7B, 0x4E7B

; Rounding bias added before the arithmetic shift right by 3.
align 16
fours:
    dw 0x0004, 0x0004, 0x0004, 0x0004
297