1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11
12%include "vpx_ports/x86_abi_support.asm"
13
14;void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch)
;
; Two-pass 4x4 integer transform (the "fdct4x4" of the name) on 16-bit
; samples, computed entirely in MMX registers.
;
;   input  - arg(0): four rows of 4 words each, read from input,
;            input + pitch, input + 2*pitch and input + 3*pitch
;            (pitch is added unscaled, i.e. it is a byte stride).
;   output - arg(1): 16 words written contiguously (four 8-byte stores at
;            byte offsets 0, 8, 16 and 24).
;   pitch  - arg(2): read as a 32-bit value and sign-extended.
;
; Registers touched: rax, rcx, rsi, rdi (rsi/rdi are pushed and popped here),
; mm0-mm6; rbx is handed to GET_GOT / RESTORE_GOT.
; No emms is executed in this function, so the MMX state is still live on
; return. NOTE(review): callers presumably clear it before any x87 use - confirm.
;
; sym(), PRIVATE, arg(), GLOBAL(), SHADOW_ARGS_TO_STACK, GET_GOT and friends
; are not defined in this file; presumably they come from the included
; x86_abi_support.asm.
15global sym(vp8_short_fdct4x4_mmx) PRIVATE
16sym(vp8_short_fdct4x4_mmx):
17    push        rbp
18    mov         rbp,        rsp
19    SHADOW_ARGS_TO_STACK 3
20    GET_GOT     rbx
21    push        rsi
22    push        rdi
23    ; end prolog
24
25        mov         rsi,        arg(0)      ; input
26        mov         rdi,        arg(1)      ; output
27
28        movsxd      rax,        dword ptr arg(2) ;pitch (sign-extended to 64 bits)
29
30        lea         rcx,        [rsi + rax*2] ; rcx = address of row 2
31        ; read the input data: one 4-word row per register
32        movq        mm0,        [rsi]         ; row 0
33        movq        mm1,        [rsi + rax]   ; row 1
34
35        movq        mm2,        [rcx]         ; row 2
36        movq        mm4,        [rcx + rax]   ; row 3
37
38        ; transpose for the first stage
39        movq        mm3,        mm0         ; 00 01 02 03
40        movq        mm5,        mm2         ; 20 21 22 23
41
42        punpcklwd   mm0,        mm1         ; 00 10 01 11
43        punpckhwd   mm3,        mm1         ; 02 12 03 13
44
45        punpcklwd   mm2,        mm4         ; 20 30 21 31
46        punpckhwd   mm5,        mm4         ; 22 32 23 33
47
48        movq        mm1,        mm0         ; 00 10 01 11
49        punpckldq   mm0,        mm2         ; 00 10 20 30
50
51        punpckhdq   mm1,        mm2         ; 01 11 21 31
52
53        movq        mm2,        mm3         ; 02 12 03 13
54        punpckldq   mm2,        mm5         ; 02 12 22 32
55
56        punpckhdq   mm3,        mm5         ; 03 13 23 33
57
58        ; mm0 = column 0 (one word per input row)
59        ; mm1 = column 1
60        ; mm2 = column 2
61        ; mm3 = column 3
62
63        ; first stage: each word lane processes one input row
64        movq        mm5,        mm0
65        movq        mm4,        mm1
66
67        paddw       mm0,        mm3         ; a1 = 0 + 3
68        paddw       mm1,        mm2         ; b1 = 1 + 2
69
70        psubw       mm4,        mm2         ; c1 = 1 - 2
71        psubw       mm5,        mm3         ; d1 = 0 - 3
72
73        psllw       mm5,        3           ; d1 <<= 3
74        psllw       mm4,        3           ; c1 <<= 3
75
76        psllw       mm0,        3           ; a1 <<= 3
77        psllw       mm1,        3           ; b1 <<= 3
78
79        ; output 0 and 2
80        movq        mm2,        mm0         ; a1
81
82        paddw       mm0,        mm1         ; op[0] = a1 + b1
83        psubw       mm2,        mm1         ; op[2] = a1 - b1
84
85        ; output 1 and 3
86        ; interleave c1, d1 so pmaddwd can form both products per lane pair
87        movq        mm1,        mm5         ; d1
88        punpcklwd   mm1,        mm4         ; d1 c1 d1 c1 (lanes 0 and 1)
89        punpckhwd   mm5,        mm4         ; d1 c1 d1 c1 (lanes 2 and 3)
90
91        movq        mm3,        mm1
92        movq        mm4,        mm5
93
94        pmaddwd     mm1,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
95        pmaddwd     mm4,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
96
97        pmaddwd     mm3,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
98        pmaddwd     mm5,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
99
100        paddd       mm1,        MMWORD PTR[GLOBAL(_14500)]         ; + 14500 before the >>12
101        paddd       mm4,        MMWORD PTR[GLOBAL(_14500)]         ; + 14500 before the >>12
102        paddd       mm3,        MMWORD PTR[GLOBAL(_7500)]          ; +  7500 before the >>12
103        paddd       mm5,        MMWORD PTR[GLOBAL(_7500)]          ; +  7500 before the >>12
104
105        psrad       mm1,        12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
106        psrad       mm4,        12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
107        psrad       mm3,        12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
108        psrad       mm5,        12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
109
110        packssdw    mm1,        mm4         ; op[1] (dwords packed to words, signed saturation)
111        packssdw    mm3,        mm5         ; op[3] (dwords packed to words, signed saturation)
112
113        ; done with the first stage: mm0..mm3 hold op[0]..op[3], one word per row
114        ; transpose for the second stage
115        movq        mm4,        mm0         ; 00 10 20 30
116        movq        mm5,        mm2         ; 02 12 22 32
117
118        punpcklwd   mm0,        mm1         ; 00 01 10 11
119        punpckhwd   mm4,        mm1         ; 20 21 30 31
120
121        punpcklwd   mm2,        mm3         ; 02 03 12 13
122        punpckhwd   mm5,        mm3         ; 22 23 32 33
123
124        movq        mm1,        mm0         ; 00 01 10 11
125        punpckldq   mm0,        mm2         ; 00 01 02 03
126
127        punpckhdq   mm1,        mm2         ; 10 11 12 13
128
129        movq        mm2,        mm4         ; 20 21 30 31
130        punpckldq   mm2,        mm5         ; 20 21 22 23
131
132        punpckhdq   mm4,        mm5         ; 30 31 32 33
133
134        ; mm0 = row 0 of the first-stage result
135        ; mm1 = row 1
136        ; mm2 = row 2
137        ; mm4 = row 3 (note: mm4, not mm3)
138
        ; second stage: each word lane processes one column
139        movq        mm5,        mm0
140        movq        mm3,        mm1
141
142        paddw       mm0,        mm4         ; a1 = 0 + 3
143        paddw       mm1,        mm2         ; b1 = 1 + 2
144
145        psubw       mm3,        mm2         ; c1 = 1 - 2
146        psubw       mm5,        mm4         ; d1 = 0 - 3
147
148        pxor        mm6,        mm6         ; zero out for compare
149
150        pcmpeqw     mm6,        mm5         ; mm6 = 0xFFFF in each word where d1 == 0
151
152        pandn       mm6,        MMWORD PTR[GLOBAL(_cmp_mask)]   ; mm6 = ~mm6 & 1:
153                                                                ; 1 where d1 != 0, else 0
154
155        ; output 0 and 2
156        movq        mm2,        mm0         ; a1
157
158        paddw       mm0,        mm1         ; a1 + b1
159        psubw       mm2,        mm1         ; a1 - b1
160
161        paddw       mm0,        MMWORD PTR[GLOBAL(_7w)]         ; + 7 rounding bias
162        paddw       mm2,        MMWORD PTR[GLOBAL(_7w)]         ; + 7 rounding bias
163
164        psraw       mm0,        4           ; op[0] = (a1 + b1 + 7)>>4
165        psraw       mm2,        4           ; op[8] = (a1 - b1 + 7)>>4
166
167        movq        MMWORD PTR[rdi + 0 ],  mm0  ; output words 0..3
168        movq        MMWORD PTR[rdi + 16],  mm2  ; output words 8..11
169
170        ; output 1 and 3
171        ; interleave c1, d1 so pmaddwd can form both products per lane pair
172        movq        mm1,        mm5         ; d1
173        punpcklwd   mm1,        mm3         ; d1 c1 d1 c1 (lanes 0 and 1)
174        punpckhwd   mm5,        mm3         ; d1 c1 d1 c1 (lanes 2 and 3)
175
176        movq        mm3,        mm1
177        movq        mm4,        mm5
178
179        pmaddwd     mm1,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
180        pmaddwd     mm4,        MMWORD PTR[GLOBAL (_5352_2217)]    ; c1*2217 + d1*5352
181
182        pmaddwd     mm3,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
183        pmaddwd     mm5,        MMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
184
185        paddd       mm1,        MMWORD PTR[GLOBAL(_12000)]         ; + 12000 before the >>16
186        paddd       mm4,        MMWORD PTR[GLOBAL(_12000)]         ; + 12000 before the >>16
187        paddd       mm3,        MMWORD PTR[GLOBAL(_51000)]         ; + 51000 before the >>16
188        paddd       mm5,        MMWORD PTR[GLOBAL(_51000)]         ; + 51000 before the >>16
189
190        psrad       mm1,        16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
191        psrad       mm4,        16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
192        psrad       mm3,        16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16
193        psrad       mm5,        16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16
194
195        packssdw    mm1,        mm4         ; op[4]
196        packssdw    mm3,        mm5         ; op[12]
197
198        paddw       mm1,        mm6         ; op[4] += (d1!=0)
199
200        movq        MMWORD PTR[rdi + 8 ],  mm1  ; output words 4..7
201        movq        MMWORD PTR[rdi + 24],  mm3  ; output words 12..15
202
203     ; begin epilog
204    pop         rdi
205    pop         rsi
206    RESTORE_GOT
207    UNSHADOW_ARGS
208    pop         rbp
209    ret
210
211SECTION_RODATA
; Constants read through GLOBAL() by vp8_short_fdct4x4_mmx.
; Each table is 8 bytes, matching the MMWORD memory operands that load it.
212align 8
; pmaddwd multiplier for interleaved (d1, c1) word pairs: d1*5352 + c1*2217
213_5352_2217:
214    dw 5352
215    dw 2217
216    dw 5352
217    dw 2217
218align 8
; pmaddwd multiplier for interleaved (d1, c1) word pairs: d1*2217 - c1*5352
219_2217_neg5352:
220    dw 2217
221    dw -5352
222    dw 2217
223    dw -5352
224align 8
; 1 in every word; pandn against the (d1 == 0) compare mask yields (d1 != 0)
225_cmp_mask:
226    times 4 dw 1
227align 8
; per-word bias added before the second-stage arithmetic shift right by 4
228_7w:
229    times 4 dw 7
230align 8
; first-stage dword bias for (c1*2217 + d1*5352) before the shift right by 12
231_14500:
232    times 2 dd 14500
233align 8
; first-stage dword bias for (d1*2217 - c1*5352) before the shift right by 12
234_7500:
235    times 2 dd 7500
236align 8
; second-stage dword bias for (c1*2217 + d1*5352) before the shift right by 16
237_12000:
238    times 2 dd 12000
239align 8
; second-stage dword bias for (d1*2217 - c1*5352) before the shift right by 16
240_51000:
241    times 2 dd 51000
242