1;
2; jfdctint.asm - accurate integer FDCT (MMX)
3;
4; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5;
6; Based on
7; x86 SIMD extension for IJG JPEG library
8; Copyright (C) 1999-2006, MIYASAKA Masaru.
9; For conditions of distribution and use, see copyright notice in jsimdext.inc
10;
11; This file should be assembled with NASM (Netwide Assembler),
12; can *not* be assembled with Microsoft's MASM or any compatible
13; assembler (including Borland's Turbo Assembler).
14; NASM is available from http://nasm.sourceforge.net/ or
15; http://sourceforge.net/project/showfiles.php?group_id=6208
16;
17; This file contains a slow-but-accurate integer implementation of the
18; forward DCT (Discrete Cosine Transform). The following code is based
19; directly on the IJG's original jfdctint.c; see the jfdctint.c for
20; more details.
21;
22; [TAB8]
23
24%include "jsimdext.inc"
25%include "jdct.inc"
26
27; --------------------------------------------------------------------------
28
; Fixed-point configuration.
;   CONST_BITS  - number of fraction bits in the F_x_xxx multipliers below
;                 (each one is the real coefficient times 2**CONST_BITS,
;                 rounded to an integer).
;   PASS1_BITS  - extra fraction bits left in the pass 1 (row) results; they
;                 are removed again when pass 2 (columns) descales.
29%define CONST_BITS      13
30%define PASS1_BITS      2
31
; Right-shift counts applied to the 32-bit products:
;   DESCALE_P1 is used in pass 1 (keeps PASS1_BITS extra bits),
;   DESCALE_P2 is used in pass 2 (drops CONST_BITS and the PASS1_BITS).
32%define DESCALE_P1      (CONST_BITS-PASS1_BITS)
33%define DESCALE_P2      (CONST_BITS+PASS1_BITS)
34
; Precomputed multipliers for the default CONST_BITS of 13.
35%if CONST_BITS == 13
36F_0_298 equ      2446           ; FIX(0.298631336)
37F_0_390 equ      3196           ; FIX(0.390180644)
38F_0_541 equ      4433           ; FIX(0.541196100)
39F_0_765 equ      6270           ; FIX(0.765366865)
40F_0_899 equ      7373           ; FIX(0.899976223)
41F_1_175 equ      9633           ; FIX(1.175875602)
42F_1_501 equ     12299           ; FIX(1.501321110)
43F_1_847 equ     15137           ; FIX(1.847759065)
44F_1_961 equ     16069           ; FIX(1.961570560)
45F_2_053 equ     16819           ; FIX(2.053119869)
46F_2_562 equ     20995           ; FIX(2.562915447)
47F_3_072 equ     25172           ; FIX(3.072711026)
48%else
49; NASM cannot do compile-time arithmetic on floating-point constants.
; For any other CONST_BITS the multipliers are derived from integer literals
; that hold the same coefficients scaled by 2**30; DESCALE(x,n) adds half of
; 2**n before shifting right by n, i.e. it rounds instead of truncating.
50%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
51F_0_298 equ     DESCALE( 320652955,30-CONST_BITS)       ; FIX(0.298631336)
52F_0_390 equ     DESCALE( 418953276,30-CONST_BITS)       ; FIX(0.390180644)
53F_0_541 equ     DESCALE( 581104887,30-CONST_BITS)       ; FIX(0.541196100)
54F_0_765 equ     DESCALE( 821806413,30-CONST_BITS)       ; FIX(0.765366865)
55F_0_899 equ     DESCALE( 966342111,30-CONST_BITS)       ; FIX(0.899976223)
56F_1_175 equ     DESCALE(1262586813,30-CONST_BITS)       ; FIX(1.175875602)
57F_1_501 equ     DESCALE(1612031267,30-CONST_BITS)       ; FIX(1.501321110)
58F_1_847 equ     DESCALE(1984016188,30-CONST_BITS)       ; FIX(1.847759065)
59F_1_961 equ     DESCALE(2106220350,30-CONST_BITS)       ; FIX(1.961570560)
60F_2_053 equ     DESCALE(2204520673,30-CONST_BITS)       ; FIX(2.053119869)
61F_2_562 equ     DESCALE(2751909506,30-CONST_BITS)       ; FIX(2.562915447)
62F_3_072 equ     DESCALE(3299298341,30-CONST_BITS)       ; FIX(3.072711026)
63%endif
64
65; --------------------------------------------------------------------------
; Read-only operand table for jsimd_fdct_islow_mmx, exported under the
; global label jconst_fdct_islow_mmx.
;
; Every entry is exactly one mmword (8 bytes):
;   PW_*  - "times 2 dw a, b" emits the four words (a, b, a, b).  Used as the
;           memory operand of pmaddwd against a register of interleaved
;           (x, y, x, y) words, it yields x*a + y*b in each dword lane.
;   PD_*  - two dwords.
; Entry names spell the two multipliers of the pair, with F = positive and
; MF = negative, e.g. PW_F130_F054 = (0.541+0.765, 0.541) ~ (1.30, 0.54).
66        SECTION SEG_CONST
67
68        alignz  16
69        global  EXTN(jconst_fdct_islow_mmx)
70
71EXTN(jconst_fdct_islow_mmx):
72
; Even part: (tmp13, tmp12) pairs -> data2 and data6.
73PW_F130_F054    times 2 dw  (F_0_541+F_0_765), F_0_541
74PW_F054_MF130   times 2 dw  F_0_541, (F_0_541-F_1_847)
; Odd part: (z3, z4) pairs -> rotated z3 and z4.
75PW_MF078_F117   times 2 dw  (F_1_175-F_1_961), F_1_175
76PW_F117_F078    times 2 dw  F_1_175, (F_1_175-F_0_390)
; Odd part: (tmp4, tmp7) pairs -> tmp4 and tmp7 terms.
77PW_MF060_MF089  times 2 dw  (F_0_298-F_0_899),-F_0_899
78PW_MF089_F060   times 2 dw -F_0_899, (F_1_501-F_0_899)
; Odd part: (tmp5, tmp6) pairs -> tmp5 and tmp6 terms.
79PW_MF050_MF256  times 2 dw  (F_2_053-F_2_562),-F_2_562
80PW_MF256_F050   times 2 dw -F_2_562, (F_3_072-F_2_562)
; Rounding biases: half of the divisor, added before the matching right shift
; (psrad by DESCALE_P1 / DESCALE_P2, psraw by PASS1_BITS respectively).
81PD_DESCALE_P1   times 2 dd  1 << (DESCALE_P1-1)
82PD_DESCALE_P2   times 2 dd  1 << (DESCALE_P2-1)
83PW_DESCALE_P2X  times 4 dw  1 << (PASS1_BITS-1)
84
85        alignz  16
86
87; --------------------------------------------------------------------------
88        SECTION SEG_TEXT
89        BITS    32
90;
91; Perform the forward DCT on one block of samples.
92;
93; GLOBAL(void)
94; jsimd_fdct_islow_mmx (DCTELEM * data)
95;
; In:     one stack argument, the block pointer, read at [entry frame + 8].
; Out:    none (declared void); the block is transformed in place.
; Writes: eax, ecx, edx, mm0-mm7, flags.  ebx is bracketed by the
;         pushpic/poppic macros (defined outside this file - presumably a
;         save/restore pair; confirm in jsimdext.inc).
; Stack:  WK_NUM mmword spill slots, wk(0)..wk(WK_NUM-1), located just below
;         an ebp that has been aligned down to SIZEOF_MMWORD.
; MMX:    emms is executed before returning.
;
; Structure: two passes of the same 1-D transform.  Each pass iteration
; transposes a strip of the block in registers so that four 1-D transforms
; are evaluated side by side, one per 16-bit lane.
96
; data(b): address of the argument relative to b, where b points at the saved
; ebp ([b] = saved ebp, [b+4] = return address, [b+8] = first argument).
97%define data(b)         (b)+8           ; DCTELEM * data
98
; original_ebp: slot at [ebp] where the prologue stores the entry-time stack
; pointer.  wk(i): i-th spill slot; wk(0) is the lowest address.
99%define original_ebp    ebp+0
100%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
101%define WK_NUM          2
102
103        align   16
104        global  EXTN(jsimd_fdct_islow_mmx)
105
106EXTN(jsimd_fdct_islow_mmx):
        ; Prologue: build an aligned frame.  eax receives esp right after the
        ; push, i.e. the address of the saved ebp; it is stored at [ebp] for
        ; the epilogue and is also the base used by data(eax) in both passes,
        ; so eax must stay untouched until pass 2 has reloaded the pointer.
107        push    ebp
108        mov     eax,esp                         ; eax = original ebp
109        sub     esp, byte 4
110        and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
111        mov     [esp],eax
112        mov     ebp,esp                         ; ebp = aligned ebp
        ; Reserve the wk[] slots: esp = &wk(0), so wk lies in [esp, ebp).
113        lea     esp, [wk(0)]
114        pushpic ebx
115;       push    ecx             ; need not be preserved
116;       push    edx             ; need not be preserved
117;       push    esi             ; unused
118;       push    edi             ; unused
119
        ; ebx becomes the base for every GOTOFF(ebx,...) constant reference
        ; below (macro from jsimdext.inc; assumed to modify only ebx - confirm).
120        get_GOT ebx             ; get GOT address
121
122        ; ---- Pass 1: process rows.
        ;
        ; DCTSIZE/4 iterations, each covering four rows of the block.  The
        ; strip is loaded as eight mmwords (rows 0-3, two mmwords per row)
        ; and transposed with punpck*wd / punpck*dq so that "dataN" holds
        ; element N of all four rows.  The outputs are stored back in that
        ; transposed arrangement and are left scaled up by 2**PASS1_BITS.
        ; In the lane diagrams "rc" denotes row r, column c of the strip.
123
124        mov     edx, POINTER [data(eax)]        ; (DCTELEM *)
125        mov     ecx, DCTSIZE/4
126        alignx  16,7
127.rowloop:
128
129        movq    mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
130        movq    mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
131        movq    mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)]
132        movq    mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)]
133
134        ; mm0=(20 21 22 23), mm2=(24 25 26 27)
135        ; mm1=(30 31 32 33), mm3=(34 35 36 37)
136
137        movq      mm4,mm0               ; transpose coefficients(phase 1)
138        punpcklwd mm0,mm1               ; mm0=(20 30 21 31)
139        punpckhwd mm4,mm1               ; mm4=(22 32 23 33)
140        movq      mm5,mm2               ; transpose coefficients(phase 1)
141        punpcklwd mm2,mm3               ; mm2=(24 34 25 35)
142        punpckhwd mm5,mm3               ; mm5=(26 36 27 37)
143
144        movq    mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
145        movq    mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
146        movq    mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)]
147        movq    mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)]
148
149        ; mm6=(00 01 02 03), mm1=(04 05 06 07)
150        ; mm7=(10 11 12 13), mm3=(14 15 16 17)
151
        ; All eight MMX registers are live: spill two half-transposed
        ; registers so mm4/mm2 can be reused for rows 0-1.
152        movq    MMWORD [wk(0)], mm4     ; wk(0)=(22 32 23 33)
153        movq    MMWORD [wk(1)], mm2     ; wk(1)=(24 34 25 35)
154
155        movq      mm4,mm6               ; transpose coefficients(phase 1)
156        punpcklwd mm6,mm7               ; mm6=(00 10 01 11)
157        punpckhwd mm4,mm7               ; mm4=(02 12 03 13)
158        movq      mm2,mm1               ; transpose coefficients(phase 1)
159        punpcklwd mm1,mm3               ; mm1=(04 14 05 15)
160        punpckhwd mm2,mm3               ; mm2=(06 16 07 17)
161
162        movq      mm7,mm6               ; transpose coefficients(phase 2)
163        punpckldq mm6,mm0               ; mm6=(00 10 20 30)=data0
164        punpckhdq mm7,mm0               ; mm7=(01 11 21 31)=data1
165        movq      mm3,mm2               ; transpose coefficients(phase 2)
166        punpckldq mm2,mm5               ; mm2=(06 16 26 36)=data6
167        punpckhdq mm3,mm5               ; mm3=(07 17 27 37)=data7
168
        ; First butterfly stage on the outer pairs (0,7) and (1,6).
169        movq    mm0,mm7
170        movq    mm5,mm6
171        psubw   mm7,mm2                 ; mm7=data1-data6=tmp6
172        psubw   mm6,mm3                 ; mm6=data0-data7=tmp7
173        paddw   mm0,mm2                 ; mm0=data1+data6=tmp1
174        paddw   mm5,mm3                 ; mm5=data0+data7=tmp0
175
        ; Swap spill contents: fetch the pending transpose halves and park
        ; tmp6/tmp7 in the same slots until the odd part needs them.
176        movq    mm2, MMWORD [wk(0)]     ; mm2=(22 32 23 33)
177        movq    mm3, MMWORD [wk(1)]     ; mm3=(24 34 25 35)
178        movq    MMWORD [wk(0)], mm7     ; wk(0)=tmp6
179        movq    MMWORD [wk(1)], mm6     ; wk(1)=tmp7
180
181        movq      mm7,mm4               ; transpose coefficients(phase 2)
182        punpckldq mm4,mm2               ; mm4=(02 12 22 32)=data2
183        punpckhdq mm7,mm2               ; mm7=(03 13 23 33)=data3
184        movq      mm6,mm1               ; transpose coefficients(phase 2)
185        punpckldq mm1,mm3               ; mm1=(04 14 24 34)=data4
186        punpckhdq mm6,mm3               ; mm6=(05 15 25 35)=data5
187
        ; First butterfly stage on the inner pairs (3,4) and (2,5).
188        movq    mm2,mm7
189        movq    mm3,mm4
190        paddw   mm7,mm1                 ; mm7=data3+data4=tmp3
191        paddw   mm4,mm6                 ; mm4=data2+data5=tmp2
192        psubw   mm2,mm1                 ; mm2=data3-data4=tmp4
193        psubw   mm3,mm6                 ; mm3=data2-data5=tmp5
194
195        ; -- Even part
        ; Inputs: mm5=tmp0, mm0=tmp1, mm4=tmp2, mm7=tmp3.
        ; mm2=tmp4 and mm3=tmp5 must survive untouched until the odd part.
196
197        movq    mm1,mm5
198        movq    mm6,mm0
199        paddw   mm5,mm7                 ; mm5=tmp10
200        paddw   mm0,mm4                 ; mm0=tmp11
201        psubw   mm1,mm7                 ; mm1=tmp13
202        psubw   mm6,mm4                 ; mm6=tmp12
203
204        movq    mm7,mm5
205        paddw   mm5,mm0                 ; mm5=tmp10+tmp11
206        psubw   mm7,mm0                 ; mm7=tmp10-tmp11
207
        ; data0/data4 involve no multiplication, so they are simply shifted
        ; left to carry the same 2**PASS1_BITS scale as the other outputs.
208        psllw   mm5,PASS1_BITS          ; mm5=data0
209        psllw   mm7,PASS1_BITS          ; mm7=data4
210
211        movq    MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5
212        movq    MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm7
213
214        ; (Original)
215        ; z1 = (tmp12 + tmp13) * 0.541196100;
216        ; data2 = z1 + tmp13 * 0.765366865;
217        ; data6 = z1 + tmp12 * -1.847759065;
218        ;
219        ; (This implementation)
220        ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
221        ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
222
        ; Interleave (tmp13,tmp12) word pairs so that one pmaddwd computes
        ; both products and their sum as a 32-bit value per pair.  The L/H
        ; suffixes name the results for the low / high two lanes.
223        movq      mm4,mm1               ; mm1=tmp13
224        movq      mm0,mm1
225        punpcklwd mm4,mm6               ; mm6=tmp12
226        punpckhwd mm0,mm6
227        movq      mm1,mm4
228        movq      mm6,mm0
229        pmaddwd   mm4,[GOTOFF(ebx,PW_F130_F054)]        ; mm4=data2L
230        pmaddwd   mm0,[GOTOFF(ebx,PW_F130_F054)]        ; mm0=data2H
231        pmaddwd   mm1,[GOTOFF(ebx,PW_F054_MF130)]       ; mm1=data6L
232        pmaddwd   mm6,[GOTOFF(ebx,PW_F054_MF130)]       ; mm6=data6H
233
        ; Rounded descale: add half of 2**DESCALE_P1, arithmetic shift right.
234        paddd   mm4,[GOTOFF(ebx,PD_DESCALE_P1)]
235        paddd   mm0,[GOTOFF(ebx,PD_DESCALE_P1)]
236        psrad   mm4,DESCALE_P1
237        psrad   mm0,DESCALE_P1
238        paddd   mm1,[GOTOFF(ebx,PD_DESCALE_P1)]
239        paddd   mm6,[GOTOFF(ebx,PD_DESCALE_P1)]
240        psrad   mm1,DESCALE_P1
241        psrad   mm6,DESCALE_P1
242
        ; Narrow the dword results back to four words (signed saturation).
243        packssdw  mm4,mm0               ; mm4=data2
244        packssdw  mm1,mm6               ; mm1=data6
245
246        movq    MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
247        movq    MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm1
248
249        ; -- Odd part
        ; Inputs: mm2=tmp4, mm3=tmp5 (still live), tmp6/tmp7 from the spills.
250
251        movq    mm5, MMWORD [wk(0)]     ; mm5=tmp6
252        movq    mm7, MMWORD [wk(1)]     ; mm7=tmp7
253
254        movq    mm0,mm2                 ; mm2=tmp4
255        movq    mm6,mm3                 ; mm3=tmp5
256        paddw   mm0,mm5                 ; mm0=z3
257        paddw   mm6,mm7                 ; mm6=z4
258
259        ; (Original)
260        ; z5 = (z3 + z4) * 1.175875602;
261        ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
262        ; z3 += z5;  z4 += z5;
263        ;
264        ; (This implementation)
265        ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
266        ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
267
268        movq      mm4,mm0
269        movq      mm1,mm0
270        punpcklwd mm4,mm6
271        punpckhwd mm1,mm6
272        movq      mm0,mm4
273        movq      mm6,mm1
274        pmaddwd   mm4,[GOTOFF(ebx,PW_MF078_F117)]       ; mm4=z3L
275        pmaddwd   mm1,[GOTOFF(ebx,PW_MF078_F117)]       ; mm1=z3H
276        pmaddwd   mm0,[GOTOFF(ebx,PW_F117_F078)]        ; mm0=z4L
277        pmaddwd   mm6,[GOTOFF(ebx,PW_F117_F078)]        ; mm6=z4H
278
        ; z3 (32-bit, not yet descaled) goes to the spill slots; z4 stays in
        ; mm0/mm6.  Both are added twice below (data7/data3 and data1/data5).
279        movq    MMWORD [wk(0)], mm4     ; wk(0)=z3L
280        movq    MMWORD [wk(1)], mm1     ; wk(1)=z3H
281
282        ; (Original)
283        ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
284        ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
285        ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
286        ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
287        ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
288        ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
289        ;
290        ; (This implementation)
291        ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
292        ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
293        ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
294        ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
295        ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
296        ; data3 = tmp6 + z3;  data1 = tmp7 + z4;
297
        ; (tmp4,tmp7) pairs -> data7 and data1.
298        movq      mm4,mm2
299        movq      mm1,mm2
300        punpcklwd mm4,mm7
301        punpckhwd mm1,mm7
302        movq      mm2,mm4
303        movq      mm7,mm1
304        pmaddwd   mm4,[GOTOFF(ebx,PW_MF060_MF089)]      ; mm4=tmp4L
305        pmaddwd   mm1,[GOTOFF(ebx,PW_MF060_MF089)]      ; mm1=tmp4H
306        pmaddwd   mm2,[GOTOFF(ebx,PW_MF089_F060)]       ; mm2=tmp7L
307        pmaddwd   mm7,[GOTOFF(ebx,PW_MF089_F060)]       ; mm7=tmp7H
308
309        paddd   mm4, MMWORD [wk(0)]     ; mm4=data7L
310        paddd   mm1, MMWORD [wk(1)]     ; mm1=data7H
311        paddd   mm2,mm0                 ; mm2=data1L
312        paddd   mm7,mm6                 ; mm7=data1H
313
314        paddd   mm4,[GOTOFF(ebx,PD_DESCALE_P1)]
315        paddd   mm1,[GOTOFF(ebx,PD_DESCALE_P1)]
316        psrad   mm4,DESCALE_P1
317        psrad   mm1,DESCALE_P1
318        paddd   mm2,[GOTOFF(ebx,PD_DESCALE_P1)]
319        paddd   mm7,[GOTOFF(ebx,PD_DESCALE_P1)]
320        psrad   mm2,DESCALE_P1
321        psrad   mm7,DESCALE_P1
322
323        packssdw  mm4,mm1               ; mm4=data7
324        packssdw  mm2,mm7               ; mm2=data1
325
326        movq    MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm4
327        movq    MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2
328
        ; (tmp5,tmp6) pairs -> data5 and data3.  mm3=tmp5, mm5=tmp6.
329        movq      mm1,mm3
330        movq      mm7,mm3
331        punpcklwd mm1,mm5
332        punpckhwd mm7,mm5
333        movq      mm3,mm1
334        movq      mm5,mm7
335        pmaddwd   mm1,[GOTOFF(ebx,PW_MF050_MF256)]      ; mm1=tmp5L
336        pmaddwd   mm7,[GOTOFF(ebx,PW_MF050_MF256)]      ; mm7=tmp5H
337        pmaddwd   mm3,[GOTOFF(ebx,PW_MF256_F050)]       ; mm3=tmp6L
338        pmaddwd   mm5,[GOTOFF(ebx,PW_MF256_F050)]       ; mm5=tmp6H
339
340        paddd   mm1,mm0                 ; mm1=data5L
341        paddd   mm7,mm6                 ; mm7=data5H
342        paddd   mm3, MMWORD [wk(0)]     ; mm3=data3L
343        paddd   mm5, MMWORD [wk(1)]     ; mm5=data3H
344
345        paddd   mm1,[GOTOFF(ebx,PD_DESCALE_P1)]
346        paddd   mm7,[GOTOFF(ebx,PD_DESCALE_P1)]
347        psrad   mm1,DESCALE_P1
348        psrad   mm7,DESCALE_P1
349        paddd   mm3,[GOTOFF(ebx,PD_DESCALE_P1)]
350        paddd   mm5,[GOTOFF(ebx,PD_DESCALE_P1)]
351        psrad   mm3,DESCALE_P1
352        psrad   mm5,DESCALE_P1
353
354        packssdw  mm1,mm7               ; mm1=data5
355        packssdw  mm3,mm5               ; mm3=data3
356
357        movq    MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm1
358        movq    MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3
359
        ; Advance to the next strip of four rows.
360        add     edx, byte 4*DCTSIZE*SIZEOF_DCTELEM
361        dec     ecx
362        jnz     near .rowloop
363
364        ; ---- Pass 2: process columns.
        ;
        ; DCTSIZE/4 iterations, each covering one mmword (four DCTELEMs) from
        ; each of the eight rows.  The same transpose + 1-D transform is
        ; applied to what pass 1 stored; this time output N is written to
        ; row N, and the results are descaled by DESCALE_P2 (PASS1_BITS for
        ; data0/data4), which removes the scale-up left by pass 1.
        ; The block pointer is re-read through eax, which still holds the
        ; frame base set up in the prologue.
365
366        mov     edx, POINTER [data(eax)]        ; (DCTELEM *)
367        mov     ecx, DCTSIZE/4
368        alignx  16,7
369.columnloop:
370
371        movq    mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
372        movq    mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
373        movq    mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
374        movq    mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
375
376        ; mm0=(02 12 22 32), mm2=(42 52 62 72)
377        ; mm1=(03 13 23 33), mm3=(43 53 63 73)
378
379        movq      mm4,mm0               ; transpose coefficients(phase 1)
380        punpcklwd mm0,mm1               ; mm0=(02 03 12 13)
381        punpckhwd mm4,mm1               ; mm4=(22 23 32 33)
382        movq      mm5,mm2               ; transpose coefficients(phase 1)
383        punpcklwd mm2,mm3               ; mm2=(42 43 52 53)
384        punpckhwd mm5,mm3               ; mm5=(62 63 72 73)
385
386        movq    mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
387        movq    mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
388        movq    mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
389        movq    mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
390
391        ; mm6=(00 10 20 30), mm1=(40 50 60 70)
392        ; mm7=(01 11 21 31), mm3=(41 51 61 71)
393
        ; Same register-pressure spill as in pass 1.
394        movq    MMWORD [wk(0)], mm4     ; wk(0)=(22 23 32 33)
395        movq    MMWORD [wk(1)], mm2     ; wk(1)=(42 43 52 53)
396
397        movq      mm4,mm6               ; transpose coefficients(phase 1)
398        punpcklwd mm6,mm7               ; mm6=(00 01 10 11)
399        punpckhwd mm4,mm7               ; mm4=(20 21 30 31)
400        movq      mm2,mm1               ; transpose coefficients(phase 1)
401        punpcklwd mm1,mm3               ; mm1=(40 41 50 51)
402        punpckhwd mm2,mm3               ; mm2=(60 61 70 71)
403
404        movq      mm7,mm6               ; transpose coefficients(phase 2)
405        punpckldq mm6,mm0               ; mm6=(00 01 02 03)=data0
406        punpckhdq mm7,mm0               ; mm7=(10 11 12 13)=data1
407        movq      mm3,mm2               ; transpose coefficients(phase 2)
408        punpckldq mm2,mm5               ; mm2=(60 61 62 63)=data6
409        punpckhdq mm3,mm5               ; mm3=(70 71 72 73)=data7
410
        ; First butterfly stage on the outer pairs (0,7) and (1,6).
411        movq    mm0,mm7
412        movq    mm5,mm6
413        psubw   mm7,mm2                 ; mm7=data1-data6=tmp6
414        psubw   mm6,mm3                 ; mm6=data0-data7=tmp7
415        paddw   mm0,mm2                 ; mm0=data1+data6=tmp1
416        paddw   mm5,mm3                 ; mm5=data0+data7=tmp0
417
        ; Swap spill contents: pending transpose halves out, tmp6/tmp7 in.
418        movq    mm2, MMWORD [wk(0)]     ; mm2=(22 23 32 33)
419        movq    mm3, MMWORD [wk(1)]     ; mm3=(42 43 52 53)
420        movq    MMWORD [wk(0)], mm7     ; wk(0)=tmp6
421        movq    MMWORD [wk(1)], mm6     ; wk(1)=tmp7
422
423        movq      mm7,mm4               ; transpose coefficients(phase 2)
424        punpckldq mm4,mm2               ; mm4=(20 21 22 23)=data2
425        punpckhdq mm7,mm2               ; mm7=(30 31 32 33)=data3
426        movq      mm6,mm1               ; transpose coefficients(phase 2)
427        punpckldq mm1,mm3               ; mm1=(40 41 42 43)=data4
428        punpckhdq mm6,mm3               ; mm6=(50 51 52 53)=data5
429
        ; First butterfly stage on the inner pairs (3,4) and (2,5).
430        movq    mm2,mm7
431        movq    mm3,mm4
432        paddw   mm7,mm1                 ; mm7=data3+data4=tmp3
433        paddw   mm4,mm6                 ; mm4=data2+data5=tmp2
434        psubw   mm2,mm1                 ; mm2=data3-data4=tmp4
435        psubw   mm3,mm6                 ; mm3=data2-data5=tmp5
436
437        ; -- Even part
        ; Inputs: mm5=tmp0, mm0=tmp1, mm4=tmp2, mm7=tmp3.
        ; mm2=tmp4 and mm3=tmp5 must survive untouched until the odd part.
438
439        movq    mm1,mm5
440        movq    mm6,mm0
441        paddw   mm5,mm7                 ; mm5=tmp10
442        paddw   mm0,mm4                 ; mm0=tmp11
443        psubw   mm1,mm7                 ; mm1=tmp13
444        psubw   mm6,mm4                 ; mm6=tmp12
445
446        movq    mm7,mm5
447        paddw   mm5,mm0                 ; mm5=tmp10+tmp11
448        psubw   mm7,mm0                 ; mm7=tmp10-tmp11
449
        ; data0/data4 only carry the pass 1 scale, so a rounded 16-bit shift
        ; by PASS1_BITS is enough (bias = half of 2**PASS1_BITS per word).
450        paddw   mm5,[GOTOFF(ebx,PW_DESCALE_P2X)]
451        paddw   mm7,[GOTOFF(ebx,PW_DESCALE_P2X)]
452        psraw   mm5,PASS1_BITS          ; mm5=data0
453        psraw   mm7,PASS1_BITS          ; mm7=data4
454
455        movq    MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm5
456        movq    MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm7
457
458        ; (Original)
459        ; z1 = (tmp12 + tmp13) * 0.541196100;
460        ; data2 = z1 + tmp13 * 0.765366865;
461        ; data6 = z1 + tmp12 * -1.847759065;
462        ;
463        ; (This implementation)
464        ; data2 = tmp13 * (0.541196100 + 0.765366865) + tmp12 * 0.541196100;
465        ; data6 = tmp13 * 0.541196100 + tmp12 * (0.541196100 - 1.847759065);
466
        ; Interleave (tmp13,tmp12) word pairs for pmaddwd, as in pass 1.
467        movq      mm4,mm1               ; mm1=tmp13
468        movq      mm0,mm1
469        punpcklwd mm4,mm6               ; mm6=tmp12
470        punpckhwd mm0,mm6
471        movq      mm1,mm4
472        movq      mm6,mm0
473        pmaddwd   mm4,[GOTOFF(ebx,PW_F130_F054)]        ; mm4=data2L
474        pmaddwd   mm0,[GOTOFF(ebx,PW_F130_F054)]        ; mm0=data2H
475        pmaddwd   mm1,[GOTOFF(ebx,PW_F054_MF130)]       ; mm1=data6L
476        pmaddwd   mm6,[GOTOFF(ebx,PW_F054_MF130)]       ; mm6=data6H
477
        ; Rounded descale: add half of 2**DESCALE_P2, arithmetic shift right.
478        paddd   mm4,[GOTOFF(ebx,PD_DESCALE_P2)]
479        paddd   mm0,[GOTOFF(ebx,PD_DESCALE_P2)]
480        psrad   mm4,DESCALE_P2
481        psrad   mm0,DESCALE_P2
482        paddd   mm1,[GOTOFF(ebx,PD_DESCALE_P2)]
483        paddd   mm6,[GOTOFF(ebx,PD_DESCALE_P2)]
484        psrad   mm1,DESCALE_P2
485        psrad   mm6,DESCALE_P2
486
487        packssdw  mm4,mm0               ; mm4=data2
488        packssdw  mm1,mm6               ; mm1=data6
489
490        movq    MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
491        movq    MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm1
492
493        ; -- Odd part
        ; Inputs: mm2=tmp4, mm3=tmp5 (still live), tmp6/tmp7 from the spills.
494
495        movq    mm5, MMWORD [wk(0)]     ; mm5=tmp6
496        movq    mm7, MMWORD [wk(1)]     ; mm7=tmp7
497
498        movq    mm0,mm2                 ; mm2=tmp4
499        movq    mm6,mm3                 ; mm3=tmp5
500        paddw   mm0,mm5                 ; mm0=z3
501        paddw   mm6,mm7                 ; mm6=z4
502
503        ; (Original)
504        ; z5 = (z3 + z4) * 1.175875602;
505        ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
506        ; z3 += z5;  z4 += z5;
507        ;
508        ; (This implementation)
509        ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
510        ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
511
512        movq      mm4,mm0
513        movq      mm1,mm0
514        punpcklwd mm4,mm6
515        punpckhwd mm1,mm6
516        movq      mm0,mm4
517        movq      mm6,mm1
518        pmaddwd   mm4,[GOTOFF(ebx,PW_MF078_F117)]       ; mm4=z3L
519        pmaddwd   mm1,[GOTOFF(ebx,PW_MF078_F117)]       ; mm1=z3H
520        pmaddwd   mm0,[GOTOFF(ebx,PW_F117_F078)]        ; mm0=z4L
521        pmaddwd   mm6,[GOTOFF(ebx,PW_F117_F078)]        ; mm6=z4H
522
        ; z3 (32-bit, not yet descaled) goes to the spill slots; z4 stays in
        ; mm0/mm6.  Both are added twice below (data7/data3 and data1/data5).
523        movq    MMWORD [wk(0)], mm4     ; wk(0)=z3L
524        movq    MMWORD [wk(1)], mm1     ; wk(1)=z3H
525
526        ; (Original)
527        ; z1 = tmp4 + tmp7;  z2 = tmp5 + tmp6;
528        ; tmp4 = tmp4 * 0.298631336;  tmp5 = tmp5 * 2.053119869;
529        ; tmp6 = tmp6 * 3.072711026;  tmp7 = tmp7 * 1.501321110;
530        ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
531        ; data7 = tmp4 + z1 + z3;  data5 = tmp5 + z2 + z4;
532        ; data3 = tmp6 + z2 + z3;  data1 = tmp7 + z1 + z4;
533        ;
534        ; (This implementation)
535        ; tmp4 = tmp4 * (0.298631336 - 0.899976223) + tmp7 * -0.899976223;
536        ; tmp5 = tmp5 * (2.053119869 - 2.562915447) + tmp6 * -2.562915447;
537        ; tmp6 = tmp5 * -2.562915447 + tmp6 * (3.072711026 - 2.562915447);
538        ; tmp7 = tmp4 * -0.899976223 + tmp7 * (1.501321110 - 0.899976223);
539        ; data7 = tmp4 + z3;  data5 = tmp5 + z4;
540        ; data3 = tmp6 + z3;  data1 = tmp7 + z4;
541
        ; (tmp4,tmp7) pairs -> data7 and data1.
542        movq      mm4,mm2
543        movq      mm1,mm2
544        punpcklwd mm4,mm7
545        punpckhwd mm1,mm7
546        movq      mm2,mm4
547        movq      mm7,mm1
548        pmaddwd   mm4,[GOTOFF(ebx,PW_MF060_MF089)]      ; mm4=tmp4L
549        pmaddwd   mm1,[GOTOFF(ebx,PW_MF060_MF089)]      ; mm1=tmp4H
550        pmaddwd   mm2,[GOTOFF(ebx,PW_MF089_F060)]       ; mm2=tmp7L
551        pmaddwd   mm7,[GOTOFF(ebx,PW_MF089_F060)]       ; mm7=tmp7H
552
553        paddd   mm4, MMWORD [wk(0)]     ; mm4=data7L
554        paddd   mm1, MMWORD [wk(1)]     ; mm1=data7H
555        paddd   mm2,mm0                 ; mm2=data1L
556        paddd   mm7,mm6                 ; mm7=data1H
557
558        paddd   mm4,[GOTOFF(ebx,PD_DESCALE_P2)]
559        paddd   mm1,[GOTOFF(ebx,PD_DESCALE_P2)]
560        psrad   mm4,DESCALE_P2
561        psrad   mm1,DESCALE_P2
562        paddd   mm2,[GOTOFF(ebx,PD_DESCALE_P2)]
563        paddd   mm7,[GOTOFF(ebx,PD_DESCALE_P2)]
564        psrad   mm2,DESCALE_P2
565        psrad   mm7,DESCALE_P2
566
567        packssdw  mm4,mm1               ; mm4=data7
568        packssdw  mm2,mm7               ; mm2=data1
569
570        movq    MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm4
571        movq    MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm2
572
        ; (tmp5,tmp6) pairs -> data5 and data3.  mm3=tmp5, mm5=tmp6.
573        movq      mm1,mm3
574        movq      mm7,mm3
575        punpcklwd mm1,mm5
576        punpckhwd mm7,mm5
577        movq      mm3,mm1
578        movq      mm5,mm7
579        pmaddwd   mm1,[GOTOFF(ebx,PW_MF050_MF256)]      ; mm1=tmp5L
580        pmaddwd   mm7,[GOTOFF(ebx,PW_MF050_MF256)]      ; mm7=tmp5H
581        pmaddwd   mm3,[GOTOFF(ebx,PW_MF256_F050)]       ; mm3=tmp6L
582        pmaddwd   mm5,[GOTOFF(ebx,PW_MF256_F050)]       ; mm5=tmp6H
583
584        paddd   mm1,mm0                 ; mm1=data5L
585        paddd   mm7,mm6                 ; mm7=data5H
586        paddd   mm3, MMWORD [wk(0)]     ; mm3=data3L
587        paddd   mm5, MMWORD [wk(1)]     ; mm5=data3H
588
589        paddd   mm1,[GOTOFF(ebx,PD_DESCALE_P2)]
590        paddd   mm7,[GOTOFF(ebx,PD_DESCALE_P2)]
591        psrad   mm1,DESCALE_P2
592        psrad   mm7,DESCALE_P2
593        paddd   mm3,[GOTOFF(ebx,PD_DESCALE_P2)]
594        paddd   mm5,[GOTOFF(ebx,PD_DESCALE_P2)]
595        psrad   mm3,DESCALE_P2
596        psrad   mm5,DESCALE_P2
597
598        packssdw  mm1,mm7               ; mm1=data5
599        packssdw  mm3,mm5               ; mm3=data3
600
601        movq    MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm1
602        movq    MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm3
603
        ; Advance by one mmword (four DCTELEMs) within every row.
604        add     edx, byte 4*SIZEOF_DCTELEM
605        dec     ecx
606        jnz     near .columnloop
607
608        emms            ; empty MMX state
609
        ; Epilogue: undo the prologue in reverse.  [ebp] holds the stack
        ; pointer recorded at entry (pointing at the saved ebp), so "pop esp"
        ; discards the aligned frame and the wk[] slots in one step.
610;       pop     edi             ; unused
611;       pop     esi             ; unused
612;       pop     edx             ; need not be preserved
613;       pop     ecx             ; need not be preserved
614        poppic  ebx
615        mov     esp,ebp         ; esp <- aligned ebp
616        pop     esp             ; esp <- original ebp
617        pop     ebp
618        ret
619
620; For some reason, the OS X linker does not honor the request to align the
621; segment unless we do this.
622        align   16
623