1;
2; jfdctfst.asm - fast integer FDCT (MMX)
3;
4; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5; Copyright (C) 2016, D. R. Commander.
6;
7; Based on the x86 SIMD extension for IJG JPEG library
8; Copyright (C) 1999-2006, MIYASAKA Masaru.
9; For conditions of distribution and use, see copyright notice in jsimdext.inc
10;
11; This file should be assembled with NASM (Netwide Assembler),
12; can *not* be assembled with Microsoft's MASM or any compatible
13; assembler (including Borland's Turbo Assembler).
14; NASM is available from http://nasm.sourceforge.net/ or
15; http://sourceforge.net/project/showfiles.php?group_id=6208
16;
17; This file contains a fast, not so accurate integer implementation of
18; the forward DCT (Discrete Cosine Transform). The following code is
19; based directly on the IJG's original jfdctfst.c; see the jfdctfst.c
20; for more details.
21;
22; [TAB8]
23
24%include "jsimdext.inc"
25%include "jdct.inc"
26
27; --------------------------------------------------------------------------
28
; Fixed-point multiplier constants for the fast DCT.
; CONST_BITS is the number of fractional bits: F_x_xxx = round(x * 2^CONST_BITS).
; For the default of 8 bits the values are written out literally; for any
; other setting they are derived from 30-bit fixed-point literals.
29%define CONST_BITS  8  ; 14 is also OK.
30
31%if CONST_BITS == 8
32F_0_382 equ  98  ; FIX(0.382683433)
33F_0_541 equ 139  ; FIX(0.541196100)
34F_0_707 equ 181  ; FIX(0.707106781)
35F_1_306 equ 334  ; FIX(1.306562965)
36%else
37; NASM cannot do compile-time arithmetic on floating-point constants.
; Instead, each constant is given as an integer pre-scaled by 2^30 and
; reduced to CONST_BITS fractional bits by DESCALE, which rounds to nearest
; (adds half of the discarded range before shifting right by n).
38%define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
39F_0_382 equ DESCALE( 410903207, 30 - CONST_BITS)  ; FIX(0.382683433)
40F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
41F_0_707 equ DESCALE( 759250124, 30 - CONST_BITS)  ; FIX(0.707106781)
42F_1_306 equ DESCALE(1402911301, 30 - CONST_BITS)  ; FIX(1.306562965)
43%endif
44
45; --------------------------------------------------------------------------
; Read-only multiplier table used by jsimd_fdct_ifast_mmx.
; SEG_CONST, alignz, GLOBAL_DATA and EXTN are not defined in this file
; (presumably they come from jsimdext.inc).
46    SECTION     SEG_CONST
47
48; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
49; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)
;
; pmulhw keeps only the high 16 bits of each signed 16x16-bit product, i.e.
; it shifts the product right by 16.  The code shifts the data operand left
; by PRE_MULTIPLY_SCALE_BITS and the constants below are stored shifted left
; by CONST_SHIFT, so the net effect is (value * F_x_xxx) >> CONST_BITS.
50
51%define PRE_MULTIPLY_SCALE_BITS  2
52%define CONST_SHIFT              (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)
53
54    alignz      32
55    GLOBAL_DATA(jconst_fdct_ifast_mmx)
56
57EXTN(jconst_fdct_ifast_mmx):
58
; Each entry replicates one constant into four 16-bit words (one 64-bit MMX
; operand), so a single pmulhw scales four lanes at once.
59PW_F0707 times 4 dw F_0_707 << CONST_SHIFT
60PW_F0382 times 4 dw F_0_382 << CONST_SHIFT
61PW_F0541 times 4 dw F_0_541 << CONST_SHIFT
62PW_F1306 times 4 dw F_1_306 << CONST_SHIFT
63
64    alignz      32
65
66; --------------------------------------------------------------------------
67    SECTION     SEG_TEXT
68    BITS        32
69;
70; Perform the forward DCT on one block of samples.
71;
72; GLOBAL(void)
73; jsimd_fdct_ifast_mmx(DCTELEM *data)
74;
; The block is transformed in place: both passes load from and store back to
; the memory addressed through 'data'.  The single argument is read from the
; stack (32-bit code, see data(b) below).
;
75
; data(b): address of the 'data' argument relative to b, where b is the value
; of esp captured right after the prologue's 'push ebp'.  The +8 skips the
; saved ebp (4 bytes) and the return address (4 bytes).
76%define data(b)       (b) + 8           ; DCTELEM *data
77
; original_ebp: slot at the aligned frame base where the prologue stores the
; pre-alignment stack pointer.
; wk(i): i-th mmword scratch slot, located just below the aligned frame base.
; WK_NUM is defined after wk(i); this works because NASM expands %define
; macros at the point of use, not at the point of definition.
78%define original_ebp  ebp + 0
79%define wk(i)         ebp - (WK_NUM - (i)) * SIZEOF_MMWORD  ; mmword wk[WK_NUM]
80%define WK_NUM        2
81
; --------------------------------------------------------------------------
; jsimd_fdct_ifast_mmx: in-place fast integer forward DCT using MMX.
;
; Register roles inside the function:
;   eax      = esp captured right after 'push ebp'; used as the base for
;              fetching the 'data' argument in both passes, so it must stay
;              intact until pass 2 has reloaded edx
;   ebx      = GOT base for GOTOFF() constant addressing (saved/restored via
;              pushpic/poppic; these macros are defined outside this file)
;   ecx      = loop counter (DCTSIZE/4 iterations per pass)
;   edx      = pointer to the group of data currently being processed
;   ebp      = aligned frame base; wk(0)/wk(1) scratch slots live below it
;   mm0-mm7  = all used as working registers
; eax, ecx and edx are modified and not restored.
; --------------------------------------------------------------------------
82    align       32
83    GLOBAL_FUNCTION(jsimd_fdct_ifast_mmx)
84
85EXTN(jsimd_fdct_ifast_mmx):
    ; Prologue: build a frame aligned to SIZEOF_MMWORD, remember the
    ; pre-alignment stack pointer at [ebp], and reserve WK_NUM mmword slots.
86    push        ebp
87    mov         eax, esp                    ; eax = esp after push (address of saved ebp)
88    sub         esp, byte 4
89    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
90    mov         [esp], eax
91    mov         ebp, esp                    ; ebp = aligned ebp
92    lea         esp, [wk(0)]
93    pushpic     ebx
94;   push        ecx                     ; need not be preserved
95;   push        edx                     ; need not be preserved
96;   push        esi                     ; unused
97;   push        edi                     ; unused
98
99    get_GOT     ebx                     ; get GOT address
100
101    ; ---- Pass 1: process rows.
    ;
    ; Each iteration handles four rows (edx advances by 4 rows at the end).
    ; The 4x8 strip is transposed in 4x4 tiles so that, per the register
    ; comments below, each MMX register holds the same column position from
    ; four different rows; the 1-D DCT is then evaluated on four rows at once.
    ; Results are stored without transposing back (e.g. data4..data7 go to the
    ; second mmword of rows 0..3), so pass 2 starts from tile-transposed data.
102
103    mov         edx, POINTER [data(eax)]  ; (DCTELEM *)
104    mov         ecx, DCTSIZE/4
105    alignx      16, 7
106.rowloop:
107
108    movq        mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
109    movq        mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
110    movq        mm2, MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)]
111    movq        mm3, MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)]
112
113    ; mm0=(20 21 22 23), mm2=(24 25 26 27)
114    ; mm1=(30 31 32 33), mm3=(34 35 36 37)
115
116    movq        mm4, mm0                ; transpose coefficients(phase 1)
117    punpcklwd   mm0, mm1                ; mm0=(20 30 21 31)
118    punpckhwd   mm4, mm1                ; mm4=(22 32 23 33)
119    movq        mm5, mm2                ; transpose coefficients(phase 1)
120    punpcklwd   mm2, mm3                ; mm2=(24 34 25 35)
121    punpckhwd   mm5, mm3                ; mm5=(26 36 27 37)
122
123    movq        mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
124    movq        mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
125    movq        mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)]
126    movq        mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)]
127
128    ; mm6=(00 01 02 03), mm1=(04 05 06 07)
129    ; mm7=(10 11 12 13), mm3=(14 15 16 17)
130
    ; All eight MMX registers are live here; spill two partial results.
131    movq        MMWORD [wk(0)], mm4     ; wk(0)=(22 32 23 33)
132    movq        MMWORD [wk(1)], mm2     ; wk(1)=(24 34 25 35)
133
134    movq        mm4, mm6                ; transpose coefficients(phase 1)
135    punpcklwd   mm6, mm7                ; mm6=(00 10 01 11)
136    punpckhwd   mm4, mm7                ; mm4=(02 12 03 13)
137    movq        mm2, mm1                ; transpose coefficients(phase 1)
138    punpcklwd   mm1, mm3                ; mm1=(04 14 05 15)
139    punpckhwd   mm2, mm3                ; mm2=(06 16 07 17)
140
141    movq        mm7, mm6                ; transpose coefficients(phase 2)
142    punpckldq   mm6, mm0                ; mm6=(00 10 20 30)=data0
143    punpckhdq   mm7, mm0                ; mm7=(01 11 21 31)=data1
144    movq        mm3, mm2                ; transpose coefficients(phase 2)
145    punpckldq   mm2, mm5                ; mm2=(06 16 26 36)=data6
146    punpckhdq   mm3, mm5                ; mm3=(07 17 27 37)=data7
147
148    movq        mm0, mm7
149    movq        mm5, mm6
150    psubw       mm7, mm2                ; mm7=data1-data6=tmp6
151    psubw       mm6, mm3                ; mm6=data0-data7=tmp7
152    paddw       mm0, mm2                ; mm0=data1+data6=tmp1
153    paddw       mm5, mm3                ; mm5=data0+data7=tmp0
154
    ; Reload the spilled transpose halves and reuse the slots for tmp6/tmp7,
    ; which are not needed until the odd part.
155    movq        mm2, MMWORD [wk(0)]     ; mm2=(22 32 23 33)
156    movq        mm3, MMWORD [wk(1)]     ; mm3=(24 34 25 35)
157    movq        MMWORD [wk(0)], mm7     ; wk(0)=tmp6
158    movq        MMWORD [wk(1)], mm6     ; wk(1)=tmp7
159
160    movq        mm7, mm4                ; transpose coefficients(phase 2)
161    punpckldq   mm4, mm2                ; mm4=(02 12 22 32)=data2
162    punpckhdq   mm7, mm2                ; mm7=(03 13 23 33)=data3
163    movq        mm6, mm1                ; transpose coefficients(phase 2)
164    punpckldq   mm1, mm3                ; mm1=(04 14 24 34)=data4
165    punpckhdq   mm6, mm3                ; mm6=(05 15 25 35)=data5
166
167    movq        mm2, mm7
168    movq        mm3, mm4
169    paddw       mm7, mm1                ; mm7=data3+data4=tmp3
170    paddw       mm4, mm6                ; mm4=data2+data5=tmp2
171    psubw       mm2, mm1                ; mm2=data3-data4=tmp4
172    psubw       mm3, mm6                ; mm3=data2-data5=tmp5
173
174    ; -- Even part
175
176    movq        mm1, mm5
177    movq        mm6, mm0
178    psubw       mm5, mm7                ; mm5=tmp13
179    psubw       mm0, mm4                ; mm0=tmp12
180    paddw       mm1, mm7                ; mm1=tmp10
181    paddw       mm6, mm4                ; mm6=tmp11
182
    ; z1 = (tmp12 + tmp13) * 0.707...: pre-shift the data so that pmulhw's
    ; implicit >>16 leaves a result scaled back by exactly CONST_BITS.
183    paddw       mm0, mm5
184    psllw       mm0, PRE_MULTIPLY_SCALE_BITS
185    pmulhw      mm0, [GOTOFF(ebx,PW_F0707)]  ; mm0=z1
186
187    movq        mm7, mm1
188    movq        mm4, mm5
189    psubw       mm1, mm6                ; mm1=data4
190    psubw       mm5, mm0                ; mm5=data6
191    paddw       mm7, mm6                ; mm7=data0
192    paddw       mm4, mm0                ; mm4=data2
193
194    movq        MMWORD [MMBLOCK(0,1,edx,SIZEOF_DCTELEM)], mm1
195    movq        MMWORD [MMBLOCK(2,1,edx,SIZEOF_DCTELEM)], mm5
196    movq        MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7
197    movq        MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
198
199    ; -- Odd part
200
201    movq        mm6, MMWORD [wk(0)]     ; mm6=tmp6
202    movq        mm0, MMWORD [wk(1)]     ; mm0=tmp7
203
204    paddw       mm2, mm3                ; mm2=tmp10
205    paddw       mm3, mm6                ; mm3=tmp11
206    paddw       mm6, mm0                ; mm6=tmp12, mm0=tmp7
207
    ; Pre-shift every value that is about to be fed to pmulhw.
208    psllw       mm2, PRE_MULTIPLY_SCALE_BITS
209    psllw       mm6, PRE_MULTIPLY_SCALE_BITS
210
211    psllw       mm3, PRE_MULTIPLY_SCALE_BITS
212    pmulhw      mm3, [GOTOFF(ebx,PW_F0707)]  ; mm3=z3
213
214    movq        mm1, mm2                     ; mm1=tmp10
215    psubw       mm2, mm6
216    pmulhw      mm2, [GOTOFF(ebx,PW_F0382)]  ; mm2=z5
217    pmulhw      mm1, [GOTOFF(ebx,PW_F0541)]  ; mm1=MULTIPLY(tmp10,FIX_0_54119610)
218    pmulhw      mm6, [GOTOFF(ebx,PW_F1306)]  ; mm6=MULTIPLY(tmp12,FIX_1_30656296)
219    paddw       mm1, mm2                     ; mm1=z2
220    paddw       mm6, mm2                     ; mm6=z4
221
222    movq        mm5, mm0
223    psubw       mm0, mm3                ; mm0=z13
224    paddw       mm5, mm3                ; mm5=z11
225
226    movq        mm7, mm0
227    movq        mm4, mm5
228    psubw       mm0, mm1                ; mm0=data3
229    psubw       mm5, mm6                ; mm5=data7
230    paddw       mm7, mm1                ; mm7=data5
231    paddw       mm4, mm6                ; mm4=data1
232
233    movq        MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0
234    movq        MMWORD [MMBLOCK(3,1,edx,SIZEOF_DCTELEM)], mm5
235    movq        MMWORD [MMBLOCK(1,1,edx,SIZEOF_DCTELEM)], mm7
236    movq        MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4
237
    ; Advance to the next group of four rows.
238    add         edx, byte 4*DCTSIZE*SIZEOF_DCTELEM
239    dec         ecx
240    jnz         near .rowloop
241
242    ; ---- Pass 2: process columns.
    ;
    ; Each iteration handles four columns across rows 0-7 (edx advances by
    ; four elements at the end).  The tiles are transposed again so that each
    ; register holds four adjacent elements of one row, the same butterfly
    ; sequence as in pass 1 is applied, and outputs are stored to rows 0-7.
243
244    mov         edx, POINTER [data(eax)]  ; (DCTELEM *)
245    mov         ecx, DCTSIZE/4
246    alignx      16, 7
247.columnloop:
248
249    movq        mm0, MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)]
250    movq        mm1, MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)]
251    movq        mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)]
252    movq        mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)]
253
254    ; mm0=(02 12 22 32), mm2=(42 52 62 72)
255    ; mm1=(03 13 23 33), mm3=(43 53 63 73)
256
257    movq        mm4, mm0                ; transpose coefficients(phase 1)
258    punpcklwd   mm0, mm1                ; mm0=(02 03 12 13)
259    punpckhwd   mm4, mm1                ; mm4=(22 23 32 33)
260    movq        mm5, mm2                ; transpose coefficients(phase 1)
261    punpcklwd   mm2, mm3                ; mm2=(42 43 52 53)
262    punpckhwd   mm5, mm3                ; mm5=(62 63 72 73)
263
264    movq        mm6, MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)]
265    movq        mm7, MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)]
266    movq        mm1, MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)]
267    movq        mm3, MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)]
268
269    ; mm6=(00 10 20 30), mm1=(40 50 60 70)
270    ; mm7=(01 11 21 31), mm3=(41 51 61 71)
271
    ; All eight MMX registers are live here; spill two partial results.
272    movq        MMWORD [wk(0)], mm4     ; wk(0)=(22 23 32 33)
273    movq        MMWORD [wk(1)], mm2     ; wk(1)=(42 43 52 53)
274
275    movq        mm4, mm6                ; transpose coefficients(phase 1)
276    punpcklwd   mm6, mm7                ; mm6=(00 01 10 11)
277    punpckhwd   mm4, mm7                ; mm4=(20 21 30 31)
278    movq        mm2, mm1                ; transpose coefficients(phase 1)
279    punpcklwd   mm1, mm3                ; mm1=(40 41 50 51)
280    punpckhwd   mm2, mm3                ; mm2=(60 61 70 71)
281
282    movq        mm7, mm6                ; transpose coefficients(phase 2)
283    punpckldq   mm6, mm0                ; mm6=(00 01 02 03)=data0
284    punpckhdq   mm7, mm0                ; mm7=(10 11 12 13)=data1
285    movq        mm3, mm2                ; transpose coefficients(phase 2)
286    punpckldq   mm2, mm5                ; mm2=(60 61 62 63)=data6
287    punpckhdq   mm3, mm5                ; mm3=(70 71 72 73)=data7
288
289    movq        mm0, mm7
290    movq        mm5, mm6
291    psubw       mm7, mm2                ; mm7=data1-data6=tmp6
292    psubw       mm6, mm3                ; mm6=data0-data7=tmp7
293    paddw       mm0, mm2                ; mm0=data1+data6=tmp1
294    paddw       mm5, mm3                ; mm5=data0+data7=tmp0
295
    ; Reload the spilled transpose halves and reuse the slots for tmp6/tmp7,
    ; which are not needed until the odd part.
296    movq        mm2, MMWORD [wk(0)]     ; mm2=(22 23 32 33)
297    movq        mm3, MMWORD [wk(1)]     ; mm3=(42 43 52 53)
298    movq        MMWORD [wk(0)], mm7     ; wk(0)=tmp6
299    movq        MMWORD [wk(1)], mm6     ; wk(1)=tmp7
300
301    movq        mm7, mm4                ; transpose coefficients(phase 2)
302    punpckldq   mm4, mm2                ; mm4=(20 21 22 23)=data2
303    punpckhdq   mm7, mm2                ; mm7=(30 31 32 33)=data3
304    movq        mm6, mm1                ; transpose coefficients(phase 2)
305    punpckldq   mm1, mm3                ; mm1=(40 41 42 43)=data4
306    punpckhdq   mm6, mm3                ; mm6=(50 51 52 53)=data5
307
308    movq        mm2, mm7
309    movq        mm3, mm4
310    paddw       mm7, mm1                ; mm7=data3+data4=tmp3
311    paddw       mm4, mm6                ; mm4=data2+data5=tmp2
312    psubw       mm2, mm1                ; mm2=data3-data4=tmp4
313    psubw       mm3, mm6                ; mm3=data2-data5=tmp5
314
315    ; -- Even part
316
317    movq        mm1, mm5
318    movq        mm6, mm0
319    psubw       mm5, mm7                ; mm5=tmp13
320    psubw       mm0, mm4                ; mm0=tmp12
321    paddw       mm1, mm7                ; mm1=tmp10
322    paddw       mm6, mm4                ; mm6=tmp11
323
    ; z1 = (tmp12 + tmp13) * 0.707...: pre-shift the data so that pmulhw's
    ; implicit >>16 leaves a result scaled back by exactly CONST_BITS.
324    paddw       mm0, mm5
325    psllw       mm0, PRE_MULTIPLY_SCALE_BITS
326    pmulhw      mm0, [GOTOFF(ebx,PW_F0707)]  ; mm0=z1
327
328    movq        mm7, mm1
329    movq        mm4, mm5
330    psubw       mm1, mm6                ; mm1=data4
331    psubw       mm5, mm0                ; mm5=data6
332    paddw       mm7, mm6                ; mm7=data0
333    paddw       mm4, mm0                ; mm4=data2
334
335    movq        MMWORD [MMBLOCK(4,0,edx,SIZEOF_DCTELEM)], mm1
336    movq        MMWORD [MMBLOCK(6,0,edx,SIZEOF_DCTELEM)], mm5
337    movq        MMWORD [MMBLOCK(0,0,edx,SIZEOF_DCTELEM)], mm7
338    movq        MMWORD [MMBLOCK(2,0,edx,SIZEOF_DCTELEM)], mm4
339
340    ; -- Odd part
341
342    movq        mm6, MMWORD [wk(0)]     ; mm6=tmp6
343    movq        mm0, MMWORD [wk(1)]     ; mm0=tmp7
344
345    paddw       mm2, mm3                ; mm2=tmp10
346    paddw       mm3, mm6                ; mm3=tmp11
347    paddw       mm6, mm0                ; mm6=tmp12, mm0=tmp7
348
    ; Pre-shift every value that is about to be fed to pmulhw.
349    psllw       mm2, PRE_MULTIPLY_SCALE_BITS
350    psllw       mm6, PRE_MULTIPLY_SCALE_BITS
351
352    psllw       mm3, PRE_MULTIPLY_SCALE_BITS
353    pmulhw      mm3, [GOTOFF(ebx,PW_F0707)]  ; mm3=z3
354
355    movq        mm1, mm2                     ; mm1=tmp10
356    psubw       mm2, mm6
357    pmulhw      mm2, [GOTOFF(ebx,PW_F0382)]  ; mm2=z5
358    pmulhw      mm1, [GOTOFF(ebx,PW_F0541)]  ; mm1=MULTIPLY(tmp10,FIX_0_54119610)
359    pmulhw      mm6, [GOTOFF(ebx,PW_F1306)]  ; mm6=MULTIPLY(tmp12,FIX_1_30656296)
360    paddw       mm1, mm2                     ; mm1=z2
361    paddw       mm6, mm2                     ; mm6=z4
362
363    movq        mm5, mm0
364    psubw       mm0, mm3                ; mm0=z13
365    paddw       mm5, mm3                ; mm5=z11
366
367    movq        mm7, mm0
368    movq        mm4, mm5
369    psubw       mm0, mm1                ; mm0=data3
370    psubw       mm5, mm6                ; mm5=data7
371    paddw       mm7, mm1                ; mm7=data5
372    paddw       mm4, mm6                ; mm4=data1
373
374    movq        MMWORD [MMBLOCK(3,0,edx,SIZEOF_DCTELEM)], mm0
375    movq        MMWORD [MMBLOCK(7,0,edx,SIZEOF_DCTELEM)], mm5
376    movq        MMWORD [MMBLOCK(5,0,edx,SIZEOF_DCTELEM)], mm7
377    movq        MMWORD [MMBLOCK(1,0,edx,SIZEOF_DCTELEM)], mm4
378
    ; Advance to the next group of four columns.
379    add         edx, byte 4*SIZEOF_DCTELEM
380    dec         ecx
381    jnz         near .columnloop
382
383    emms                                ; empty MMX state
384
    ; Epilogue: undo the aligned frame.  [ebp] holds the stack pointer that
    ; was saved by the prologue, so 'pop esp' restores it directly.
385;   pop         edi                     ; unused
386;   pop         esi                     ; unused
387;   pop         edx                     ; need not be preserved
388;   pop         ecx                     ; need not be preserved
389    poppic      ebx
390    mov         esp, ebp                ; esp <- aligned ebp
391    pop         esp                     ; esp <- pre-alignment esp saved at [ebp]
392    pop         ebp
393    ret
394
395; For some reason, the OS X linker does not honor the request to align the
396; segment unless we do this.
397    align       32
398