;
; jidctflt.asm - floating-point IDCT (3DNow! & MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains a floating-point implementation of the inverse DCT
; (Discrete Cosine Transform). The following code is based directly on
; the IJG's original jidctflt.c; see jidctflt.c for more details.
;
; [TAB8]

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
; Read-only constants for jsimd_idct_float_3dnow.
;
; Every entry is exactly 8 bytes (two packed single-precision floats, or
; eight bytes), so each one can be used directly as a 64-bit MMX/3DNow!
; memory operand.  The code addresses them through GOTOFF(ebx, <label>).
    SECTION     SEG_CONST

    alignz      32
    GLOBAL_DATA(jconst_idct_float_3dnow)

EXTN(jconst_idct_float_3dnow):

; Rotation constants of the float IDCT, with c_k = cos(k*pi/16):
PD_1_414        times 2 dd 1.414213562373095048801689  ; 2*c4 = sqrt(2)
PD_1_847        times 2 dd 1.847759065022573512256366  ; 2*c2
PD_1_082        times 2 dd 1.082392200292393968799446  ; 2*(c2-c6)
PD_2_613        times 2 dd 2.613125929752753055713286  ; 2*(c2+c6)
; 1.5 * 2^26.  Adding it to a float moves the value into a range where one
; ULP equals 8, so the low 16 bits of the sum's bit pattern hold
; roundint(value/8) as a two's-complement word.
PD_RNDINT_MAGIC times 2 dd 100663296.0  ; (float)(0x00C00000 << 3)
; Added byte-wise to the saturated signed result to re-center the samples.
PB_CENTERJSAMP  times 8 db CENTERJSAMPLE

    alignz      32

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32
;
; Perform dequantization and inverse DCT on one block of coefficients.
;
; GLOBAL(void)
; jsimd_idct_float_3dnow(void *dct_table, JCOEFPTR coef_block,
;                        JSAMPARRAY output_buf, JDIMENSION output_col)
;

; Stack-argument offsets.  (b) is the frame base captured right after
; "push ebp" in the prologue: [b+0] = saved ebp, [b+4] = return address,
; so the first argument is at b+8.
; NOTE: these macros (and wk/workspace below) expand to unparenthesized
; address expressions; they are meant to be used only inside [...].
%define dct_table(b)   (b) + 8          ; void *dct_table
%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
%define output_col(b)  (b) + 20         ; JDIMENSION output_col

; Locals, addressed relative to the realigned ebp set up by the prologue:
;   [ebp + 0]        frame base saved by the prologue (original_ebp)
;   wk(0)..wk(WK_NUM-1)  MMWORD scratch slots ending right below ebp
;   workspace        DCTSIZE2 FAST_FLOATs directly below wk(0)
%define original_ebp   ebp + 0
%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
                                        ; mmword wk[WK_NUM]
%define WK_NUM         2
%define workspace      wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
                                        ; FAST_FLOAT workspace[DCTSIZE2]

; --------------------------------------------------------------------------
; jsimd_idct_float_3dnow -- dequantize one coefficient block and run the
; floating-point inverse DCT on it with 3DNow! packed-float instructions.
;
; In:    stack arguments dct_table, coef_block, output_buf, output_col,
;        read through the dct_table(b)..output_col(b) offset macros.
; Out:   no return value.  Pass 2 stores one MMWORD (8 samples) at
;        output_buf[row] + output_col for every row it produces.
; Clobb: eax, ecx, edx, mm0-mm7, flags.  ebx, esi, edi and ebp are saved
;        and restored; femms empties the MMX/3DNow! state before returning.
; Stack: ebp is realigned to SIZEOF_MMWORD; wk[] and workspace[] live
;        below it (see the wk()/workspace macros).
;
; Pass 1 handles two coefficient columns per iteration and writes the
; results transposed into workspace[]; pass 2 handles two workspace rows
; per iteration, rounds, re-centers and stores the samples.
; --------------------------------------------------------------------------
    align       32
    GLOBAL_FUNCTION(jsimd_idct_float_3dnow)

EXTN(jsimd_idct_float_3dnow):
    push        ebp
    mov         eax, esp                    ; eax = frame base (address of the
                                            ; saved ebp), a.k.a. "original ebp"
    sub         esp, byte 4                 ; room to store the frame base
    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
    mov         [esp], eax                  ; [original_ebp] = frame base
    mov         ebp, esp                    ; ebp = aligned ebp
    lea         esp, [workspace]            ; reserve wk[] and workspace[]
    push        ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    get_GOT     ebx                     ; get GOT address

    ; ---- Pass 1: process columns from input, store into work array.

    ; eax still holds the frame base from the prologue, so the reload below
    ; stays commented out (this relies on get_GOT leaving eax untouched).
;   mov         eax, [original_ebp]
    mov         edx, POINTER [dct_table(eax)]    ; quantptr
    mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
    lea         edi, [workspace]                 ; FAST_FLOAT *wsptr
    mov         ecx, DCTSIZE/2                   ; ctr
    alignx      16, 7
.columnloop:
%ifndef NO_ZERO_COLUMN_TEST_FLOAT_3DNOW
    ; Shortcut: OR together rows 1..7 of the two columns handled in this
    ; iteration; if every AC coefficient is zero only the DC term matters.
    mov         eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
    or          eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
    jnz         short .columnDCT

    pushpic     ebx                     ; save GOT address
    mov         ebx, DWORD [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
    mov         eax, DWORD [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
    or          ebx, DWORD [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
    or          eax, DWORD [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]
    or          ebx, DWORD [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]
    or          eax, ebx
    poppic      ebx                     ; restore GOT address
                                        ; (must leave ZF from the OR intact)
    jnz         short .columnDCT

    ; -- AC terms all zero

    movd        mm0, DWORD [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]

    ; Duplicate each word, then arithmetic-shift right to sign-extend the
    ; two 16-bit DC coefficients to dwords before converting to float.
    punpcklwd   mm0, mm0
    psrad       mm0, (DWORD_BIT-WORD_BIT)
    pi2fd       mm0, mm0

    pfmul       mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

    movq        mm1, mm0
    punpckldq   mm0, mm0                ; mm0 = DC of first column, twice
    punpckhdq   mm1, mm1                ; mm1 = DC of second column, twice

    ; Broadcast each dequantized DC value over its whole workspace row.
    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm0
    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm0
    movq        MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm0
    movq        MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1
    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm1
    movq        MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm1
    movq        MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm1
    jmp         near .nextcolumn
    alignx      16, 7
%endif
.columnDCT:

    ; -- Even part

    ; Load rows 0/2/4/6 (two columns each), sign-extend, convert to float
    ; and dequantize with the matching rows of the multiplier table.
    movd        mm0, DWORD [DWBLOCK(0,0,esi,SIZEOF_JCOEF)]
    movd        mm1, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
    movd        mm2, DWORD [DWBLOCK(4,0,esi,SIZEOF_JCOEF)]
    movd        mm3, DWORD [DWBLOCK(6,0,esi,SIZEOF_JCOEF)]

    punpcklwd   mm0, mm0
    punpcklwd   mm1, mm1
    psrad       mm0, (DWORD_BIT-WORD_BIT)
    psrad       mm1, (DWORD_BIT-WORD_BIT)
    pi2fd       mm0, mm0
    pi2fd       mm1, mm1

    pfmul       mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    pfmul       mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

    punpcklwd   mm2, mm2
    punpcklwd   mm3, mm3
    psrad       mm2, (DWORD_BIT-WORD_BIT)
    psrad       mm3, (DWORD_BIT-WORD_BIT)
    pi2fd       mm2, mm2
    pi2fd       mm3, mm3

    pfmul       mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    pfmul       mm3, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

    movq        mm4, mm0
    movq        mm5, mm1
    pfsub       mm0, mm2                ; mm0=tmp11
    pfsub       mm1, mm3
    pfadd       mm4, mm2                ; mm4=tmp10
    pfadd       mm5, mm3                ; mm5=tmp13

    pfmul       mm1, [GOTOFF(ebx,PD_1_414)]
    pfsub       mm1, mm5                ; mm1=tmp12

    movq        mm6, mm4
    movq        mm7, mm0
    pfsub       mm4, mm5                ; mm4=tmp3
    pfsub       mm0, mm1                ; mm0=tmp2
    pfadd       mm6, mm5                ; mm6=tmp0
    pfadd       mm7, mm1                ; mm7=tmp1

    ; Spill tmp2/tmp3: all eight MMX registers are needed for the odd part.
    movq        MMWORD [wk(1)], mm4     ; tmp3
    movq        MMWORD [wk(0)], mm0     ; tmp2

    ; -- Odd part

    movd        mm2, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
    movd        mm3, DWORD [DWBLOCK(3,0,esi,SIZEOF_JCOEF)]
    movd        mm5, DWORD [DWBLOCK(5,0,esi,SIZEOF_JCOEF)]
    movd        mm1, DWORD [DWBLOCK(7,0,esi,SIZEOF_JCOEF)]

    punpcklwd   mm2, mm2
    punpcklwd   mm3, mm3
    psrad       mm2, (DWORD_BIT-WORD_BIT)
    psrad       mm3, (DWORD_BIT-WORD_BIT)
    pi2fd       mm2, mm2
    pi2fd       mm3, mm3

    pfmul       mm2, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    pfmul       mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

    punpcklwd   mm5, mm5
    punpcklwd   mm1, mm1
    psrad       mm5, (DWORD_BIT-WORD_BIT)
    psrad       mm1, (DWORD_BIT-WORD_BIT)
    pi2fd       mm5, mm5
    pi2fd       mm1, mm1

    pfmul       mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
    pfmul       mm1, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]

    movq        mm4, mm2
    movq        mm0, mm5
    pfadd       mm2, mm1                ; mm2=z11
    pfadd       mm5, mm3                ; mm5=z13
    pfsub       mm4, mm1                ; mm4=z12
    pfsub       mm0, mm3                ; mm0=z10

    movq        mm1, mm2
    pfsub       mm2, mm5
    pfadd       mm1, mm5                ; mm1=tmp7

    pfmul       mm2, [GOTOFF(ebx,PD_1_414)]  ; mm2=tmp11

    movq        mm3, mm0
    pfadd       mm0, mm4
    pfmul       mm0, [GOTOFF(ebx,PD_1_847)]  ; mm0=z5
    pfmul       mm3, [GOTOFF(ebx,PD_2_613)]  ; mm3=(z10 * 2.613125930)
    pfmul       mm4, [GOTOFF(ebx,PD_1_082)]  ; mm4=(z12 * 1.082392200)
    pfsubr      mm3, mm0                     ; mm3=tmp12 (reverse: z5 - mm3)
    pfsub       mm4, mm0                     ; mm4=tmp10

    ; -- Final output stage

    pfsub       mm3, mm1                ; mm3=tmp6
    movq        mm5, mm6
    movq        mm0, mm7
    pfadd       mm6, mm1                ; mm6=data0=(00 01)
    pfadd       mm7, mm3                ; mm7=data1=(10 11)
    pfsub       mm5, mm1                ; mm5=data7=(70 71)
    pfsub       mm0, mm3                ; mm0=data6=(60 61)
    pfsub       mm2, mm3                ; mm2=tmp5

    movq        mm1, mm6                ; transpose coefficients
    punpckldq   mm6, mm7                ; mm6=(00 10)
    punpckhdq   mm1, mm7                ; mm1=(01 11)
    movq        mm3, mm0                ; transpose coefficients
    punpckldq   mm0, mm5                ; mm0=(60 70)
    punpckhdq   mm3, mm5                ; mm3=(61 71)

    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], mm6
    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], mm1
    movq        MMWORD [MMBLOCK(0,3,edi,SIZEOF_FAST_FLOAT)], mm0
    movq        MMWORD [MMBLOCK(1,3,edi,SIZEOF_FAST_FLOAT)], mm3

    movq        mm7, MMWORD [wk(0)]     ; mm7=tmp2
    movq        mm5, MMWORD [wk(1)]     ; mm5=tmp3

    pfadd       mm4, mm2                ; mm4=tmp4
    movq        mm6, mm7
    movq        mm1, mm5
    pfadd       mm7, mm2                ; mm7=data2=(20 21)
    pfadd       mm5, mm4                ; mm5=data4=(40 41)
    pfsub       mm6, mm2                ; mm6=data5=(50 51)
    pfsub       mm1, mm4                ; mm1=data3=(30 31)

    movq        mm0, mm7                ; transpose coefficients
    punpckldq   mm7, mm1                ; mm7=(20 30)
    punpckhdq   mm0, mm1                ; mm0=(21 31)
    movq        mm3, mm5                ; transpose coefficients
    punpckldq   mm5, mm6                ; mm5=(40 50)
    punpckhdq   mm3, mm6                ; mm3=(41 51)

    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], mm7
    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], mm0
    movq        MMWORD [MMBLOCK(0,2,edi,SIZEOF_FAST_FLOAT)], mm5
    movq        MMWORD [MMBLOCK(1,2,edi,SIZEOF_FAST_FLOAT)], mm3

.nextcolumn:
    ; Advance by two input columns / two workspace rows per iteration.
    add         esi, byte 2*SIZEOF_JCOEF               ; coef_block
    add         edx, byte 2*SIZEOF_FLOAT_MULT_TYPE     ; quantptr
    add         edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT  ; wsptr
    dec         ecx                                    ; ctr
    jnz         near .columnloop

    ; -- Prefetch the next coefficient block

    ; esi has advanced by DCTSIZE coefficients during pass 1, so adding
    ; (DCTSIZE2-8) coefficients addresses the memory right after this block
    ; (for DCTSIZE == 8).
    prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
    prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
    prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
    prefetch [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]

    ; ---- Pass 2: process rows from work array, store into output array.

    ; eax was used as scratch in pass 1, so reload the frame base first.
    mov         eax, [original_ebp]
    lea         esi, [workspace]                   ; FAST_FLOAT *wsptr
    mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
    mov         eax, JDIMENSION [output_col(eax)]  ; eax = output_col from here on
    mov         ecx, DCTSIZE/2                     ; ctr
    alignx      16, 7
.rowloop:

    ; -- Even part

    movq        mm0, MMWORD [MMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
    movq        mm1, MMWORD [MMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
    movq        mm2, MMWORD [MMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
    movq        mm3, MMWORD [MMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]

    movq        mm4, mm0
    movq        mm5, mm1
    pfsub       mm0, mm2                ; mm0=tmp11
    pfsub       mm1, mm3
    pfadd       mm4, mm2                ; mm4=tmp10
    pfadd       mm5, mm3                ; mm5=tmp13

    pfmul       mm1, [GOTOFF(ebx,PD_1_414)]
    pfsub       mm1, mm5                ; mm1=tmp12

    movq        mm6, mm4
    movq        mm7, mm0
    pfsub       mm4, mm5                ; mm4=tmp3
    pfsub       mm0, mm1                ; mm0=tmp2
    pfadd       mm6, mm5                ; mm6=tmp0
    pfadd       mm7, mm1                ; mm7=tmp1

    movq        MMWORD [wk(1)], mm4     ; tmp3
    movq        MMWORD [wk(0)], mm0     ; tmp2

    ; -- Odd part

    movq        mm2, MMWORD [MMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
    movq        mm3, MMWORD [MMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
    movq        mm5, MMWORD [MMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
    movq        mm1, MMWORD [MMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]

    movq        mm4, mm2
    movq        mm0, mm5
    pfadd       mm2, mm1                ; mm2=z11
    pfadd       mm5, mm3                ; mm5=z13
    pfsub       mm4, mm1                ; mm4=z12
    pfsub       mm0, mm3                ; mm0=z10

    movq        mm1, mm2
    pfsub       mm2, mm5
    pfadd       mm1, mm5                ; mm1=tmp7

    pfmul       mm2, [GOTOFF(ebx,PD_1_414)]  ; mm2=tmp11

    movq        mm3, mm0
    pfadd       mm0, mm4
    pfmul       mm0, [GOTOFF(ebx,PD_1_847)]  ; mm0=z5
    pfmul       mm3, [GOTOFF(ebx,PD_2_613)]  ; mm3=(z10 * 2.613125930)
    pfmul       mm4, [GOTOFF(ebx,PD_1_082)]  ; mm4=(z12 * 1.082392200)
    pfsubr      mm3, mm0                     ; mm3=tmp12 (reverse: z5 - mm3)
    pfsub       mm4, mm0                     ; mm4=tmp10

    ; -- Final output stage

    pfsub       mm3, mm1                ; mm3=tmp6
    movq        mm5, mm6
    movq        mm0, mm7
    pfadd       mm6, mm1                ; mm6=data0=(00 10)
    pfadd       mm7, mm3                ; mm7=data1=(01 11)
    pfsub       mm5, mm1                ; mm5=data7=(07 17)
    pfsub       mm0, mm3                ; mm0=data6=(06 16)
    pfsub       mm2, mm3                ; mm2=tmp5

    ; Float -> int16 with descale by 8: adding PD_RNDINT_MAGIC (1.5 * 2^26)
    ; makes one float ULP equal to 8, so the low word of each sum's bit
    ; pattern is roundint(data/8); the high word ("**") is discarded.
    movq        mm1, [GOTOFF(ebx,PD_RNDINT_MAGIC)]  ; mm1=[PD_RNDINT_MAGIC]
    pcmpeqd     mm3, mm3
    psrld       mm3, WORD_BIT           ; mm3={0xFFFF 0x0000 0xFFFF 0x0000}

    pfadd       mm6, mm1                ; mm6=roundint(data0/8)=(00 ** 10 **)
    pfadd       mm7, mm1                ; mm7=roundint(data1/8)=(01 ** 11 **)
    pfadd       mm0, mm1                ; mm0=roundint(data6/8)=(06 ** 16 **)
    pfadd       mm5, mm1                ; mm5=roundint(data7/8)=(07 ** 17 **)

    ; Interleave: keep the low word of one result and shift the other's low
    ; word into the high half, then OR the two together.
    pand        mm6, mm3                ; mm6=(00 -- 10 --)
    pslld       mm7, WORD_BIT           ; mm7=(-- 01 -- 11)
    pand        mm0, mm3                ; mm0=(06 -- 16 --)
    pslld       mm5, WORD_BIT           ; mm5=(-- 07 -- 17)
    por         mm6, mm7                ; mm6=(00 01 10 11)
    por         mm0, mm5                ; mm0=(06 07 16 17)

    movq        mm1, MMWORD [wk(0)]     ; mm1=tmp2
    movq        mm3, MMWORD [wk(1)]     ; mm3=tmp3

    pfadd       mm4, mm2                ; mm4=tmp4
    movq        mm7, mm1
    movq        mm5, mm3
    pfadd       mm1, mm2                ; mm1=data2=(02 12)
    pfadd       mm3, mm4                ; mm3=data4=(04 14)
    pfsub       mm7, mm2                ; mm7=data5=(05 15)
    pfsub       mm5, mm4                ; mm5=data3=(03 13)

    ; Constant and mask are rebuilt here because mm1/mm3 were reused above.
    movq        mm2, [GOTOFF(ebx,PD_RNDINT_MAGIC)]  ; mm2=[PD_RNDINT_MAGIC]
    pcmpeqd     mm4, mm4
    psrld       mm4, WORD_BIT           ; mm4={0xFFFF 0x0000 0xFFFF 0x0000}

    pfadd       mm3, mm2                ; mm3=roundint(data4/8)=(04 ** 14 **)
    pfadd       mm7, mm2                ; mm7=roundint(data5/8)=(05 ** 15 **)
    pfadd       mm1, mm2                ; mm1=roundint(data2/8)=(02 ** 12 **)
    pfadd       mm5, mm2                ; mm5=roundint(data3/8)=(03 ** 13 **)

    pand        mm3, mm4                ; mm3=(04 -- 14 --)
    pslld       mm7, WORD_BIT           ; mm7=(-- 05 -- 15)
    pand        mm1, mm4                ; mm1=(02 -- 12 --)
    pslld       mm5, WORD_BIT           ; mm5=(-- 03 -- 13)
    por         mm3, mm7                ; mm3=(04 05 14 15)
    por         mm1, mm5                ; mm1=(02 03 12 13)

    movq        mm2, [GOTOFF(ebx,PB_CENTERJSAMP)]  ; mm2=[PB_CENTERJSAMP]

    ; packsswb saturates each signed word to a signed byte; the byte-wise
    ; add of CENTERJSAMPLE then re-centers the samples.
    packsswb    mm6, mm3                ; mm6=(00 01 10 11 04 05 14 15)
    packsswb    mm1, mm0                ; mm1=(02 03 12 13 06 07 16 17)
    paddb       mm6, mm2
    paddb       mm1, mm2

    movq        mm4, mm6                ; transpose coefficients(phase 2)
    punpcklwd   mm6, mm1                ; mm6=(00 01 02 03 10 11 12 13)
    punpckhwd   mm4, mm1                ; mm4=(04 05 06 07 14 15 16 17)

    movq        mm7, mm6                ; transpose coefficients(phase 3)
    punpckldq   mm6, mm4                ; mm6=(00 01 02 03 04 05 06 07)
    punpckhdq   mm7, mm4                ; mm7=(10 11 12 13 14 15 16 17)

    pushpic     ebx                     ; save GOT address

    ; ebx doubles as the second row pointer while the GOT address is saved.
    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
    mov         ebx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
    movq        MMWORD [edx+eax*SIZEOF_JSAMPLE], mm6
    movq        MMWORD [ebx+eax*SIZEOF_JSAMPLE], mm7

    poppic      ebx                     ; restore GOT address

    add         esi, byte 2*SIZEOF_FAST_FLOAT  ; wsptr
    add         edi, byte 2*SIZEOF_JSAMPROW
    dec         ecx                            ; ctr
    jnz         near .rowloop

    femms                               ; empty MMX/3DNow! state

    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    pop         ebx
    mov         esp, ebp                ; esp <- aligned ebp
    pop         esp                     ; esp <- frame base saved at [ebp]
    pop         ebp                     ; restore caller's ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32
