1;
2; jidctflt.asm - floating-point IDCT (SSE & SSE2)
3;
4; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5;
6; Based on
7; x86 SIMD extension for IJG JPEG library
8; Copyright (C) 1999-2006, MIYASAKA Masaru.
9; For conditions of distribution and use, see copyright notice in jsimdext.inc
10;
11; This file should be assembled with NASM (Netwide Assembler),
12; can *not* be assembled with Microsoft's MASM or any compatible
13; assembler (including Borland's Turbo Assembler).
14; NASM is available from http://nasm.sourceforge.net/ or
15; http://sourceforge.net/project/showfiles.php?group_id=6208
16;
17; This file contains a floating-point implementation of the inverse DCT
18; (Discrete Cosine Transform). The following code is based directly on
19; the IJG's original jidctflt.c; see the jidctflt.c for more details.
20;
21; [TAB8]
22
23%include "jsimdext.inc"
24%include "jdct.inc"
25
26; --------------------------------------------------------------------------
27
28%macro  unpcklps2 2     ; in: %1=(0 1 2 3), %2=(4 5 6 7)  out: %1=(0 1 4 5)
        ; shufps selector fields, high to low: %2[1], %2[0], %1[1], %1[0]
29        shufps  %1, %2, (1<<6)|(0<<4)|(1<<2)|(0<<0)
30%endmacro
31
32%macro  unpckhps2 2     ; in: %1=(0 1 2 3), %2=(4 5 6 7)  out: %1=(2 3 6 7)
        ; shufps selector fields, high to low: %2[3], %2[2], %1[3], %1[2]
33        shufps  %1, %2, (3<<6)|(2<<4)|(3<<2)|(2<<0)
34%endmacro
35
36; --------------------------------------------------------------------------
37        SECTION SEG_CONST
38
39        alignz  16
40        global  EXTN(jconst_idct_float_sse2)
41
; Constant table used by jsimd_idct_float_sse2.  Each PD_* entry is one
; 16-byte vector holding four identical single-precision values, so it can
; be used directly as a packed operand; the function addresses the entries
; through GOTOFF(ebx,...).
42EXTN(jconst_idct_float_sse2):
43
; sqrt(2): scales (in2-in6) in the even part and (z11-z13) in the odd part.
44PD_1_414        times 4 dd  1.414213562373095048801689
; Multiplier applied to (z10+z12) to form z5.
45PD_1_847        times 4 dd  1.847759065022573512256366
; Multiplier applied to z12 before z5 is subtracted (odd-part tmp10).
46PD_1_082        times 4 dd  1.082392200292393968799446
; Multiplier applied to z10 before z5 is added (odd-part tmp12).
47PD_M2_613       times 4 dd -2.613125929752753055713286
; Rounding constant, equal to 1.5 * 2^26.  One single-precision ULP at this
; magnitude is 8, so adding it to a sample leaves roundint(sample/8) in the
; low bits of the float's bit pattern (see pass 2 of the function).
48PD_RNDINT_MAGIC times 4 dd  100663296.0 ; (float)(0x00C00000 << 3)
; Sixteen copies of CENTERJSAMPLE, added byte-wise to the packed signed
; results in pass 2.
49PB_CENTERJSAMP  times 16 db CENTERJSAMPLE
50
51        alignz  16
52
53; --------------------------------------------------------------------------
54        SECTION SEG_TEXT
55        BITS    32
56;
57; Perform dequantization and inverse DCT on one block of coefficients.
58;
59; GLOBAL(void)
60; jsimd_idct_float_sse2 (void * dct_table, JCOEFPTR coef_block,
61;                        JSAMPARRAY output_buf, JDIMENSION output_col)
62;
63
; Stack-argument offsets.  (b) is a register holding the address of the
; saved-ebp slot created by the function's initial "push ebp"; the return
; address is therefore at (b)+4 and the first argument at (b)+8.
64%define dct_table(b)    (b)+8           ; void * dct_table
65%define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
66%define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
67%define output_col(b)   (b)+20          ; JDIMENSION output_col
68
; Locals, all addressed from the realigned ebp set up in the prologue:
;   [original_ebp]  slot where the prologue stores the pre-alignment frame
;                   address (the (b) value used with the macros above)
;   wk(i)           i-th XMMWORD scratch slot; wk(WK_NUM-1) ends at ebp
;   workspace       DCTSIZE2 FAST_FLOAT values placed directly below wk(0)
; wk() refers to WK_NUM before its %define; this works because the macro
; body is only expanded where wk() is used.
69%define original_ebp    ebp+0
70%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
71%define WK_NUM          2
72%define workspace       wk(0)-DCTSIZE2*SIZEOF_FAST_FLOAT
73                                        ; FAST_FLOAT workspace[DCTSIZE2]
74
75        align   16
76        global  EXTN(jsimd_idct_float_sse2)
77
; --------------------------------------------------------------------------
; jsimd_idct_float_sse2
;
; Dequantizes one block of coefficients and applies a floating-point
; inverse DCT in two passes, handling four columns (pass 1) or four rows
; (pass 2) per loop iteration, DCTSIZE/4 iterations each:
;   pass 1  reads coef_block and dct_table, converts to float, multiplies
;           by the quantization values, runs the column transform and
;           stores the results transposed into the on-stack workspace;
;   pass 2  reads the workspace, runs the row transform, rounds via
;           PD_RNDINT_MAGIC, packs to bytes with signed saturation, adds
;           CENTERJSAMPLE and writes 8 samples per row to
;           output_buf[row] + output_col.
;
; Arguments are read from the caller's stack through the dct_table(),
; coef_block(), output_buf() and output_col() offset macros.
; Preserved: ebx, esi, edi, ebp, esp.
; Clobbered: eax, ecx, edx, xmm0-xmm7, flags.
; Stack:     ebp is realigned to SIZEOF_XMMWORD so that wk() and the
;            workspace can be accessed with movaps.
; --------------------------------------------------------------------------
78EXTN(jsimd_idct_float_sse2):
        ; Prologue.  eax captures the address of the saved-ebp slot (the
        ; base used with the argument macros).  It is also stored at the
        ; top of the aligned frame so the epilogue can recover esp from it.
79        push    ebp
80        mov     eax,esp                         ; eax = original ebp
81        sub     esp, byte 4
82        and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
83        mov     [esp],eax
84        mov     ebp,esp                         ; ebp = aligned ebp
        ; Reserve wk[] and the workspace below the aligned frame pointer.
85        lea     esp, [workspace]
86        push    ebx
87;       push    ecx             ; need not be preserved
88;       push    edx             ; need not be preserved
89        push    esi
90        push    edi
91
92        get_GOT ebx             ; get GOT address
93
94        ; ---- Pass 1: process columns from input, store into work array.
95
        ; eax is expected to still hold the frame address from the
        ; prologue, which is why the reload below is commented out
        ; (relies on get_GOT leaving eax intact -- TODO confirm).
96;       mov     eax, [original_ebp]
97        mov     edx, POINTER [dct_table(eax)]           ; quantptr
98        mov     esi, JCOEFPTR [coef_block(eax)]         ; inptr
99        lea     edi, [workspace]                        ; FAST_FLOAT * wsptr
100        mov     ecx, DCTSIZE/4                          ; ctr
101        alignx  16,7
102.columnloop:
103%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
        ; Quick reject: OR one dword from row 1 with one from row 2; any
        ; set bit means an AC term is nonzero, so go do the full transform.
        ; (eax is free to use as scratch from here on in pass 1.)
104        mov     eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
105        or      eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
106        jnz     near .columnDCT
107
        ; Full test: OR together rows 1..7 of the four current columns,
        ; saturate-pack the words to bytes (nonzero stays nonzero) and
        ; test the four resulting bytes as a single dword.
108        movq    xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
109        movq    xmm2, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
110        movq    xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
111        movq    xmm4, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
112        movq    xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
113        movq    xmm6, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
114        movq    xmm7, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
115        por     xmm1,xmm2
116        por     xmm3,xmm4
117        por     xmm5,xmm6
118        por     xmm1,xmm3
119        por     xmm5,xmm7
120        por     xmm1,xmm5
121        packsswb xmm1,xmm1
122        movd    eax,xmm1
123        test    eax,eax
124        jnz     short .columnDCT
125
126        ; -- AC terms all zero
127
        ; With only the DC term present every output of a column equals
        ; the dequantized DC value, so no transform is needed.
128        movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
129
        ; Duplicate each word, then arithmetic-shift right to sign-extend
        ; the coefficients to dwords before converting to float.
130        punpcklwd xmm0,xmm0             ; xmm0=(00 00 01 01 02 02 03 03)
131        psrad     xmm0,(DWORD_BIT-WORD_BIT)     ; xmm0=in0=(00 01 02 03)
132        cvtdq2ps  xmm0,xmm0                     ; xmm0=in0=(00 01 02 03)
133
134        mulps   xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
135
136        movaps  xmm1,xmm0
137        movaps  xmm2,xmm0
138        movaps  xmm3,xmm0
139
        ; Broadcast each column's value across a full vector.
140        shufps  xmm0,xmm0,0x00                  ; xmm0=(00 00 00 00)
141        shufps  xmm1,xmm1,0x55                  ; xmm1=(01 01 01 01)
142        shufps  xmm2,xmm2,0xAA                  ; xmm2=(02 02 02 02)
143        shufps  xmm3,xmm3,0xFF                  ; xmm3=(03 03 03 03)
144
        ; Workspace row k receives column k's value in all eight slots
        ; (two 4-float halves per row).
145        movaps  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
146        movaps  XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
147        movaps  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
148        movaps  XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
149        movaps  XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
150        movaps  XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
151        movaps  XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
152        movaps  XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
153        jmp     near .nextcolumn
154        alignx  16,7
155%endif
156.columnDCT:
157
158        ; -- Even part
159
        ; Load rows 0,2,4,6 of the four current columns, sign-extend to
        ; dwords, convert to float and dequantize.
160        movq      xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
161        movq      xmm1, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
162        movq      xmm2, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
163        movq      xmm3, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
164
165        punpcklwd xmm0,xmm0             ; xmm0=(00 00 01 01 02 02 03 03)
166        punpcklwd xmm1,xmm1             ; xmm1=(20 20 21 21 22 22 23 23)
167        psrad     xmm0,(DWORD_BIT-WORD_BIT)     ; xmm0=in0=(00 01 02 03)
168        psrad     xmm1,(DWORD_BIT-WORD_BIT)     ; xmm1=in2=(20 21 22 23)
169        cvtdq2ps  xmm0,xmm0                     ; xmm0=in0=(00 01 02 03)
170        cvtdq2ps  xmm1,xmm1                     ; xmm1=in2=(20 21 22 23)
171
172        punpcklwd xmm2,xmm2             ; xmm2=(40 40 41 41 42 42 43 43)
173        punpcklwd xmm3,xmm3             ; xmm3=(60 60 61 61 62 62 63 63)
174        psrad     xmm2,(DWORD_BIT-WORD_BIT)     ; xmm2=in4=(40 41 42 43)
175        psrad     xmm3,(DWORD_BIT-WORD_BIT)     ; xmm3=in6=(60 61 62 63)
176        cvtdq2ps  xmm2,xmm2                     ; xmm2=in4=(40 41 42 43)
177        cvtdq2ps  xmm3,xmm3                     ; xmm3=in6=(60 61 62 63)
178
179        mulps     xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
180        mulps     xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
181        mulps     xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
182        mulps     xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
183
184        movaps  xmm4,xmm0
185        movaps  xmm5,xmm1
186        subps   xmm0,xmm2               ; xmm0=tmp11
187        subps   xmm1,xmm3
188        addps   xmm4,xmm2               ; xmm4=tmp10
189        addps   xmm5,xmm3               ; xmm5=tmp13
190
191        mulps   xmm1,[GOTOFF(ebx,PD_1_414)]
192        subps   xmm1,xmm5               ; xmm1=tmp12
193
194        movaps  xmm6,xmm4
195        movaps  xmm7,xmm0
196        subps   xmm4,xmm5               ; xmm4=tmp3
197        subps   xmm0,xmm1               ; xmm0=tmp2
198        addps   xmm6,xmm5               ; xmm6=tmp0
199        addps   xmm7,xmm1               ; xmm7=tmp1
200
        ; All eight xmm registers are needed for the odd part, so tmp2 and
        ; tmp3 are spilled to the wk[] scratch slots.
201        movaps  XMMWORD [wk(1)], xmm4   ; tmp3
202        movaps  XMMWORD [wk(0)], xmm0   ; tmp2
203
204        ; -- Odd part
205
206        movq      xmm2, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
207        movq      xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
208        movq      xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
209        movq      xmm1, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
210
211        punpcklwd xmm2,xmm2             ; xmm2=(10 10 11 11 12 12 13 13)
212        punpcklwd xmm3,xmm3             ; xmm3=(30 30 31 31 32 32 33 33)
213        psrad     xmm2,(DWORD_BIT-WORD_BIT)     ; xmm2=in1=(10 11 12 13)
214        psrad     xmm3,(DWORD_BIT-WORD_BIT)     ; xmm3=in3=(30 31 32 33)
215        cvtdq2ps  xmm2,xmm2                     ; xmm2=in1=(10 11 12 13)
216        cvtdq2ps  xmm3,xmm3                     ; xmm3=in3=(30 31 32 33)
217
218        punpcklwd xmm5,xmm5             ; xmm5=(50 50 51 51 52 52 53 53)
219        punpcklwd xmm1,xmm1             ; xmm1=(70 70 71 71 72 72 73 73)
220        psrad     xmm5,(DWORD_BIT-WORD_BIT)     ; xmm5=in5=(50 51 52 53)
221        psrad     xmm1,(DWORD_BIT-WORD_BIT)     ; xmm1=in7=(70 71 72 73)
222        cvtdq2ps  xmm5,xmm5                     ; xmm5=in5=(50 51 52 53)
223        cvtdq2ps  xmm1,xmm1                     ; xmm1=in7=(70 71 72 73)
224
225        mulps     xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
226        mulps     xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
227        mulps     xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
228        mulps     xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
229
230        movaps  xmm4,xmm2
231        movaps  xmm0,xmm5
232        addps   xmm2,xmm1               ; xmm2=z11
233        addps   xmm5,xmm3               ; xmm5=z13
234        subps   xmm4,xmm1               ; xmm4=z12
235        subps   xmm0,xmm3               ; xmm0=z10
236
237        movaps  xmm1,xmm2
238        subps   xmm2,xmm5
239        addps   xmm1,xmm5               ; xmm1=tmp7
240
241        mulps   xmm2,[GOTOFF(ebx,PD_1_414)]     ; xmm2=tmp11
242
243        movaps  xmm3,xmm0
244        addps   xmm0,xmm4
245        mulps   xmm0,[GOTOFF(ebx,PD_1_847)]     ; xmm0=z5
246        mulps   xmm3,[GOTOFF(ebx,PD_M2_613)]    ; xmm3=(z10 * -2.613125930)
247        mulps   xmm4,[GOTOFF(ebx,PD_1_082)]     ; xmm4=(z12 * 1.082392200)
248        addps   xmm3,xmm0               ; xmm3=tmp12
249        subps   xmm4,xmm0               ; xmm4=tmp10
250
251        ; -- Final output stage
252
        ; Here xmm6=tmp0 and xmm7=tmp1 (even part); tmp2/tmp3 are in wk[].
253        subps   xmm3,xmm1               ; xmm3=tmp6
254        movaps  xmm5,xmm6
255        movaps  xmm0,xmm7
256        addps   xmm6,xmm1               ; xmm6=data0=(00 01 02 03)
257        addps   xmm7,xmm3               ; xmm7=data1=(10 11 12 13)
258        subps   xmm5,xmm1               ; xmm5=data7=(70 71 72 73)
259        subps   xmm0,xmm3               ; xmm0=data6=(60 61 62 63)
260        subps   xmm2,xmm3               ; xmm2=tmp5
261
262        movaps    xmm1,xmm6             ; transpose coefficients(phase 1)
263        unpcklps  xmm6,xmm7             ; xmm6=(00 10 01 11)
264        unpckhps  xmm1,xmm7             ; xmm1=(02 12 03 13)
265        movaps    xmm3,xmm0             ; transpose coefficients(phase 1)
266        unpcklps  xmm0,xmm5             ; xmm0=(60 70 61 71)
267        unpckhps  xmm3,xmm5             ; xmm3=(62 72 63 73)
268
        ; Swap spills: fetch tmp2/tmp3 and park the half-transposed
        ; data6/data7 pairs in the same wk[] slots.
269        movaps  xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
270        movaps  xmm5, XMMWORD [wk(1)]   ; xmm5=tmp3
271
272        movaps  XMMWORD [wk(0)], xmm0   ; wk(0)=(60 70 61 71)
273        movaps  XMMWORD [wk(1)], xmm3   ; wk(1)=(62 72 63 73)
274
275        addps   xmm4,xmm2               ; xmm4=tmp4
276        movaps  xmm0,xmm7
277        movaps  xmm3,xmm5
278        addps   xmm7,xmm2               ; xmm7=data2=(20 21 22 23)
279        addps   xmm5,xmm4               ; xmm5=data4=(40 41 42 43)
280        subps   xmm0,xmm2               ; xmm0=data5=(50 51 52 53)
281        subps   xmm3,xmm4               ; xmm3=data3=(30 31 32 33)
282
283        movaps    xmm2,xmm7             ; transpose coefficients(phase 1)
284        unpcklps  xmm7,xmm3             ; xmm7=(20 30 21 31)
285        unpckhps  xmm2,xmm3             ; xmm2=(22 32 23 33)
286        movaps    xmm4,xmm5             ; transpose coefficients(phase 1)
287        unpcklps  xmm5,xmm0             ; xmm5=(40 50 41 51)
288        unpckhps  xmm4,xmm0             ; xmm4=(42 52 43 53)
289
290        movaps    xmm3,xmm6             ; transpose coefficients(phase 2)
291        unpcklps2 xmm6,xmm7             ; xmm6=(00 10 20 30)
292        unpckhps2 xmm3,xmm7             ; xmm3=(01 11 21 31)
293        movaps    xmm0,xmm1             ; transpose coefficients(phase 2)
294        unpcklps2 xmm1,xmm2             ; xmm1=(02 12 22 32)
295        unpckhps2 xmm0,xmm2             ; xmm0=(03 13 23 33)
296
297        movaps  xmm7, XMMWORD [wk(0)]   ; xmm7=(60 70 61 71)
298        movaps  xmm2, XMMWORD [wk(1)]   ; xmm2=(62 72 63 73)
299
        ; First half (rows 0..3) of workspace rows 0..3, one row per
        ; input column.
300        movaps  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
301        movaps  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
302        movaps  XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
303        movaps  XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0
304
305        movaps    xmm6,xmm5             ; transpose coefficients(phase 2)
306        unpcklps2 xmm5,xmm7             ; xmm5=(40 50 60 70)
307        unpckhps2 xmm6,xmm7             ; xmm6=(41 51 61 71)
308        movaps    xmm3,xmm4             ; transpose coefficients(phase 2)
309        unpcklps2 xmm4,xmm2             ; xmm4=(42 52 62 72)
310        unpckhps2 xmm3,xmm2             ; xmm3=(43 53 63 73)
311
        ; Second half (rows 4..7) of the same four workspace rows.
312        movaps  XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
313        movaps  XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
314        movaps  XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
315        movaps  XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
316
317.nextcolumn:
        ; Advance input and quant pointers by four columns and the
        ; workspace pointer by four rows.
318        add     esi, byte 4*SIZEOF_JCOEF                ; coef_block
319        add     edx, byte 4*SIZEOF_FLOAT_MULT_TYPE      ; quantptr
320        add     edi,      4*DCTSIZE*SIZEOF_FAST_FLOAT   ; wsptr
321        dec     ecx                                     ; ctr
322        jnz     near .columnloop
323
324        ; -- Prefetch the next coefficient block
325
        ; esi has advanced by 8 coefficients over the two iterations, so
        ; adding (DCTSIZE2-8) coefficients points just past this block.
326        prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
327        prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
328        prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
329        prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
330
331        ; ---- Pass 2: process rows from work array, store into output array.
332
        ; eax was used as scratch in pass 1, so reload the frame address
        ; before fetching the remaining arguments.  After this, eax holds
        ; output_col for the rest of the function.
333        mov     eax, [original_ebp]
334        lea     esi, [workspace]                        ; FAST_FLOAT * wsptr
335        mov     edi, JSAMPARRAY [output_buf(eax)]       ; (JSAMPROW *)
336        mov     eax, JDIMENSION [output_col(eax)]
337        mov     ecx, DCTSIZE/4                          ; ctr
338        alignx  16,7
339.rowloop:
340
341        ; -- Even part
342
        ; Each vector holds one transform input for four output rows at
        ; once (one row per lane).
343        movaps  xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
344        movaps  xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
345        movaps  xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
346        movaps  xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
347
348        movaps  xmm4,xmm0
349        movaps  xmm5,xmm1
350        subps   xmm0,xmm2               ; xmm0=tmp11
351        subps   xmm1,xmm3
352        addps   xmm4,xmm2               ; xmm4=tmp10
353        addps   xmm5,xmm3               ; xmm5=tmp13
354
355        mulps   xmm1,[GOTOFF(ebx,PD_1_414)]
356        subps   xmm1,xmm5               ; xmm1=tmp12
357
358        movaps  xmm6,xmm4
359        movaps  xmm7,xmm0
360        subps   xmm4,xmm5               ; xmm4=tmp3
361        subps   xmm0,xmm1               ; xmm0=tmp2
362        addps   xmm6,xmm5               ; xmm6=tmp0
363        addps   xmm7,xmm1               ; xmm7=tmp1
364
365        movaps  XMMWORD [wk(1)], xmm4   ; tmp3
366        movaps  XMMWORD [wk(0)], xmm0   ; tmp2
367
368        ; -- Odd part
369
370        movaps  xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
371        movaps  xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
372        movaps  xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
373        movaps  xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
374
375        movaps  xmm4,xmm2
376        movaps  xmm0,xmm5
377        addps   xmm2,xmm1               ; xmm2=z11
378        addps   xmm5,xmm3               ; xmm5=z13
379        subps   xmm4,xmm1               ; xmm4=z12
380        subps   xmm0,xmm3               ; xmm0=z10
381
382        movaps  xmm1,xmm2
383        subps   xmm2,xmm5
384        addps   xmm1,xmm5               ; xmm1=tmp7
385
386        mulps   xmm2,[GOTOFF(ebx,PD_1_414)]     ; xmm2=tmp11
387
388        movaps  xmm3,xmm0
389        addps   xmm0,xmm4
390        mulps   xmm0,[GOTOFF(ebx,PD_1_847)]     ; xmm0=z5
391        mulps   xmm3,[GOTOFF(ebx,PD_M2_613)]    ; xmm3=(z10 * -2.613125930)
392        mulps   xmm4,[GOTOFF(ebx,PD_1_082)]     ; xmm4=(z12 * 1.082392200)
393        addps   xmm3,xmm0               ; xmm3=tmp12
394        subps   xmm4,xmm0               ; xmm4=tmp10
395
396        ; -- Final output stage
397
398        subps   xmm3,xmm1               ; xmm3=tmp6
399        movaps  xmm5,xmm6
400        movaps  xmm0,xmm7
401        addps   xmm6,xmm1               ; xmm6=data0=(00 10 20 30)
402        addps   xmm7,xmm3               ; xmm7=data1=(01 11 21 31)
403        subps   xmm5,xmm1               ; xmm5=data7=(07 17 27 37)
404        subps   xmm0,xmm3               ; xmm0=data6=(06 16 26 36)
405        subps   xmm2,xmm3               ; xmm2=tmp5
406
        ; Float-to-integer conversion without a convert instruction:
        ; PD_RNDINT_MAGIC is 1.5 * 2^26, where one float ULP is 8, so the
        ; add leaves roundint(data/8) in the low 16 bits of each lane's
        ; bit pattern (rounding follows the current MXCSR mode).  The
        ; 0x0000FFFF mask and the 16-bit left shift then interleave two
        ; results into the low and high words of each dword.
407        movaps  xmm1,[GOTOFF(ebx,PD_RNDINT_MAGIC)]      ; xmm1=[PD_RNDINT_MAGIC]
408        pcmpeqd xmm3,xmm3
409        psrld   xmm3,WORD_BIT           ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
410
411        addps   xmm6,xmm1       ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
412        addps   xmm7,xmm1       ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
413        addps   xmm0,xmm1       ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
414        addps   xmm5,xmm1       ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
415
416        pand    xmm6,xmm3               ; xmm6=(00 -- 10 -- 20 -- 30 --)
417        pslld   xmm7,WORD_BIT           ; xmm7=(-- 01 -- 11 -- 21 -- 31)
418        pand    xmm0,xmm3               ; xmm0=(06 -- 16 -- 26 -- 36 --)
419        pslld   xmm5,WORD_BIT           ; xmm5=(-- 07 -- 17 -- 27 -- 37)
420        por     xmm6,xmm7               ; xmm6=(00 01 10 11 20 21 30 31)
421        por     xmm0,xmm5               ; xmm0=(06 07 16 17 26 27 36 37)
422
423        movaps  xmm1, XMMWORD [wk(0)]   ; xmm1=tmp2
424        movaps  xmm3, XMMWORD [wk(1)]   ; xmm3=tmp3
425
426        addps   xmm4,xmm2               ; xmm4=tmp4
427        movaps  xmm7,xmm1
428        movaps  xmm5,xmm3
429        addps   xmm1,xmm2               ; xmm1=data2=(02 12 22 32)
430        addps   xmm3,xmm4               ; xmm3=data4=(04 14 24 34)
431        subps   xmm7,xmm2               ; xmm7=data5=(05 15 25 35)
432        subps   xmm5,xmm4               ; xmm5=data3=(03 13 23 33)
433
        ; Same rounding and word interleave for data2..data5; the constant
        ; and mask are rebuilt because their registers were reused.
434        movaps  xmm2,[GOTOFF(ebx,PD_RNDINT_MAGIC)]      ; xmm2=[PD_RNDINT_MAGIC]
435        pcmpeqd xmm4,xmm4
436        psrld   xmm4,WORD_BIT           ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
437
438        addps   xmm3,xmm2       ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
439        addps   xmm7,xmm2       ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
440        addps   xmm1,xmm2       ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
441        addps   xmm5,xmm2       ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
442
443        pand    xmm3,xmm4               ; xmm3=(04 -- 14 -- 24 -- 34 --)
444        pslld   xmm7,WORD_BIT           ; xmm7=(-- 05 -- 15 -- 25 -- 35)
445        pand    xmm1,xmm4               ; xmm1=(02 -- 12 -- 22 -- 32 --)
446        pslld   xmm5,WORD_BIT           ; xmm5=(-- 03 -- 13 -- 23 -- 33)
447        por     xmm3,xmm7               ; xmm3=(04 05 14 15 24 25 34 35)
448        por     xmm1,xmm5               ; xmm1=(02 03 12 13 22 23 32 33)
449
450        movdqa    xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)]     ; xmm2=[PB_CENTERJSAMP]
451
        ; Pack words to bytes with signed saturation, then add
        ; CENTERJSAMPLE to every byte (wrapping byte add).
452        packsswb  xmm6,xmm3     ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
453        packsswb  xmm1,xmm0     ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
454        paddb     xmm6,xmm2
455        paddb     xmm1,xmm2
456
        ; Regroup the bytes so that each 8-byte half is one complete
        ; output row.
457        movdqa    xmm4,xmm6     ; transpose coefficients(phase 2)
458        punpcklwd xmm6,xmm1     ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
459        punpckhwd xmm4,xmm1     ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
460
461        movdqa    xmm7,xmm6     ; transpose coefficients(phase 3)
462        punpckldq xmm6,xmm4     ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
463        punpckhdq xmm7,xmm4     ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
464
        ; Swap the 64-bit halves so rows 1 and 3 land in the low qwords,
        ; where movq can store them.
465        pshufd  xmm5,xmm6,0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
466        pshufd  xmm3,xmm7,0x4E  ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
467
        ; ebx is borrowed as a second row pointer for the stores, so the
        ; GOT address is saved and restored around them.
468        pushpic ebx                     ; save GOT address
469
        ; Store 8 samples to each of the four rows at row_ptr + output_col.
470        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
471        mov     ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
472        movq    XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
473        movq    XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm7
474        mov     edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
475        mov     ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
476        movq    XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
477        movq    XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3
478
479        poppic  ebx                     ; restore GOT address
480
        ; Advance to the next four rows: four floats along each workspace
        ; row and four entries in the row-pointer array.
481        add     esi, byte 4*SIZEOF_FAST_FLOAT   ; wsptr
482        add     edi, byte 4*SIZEOF_JSAMPROW
483        dec     ecx                             ; ctr
484        jnz     near .rowloop
485
        ; Epilogue: restore callee-saved registers, then unwind the
        ; aligned frame.  "pop esp" reloads the frame address that the
        ; prologue stored at [ebp], leaving esp at the saved-ebp slot.
486        pop     edi
487        pop     esi
488;       pop     edx             ; need not be preserved
489;       pop     ecx             ; need not be preserved
490        pop     ebx
491        mov     esp,ebp         ; esp <- aligned ebp
492        pop     esp             ; esp <- original ebp
493        pop     ebp
494        ret
495
496; For some reason, the OS X linker does not honor the request to align the
497; segment unless we do this.
498        align   16
499