1;
2; jidctflt.asm - floating-point IDCT (SSE & SSE2)
3;
4; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5; Copyright (C) 2016, D. R. Commander.
6;
7; Based on the x86 SIMD extension for IJG JPEG library
8; Copyright (C) 1999-2006, MIYASAKA Masaru.
9; For conditions of distribution and use, see copyright notice in jsimdext.inc
10;
11; This file should be assembled with NASM (Netwide Assembler); it
12; can *not* be assembled with Microsoft's MASM or any compatible
13; assembler (including Borland's Turbo Assembler).
14; NASM is available from http://nasm.sourceforge.net/ or
15; http://sourceforge.net/project/showfiles.php?group_id=6208
16;
17; This file contains a floating-point implementation of the inverse DCT
18; (Discrete Cosine Transform). The following code is based directly on
19; the IJG's original jidctflt.c; see jidctflt.c for more details.
20;
21; [TAB8]
22
23%include "jsimdext.inc"
24%include "jdct.inc"
25
26; --------------------------------------------------------------------------
27
; unpcklps2 dst, src
; Two-operand helper built on SHUFPS.  Selector 0x44 (binary 01 00 01 00)
; keeps elements 0 and 1 of %1 in the low half of the result and copies
; elements 0 and 1 of %2 into the high half.  Used in pass 1 as the second
; phase of the 4x4 float transposes.
28%macro unpcklps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
29    shufps      %1, %2, 0x44
30%endmacro
31
; unpckhps2 dst, src
; Two-operand helper built on SHUFPS.  Selector 0xEE (binary 11 10 11 10)
; moves elements 2 and 3 of %1 into the low half of the result and copies
; elements 2 and 3 of %2 into the high half.  Counterpart of the low-half
; helper for the second phase of the 4x4 float transposes.
32%macro unpckhps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
33    shufps      %1, %2, 0xEE
34%endmacro
35
36; --------------------------------------------------------------------------
37    SECTION     SEG_CONST
38
; Constant table for jsimd_idct_float_sse2.  Each PD_* entry repeats one
; single-precision value four times (16 bytes), so it can be used directly as
; a packed memory operand of mulps/addps/movaps.  The code addresses these
; labels through GOTOFF(ebx, ...).
; NOTE(review): alignz, GLOBAL_DATA and EXTN are macros defined outside this
; file (presumably in jsimdext.inc); alignz presumably pads with zero bytes to
; the requested alignment -- confirm there.
39    alignz      32
40    GLOBAL_DATA(jconst_idct_float_sse2)
41
42EXTN(jconst_idct_float_sse2):
43
; Multipliers of the IDCT butterflies (see the "Even part" / "Odd part"
; sections of both passes for where each one is applied).
44PD_1_414        times 4  dd  1.414213562373095048801689
45PD_1_847        times 4  dd  1.847759065022573512256366
46PD_1_082        times 4  dd  1.082392200292393968799446
47PD_M2_613       times 4  dd -2.613125929752753055713286
; Rounding "magic" number: 100663296.0 = 1.5 * 2^26.  Neighbouring
; single-precision values at this magnitude are 8 apart, so after
; "addps x, PD_RNDINT_MAGIC" the low bits of the result's bit pattern hold
; x/8 rounded to an integer (rounding follows the current MXCSR mode).
; Pass 2 extracts the low 16 bits of each lane to obtain that integer.
48PD_RNDINT_MAGIC times 4  dd  100663296.0  ; (float)(0x00C00000 << 3)
; 16 copies of CENTERJSAMPLE, added byte-wise (paddb) to the packed signed
; results in pass 2.  CENTERJSAMPLE itself is defined outside this file.
49PB_CENTERJSAMP  times 16 db  CENTERJSAMPLE
50
51    alignz      32
52
53; --------------------------------------------------------------------------
54    SECTION     SEG_TEXT
55    BITS        32
56;
57; Perform dequantization and inverse DCT on one block of coefficients.
58;
59; GLOBAL(void)
60; jsimd_idct_float_sse2(void *dct_table, JCOEFPTR coef_block,
61;                       JSAMPARRAY output_buf, JDIMENSION output_col)
62;
63

; Stack-argument accessors.  (b) is the address of the saved-ebp slot created
; by the "push ebp" in the prologue: the return address sits at (b)+4, so the
; four 4-byte stack arguments start at (b)+8.
64%define dct_table(b)   (b) + 8          ; void *dct_table
65%define coef_block(b)  (b) + 12         ; JCOEFPTR coef_block
66%define output_buf(b)  (b) + 16         ; JSAMPARRAY output_buf
67%define output_col(b)  (b) + 20         ; JDIMENSION output_col
68
; Local frame layout, relative to the realigned ebp set up in the prologue:
;   [ebp + 0]   pre-alignment frame pointer (the (b) value used above)
;   wk(i)       WK_NUM xmmword scratch slots directly below ebp; wk(0) is the
;               lowest address, wk(WK_NUM-1) ends at ebp
;   workspace   DCTSIZE2 FAST_FLOAT values directly below wk(0); pass 1 writes
;               it and pass 2 reads it
; wk(i) refers to WK_NUM before WK_NUM is defined; this works because NASM
; expands %define macros where they are used, not where they are defined.
69%define original_ebp   ebp + 0
70%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_XMMWORD
71                                        ; xmmword wk[WK_NUM]
72%define WK_NUM         2
73%define workspace      wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
74                                        ; FAST_FLOAT workspace[DCTSIZE2]
75
76    align       32
77    GLOBAL_FUNCTION(jsimd_idct_float_sse2)
78
; ---------------------------------------------------------------------------
; jsimd_idct_float_sse2
;
; Dequantizes one block of coefficients, applies the floating-point inverse
; DCT and stores the resulting samples into four-row groups of output_buf at
; column offset output_col.  All four arguments are read from the stack
; through the dct_table()/coef_block()/output_buf()/output_col() macros.
;
; Structure
;   Pass 1 (.columnloop, DCTSIZE/4 iterations): each iteration converts four
;     columns of coefficients to float, multiplies them by the matching
;     dct_table entries, runs the 1-D IDCT and writes the result transposed
;     into workspace[] on the stack.
;   Pass 2 (.rowloop, DCTSIZE/4 iterations): each iteration runs the same
;     1-D IDCT on vectors read from workspace[], rounds and descales them,
;     packs them to bytes, adds PB_CENTERJSAMP and stores 8 bytes to each of
;     four output rows.
;
; Register roles
;   ebp  realigned frame pointer; [ebp] holds the pre-alignment frame pointer
;   ebx  GOT address for GOTOFF() constant references (reused as a row
;        pointer between pushpic/poppic in pass 2)
;   ecx  loop counter in both passes
;   Pass 1: edx = quantptr, esi = inptr, edi = wsptr, eax = scratch
;   Pass 2: esi = wsptr, edi = output row-pointer array, eax = output_col,
;           edx = row pointer
;
; Saved and restored: ebp, ebx, esi, edi.
; Overwritten: eax, ecx, edx, xmm0-xmm7, flags.
; ---------------------------------------------------------------------------
79EXTN(jsimd_idct_float_sse2):
80    push        ebp
    ; eax = esp right after "push ebp", i.e. the value a conventional
    ; "mov ebp, esp" frame pointer would hold; arguments are at eax+8 and up.
81    mov         eax, esp                     ; eax = original ebp
    ; Reserve room for one saved pointer, round esp down to an xmmword
    ; boundary and store the pre-alignment frame pointer at the new frame base.
82    sub         esp, byte 4
83    and         esp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
84    mov         [esp], eax
85    mov         ebp, esp                     ; ebp = aligned ebp
    ; Move esp below wk[] and workspace[] so the pushes that follow cannot
    ; overwrite them.
86    lea         esp, [workspace]
87    push        ebx
88;   push        ecx                     ; need not be preserved
89;   push        edx                     ; need not be preserved
90    push        esi
91    push        edi
92
93    get_GOT     ebx                     ; get GOT address
94
95    ; ---- Pass 1: process columns from input, store into work array.
96
    ; The reload of eax below is commented out: the argument loads rely on eax
    ; still holding the frame pointer captured in the prologue.
    ; NOTE(review): get_GOT is defined outside this file and must therefore
    ; leave eax intact -- confirm in jsimdext.inc.
97;   mov         eax, [original_ebp]
98    mov         edx, POINTER [dct_table(eax)]    ; quantptr
99    mov         esi, JCOEFPTR [coef_block(eax)]  ; inptr
100    lea         edi, [workspace]                 ; FAST_FLOAT *wsptr
101    mov         ecx, DCTSIZE/4                   ; ctr
102    alignx      16, 7
103.columnloop:
104%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
    ; Shortcut for column groups whose AC terms are all zero.
    ; Quick reject first: OR one dword from row 1 with one dword from row 2
    ; and go straight to the full IDCT if any bit is set.
105    mov         eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
106    or          eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
107    jnz         near .columnDCT
108
    ; Full test: OR the four coefficients of rows 1..7 together.
109    movq        xmm1, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
110    movq        xmm2, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
111    movq        xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
112    movq        xmm4, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
113    movq        xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
114    movq        xmm6, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
115    movq        xmm7, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
116    por         xmm1, xmm2
117    por         xmm3, xmm4
118    por         xmm5, xmm6
119    por         xmm1, xmm3
120    por         xmm5, xmm7
121    por         xmm1, xmm5
    ; Saturating word->byte pack: a non-zero word always yields a non-zero
    ; byte, so the four words of interest fit in the dword read by movd.
122    packsswb    xmm1, xmm1
123    movd        eax, xmm1
124    test        eax, eax
125    jnz         short .columnDCT
126
127    ; -- AC terms all zero
128
    ; Only row 0 contributes: dequantize it, then broadcast each of the four
    ; values across both halves of its workspace row.
129    movq        xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
130
    ; Sign-extend word->dword: duplicate each word into a dword, then
    ; arithmetic-shift the upper copy down.
131    punpcklwd   xmm0, xmm0                  ; xmm0=(00 00 01 01 02 02 03 03)
132    psrad       xmm0, (DWORD_BIT-WORD_BIT)  ; xmm0=in0=(00 01 02 03)
133    cvtdq2ps    xmm0, xmm0                  ; xmm0=in0=(00 01 02 03)
134
135    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
136
137    movaps      xmm1, xmm0
138    movaps      xmm2, xmm0
139    movaps      xmm3, xmm0
140
141    shufps      xmm0, xmm0, 0x00        ; xmm0=(00 00 00 00)
142    shufps      xmm1, xmm1, 0x55        ; xmm1=(01 01 01 01)
143    shufps      xmm2, xmm2, 0xAA        ; xmm2=(02 02 02 02)
144    shufps      xmm3, xmm3, 0xFF        ; xmm3=(03 03 03 03)
145
146    movaps      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
147    movaps      XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
148    movaps      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm1
149    movaps      XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1
150    movaps      XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm2
151    movaps      XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm2
152    movaps      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm3
153    movaps      XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
154    jmp         near .nextcolumn
155    alignx      16, 7
156%endif
157.columnDCT:
158
159    ; -- Even part
160
    ; Load rows 0/2/4/6 (four coefficients each), sign-extend to dwords,
    ; convert to float and multiply by the matching dct_table entries.
161    movq        xmm0, XMM_MMWORD [MMBLOCK(0,0,esi,SIZEOF_JCOEF)]
162    movq        xmm1, XMM_MMWORD [MMBLOCK(2,0,esi,SIZEOF_JCOEF)]
163    movq        xmm2, XMM_MMWORD [MMBLOCK(4,0,esi,SIZEOF_JCOEF)]
164    movq        xmm3, XMM_MMWORD [MMBLOCK(6,0,esi,SIZEOF_JCOEF)]
165
166    punpcklwd   xmm0, xmm0                  ; xmm0=(00 00 01 01 02 02 03 03)
167    punpcklwd   xmm1, xmm1                  ; xmm1=(20 20 21 21 22 22 23 23)
168    psrad       xmm0, (DWORD_BIT-WORD_BIT)  ; xmm0=in0=(00 01 02 03)
169    psrad       xmm1, (DWORD_BIT-WORD_BIT)  ; xmm1=in2=(20 21 22 23)
170    cvtdq2ps    xmm0, xmm0                  ; xmm0=in0=(00 01 02 03)
171    cvtdq2ps    xmm1, xmm1                  ; xmm1=in2=(20 21 22 23)
172
173    punpcklwd   xmm2, xmm2                  ; xmm2=(40 40 41 41 42 42 43 43)
174    punpcklwd   xmm3, xmm3                  ; xmm3=(60 60 61 61 62 62 63 63)
175    psrad       xmm2, (DWORD_BIT-WORD_BIT)  ; xmm2=in4=(40 41 42 43)
176    psrad       xmm3, (DWORD_BIT-WORD_BIT)  ; xmm3=in6=(60 61 62 63)
177    cvtdq2ps    xmm2, xmm2                  ; xmm2=in4=(40 41 42 43)
178    cvtdq2ps    xmm3, xmm3                  ; xmm3=in6=(60 61 62 63)
179
180    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
181    mulps       xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
182    mulps       xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
183    mulps       xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
184
    ; tmp10 = in0 + in4, tmp11 = in0 - in4, tmp13 = in2 + in6,
    ; tmp12 = (in2 - in6) * 1.414213562 - tmp13
185    movaps      xmm4, xmm0
186    movaps      xmm5, xmm1
187    subps       xmm0, xmm2              ; xmm0=tmp11
188    subps       xmm1, xmm3
189    addps       xmm4, xmm2              ; xmm4=tmp10
190    addps       xmm5, xmm3              ; xmm5=tmp13
191
192    mulps       xmm1, [GOTOFF(ebx,PD_1_414)]
193    subps       xmm1, xmm5              ; xmm1=tmp12
194
195    movaps      xmm6, xmm4
196    movaps      xmm7, xmm0
197    subps       xmm4, xmm5              ; xmm4=tmp3
198    subps       xmm0, xmm1              ; xmm0=tmp2
199    addps       xmm6, xmm5              ; xmm6=tmp0
200    addps       xmm7, xmm1              ; xmm7=tmp1
201
    ; Spill tmp2/tmp3: all eight xmm registers are needed for the odd part.
202    movaps      XMMWORD [wk(1)], xmm4   ; tmp3
203    movaps      XMMWORD [wk(0)], xmm0   ; tmp2
204
205    ; -- Odd part
206
    ; Same load / sign-extend / convert / dequantize sequence for rows 1/3/5/7.
207    movq        xmm2, XMM_MMWORD [MMBLOCK(1,0,esi,SIZEOF_JCOEF)]
208    movq        xmm3, XMM_MMWORD [MMBLOCK(3,0,esi,SIZEOF_JCOEF)]
209    movq        xmm5, XMM_MMWORD [MMBLOCK(5,0,esi,SIZEOF_JCOEF)]
210    movq        xmm1, XMM_MMWORD [MMBLOCK(7,0,esi,SIZEOF_JCOEF)]
211
212    punpcklwd   xmm2, xmm2                  ; xmm2=(10 10 11 11 12 12 13 13)
213    punpcklwd   xmm3, xmm3                  ; xmm3=(30 30 31 31 32 32 33 33)
214    psrad       xmm2, (DWORD_BIT-WORD_BIT)  ; xmm2=in1=(10 11 12 13)
215    psrad       xmm3, (DWORD_BIT-WORD_BIT)  ; xmm3=in3=(30 31 32 33)
216    cvtdq2ps    xmm2, xmm2                  ; xmm2=in1=(10 11 12 13)
217    cvtdq2ps    xmm3, xmm3                  ; xmm3=in3=(30 31 32 33)
218
219    punpcklwd   xmm5, xmm5                  ; xmm5=(50 50 51 51 52 52 53 53)
220    punpcklwd   xmm1, xmm1                  ; xmm1=(70 70 71 71 72 72 73 73)
221    psrad       xmm5, (DWORD_BIT-WORD_BIT)  ; xmm5=in5=(50 51 52 53)
222    psrad       xmm1, (DWORD_BIT-WORD_BIT)  ; xmm1=in7=(70 71 72 73)
223    cvtdq2ps    xmm5, xmm5                  ; xmm5=in5=(50 51 52 53)
224    cvtdq2ps    xmm1, xmm1                  ; xmm1=in7=(70 71 72 73)
225
226    mulps       xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
227    mulps       xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
228    mulps       xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
229    mulps       xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_FLOAT_MULT_TYPE)]
230
    ; z11 = in1 + in7, z12 = in1 - in7, z13 = in5 + in3, z10 = in5 - in3
231    movaps      xmm4, xmm2
232    movaps      xmm0, xmm5
233    addps       xmm2, xmm1              ; xmm2=z11
234    addps       xmm5, xmm3              ; xmm5=z13
235    subps       xmm4, xmm1              ; xmm4=z12
236    subps       xmm0, xmm3              ; xmm0=z10
237
    ; tmp7 = z11 + z13, tmp11 = (z11 - z13) * 1.414213562
238    movaps      xmm1, xmm2
239    subps       xmm2, xmm5
240    addps       xmm1, xmm5              ; xmm1=tmp7
241
242    mulps       xmm2, [GOTOFF(ebx,PD_1_414)]  ; xmm2=tmp11
243
    ; z5 = (z10 + z12) * 1.847759065
    ; tmp12 = z10 * -2.613125930 + z5, tmp10 = z12 * 1.082392200 - z5
244    movaps      xmm3, xmm0
245    addps       xmm0, xmm4
246    mulps       xmm0, [GOTOFF(ebx,PD_1_847)]   ; xmm0=z5
247    mulps       xmm3, [GOTOFF(ebx,PD_M2_613)]  ; xmm3=(z10 * -2.613125930)
248    mulps       xmm4, [GOTOFF(ebx,PD_1_082)]   ; xmm4=(z12 * 1.082392200)
249    addps       xmm3, xmm0                     ; xmm3=tmp12
250    subps       xmm4, xmm0                     ; xmm4=tmp10
251
252    ; -- Final output stage
253
    ; Combine even and odd halves.  Outputs are transposed in 4x4 tiles as
    ; they become available so that register pressure stays within xmm0-xmm7
    ; plus the two wk[] slots.
254    subps       xmm3, xmm1              ; xmm3=tmp6
255    movaps      xmm5, xmm6
256    movaps      xmm0, xmm7
257    addps       xmm6, xmm1              ; xmm6=data0=(00 01 02 03)
258    addps       xmm7, xmm3              ; xmm7=data1=(10 11 12 13)
259    subps       xmm5, xmm1              ; xmm5=data7=(70 71 72 73)
260    subps       xmm0, xmm3              ; xmm0=data6=(60 61 62 63)
261    subps       xmm2, xmm3              ; xmm2=tmp5
262
263    movaps      xmm1, xmm6              ; transpose coefficients(phase 1)
264    unpcklps    xmm6, xmm7              ; xmm6=(00 10 01 11)
265    unpckhps    xmm1, xmm7              ; xmm1=(02 12 03 13)
266    movaps      xmm3, xmm0              ; transpose coefficients(phase 1)
267    unpcklps    xmm0, xmm5              ; xmm0=(60 70 61 71)
268    unpckhps    xmm3, xmm5              ; xmm3=(62 72 63 73)
269
    ; Swap: reload the spilled tmp2/tmp3 and park the half-transposed
    ; data6/data7 pairs in the same wk[] slots.
270    movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
271    movaps      xmm5, XMMWORD [wk(1)]   ; xmm5=tmp3
272
273    movaps      XMMWORD [wk(0)], xmm0   ; wk(0)=(60 70 61 71)
274    movaps      XMMWORD [wk(1)], xmm3   ; wk(1)=(62 72 63 73)
275
276    addps       xmm4, xmm2              ; xmm4=tmp4
277    movaps      xmm0, xmm7
278    movaps      xmm3, xmm5
279    addps       xmm7, xmm2              ; xmm7=data2=(20 21 22 23)
280    addps       xmm5, xmm4              ; xmm5=data4=(40 41 42 43)
281    subps       xmm0, xmm2              ; xmm0=data5=(50 51 52 53)
282    subps       xmm3, xmm4              ; xmm3=data3=(30 31 32 33)
283
284    movaps      xmm2, xmm7              ; transpose coefficients(phase 1)
285    unpcklps    xmm7, xmm3              ; xmm7=(20 30 21 31)
286    unpckhps    xmm2, xmm3              ; xmm2=(22 32 23 33)
287    movaps      xmm4, xmm5              ; transpose coefficients(phase 1)
288    unpcklps    xmm5, xmm0              ; xmm5=(40 50 41 51)
289    unpckhps    xmm4, xmm0              ; xmm4=(42 52 43 53)
290
291    movaps      xmm3, xmm6              ; transpose coefficients(phase 2)
292    unpcklps2   xmm6, xmm7              ; xmm6=(00 10 20 30)
293    unpckhps2   xmm3, xmm7              ; xmm3=(01 11 21 31)
294    movaps      xmm0, xmm1              ; transpose coefficients(phase 2)
295    unpcklps2   xmm1, xmm2              ; xmm1=(02 12 22 32)
296    unpckhps2   xmm0, xmm2              ; xmm0=(03 13 23 33)
297
298    movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=(60 70 61 71)
299    movaps      xmm2, XMMWORD [wk(1)]   ; xmm2=(62 72 63 73)
300
    ; Each workspace row r receives rows 0..3 of column r in its first half
    ; here and rows 4..7 in its second half below.
301    movaps      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm6
302    movaps      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
303    movaps      XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_FAST_FLOAT)], xmm1
304    movaps      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_FAST_FLOAT)], xmm0
305
306    movaps      xmm6, xmm5              ; transpose coefficients(phase 2)
307    unpcklps2   xmm5, xmm7              ; xmm5=(40 50 60 70)
308    unpckhps2   xmm6, xmm7              ; xmm6=(41 51 61 71)
309    movaps      xmm3, xmm4              ; transpose coefficients(phase 2)
310    unpcklps2   xmm4, xmm2              ; xmm4=(42 52 62 72)
311    unpckhps2   xmm3, xmm2              ; xmm3=(43 53 63 73)
312
313    movaps      XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm5
314    movaps      XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6
315    movaps      XMMWORD [XMMBLOCK(2,1,edi,SIZEOF_FAST_FLOAT)], xmm4
316    movaps      XMMWORD [XMMBLOCK(3,1,edi,SIZEOF_FAST_FLOAT)], xmm3
317
318.nextcolumn:
    ; Advance four columns in the input and quantization table, and four
    ; rows (4*DCTSIZE floats) in the transposed workspace.
319    add         esi, byte 4*SIZEOF_JCOEF               ; coef_block
320    add         edx, byte 4*SIZEOF_FLOAT_MULT_TYPE     ; quantptr
321    add         edi,      4*DCTSIZE*SIZEOF_FAST_FLOAT  ; wsptr
322    dec         ecx                                    ; ctr
323    jnz         near .columnloop
324
325    ; -- Prefetch the next coefficient block
326
    ; After the loop esi has advanced past (DCTSIZE/4)*4 coefficients, so
    ; adding (DCTSIZE2-8)*SIZEOF_JCOEF with DCTSIZE == 8 gives the address
    ; one full block past the start of this one.
    ; NOTE(review): this assumes the next coefficient block follows this one
    ; contiguously in memory -- confirm against the caller.
327    prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
328    prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
329    prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
330    prefetchnta [esi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
331
332    ; ---- Pass 2: process rows from work array, store into output array.
333
    ; eax was used as scratch in pass 1; reload the frame pointer saved at
    ; [ebp] by the prologue before reading the remaining arguments.
334    mov         eax, [original_ebp]
335    lea         esi, [workspace]                   ; FAST_FLOAT *wsptr
336    mov         edi, JSAMPARRAY [output_buf(eax)]  ; (JSAMPROW *)
337    mov         eax, JDIMENSION [output_col(eax)]
338    mov         ecx, DCTSIZE/4                     ; ctr
339    alignx      16, 7
340.rowloop:
341
342    ; -- Even part
343
    ; Same butterfly as pass 1, but the inputs are already dequantized floats
    ; read straight from the workspace.
344    movaps      xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
345    movaps      xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_FAST_FLOAT)]
346    movaps      xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_FAST_FLOAT)]
347    movaps      xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_FAST_FLOAT)]
348
349    movaps      xmm4, xmm0
350    movaps      xmm5, xmm1
351    subps       xmm0, xmm2              ; xmm0=tmp11
352    subps       xmm1, xmm3
353    addps       xmm4, xmm2              ; xmm4=tmp10
354    addps       xmm5, xmm3              ; xmm5=tmp13
355
356    mulps       xmm1, [GOTOFF(ebx,PD_1_414)]
357    subps       xmm1, xmm5              ; xmm1=tmp12
358
359    movaps      xmm6, xmm4
360    movaps      xmm7, xmm0
361    subps       xmm4, xmm5              ; xmm4=tmp3
362    subps       xmm0, xmm1              ; xmm0=tmp2
363    addps       xmm6, xmm5              ; xmm6=tmp0
364    addps       xmm7, xmm1              ; xmm7=tmp1
365
366    movaps      XMMWORD [wk(1)], xmm4   ; tmp3
367    movaps      XMMWORD [wk(0)], xmm0   ; tmp2
368
369    ; -- Odd part
370
371    movaps      xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
372    movaps      xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_FAST_FLOAT)]
373    movaps      xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_FAST_FLOAT)]
374    movaps      xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_FAST_FLOAT)]
375
376    movaps      xmm4, xmm2
377    movaps      xmm0, xmm5
378    addps       xmm2, xmm1              ; xmm2=z11
379    addps       xmm5, xmm3              ; xmm5=z13
380    subps       xmm4, xmm1              ; xmm4=z12
381    subps       xmm0, xmm3              ; xmm0=z10
382
383    movaps      xmm1, xmm2
384    subps       xmm2, xmm5
385    addps       xmm1, xmm5              ; xmm1=tmp7
386
387    mulps       xmm2, [GOTOFF(ebx,PD_1_414)]  ; xmm2=tmp11
388
389    movaps      xmm3, xmm0
390    addps       xmm0, xmm4
391    mulps       xmm0, [GOTOFF(ebx,PD_1_847)]   ; xmm0=z5
392    mulps       xmm3, [GOTOFF(ebx,PD_M2_613)]  ; xmm3=(z10 * -2.613125930)
393    mulps       xmm4, [GOTOFF(ebx,PD_1_082)]   ; xmm4=(z12 * 1.082392200)
394    addps       xmm3, xmm0                     ; xmm3=tmp12
395    subps       xmm4, xmm0                     ; xmm4=tmp10
396
397    ; -- Final output stage
398
399    subps       xmm3, xmm1              ; xmm3=tmp6
400    movaps      xmm5, xmm6
401    movaps      xmm0, xmm7
402    addps       xmm6, xmm1              ; xmm6=data0=(00 10 20 30)
403    addps       xmm7, xmm3              ; xmm7=data1=(01 11 21 31)
404    subps       xmm5, xmm1              ; xmm5=data7=(07 17 27 37)
405    subps       xmm0, xmm3              ; xmm0=data6=(06 16 26 36)
406    subps       xmm2, xmm3              ; xmm2=tmp5
407
    ; Float -> integer with descale by 8: adding PD_RNDINT_MAGIC (1.5 * 2^26)
    ; leaves the rounded value of data/8 in the low bits of each lane's bit
    ; pattern; only the low 16 bits are kept below.
408    movaps      xmm1, [GOTOFF(ebx,PD_RNDINT_MAGIC)]  ; xmm1=[PD_RNDINT_MAGIC]
    ; Build a mask with the low 16 bits of every dword set.
409    pcmpeqd     xmm3, xmm3
410    psrld       xmm3, WORD_BIT          ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
411
412    addps       xmm6, xmm1              ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
413    addps       xmm7, xmm1              ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
414    addps       xmm0, xmm1              ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
415    addps       xmm5, xmm1              ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
416
    ; Interleave two result vectors as 16-bit words: one is masked to the low
    ; word of each dword, the other shifted into the high word, then OR-ed.
417    pand        xmm6, xmm3              ; xmm6=(00 -- 10 -- 20 -- 30 --)
418    pslld       xmm7, WORD_BIT          ; xmm7=(-- 01 -- 11 -- 21 -- 31)
419    pand        xmm0, xmm3              ; xmm0=(06 -- 16 -- 26 -- 36 --)
420    pslld       xmm5, WORD_BIT          ; xmm5=(-- 07 -- 17 -- 27 -- 37)
421    por         xmm6, xmm7              ; xmm6=(00 01 10 11 20 21 30 31)
422    por         xmm0, xmm5              ; xmm0=(06 07 16 17 26 27 36 37)
423
424    movaps      xmm1, XMMWORD [wk(0)]   ; xmm1=tmp2
425    movaps      xmm3, XMMWORD [wk(1)]   ; xmm3=tmp3
426
427    addps       xmm4, xmm2              ; xmm4=tmp4
428    movaps      xmm7, xmm1
429    movaps      xmm5, xmm3
430    addps       xmm1, xmm2              ; xmm1=data2=(02 12 22 32)
431    addps       xmm3, xmm4              ; xmm3=data4=(04 14 24 34)
432    subps       xmm7, xmm2              ; xmm7=data5=(05 15 25 35)
433    subps       xmm5, xmm4              ; xmm5=data3=(03 13 23 33)
434
    ; Same round / mask / interleave sequence for data2..data5.  The magic
    ; constant and the mask are rebuilt because their registers were reused.
435    movaps      xmm2, [GOTOFF(ebx,PD_RNDINT_MAGIC)]  ; xmm2=[PD_RNDINT_MAGIC]
436    pcmpeqd     xmm4, xmm4
437    psrld       xmm4, WORD_BIT          ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
438
439    addps       xmm3, xmm2              ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
440    addps       xmm7, xmm2              ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
441    addps       xmm1, xmm2              ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
442    addps       xmm5, xmm2              ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
443
444    pand        xmm3, xmm4              ; xmm3=(04 -- 14 -- 24 -- 34 --)
445    pslld       xmm7, WORD_BIT          ; xmm7=(-- 05 -- 15 -- 25 -- 35)
446    pand        xmm1, xmm4              ; xmm1=(02 -- 12 -- 22 -- 32 --)
447    pslld       xmm5, WORD_BIT          ; xmm5=(-- 03 -- 13 -- 23 -- 33)
448    por         xmm3, xmm7              ; xmm3=(04 05 14 15 24 25 34 35)
449    por         xmm1, xmm5              ; xmm1=(02 03 12 13 22 23 32 33)
450
451    movdqa      xmm2, [GOTOFF(ebx,PB_CENTERJSAMP)]  ; xmm2=[PB_CENTERJSAMP]
452
    ; packsswb saturates each word to the signed-byte range (this is the
    ; range limiting step); paddb then adds CENTERJSAMPLE to every byte with
    ; wrap-around.
453    packsswb    xmm6, xmm3        ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
454    packsswb    xmm1, xmm0        ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
455    paddb       xmm6, xmm2
456    paddb       xmm1, xmm2
457
    ; Remaining transpose phases, done on bytes: gather the 8 samples of each
    ; output row into one 64-bit half of a register.
458    movdqa      xmm4, xmm6        ; transpose coefficients(phase 2)
459    punpcklwd   xmm6, xmm1        ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
460    punpckhwd   xmm4, xmm1        ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
461
462    movdqa      xmm7, xmm6        ; transpose coefficients(phase 3)
463    punpckldq   xmm6, xmm4        ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
464    punpckhdq   xmm7, xmm4        ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
465
    ; Swap the 64-bit halves so rows 1 and 3 can also be stored with movq,
    ; which writes only the low 8 bytes.
466    pshufd      xmm5, xmm6, 0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
467    pshufd      xmm3, xmm7, 0x4E  ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
468
    ; ebx is borrowed as a second row pointer, so the GOT address is saved
    ; around the stores.
    ; NOTE(review): pushpic/poppic are defined outside this file; presumably
    ; they expand to push/pop only in PIC builds -- confirm in jsimdext.inc.
469    pushpic     ebx                     ; save GOT address
470
    ; Store 8 samples to each of the four rows at column offset eax.
471    mov         edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
472    mov         ebx, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
473    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
474    movq        XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm7
475    mov         edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
476    mov         ebx, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
477    movq        XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
478    movq        XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE], xmm3
479
480    poppic      ebx                     ; restore GOT address
481
    ; Next group: four floats further along each workspace row, four row
    ; pointers further along output_buf.
482    add         esi, byte 4*SIZEOF_FAST_FLOAT  ; wsptr
483    add         edi, byte 4*SIZEOF_JSAMPROW
484    dec         ecx                            ; ctr
485    jnz         near .rowloop
486
    ; Epilogue: undo the prologue in reverse order.  "pop esp" reloads the
    ; pre-alignment frame pointer stored at [ebp], after which "pop ebp"
    ; restores the caller's ebp and leaves esp at the return address.
487    pop         edi
488    pop         esi
489;   pop         edx                     ; need not be preserved
490;   pop         ecx                     ; need not be preserved
491    pop         ebx
492    mov         esp, ebp                ; esp <- aligned ebp
493    pop         esp                     ; esp <- original ebp
494    pop         ebp
495    ret
496
497; For some reason, the OS X linker does not honor the request to align the
498; segment unless we do this.
499    align       32
500