1;
2; jidctflt.asm - floating-point IDCT (64-bit SSE & SSE2)
3;
4; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5; Copyright (C) 2009, 2016, D. R. Commander.
6;
7; Based on the x86 SIMD extension for IJG JPEG library
8; Copyright (C) 1999-2006, MIYASAKA Masaru.
9; For conditions of distribution and use, see copyright notice in jsimdext.inc
10;
11; This file should be assembled with NASM (Netwide Assembler); it can
12; *not* be assembled with Microsoft's MASM or any compatible
13; assembler (including Borland's Turbo Assembler).
14; NASM is available from http://nasm.sourceforge.net/ or
15; http://sourceforge.net/project/showfiles.php?group_id=6208
16;
17; This file contains a floating-point implementation of the inverse DCT
18; (Discrete Cosine Transform). The following code is based directly on
19; the IJG's original jidctflt.c; see jidctflt.c for more details.
20;
21; [TAB8]
22
23%include "jsimdext.inc"
24%include "jdct.inc"
25
26; --------------------------------------------------------------------------
27
; unpcklps2 dst, src
;   Combine the low halves of two 4-float vectors:
;     dst=(0 1 2 3), src=(4 5 6 7)  =>  dst=(0 1 4 5)
;   The SHUFPS immediate is spelled out as its four 2-bit selectors
;   (bits 7:6 / 5:4 pick from src, bits 3:2 / 1:0 pick from dst).
28%macro unpcklps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(0 1 4 5)
29    shufps      %1, %2, (1 << 6) | (0 << 4) | (1 << 2) | (0 << 0)
30%endmacro
31
; unpckhps2 dst, src
;   Combine the high halves of two 4-float vectors:
;     dst=(0 1 2 3), src=(4 5 6 7)  =>  dst=(2 3 6 7)
;   The SHUFPS immediate is spelled out as its four 2-bit selectors
;   (bits 7:6 / 5:4 pick from src, bits 3:2 / 1:0 pick from dst).
32%macro unpckhps2 2  ; %1=(0 1 2 3) / %2=(4 5 6 7) => %1=(2 3 6 7)
33    shufps      %1, %2, (3 << 6) | (2 << 4) | (3 << 2) | (2 << 0)
34%endmacro
35
36; --------------------------------------------------------------------------
; Read-only constant table for jsimd_idct_float_sse2.  Every entry is one
; 16-byte vector: the PD_* entries hold the same single-precision value in
; all four lanes, PB_CENTERJSAMP holds the same byte in all sixteen lanes.
; The code below references them RIP-relative ([rel ...]) as packed operands.
37    SECTION     SEG_CONST
38
39    alignz      32
40    GLOBAL_DATA(jconst_idct_float_sse2)
41
42EXTN(jconst_idct_float_sse2):
43
; Multipliers of the IDCT butterflies (used with mulps in both passes).
44PD_1_414        times 4  dd  1.414213562373095048801689  ; ~ sqrt(2)
45PD_1_847        times 4  dd  1.847759065022573512256366  ; ~ 2*cos(pi/8); scales (z10 + z12) into z5
46PD_1_082        times 4  dd  1.082392200292393968799446  ; scales z12
47PD_M2_613       times 4  dd -2.613125929752753055713286  ; scales z10 (negative multiplier)
; Rounding constant added to each output value in pass 2 before the low
; 16 bits of every dword are extracted.
48PD_RNDINT_MAGIC times 4  dd  100663296.0  ; (float)(0x00C00000 << 3) = 1.5 * 2^26
; Bias added (paddb) to the packed signed bytes just before they are stored.
49PB_CENTERJSAMP  times 16 db  CENTERJSAMPLE
50
51    alignz      32
52
53; --------------------------------------------------------------------------
54    SECTION     SEG_TEXT
55    BITS        64
56;
57; Perform dequantization and inverse DCT on one block of coefficients.
58;
59; GLOBAL(void)
60; jsimd_idct_float_sse2(void *dct_table, JCOEFPTR coef_block,
61;                       JSAMPARRAY output_buf, JDIMENSION output_col)
62;
; Outline:
;   Pass 1 walks the coefficient block four columns at a time (DCTSIZE/4
;          iterations), multiplies by the dct_table entries, runs the 1-D
;          IDCT and stores the 4x8 result transposed into workspace[].
;          A group of four columns whose AC terms are all zero takes a
;          shortcut that just replicates the dequantized DC values.
;   Pass 2 walks workspace[] four rows at a time (DCTSIZE/4 iterations),
;          runs the 1-D IDCT again, rounds and packs to bytes, adds
;          PB_CENTERJSAMP and stores 8 samples to each of four output rows
;          at byte offset output_col.
;
; Registers: the four arguments are read from r10-r13 after "collect_args 4"
; (macro from jsimdext.inc).  rax, rcx, rdx, rsi, rdi, rbx and xmm0-xmm7 are
; used as scratch; rbx is saved/restored here, rbp/rsp by the prologue and
; epilogue.  NOTE(review): whatever else collect_args/uncollect_args save is
; defined in jsimdext.inc, not in this file.
63
64; r10 = void *dct_table
65; r11 = JCOEFPTR coef_block
66; r12 = JSAMPARRAY output_buf
67; r13d = JDIMENSION output_col
68
; Stack frame, relative to the 16-byte-aligned rbp set up by the prologue:
;   [rbp + 0]           rsp as it was right after "push rbp" (the name
;                       original_rbp is historical; the slot is reloaded by
;                       "pop rsp" in the epilogue)
;   [rbp - 16*WK_NUM]   wk(0) .. wk(WK_NUM-1), one xmmword each
;   below wk(0)         workspace[DCTSIZE2] of FAST_FLOAT
69%define original_rbp  rbp + 0
70%define wk(i)         rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD
71                                        ; xmmword wk[WK_NUM]
72%define WK_NUM        2
73%define workspace     wk(0) - DCTSIZE2 * SIZEOF_FAST_FLOAT
74                                        ; FAST_FLOAT workspace[DCTSIZE2]
75
76    align       32
77    GLOBAL_FUNCTION(jsimd_idct_float_sse2)
78
79EXTN(jsimd_idct_float_sse2):
80    push        rbp
81    mov         rax, rsp                     ; rax = rsp after the push (points at saved rbp)
82    sub         rsp, byte 4
83    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
84    mov         [rsp], rax                   ; remember the pre-alignment rsp at [aligned rbp]
85    mov         rbp, rsp                     ; rbp = aligned rbp
86    lea         rsp, [workspace]             ; reserve wk[] and workspace[] below rbp
87    collect_args 4
88    push        rbx
89
90    ; ---- Pass 1: process columns from input, store into work array.
91
92    mov         rdx, r10                ; quantptr
93    mov         rsi, r11                ; inptr
94    lea         rdi, [workspace]        ; FAST_FLOAT *wsptr
95    mov         rcx, DCTSIZE/4          ; ctr
96.columnloop:
97%ifndef NO_ZERO_COLUMN_TEST_FLOAT_SSE
    ; Cheap early-out first: one dword from each of rows 1 and 2.  Only if
    ; both are zero is the full OR over rows 1..7 of these columns done.
98    mov         eax, DWORD [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
99    or          eax, DWORD [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
100    jnz         near .columnDCT
101
102    movq        xmm1, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
103    movq        xmm2, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
104    movq        xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
105    movq        xmm4, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
106    movq        xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
107    movq        xmm6, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
108    movq        xmm7, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
109    por         xmm1, xmm2
110    por         xmm3, xmm4
111    por         xmm5, xmm6
112    por         xmm1, xmm3
113    por         xmm5, xmm7
114    por         xmm1, xmm5              ; xmm1 = OR of rows 1..7
115    packsswb    xmm1, xmm1              ; saturating pack keeps non-zero words non-zero
116    movd        eax, xmm1
117    test        rax, rax
118    jnz         short .columnDCT
119
120    ; -- AC terms all zero
121
122    movq        xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
123
124    punpcklwd   xmm0, xmm0                  ; xmm0=(00 00 01 01 02 02 03 03)
125    psrad       xmm0, (DWORD_BIT-WORD_BIT)  ; xmm0=in0=(00 01 02 03)
126    cvtdq2ps    xmm0, xmm0                  ; xmm0=in0=(00 01 02 03)
127
128    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
129
130    movaps      xmm1, xmm0
131    movaps      xmm2, xmm0
132    movaps      xmm3, xmm0
133
134    shufps      xmm0, xmm0, 0x00        ; xmm0=(00 00 00 00)
135    shufps      xmm1, xmm1, 0x55        ; xmm1=(01 01 01 01)
136    shufps      xmm2, xmm2, 0xAA        ; xmm2=(02 02 02 02)
137    shufps      xmm3, xmm3, 0xFF        ; xmm3=(03 03 03 03)
138
139    movaps      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
140    movaps      XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
141    movaps      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
142    movaps      XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
143    movaps      XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
144    movaps      XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm2
145    movaps      XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
146    movaps      XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
147    jmp         near .nextcolumn
148%endif
149.columnDCT:
150
151    ; -- Even part
152
153    movq        xmm0, XMM_MMWORD [MMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
154    movq        xmm1, XMM_MMWORD [MMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
155    movq        xmm2, XMM_MMWORD [MMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
156    movq        xmm3, XMM_MMWORD [MMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
157
    ; Sign-extend each 16-bit coefficient to 32 bits (duplicate the word,
    ; then arithmetic shift right), convert to float and dequantize.
158    punpcklwd   xmm0, xmm0                  ; xmm0=(00 00 01 01 02 02 03 03)
159    punpcklwd   xmm1, xmm1                  ; xmm1=(20 20 21 21 22 22 23 23)
160    psrad       xmm0, (DWORD_BIT-WORD_BIT)  ; xmm0=in0=(00 01 02 03)
161    psrad       xmm1, (DWORD_BIT-WORD_BIT)  ; xmm1=in2=(20 21 22 23)
162    cvtdq2ps    xmm0, xmm0                  ; xmm0=in0=(00 01 02 03)
163    cvtdq2ps    xmm1, xmm1                  ; xmm1=in2=(20 21 22 23)
164
165    punpcklwd   xmm2, xmm2                  ; xmm2=(40 40 41 41 42 42 43 43)
166    punpcklwd   xmm3, xmm3                  ; xmm3=(60 60 61 61 62 62 63 63)
167    psrad       xmm2, (DWORD_BIT-WORD_BIT)  ; xmm2=in4=(40 41 42 43)
168    psrad       xmm3, (DWORD_BIT-WORD_BIT)  ; xmm3=in6=(60 61 62 63)
169    cvtdq2ps    xmm2, xmm2                  ; xmm2=in4=(40 41 42 43)
170    cvtdq2ps    xmm3, xmm3                  ; xmm3=in6=(60 61 62 63)
171
172    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
173    mulps       xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
174    mulps       xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
175    mulps       xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
176
177    movaps      xmm4, xmm0
178    movaps      xmm5, xmm1
179    subps       xmm0, xmm2              ; xmm0=tmp11
180    subps       xmm1, xmm3
181    addps       xmm4, xmm2              ; xmm4=tmp10
182    addps       xmm5, xmm3              ; xmm5=tmp13
183
184    mulps       xmm1, [rel PD_1_414]
185    subps       xmm1, xmm5              ; xmm1=tmp12
186
187    movaps      xmm6, xmm4
188    movaps      xmm7, xmm0
189    subps       xmm4, xmm5              ; xmm4=tmp3
190    subps       xmm0, xmm1              ; xmm0=tmp2
191    addps       xmm6, xmm5              ; xmm6=tmp0
192    addps       xmm7, xmm1              ; xmm7=tmp1
193
    ; Only xmm0-xmm7 are used, so tmp2/tmp3 are spilled to wk[] while the
    ; odd part is computed.
194    movaps      XMMWORD [wk(1)], xmm4   ; tmp3
195    movaps      XMMWORD [wk(0)], xmm0   ; tmp2
196
197    ; -- Odd part
198
199    movq        xmm2, XMM_MMWORD [MMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
200    movq        xmm3, XMM_MMWORD [MMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
201    movq        xmm5, XMM_MMWORD [MMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
202    movq        xmm1, XMM_MMWORD [MMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
203
204    punpcklwd   xmm2, xmm2                  ; xmm2=(10 10 11 11 12 12 13 13)
205    punpcklwd   xmm3, xmm3                  ; xmm3=(30 30 31 31 32 32 33 33)
206    psrad       xmm2, (DWORD_BIT-WORD_BIT)  ; xmm2=in1=(10 11 12 13)
207    psrad       xmm3, (DWORD_BIT-WORD_BIT)  ; xmm3=in3=(30 31 32 33)
208    cvtdq2ps    xmm2, xmm2                  ; xmm2=in1=(10 11 12 13)
209    cvtdq2ps    xmm3, xmm3                  ; xmm3=in3=(30 31 32 33)
210
211    punpcklwd   xmm5, xmm5                  ; xmm5=(50 50 51 51 52 52 53 53)
212    punpcklwd   xmm1, xmm1                  ; xmm1=(70 70 71 71 72 72 73 73)
213    psrad       xmm5, (DWORD_BIT-WORD_BIT)  ; xmm5=in5=(50 51 52 53)
214    psrad       xmm1, (DWORD_BIT-WORD_BIT)  ; xmm1=in7=(70 71 72 73)
215    cvtdq2ps    xmm5, xmm5                  ; xmm5=in5=(50 51 52 53)
216    cvtdq2ps    xmm1, xmm1                  ; xmm1=in7=(70 71 72 73)
217
218    mulps       xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
219    mulps       xmm3, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
220    mulps       xmm5, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
221    mulps       xmm1, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FLOAT_MULT_TYPE)]
222
223    movaps      xmm4, xmm2
224    movaps      xmm0, xmm5
225    addps       xmm2, xmm1              ; xmm2=z11
226    addps       xmm5, xmm3              ; xmm5=z13
227    subps       xmm4, xmm1              ; xmm4=z12
228    subps       xmm0, xmm3              ; xmm0=z10
229
230    movaps      xmm1, xmm2
231    subps       xmm2, xmm5
232    addps       xmm1, xmm5              ; xmm1=tmp7
233
234    mulps       xmm2, [rel PD_1_414]    ; xmm2=tmp11
235
236    movaps      xmm3, xmm0
237    addps       xmm0, xmm4
238    mulps       xmm0, [rel PD_1_847]    ; xmm0=z5
239    mulps       xmm3, [rel PD_M2_613]   ; xmm3=(z10 * -2.613125930)
240    mulps       xmm4, [rel PD_1_082]    ; xmm4=(z12 * 1.082392200)
241    addps       xmm3, xmm0              ; xmm3=tmp12
242    subps       xmm4, xmm0              ; xmm4=tmp10
243
244    ; -- Final output stage
245
246    subps       xmm3, xmm1              ; xmm3=tmp6
247    movaps      xmm5, xmm6
248    movaps      xmm0, xmm7
249    addps       xmm6, xmm1              ; xmm6=data0=(00 01 02 03)
250    addps       xmm7, xmm3              ; xmm7=data1=(10 11 12 13)
251    subps       xmm5, xmm1              ; xmm5=data7=(70 71 72 73)
252    subps       xmm0, xmm3              ; xmm0=data6=(60 61 62 63)
253    subps       xmm2, xmm3              ; xmm2=tmp5
254
255    movaps      xmm1, xmm6              ; transpose coefficients(phase 1)
256    unpcklps    xmm6, xmm7              ; xmm6=(00 10 01 11)
257    unpckhps    xmm1, xmm7              ; xmm1=(02 12 03 13)
258    movaps      xmm3, xmm0              ; transpose coefficients(phase 1)
259    unpcklps    xmm0, xmm5              ; xmm0=(60 70 61 71)
260    unpckhps    xmm3, xmm5              ; xmm3=(62 72 63 73)
261
    ; Swap the spilled tmp2/tmp3 back in and park the half-transposed
    ; rows 6/7 in the same two wk[] slots.
262    movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
263    movaps      xmm5, XMMWORD [wk(1)]   ; xmm5=tmp3
264
265    movaps      XMMWORD [wk(0)], xmm0   ; wk(0)=(60 70 61 71)
266    movaps      XMMWORD [wk(1)], xmm3   ; wk(1)=(62 72 63 73)
267
268    addps       xmm4, xmm2              ; xmm4=tmp4
269    movaps      xmm0, xmm7
270    movaps      xmm3, xmm5
271    addps       xmm7, xmm2              ; xmm7=data2=(20 21 22 23)
272    addps       xmm5, xmm4              ; xmm5=data4=(40 41 42 43)
273    subps       xmm0, xmm2              ; xmm0=data5=(50 51 52 53)
274    subps       xmm3, xmm4              ; xmm3=data3=(30 31 32 33)
275
276    movaps      xmm2, xmm7              ; transpose coefficients(phase 1)
277    unpcklps    xmm7, xmm3              ; xmm7=(20 30 21 31)
278    unpckhps    xmm2, xmm3              ; xmm2=(22 32 23 33)
279    movaps      xmm4, xmm5              ; transpose coefficients(phase 1)
280    unpcklps    xmm5, xmm0              ; xmm5=(40 50 41 51)
281    unpckhps    xmm4, xmm0              ; xmm4=(42 52 43 53)
282
283    movaps      xmm3, xmm6              ; transpose coefficients(phase 2)
284    unpcklps2   xmm6, xmm7              ; xmm6=(00 10 20 30)
285    unpckhps2   xmm3, xmm7              ; xmm3=(01 11 21 31)
286    movaps      xmm0, xmm1              ; transpose coefficients(phase 2)
287    unpcklps2   xmm1, xmm2              ; xmm1=(02 12 22 32)
288    unpckhps2   xmm0, xmm2              ; xmm0=(03 13 23 33)
289
290    movaps      xmm7, XMMWORD [wk(0)]   ; xmm7=(60 70 61 71)
291    movaps      xmm2, XMMWORD [wk(1)]   ; xmm2=(62 72 63 73)
292
293    movaps      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm6
294    movaps      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
295    movaps      XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_FAST_FLOAT)], xmm1
296    movaps      XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_FAST_FLOAT)], xmm0
297
298    movaps      xmm6, xmm5              ; transpose coefficients(phase 2)
299    unpcklps2   xmm5, xmm7              ; xmm5=(40 50 60 70)
300    unpckhps2   xmm6, xmm7              ; xmm6=(41 51 61 71)
301    movaps      xmm3, xmm4              ; transpose coefficients(phase 2)
302    unpcklps2   xmm4, xmm2              ; xmm4=(42 52 62 72)
303    unpckhps2   xmm3, xmm2              ; xmm3=(43 53 63 73)
304
305    movaps      XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm5
306    movaps      XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm6
307    movaps      XMMWORD [XMMBLOCK(2,1,rdi,SIZEOF_FAST_FLOAT)], xmm4
308    movaps      XMMWORD [XMMBLOCK(3,1,rdi,SIZEOF_FAST_FLOAT)], xmm3
309
310.nextcolumn:
311    add         rsi, byte 4*SIZEOF_JCOEF               ; coef_block
312    add         rdx, byte 4*SIZEOF_FLOAT_MULT_TYPE     ; quantptr
313    add         rdi,      4*DCTSIZE*SIZEOF_FAST_FLOAT  ; wsptr
314    dec         rcx                                    ; ctr
315    jnz         near .columnloop
316
317    ; -- Prefetch the next coefficient block
    ; rsi has advanced by 8 coefficients, so rsi + (DCTSIZE2-8)*SIZEOF_JCOEF
    ; is the first byte past this block; touch four 32-byte steps from there.
318
319    prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 0*32]
320    prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 1*32]
321    prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 2*32]
322    prefetchnta [rsi + (DCTSIZE2-8)*SIZEOF_JCOEF + 3*32]
323
324    ; ---- Pass 2: process rows from work array, store into output array.
325
327    lea         rsi, [workspace]        ; FAST_FLOAT *wsptr
328    mov         rdi, r12                ; (JSAMPROW *)
329    mov         eax, r13d               ; output_col (32-bit move zero-extends into rax)
330    mov         rcx, DCTSIZE/4          ; ctr
331.rowloop:
332
333    ; -- Even part
334
335    movaps      xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
336    movaps      xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_FAST_FLOAT)]
337    movaps      xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_FAST_FLOAT)]
338    movaps      xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_FAST_FLOAT)]
339
340    movaps      xmm4, xmm0
341    movaps      xmm5, xmm1
342    subps       xmm0, xmm2              ; xmm0=tmp11
343    subps       xmm1, xmm3
344    addps       xmm4, xmm2              ; xmm4=tmp10
345    addps       xmm5, xmm3              ; xmm5=tmp13
346
347    mulps       xmm1, [rel PD_1_414]
348    subps       xmm1, xmm5              ; xmm1=tmp12
349
350    movaps      xmm6, xmm4
351    movaps      xmm7, xmm0
352    subps       xmm4, xmm5              ; xmm4=tmp3
353    subps       xmm0, xmm1              ; xmm0=tmp2
354    addps       xmm6, xmm5              ; xmm6=tmp0
355    addps       xmm7, xmm1              ; xmm7=tmp1
356
357    movaps      XMMWORD [wk(1)], xmm4   ; tmp3
358    movaps      XMMWORD [wk(0)], xmm0   ; tmp2
359
360    ; -- Odd part
361
362    movaps      xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
363    movaps      xmm3, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_FAST_FLOAT)]
364    movaps      xmm5, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_FAST_FLOAT)]
365    movaps      xmm1, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_FAST_FLOAT)]
366
367    movaps      xmm4, xmm2
368    movaps      xmm0, xmm5
369    addps       xmm2, xmm1              ; xmm2=z11
370    addps       xmm5, xmm3              ; xmm5=z13
371    subps       xmm4, xmm1              ; xmm4=z12
372    subps       xmm0, xmm3              ; xmm0=z10
373
374    movaps      xmm1, xmm2
375    subps       xmm2, xmm5
376    addps       xmm1, xmm5              ; xmm1=tmp7
377
378    mulps       xmm2, [rel PD_1_414]    ; xmm2=tmp11
379
380    movaps      xmm3, xmm0
381    addps       xmm0, xmm4
382    mulps       xmm0, [rel PD_1_847]    ; xmm0=z5
383    mulps       xmm3, [rel PD_M2_613]   ; xmm3=(z10 * -2.613125930)
384    mulps       xmm4, [rel PD_1_082]    ; xmm4=(z12 * 1.082392200)
385    addps       xmm3, xmm0              ; xmm3=tmp12
386    subps       xmm4, xmm0              ; xmm4=tmp10
387
388    ; -- Final output stage
389
390    subps       xmm3, xmm1              ; xmm3=tmp6
391    movaps      xmm5, xmm6
392    movaps      xmm0, xmm7
393    addps       xmm6, xmm1              ; xmm6=data0=(00 10 20 30)
394    addps       xmm7, xmm3              ; xmm7=data1=(01 11 21 31)
395    subps       xmm5, xmm1              ; xmm5=data7=(07 17 27 37)
396    subps       xmm0, xmm3              ; xmm0=data6=(06 16 26 36)
397    subps       xmm2, xmm3              ; xmm2=tmp5
398
    ; Float -> integer without a convert instruction: PD_RNDINT_MAGIC is
    ; 1.5 * 2^26, where one float ulp equals 8, so after the addps the low
    ; 16 bits of every dword hold data/8 rounded to an integer (rounding as
    ; configured in MXCSR; presumably round-to-nearest).  Even-numbered
    ; outputs keep those 16 bits in place (pand with 0x0000FFFF), odd ones
    ; are shifted into the upper word, and por pairs them up.
399    movaps      xmm1, [rel PD_RNDINT_MAGIC]  ; xmm1=[rel PD_RNDINT_MAGIC]
400    pcmpeqd     xmm3, xmm3
401    psrld       xmm3, WORD_BIT          ; xmm3={0xFFFF 0x0000 0xFFFF 0x0000 ..}
402
403    addps       xmm6, xmm1              ; xmm6=roundint(data0/8)=(00 ** 10 ** 20 ** 30 **)
404    addps       xmm7, xmm1              ; xmm7=roundint(data1/8)=(01 ** 11 ** 21 ** 31 **)
405    addps       xmm0, xmm1              ; xmm0=roundint(data6/8)=(06 ** 16 ** 26 ** 36 **)
406    addps       xmm5, xmm1              ; xmm5=roundint(data7/8)=(07 ** 17 ** 27 ** 37 **)
407
408    pand        xmm6, xmm3              ; xmm6=(00 -- 10 -- 20 -- 30 --)
409    pslld       xmm7, WORD_BIT          ; xmm7=(-- 01 -- 11 -- 21 -- 31)
410    pand        xmm0, xmm3              ; xmm0=(06 -- 16 -- 26 -- 36 --)
411    pslld       xmm5, WORD_BIT          ; xmm5=(-- 07 -- 17 -- 27 -- 37)
412    por         xmm6, xmm7              ; xmm6=(00 01 10 11 20 21 30 31)
413    por         xmm0, xmm5              ; xmm0=(06 07 16 17 26 27 36 37)
414
415    movaps      xmm1,  XMMWORD [wk(0)]  ; xmm1=tmp2
416    movaps      xmm3,  XMMWORD [wk(1)]  ; xmm3=tmp3
417
418    addps       xmm4, xmm2              ; xmm4=tmp4
419    movaps      xmm7, xmm1
420    movaps      xmm5, xmm3
421    addps       xmm1, xmm2              ; xmm1=data2=(02 12 22 32)
422    addps       xmm3, xmm4              ; xmm3=data4=(04 14 24 34)
423    subps       xmm7, xmm2              ; xmm7=data5=(05 15 25 35)
424    subps       xmm5, xmm4              ; xmm5=data3=(03 13 23 33)
425
426    movaps      xmm2, [rel PD_RNDINT_MAGIC]  ; xmm2=[rel PD_RNDINT_MAGIC]
427    pcmpeqd     xmm4, xmm4
428    psrld       xmm4, WORD_BIT          ; xmm4={0xFFFF 0x0000 0xFFFF 0x0000 ..}
429
430    addps       xmm3, xmm2              ; xmm3=roundint(data4/8)=(04 ** 14 ** 24 ** 34 **)
431    addps       xmm7, xmm2              ; xmm7=roundint(data5/8)=(05 ** 15 ** 25 ** 35 **)
432    addps       xmm1, xmm2              ; xmm1=roundint(data2/8)=(02 ** 12 ** 22 ** 32 **)
433    addps       xmm5, xmm2              ; xmm5=roundint(data3/8)=(03 ** 13 ** 23 ** 33 **)
434
435    pand        xmm3, xmm4              ; xmm3=(04 -- 14 -- 24 -- 34 --)
436    pslld       xmm7, WORD_BIT          ; xmm7=(-- 05 -- 15 -- 25 -- 35)
437    pand        xmm1, xmm4              ; xmm1=(02 -- 12 -- 22 -- 32 --)
438    pslld       xmm5, WORD_BIT          ; xmm5=(-- 03 -- 13 -- 23 -- 33)
439    por         xmm3, xmm7              ; xmm3=(04 05 14 15 24 25 34 35)
440    por         xmm1, xmm5              ; xmm1=(02 03 12 13 22 23 32 33)
441
442    movdqa      xmm2, [rel PB_CENTERJSAMP]  ; xmm2=[rel PB_CENTERJSAMP]
443
    ; packsswb saturates each 16-bit value to a signed byte; adding
    ; PB_CENTERJSAMP with wrap-around (paddb) then re-biases the bytes
    ; (presumably into the unsigned sample range -- CENTERJSAMPLE is
    ; defined outside this file).
444    packsswb    xmm6, xmm3        ; xmm6=(00 01 10 11 20 21 30 31 04 05 14 15 24 25 34 35)
445    packsswb    xmm1, xmm0        ; xmm1=(02 03 12 13 22 23 32 33 06 07 16 17 26 27 36 37)
446    paddb       xmm6, xmm2
447    paddb       xmm1, xmm2
448
449    movdqa      xmm4, xmm6        ; transpose coefficients(phase 2)
450    punpcklwd   xmm6, xmm1        ; xmm6=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
451    punpckhwd   xmm4, xmm1        ; xmm4=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
452
453    movdqa      xmm7, xmm6        ; transpose coefficients(phase 3)
454    punpckldq   xmm6, xmm4        ; xmm6=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
455    punpckhdq   xmm7, xmm4        ; xmm7=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
456
    ; Swap the 64-bit halves so rows 1 and 3 also sit in a low quadword,
    ; ready for the 8-byte movq stores below.
457    pshufd      xmm5, xmm6, 0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
458    pshufd      xmm3, xmm7, 0x4E  ; xmm3=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
459
    ; rdi walks the row-pointer array; each row receives 8 samples at
    ; byte offset rax (= output_col).
460    mov         rdx, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
461    mov         rbx, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
462    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
463    movq        XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm7
464    mov         rdx, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
465    mov         rbx, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
466    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm5
467    movq        XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE], xmm3
468
469    add         rsi, byte 4*SIZEOF_FAST_FLOAT  ; wsptr
470    add         rdi, byte 4*SIZEOF_JSAMPROW
471    dec         rcx                            ; ctr
472    jnz         near .rowloop
473
474    pop         rbx
475    uncollect_args 4
476    mov         rsp, rbp                ; rsp <- aligned rbp
477    pop         rsp                     ; rsp <- pre-alignment rsp stored by the prologue
478    pop         rbp                     ; restore the caller's rbp
479    ret
480
481; For some reason, the OS X linker does not honor the request to align the
482; segment unless we do this.
483    align       32
484