;
; jidctfst.asm - fast integer IDCT (SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler); it
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains a fast, not so accurate integer implementation of
; the inverse DCT (Discrete Cosine Transform). The following code is
; based directly on the IJG's original jidctfst.c; see jidctfst.c
; for more details.
;
; [TAB8]

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
; Fixed-point multipliers used by both IDCT passes.
;
; FIX(x) denotes x scaled by 2^CONST_BITS and rounded to the nearest
; integer.  PASS1_BITS has to agree with IFAST_SCALE_BITS (defined outside
; this file); the %if below turns a mismatch into an assembly-time error.

%define CONST_BITS      8       ; 14 is also OK.
%define PASS1_BITS      2

%if IFAST_SCALE_BITS != PASS1_BITS
%error "'IFAST_SCALE_BITS' must be equal to 'PASS1_BITS'."
%endif

%if CONST_BITS == 8
; Precomputed values for the default precision.
F_1_082 equ     277             ; FIX(1.082392200)
F_1_414 equ     362             ; FIX(1.414213562)
F_1_847 equ     473             ; FIX(1.847759065)
F_2_613 equ     669             ; FIX(2.613125930)
F_1_613 equ     (F_2_613 - (1 << CONST_BITS))   ; FIX(2.613125930) - FIX(1)
%else
; NASM cannot do compile-time arithmetic on floating-point constants.
; The literals below are the multipliers scaled by 2^30; DESCALE rounds
; them down to CONST_BITS fractional bits.
%define DESCALE(x,n)  (((x)+(1<<((n)-1)))>>(n))
F_1_082 equ     DESCALE(1162209775,30-CONST_BITS)       ; FIX(1.082392200)
F_1_414 equ     DESCALE(1518500249,30-CONST_BITS)       ; FIX(1.414213562)
F_1_847 equ     DESCALE(1984016188,30-CONST_BITS)       ; FIX(1.847759065)
F_2_613 equ     DESCALE(2805822602,30-CONST_BITS)       ; FIX(2.613125930)
F_1_613 equ     (F_2_613 - (1 << CONST_BITS))   ; FIX(2.613125930) - FIX(1)
%endif

; --------------------------------------------------------------------------
        SECTION SEG_CONST

; Constant table for jsimd_idct_ifast_sse2.
;
; The word multipliers are consumed by pmulhw, which keeps only the high
; 16 bits of each signed 16x16-bit product.  The code shifts the other
; operand left by PRE_MULTIPLY_SCALE_BITS before multiplying, and the
; constants are stored pre-shifted by CONST_SHIFT, subject to:
;
; PRE_MULTIPLY_SCALE_BITS <= 2 (to avoid overflow)
; CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS == 16 (for pmulhw)

%define PRE_MULTIPLY_SCALE_BITS   2
%define CONST_SHIFT     (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS)

        alignz  16
        global  EXTN(jconst_idct_ifast_sse2)

EXTN(jconst_idct_ifast_sse2):

; Each PW_* entry is one multiplier replicated into all 8 word lanes;
; PB_CENTERJSAMP is CENTERJSAMPLE replicated into all 16 byte lanes.
PW_F1414        times 8 dw  F_1_414 << CONST_SHIFT
PW_F1847        times 8 dw  F_1_847 << CONST_SHIFT
PW_MF1613       times 8 dw -F_1_613 << CONST_SHIFT
PW_F1082        times 8 dw  F_1_082 << CONST_SHIFT
PB_CENTERJSAMP  times 16 db CENTERJSAMPLE

        alignz  16

; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    32
;
; Perform dequantization and inverse DCT on one block of coefficients.
;
; GLOBAL(void)
; jsimd_idct_ifast_sse2 (void *dct_table, JCOEFPTR coef_block,
;                       JSAMPARRAY output_buf, JDIMENSION output_col)
;
; In:   dct_table   multiplier table; every coefficient is multiplied by
;                   the table entry at the same position (pmullw).
;       coef_block  the block of input coefficients (8 rows of 8 words).
;       output_buf  array of 8 row pointers.
;       output_col  offset added to every row pointer.
; Out:  8 bytes are stored at output_buf[r] + output_col for r = 0..7.
;
; dct_table and coef_block are read with aligned 128-bit accesses
; (movdqa / pmullw with a memory operand), so both must be 16-byte
; aligned.  The output rows are written with 64-bit movq stores.
;
; All arguments are passed on the stack.  eax, edx, xmm0-xmm7 and the
; flags are clobbered; ebp, esi and edi are preserved; ebx is handled by
; the pushpic/poppic macros (defined outside this file).
;
; Stack: two xmmwords of 16-byte-aligned scratch (wk[0], wk[1]) are
; carved out below a realigned ebp.
;

; Argument offsets relative to the frame base b, where [b] = caller's ebp
; and [b+4] = return address.
%define dct_table(b)    (b)+8           ; void *dct_table
%define coef_block(b)   (b)+12          ; JCOEFPTR coef_block
%define output_buf(b)   (b)+16          ; JSAMPARRAY output_buf
%define output_col(b)   (b)+20          ; JDIMENSION output_col

; [original_ebp] holds the frame base saved by the prologue (the value
; esp had right after "push ebp").
%define original_ebp    ebp+0
%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM          2

        align   16
        global  EXTN(jsimd_idct_ifast_sse2)

EXTN(jsimd_idct_ifast_sse2):
        push    ebp
        mov     eax,esp                         ; eax = frame base ([eax] = caller's ebp)
        sub     esp, byte 4
        and     esp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
        mov     [esp],eax                       ; keep the frame base at [original_ebp]
        mov     ebp,esp                         ; ebp = aligned ebp
        lea     esp, [wk(0)]                    ; reserve wk[WK_NUM] below ebp
        pushpic ebx
;       push    ecx             ; unused
;       push    edx             ; need not be preserved
        push    esi
        push    edi

        get_GOT ebx             ; get GOT address

        ; ---- Pass 1: process columns from input.
        ;
        ; Each xmm register holds one row of 8 words, so every vertical
        ; operation below works on all 8 columns at once.  eax is still
        ; the frame base set up by the prologue, hence no reload here.

;       mov     eax, [original_ebp]
        mov     edx, POINTER [dct_table(eax)]           ; quantptr
        mov     esi, JCOEFPTR [coef_block(eax)]         ; inptr

%ifndef NO_ZERO_COLUMN_TEST_IFAST_SSE2
        ; Cheap scalar pre-check on two dwords before the full vector test.
        mov     eax, DWORD [DWBLOCK(1,0,esi,SIZEOF_JCOEF)]
        or      eax, DWORD [DWBLOCK(2,0,esi,SIZEOF_JCOEF)]
        jnz     near .columnDCT

        ; OR rows 1..7 together; the result is zero only if every one of
        ; those coefficients is zero.
        movdqa  xmm0, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
        movdqa  xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
        por     xmm0, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
        por     xmm1, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
        por     xmm0, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
        por     xmm1, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
        por     xmm0, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
        por     xmm1,xmm0
        ; Two saturating packs squeeze the 8 words into the low 32 bits;
        ; signed saturation never turns a nonzero lane into zero.
        packsswb xmm1,xmm1
        packsswb xmm1,xmm1
        movd    eax,xmm1
        test    eax,eax
        jnz     short .columnDCT

        ; -- AC terms all zero
        ;
        ; Only row 0 contributes: dequantize it and broadcast element k
        ; into all lanes of "column k", leaving the registers and wk[] in
        ; the same layout that the general path has at .column_end.

        movdqa  xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
        pmullw  xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]

        movdqa    xmm7,xmm0             ; xmm0=in0=(00 01 02 03 04 05 06 07)
        punpcklwd xmm0,xmm0             ; xmm0=(00 00 01 01 02 02 03 03)
        punpckhwd xmm7,xmm7             ; xmm7=(04 04 05 05 06 06 07 07)

        pshufd  xmm6,xmm0,0x00          ; xmm6=col0=(00 00 00 00 00 00 00 00)
        pshufd  xmm2,xmm0,0x55          ; xmm2=col1=(01 01 01 01 01 01 01 01)
        pshufd  xmm5,xmm0,0xAA          ; xmm5=col2=(02 02 02 02 02 02 02 02)
        pshufd  xmm0,xmm0,0xFF          ; xmm0=col3=(03 03 03 03 03 03 03 03)
        pshufd  xmm1,xmm7,0x00          ; xmm1=col4=(04 04 04 04 04 04 04 04)
        pshufd  xmm4,xmm7,0x55          ; xmm4=col5=(05 05 05 05 05 05 05 05)
        pshufd  xmm3,xmm7,0xAA          ; xmm3=col6=(06 06 06 06 06 06 06 06)
        pshufd  xmm7,xmm7,0xFF          ; xmm7=col7=(07 07 07 07 07 07 07 07)

        movdqa  XMMWORD [wk(0)], xmm2   ; wk(0)=col1
        movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=col3
        jmp     near .column_end
        alignx  16,7
%endif
.columnDCT:

        ; -- Even part
        ;
        ; Load rows 0,2,4,6 and dequantize them.

        movdqa  xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_JCOEF)]
        movdqa  xmm1, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_JCOEF)]
        pmullw  xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_IFAST_MULT_TYPE)]
        pmullw  xmm1, XMMWORD [XMMBLOCK(2,0,edx,SIZEOF_IFAST_MULT_TYPE)]
        movdqa  xmm2, XMMWORD [XMMBLOCK(4,0,esi,SIZEOF_JCOEF)]
        movdqa  xmm3, XMMWORD [XMMBLOCK(6,0,esi,SIZEOF_JCOEF)]
        pmullw  xmm2, XMMWORD [XMMBLOCK(4,0,edx,SIZEOF_IFAST_MULT_TYPE)]
        pmullw  xmm3, XMMWORD [XMMBLOCK(6,0,edx,SIZEOF_IFAST_MULT_TYPE)]

        movdqa  xmm4,xmm0
        movdqa  xmm5,xmm1
        psubw   xmm0,xmm2               ; xmm0=tmp11
        psubw   xmm1,xmm3
        paddw   xmm4,xmm2               ; xmm4=tmp10
        paddw   xmm5,xmm3               ; xmm5=tmp13

        psllw   xmm1,PRE_MULTIPLY_SCALE_BITS
        pmulhw  xmm1,[GOTOFF(ebx,PW_F1414)]
        psubw   xmm1,xmm5               ; xmm1=tmp12

        movdqa  xmm6,xmm4
        movdqa  xmm7,xmm0
        psubw   xmm4,xmm5               ; xmm4=tmp3
        psubw   xmm0,xmm1               ; xmm0=tmp2
        paddw   xmm6,xmm5               ; xmm6=tmp0
        paddw   xmm7,xmm1               ; xmm7=tmp1

        ; Spill tmp2/tmp3: all eight registers are needed for the odd part.
        movdqa  XMMWORD [wk(1)], xmm4   ; wk(1)=tmp3
        movdqa  XMMWORD [wk(0)], xmm0   ; wk(0)=tmp2

        ; -- Odd part
        ;
        ; Load rows 1,3,5,7 and dequantize them.

        movdqa  xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_JCOEF)]
        movdqa  xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_JCOEF)]
        pmullw  xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_IFAST_MULT_TYPE)]
        pmullw  xmm3, XMMWORD [XMMBLOCK(3,0,edx,SIZEOF_IFAST_MULT_TYPE)]
        movdqa  xmm5, XMMWORD [XMMBLOCK(5,0,esi,SIZEOF_JCOEF)]
        movdqa  xmm1, XMMWORD [XMMBLOCK(7,0,esi,SIZEOF_JCOEF)]
        pmullw  xmm5, XMMWORD [XMMBLOCK(5,0,edx,SIZEOF_IFAST_MULT_TYPE)]
        pmullw  xmm1, XMMWORD [XMMBLOCK(7,0,edx,SIZEOF_IFAST_MULT_TYPE)]

        movdqa  xmm4,xmm2
        movdqa  xmm0,xmm5
        psubw   xmm2,xmm1               ; xmm2=z12
        psubw   xmm5,xmm3               ; xmm5=z10
        paddw   xmm4,xmm1               ; xmm4=z11
        paddw   xmm0,xmm3               ; xmm0=z13

        movdqa  xmm1,xmm5               ; xmm1=z10(unscaled)
        psllw   xmm2,PRE_MULTIPLY_SCALE_BITS
        psllw   xmm5,PRE_MULTIPLY_SCALE_BITS

        movdqa  xmm3,xmm4
        psubw   xmm4,xmm0
        paddw   xmm3,xmm0               ; xmm3=tmp7

        psllw   xmm4,PRE_MULTIPLY_SCALE_BITS
        pmulhw  xmm4,[GOTOFF(ebx,PW_F1414)]     ; xmm4=tmp11

        ; To avoid overflow...
        ;
        ; (Original)
        ; tmp12 = -2.613125930 * z10 + z5;
        ;
        ; (This implementation)
        ; tmp12 = (-1.613125930 - 1) * z10 + z5;
        ;       = -1.613125930 * z10 - z10 + z5;

        movdqa  xmm0,xmm5
        paddw   xmm5,xmm2
        pmulhw  xmm5,[GOTOFF(ebx,PW_F1847)]     ; xmm5=z5
        pmulhw  xmm0,[GOTOFF(ebx,PW_MF1613)]
        pmulhw  xmm2,[GOTOFF(ebx,PW_F1082)]
        psubw   xmm0,xmm1               ; the "- z10" term, using unscaled z10
        psubw   xmm2,xmm5               ; xmm2=tmp10
        paddw   xmm0,xmm5               ; xmm0=tmp12

        ; -- Final output stage
        ;
        ; Combine even and odd halves, then transpose the 8x8 word matrix
        ; in three unpack phases (word, dword, qword) so that pass 2 can
        ; again work lane-parallel.  wk[] is reused as spill space.

        psubw   xmm0,xmm3               ; xmm0=tmp6
        movdqa  xmm1,xmm6
        movdqa  xmm5,xmm7
        paddw   xmm6,xmm3               ; xmm6=data0=(00 01 02 03 04 05 06 07)
        paddw   xmm7,xmm0               ; xmm7=data1=(10 11 12 13 14 15 16 17)
        psubw   xmm1,xmm3               ; xmm1=data7=(70 71 72 73 74 75 76 77)
        psubw   xmm5,xmm0               ; xmm5=data6=(60 61 62 63 64 65 66 67)
        psubw   xmm4,xmm0               ; xmm4=tmp5

        movdqa    xmm3,xmm6             ; transpose coefficients(phase 1)
        punpcklwd xmm6,xmm7             ; xmm6=(00 10 01 11 02 12 03 13)
        punpckhwd xmm3,xmm7             ; xmm3=(04 14 05 15 06 16 07 17)
        movdqa    xmm0,xmm5             ; transpose coefficients(phase 1)
        punpcklwd xmm5,xmm1             ; xmm5=(60 70 61 71 62 72 63 73)
        punpckhwd xmm0,xmm1             ; xmm0=(64 74 65 75 66 76 67 77)

        movdqa  xmm7, XMMWORD [wk(0)]   ; xmm7=tmp2
        movdqa  xmm1, XMMWORD [wk(1)]   ; xmm1=tmp3

        movdqa  XMMWORD [wk(0)], xmm5   ; wk(0)=(60 70 61 71 62 72 63 73)
        movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=(64 74 65 75 66 76 67 77)

        paddw   xmm2,xmm4               ; xmm2=tmp4
        movdqa  xmm5,xmm7
        movdqa  xmm0,xmm1
        paddw   xmm7,xmm4               ; xmm7=data2=(20 21 22 23 24 25 26 27)
        paddw   xmm1,xmm2               ; xmm1=data4=(40 41 42 43 44 45 46 47)
        psubw   xmm5,xmm4               ; xmm5=data5=(50 51 52 53 54 55 56 57)
        psubw   xmm0,xmm2               ; xmm0=data3=(30 31 32 33 34 35 36 37)

        movdqa    xmm4,xmm7             ; transpose coefficients(phase 1)
        punpcklwd xmm7,xmm0             ; xmm7=(20 30 21 31 22 32 23 33)
        punpckhwd xmm4,xmm0             ; xmm4=(24 34 25 35 26 36 27 37)
        movdqa    xmm2,xmm1             ; transpose coefficients(phase 1)
        punpcklwd xmm1,xmm5             ; xmm1=(40 50 41 51 42 52 43 53)
        punpckhwd xmm2,xmm5             ; xmm2=(44 54 45 55 46 56 47 57)

        movdqa    xmm0,xmm3             ; transpose coefficients(phase 2)
        punpckldq xmm3,xmm4             ; xmm3=(04 14 24 34 05 15 25 35)
        punpckhdq xmm0,xmm4             ; xmm0=(06 16 26 36 07 17 27 37)
        movdqa    xmm5,xmm6             ; transpose coefficients(phase 2)
        punpckldq xmm6,xmm7             ; xmm6=(00 10 20 30 01 11 21 31)
        punpckhdq xmm5,xmm7             ; xmm5=(02 12 22 32 03 13 23 33)

        movdqa  xmm4, XMMWORD [wk(0)]   ; xmm4=(60 70 61 71 62 72 63 73)
        movdqa  xmm7, XMMWORD [wk(1)]   ; xmm7=(64 74 65 75 66 76 67 77)

        movdqa  XMMWORD [wk(0)], xmm3   ; wk(0)=(04 14 24 34 05 15 25 35)
        movdqa  XMMWORD [wk(1)], xmm0   ; wk(1)=(06 16 26 36 07 17 27 37)

        movdqa    xmm3,xmm1             ; transpose coefficients(phase 2)
        punpckldq xmm1,xmm4             ; xmm1=(40 50 60 70 41 51 61 71)
        punpckhdq xmm3,xmm4             ; xmm3=(42 52 62 72 43 53 63 73)
        movdqa    xmm0,xmm2             ; transpose coefficients(phase 2)
        punpckldq xmm2,xmm7             ; xmm2=(44 54 64 74 45 55 65 75)
        punpckhdq xmm0,xmm7             ; xmm0=(46 56 66 76 47 57 67 77)

        movdqa     xmm4,xmm6            ; transpose coefficients(phase 3)
        punpcklqdq xmm6,xmm1            ; xmm6=col0=(00 10 20 30 40 50 60 70)
        punpckhqdq xmm4,xmm1            ; xmm4=col1=(01 11 21 31 41 51 61 71)
        movdqa     xmm7,xmm5            ; transpose coefficients(phase 3)
        punpcklqdq xmm5,xmm3            ; xmm5=col2=(02 12 22 32 42 52 62 72)
        punpckhqdq xmm7,xmm3            ; xmm7=col3=(03 13 23 33 43 53 63 73)

        movdqa  xmm1, XMMWORD [wk(0)]   ; xmm1=(04 14 24 34 05 15 25 35)
        movdqa  xmm3, XMMWORD [wk(1)]   ; xmm3=(06 16 26 36 07 17 27 37)

        movdqa  XMMWORD [wk(0)], xmm4   ; wk(0)=col1
        movdqa  XMMWORD [wk(1)], xmm7   ; wk(1)=col3

        movdqa     xmm4,xmm1            ; transpose coefficients(phase 3)
        punpcklqdq xmm1,xmm2            ; xmm1=col4=(04 14 24 34 44 54 64 74)
        punpckhqdq xmm4,xmm2            ; xmm4=col5=(05 15 25 35 45 55 65 75)
        movdqa     xmm7,xmm3            ; transpose coefficients(phase 3)
        punpcklqdq xmm3,xmm0            ; xmm3=col6=(06 16 26 36 46 56 66 76)
        punpckhqdq xmm7,xmm0            ; xmm7=col7=(07 17 27 37 47 57 67 77)
.column_end:

        ; Both pass-1 paths arrive here with:
        ;   xmm6=col0  wk(0)=col1  xmm5=col2  wk(1)=col3
        ;   xmm1=col4  xmm4=col5   xmm3=col6  xmm7=col7

        ; -- Prefetch the next coefficient block

        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
        prefetchnta [esi + DCTSIZE2*SIZEOF_JCOEF + 3*32]

        ; ---- Pass 2: process rows from work array, store into output array.

        ; eax was used as scratch by the zero test, so reload the frame base.
        mov     eax, [original_ebp]
        mov     edi, JSAMPARRAY [output_buf(eax)]       ; (JSAMPROW *)
        mov     eax, JDIMENSION [output_col(eax)]

        ; -- Even part

        ; xmm6=col0, xmm5=col2, xmm1=col4, xmm3=col6

        movdqa  xmm2,xmm6
        movdqa  xmm0,xmm5
        psubw   xmm6,xmm1               ; xmm6=tmp11
        psubw   xmm5,xmm3
        paddw   xmm2,xmm1               ; xmm2=tmp10
        paddw   xmm0,xmm3               ; xmm0=tmp13

        psllw   xmm5,PRE_MULTIPLY_SCALE_BITS
        pmulhw  xmm5,[GOTOFF(ebx,PW_F1414)]
        psubw   xmm5,xmm0               ; xmm5=tmp12

        movdqa  xmm1,xmm2
        movdqa  xmm3,xmm6
        psubw   xmm2,xmm0               ; xmm2=tmp3
        psubw   xmm6,xmm5               ; xmm6=tmp2
        paddw   xmm1,xmm0               ; xmm1=tmp0
        paddw   xmm3,xmm5               ; xmm3=tmp1

        ; Swap: fetch col1/col3 from wk[] and park tmp3/tmp2 there instead.
        movdqa  xmm0, XMMWORD [wk(0)]   ; xmm0=col1
        movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=col3

        movdqa  XMMWORD [wk(0)], xmm2   ; wk(0)=tmp3
        movdqa  XMMWORD [wk(1)], xmm6   ; wk(1)=tmp2

        ; -- Odd part

        ; xmm0=col1, xmm5=col3, xmm4=col5, xmm7=col7

        movdqa  xmm2,xmm0
        movdqa  xmm6,xmm4
        psubw   xmm0,xmm7               ; xmm0=z12
        psubw   xmm4,xmm5               ; xmm4=z10
        paddw   xmm2,xmm7               ; xmm2=z11
        paddw   xmm6,xmm5               ; xmm6=z13

        movdqa  xmm7,xmm4               ; xmm7=z10(unscaled)
        psllw   xmm0,PRE_MULTIPLY_SCALE_BITS
        psllw   xmm4,PRE_MULTIPLY_SCALE_BITS

        movdqa  xmm5,xmm2
        psubw   xmm2,xmm6
        paddw   xmm5,xmm6               ; xmm5=tmp7

        psllw   xmm2,PRE_MULTIPLY_SCALE_BITS
        pmulhw  xmm2,[GOTOFF(ebx,PW_F1414)]     ; xmm2=tmp11

        ; To avoid overflow...
        ;
        ; (Original)
        ; tmp12 = -2.613125930 * z10 + z5;
        ;
        ; (This implementation)
        ; tmp12 = (-1.613125930 - 1) * z10 + z5;
        ;       = -1.613125930 * z10 - z10 + z5;

        movdqa  xmm6,xmm4
        paddw   xmm4,xmm0
        pmulhw  xmm4,[GOTOFF(ebx,PW_F1847)]     ; xmm4=z5
        pmulhw  xmm6,[GOTOFF(ebx,PW_MF1613)]
        pmulhw  xmm0,[GOTOFF(ebx,PW_F1082)]
        psubw   xmm6,xmm7               ; the "- z10" term, using unscaled z10
        psubw   xmm0,xmm4               ; xmm0=tmp10
        paddw   xmm6,xmm4               ; xmm6=tmp12

        ; -- Final output stage
        ;
        ; Each result is shifted right by PASS1_BITS+3, narrowed to signed
        ; bytes with saturation (packsswb), and then offset by
        ; CENTERJSAMPLE with a wrapping byte add.

        psubw   xmm6,xmm5               ; xmm6=tmp6
        movdqa  xmm7,xmm1
        movdqa  xmm4,xmm3
        paddw   xmm1,xmm5               ; xmm1=data0=(00 10 20 30 40 50 60 70)
        paddw   xmm3,xmm6               ; xmm3=data1=(01 11 21 31 41 51 61 71)
        psraw   xmm1,(PASS1_BITS+3)     ; descale
        psraw   xmm3,(PASS1_BITS+3)     ; descale
        psubw   xmm7,xmm5               ; xmm7=data7=(07 17 27 37 47 57 67 77)
        psubw   xmm4,xmm6               ; xmm4=data6=(06 16 26 36 46 56 66 76)
        psraw   xmm7,(PASS1_BITS+3)     ; descale
        psraw   xmm4,(PASS1_BITS+3)     ; descale
        psubw   xmm2,xmm6               ; xmm2=tmp5

        packsswb  xmm1,xmm4     ; xmm1=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
        packsswb  xmm3,xmm7     ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)

        movdqa  xmm5, XMMWORD [wk(1)]   ; xmm5=tmp2
        movdqa  xmm6, XMMWORD [wk(0)]   ; xmm6=tmp3

        paddw   xmm0,xmm2               ; xmm0=tmp4
        movdqa  xmm4,xmm5
        movdqa  xmm7,xmm6
        paddw   xmm5,xmm2               ; xmm5=data2=(02 12 22 32 42 52 62 72)
        paddw   xmm6,xmm0               ; xmm6=data4=(04 14 24 34 44 54 64 74)
        psraw   xmm5,(PASS1_BITS+3)     ; descale
        psraw   xmm6,(PASS1_BITS+3)     ; descale
        psubw   xmm4,xmm2               ; xmm4=data5=(05 15 25 35 45 55 65 75)
        psubw   xmm7,xmm0               ; xmm7=data3=(03 13 23 33 43 53 63 73)
        psraw   xmm4,(PASS1_BITS+3)     ; descale
        psraw   xmm7,(PASS1_BITS+3)     ; descale

        movdqa    xmm2,[GOTOFF(ebx,PB_CENTERJSAMP)]     ; xmm2=[PB_CENTERJSAMP]

        packsswb  xmm5,xmm6     ; xmm5=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
        packsswb  xmm7,xmm4     ; xmm7=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)

        paddb     xmm1,xmm2
        paddb     xmm3,xmm2
        paddb     xmm5,xmm2
        paddb     xmm7,xmm2

        ; Byte-level transpose back to row order; each register ends up
        ; holding two complete 8-byte output rows.

        movdqa    xmm0,xmm1     ; transpose coefficients(phase 1)
        punpcklbw xmm1,xmm3     ; xmm1=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
        punpckhbw xmm0,xmm3     ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
        movdqa    xmm6,xmm5     ; transpose coefficients(phase 1)
        punpcklbw xmm5,xmm7     ; xmm5=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
        punpckhbw xmm6,xmm7     ; xmm6=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)

        movdqa    xmm4,xmm1     ; transpose coefficients(phase 2)
        punpcklwd xmm1,xmm5     ; xmm1=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
        punpckhwd xmm4,xmm5     ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
        movdqa    xmm2,xmm6     ; transpose coefficients(phase 2)
        punpcklwd xmm6,xmm0     ; xmm6=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
        punpckhwd xmm2,xmm0     ; xmm2=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)

        movdqa    xmm3,xmm1     ; transpose coefficients(phase 3)
        punpckldq xmm1,xmm6     ; xmm1=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
        punpckhdq xmm3,xmm6     ; xmm3=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
        movdqa    xmm7,xmm4     ; transpose coefficients(phase 3)
        punpckldq xmm4,xmm2     ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
        punpckhdq xmm7,xmm2     ; xmm7=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)

        ; movq stores only the low 64 bits, so swap the qword halves
        ; (0x4E) to bring the odd rows into the low half of a copy.
        pshufd  xmm5,xmm1,0x4E  ; xmm5=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
        pshufd  xmm0,xmm3,0x4E  ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
        pshufd  xmm6,xmm4,0x4E  ; xmm6=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
        pshufd  xmm2,xmm7,0x4E  ; xmm2=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)

        ; Even rows 0,2,4,6 (edi = output_buf, eax = output_col).
        mov     edx, JSAMPROW [edi+0*SIZEOF_JSAMPROW]
        mov     esi, JSAMPROW [edi+2*SIZEOF_JSAMPROW]
        movq    XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm1
        movq    XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm3
        mov     edx, JSAMPROW [edi+4*SIZEOF_JSAMPROW]
        mov     esi, JSAMPROW [edi+6*SIZEOF_JSAMPROW]
        movq    XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm4
        movq    XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm7

        ; Odd rows 1,3,5,7.
        mov     edx, JSAMPROW [edi+1*SIZEOF_JSAMPROW]
        mov     esi, JSAMPROW [edi+3*SIZEOF_JSAMPROW]
        movq    XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm5
        movq    XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm0
        mov     edx, JSAMPROW [edi+5*SIZEOF_JSAMPROW]
        mov     esi, JSAMPROW [edi+7*SIZEOF_JSAMPROW]
        movq    XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE], xmm6
        movq    XMM_MMWORD [esi+eax*SIZEOF_JSAMPLE], xmm2

        pop     edi
        pop     esi
;       pop     edx             ; need not be preserved
;       pop     ecx             ; unused
        poppic  ebx
        mov     esp,ebp         ; esp <- aligned ebp
        pop     esp             ; esp <- frame base saved by the prologue
        pop     ebp             ; restore caller's ebp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16
