;
; jccolext.asm - colorspace conversion (64-bit SSE2)
;
; Copyright (C) 2009, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler); it
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jcolsamp.inc"

; --------------------------------------------------------------------------
;
; Convert some rows of samples to the output colorspace.
;
; GLOBAL(void)
; jsimd_rgb_ycc_convert_sse2 (JDIMENSION img_width,
;                             JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
;                             JDIMENSION output_row, int num_rows);
;

; Argument registers as seen by the function body after collect_args:
; r10 = JDIMENSION img_width
; r11 = JSAMPARRAY input_buf
; r12 = JSAMPIMAGE output_buf
; r13 = JDIMENSION output_row
; r14 = int num_rows

; wk(i) addresses slot i of a WK_NUM-entry xmmword scratch array that sits
; immediately below the (16-byte aligned) frame pointer: wk(0) is the lowest
; address, wk(WK_NUM-1) ends at rbp.
%define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM          8

        align   16

        global  EXTN(jsimd_rgb_ycc_convert_sse2)

; --------------------------------------------------------------------------
; jsimd_rgb_ycc_convert_sse2
;
; Register roles once collect_args has run:
;   r10d = img_width, r11 = input_buf, r12 = output_buf,
;   r13d = output_row, r14d = num_rows
; Inside the row loop:
;   rsi = inptr, rdi = outptr0 (Y), rbx = outptr1 (Cb), rdx = outptr2 (Cr),
;   rcx = columns still to convert, rax = rows still to convert
; Scratch: wk(0)..wk(WK_NUM-1), 16-byte aligned, directly below aligned rbp.
; Each pass of the column loop converts SIZEOF_XMMWORD pixels.  Input is
; read with unaligned loads (movdqu); the three outputs are written with
; aligned full-xmmword stores (movdqa), including on the final partial pass.
;
EXTN(jsimd_rgb_ycc_convert_sse2):
        push    rbp
        mov     rax,rsp                         ; rax = original rsp (points at saved rbp)
        sub     rsp, byte 4
        and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
        mov     [rsp],rax                       ; keep original rsp at [aligned rbp]
        mov     rbp,rsp                         ; rbp = aligned frame pointer
        lea     rsp, [wk(0)]                    ; reserve the wk[] scratch array
        collect_args
        push    rbx

        mov     ecx, r10d                       ; rcx = img_width
        test    rcx,rcx
        jz      near .return                    ; nothing to do for zero width

        push    rcx                             ; rcx is reused as output_row below

        mov     rsi, r12                        ; rsi = output_buf
        mov     ecx, r13d                       ; rcx = output_row
        mov     rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
        mov     rbx, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
        mov     rdx, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
        lea     rdi, [rdi+rcx*SIZEOF_JSAMPROW]  ; &output_buf[0][output_row]
        lea     rbx, [rbx+rcx*SIZEOF_JSAMPROW]  ; &output_buf[1][output_row]
        lea     rdx, [rdx+rcx*SIZEOF_JSAMPROW]  ; &output_buf[2][output_row]

        pop     rcx                             ; rcx = img_width again

        mov     rsi, r11                        ; rsi = input_buf
        mov     eax, r14d                       ; rax = num_rows (zero-extended)
        test    eax,eax                         ; 32-bit test: num_rows is a signed int,
        jle     near .return                    ;  so zero *and* negative counts bail out
.rowloop:
        push    rdx
        push    rbx
        push    rdi
        push    rsi
        push    rcx                     ; col

        mov     rsi, JSAMPROW [rsi]     ; inptr
        mov     rdi, JSAMPROW [rdi]     ; outptr0
        mov     rbx, JSAMPROW [rbx]     ; outptr1
        mov     rdx, JSAMPROW [rdx]     ; outptr2

        cmp     rcx, byte SIZEOF_XMMWORD
        jae     near .columnloop

%if RGB_PIXELSIZE == 3 ; ---------------

        ; Fewer than SIZEOF_XMMWORD pixels remain: gather exactly 3*col input
        ; bytes, from the tail of the row backwards, in 1/2/4/8/16/32-byte
        ; pieces so that nothing past the last pixel is read.
        ; Note: rax is only loaded when the byte piece is present; otherwise
        ; its stale contents are shifted in above the word piece, i.e. into
        ; lanes above the 3*col valid bytes.
.column_ld1:
        push    rax
        push    rdx
        lea     rcx,[rcx+rcx*2]         ; imul ecx,RGB_PIXELSIZE
        test    cl, SIZEOF_BYTE
        jz      short .column_ld2
        sub     rcx, byte SIZEOF_BYTE
        movzx   rax, BYTE [rsi+rcx]
.column_ld2:
        test    cl, SIZEOF_WORD
        jz      short .column_ld4
        sub     rcx, byte SIZEOF_WORD
        movzx   rdx, WORD [rsi+rcx]
        shl     rax, WORD_BIT
        or      rax,rdx
.column_ld4:
        movd    xmmA,eax
        pop     rdx
        pop     rax
        test    cl, SIZEOF_DWORD
        jz      short .column_ld8
        sub     rcx, byte SIZEOF_DWORD
        movd    xmmF, XMM_DWORD [rsi+rcx]
        pslldq  xmmA, SIZEOF_DWORD
        por     xmmA,xmmF
.column_ld8:
        test    cl, SIZEOF_MMWORD
        jz      short .column_ld16
        sub     rcx, byte SIZEOF_MMWORD
        movq    xmmB, XMM_MMWORD [rsi+rcx]
        pslldq  xmmA, SIZEOF_MMWORD
        por     xmmA,xmmB
.column_ld16:
        test    cl, SIZEOF_XMMWORD
        jz      short .column_ld32
        movdqa  xmmF,xmmA
        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        mov     rcx, SIZEOF_XMMWORD     ; makes the post-conversion sub leave rcx == 0
        jmp     short .rgb_ycc_cnv
.column_ld32:
        test    cl, 2*SIZEOF_XMMWORD
        mov     rcx, SIZEOF_XMMWORD     ; mov leaves the flags from test intact
        jz      short .rgb_ycc_cnv
        movdqa  xmmB,xmmA
        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqu  xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
        jmp     short .rgb_ycc_cnv

.columnloop:
        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqu  xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
        movdqu  xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]

.rgb_ycc_cnv:
        ; De-interleave the packed 3-byte pixels into per-component
        ; even/odd word vectors.  Lane notation: <component><pixel>.
        ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
        ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
        ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)

        movdqa    xmmG,xmmA
        pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
        psrldq    xmmG,8        ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)

        punpckhbw xmmA,xmmF     ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
        pslldq    xmmF,8        ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)

        punpcklbw xmmG,xmmB     ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
        punpckhbw xmmF,xmmB     ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)

        movdqa    xmmD,xmmA
        pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
        psrldq    xmmD,8        ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)

        punpckhbw xmmA,xmmG     ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
        pslldq    xmmG,8        ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)

        punpcklbw xmmD,xmmF     ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
        punpckhbw xmmG,xmmF     ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)

        movdqa    xmmE,xmmA
        pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
        psrldq    xmmE,8        ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)

        punpckhbw xmmA,xmmD     ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
        pslldq    xmmD,8        ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)

        punpcklbw xmmE,xmmG     ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
        punpckhbw xmmD,xmmG     ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)

        pxor      xmmH,xmmH     ; zero, used to widen bytes to words

        movdqa    xmmC,xmmA
        punpcklbw xmmA,xmmH     ; xmmA=(00 02 04 06 08 0A 0C 0E)
        punpckhbw xmmC,xmmH     ; xmmC=(10 12 14 16 18 1A 1C 1E)

        movdqa    xmmB,xmmE
        punpcklbw xmmE,xmmH     ; xmmE=(20 22 24 26 28 2A 2C 2E)
        punpckhbw xmmB,xmmH     ; xmmB=(01 03 05 07 09 0B 0D 0F)

        movdqa    xmmF,xmmD
        punpcklbw xmmD,xmmH     ; xmmD=(11 13 15 17 19 1B 1D 1F)
        punpckhbw xmmF,xmmH     ; xmmF=(21 23 25 27 29 2B 2D 2F)

%else ; RGB_PIXELSIZE == 4 ; -----------

        ; Fewer than SIZEOF_XMMWORD pixels remain: gather the last col
        ; pixels (4 bytes each), from the tail of the row backwards, in
        ; 1/2/4/8-pixel pieces so that nothing past the last pixel is read.
.column_ld1:
        test    cl, SIZEOF_XMMWORD/16
        jz      short .column_ld2
        sub     rcx, byte SIZEOF_XMMWORD/16
        movd    xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld2:
        test    cl, SIZEOF_XMMWORD/8
        jz      short .column_ld4
        sub     rcx, byte SIZEOF_XMMWORD/8
        movq    xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
        pslldq  xmmA, SIZEOF_MMWORD
        por     xmmA,xmmE
.column_ld4:
        test    cl, SIZEOF_XMMWORD/4
        jz      short .column_ld8
        sub     rcx, byte SIZEOF_XMMWORD/4
        movdqa  xmmE,xmmA
        movdqu  xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld8:
        test    cl, SIZEOF_XMMWORD/2
        mov     rcx, SIZEOF_XMMWORD     ; mov leaves the flags from test intact
        jz      short .rgb_ycc_cnv
        movdqa  xmmF,xmmA
        movdqa  xmmH,xmmE
        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqu  xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
        jmp     short .rgb_ycc_cnv

.columnloop:
        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqu  xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
        movdqu  xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
        movdqu  xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]

.rgb_ycc_cnv:
        ; De-interleave the packed 4-byte pixels into per-component
        ; even/odd word vectors.  Lane notation: <component><pixel>.
        ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
        ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
        ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
        ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)

        movdqa    xmmD,xmmA
        punpcklbw xmmA,xmmE     ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
        punpckhbw xmmD,xmmE     ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)

        movdqa    xmmC,xmmF
        punpcklbw xmmF,xmmH     ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
        punpckhbw xmmC,xmmH     ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)

        movdqa    xmmB,xmmA
        punpcklwd xmmA,xmmF     ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
        punpckhwd xmmB,xmmF     ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)

        movdqa    xmmG,xmmD
        punpcklwd xmmD,xmmC     ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
        punpckhwd xmmG,xmmC     ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)

        movdqa    xmmE,xmmA
        punpcklbw xmmA,xmmD     ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
        punpckhbw xmmE,xmmD     ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)

        movdqa    xmmH,xmmB
        punpcklbw xmmB,xmmG     ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
        punpckhbw xmmH,xmmG     ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)

        pxor      xmmF,xmmF     ; zero, used to widen bytes to words

        movdqa    xmmC,xmmA
        punpcklbw xmmA,xmmF     ; xmmA=(00 02 04 06 08 0A 0C 0E)
        punpckhbw xmmC,xmmF     ; xmmC=(10 12 14 16 18 1A 1C 1E)

        movdqa    xmmD,xmmB
        punpcklbw xmmB,xmmF     ; xmmB=(01 03 05 07 09 0B 0D 0F)
        punpckhbw xmmD,xmmF     ; xmmD=(11 13 15 17 19 1B 1D 1F)

        movdqa    xmmG,xmmE
        punpcklbw xmmE,xmmF     ; xmmE=(20 22 24 26 28 2A 2C 2E)
        punpckhbw xmmG,xmmF     ; xmmG=(30 32 34 36 38 3A 3C 3E)

        ; xmmF is the zero register here, so the last pair is widened by
        ; placing the bytes in the high half of each word and shifting down.
        punpcklbw xmmF,xmmH
        punpckhbw xmmH,xmmH
        psrlw     xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
        psrlw     xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)

%endif ; RGB_PIXELSIZE ; ---------------

        ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
        ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO

        ; (Original)
        ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
        ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
        ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
        ;
        ; (This implementation)
        ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
        ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
        ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE

        ; Spill R and B: all eight xmm registers are needed as temporaries.
        movdqa    XMMWORD [wk(0)], xmm0 ; wk(0)=RE
        movdqa    XMMWORD [wk(1)], xmm1 ; wk(1)=RO
        movdqa    XMMWORD [wk(2)], xmm4 ; wk(2)=BE
        movdqa    XMMWORD [wk(3)], xmm5 ; wk(3)=BO

        ; ---- odd pixels: R/G part of Y (kept in wk(4..5)) and Cb ----
        movdqa    xmm6,xmm1
        punpcklwd xmm1,xmm3
        punpckhwd xmm6,xmm3
        movdqa    xmm7,xmm1
        movdqa    xmm4,xmm6
        pmaddwd   xmm1,[rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
        pmaddwd   xmm6,[rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
        pmaddwd   xmm7,[rel PW_MF016_MF033] ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
        pmaddwd   xmm4,[rel PW_MF016_MF033] ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)

        movdqa    XMMWORD [wk(4)], xmm1 ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
        movdqa    XMMWORD [wk(5)], xmm6 ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)

        ; Unpacking against zero puts B in the high word of each dword;
        ; the 1-bit right shift then yields the 0.5*B term.
        pxor      xmm1,xmm1
        pxor      xmm6,xmm6
        punpcklwd xmm1,xmm5             ; xmm1=BOL
        punpckhwd xmm6,xmm5             ; xmm6=BOH
        psrld     xmm1,1                ; xmm1=BOL*FIX(0.500)
        psrld     xmm6,1                ; xmm6=BOH*FIX(0.500)

        movdqa    xmm5,[rel PD_ONEHALFM1_CJ] ; xmm5=[PD_ONEHALFM1_CJ]

        paddd     xmm7,xmm1
        paddd     xmm4,xmm6
        paddd     xmm7,xmm5
        paddd     xmm4,xmm5
        psrld     xmm7,SCALEBITS        ; xmm7=CbOL
        psrld     xmm4,SCALEBITS        ; xmm4=CbOH
        packssdw  xmm7,xmm4             ; xmm7=CbO

        ; ---- even pixels: R/G part of Y (kept in wk(6..7)) and Cb ----
        movdqa    xmm1, XMMWORD [wk(2)] ; xmm1=BE

        movdqa    xmm6,xmm0
        punpcklwd xmm0,xmm2
        punpckhwd xmm6,xmm2
        movdqa    xmm5,xmm0
        movdqa    xmm4,xmm6
        pmaddwd   xmm0,[rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
        pmaddwd   xmm6,[rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
        pmaddwd   xmm5,[rel PW_MF016_MF033] ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
        pmaddwd   xmm4,[rel PW_MF016_MF033] ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)

        movdqa    XMMWORD [wk(6)], xmm0 ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
        movdqa    XMMWORD [wk(7)], xmm6 ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)

        pxor      xmm0,xmm0
        pxor      xmm6,xmm6
        punpcklwd xmm0,xmm1             ; xmm0=BEL
        punpckhwd xmm6,xmm1             ; xmm6=BEH
        psrld     xmm0,1                ; xmm0=BEL*FIX(0.500)
        psrld     xmm6,1                ; xmm6=BEH*FIX(0.500)

        movdqa    xmm1,[rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]

        paddd     xmm5,xmm0
        paddd     xmm4,xmm6
        paddd     xmm5,xmm1
        paddd     xmm4,xmm1
        psrld     xmm5,SCALEBITS        ; xmm5=CbEL
        psrld     xmm4,SCALEBITS        ; xmm4=CbEH
        packssdw  xmm5,xmm4             ; xmm5=CbE

        ; Odd results go to the high byte of each word, even results stay
        ; in the low byte: OR-ing them re-interleaves the samples.
        psllw     xmm7,BYTE_BIT
        por       xmm5,xmm7             ; xmm5=Cb
        movdqa    XMMWORD [rbx], xmm5   ; Save Cb

        ; ---- odd pixels: finish Y, compute Cr ----
        movdqa    xmm0, XMMWORD [wk(3)] ; xmm0=BO
        movdqa    xmm6, XMMWORD [wk(2)] ; xmm6=BE
        movdqa    xmm1, XMMWORD [wk(1)] ; xmm1=RO

        movdqa    xmm4,xmm0
        punpcklwd xmm0,xmm3
        punpckhwd xmm4,xmm3
        movdqa    xmm7,xmm0
        movdqa    xmm5,xmm4
        pmaddwd   xmm0,[rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
        pmaddwd   xmm4,[rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
        pmaddwd   xmm7,[rel PW_MF008_MF041] ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
        pmaddwd   xmm5,[rel PW_MF008_MF041] ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)

        movdqa    xmm3,[rel PD_ONEHALF] ; xmm3=[PD_ONEHALF]

        paddd     xmm0, XMMWORD [wk(4)]
        paddd     xmm4, XMMWORD [wk(5)]
        paddd     xmm0,xmm3
        paddd     xmm4,xmm3
        psrld     xmm0,SCALEBITS        ; xmm0=YOL
        psrld     xmm4,SCALEBITS        ; xmm4=YOH
        packssdw  xmm0,xmm4             ; xmm0=YO

        pxor      xmm3,xmm3
        pxor      xmm4,xmm4
        punpcklwd xmm3,xmm1             ; xmm3=ROL
        punpckhwd xmm4,xmm1             ; xmm4=ROH
        psrld     xmm3,1                ; xmm3=ROL*FIX(0.500)
        psrld     xmm4,1                ; xmm4=ROH*FIX(0.500)

        movdqa    xmm1,[rel PD_ONEHALFM1_CJ] ; xmm1=[PD_ONEHALFM1_CJ]

        paddd     xmm7,xmm3
        paddd     xmm5,xmm4
        paddd     xmm7,xmm1
        paddd     xmm5,xmm1
        psrld     xmm7,SCALEBITS        ; xmm7=CrOL
        psrld     xmm5,SCALEBITS        ; xmm5=CrOH
        packssdw  xmm7,xmm5             ; xmm7=CrO

        ; ---- even pixels: finish Y, compute Cr ----
        movdqa    xmm3, XMMWORD [wk(0)] ; xmm3=RE

        movdqa    xmm4,xmm6
        punpcklwd xmm6,xmm2
        punpckhwd xmm4,xmm2
        movdqa    xmm1,xmm6
        movdqa    xmm5,xmm4
        pmaddwd   xmm6,[rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
        pmaddwd   xmm4,[rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
        pmaddwd   xmm1,[rel PW_MF008_MF041] ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
        pmaddwd   xmm5,[rel PW_MF008_MF041] ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)

        movdqa    xmm2,[rel PD_ONEHALF] ; xmm2=[PD_ONEHALF]

        paddd     xmm6, XMMWORD [wk(6)]
        paddd     xmm4, XMMWORD [wk(7)]
        paddd     xmm6,xmm2
        paddd     xmm4,xmm2
        psrld     xmm6,SCALEBITS        ; xmm6=YEL
        psrld     xmm4,SCALEBITS        ; xmm4=YEH
        packssdw  xmm6,xmm4             ; xmm6=YE

        psllw     xmm0,BYTE_BIT
        por       xmm6,xmm0             ; xmm6=Y
        movdqa    XMMWORD [rdi], xmm6   ; Save Y

        pxor      xmm2,xmm2
        pxor      xmm4,xmm4
        punpcklwd xmm2,xmm3             ; xmm2=REL
        punpckhwd xmm4,xmm3             ; xmm4=REH
        psrld     xmm2,1                ; xmm2=REL*FIX(0.500)
        psrld     xmm4,1                ; xmm4=REH*FIX(0.500)

        movdqa    xmm0,[rel PD_ONEHALFM1_CJ] ; xmm0=[PD_ONEHALFM1_CJ]

        paddd     xmm1,xmm2
        paddd     xmm5,xmm4
        paddd     xmm1,xmm0
        paddd     xmm5,xmm0
        psrld     xmm1,SCALEBITS        ; xmm1=CrEL
        psrld     xmm5,SCALEBITS        ; xmm5=CrEH
        packssdw  xmm1,xmm5             ; xmm1=CrE

        psllw     xmm7,BYTE_BIT
        por       xmm1,xmm7             ; xmm1=Cr
        movdqa    XMMWORD [rdx], xmm1   ; Save Cr

        ; Advance to the next group of SIZEOF_XMMWORD pixels.
        sub     rcx, byte SIZEOF_XMMWORD
        add     rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr
        add     rdi, byte SIZEOF_XMMWORD                ; outptr0
        add     rbx, byte SIZEOF_XMMWORD                ; outptr1
        add     rdx, byte SIZEOF_XMMWORD                ; outptr2
        cmp     rcx, byte SIZEOF_XMMWORD
        jae     near .columnloop
        test    rcx,rcx
        jnz     near .column_ld1        ; partial group left: gather the tail

        ; Row finished: restore the row-array cursors and the column count.
        pop     rcx                     ; col
        pop     rsi
        pop     rdi
        pop     rbx
        pop     rdx

        add     rsi, byte SIZEOF_JSAMPROW       ; input_buf
        add     rdi, byte SIZEOF_JSAMPROW
        add     rbx, byte SIZEOF_JSAMPROW
        add     rdx, byte SIZEOF_JSAMPROW
        dec     rax                             ; num_rows
        jg      near .rowloop

.return:
        pop     rbx
        uncollect_args
        mov     rsp,rbp         ; rsp <- aligned rbp
        pop     rsp             ; rsp <- original rsp (saved at [aligned rbp])
        pop     rbp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16
