;
; jccolext.asm - colorspace conversion (64-bit SSE2)
;
; Copyright (C) 2009, 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

; NOTE(review): the SIZEOF_*, xmmA-xmmH, EXTN, GLOBAL_FUNCTION and
; collect_args/uncollect_args names used below are not defined in this file;
; presumably they come in through this include - confirm.
%include "jcolsamp.inc"

; --------------------------------------------------------------------------
;
; Convert some rows of samples to the output colorspace.
;
; GLOBAL(void)
; jsimd_rgb_ycc_convert_sse2(JDIMENSION img_width, JSAMPARRAY input_buf,
;                            JSAMPIMAGE output_buf, JDIMENSION output_row,
;                            int num_rows);
;
; For each of num_rows rows, reads img_width pixels of RGB_PIXELSIZE (3 or 4)
; bytes from the next input_buf row and writes one byte per pixel to the
; matching row (starting at output_row) of output_buf[0], output_buf[1] and
; output_buf[2] (Y, Cb and Cr respectively).
;
; Notes for callers (all follow from the code below):
;   - Pixels are processed in groups of SIZEOF_XMMWORD.  A final partial
;     group is still stored as a full xmmword, so every output row must have
;     room for img_width rounded up to a multiple of SIZEOF_XMMWORD bytes.
;     The lanes past img_width are computed from whatever the partial-load
;     code left in the registers and carry no meaning.
;   - Output rows are written with movdqa and must therefore be aligned to
;     SIZEOF_XMMWORD.  Input rows are only read with unaligned loads.
;   - Returns immediately when img_width == 0 or num_rows <= 0.
;

; Argument registers as used below, after collect_args:
; r10d = JDIMENSION img_width
; r11 = JSAMPARRAY input_buf
; r12 = JSAMPIMAGE output_buf
; r13d = JDIMENSION output_row
; r14d = int num_rows

; wk(i) addresses the i-th xmmword of a WK_NUM-entry scratch array that sits
; immediately below the aligned frame pointer (wk(0) is the lowest address).
%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
%define WK_NUM  8

    align       32
    GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_sse2)

EXTN(jsimd_rgb_ycc_convert_sse2):
    ; Build an xmmword-aligned frame.  The qword at [rbp] remembers the
    ; pre-alignment rsp so the epilogue can undo the alignment.
    push        rbp
    mov         rax, rsp                     ; rax = rsp after the push (-> saved rbp)
    sub         rsp, byte 4
    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    mov         [rsp], rax                   ; stash the unaligned rsp
    mov         rbp, rsp                     ; rbp = aligned rbp
    lea         rsp, [wk(0)]                 ; reserve wk[WK_NUM] below rbp
    collect_args 5
    push        rbx

    mov         ecx, r10d               ; rcx = img_width (zero-extended)
    test        rcx, rcx
    jz          near .return

    push        rcx                     ; rcx is borrowed for output_row below

    ; rdi/rbx/rdx = &output_buf[0..2][output_row]
    mov         rsi, r12
    mov         ecx, r13d
    mov         rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
    mov         rbx, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
    mov         rdx, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
    lea         rdi, [rdi+rcx*SIZEOF_JSAMPROW]
    lea         rbx, [rbx+rcx*SIZEOF_JSAMPROW]
    lea         rdx, [rdx+rcx*SIZEOF_JSAMPROW]

    pop         rcx                     ; rcx = img_width again

    mov         rsi, r11
    mov         eax, r14d
    ; num_rows is a signed int.  The mov above zero-extends it into rax, so
    ; the sign has to be tested on eax; testing rax would let a negative
    ; count through as a huge positive row count.
    test        eax, eax
    jle         near .return
.rowloop:
    ; Save the row-pointer cursors and the column count; the column loop
    ; reuses all five registers.
    push        rdx
    push        rbx
    push        rdi
    push        rsi
    push        rcx                     ; col

    mov         rsi, JSAMPROW [rsi]     ; inptr
    mov         rdi, JSAMPROW [rdi]     ; outptr0
    mov         rbx, JSAMPROW [rbx]     ; outptr1
    mov         rdx, JSAMPROW [rdx]     ; outptr2

    cmp         rcx, byte SIZEOF_XMMWORD
    jae         near .columnloop

%if RGB_PIXELSIZE == 3  ; ---------------

    ; Partial group: fewer than SIZEOF_XMMWORD pixels remain.  rcx is turned
    ; into a byte count and its bits select which piece sizes to load,
    ; working backwards from the end of the row so that nothing past the
    ; last pixel is read.  rax/rdx are borrowed as scratch and restored at
    ; .column_ld4.
.column_ld1:
    push        rax
    push        rdx
    lea         rcx, [rcx+rcx*2]        ; imul ecx,RGB_PIXELSIZE
    test        cl, SIZEOF_BYTE
    jz          short .column_ld2
    sub         rcx, byte SIZEOF_BYTE
    movzx       rax, BYTE [rsi+rcx]
.column_ld2:
    test        cl, SIZEOF_WORD
    jz          short .column_ld4
    sub         rcx, byte SIZEOF_WORD
    movzx       rdx, WORD [rsi+rcx]
    shl         rax, WORD_BIT
    or          rax, rdx
.column_ld4:
    movd        xmmA, eax
    pop         rdx
    pop         rax
    test        cl, SIZEOF_DWORD
    jz          short .column_ld8
    sub         rcx, byte SIZEOF_DWORD
    movd        xmmF, XMM_DWORD [rsi+rcx]
    pslldq      xmmA, SIZEOF_DWORD
    por         xmmA, xmmF
.column_ld8:
    test        cl, SIZEOF_MMWORD
    jz          short .column_ld16
    sub         rcx, byte SIZEOF_MMWORD
    movq        xmmB, XMM_MMWORD [rsi+rcx]
    pslldq      xmmA, SIZEOF_MMWORD
    por         xmmA, xmmB
.column_ld16:
    test        cl, SIZEOF_XMMWORD
    jz          short .column_ld32
    movdqa      xmmF, xmmA
    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    mov         rcx, SIZEOF_XMMWORD     ; treat as one full, final group
    jmp         short .rgb_ycc_cnv
.column_ld32:
    test        cl, 2*SIZEOF_XMMWORD
    mov         rcx, SIZEOF_XMMWORD     ; mov leaves the flags from test intact
    jz          short .rgb_ycc_cnv
    movdqa      xmmB, xmmA
    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqu      xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    jmp         short .rgb_ycc_cnv

    ; Full group: SIZEOF_XMMWORD pixels = 3 xmmwords of packed RGB.
.columnloop:
    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqu      xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    movdqu      xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]

    ; De-interleave the packed bytes into per-component vectors of 16-bit
    ; values.  In the lane comments "cp" means component c of pixel p.
.rgb_ycc_cnv:
    ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
    ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
    ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)

    movdqa      xmmG, xmmA
    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
    psrldq      xmmG, 8     ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)

    punpckhbw   xmmA, xmmF  ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
    pslldq      xmmF, 8     ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)

    punpcklbw   xmmG, xmmB  ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
    punpckhbw   xmmF, xmmB  ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)

    movdqa      xmmD, xmmA
    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
    psrldq      xmmD, 8     ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)

    punpckhbw   xmmA, xmmG  ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
    pslldq      xmmG, 8     ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)

    punpcklbw   xmmD, xmmF  ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
    punpckhbw   xmmG, xmmF  ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)

    movdqa      xmmE, xmmA
    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
    psrldq      xmmE, 8     ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)

    punpckhbw   xmmA, xmmD  ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
    pslldq      xmmD, 8     ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)

    punpcklbw   xmmE, xmmG  ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
    punpckhbw   xmmD, xmmG  ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)

    pxor        xmmH, xmmH  ; zero, used to widen bytes to words

    movdqa      xmmC, xmmA
    punpcklbw   xmmA, xmmH  ; xmmA=(00 02 04 06 08 0A 0C 0E)
    punpckhbw   xmmC, xmmH  ; xmmC=(10 12 14 16 18 1A 1C 1E)

    movdqa      xmmB, xmmE
    punpcklbw   xmmE, xmmH  ; xmmE=(20 22 24 26 28 2A 2C 2E)
    punpckhbw   xmmB, xmmH  ; xmmB=(01 03 05 07 09 0B 0D 0F)

    movdqa      xmmF, xmmD
    punpcklbw   xmmD, xmmH  ; xmmD=(11 13 15 17 19 1B 1D 1F)
    punpckhbw   xmmF, xmmH  ; xmmF=(21 23 25 27 29 2B 2D 2F)

%else  ; RGB_PIXELSIZE == 4 ; -----------

    ; Partial group: fewer than SIZEOF_XMMWORD pixels remain.  The bits of
    ; the pixel count in rcx select 1-, 2-, 4- and 8-pixel pieces, loaded
    ; backwards from the end of the row so that nothing past the last pixel
    ; is read.
.column_ld1:
    test        cl, SIZEOF_XMMWORD/16
    jz          short .column_ld2
    sub         rcx, byte SIZEOF_XMMWORD/16
    movd        xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld2:
    test        cl, SIZEOF_XMMWORD/8
    jz          short .column_ld4
    sub         rcx, byte SIZEOF_XMMWORD/8
    movq        xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
    pslldq      xmmA, SIZEOF_MMWORD
    por         xmmA, xmmE
.column_ld4:
    test        cl, SIZEOF_XMMWORD/4
    jz          short .column_ld8
    sub         rcx, byte SIZEOF_XMMWORD/4
    movdqa      xmmE, xmmA
    movdqu      xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld8:
    test        cl, SIZEOF_XMMWORD/2
    mov         rcx, SIZEOF_XMMWORD     ; mov leaves the flags from test intact
    jz          short .rgb_ycc_cnv
    movdqa      xmmF, xmmA
    movdqa      xmmH, xmmE
    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqu      xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    jmp         short .rgb_ycc_cnv

    ; Full group: SIZEOF_XMMWORD pixels = 4 xmmwords of packed 4-byte pixels.
.columnloop:
    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqu      xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
    movdqu      xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
    movdqu      xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]

    ; De-interleave the packed bytes into per-component vectors of 16-bit
    ; values.  In the lane comments "cp" means component c of pixel p.
.rgb_ycc_cnv:
    ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
    ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
    ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
    ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)

    movdqa      xmmD, xmmA
    punpcklbw   xmmA, xmmE      ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
    punpckhbw   xmmD, xmmE      ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)

    movdqa      xmmC, xmmF
    punpcklbw   xmmF, xmmH      ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
    punpckhbw   xmmC, xmmH      ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)

    movdqa      xmmB, xmmA
    punpcklwd   xmmA, xmmF      ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
    punpckhwd   xmmB, xmmF      ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)

    movdqa      xmmG, xmmD
    punpcklwd   xmmD, xmmC      ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
    punpckhwd   xmmG, xmmC      ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)

    movdqa      xmmE, xmmA
    punpcklbw   xmmA, xmmD      ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
    punpckhbw   xmmE, xmmD      ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)

    movdqa      xmmH, xmmB
    punpcklbw   xmmB, xmmG      ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
    punpckhbw   xmmH, xmmG      ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)

    pxor        xmmF, xmmF      ; zero, used to widen bytes to words

    movdqa      xmmC, xmmA
    punpcklbw   xmmA, xmmF      ; xmmA=(00 02 04 06 08 0A 0C 0E)
    punpckhbw   xmmC, xmmF      ; xmmC=(10 12 14 16 18 1A 1C 1E)

    movdqa      xmmD, xmmB
    punpcklbw   xmmB, xmmF      ; xmmB=(01 03 05 07 09 0B 0D 0F)
    punpckhbw   xmmD, xmmF      ; xmmD=(11 13 15 17 19 1B 1D 1F)

    movdqa      xmmG, xmmE
    punpcklbw   xmmE, xmmF      ; xmmE=(20 22 24 26 28 2A 2C 2E)
    punpckhbw   xmmG, xmmF      ; xmmG=(30 32 34 36 38 3A 3C 3E)

    ; xmmF is the zero register here, so the data is placed in the high byte
    ; of each word and then shifted down instead of unpacking against zero.
    punpcklbw   xmmF, xmmH
    punpckhbw   xmmH, xmmH
    psrlw       xmmF, BYTE_BIT  ; xmmF=(21 23 25 27 29 2B 2D 2F)
    psrlw       xmmH, BYTE_BIT  ; xmmH=(31 33 35 37 39 3B 3D 3F)

%endif  ; RGB_PIXELSIZE ; ---------------

    ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
    ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO

    ; (Original)
    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
    ;
    ; (This implementation)
    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
    ;
    ; Suffixes below: E/O = even/odd pixels, L/H = low/high half of a vector.
    ; The (R,G) partial sums of Y are parked in wk(4)..wk(7) until the (B,G)
    ; partial sums are available.

    movdqa      XMMWORD [wk(0)], xmm0   ; wk(0)=RE
    movdqa      XMMWORD [wk(1)], xmm1   ; wk(1)=RO
    movdqa      XMMWORD [wk(2)], xmm4   ; wk(2)=BE
    movdqa      XMMWORD [wk(3)], xmm5   ; wk(3)=BO

    ; ---- Cb, odd pixels (plus the R/G part of Y, odd pixels)

    movdqa      xmm6, xmm1
    punpcklwd   xmm1, xmm3
    punpckhwd   xmm6, xmm3
    movdqa      xmm7, xmm1
    movdqa      xmm4, xmm6
    pmaddwd     xmm1, [rel PW_F0299_F0337]  ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
    pmaddwd     xmm6, [rel PW_F0299_F0337]  ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
    pmaddwd     xmm7, [rel PW_MF016_MF033]  ; xmm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
    pmaddwd     xmm4, [rel PW_MF016_MF033]  ; xmm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)

    movdqa      XMMWORD [wk(4)], xmm1   ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
    movdqa      XMMWORD [wk(5)], xmm6   ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)

    ; Unpacking a word into the high half of a zeroed dword and shifting
    ; right by one leaves it in the dword scaled by 2^15.
    pxor        xmm1, xmm1
    pxor        xmm6, xmm6
    punpcklwd   xmm1, xmm5              ; xmm1=BOL
    punpckhwd   xmm6, xmm5              ; xmm6=BOH
    psrld       xmm1, 1                 ; xmm1=BOL*FIX(0.500)
    psrld       xmm6, 1                 ; xmm6=BOH*FIX(0.500)

    movdqa      xmm5, [rel PD_ONEHALFM1_CJ]  ; xmm5=[PD_ONEHALFM1_CJ]

    paddd       xmm7, xmm1
    paddd       xmm4, xmm6
    paddd       xmm7, xmm5
    paddd       xmm4, xmm5
    psrld       xmm7, SCALEBITS         ; xmm7=CbOL
    psrld       xmm4, SCALEBITS         ; xmm4=CbOH
    packssdw    xmm7, xmm4              ; xmm7=CbO

    ; ---- Cb, even pixels (plus the R/G part of Y, even pixels)

    movdqa      xmm1, XMMWORD [wk(2)]   ; xmm1=BE

    movdqa      xmm6, xmm0
    punpcklwd   xmm0, xmm2
    punpckhwd   xmm6, xmm2
    movdqa      xmm5, xmm0
    movdqa      xmm4, xmm6
    pmaddwd     xmm0, [rel PW_F0299_F0337]  ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
    pmaddwd     xmm6, [rel PW_F0299_F0337]  ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
    pmaddwd     xmm5, [rel PW_MF016_MF033]  ; xmm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
    pmaddwd     xmm4, [rel PW_MF016_MF033]  ; xmm4=REH*-FIX(0.168)+GEH*-FIX(0.331)

    movdqa      XMMWORD [wk(6)], xmm0   ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
    movdqa      XMMWORD [wk(7)], xmm6   ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)

    pxor        xmm0, xmm0
    pxor        xmm6, xmm6
    punpcklwd   xmm0, xmm1              ; xmm0=BEL
    punpckhwd   xmm6, xmm1              ; xmm6=BEH
    psrld       xmm0, 1                 ; xmm0=BEL*FIX(0.500)
    psrld       xmm6, 1                 ; xmm6=BEH*FIX(0.500)

    movdqa      xmm1, [rel PD_ONEHALFM1_CJ]  ; xmm1=[PD_ONEHALFM1_CJ]

    paddd       xmm5, xmm0
    paddd       xmm4, xmm6
    paddd       xmm5, xmm1
    paddd       xmm4, xmm1
    psrld       xmm5, SCALEBITS         ; xmm5=CbEL
    psrld       xmm4, SCALEBITS         ; xmm4=CbEH
    packssdw    xmm5, xmm4              ; xmm5=CbE

    ; Re-interleave: odd results go to the high byte of each word, even
    ; results stay in the low byte.
    psllw       xmm7, BYTE_BIT
    por         xmm5, xmm7              ; xmm5=Cb
    movdqa      XMMWORD [rbx], xmm5     ; Save Cb

    ; ---- Y and Cr, odd pixels

    movdqa      xmm0, XMMWORD [wk(3)]   ; xmm0=BO
    movdqa      xmm6, XMMWORD [wk(2)]   ; xmm6=BE
    movdqa      xmm1, XMMWORD [wk(1)]   ; xmm1=RO

    movdqa      xmm4, xmm0
    punpcklwd   xmm0, xmm3
    punpckhwd   xmm4, xmm3
    movdqa      xmm7, xmm0
    movdqa      xmm5, xmm4
    pmaddwd     xmm0, [rel PW_F0114_F0250]  ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
    pmaddwd     xmm4, [rel PW_F0114_F0250]  ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
    pmaddwd     xmm7, [rel PW_MF008_MF041]  ; xmm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
    pmaddwd     xmm5, [rel PW_MF008_MF041]  ; xmm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)

    movdqa      xmm3, [rel PD_ONEHALF]      ; xmm3=[PD_ONEHALF]

    paddd       xmm0, XMMWORD [wk(4)]
    paddd       xmm4, XMMWORD [wk(5)]
    paddd       xmm0, xmm3
    paddd       xmm4, xmm3
    psrld       xmm0, SCALEBITS         ; xmm0=YOL
    psrld       xmm4, SCALEBITS         ; xmm4=YOH
    packssdw    xmm0, xmm4              ; xmm0=YO

    pxor        xmm3, xmm3
    pxor        xmm4, xmm4
    punpcklwd   xmm3, xmm1              ; xmm3=ROL
    punpckhwd   xmm4, xmm1              ; xmm4=ROH
    psrld       xmm3, 1                 ; xmm3=ROL*FIX(0.500)
    psrld       xmm4, 1                 ; xmm4=ROH*FIX(0.500)

    movdqa      xmm1, [rel PD_ONEHALFM1_CJ]  ; xmm1=[PD_ONEHALFM1_CJ]

    paddd       xmm7, xmm3
    paddd       xmm5, xmm4
    paddd       xmm7, xmm1
    paddd       xmm5, xmm1
    psrld       xmm7, SCALEBITS         ; xmm7=CrOL
    psrld       xmm5, SCALEBITS         ; xmm5=CrOH
    packssdw    xmm7, xmm5              ; xmm7=CrO

    ; ---- Y and Cr, even pixels

    movdqa      xmm3, XMMWORD [wk(0)]   ; xmm3=RE

    movdqa      xmm4, xmm6
    punpcklwd   xmm6, xmm2
    punpckhwd   xmm4, xmm2
    movdqa      xmm1, xmm6
    movdqa      xmm5, xmm4
    pmaddwd     xmm6, [rel PW_F0114_F0250]  ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
    pmaddwd     xmm4, [rel PW_F0114_F0250]  ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
    pmaddwd     xmm1, [rel PW_MF008_MF041]  ; xmm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
    pmaddwd     xmm5, [rel PW_MF008_MF041]  ; xmm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)

    movdqa      xmm2, [rel PD_ONEHALF]      ; xmm2=[PD_ONEHALF]

    paddd       xmm6, XMMWORD [wk(6)]
    paddd       xmm4, XMMWORD [wk(7)]
    paddd       xmm6, xmm2
    paddd       xmm4, xmm2
    psrld       xmm6, SCALEBITS         ; xmm6=YEL
    psrld       xmm4, SCALEBITS         ; xmm4=YEH
    packssdw    xmm6, xmm4              ; xmm6=YE

    psllw       xmm0, BYTE_BIT
    por         xmm6, xmm0              ; xmm6=Y
    movdqa      XMMWORD [rdi], xmm6     ; Save Y

    pxor        xmm2, xmm2
    pxor        xmm4, xmm4
    punpcklwd   xmm2, xmm3              ; xmm2=REL
    punpckhwd   xmm4, xmm3              ; xmm4=REH
    psrld       xmm2, 1                 ; xmm2=REL*FIX(0.500)
    psrld       xmm4, 1                 ; xmm4=REH*FIX(0.500)

    movdqa      xmm0, [rel PD_ONEHALFM1_CJ]  ; xmm0=[PD_ONEHALFM1_CJ]

    paddd       xmm1, xmm2
    paddd       xmm5, xmm4
    paddd       xmm1, xmm0
    paddd       xmm5, xmm0
    psrld       xmm1, SCALEBITS         ; xmm1=CrEL
    psrld       xmm5, SCALEBITS         ; xmm5=CrEH
    packssdw    xmm1, xmm5              ; xmm1=CrE

    psllw       xmm7, BYTE_BIT
    por         xmm1, xmm7              ; xmm1=Cr
    movdqa      XMMWORD [rdx], xmm1     ; Save Cr

    ; Advance to the next pixel group; fall into the partial-group loader
    ; when between 1 and SIZEOF_XMMWORD-1 pixels are left.
    sub         rcx, byte SIZEOF_XMMWORD
    add         rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr
    add         rdi, byte SIZEOF_XMMWORD                ; outptr0
    add         rbx, byte SIZEOF_XMMWORD                ; outptr1
    add         rdx, byte SIZEOF_XMMWORD                ; outptr2
    cmp         rcx, byte SIZEOF_XMMWORD
    jae         near .columnloop
    test        rcx, rcx
    jnz         near .column_ld1

    ; Row finished: restore the row cursors and step each to the next row.
    pop         rcx                     ; col
    pop         rsi
    pop         rdi
    pop         rbx
    pop         rdx

    add         rsi, byte SIZEOF_JSAMPROW  ; input_buf
    add         rdi, byte SIZEOF_JSAMPROW
    add         rbx, byte SIZEOF_JSAMPROW
    add         rdx, byte SIZEOF_JSAMPROW
    dec         rax                        ; num_rows
    jg          near .rowloop

.return:
    pop         rbx
    uncollect_args 5
    mov         rsp, rbp                ; rsp <- aligned rbp
    pop         rsp                     ; rsp <- pre-alignment rsp (-> saved rbp)
    pop         rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32
