1;
2; jccolext.asm - colorspace conversion (64-bit AVX2)
3;
4; Copyright (C) 2009, 2016, D. R. Commander.
5; Copyright (C) 2015, Intel Corporation.
6;
7; Based on the x86 SIMD extension for IJG JPEG library
8; Copyright (C) 1999-2006, MIYASAKA Masaru.
9; For conditions of distribution and use, see copyright notice in jsimdext.inc
10;
11; This file should be assembled with NASM (Netwide Assembler),
12; can *not* be assembled with Microsoft's MASM or any compatible
13; assembler (including Borland's Turbo Assembler).
14; NASM is available from http://nasm.sourceforge.net/ or
15; http://sourceforge.net/project/showfiles.php?group_id=6208
16;
17; [TAB8]
18
19%include "jcolsamp.inc"
20
21; --------------------------------------------------------------------------
22;
23; Convert some rows of samples to the output colorspace.
24;
25; GLOBAL(void)
26; jsimd_rgb_ycc_convert_avx2(JDIMENSION img_width, JSAMPARRAY input_buf,
27;                            JSAMPIMAGE output_buf, JDIMENSION output_row,
28;                            int num_rows);
29;
30
; Register map once collect_args has run (the macro is defined outside this
; file; the mapping below is the one documented by the original author):
;
; r10d = JDIMENSION img_width
; r11 = JSAMPARRAY input_buf
; r12 = JSAMPIMAGE output_buf
; r13d = JDIMENSION output_row
; r14d = int num_rows
;
; Register roles inside the loops:
;   rax           = rows still to convert
;   rcx           = columns (pixels) still to convert in the current row
;   rsi           = cursor into input_buf (row loop) / inptr (column loop)
;   rdi, rbx, rdx = cursors into the three output row lists (row loop) /
;                   outptr0 (Y), outptr1 (Cb), outptr2 (Cr) (column loop)
;
; Stack layout: rbp is realigned to SIZEOF_YMMWORD and the caller's rsp is
; stored at [rbp]; the WK_NUM ymmword scratch slots wk(0)..wk(WK_NUM-1) sit
; directly below rbp, so they can be accessed with vmovdqa.
;
; Each pass of the column loop converts SIZEOF_YMMWORD pixels and stores a
; full ymmword to each output row.  A trailing partial group is loaded
; piecewise by the .column_ld* ladder and then converted as a full group, so
; a whole ymmword is still written to each output row.
; NOTE(review): this presumably relies on the output rows being padded to a
; multiple of SIZEOF_YMMWORD -- confirm against the caller's allocation.

%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD  ; ymmword wk[WK_NUM]
%define WK_NUM  8

    align       32
    GLOBAL_FUNCTION(jsimd_rgb_ycc_convert_avx2)

EXTN(jsimd_rgb_ycc_convert_avx2):
    push        rbp
    mov         rax, rsp                     ; rax = caller's rsp (after push rbp)
    sub         rsp, byte 4                  ; guarantee the aligned rsp ends up
                                             ; strictly below the saved rbp
    and         rsp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
    mov         [rsp], rax                   ; remember caller's rsp for the epilogue
    mov         rbp, rsp                     ; rbp = aligned frame base
    lea         rsp, [wk(0)]                 ; reserve the wk[] scratch area
    collect_args 5
    push        rbx

    mov         ecx, r10d               ; rcx = img_width (zero-extended)
    test        rcx, rcx
    jz          near .return            ; nothing to do for zero-width rows

    push        rcx                     ; rcx is reused below as output_row

    ; Point rdi/rbx/rdx at output_buf[0..2][output_row].
    mov         rsi, r12
    mov         ecx, r13d
    mov         rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
    mov         rbx, JSAMPARRAY [rsi+1*SIZEOF_JSAMPARRAY]
    mov         rdx, JSAMPARRAY [rsi+2*SIZEOF_JSAMPARRAY]
    lea         rdi, [rdi+rcx*SIZEOF_JSAMPROW]
    lea         rbx, [rbx+rcx*SIZEOF_JSAMPROW]
    lea         rdx, [rdx+rcx*SIZEOF_JSAMPROW]

    pop         rcx                     ; rcx = img_width again

    mov         rsi, r11
    mov         eax, r14d
    ; num_rows is a signed int, but the mov above zero-extends it into rax.
    ; The sign therefore has to be tested on the 32-bit register: testing rax
    ; would treat a negative count as a huge positive one and run the row
    ; loop far past the end of the row arrays.
    test        eax, eax
    jle         near .return
.rowloop:
    ; Save the per-row cursors; the column loop reuses these registers as
    ; sample pointers and as the column counter.
    push        rdx
    push        rbx
    push        rdi
    push        rsi
    push        rcx                     ; col

    mov         rsi, JSAMPROW [rsi]     ; inptr
    mov         rdi, JSAMPROW [rdi]     ; outptr0
    mov         rbx, JSAMPROW [rbx]     ; outptr1
    mov         rdx, JSAMPROW [rdx]     ; outptr2

    cmp         rcx, byte SIZEOF_YMMWORD
    jae         near .columnloop

%if RGB_PIXELSIZE == 3  ; ---------------

    ; Partial group: fewer than SIZEOF_YMMWORD pixels remain.  The remaining
    ; byte count (3 * col) is consumed bit by bit, from the end of the row
    ; backwards, so that no load reaches past the last input byte.
.column_ld1:
    push        rax                     ; rax/rdx are borrowed as scratch
    push        rdx
    lea         rcx, [rcx+rcx*2]        ; imul ecx,RGB_PIXELSIZE
    test        cl, SIZEOF_BYTE
    jz          short .column_ld2
    sub         rcx, byte SIZEOF_BYTE
    movzx       rax, BYTE [rsi+rcx]
.column_ld2:
    test        cl, SIZEOF_WORD
    jz          short .column_ld4
    sub         rcx, byte SIZEOF_WORD
    movzx       rdx, WORD [rsi+rcx]
    shl         rax, WORD_BIT
    or          rax, rdx
.column_ld4:
    vmovd       xmmA, eax
    pop         rdx
    pop         rax
    test        cl, SIZEOF_DWORD
    jz          short .column_ld8
    sub         rcx, byte SIZEOF_DWORD
    vmovd       xmmF, XMM_DWORD [rsi+rcx]
    vpslldq     xmmA, xmmA, SIZEOF_DWORD
    vpor        xmmA, xmmA, xmmF
.column_ld8:
    test        cl, SIZEOF_MMWORD
    jz          short .column_ld16
    sub         rcx, byte SIZEOF_MMWORD
    vmovq       xmmB, XMM_MMWORD [rsi+rcx]
    vpslldq     xmmA, xmmA, SIZEOF_MMWORD
    vpor        xmmA, xmmA, xmmB
.column_ld16:
    test        cl, SIZEOF_XMMWORD
    jz          short .column_ld32
    sub         rcx, byte SIZEOF_XMMWORD
    vmovdqu     xmmB, XMMWORD [rsi+rcx]
    vperm2i128  ymmA, ymmA, ymmA, 1     ; move what was gathered so far to the high lane
    vpor        ymmA, ymmA, ymmB
.column_ld32:
    test        cl, SIZEOF_YMMWORD
    jz          short .column_ld64
    sub         rcx, byte SIZEOF_YMMWORD
    vmovdqa     ymmF, ymmA
    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
.column_ld64:
    test        cl, 2*SIZEOF_YMMWORD
    mov         rcx, SIZEOF_YMMWORD     ; treat the tail as one full group (mov keeps flags)
    jz          short .rgb_ycc_cnv
    vmovdqa     ymmB, ymmA
    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
    jmp         short .rgb_ycc_cnv

.columnloop:
    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
    vmovdqu     ymmB, YMMWORD [rsi+2*SIZEOF_YMMWORD]

.rgb_ycc_cnv:
    ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
    ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
    ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
    ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
    ; ymmB=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
    ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)

    vmovdqa     ymmC, ymmA
    vinserti128 ymmA, ymmF, xmmA, 0  ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
                                     ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
    vinserti128 ymmC, ymmC, xmmB, 0  ; ymmC=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
                                     ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
    vinserti128 ymmB, ymmB, xmmF, 0  ; ymmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
                                     ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
    vperm2i128  ymmF, ymmC, ymmC, 1  ; ymmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
                                     ;       1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)

    vmovdqa     ymmG, ymmA
    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12
                                  ;       22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I)
    vpsrldq     ymmG, ymmG, 8     ; ymmG=(22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I
                                  ;       2I 0J 1J 2J 0K 1K 2K 0L -- -- -- -- -- -- -- --)

    vpunpckhbw  ymmA, ymmA, ymmF  ; ymmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A
                                  ;       0G 0O 1G 1O 2G 2O 0H 0P 1H 1P 2H 2P 0I 0Q 1I 1Q)
    vpslldq     ymmF, ymmF, 8     ; ymmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27
                                  ;       08 18 28 09 19 29 0A 1A 1L 2L 0M 1M 2M 0N 1N 2N)

    vpunpcklbw  ymmG, ymmG, ymmB  ; ymmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D
                                  ;       2I 2Q 0J 0R 1J 1R 2J 2R 0K 0S 1K 1S 2K 2S 0L 0T)
    vpunpckhbw  ymmF, ymmF, ymmB  ; ymmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F
                                  ;       1L 1T 2L 2T 0M 0U 1M 1U 2M 2U 0N 0V 1N 1V 2N 2V)

    vmovdqa     ymmD, ymmA
    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09
                                  ;       11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P)
    vpsrldq     ymmD, ymmD, 8     ; ymmD=(11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P
                                  ;       1H 1P 2H 2P 0I 0Q 1I 1Q -- -- -- -- -- -- -- --)

    vpunpckhbw  ymmA, ymmA, ymmG  ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D
                                  ;       0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 0H 0L 0P 0T)
    vpslldq     ymmG, ymmG, 8     ; ymmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B
                                  ;       04 0C 14 1C 24 2C 05 0D 2I 2Q 0J 0R 1J 1R 2J 2R)

    vpunpcklbw  ymmD, ymmD, ymmF  ; ymmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E
                                  ;       1H 1L 1P 1T 2H 2L 2P 2T 0I 0M 0Q 0U 1I 1M 1Q 1U)
    vpunpckhbw  ymmG, ymmG, ymmF  ; ymmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F
                                  ;       2I 2M 2Q 2U 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V)

    vmovdqa     ymmE, ymmA
    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C
                                  ;       20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S)
    vpsrldq     ymmE, ymmE, 8     ; ymmE=(20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S
                                  ;       2G 2K 2O 2S 0H 0L 0P 0T -- -- -- -- -- -- -- --)

    vpunpckhbw  ymmA, ymmA, ymmD  ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
                                  ;       0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
    vpslldq     ymmD, ymmD, 8     ; ymmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D
                                  ;       02 06 0A 0E 12 16 1A 1E 1H 1L 1P 1T 2H 2L 2P 2T)

    vpunpcklbw  ymmE, ymmE, ymmG  ; ymmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F
                                  ;       2G 2I 2K 2M 2O 2Q 2S 2U 0H 0J 0L 0N 0P 0R 0T 0V)
    vpunpckhbw  ymmD, ymmD, ymmG  ; ymmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F
                                  ;       1H 1J 1L 1N 1P 1R 1T 1V 2H 2J 2L 2N 2P 2R 2T 2V)

    vpxor       ymmH, ymmH, ymmH  ; zero source for byte -> word widening

    vmovdqa     ymmC, ymmA
    vpunpcklbw  ymmA, ymmA, ymmH  ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
    vpunpckhbw  ymmC, ymmC, ymmH  ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)

    vmovdqa     ymmB, ymmE
    vpunpcklbw  ymmE, ymmE, ymmH  ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
    vpunpckhbw  ymmB, ymmB, ymmH  ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)

    vmovdqa     ymmF, ymmD
    vpunpcklbw  ymmD, ymmD, ymmH  ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
    vpunpckhbw  ymmF, ymmF, ymmH  ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)

%else  ; RGB_PIXELSIZE == 4 ; -----------

    ; Partial group: fewer than SIZEOF_YMMWORD pixels remain.  The remaining
    ; pixel count is consumed bit by bit, from the end of the row backwards,
    ; so that no load reaches past the last input pixel.
.column_ld1:
    test        cl, SIZEOF_XMMWORD/16
    jz          short .column_ld2
    sub         rcx, byte SIZEOF_XMMWORD/16
    vmovd       xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld2:
    test        cl, SIZEOF_XMMWORD/8
    jz          short .column_ld4
    sub         rcx, byte SIZEOF_XMMWORD/8
    vmovq       xmmF, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
    vpslldq     xmmA, xmmA, SIZEOF_MMWORD
    vpor        xmmA, xmmA, xmmF
.column_ld4:
    test        cl, SIZEOF_XMMWORD/4
    jz          short .column_ld8
    sub         rcx, byte SIZEOF_XMMWORD/4
    vmovdqa     xmmF, xmmA
    vperm2i128  ymmF, ymmF, ymmF, 1     ; move what was gathered so far to the high lane
    vmovdqu     xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
    vpor        ymmA, ymmA, ymmF
.column_ld8:
    test        cl, SIZEOF_XMMWORD/2
    jz          short .column_ld16
    sub         rcx, byte SIZEOF_XMMWORD/2
    vmovdqa     ymmF, ymmA
    vmovdqu     ymmA, YMMWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld16:
    test        cl, SIZEOF_XMMWORD
    mov         rcx, SIZEOF_YMMWORD     ; treat the tail as one full group (mov keeps flags)
    jz          short .rgb_ycc_cnv
    vmovdqa     ymmE, ymmA
    vmovdqa     ymmH, ymmF
    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
    jmp         short .rgb_ycc_cnv

.columnloop:
    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
    vmovdqu     ymmE, YMMWORD [rsi+2*SIZEOF_YMMWORD]
    vmovdqu     ymmH, YMMWORD [rsi+3*SIZEOF_YMMWORD]

.rgb_ycc_cnv:
    ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
    ;       04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
    ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
    ;       0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
    ; ymmE=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
    ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
    ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
    ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)

    vmovdqa     ymmB, ymmA
    vinserti128 ymmA, ymmA, xmmE, 1     ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
                                        ;       0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
    vperm2i128  ymmE, ymmB, ymmE, 0x31  ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)

    vmovdqa     ymmB, ymmF
    vinserti128 ymmF, ymmF, xmmH, 1     ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
                                        ;       0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
    vperm2i128  ymmH, ymmB, ymmH, 0x31  ; ymmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)

    vmovdqa     ymmD, ymmA
    vpunpcklbw  ymmA, ymmA, ymmE      ; ymmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35
                                      ;       0G 0K 1G 1K 2G 2K 3G 3K 0H 0L 1H 1L 2H 2L 3H 3L)
    vpunpckhbw  ymmD, ymmD, ymmE      ; ymmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37
                                      ;       0I 0M 1I 1M 2I 2M 3I 3M 0J 0N 1J 1N 2J 2N 3J 3N)

    vmovdqa     ymmC, ymmF
    vpunpcklbw  ymmF, ymmF, ymmH      ; ymmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D
                                      ;       0O 0S 1O 1S 2O 2S 3O 3S 0P 0T 1P 1T 2P 2T 3P 3T)
    vpunpckhbw  ymmC, ymmC, ymmH      ; ymmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F
                                      ;       0Q 0U 1Q 1U 2Q 2U 3Q 3U 0R 0V 1R 1V 2R 2V 3R 3V)

    vmovdqa     ymmB, ymmA
    vpunpcklwd  ymmA, ymmA, ymmF      ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C
                                      ;       0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 3G 3K 3O 3S)
    vpunpckhwd  ymmB, ymmB, ymmF      ; ymmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D
                                      ;       0H 0L 0P 0T 1H 1L 1P 1T 2H 2L 2P 2T 3H 3L 3P 3T)

    vmovdqa     ymmG, ymmD
    vpunpcklwd  ymmD, ymmD, ymmC      ; ymmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E
                                      ;       0I 0M 0Q 0U 1I 1M 1Q 1U 2I 2M 2Q 2U 3I 3M 3Q 3U)
    vpunpckhwd  ymmG, ymmG, ymmC      ; ymmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F
                                      ;       0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V 3J 3N 3R 3V)

    vmovdqa     ymmE, ymmA
    vpunpcklbw  ymmA, ymmA, ymmD      ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
                                      ;       0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
    vpunpckhbw  ymmE, ymmE, ymmD      ; ymmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E
                                      ;       2G 2I 2K 2M 2O 2Q 2S 2U 3G 3I 3K 3M 3O 3Q 3S 3U)

    vmovdqa     ymmH, ymmB
    vpunpcklbw  ymmB, ymmB, ymmG      ; ymmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F
                                      ;       0H 0J 0L 0N 0P 0R 0T 0V 1H 1J 1L 1N 1P 1R 1T 1V)
    vpunpckhbw  ymmH, ymmH, ymmG      ; ymmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F
                                      ;       2H 2J 2L 2N 2P 2R 2T 2V 3H 3J 3L 3N 3P 3R 3T 3V)

    vpxor       ymmF, ymmF, ymmF      ; zero source for byte -> word widening

    vmovdqa     ymmC, ymmA
    vpunpcklbw  ymmA, ymmA, ymmF      ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
    vpunpckhbw  ymmC, ymmC, ymmF      ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)

    vmovdqa     ymmD, ymmB
    vpunpcklbw  ymmB, ymmB, ymmF      ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
    vpunpckhbw  ymmD, ymmD, ymmF      ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)

    vmovdqa     ymmG, ymmE
    vpunpcklbw  ymmE, ymmE, ymmF      ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
    vpunpckhbw  ymmG, ymmG, ymmF      ; ymmG=(30 32 34 36 38 3A 3C 3E 3G 3I 3K 3M 3O 3Q 3S 3U)

    ; ymmF (the zero register) is itself a destination here, so the samples
    ; land in the high byte of each word and are shifted back down afterwards.
    vpunpcklbw  ymmF, ymmF, ymmH
    vpunpckhbw  ymmH, ymmH, ymmH
    vpsrlw      ymmF, ymmF, BYTE_BIT  ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
    vpsrlw      ymmH, ymmH, BYTE_BIT  ; ymmH=(31 33 35 37 39 3B 3D 3F 3H 3J 3L 3N 3P 3R 3T 3V)

%endif  ; RGB_PIXELSIZE ; ---------------

    ; ymm0=R(02468ACEGIKMOQSU)=RE, ymm2=G(02468ACEGIKMOQSU)=GE, ymm4=B(02468ACEGIKMOQSU)=BE
    ; ymm1=R(13579BDFHJLNPRTV)=RO, ymm3=G(13579BDFHJLNPRTV)=GO, ymm5=B(13579BDFHJLNPRTV)=BO

    ; (Original)
    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
    ;
    ; (This implementation)
    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
    ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
    ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE

    ; Spill R and B: all eight ymm registers used here are needed as temporaries.
    vmovdqa     YMMWORD [wk(0)], ymm0   ; wk(0)=RE
    vmovdqa     YMMWORD [wk(1)], ymm1   ; wk(1)=RO
    vmovdqa     YMMWORD [wk(2)], ymm4   ; wk(2)=BE
    vmovdqa     YMMWORD [wk(3)], ymm5   ; wk(3)=BO

    ; ---- Odd pixels: R/G partial sums for Y (kept in wk(4..5)), then Cb ----
    vmovdqa     ymm6, ymm1
    vpunpcklwd  ymm1, ymm1, ymm3
    vpunpckhwd  ymm6, ymm6, ymm3
    vmovdqa     ymm7, ymm1
    vmovdqa     ymm4, ymm6
    vpmaddwd    ymm1, ymm1, [rel PW_F0299_F0337]  ; ymm1=ROL*FIX(0.299)+GOL*FIX(0.337)
    vpmaddwd    ymm6, ymm6, [rel PW_F0299_F0337]  ; ymm6=ROH*FIX(0.299)+GOH*FIX(0.337)
    vpmaddwd    ymm7, ymm7, [rel PW_MF016_MF033]  ; ymm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
    vpmaddwd    ymm4, ymm4, [rel PW_MF016_MF033]  ; ymm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)

    vmovdqa     YMMWORD [wk(4)], ymm1   ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
    vmovdqa     YMMWORD [wk(5)], ymm6   ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)

    ; Interleaving zero words below the samples puts each sample in the high
    ; half of a dword; the logical shift right by 1 then yields sample * 0.5
    ; in the same fixed-point scale as the vpmaddwd products.
    vpxor       ymm1, ymm1, ymm1
    vpxor       ymm6, ymm6, ymm6
    vpunpcklwd  ymm1, ymm1, ymm5        ; ymm1=BOL
    vpunpckhwd  ymm6, ymm6, ymm5        ; ymm6=BOH
    vpsrld      ymm1, ymm1, 1           ; ymm1=BOL*FIX(0.500)
    vpsrld      ymm6, ymm6, 1           ; ymm6=BOH*FIX(0.500)

    vmovdqa     ymm5, [rel PD_ONEHALFM1_CJ]  ; ymm5=[PD_ONEHALFM1_CJ]

    vpaddd      ymm7, ymm7, ymm1
    vpaddd      ymm4, ymm4, ymm6
    vpaddd      ymm7, ymm7, ymm5
    vpaddd      ymm4, ymm4, ymm5
    vpsrld      ymm7, ymm7, SCALEBITS   ; ymm7=CbOL
    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=CbOH
    vpackssdw   ymm7, ymm7, ymm4        ; ymm7=CbO

    vmovdqa     ymm1, YMMWORD [wk(2)]   ; ymm1=BE

    ; ---- Even pixels: R/G partial sums for Y (kept in wk(6..7)), then Cb ----
    vmovdqa     ymm6, ymm0
    vpunpcklwd  ymm0, ymm0, ymm2
    vpunpckhwd  ymm6, ymm6, ymm2
    vmovdqa     ymm5, ymm0
    vmovdqa     ymm4, ymm6
    vpmaddwd    ymm0, ymm0, [rel PW_F0299_F0337]  ; ymm0=REL*FIX(0.299)+GEL*FIX(0.337)
    vpmaddwd    ymm6, ymm6, [rel PW_F0299_F0337]  ; ymm6=REH*FIX(0.299)+GEH*FIX(0.337)
    vpmaddwd    ymm5, ymm5, [rel PW_MF016_MF033]  ; ymm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
    vpmaddwd    ymm4, ymm4, [rel PW_MF016_MF033]  ; ymm4=REH*-FIX(0.168)+GEH*-FIX(0.331)

    vmovdqa     YMMWORD [wk(6)], ymm0   ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
    vmovdqa     YMMWORD [wk(7)], ymm6   ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)

    vpxor       ymm0, ymm0, ymm0
    vpxor       ymm6, ymm6, ymm6
    vpunpcklwd  ymm0, ymm0, ymm1        ; ymm0=BEL
    vpunpckhwd  ymm6, ymm6, ymm1        ; ymm6=BEH
    vpsrld      ymm0, ymm0, 1           ; ymm0=BEL*FIX(0.500)
    vpsrld      ymm6, ymm6, 1           ; ymm6=BEH*FIX(0.500)

    vmovdqa     ymm1, [rel PD_ONEHALFM1_CJ]  ; ymm1=[PD_ONEHALFM1_CJ]

    vpaddd      ymm5, ymm5, ymm0
    vpaddd      ymm4, ymm4, ymm6
    vpaddd      ymm5, ymm5, ymm1
    vpaddd      ymm4, ymm4, ymm1
    vpsrld      ymm5, ymm5, SCALEBITS   ; ymm5=CbEL
    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=CbEH
    vpackssdw   ymm5, ymm5, ymm4        ; ymm5=CbE

    ; Re-interleave: even results stay in the low byte of each word, odd
    ; results are shifted into the high byte.
    vpsllw      ymm7, ymm7, BYTE_BIT
    vpor        ymm5, ymm5, ymm7        ; ymm5=Cb
    vmovdqu     YMMWORD [rbx], ymm5     ; Save Cb

    vmovdqa     ymm0, YMMWORD [wk(3)]   ; ymm0=BO
    vmovdqa     ymm6, YMMWORD [wk(2)]   ; ymm6=BE
    vmovdqa     ymm1, YMMWORD [wk(1)]   ; ymm1=RO

    ; ---- Odd pixels: finish Y from the B/G terms, then Cr ----
    vmovdqa     ymm4, ymm0
    vpunpcklwd  ymm0, ymm0, ymm3
    vpunpckhwd  ymm4, ymm4, ymm3
    vmovdqa     ymm7, ymm0
    vmovdqa     ymm5, ymm4
    vpmaddwd    ymm0, ymm0, [rel PW_F0114_F0250]  ; ymm0=BOL*FIX(0.114)+GOL*FIX(0.250)
    vpmaddwd    ymm4, ymm4, [rel PW_F0114_F0250]  ; ymm4=BOH*FIX(0.114)+GOH*FIX(0.250)
    vpmaddwd    ymm7, ymm7, [rel PW_MF008_MF041]  ; ymm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
    vpmaddwd    ymm5, ymm5, [rel PW_MF008_MF041]  ; ymm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)

    vmovdqa     ymm3, [rel PD_ONEHALF]            ; ymm3=[PD_ONEHALF]

    vpaddd      ymm0, ymm0, YMMWORD [wk(4)]
    vpaddd      ymm4, ymm4, YMMWORD [wk(5)]
    vpaddd      ymm0, ymm0, ymm3
    vpaddd      ymm4, ymm4, ymm3
    vpsrld      ymm0, ymm0, SCALEBITS   ; ymm0=YOL
    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=YOH
    vpackssdw   ymm0, ymm0, ymm4        ; ymm0=YO

    vpxor       ymm3, ymm3, ymm3
    vpxor       ymm4, ymm4, ymm4
    vpunpcklwd  ymm3, ymm3, ymm1        ; ymm3=ROL
    vpunpckhwd  ymm4, ymm4, ymm1        ; ymm4=ROH
    vpsrld      ymm3, ymm3, 1           ; ymm3=ROL*FIX(0.500)
    vpsrld      ymm4, ymm4, 1           ; ymm4=ROH*FIX(0.500)

    vmovdqa     ymm1, [rel PD_ONEHALFM1_CJ]  ; ymm1=[PD_ONEHALFM1_CJ]

    vpaddd      ymm7, ymm7, ymm3
    vpaddd      ymm5, ymm5, ymm4
    vpaddd      ymm7, ymm7, ymm1
    vpaddd      ymm5, ymm5, ymm1
    vpsrld      ymm7, ymm7, SCALEBITS   ; ymm7=CrOL
    vpsrld      ymm5, ymm5, SCALEBITS   ; ymm5=CrOH
    vpackssdw   ymm7, ymm7, ymm5        ; ymm7=CrO

    vmovdqa     ymm3, YMMWORD [wk(0)]   ; ymm3=RE

    ; ---- Even pixels: finish Y from the B/G terms, then Cr ----
    vmovdqa     ymm4, ymm6
    vpunpcklwd  ymm6, ymm6, ymm2
    vpunpckhwd  ymm4, ymm4, ymm2
    vmovdqa     ymm1, ymm6
    vmovdqa     ymm5, ymm4
    vpmaddwd    ymm6, ymm6, [rel PW_F0114_F0250]  ; ymm6=BEL*FIX(0.114)+GEL*FIX(0.250)
    vpmaddwd    ymm4, ymm4, [rel PW_F0114_F0250]  ; ymm4=BEH*FIX(0.114)+GEH*FIX(0.250)
    vpmaddwd    ymm1, ymm1, [rel PW_MF008_MF041]  ; ymm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
    vpmaddwd    ymm5, ymm5, [rel PW_MF008_MF041]  ; ymm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)

    vmovdqa     ymm2, [rel PD_ONEHALF]            ; ymm2=[PD_ONEHALF]

    vpaddd      ymm6, ymm6, YMMWORD [wk(6)]
    vpaddd      ymm4, ymm4, YMMWORD [wk(7)]
    vpaddd      ymm6, ymm6, ymm2
    vpaddd      ymm4, ymm4, ymm2
    vpsrld      ymm6, ymm6, SCALEBITS   ; ymm6=YEL
    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=YEH
    vpackssdw   ymm6, ymm6, ymm4        ; ymm6=YE

    vpsllw      ymm0, ymm0, BYTE_BIT
    vpor        ymm6, ymm6, ymm0        ; ymm6=Y
    vmovdqu     YMMWORD [rdi], ymm6     ; Save Y

    vpxor       ymm2, ymm2, ymm2
    vpxor       ymm4, ymm4, ymm4
    vpunpcklwd  ymm2, ymm2, ymm3        ; ymm2=REL
    vpunpckhwd  ymm4, ymm4, ymm3        ; ymm4=REH
    vpsrld      ymm2, ymm2, 1           ; ymm2=REL*FIX(0.500)
    vpsrld      ymm4, ymm4, 1           ; ymm4=REH*FIX(0.500)

    vmovdqa     ymm0, [rel PD_ONEHALFM1_CJ]  ; ymm0=[PD_ONEHALFM1_CJ]

    vpaddd      ymm1, ymm1, ymm2
    vpaddd      ymm5, ymm5, ymm4
    vpaddd      ymm1, ymm1, ymm0
    vpaddd      ymm5, ymm5, ymm0
    vpsrld      ymm1, ymm1, SCALEBITS   ; ymm1=CrEL
    vpsrld      ymm5, ymm5, SCALEBITS   ; ymm5=CrEH
    vpackssdw   ymm1, ymm1, ymm5        ; ymm1=CrE

    vpsllw      ymm7, ymm7, BYTE_BIT
    vpor        ymm1, ymm1, ymm7        ; ymm1=Cr
    vmovdqu     YMMWORD [rdx], ymm1     ; Save Cr

    ; Advance to the next group of SIZEOF_YMMWORD pixels.
    sub         rcx, byte SIZEOF_YMMWORD
    add         rsi, RGB_PIXELSIZE*SIZEOF_YMMWORD  ; inptr
    add         rdi, byte SIZEOF_YMMWORD           ; outptr0
    add         rbx, byte SIZEOF_YMMWORD           ; outptr1
    add         rdx, byte SIZEOF_YMMWORD           ; outptr2
    cmp         rcx, byte SIZEOF_YMMWORD
    jae         near .columnloop        ; at least one more full group
    test        rcx, rcx
    jnz         near .column_ld1        ; partial group left over

    ; Row finished: restore the per-row cursors saved at .rowloop.
    pop         rcx                     ; col
    pop         rsi
    pop         rdi
    pop         rbx
    pop         rdx

    add         rsi, byte SIZEOF_JSAMPROW  ; input_buf
    add         rdi, byte SIZEOF_JSAMPROW
    add         rbx, byte SIZEOF_JSAMPROW
    add         rdx, byte SIZEOF_JSAMPROW
    dec         rax                        ; num_rows
    jg          near .rowloop

.return:
    pop         rbx
    vzeroupper                          ; leave the upper ymm halves clean for the caller
    uncollect_args 5
    mov         rsp, rbp                ; rsp <- aligned frame base
    pop         rsp                     ; rsp <- caller's rsp saved in the prologue
    pop         rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32
561