1;
2; jcgryext.asm - grayscale colorspace conversion (64-bit AVX2)
3;
4; Copyright (C) 2011, 2016, D. R. Commander.
5; Copyright (C) 2015, Intel Corporation.
6;
7; Based on the x86 SIMD extension for IJG JPEG library
8; Copyright (C) 1999-2006, MIYASAKA Masaru.
9; For conditions of distribution and use, see copyright notice in jsimdext.inc
10;
11; This file should be assembled with NASM (Netwide Assembler),
12; can *not* be assembled with Microsoft's MASM or any compatible
13; assembler (including Borland's Turbo Assembler).
14; NASM is available from http://nasm.sourceforge.net/ or
15; http://sourceforge.net/project/showfiles.php?group_id=6208
16;
17; [TAB8]
18
19%include "jcolsamp.inc"
20
; --------------------------------------------------------------------------
;
; Convert some rows of samples to the output colorspace.
;
; GLOBAL(void)
; jsimd_rgb_gray_convert_avx2(JDIMENSION img_width, JSAMPARRAY input_buf,
;                             JSAMPIMAGE output_buf, JDIMENSION output_row,
;                             int num_rows);
;
; For each of num_rows rows, reads img_width packed pixels of RGB_PIXELSIZE
; bytes each through the row pointers in input_buf and writes one luma byte
; per pixel through the row pointers of output_buf[0], starting at row
; output_row:
;
;   Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
;
; Returns without touching memory when img_width == 0 or num_rows <= 0.
;
; Pixels are handled in groups of SIZEOF_YMMWORD.  For a final partial group
; only the valid input bytes are read, but the result is still stored as one
; full YMMWORD, so each output row can be written up to SIZEOF_YMMWORD - 1
; bytes past img_width.
; NOTE(review): this presumes the caller pads its output rows accordingly --
; confirm against the buffer allocation code.
;
; ymmA-ymmH, the PW_* / PD_* constants, SCALEBITS and the collect_args /
; uncollect_args macros are not defined in this file (presumably they come
; from the included jcolsamp.inc); see their definitions for the register
; mapping and for which registers the argument macros preserve.
;

; Argument registers as read after collect_args:
; r10d = JDIMENSION img_width
; r11 = JSAMPARRAY input_buf
; r12 = JSAMPIMAGE output_buf
; r13d = JDIMENSION output_row
; r14d = int num_rows

; Scratch area: WK_NUM YMMWORD slots directly below the aligned rbp.
; wk(i) is the address expression of slot i.
%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_YMMWORD  ; ymmword wk[WK_NUM]
%define WK_NUM  2

    align       32
    GLOBAL_FUNCTION(jsimd_rgb_gray_convert_avx2)

EXTN(jsimd_rgb_gray_convert_avx2):
    push        rbp
    mov         rax, rsp                     ; rax = unaligned rsp (-> saved rbp)
    sub         rsp, byte 4                  ; keep the slot below clear of saved rbp
    and         rsp, byte (-SIZEOF_YMMWORD)  ; align to 256 bits
    mov         [rsp], rax                   ; remembered for the epilogue
    mov         rbp, rsp                     ; rbp = aligned rbp
    lea         rsp, [wk(0)]                 ; reserve wk[WK_NUM] below rbp
    collect_args 5
    push        rbx

    mov         ecx, r10d               ; rcx = img_width (zero-extended)
    test        rcx, rcx
    jz          near .return

    push        rcx

    mov         rsi, r12
    mov         ecx, r13d
    mov         rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
    lea         rdi, [rdi+rcx*SIZEOF_JSAMPROW]  ; rdi = &output_buf[0][output_row]

    pop         rcx

    mov         rsi, r11                ; rsi = input_buf
    mov         eax, r14d               ; rax = num_rows (upper 32 bits zeroed)
    test        eax, eax                ; num_rows is a signed int: test the 32-bit
    jle         near .return            ;   value so that negative counts exit too
.rowloop:
    push        rdi
    push        rsi
    push        rcx                     ; col

    mov         rsi, JSAMPROW [rsi]     ; inptr
    mov         rdi, JSAMPROW [rdi]     ; outptr0

    cmp         rcx, byte SIZEOF_YMMWORD
    jae         near .columnloop

%if RGB_PIXELSIZE == 3  ; ---------------

    ; Fewer than SIZEOF_YMMWORD pixels remain.  Gather the leftover bytes in
    ; power-of-two sized pieces, one per set bit of the byte count, starting
    ; from the end of the row so that nothing beyond it is read.
.column_ld1:
    push        rax                     ; rax (row counter) and rdx are scratch
    push        rdx                     ;   for the 1- and 2-byte pieces
    lea         rcx, [rcx+rcx*2]        ; imul ecx,RGB_PIXELSIZE
    test        cl, SIZEOF_BYTE
    jz          short .column_ld2
    sub         rcx, byte SIZEOF_BYTE
    movzx       rax, BYTE [rsi+rcx]
.column_ld2:
    test        cl, SIZEOF_WORD
    jz          short .column_ld4
    sub         rcx, byte SIZEOF_WORD
    movzx       rdx, WORD [rsi+rcx]
    shl         rax, WORD_BIT
    or          rax, rdx
.column_ld4:
    vmovd       xmmA, eax
    pop         rdx
    pop         rax
    test        cl, SIZEOF_DWORD
    jz          short .column_ld8
    sub         rcx, byte SIZEOF_DWORD
    vmovd       xmmF, XMM_DWORD [rsi+rcx]
    vpslldq     xmmA, xmmA, SIZEOF_DWORD
    vpor        xmmA, xmmA, xmmF
.column_ld8:
    test        cl, SIZEOF_MMWORD
    jz          short .column_ld16
    sub         rcx, byte SIZEOF_MMWORD
    vmovq       xmmB, XMM_MMWORD [rsi+rcx]
    vpslldq     xmmA, xmmA, SIZEOF_MMWORD
    vpor        xmmA, xmmA, xmmB
.column_ld16:
    test        cl, SIZEOF_XMMWORD
    jz          short .column_ld32
    sub         rcx, byte SIZEOF_XMMWORD
    vmovdqu     xmmB, XMMWORD [rsi+rcx]
    vperm2i128  ymmA, ymmA, ymmA, 1     ; move the bytes gathered so far to the upper lane
    vpor        ymmA, ymmA, ymmB
.column_ld32:
    test        cl, SIZEOF_YMMWORD
    jz          short .column_ld64
    sub         rcx, byte SIZEOF_YMMWORD
    vmovdqa     ymmF, ymmA
    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
.column_ld64:
    test        cl, 2*SIZEOF_YMMWORD
    mov         rcx, SIZEOF_YMMWORD     ; last group: the loop tail subtracts this to 0
                                        ; (mov leaves the flags from test intact)
    jz          short .rgb_gray_cnv
    vmovdqa     ymmB, ymmA
    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
    jmp         short .rgb_gray_cnv

.columnloop:
    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
    vmovdqu     ymmB, YMMWORD [rsi+2*SIZEOF_YMMWORD]

.rgb_gray_cnv:
    ; In the layout comments below, each entry is <byte-in-pixel><pixel index>.

    ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
    ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
    ; ymmF=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
    ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
    ; ymmB=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
    ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)

    vmovdqa     ymmC, ymmA
    vinserti128 ymmA, ymmF, xmmA, 0  ; ymmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05
                                     ;       0G 1G 2G 0H 1H 2H 0I 1I 2I 0J 1J 2J 0K 1K 2K 0L)
    vinserti128 ymmC, ymmC, xmmB, 0  ; ymmC=(1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q
                                     ;       15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
    vinserti128 ymmB, ymmB, xmmF, 0  ; ymmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F
                                     ;       2Q 0R 1R 2R 0S 1S 2S 0T 1T 2T 0U 1U 2U 0V 1V 2V)
    vperm2i128  ymmF, ymmC, ymmC, 1  ; ymmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A
                                     ;       1L 2L 0M 1M 2M 0N 1N 2N 0O 1O 2O 0P 1P 2P 0Q 1Q)

    vmovdqa     ymmG, ymmA
    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12
                                  ;       22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I)
    vpsrldq     ymmG, ymmG, 8     ; ymmG=(22 03 13 23 04 14 24 05 0G 1G 2G 0H 1H 2H 0I 1I
                                  ;       2I 0J 1J 2J 0K 1K 2K 0L -- -- -- -- -- -- -- --)

    vpunpckhbw  ymmA, ymmA, ymmF  ; ymmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A
                                  ;       0G 0O 1G 1O 2G 2O 0H 0P 1H 1P 2H 2P 0I 0Q 1I 1Q)
    vpslldq     ymmF, ymmF, 8     ; ymmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27
                                  ;       08 18 28 09 19 29 0A 1A 1L 2L 0M 1M 2M 0N 1N 2N)

    vpunpcklbw  ymmG, ymmG, ymmB  ; ymmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D
                                  ;       2I 2Q 0J 0R 1J 1R 2J 2R 0K 0S 1K 1S 2K 2S 0L 0T)
    vpunpckhbw  ymmF, ymmF, ymmB  ; ymmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F
                                  ;       1L 1T 2L 2T 0M 0U 1M 1U 2M 2U 0N 0V 1N 1V 2N 2V)

    vmovdqa     ymmD, ymmA
    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09
                                  ;       11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P)
    vpsrldq     ymmD, ymmD, 8     ; ymmD=(11 19 21 29 02 0A 12 1A 0G 0O 1G 1O 2G 2O 0H 0P
                                  ;       1H 1P 2H 2P 0I 0Q 1I 1Q -- -- -- -- -- -- -- --)

    vpunpckhbw  ymmA, ymmA, ymmG  ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D
                                  ;       0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 0H 0L 0P 0T)
    vpslldq     ymmG, ymmG, 8     ; ymmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B
                                  ;       04 0C 14 1C 24 2C 05 0D 2I 2Q 0J 0R 1J 1R 2J 2R)

    vpunpcklbw  ymmD, ymmD, ymmF  ; ymmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E
                                  ;       1H 1L 1P 1T 2H 2L 2P 2T 0I 0M 0Q 0U 1I 1M 1Q 1U)
    vpunpckhbw  ymmG, ymmG, ymmF  ; ymmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F
                                  ;       2I 2M 2Q 2U 0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V)

    vmovdqa     ymmE, ymmA
    vpslldq     ymmA, ymmA, 8     ; ymmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C
                                  ;       20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S)
    vpsrldq     ymmE, ymmE, 8     ; ymmE=(20 24 28 2C 01 05 09 0D 0G 0K 0O 0S 1G 1K 1O 1S
                                  ;       2G 2K 2O 2S 0H 0L 0P 0T -- -- -- -- -- -- -- --)

    vpunpckhbw  ymmA, ymmA, ymmD  ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
                                  ;       0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
    vpslldq     ymmD, ymmD, 8     ; ymmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D
                                  ;       02 06 0A 0E 12 16 1A 1E 1H 1L 1P 1T 2H 2L 2P 2T)

    vpunpcklbw  ymmE, ymmE, ymmG  ; ymmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F
                                  ;       2G 2I 2K 2M 2O 2Q 2S 2U 0H 0J 0L 0N 0P 0R 0T 0V)
    vpunpckhbw  ymmD, ymmD, ymmG  ; ymmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F
                                  ;       1H 1J 1L 1N 1P 1R 1T 1V 2H 2J 2L 2N 2P 2R 2T 2V)

    vpxor       ymmH, ymmH, ymmH  ; zero, used to widen bytes to words

    vmovdqa     ymmC, ymmA
    vpunpcklbw  ymmA, ymmA, ymmH  ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
    vpunpckhbw  ymmC, ymmC, ymmH  ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)

    vmovdqa     ymmB, ymmE
    vpunpcklbw  ymmE, ymmE, ymmH  ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
    vpunpckhbw  ymmB, ymmB, ymmH  ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)

    vmovdqa     ymmF, ymmD
    vpunpcklbw  ymmD, ymmD, ymmH  ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)
    vpunpckhbw  ymmF, ymmF, ymmH  ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)

%else  ; RGB_PIXELSIZE == 4 ; -----------

    ; Fewer than SIZEOF_YMMWORD pixels remain.  Here rcx stays a pixel count;
    ; the leftover pixels are gathered in groups of 1, 2, 4, 8 and 16, one
    ; group per set bit of the count, starting from the end of the row.
.column_ld1:
    test        cl, SIZEOF_XMMWORD/16
    jz          short .column_ld2
    sub         rcx, byte SIZEOF_XMMWORD/16
    vmovd       xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld2:
    test        cl, SIZEOF_XMMWORD/8
    jz          short .column_ld4
    sub         rcx, byte SIZEOF_XMMWORD/8
    vmovq       xmmF, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
    vpslldq     xmmA, xmmA, SIZEOF_MMWORD
    vpor        xmmA, xmmA, xmmF
.column_ld4:
    test        cl, SIZEOF_XMMWORD/4
    jz          short .column_ld8
    sub         rcx, byte SIZEOF_XMMWORD/4
    vmovdqa     xmmF, xmmA
    vperm2i128  ymmF, ymmF, ymmF, 1     ; move the bytes gathered so far to the upper lane
    vmovdqu     xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
    vpor        ymmA, ymmA, ymmF
.column_ld8:
    test        cl, SIZEOF_XMMWORD/2
    jz          short .column_ld16
    sub         rcx, byte SIZEOF_XMMWORD/2
    vmovdqa     ymmF, ymmA
    vmovdqu     ymmA, YMMWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld16:
    test        cl, SIZEOF_XMMWORD
    mov         rcx, SIZEOF_YMMWORD     ; last group: the loop tail subtracts this to 0
                                        ; (mov leaves the flags from test intact)
    jz          short .rgb_gray_cnv
    vmovdqa     ymmE, ymmA
    vmovdqa     ymmH, ymmF
    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
    jmp         short .rgb_gray_cnv

.columnloop:
    vmovdqu     ymmA, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     ymmF, YMMWORD [rsi+1*SIZEOF_YMMWORD]
    vmovdqu     ymmE, YMMWORD [rsi+2*SIZEOF_YMMWORD]
    vmovdqu     ymmH, YMMWORD [rsi+3*SIZEOF_YMMWORD]

.rgb_gray_cnv:
    ; In the layout comments below, each entry is <byte-in-pixel><pixel index>.

    ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
    ;       04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
    ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
    ;       0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
    ; ymmE=(0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J
    ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)
    ; ymmH=(0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R
    ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)

    vmovdqa     ymmB, ymmA
    vinserti128 ymmA, ymmA, xmmE, 1     ; ymmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
                                        ;       0G 1G 2G 3G 0H 1H 2H 3H 0I 1I 2I 3I 0J 1J 2J 3J)
    vperm2i128  ymmE, ymmB, ymmE, 0x31  ; ymmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
                                        ;       0K 1K 2K 3K 0L 1L 2L 3L 0M 1M 2M 3M 0N 1N 2N 3N)

    vmovdqa     ymmB, ymmF
    vinserti128 ymmF, ymmF, xmmH, 1     ; ymmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B
                                        ;       0O 1O 2O 3O 0P 1P 2P 3P 0Q 1Q 2Q 3Q 0R 1R 2R 3R)
    vperm2i128  ymmH, ymmB, ymmH, 0x31  ; ymmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F
                                        ;       0S 1S 2S 3S 0T 1T 2T 3T 0U 1U 2U 3U 0V 1V 2V 3V)

    vmovdqa     ymmD, ymmA
    vpunpcklbw  ymmA, ymmA, ymmE      ; ymmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35
                                      ;       0G 0K 1G 1K 2G 2K 3G 3K 0H 0L 1H 1L 2H 2L 3H 3L)
    vpunpckhbw  ymmD, ymmD, ymmE      ; ymmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37
                                      ;       0I 0M 1I 1M 2I 2M 3I 3M 0J 0N 1J 1N 2J 2N 3J 3N)

    vmovdqa     ymmC, ymmF
    vpunpcklbw  ymmF, ymmF, ymmH      ; ymmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D
                                      ;       0O 0S 1O 1S 2O 2S 3O 3S 0P 0T 1P 1T 2P 2T 3P 3T)
    vpunpckhbw  ymmC, ymmC, ymmH      ; ymmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F
                                      ;       0Q 0U 1Q 1U 2Q 2U 3Q 3U 0R 0V 1R 1V 2R 2V 3R 3V)

    vmovdqa     ymmB, ymmA
    vpunpcklwd  ymmA, ymmA, ymmF      ; ymmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C
                                      ;       0G 0K 0O 0S 1G 1K 1O 1S 2G 2K 2O 2S 3G 3K 3O 3S)
    vpunpckhwd  ymmB, ymmB, ymmF      ; ymmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D
                                      ;       0H 0L 0P 0T 1H 1L 1P 1T 2H 2L 2P 2T 3H 3L 3P 3T)

    vmovdqa     ymmG, ymmD
    vpunpcklwd  ymmD, ymmD, ymmC      ; ymmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E
                                      ;       0I 0M 0Q 0U 1I 1M 1Q 1U 2I 2M 2Q 2U 3I 3M 3Q 3U)
    vpunpckhwd  ymmG, ymmG, ymmC      ; ymmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F
                                      ;       0J 0N 0R 0V 1J 1N 1R 1V 2J 2N 2R 2V 3J 3N 3R 3V)

    vmovdqa     ymmE, ymmA
    vpunpcklbw  ymmA, ymmA, ymmD      ; ymmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E
                                      ;       0G 0I 0K 0M 0O 0Q 0S 0U 1G 1I 1K 1M 1O 1Q 1S 1U)
    vpunpckhbw  ymmE, ymmE, ymmD      ; ymmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E
                                      ;       2G 2I 2K 2M 2O 2Q 2S 2U 3G 3I 3K 3M 3O 3Q 3S 3U)

    vmovdqa     ymmH, ymmB
    vpunpcklbw  ymmB, ymmB, ymmG      ; ymmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F
                                      ;       0H 0J 0L 0N 0P 0R 0T 0V 1H 1J 1L 1N 1P 1R 1T 1V)
    vpunpckhbw  ymmH, ymmH, ymmG      ; ymmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F
                                      ;       2H 2J 2L 2N 2P 2R 2T 2V 3H 3J 3L 3N 3P 3R 3T 3V)

    vpxor       ymmF, ymmF, ymmF      ; zero, used to widen bytes to words

    vmovdqa     ymmC, ymmA
    vpunpcklbw  ymmA, ymmA, ymmF      ; ymmA=(00 02 04 06 08 0A 0C 0E 0G 0I 0K 0M 0O 0Q 0S 0U)
    vpunpckhbw  ymmC, ymmC, ymmF      ; ymmC=(10 12 14 16 18 1A 1C 1E 1G 1I 1K 1M 1O 1Q 1S 1U)

    vmovdqa     ymmD, ymmB
    vpunpcklbw  ymmB, ymmB, ymmF      ; ymmB=(01 03 05 07 09 0B 0D 0F 0H 0J 0L 0N 0P 0R 0T 0V)
    vpunpckhbw  ymmD, ymmD, ymmF      ; ymmD=(11 13 15 17 19 1B 1D 1F 1H 1J 1L 1N 1P 1R 1T 1V)

    vmovdqa     ymmG, ymmE
    vpunpcklbw  ymmE, ymmE, ymmF      ; ymmE=(20 22 24 26 28 2A 2C 2E 2G 2I 2K 2M 2O 2Q 2S 2U)
    vpunpckhbw  ymmG, ymmG, ymmF      ; ymmG=(30 32 34 36 38 3A 3C 3E 3G 3I 3K 3M 3O 3Q 3S 3U)

    ; No spare zero register is left for the last pair, so the bytes are
    ; unpacked into the high half of each word and shifted back down.
    vpunpcklbw  ymmF, ymmF, ymmH
    vpunpckhbw  ymmH, ymmH, ymmH
    vpsrlw      ymmF, ymmF, BYTE_BIT  ; ymmF=(21 23 25 27 29 2B 2D 2F 2H 2J 2L 2N 2P 2R 2T 2V)
    vpsrlw      ymmH, ymmH, BYTE_BIT  ; ymmH=(31 33 35 37 39 3B 3D 3F 3H 3J 3L 3N 3P 3R 3T 3V)

%endif  ; RGB_PIXELSIZE ; ---------------

    ; ymm0=R(02468ACEGIKMOQSU)=RE, ymm2=G(02468ACEGIKMOQSU)=GE, ymm4=B(02468ACEGIKMOQSU)=BE
    ; ymm1=R(13579BDFHJLNPRTV)=RO, ymm3=G(13579BDFHJLNPRTV)=GO, ymm5=B(13579BDFHJLNPRTV)=BO

    ; (Original)
    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
    ;
    ; (This implementation)
    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
    ;
    ; The G term is split in two so that each vpmaddwd can form one
    ; (R,G) or (B,G) pair sum per dword.

    vmovdqa     ymm6, ymm1
    vpunpcklwd  ymm1, ymm1, ymm3
    vpunpckhwd  ymm6, ymm6, ymm3
    vpmaddwd    ymm1, ymm1, [rel PW_F0299_F0337]  ; ymm1=ROL*FIX(0.299)+GOL*FIX(0.337)
    vpmaddwd    ymm6, ymm6, [rel PW_F0299_F0337]  ; ymm6=ROH*FIX(0.299)+GOH*FIX(0.337)

    vmovdqa     ymm7, ymm6              ; ymm7=ROH*FIX(0.299)+GOH*FIX(0.337)

    vmovdqa     ymm6, ymm0
    vpunpcklwd  ymm0, ymm0, ymm2
    vpunpckhwd  ymm6, ymm6, ymm2
    vpmaddwd    ymm0, ymm0, [rel PW_F0299_F0337]  ; ymm0=REL*FIX(0.299)+GEL*FIX(0.337)
    vpmaddwd    ymm6, ymm6, [rel PW_F0299_F0337]  ; ymm6=REH*FIX(0.299)+GEH*FIX(0.337)

    ; Spill the even-pixel partial sums; ymm0 and ymm6 are reused below.
    vmovdqa     YMMWORD [wk(0)], ymm0   ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
    vmovdqa     YMMWORD [wk(1)], ymm6   ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)

    vmovdqa     ymm0, ymm5              ; ymm0=BO
    vmovdqa     ymm6, ymm4              ; ymm6=BE

    vmovdqa     ymm4, ymm0
    vpunpcklwd  ymm0, ymm0, ymm3
    vpunpckhwd  ymm4, ymm4, ymm3
    vpmaddwd    ymm0, ymm0, [rel PW_F0114_F0250]  ; ymm0=BOL*FIX(0.114)+GOL*FIX(0.250)
    vpmaddwd    ymm4, ymm4, [rel PW_F0114_F0250]  ; ymm4=BOH*FIX(0.114)+GOH*FIX(0.250)

    vmovdqa     ymm3, [rel PD_ONEHALF]            ; ymm3=[PD_ONEHALF]

    vpaddd      ymm0, ymm0, ymm1
    vpaddd      ymm4, ymm4, ymm7
    vpaddd      ymm0, ymm0, ymm3        ; add the rounding term before the shift
    vpaddd      ymm4, ymm4, ymm3
    vpsrld      ymm0, ymm0, SCALEBITS   ; ymm0=YOL
    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=YOH
    vpackssdw   ymm0, ymm0, ymm4        ; ymm0=YO

    vmovdqa     ymm4, ymm6
    vpunpcklwd  ymm6, ymm6, ymm2
    vpunpckhwd  ymm4, ymm4, ymm2
    vpmaddwd    ymm6, ymm6, [rel PW_F0114_F0250]  ; ymm6=BEL*FIX(0.114)+GEL*FIX(0.250)
    vpmaddwd    ymm4, ymm4, [rel PW_F0114_F0250]  ; ymm4=BEH*FIX(0.114)+GEH*FIX(0.250)

    vmovdqa     ymm2, [rel PD_ONEHALF]            ; ymm2=[PD_ONEHALF]

    vpaddd      ymm6, ymm6, YMMWORD [wk(0)]
    vpaddd      ymm4, ymm4, YMMWORD [wk(1)]
    vpaddd      ymm6, ymm6, ymm2        ; add the rounding term before the shift
    vpaddd      ymm4, ymm4, ymm2
    vpsrld      ymm6, ymm6, SCALEBITS   ; ymm6=YEL
    vpsrld      ymm4, ymm4, SCALEBITS   ; ymm4=YEH
    vpackssdw   ymm6, ymm6, ymm4        ; ymm6=YE

    vpsllw      ymm0, ymm0, BYTE_BIT    ; odd-pixel Y into the high byte of each word
    vpor        ymm6, ymm6, ymm0        ; ymm6=Y
    vmovdqu     YMMWORD [rdi], ymm6     ; Save Y (always a full YMMWORD)

    sub         rcx, byte SIZEOF_YMMWORD
    add         rsi, RGB_PIXELSIZE*SIZEOF_YMMWORD  ; inptr
    add         rdi, byte SIZEOF_YMMWORD           ; outptr0
    cmp         rcx, byte SIZEOF_YMMWORD
    jae         near .columnloop        ; another full group
    test        rcx, rcx
    jnz         near .column_ld1        ; final partial group

    pop         rcx                     ; col
    pop         rsi
    pop         rdi

    add         rsi, byte SIZEOF_JSAMPROW  ; input_buf
    add         rdi, byte SIZEOF_JSAMPROW
    dec         rax                        ; num_rows
    jg          near .rowloop

.return:
    pop         rbx
    vzeroupper
    uncollect_args 5
    mov         rsp, rbp                ; rsp <- aligned rbp
    pop         rsp                     ; rsp <- unaligned rsp saved at [rbp]
    pop         rbp
    ret
436
; Pad out to a 32-byte boundary after the last instruction.
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32
440