1;
2; jcgryext.asm - grayscale colorspace conversion (64-bit SSE2)
3;
4; Copyright (C) 2011, 2016, D. R. Commander.
5; Copyright (C) 2018, Matthias Räncker.
6;
7; Based on the x86 SIMD extension for IJG JPEG library
8; Copyright (C) 1999-2006, MIYASAKA Masaru.
9; For conditions of distribution and use, see copyright notice in jsimdext.inc
10;
11; This file should be assembled with NASM (Netwide Assembler); it
12; can *not* be assembled with Microsoft's MASM or any compatible
13; assembler (including Borland's Turbo Assembler).
14; NASM is available from http://nasm.sourceforge.net/ or
15; http://sourceforge.net/project/showfiles.php?group_id=6208
16
17%include "jcolsamp.inc"
18
19; --------------------------------------------------------------------------
20;
21; Convert some rows of samples to the output colorspace.
22;
23; GLOBAL(void)
24; jsimd_rgb_gray_convert_sse2(JDIMENSION img_width, JSAMPARRAY input_buf,
25;                             JSAMPIMAGE output_buf, JDIMENSION output_row,
26;                             int num_rows);
27;
28
29; r10d = JDIMENSION img_width
30; r11 = JSAMPARRAY input_buf
31; r12 = JSAMPIMAGE output_buf
32; r13d = JDIMENSION output_row
33; r14d = int num_rows
34
; Stack scratch array used by the function below.
;   wk(i) expands to the address expression of slot i in an array of WK_NUM
;   xmmword-sized slots located directly below rbp (wk(0) is the lowest
;   address, wk(WK_NUM-1) ends at rbp).  Use it inside brackets, e.g.
;   XMMWORD [wk(0)].  The function prologue aligns rbp to SIZEOF_XMMWORD
;   before any slot is touched, so the slots are accessed with movdqa.
;   WK_NUM is the number of slots.
35%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
36%define WK_NUM  2
37
; --------------------------------------------------------------------------
; jsimd_rgb_gray_convert_sse2
;
; Converts num_rows rows of interleaved pixels (RGB_PIXELSIZE bytes per
; pixel: 3, or otherwise 4) to luminance, 16 pixels per loop iteration:
;   Y = 0.299 * R + 0.587 * G + 0.114 * B   (fixed point, SCALEBITS
;                                            fractional bits, rounded)
; Input rows are taken from input_buf[0..]; output rows are written to
; output_buf[0][output_row..].
;
; Register roles inside the loops:
;   rax = rows remaining          rcx = columns remaining in this row
;   rsi = input pointer           rdi = output pointer
;   wk(0), wk(1) = aligned stack scratch (even-pixel R/G partial products)
;
; The function returns immediately when img_width == 0 or num_rows <= 0.
;
; Each 16-sample result is written with movdqa, i.e. an aligned store to the
; output row.  Input is read with unaligned loads.
;
; collect_args/uncollect_args, EXTN, GLOBAL_FUNCTION, rsip/rdip, the
; xmmA..xmmH register aliases and the PW_*/PD_* constants are defined outside
; this file (presumably by jcolsamp.inc and what it includes); according to
; the register table above, collect_args leaves the five arguments in
; r10..r14.
; --------------------------------------------------------------------------
38    align       32
39    GLOBAL_FUNCTION(jsimd_rgb_gray_convert_sse2)
40
41EXTN(jsimd_rgb_gray_convert_sse2):
    ; --- Prologue: build a 16-byte-aligned frame with the wk[] array ---
42    push        rbp
43    mov         rax, rsp                     ; rax = rsp after the push (address of the saved rbp)
44    sub         rsp, byte 4                  ; step below the saved rbp before aligning
45    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
46    mov         [rsp], rax                   ; remember the unaligned rsp for the epilogue
47    mov         rbp, rsp                     ; rbp = aligned frame pointer
48    lea         rsp, [wk(0)]                 ; reserve wk[WK_NUM] below rbp
49    collect_args 5
50    push        rbx
51
    ; --- Nothing to do for a zero-width image ---
52    mov         ecx, r10d                    ; rcx = img_width (zero-extended)
53    test        rcx, rcx
54    jz          near .return
55
56    push        rcx
57
    ; rdi = &output_buf[0][output_row]
58    mov         rsi, r12
59    mov         ecx, r13d
60    mov         rdip, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
61    lea         rdi, [rdi+rcx*SIZEOF_JSAMPROW]
62
63    pop         rcx                          ; rcx = img_width again
64
65    mov         rsi, r11                     ; rsi = input_buf
66    mov         eax, r14d                    ; rax = num_rows (zero-extended)
    ; num_rows is a signed int.  The 32-bit test sees its real sign, so jle
    ; rejects negative counts as well as zero; a 64-bit test of the
    ; zero-extended value could only ever detect zero.
67    test        eax, eax
68    jle         near .return
69.rowloop:
70    push        rdi
71    push        rsi
72    push        rcx                     ; col
73
74    mov         rsip, JSAMPROW [rsi]    ; inptr
75    mov         rdip, JSAMPROW [rdi]    ; outptr0
76
    ; Full 16-pixel groups go straight to .columnloop; a shorter row falls
    ; through to the partial loader at .column_ld1.
77    cmp         rcx, byte SIZEOF_XMMWORD
78    jae         near .columnloop
79
80%if RGB_PIXELSIZE == 3  ; ---------------
81
    ; Partial load (fewer than 16 pixels left): rcx becomes the remaining
    ; byte count, and each set bit of that count selects one 1/2/4/8/16/32-byte
    ; piece to read, working backwards from the end of the row.  The pieces
    ; are shifted/merged so the data ends up laid out as in .columnloop.
    ; rcx is then forced to SIZEOF_XMMWORD so the loop tail finishes the row.
82.column_ld1:
83    push        rax                     ; rax/rdx serve as scratch here
84    push        rdx
85    lea         rcx, [rcx+rcx*2]        ; imul ecx,RGB_PIXELSIZE
86    test        cl, SIZEOF_BYTE
87    jz          short .column_ld2
88    sub         rcx, byte SIZEOF_BYTE
89    movzx       rax, byte [rsi+rcx]
90.column_ld2:
91    test        cl, SIZEOF_WORD
92    jz          short .column_ld4
93    sub         rcx, byte SIZEOF_WORD
94    movzx       rdx, word [rsi+rcx]
95    shl         rax, WORD_BIT
96    or          rax, rdx
97.column_ld4:
98    movd        xmmA, eax
99    pop         rdx
100    pop         rax
101    test        cl, SIZEOF_DWORD
102    jz          short .column_ld8
103    sub         rcx, byte SIZEOF_DWORD
104    movd        xmmF, XMM_DWORD [rsi+rcx]
105    pslldq      xmmA, SIZEOF_DWORD
106    por         xmmA, xmmF
107.column_ld8:
108    test        cl, SIZEOF_MMWORD
109    jz          short .column_ld16
110    sub         rcx, byte SIZEOF_MMWORD
111    movq        xmmB, XMM_MMWORD [rsi+rcx]
112    pslldq      xmmA, SIZEOF_MMWORD
113    por         xmmA, xmmB
114.column_ld16:
115    test        cl, SIZEOF_XMMWORD
116    jz          short .column_ld32
117    movdqa      xmmF, xmmA
118    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
119    mov         rcx, SIZEOF_XMMWORD
120    jmp         short .rgb_gray_cnv
121.column_ld32:
122    test        cl, 2*SIZEOF_XMMWORD
123    mov         rcx, SIZEOF_XMMWORD     ; mov leaves the flags from test intact
124    jz          short .rgb_gray_cnv
125    movdqa      xmmB, xmmA
126    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
127    movdqu      xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
128    jmp         short .rgb_gray_cnv
129
    ; Full group: 16 pixels * 3 bytes = three unaligned xmmword loads.
130.columnloop:
131    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
132    movdqu      xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
133    movdqu      xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]
134
    ; De-interleave the packed 3-byte pixels.  In the lane diagrams the first
    ; hex digit is the component index (0..2) and the second the pixel
    ; index (0..F).
135.rgb_gray_cnv:
136    ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
137    ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
138    ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
139
140    movdqa      xmmG, xmmA
141    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
142    psrldq      xmmG, 8     ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
143
144    punpckhbw   xmmA, xmmF  ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
145    pslldq      xmmF, 8     ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
146
147    punpcklbw   xmmG, xmmB  ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
148    punpckhbw   xmmF, xmmB  ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
149
150    movdqa      xmmD, xmmA
151    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
152    psrldq      xmmD, 8     ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
153
154    punpckhbw   xmmA, xmmG  ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
155    pslldq      xmmG, 8     ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
156
157    punpcklbw   xmmD, xmmF  ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
158    punpckhbw   xmmG, xmmF  ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
159
160    movdqa      xmmE, xmmA
161    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
162    psrldq      xmmE, 8     ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
163
164    punpckhbw   xmmA, xmmD  ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
165    pslldq      xmmD, 8     ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
166
167    punpcklbw   xmmE, xmmG  ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
168    punpckhbw   xmmD, xmmG  ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
169
    ; Zero-extend each group of eight bytes to eight words.
170    pxor        xmmH, xmmH
171
172    movdqa      xmmC, xmmA
173    punpcklbw   xmmA, xmmH  ; xmmA=(00 02 04 06 08 0A 0C 0E)
174    punpckhbw   xmmC, xmmH  ; xmmC=(10 12 14 16 18 1A 1C 1E)
175
176    movdqa      xmmB, xmmE
177    punpcklbw   xmmE, xmmH  ; xmmE=(20 22 24 26 28 2A 2C 2E)
178    punpckhbw   xmmB, xmmH  ; xmmB=(01 03 05 07 09 0B 0D 0F)
179
180    movdqa      xmmF, xmmD
181    punpcklbw   xmmD, xmmH  ; xmmD=(11 13 15 17 19 1B 1D 1F)
182    punpckhbw   xmmF, xmmH  ; xmmF=(21 23 25 27 29 2B 2D 2F)
183
184%else  ; RGB_PIXELSIZE == 4 ; -----------
185
    ; Partial load (fewer than 16 pixels left): each set bit of the remaining
    ; pixel count in rcx selects a 1/2/4/8-pixel piece (4/8/16/32 bytes) to
    ; read, working backwards from the end of the row.  rcx is then forced to
    ; SIZEOF_XMMWORD so the loop tail finishes the row.
186.column_ld1:
187    test        cl, SIZEOF_XMMWORD/16
188    jz          short .column_ld2
189    sub         rcx, byte SIZEOF_XMMWORD/16
190    movd        xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
191.column_ld2:
192    test        cl, SIZEOF_XMMWORD/8
193    jz          short .column_ld4
194    sub         rcx, byte SIZEOF_XMMWORD/8
195    movq        xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
196    pslldq      xmmA, SIZEOF_MMWORD
197    por         xmmA, xmmE
198.column_ld4:
199    test        cl, SIZEOF_XMMWORD/4
200    jz          short .column_ld8
201    sub         rcx, byte SIZEOF_XMMWORD/4
202    movdqa      xmmE, xmmA
203    movdqu      xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
204.column_ld8:
205    test        cl, SIZEOF_XMMWORD/2
206    mov         rcx, SIZEOF_XMMWORD     ; mov leaves the flags from test intact
207    jz          short .rgb_gray_cnv
208    movdqa      xmmF, xmmA
209    movdqa      xmmH, xmmE
210    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
211    movdqu      xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
212    jmp         short .rgb_gray_cnv
213
    ; Full group: 16 pixels * 4 bytes = four unaligned xmmword loads.
214.columnloop:
215    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
216    movdqu      xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
217    movdqu      xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
218    movdqu      xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]
219
    ; De-interleave the packed 4-byte pixels.  In the lane diagrams the first
    ; hex digit is the component index (0..3) and the second the pixel
    ; index (0..F).
220.rgb_gray_cnv:
221    ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
222    ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
223    ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
224    ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
225
226    movdqa      xmmD, xmmA
227    punpcklbw   xmmA, xmmE      ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
228    punpckhbw   xmmD, xmmE      ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
229
230    movdqa      xmmC, xmmF
231    punpcklbw   xmmF, xmmH      ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
232    punpckhbw   xmmC, xmmH      ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
233
234    movdqa      xmmB, xmmA
235    punpcklwd   xmmA, xmmF      ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
236    punpckhwd   xmmB, xmmF      ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
237
238    movdqa      xmmG, xmmD
239    punpcklwd   xmmD, xmmC      ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
240    punpckhwd   xmmG, xmmC      ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
241
242    movdqa      xmmE, xmmA
243    punpcklbw   xmmA, xmmD      ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
244    punpckhbw   xmmE, xmmD      ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
245
246    movdqa      xmmH, xmmB
247    punpcklbw   xmmB, xmmG      ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
248    punpckhbw   xmmH, xmmG      ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
249
    ; Zero-extend each group of eight bytes to eight words.
250    pxor        xmmF, xmmF
251
252    movdqa      xmmC, xmmA
253    punpcklbw   xmmA, xmmF      ; xmmA=(00 02 04 06 08 0A 0C 0E)
254    punpckhbw   xmmC, xmmF      ; xmmC=(10 12 14 16 18 1A 1C 1E)
255
256    movdqa      xmmD, xmmB
257    punpcklbw   xmmB, xmmF      ; xmmB=(01 03 05 07 09 0B 0D 0F)
258    punpckhbw   xmmD, xmmF      ; xmmD=(11 13 15 17 19 1B 1D 1F)
259
260    movdqa      xmmG, xmmE
261    punpcklbw   xmmE, xmmF      ; xmmE=(20 22 24 26 28 2A 2C 2E)
262    punpckhbw   xmmG, xmmF      ; xmmG=(30 32 34 36 38 3A 3C 3E)
263
    ; xmmF (the zero register) is about to be overwritten, so the last pair is
    ; widened by placing each byte in the high half of a word and shifting it
    ; back down.
264    punpcklbw   xmmF, xmmH
265    punpckhbw   xmmH, xmmH
266    psrlw       xmmF, BYTE_BIT  ; xmmF=(21 23 25 27 29 2B 2D 2F)
267    psrlw       xmmH, BYTE_BIT  ; xmmH=(31 33 35 37 39 3B 3D 3F)
268
269%endif  ; RGB_PIXELSIZE ; ---------------
270
    ; From here on the code uses physical register names; the xmmA..xmmH
    ; aliases above are presumably mapped so that the components land as
    ; listed below (mapping defined outside this file).
271    ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
272    ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
273
274    ; (Original)
275    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
276    ;
277    ; (This implementation)
278    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
279
    ; Odd pixels: R*0.299 + G*0.337 (low four in xmm1, high four in xmm7)
280    movdqa      xmm6, xmm1
281    punpcklwd   xmm1, xmm3
282    punpckhwd   xmm6, xmm3
283    pmaddwd     xmm1, [rel PW_F0299_F0337]  ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
284    pmaddwd     xmm6, [rel PW_F0299_F0337]  ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
285
286    movdqa      xmm7, xmm6              ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)
287
    ; Even pixels: R*0.299 + G*0.337, spilled to wk[] to free registers
288    movdqa      xmm6, xmm0
289    punpcklwd   xmm0, xmm2
290    punpckhwd   xmm6, xmm2
291    pmaddwd     xmm0, [rel PW_F0299_F0337]  ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
292    pmaddwd     xmm6, [rel PW_F0299_F0337]  ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
293
294    movdqa      XMMWORD [wk(0)], xmm0   ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
295    movdqa      XMMWORD [wk(1)], xmm6   ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
296
297    movdqa      xmm0, xmm5              ; xmm0=BO
298    movdqa      xmm6, xmm4              ; xmm6=BE
299
    ; Odd pixels: B*0.114 + G*0.250, then sum, round and scale down
300    movdqa      xmm4, xmm0
301    punpcklwd   xmm0, xmm3
302    punpckhwd   xmm4, xmm3
303    pmaddwd     xmm0, [rel PW_F0114_F0250]  ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
304    pmaddwd     xmm4, [rel PW_F0114_F0250]  ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
305
306    movdqa      xmm3, [rel PD_ONEHALF]      ; xmm3=[PD_ONEHALF]
307
308    paddd       xmm0, xmm1
309    paddd       xmm4, xmm7
310    paddd       xmm0, xmm3
311    paddd       xmm4, xmm3
312    psrld       xmm0, SCALEBITS         ; xmm0=YOL
313    psrld       xmm4, SCALEBITS         ; xmm4=YOH
314    packssdw    xmm0, xmm4              ; xmm0=YO
315
    ; Even pixels: B*0.114 + G*0.250, then sum, round and scale down
316    movdqa      xmm4, xmm6
317    punpcklwd   xmm6, xmm2
318    punpckhwd   xmm4, xmm2
319    pmaddwd     xmm6, [rel PW_F0114_F0250]  ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
320    pmaddwd     xmm4, [rel PW_F0114_F0250]  ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
321
322    movdqa      xmm2, [rel PD_ONEHALF]      ; xmm2=[PD_ONEHALF]
323
324    paddd       xmm6, XMMWORD [wk(0)]
325    paddd       xmm4, XMMWORD [wk(1)]
326    paddd       xmm6, xmm2
327    paddd       xmm4, xmm2
328    psrld       xmm6, SCALEBITS         ; xmm6=YEL
329    psrld       xmm4, SCALEBITS         ; xmm4=YEH
330    packssdw    xmm6, xmm4              ; xmm6=YE
331
    ; Re-interleave: odd results go to the high byte of each word, even
    ; results stay in the low byte, giving 16 consecutive Y bytes.
332    psllw       xmm0, BYTE_BIT
333    por         xmm6, xmm0              ; xmm6=Y
334    movdqa      XMMWORD [rdi], xmm6     ; Save Y
335
    ; Advance by 16 pixels; loop on full groups, then handle a partial tail.
336    sub         rcx, byte SIZEOF_XMMWORD
337    add         rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr
338    add         rdi, byte SIZEOF_XMMWORD                ; outptr0
339    cmp         rcx, byte SIZEOF_XMMWORD
340    jae         near .columnloop
341    test        rcx, rcx
342    jnz         near .column_ld1
343
344    pop         rcx                     ; col
345    pop         rsi
346    pop         rdi
347
    ; Next entry in both row-pointer arrays.
348    add         rsi, byte SIZEOF_JSAMPROW  ; input_buf
349    add         rdi, byte SIZEOF_JSAMPROW
350    dec         rax                        ; num_rows
351    jg          near .rowloop
352
    ; --- Epilogue: undo the prologue in reverse order ---
353.return:
354    pop         rbx
355    uncollect_args 5
356    mov         rsp, rbp                ; rsp <- aligned rbp
357    pop         rsp                     ; rsp <- unaligned rsp saved by the prologue
358    pop         rbp
359    ret
360
361; For some reason, the OS X linker does not honor the request to align the
362; segment unless we do this.
; (The directive pads the end of the assembled output to a 32-byte boundary.)
363    align       32
364