1;
2; jcgryext.asm - grayscale colorspace conversion (64-bit SSE2)
3;
4; Copyright (C) 2011, 2016, D. R. Commander.
5;
6; Based on the x86 SIMD extension for IJG JPEG library
7; Copyright (C) 1999-2006, MIYASAKA Masaru.
8; For conditions of distribution and use, see copyright notice in jsimdext.inc
9;
10; This file should be assembled with NASM (Netwide Assembler),
11; can *not* be assembled with Microsoft's MASM or any compatible
12; assembler (including Borland's Turbo Assembler).
13; NASM is available from http://nasm.sourceforge.net/ or
14; http://sourceforge.net/project/showfiles.php?group_id=6208
15;
16; [TAB8]
17
18%include "jcolsamp.inc"
19
20; --------------------------------------------------------------------------
21;
22; Convert some rows of samples to the output colorspace.
23;
24; GLOBAL(void)
25; jsimd_rgb_gray_convert_sse2(JDIMENSION img_width, JSAMPARRAY input_buf,
26;                             JSAMPIMAGE output_buf, JDIMENSION output_row,
27;                             int num_rows);
28;
29
30; r10d = JDIMENSION img_width
31; r11 = JSAMPARRAY input_buf
32; r12 = JSAMPIMAGE output_buf
33; r13d = JDIMENSION output_row
34; r14d = int num_rows
35
; Scratch storage: WK_NUM xmmword-sized slots located directly below the
; aligned frame pointer.  wk(0) is the lowest-addressed slot
; (rbp - WK_NUM * SIZEOF_XMMWORD) and wk(WK_NUM - 1) ends at rbp.
%define WK_NUM  2
%define wk(i)   rbp - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
38
39    align       32
40    GLOBAL_FUNCTION(jsimd_rgb_gray_convert_sse2)
41
42EXTN(jsimd_rgb_gray_convert_sse2):
43    push        rbp
44    mov         rax, rsp                     ; rax = original rbp
45    sub         rsp, byte 4
46    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
47    mov         [rsp], rax
48    mov         rbp, rsp                     ; rbp = aligned rbp
49    lea         rsp, [wk(0)]
50    collect_args 5
51    push        rbx
52
53    mov         ecx, r10d
54    test        rcx, rcx
55    jz          near .return
56
57    push        rcx
58
59    mov         rsi, r12
60    mov         ecx, r13d
61    mov         rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]
62    lea         rdi, [rdi+rcx*SIZEOF_JSAMPROW]
63
64    pop         rcx
65
66    mov         rsi, r11
67    mov         eax, r14d
68    test        rax, rax
69    jle         near .return
70.rowloop:
71    push        rdi
72    push        rsi
73    push        rcx                     ; col
74
75    mov         rsi, JSAMPROW [rsi]     ; inptr
76    mov         rdi, JSAMPROW [rdi]     ; outptr0
77
78    cmp         rcx, byte SIZEOF_XMMWORD
79    jae         near .columnloop
80
81%if RGB_PIXELSIZE == 3  ; ---------------
82
83.column_ld1:
84    push        rax
85    push        rdx
86    lea         rcx, [rcx+rcx*2]        ; imul ecx,RGB_PIXELSIZE
87    test        cl, SIZEOF_BYTE
88    jz          short .column_ld2
89    sub         rcx, byte SIZEOF_BYTE
90    movzx       rax, BYTE [rsi+rcx]
91.column_ld2:
92    test        cl, SIZEOF_WORD
93    jz          short .column_ld4
94    sub         rcx, byte SIZEOF_WORD
95    movzx       rdx, WORD [rsi+rcx]
96    shl         rax, WORD_BIT
97    or          rax, rdx
98.column_ld4:
99    movd        xmmA, eax
100    pop         rdx
101    pop         rax
102    test        cl, SIZEOF_DWORD
103    jz          short .column_ld8
104    sub         rcx, byte SIZEOF_DWORD
105    movd        xmmF, XMM_DWORD [rsi+rcx]
106    pslldq      xmmA, SIZEOF_DWORD
107    por         xmmA, xmmF
108.column_ld8:
109    test        cl, SIZEOF_MMWORD
110    jz          short .column_ld16
111    sub         rcx, byte SIZEOF_MMWORD
112    movq        xmmB, XMM_MMWORD [rsi+rcx]
113    pslldq      xmmA, SIZEOF_MMWORD
114    por         xmmA, xmmB
115.column_ld16:
116    test        cl, SIZEOF_XMMWORD
117    jz          short .column_ld32
118    movdqa      xmmF, xmmA
119    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
120    mov         rcx, SIZEOF_XMMWORD
121    jmp         short .rgb_gray_cnv
122.column_ld32:
123    test        cl, 2*SIZEOF_XMMWORD
124    mov         rcx, SIZEOF_XMMWORD
125    jz          short .rgb_gray_cnv
126    movdqa      xmmB, xmmA
127    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
128    movdqu      xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
129    jmp         short .rgb_gray_cnv
130
131.columnloop:
132    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
133    movdqu      xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
134    movdqu      xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]
135
136.rgb_gray_cnv:
137    ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
138    ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
139    ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)
140
141    movdqa      xmmG, xmmA
142    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
143    psrldq      xmmG, 8     ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)
144
145    punpckhbw   xmmA, xmmF  ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
146    pslldq      xmmF, 8     ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)
147
148    punpcklbw   xmmG, xmmB  ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
149    punpckhbw   xmmF, xmmB  ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)
150
151    movdqa      xmmD, xmmA
152    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
153    psrldq      xmmD, 8     ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)
154
155    punpckhbw   xmmA, xmmG  ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
156    pslldq      xmmG, 8     ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)
157
158    punpcklbw   xmmD, xmmF  ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
159    punpckhbw   xmmG, xmmF  ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)
160
161    movdqa      xmmE, xmmA
162    pslldq      xmmA, 8     ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
163    psrldq      xmmE, 8     ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)
164
165    punpckhbw   xmmA, xmmD  ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
166    pslldq      xmmD, 8     ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)
167
168    punpcklbw   xmmE, xmmG  ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
169    punpckhbw   xmmD, xmmG  ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)
170
171    pxor        xmmH, xmmH
172
173    movdqa      xmmC, xmmA
174    punpcklbw   xmmA, xmmH  ; xmmA=(00 02 04 06 08 0A 0C 0E)
175    punpckhbw   xmmC, xmmH  ; xmmC=(10 12 14 16 18 1A 1C 1E)
176
177    movdqa      xmmB, xmmE
178    punpcklbw   xmmE, xmmH  ; xmmE=(20 22 24 26 28 2A 2C 2E)
179    punpckhbw   xmmB, xmmH  ; xmmB=(01 03 05 07 09 0B 0D 0F)
180
181    movdqa      xmmF, xmmD
182    punpcklbw   xmmD, xmmH  ; xmmD=(11 13 15 17 19 1B 1D 1F)
183    punpckhbw   xmmF, xmmH  ; xmmF=(21 23 25 27 29 2B 2D 2F)
184
185%else  ; RGB_PIXELSIZE == 4 ; -----------
186
187.column_ld1:
188    test        cl, SIZEOF_XMMWORD/16
189    jz          short .column_ld2
190    sub         rcx, byte SIZEOF_XMMWORD/16
191    movd        xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
192.column_ld2:
193    test        cl, SIZEOF_XMMWORD/8
194    jz          short .column_ld4
195    sub         rcx, byte SIZEOF_XMMWORD/8
196    movq        xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
197    pslldq      xmmA, SIZEOF_MMWORD
198    por         xmmA, xmmE
199.column_ld4:
200    test        cl, SIZEOF_XMMWORD/4
201    jz          short .column_ld8
202    sub         rcx, byte SIZEOF_XMMWORD/4
203    movdqa      xmmE, xmmA
204    movdqu      xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
205.column_ld8:
206    test        cl, SIZEOF_XMMWORD/2
207    mov         rcx, SIZEOF_XMMWORD
208    jz          short .rgb_gray_cnv
209    movdqa      xmmF, xmmA
210    movdqa      xmmH, xmmE
211    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
212    movdqu      xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
213    jmp         short .rgb_gray_cnv
214
215.columnloop:
216    movdqu      xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
217    movdqu      xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
218    movdqu      xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
219    movdqu      xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]
220
221.rgb_gray_cnv:
222    ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
223    ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
224    ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
225    ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)
226
227    movdqa      xmmD, xmmA
228    punpcklbw   xmmA, xmmE      ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
229    punpckhbw   xmmD, xmmE      ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)
230
231    movdqa      xmmC, xmmF
232    punpcklbw   xmmF, xmmH      ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
233    punpckhbw   xmmC, xmmH      ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)
234
235    movdqa      xmmB, xmmA
236    punpcklwd   xmmA, xmmF      ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
237    punpckhwd   xmmB, xmmF      ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)
238
239    movdqa      xmmG, xmmD
240    punpcklwd   xmmD, xmmC      ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
241    punpckhwd   xmmG, xmmC      ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)
242
243    movdqa      xmmE, xmmA
244    punpcklbw   xmmA, xmmD      ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
245    punpckhbw   xmmE, xmmD      ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)
246
247    movdqa      xmmH, xmmB
248    punpcklbw   xmmB, xmmG      ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
249    punpckhbw   xmmH, xmmG      ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)
250
251    pxor        xmmF, xmmF
252
253    movdqa      xmmC, xmmA
254    punpcklbw   xmmA, xmmF      ; xmmA=(00 02 04 06 08 0A 0C 0E)
255    punpckhbw   xmmC, xmmF      ; xmmC=(10 12 14 16 18 1A 1C 1E)
256
257    movdqa      xmmD, xmmB
258    punpcklbw   xmmB, xmmF      ; xmmB=(01 03 05 07 09 0B 0D 0F)
259    punpckhbw   xmmD, xmmF      ; xmmD=(11 13 15 17 19 1B 1D 1F)
260
261    movdqa      xmmG, xmmE
262    punpcklbw   xmmE, xmmF      ; xmmE=(20 22 24 26 28 2A 2C 2E)
263    punpckhbw   xmmG, xmmF      ; xmmG=(30 32 34 36 38 3A 3C 3E)
264
265    punpcklbw   xmmF, xmmH
266    punpckhbw   xmmH, xmmH
267    psrlw       xmmF, BYTE_BIT  ; xmmF=(21 23 25 27 29 2B 2D 2F)
268    psrlw       xmmH, BYTE_BIT  ; xmmH=(31 33 35 37 39 3B 3D 3F)
269
270%endif  ; RGB_PIXELSIZE ; ---------------
271
272    ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
273    ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO
274
275    ; (Original)
276    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
277    ;
278    ; (This implementation)
279    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
280
281    movdqa      xmm6, xmm1
282    punpcklwd   xmm1, xmm3
283    punpckhwd   xmm6, xmm3
284    pmaddwd     xmm1, [rel PW_F0299_F0337]  ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
285    pmaddwd     xmm6, [rel PW_F0299_F0337]  ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)
286
287    movdqa      xmm7, xmm6              ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)
288
289    movdqa      xmm6, xmm0
290    punpcklwd   xmm0, xmm2
291    punpckhwd   xmm6, xmm2
292    pmaddwd     xmm0, [rel PW_F0299_F0337]  ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
293    pmaddwd     xmm6, [rel PW_F0299_F0337]  ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)
294
295    movdqa      XMMWORD [wk(0)], xmm0   ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
296    movdqa      XMMWORD [wk(1)], xmm6   ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
297
298    movdqa      xmm0, xmm5              ; xmm0=BO
299    movdqa      xmm6, xmm4              ; xmm6=BE
300
301    movdqa      xmm4, xmm0
302    punpcklwd   xmm0, xmm3
303    punpckhwd   xmm4, xmm3
304    pmaddwd     xmm0, [rel PW_F0114_F0250]  ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
305    pmaddwd     xmm4, [rel PW_F0114_F0250]  ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)
306
307    movdqa      xmm3, [rel PD_ONEHALF]      ; xmm3=[PD_ONEHALF]
308
309    paddd       xmm0, xmm1
310    paddd       xmm4, xmm7
311    paddd       xmm0, xmm3
312    paddd       xmm4, xmm3
313    psrld       xmm0, SCALEBITS         ; xmm0=YOL
314    psrld       xmm4, SCALEBITS         ; xmm4=YOH
315    packssdw    xmm0, xmm4              ; xmm0=YO
316
317    movdqa      xmm4, xmm6
318    punpcklwd   xmm6, xmm2
319    punpckhwd   xmm4, xmm2
320    pmaddwd     xmm6, [rel PW_F0114_F0250]  ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
321    pmaddwd     xmm4, [rel PW_F0114_F0250]  ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)
322
323    movdqa      xmm2, [rel PD_ONEHALF]      ; xmm2=[PD_ONEHALF]
324
325    paddd       xmm6, XMMWORD [wk(0)]
326    paddd       xmm4, XMMWORD [wk(1)]
327    paddd       xmm6, xmm2
328    paddd       xmm4, xmm2
329    psrld       xmm6, SCALEBITS         ; xmm6=YEL
330    psrld       xmm4, SCALEBITS         ; xmm4=YEH
331    packssdw    xmm6, xmm4              ; xmm6=YE
332
333    psllw       xmm0, BYTE_BIT
334    por         xmm6, xmm0              ; xmm6=Y
335    movdqa      XMMWORD [rdi], xmm6     ; Save Y
336
337    sub         rcx, byte SIZEOF_XMMWORD
338    add         rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr
339    add         rdi, byte SIZEOF_XMMWORD                ; outptr0
340    cmp         rcx, byte SIZEOF_XMMWORD
341    jae         near .columnloop
342    test        rcx, rcx
343    jnz         near .column_ld1
344
345    pop         rcx                     ; col
346    pop         rsi
347    pop         rdi
348
349    add         rsi, byte SIZEOF_JSAMPROW  ; input_buf
350    add         rdi, byte SIZEOF_JSAMPROW
351    dec         rax                        ; num_rows
352    jg          near .rowloop
353
354.return:
355    pop         rbx
356    uncollect_args 5
357    mov         rsp, rbp                ; rsp <- aligned rbp
358    pop         rsp                     ; rsp <- original rbp
359    pop         rbp
360    ret
361
; Pad the end of this file's code to a 32-byte boundary.
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32
365