;
; jcgryext.asm - grayscale colorspace conversion (64-bit SSE2)
;
; Copyright (C) 2011, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jcolsamp.inc"

; --------------------------------------------------------------------------
;
; Convert some rows of samples to the output colorspace.
;
; GLOBAL(void)
; jsimd_rgb_gray_convert_sse2 (JDIMENSION img_width,
;                              JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
;                              JDIMENSION output_row, int num_rows);
;
; For each of num_rows rows, reads img_width packed pixels of RGB_PIXELSIZE
; (3 or 4) bytes from input_buf[row] and writes one luminance byte per pixel
; to output_buf[0][output_row + row].
;
; Returns immediately when img_width == 0 or num_rows <= 0.
;
; Pixels are processed SIZEOF_XMMWORD at a time.  Every group, including the
; final partial one, is written with a full-width aligned store (movdqa), so
; each output row must be XMMWORD-aligned and must have room for img_width
; rounded up to a multiple of SIZEOF_XMMWORD.  The input row is never read
; past its last pixel.
;
; NOTE(review): xmmA..xmmH, EXTN, collect_args/uncollect_args, SCALEBITS and
; the PW_*/PD_* constants are defined outside this file; comments below that
; mention them describe how they are used here, not how they are defined.
;

; Register roles after collect_args (the macro is defined elsewhere and is
; presumably what loads these from the ABI argument registers):
;
; r10 = JDIMENSION img_width
; r11 = JSAMPARRAY input_buf
; r12 = JSAMPIMAGE output_buf
; r13 = JDIMENSION output_row
; r14 = int num_rows
;
; Register roles inside the loops:
;
; rax = rows remaining
; rcx = pixels remaining in the current row (bytes remaining while the
;       RGB_PIXELSIZE == 3 tail is being gathered)
; rsi = input_buf cursor (outer loop) / inptr (inner loop)
; rdi = output row-pointer cursor (outer loop) / outptr0 (inner loop)

%define wk(i)           rbp-(WK_NUM-(i))*SIZEOF_XMMWORD ; xmmword wk[WK_NUM]
%define WK_NUM          2

        align   16

        global  EXTN(jsimd_rgb_gray_convert_sse2)

EXTN(jsimd_rgb_gray_convert_sse2):
        push    rbp
        mov     rax,rsp                         ; rax = rsp after the push (points at saved rbp)
        sub     rsp, byte 4                     ; step below the saved rbp; the AND rounds this
                                                ;  down to a whole aligned slot (assumes rsp is
                                                ;  at least pointer-aligned on entry)
        and     rsp, byte (-SIZEOF_XMMWORD)     ; align to 128 bits
        mov     [rsp],rax                       ; stash the unaligned rsp for the epilogue
        mov     rbp,rsp                         ; rbp = aligned frame base
        lea     rsp, [wk(0)]                    ; reserve wk[WK_NUM] just below rbp
        collect_args
        push    rbx

        mov     ecx, r10d                       ; rcx = img_width (32-bit move clears bits 63:32)
        test    rcx,rcx
        jz      near .return

        push    rcx                             ; rcx is borrowed for output_row below

        mov     rsi, r12
        mov     ecx, r13d                       ; rcx = output_row, zero-extended
        mov     rdi, JSAMPARRAY [rsi+0*SIZEOF_JSAMPARRAY]  ; rdi = output_buf[0]
        lea     rdi, [rdi+rcx*SIZEOF_JSAMPROW]             ; rdi = &output_buf[0][output_row]

        pop     rcx                             ; rcx = img_width again

        mov     rsi, r11                        ; rsi = input_buf
        mov     eax, r14d                       ; rax = num_rows, zero-extended
        ; num_rows is a signed int.  After the zero-extending move above, a
        ; 64-bit test could never see it as negative, so the sign must be
        ; checked on the 32-bit register for "jle" to reject num_rows < 0.
        test    eax,eax
        jle     near .return
.rowloop:
        push    rdi
        push    rsi
        push    rcx                     ; col

        mov     rsi, JSAMPROW [rsi]     ; inptr
        mov     rdi, JSAMPROW [rdi]     ; outptr0

        cmp     rcx, byte SIZEOF_XMMWORD
        jae     near .columnloop

%if RGB_PIXELSIZE == 3 ; ---------------

        ; Fewer than SIZEOF_XMMWORD pixels remain.  Gather the remaining
        ; rcx*3 bytes into xmmA/xmmF/xmmB in pieces of 1, 2, 4, 8, 16 and 32
        ; bytes, working from the end of the row back toward inptr, so that
        ; no byte beyond the row is read.  Lanes not covered by the remaining
        ; pixels may hold stale register contents; they only influence output
        ; bytes past img_width.

.column_ld1:
        push    rax                     ; rax (row counter) and rdx are used as scratch
        push    rdx
        lea     rcx,[rcx+rcx*2]         ; imul ecx,RGB_PIXELSIZE
        test    cl, SIZEOF_BYTE
        jz      short .column_ld2
        sub     rcx, byte SIZEOF_BYTE
        movzx   rax, BYTE [rsi+rcx]
.column_ld2:
        test    cl, SIZEOF_WORD
        jz      short .column_ld4
        sub     rcx, byte SIZEOF_WORD
        movzx   rdx, WORD [rsi+rcx]
        shl     rax, WORD_BIT
        or      rax,rdx
.column_ld4:
        movd    xmmA,eax
        pop     rdx
        pop     rax
        test    cl, SIZEOF_DWORD
        jz      short .column_ld8
        sub     rcx, byte SIZEOF_DWORD
        movd    xmmF, XMM_DWORD [rsi+rcx]
        pslldq  xmmA, SIZEOF_DWORD
        por     xmmA,xmmF
.column_ld8:
        test    cl, SIZEOF_MMWORD
        jz      short .column_ld16
        sub     rcx, byte SIZEOF_MMWORD
        movq    xmmB, XMM_MMWORD [rsi+rcx]
        pslldq  xmmA, SIZEOF_MMWORD
        por     xmmA,xmmB
.column_ld16:
        test    cl, SIZEOF_XMMWORD
        jz      short .column_ld32
        movdqa  xmmF,xmmA
        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        mov     rcx, SIZEOF_XMMWORD     ; treat the tail as one full group
        jmp     short .rgb_gray_cnv
.column_ld32:
        test    cl, 2*SIZEOF_XMMWORD
        mov     rcx, SIZEOF_XMMWORD     ; treat the tail as one full group (mov keeps flags)
        jz      short .rgb_gray_cnv
        movdqa  xmmB,xmmA
        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqu  xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
        jmp     short .rgb_gray_cnv

.columnloop:
        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqu  xmmF, XMMWORD [rsi+1*SIZEOF_XMMWORD]
        movdqu  xmmB, XMMWORD [rsi+2*SIZEOF_XMMWORD]

.rgb_gray_cnv:
        ; De-interleave the packed 3-byte pixels.  In the lane diagrams the
        ; first digit is the component index within a pixel and the second
        ; (hex) digit is the pixel index within the group.
        ;
        ; xmmA=(00 10 20 01 11 21 02 12 22 03 13 23 04 14 24 05)
        ; xmmF=(15 25 06 16 26 07 17 27 08 18 28 09 19 29 0A 1A)
        ; xmmB=(2A 0B 1B 2B 0C 1C 2C 0D 1D 2D 0E 1E 2E 0F 1F 2F)

        movdqa    xmmG,xmmA
        pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 10 20 01 11 21 02 12)
        psrldq    xmmG,8        ; xmmG=(22 03 13 23 04 14 24 05 -- -- -- -- -- -- -- --)

        punpckhbw xmmA,xmmF     ; xmmA=(00 08 10 18 20 28 01 09 11 19 21 29 02 0A 12 1A)
        pslldq    xmmF,8        ; xmmF=(-- -- -- -- -- -- -- -- 15 25 06 16 26 07 17 27)

        punpcklbw xmmG,xmmB     ; xmmG=(22 2A 03 0B 13 1B 23 2B 04 0C 14 1C 24 2C 05 0D)
        punpckhbw xmmF,xmmB     ; xmmF=(15 1D 25 2D 06 0E 16 1E 26 2E 07 0F 17 1F 27 2F)

        movdqa    xmmD,xmmA
        pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 08 10 18 20 28 01 09)
        psrldq    xmmD,8        ; xmmD=(11 19 21 29 02 0A 12 1A -- -- -- -- -- -- -- --)

        punpckhbw xmmA,xmmG     ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 01 05 09 0D)
        pslldq    xmmG,8        ; xmmG=(-- -- -- -- -- -- -- -- 22 2A 03 0B 13 1B 23 2B)

        punpcklbw xmmD,xmmF     ; xmmD=(11 15 19 1D 21 25 29 2D 02 06 0A 0E 12 16 1A 1E)
        punpckhbw xmmG,xmmF     ; xmmG=(22 26 2A 2E 03 07 0B 0F 13 17 1B 1F 23 27 2B 2F)

        movdqa    xmmE,xmmA
        pslldq    xmmA,8        ; xmmA=(-- -- -- -- -- -- -- -- 00 04 08 0C 10 14 18 1C)
        psrldq    xmmE,8        ; xmmE=(20 24 28 2C 01 05 09 0D -- -- -- -- -- -- -- --)

        punpckhbw xmmA,xmmD     ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
        pslldq    xmmD,8        ; xmmD=(-- -- -- -- -- -- -- -- 11 15 19 1D 21 25 29 2D)

        punpcklbw xmmE,xmmG     ; xmmE=(20 22 24 26 28 2A 2C 2E 01 03 05 07 09 0B 0D 0F)
        punpckhbw xmmD,xmmG     ; xmmD=(11 13 15 17 19 1B 1D 1F 21 23 25 27 29 2B 2D 2F)

        pxor      xmmH,xmmH     ; zero, used to widen bytes to words

        movdqa    xmmC,xmmA
        punpcklbw xmmA,xmmH     ; xmmA=(00 02 04 06 08 0A 0C 0E)
        punpckhbw xmmC,xmmH     ; xmmC=(10 12 14 16 18 1A 1C 1E)

        movdqa    xmmB,xmmE
        punpcklbw xmmE,xmmH     ; xmmE=(20 22 24 26 28 2A 2C 2E)
        punpckhbw xmmB,xmmH     ; xmmB=(01 03 05 07 09 0B 0D 0F)

        movdqa    xmmF,xmmD
        punpcklbw xmmD,xmmH     ; xmmD=(11 13 15 17 19 1B 1D 1F)
        punpckhbw xmmF,xmmH     ; xmmF=(21 23 25 27 29 2B 2D 2F)

%else ; RGB_PIXELSIZE == 4 ; -----------

        ; Fewer than SIZEOF_XMMWORD pixels remain.  Gather the remaining rcx
        ; pixels into xmmA/xmmE/xmmF/xmmH in pieces of 1, 2, 4 and 8 pixels,
        ; working from the end of the row back toward inptr, so that no byte
        ; beyond the row is read.  Lanes not covered by the remaining pixels
        ; may hold stale register contents; they only influence output bytes
        ; past img_width.

.column_ld1:
        test    cl, SIZEOF_XMMWORD/16
        jz      short .column_ld2
        sub     rcx, byte SIZEOF_XMMWORD/16
        movd    xmmA, XMM_DWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld2:
        test    cl, SIZEOF_XMMWORD/8
        jz      short .column_ld4
        sub     rcx, byte SIZEOF_XMMWORD/8
        movq    xmmE, XMM_MMWORD [rsi+rcx*RGB_PIXELSIZE]
        pslldq  xmmA, SIZEOF_MMWORD
        por     xmmA,xmmE
.column_ld4:
        test    cl, SIZEOF_XMMWORD/4
        jz      short .column_ld8
        sub     rcx, byte SIZEOF_XMMWORD/4
        movdqa  xmmE,xmmA
        movdqu  xmmA, XMMWORD [rsi+rcx*RGB_PIXELSIZE]
.column_ld8:
        test    cl, SIZEOF_XMMWORD/2
        mov     rcx, SIZEOF_XMMWORD     ; treat the tail as one full group (mov keeps flags)
        jz      short .rgb_gray_cnv
        movdqa  xmmF,xmmA
        movdqa  xmmH,xmmE
        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqu  xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
        jmp     short .rgb_gray_cnv

.columnloop:
        movdqu  xmmA, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqu  xmmE, XMMWORD [rsi+1*SIZEOF_XMMWORD]
        movdqu  xmmF, XMMWORD [rsi+2*SIZEOF_XMMWORD]
        movdqu  xmmH, XMMWORD [rsi+3*SIZEOF_XMMWORD]

.rgb_gray_cnv:
        ; De-interleave the packed 4-byte pixels.  In the lane diagrams the
        ; first digit is the component index within a pixel and the second
        ; (hex) digit is the pixel index within the group.
        ;
        ; xmmA=(00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33)
        ; xmmE=(04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37)
        ; xmmF=(08 18 28 38 09 19 29 39 0A 1A 2A 3A 0B 1B 2B 3B)
        ; xmmH=(0C 1C 2C 3C 0D 1D 2D 3D 0E 1E 2E 3E 0F 1F 2F 3F)

        movdqa    xmmD,xmmA
        punpcklbw xmmA,xmmE     ; xmmA=(00 04 10 14 20 24 30 34 01 05 11 15 21 25 31 35)
        punpckhbw xmmD,xmmE     ; xmmD=(02 06 12 16 22 26 32 36 03 07 13 17 23 27 33 37)

        movdqa    xmmC,xmmF
        punpcklbw xmmF,xmmH     ; xmmF=(08 0C 18 1C 28 2C 38 3C 09 0D 19 1D 29 2D 39 3D)
        punpckhbw xmmC,xmmH     ; xmmC=(0A 0E 1A 1E 2A 2E 3A 3E 0B 0F 1B 1F 2B 2F 3B 3F)

        movdqa    xmmB,xmmA
        punpcklwd xmmA,xmmF     ; xmmA=(00 04 08 0C 10 14 18 1C 20 24 28 2C 30 34 38 3C)
        punpckhwd xmmB,xmmF     ; xmmB=(01 05 09 0D 11 15 19 1D 21 25 29 2D 31 35 39 3D)

        movdqa    xmmG,xmmD
        punpcklwd xmmD,xmmC     ; xmmD=(02 06 0A 0E 12 16 1A 1E 22 26 2A 2E 32 36 3A 3E)
        punpckhwd xmmG,xmmC     ; xmmG=(03 07 0B 0F 13 17 1B 1F 23 27 2B 2F 33 37 3B 3F)

        movdqa    xmmE,xmmA
        punpcklbw xmmA,xmmD     ; xmmA=(00 02 04 06 08 0A 0C 0E 10 12 14 16 18 1A 1C 1E)
        punpckhbw xmmE,xmmD     ; xmmE=(20 22 24 26 28 2A 2C 2E 30 32 34 36 38 3A 3C 3E)

        movdqa    xmmH,xmmB
        punpcklbw xmmB,xmmG     ; xmmB=(01 03 05 07 09 0B 0D 0F 11 13 15 17 19 1B 1D 1F)
        punpckhbw xmmH,xmmG     ; xmmH=(21 23 25 27 29 2B 2D 2F 31 33 35 37 39 3B 3D 3F)

        pxor      xmmF,xmmF     ; zero, used to widen bytes to words

        movdqa    xmmC,xmmA
        punpcklbw xmmA,xmmF     ; xmmA=(00 02 04 06 08 0A 0C 0E)
        punpckhbw xmmC,xmmF     ; xmmC=(10 12 14 16 18 1A 1C 1E)

        movdqa    xmmD,xmmB
        punpcklbw xmmB,xmmF     ; xmmB=(01 03 05 07 09 0B 0D 0F)
        punpckhbw xmmD,xmmF     ; xmmD=(11 13 15 17 19 1B 1D 1F)

        movdqa    xmmG,xmmE
        punpcklbw xmmE,xmmF     ; xmmE=(20 22 24 26 28 2A 2C 2E)
        punpckhbw xmmG,xmmF     ; xmmG=(30 32 34 36 38 3A 3C 3E)

        ; xmmF is the zero register and is also a destination here: the data
        ; bytes land in the high byte of each word, then the shift moves them
        ; down, which widens them without needing another zero register.
        punpcklbw xmmF,xmmH
        punpckhbw xmmH,xmmH
        psrlw     xmmF,BYTE_BIT ; xmmF=(21 23 25 27 29 2B 2D 2F)
        psrlw     xmmH,BYTE_BIT ; xmmH=(31 33 35 37 39 3B 3D 3F)

%endif ; RGB_PIXELSIZE ; ---------------

        ; From here on the registers are addressed by number.  xmmA..xmmH are
        ; aliases defined outside this file, presumably chosen so that the
        ; following holds for the configured component order:
        ;
        ; xmm0=R(02468ACE)=RE, xmm2=G(02468ACE)=GE, xmm4=B(02468ACE)=BE
        ; xmm1=R(13579BDF)=RO, xmm3=G(13579BDF)=GO, xmm5=B(13579BDF)=BO

        ; (Original)
        ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
        ;
        ; (This implementation)
        ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
        ;
        ; The G term is split across the two pmaddwd constant pairs used
        ; below (presumably so every coefficient fits a signed 16-bit word).

        movdqa    xmm6,xmm1
        punpcklwd xmm1,xmm3
        punpckhwd xmm6,xmm3
        pmaddwd   xmm1,[rel PW_F0299_F0337] ; xmm1=ROL*FIX(0.299)+GOL*FIX(0.337)
        pmaddwd   xmm6,[rel PW_F0299_F0337] ; xmm6=ROH*FIX(0.299)+GOH*FIX(0.337)

        movdqa    xmm7, xmm6    ; xmm7=ROH*FIX(0.299)+GOH*FIX(0.337)

        movdqa    xmm6,xmm0
        punpcklwd xmm0,xmm2
        punpckhwd xmm6,xmm2
        pmaddwd   xmm0,[rel PW_F0299_F0337] ; xmm0=REL*FIX(0.299)+GEL*FIX(0.337)
        pmaddwd   xmm6,[rel PW_F0299_F0337] ; xmm6=REH*FIX(0.299)+GEH*FIX(0.337)

        ; Spill the even-pixel partial sums; xmm0 and xmm6 are reused below.
        movdqa    XMMWORD [wk(0)], xmm0 ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
        movdqa    XMMWORD [wk(1)], xmm6 ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)

        movdqa    xmm0, xmm5    ; xmm0=BO
        movdqa    xmm6, xmm4    ; xmm6=BE

        movdqa    xmm4,xmm0
        punpcklwd xmm0,xmm3
        punpckhwd xmm4,xmm3
        pmaddwd   xmm0,[rel PW_F0114_F0250] ; xmm0=BOL*FIX(0.114)+GOL*FIX(0.250)
        pmaddwd   xmm4,[rel PW_F0114_F0250] ; xmm4=BOH*FIX(0.114)+GOH*FIX(0.250)

        movdqa    xmm3,[rel PD_ONEHALF] ; xmm3=[PD_ONEHALF]

        paddd     xmm0, xmm1
        paddd     xmm4, xmm7
        paddd     xmm0,xmm3             ; add the rounding term before the shift
        paddd     xmm4,xmm3
        psrld     xmm0,SCALEBITS        ; xmm0=YOL
        psrld     xmm4,SCALEBITS        ; xmm4=YOH
        packssdw  xmm0,xmm4             ; xmm0=YO

        movdqa    xmm4,xmm6
        punpcklwd xmm6,xmm2
        punpckhwd xmm4,xmm2
        pmaddwd   xmm6,[rel PW_F0114_F0250] ; xmm6=BEL*FIX(0.114)+GEL*FIX(0.250)
        pmaddwd   xmm4,[rel PW_F0114_F0250] ; xmm4=BEH*FIX(0.114)+GEH*FIX(0.250)

        movdqa    xmm2,[rel PD_ONEHALF] ; xmm2=[PD_ONEHALF]

        paddd     xmm6, XMMWORD [wk(0)]
        paddd     xmm4, XMMWORD [wk(1)]
        paddd     xmm6,xmm2             ; add the rounding term before the shift
        paddd     xmm4,xmm2
        psrld     xmm6,SCALEBITS        ; xmm6=YEL
        psrld     xmm4,SCALEBITS        ; xmm4=YEH
        packssdw  xmm6,xmm4             ; xmm6=YE

        ; Re-interleave: even-pixel Y stays in the low byte of each word and
        ; odd-pixel Y is moved into the high byte.
        psllw     xmm0,BYTE_BIT
        por       xmm6,xmm0             ; xmm6=Y
        movdqa    XMMWORD [rdi], xmm6   ; Save Y

        sub     rcx, byte SIZEOF_XMMWORD
        add     rsi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; inptr
        add     rdi, byte SIZEOF_XMMWORD                ; outptr0
        cmp     rcx, byte SIZEOF_XMMWORD
        jae     near .columnloop        ; at least one more full group
        test    rcx,rcx
        jnz     near .column_ld1        ; partial group left (the tail path
                                        ;  sets rcx to one full group, so it
                                        ;  reaches zero after this pass)

        pop     rcx                     ; col
        pop     rsi
        pop     rdi

        add     rsi, byte SIZEOF_JSAMPROW       ; input_buf
        add     rdi, byte SIZEOF_JSAMPROW
        dec     rax                             ; num_rows
        jg      near .rowloop

.return:
        pop     rbx
        uncollect_args
        mov     rsp,rbp         ; rsp <- aligned rbp
        pop     rsp             ; rsp <- unaligned rsp stashed by the prologue
        pop     rbp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16
