1;
2; jcgryext.asm - grayscale colorspace conversion (MMX)
3;
4; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5; Copyright (C) 2011, 2016, D. R. Commander.
6;
7; Based on the x86 SIMD extension for IJG JPEG library
8; Copyright (C) 1999-2006, MIYASAKA Masaru.
9; For conditions of distribution and use, see copyright notice in jsimdext.inc
10;
11; This file should be assembled with NASM (Netwide Assembler); it can *not*
12; be assembled with Microsoft's MASM or any compatible assembler (including
13; Borland's Turbo Assembler).
14; NASM is available from http://nasm.sourceforge.net/ or
15; http://sourceforge.net/project/showfiles.php?group_id=6208
16
17%include "jcolsamp.inc"
18
19; --------------------------------------------------------------------------
20;
21; Convert some rows of samples to the output colorspace.
22;
23; GLOBAL(void)
24; jsimd_rgb_gray_convert_mmx(JDIMENSION img_width, JSAMPARRAY input_buf,
25;                            JSAMPIMAGE output_buf, JDIMENSION output_row,
26;                            int num_rows);
27;
28
; Argument accessors.  Each macro yields the address expression of one
; parameter relative to a base register b.  The function below passes eax,
; which it loads from esp right after "push ebp", so [b+0] is the saved ebp
; and the first parameter sits at [b+8].
29%define img_width(b)   (b) + 8          ; JDIMENSION img_width
30%define input_buf(b)   (b) + 12         ; JSAMPARRAY input_buf
31%define output_buf(b)  (b) + 16         ; JSAMPIMAGE output_buf
32%define output_row(b)  (b) + 20         ; JDIMENSION output_row
33%define num_rows(b)    (b) + 24         ; int num_rows
34
; Locals, addressed from the *aligned* ebp that the prologue sets up:
;   [ebp+0]  slot where the prologue stores the pre-alignment stack pointer
;            (original_ebp is not referenced by name in the code visible
;            here; the epilogue reaches the slot with "pop esp")
;   wk(i)    WK_NUM mmword scratch slots directly below ebp; wk(0) is the
;            lowest address, wk(WK_NUM-1) ends at ebp
;   gotptr   one pointer-sized slot directly below wk(0)
; wk() mentions WK_NUM before the %define that follows it; this works because
; a %define body is expanded where the macro is used, not where it is defined.
35%define original_ebp   ebp + 0
36%define wk(i)          ebp - (WK_NUM - (i)) * SIZEOF_MMWORD
37                                        ; mmword wk[WK_NUM]
38%define WK_NUM         2
39%define gotptr         wk(0) - SIZEOF_POINTER  ; void * gotptr
40
; --------------------------------------------------------------------------
; jsimd_rgb_gray_convert_mmx -- 32-bit x86 MMX conversion of RGB rows to
; grayscale (Y) rows.
;
; Arguments are read from the stack through the img_width()/input_buf()/
; output_buf()/output_row()/num_rows() offset macros, with eax as the base.
;
; Register roles, as used below:
;   eax  argument base during setup, then the remaining-row counter; inside
;        a row it is reloaded from gotptr via movpic and passed to GOTOFF()
;   ecx  img_width; within a row, the number of columns still to convert
;   esi  cursor into input_buf[]; within a row, inptr (source pixels)
;   edi  cursor into output_buf[0][], starting at output_row; within a row,
;        outptr0 (destination Y samples)
;   ebx  only receives the GOT address from get_GOT
;   ebp  aligned frame pointer for wk(0), wk(1) and gotptr
;   ebx, esi, edi and ebp are saved and restored; eax, ecx, edx are not.
;
; pushpic/poppic/movpic/get_GOT/GOTOFF/alignx, the mmA..mmH register aliases
; and the PW_*/PD_* constants are defined outside this file (presumably in
; the included headers) -- their expansions are not visible here.
;
; Every pass through .rgb_gray_cnv stores a full mmword of Y samples, also
; for the final partial group of a row (ecx is forced to SIZEOF_MMWORD there).
; NOTE(review): this presumably relies on the output rows being padded to a
; multiple of SIZEOF_MMWORD samples -- confirm against the buffer allocation.
; --------------------------------------------------------------------------
41    align       32
42    GLOBAL_FUNCTION(jsimd_rgb_gray_convert_mmx)
43
44EXTN(jsimd_rgb_gray_convert_mmx):
    ; --- Prologue: build a frame whose ebp is aligned to SIZEOF_MMWORD ---
45    push        ebp
46    mov         eax, esp                    ; eax = esp after the push: [eax] = saved ebp, args start at [eax+8]
47    sub         esp, byte 4                 ; room to store eax below the saved ebp
48    and         esp, byte (-SIZEOF_MMWORD)  ; align to 64 bits
49    mov         [esp], eax                  ; keep the unaligned pointer so the epilogue can "pop esp"
50    mov         ebp, esp                    ; ebp = aligned ebp
51    lea         esp, [wk(0)]                ; reserve the wk[] scratch area below ebp
52    pushpic     eax                     ; make room for the GOT address
53    push        ebx
54;   push        ecx                     ; need not be preserved
55;   push        edx                     ; need not be preserved
56    push        esi
57    push        edi
58
59    get_GOT     ebx                     ; get GOT address
60    movpic      POINTER [gotptr], ebx   ; save GOT address
61
62    mov         ecx, JDIMENSION [img_width(eax)]  ; num_cols
63    test        ecx, ecx
64    jz          near .return            ; zero-width image: nothing to do (no MMX state touched yet)
65
    ; ecx is borrowed for output_row while edi is computed, then restored.
66    push        ecx
67
68    mov         esi, JSAMPIMAGE [output_buf(eax)]
69    mov         ecx, JDIMENSION [output_row(eax)]
70    mov         edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]  ; first entry of output_buf
71    lea         edi, [edi+ecx*SIZEOF_JSAMPROW]             ; edi = &output_buf[0][output_row]
72
73    pop         ecx
74
75    mov         esi, JSAMPARRAY [input_buf(eax)]
76    mov         eax, INT [num_rows(eax)]    ; eax stops being the argument base here
77    test        eax, eax
78    jle         near .return            ; num_rows <= 0 (signed): nothing to do
79    alignx      16, 7
80.rowloop:
    ; Save the per-row state; it is restored after the column loop.
81    pushpic     eax
82    push        edi
83    push        esi
84    push        ecx                     ; col
85
86    mov         esi, JSAMPROW [esi]     ; inptr
87    mov         edi, JSAMPROW [edi]     ; outptr0
88    movpic      eax, POINTER [gotptr]   ; load GOT address (eax)
89
    ; Rows with at least SIZEOF_MMWORD columns start in the full-width loop;
    ; narrower rows fall through into the partial-load path.
90    cmp         ecx, byte SIZEOF_MMWORD
91    jae         short .columnloop
92    alignx      16, 7
93
94%if RGB_PIXELSIZE == 3  ; ---------------
95
    ; Partial group (fewer than SIZEOF_MMWORD columns left), 3 bytes per pixel.
    ; ecx is turned into a byte count and the bytes are gathered from the end
    ; of the data backwards, one power-of-two sized piece per set bit of cl,
    ; so that no load reaches past the last source byte.
96.column_ld1:
97    push        eax                     ; eax/edx serve as scratch for the byte/word pieces
98    push        edx
99    lea         ecx, [ecx+ecx*2]        ; imul ecx,RGB_PIXELSIZE
100    test        cl, SIZEOF_BYTE
101    jz          short .column_ld2
102    sub         ecx, byte SIZEOF_BYTE
103    xor         eax, eax
104    mov         al, byte [esi+ecx]      ; trailing single byte
105.column_ld2:
    ; NOTE(review): when the byte piece above was skipped, eax has not been
    ; cleared, so the bits it contributes below are leftovers.  They appear to
    ; land only in positions beyond the valid pixels -- confirm if it matters.
106    test        cl, SIZEOF_WORD
107    jz          short .column_ld4
108    sub         ecx, byte SIZEOF_WORD
109    xor         edx, edx
110    mov         dx, word [esi+ecx]
111    shl         eax, WORD_BIT           ; earlier piece moves above the word just loaded
112    or          eax, edx
113.column_ld4:
114    movd        mmA, eax
115    pop         edx
116    pop         eax
117    test        cl, SIZEOF_DWORD
118    jz          short .column_ld8
119    sub         ecx, byte SIZEOF_DWORD
120    movd        mmG, dword [esi+ecx]
121    psllq       mmA, DWORD_BIT          ; earlier pieces move above the dword just loaded
122    por         mmA, mmG
123.column_ld8:
124    test        cl, SIZEOF_MMWORD
125    jz          short .column_ld16
126    movq        mmG, mmA                ; gathered tail becomes the second mmword
127    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
128    mov         ecx, SIZEOF_MMWORD      ; count this group as a full one for the loop bookkeeping
129    jmp         short .rgb_gray_cnv
130.column_ld16:
131    test        cl, 2*SIZEOF_MMWORD
132    mov         ecx, SIZEOF_MMWORD      ; mov leaves the flags from the test intact for the jz
133    jz          short .rgb_gray_cnv
134    movq        mmF, mmA                ; gathered tail becomes the third mmword
135    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
136    movq        mmG, MMWORD [esi+1*SIZEOF_MMWORD]
137    jmp         short .rgb_gray_cnv
138    alignx      16, 7
139
    ; Full group: three mmwords of packed 3-byte pixels.
140.columnloop:
141    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
142    movq        mmG, MMWORD [esi+1*SIZEOF_MMWORD]
143    movq        mmF, MMWORD [esi+2*SIZEOF_MMWORD]
144
    ; De-interleave the packed pixels.  In the layout comments each two-digit
    ; entry is <byte index within pixel><pixel index>.
145.rgb_gray_cnv:
146    ; mmA=(00 10 20 01 11 21 02 12)
147    ; mmG=(22 03 13 23 04 14 24 05)
148    ; mmF=(15 25 06 16 26 07 17 27)
149
150    movq        mmD, mmA
151    psllq       mmA, 4*BYTE_BIT         ; mmA=(-- -- -- -- 00 10 20 01)
152    psrlq       mmD, 4*BYTE_BIT         ; mmD=(11 21 02 12 -- -- -- --)
153
154    punpckhbw   mmA, mmG                ; mmA=(00 04 10 14 20 24 01 05)
155    psllq       mmG, 4*BYTE_BIT         ; mmG=(-- -- -- -- 22 03 13 23)
156
157    punpcklbw   mmD, mmF                ; mmD=(11 15 21 25 02 06 12 16)
158    punpckhbw   mmG, mmF                ; mmG=(22 26 03 07 13 17 23 27)
159
160    movq        mmE, mmA
161    psllq       mmA, 4*BYTE_BIT         ; mmA=(-- -- -- -- 00 04 10 14)
162    psrlq       mmE, 4*BYTE_BIT         ; mmE=(20 24 01 05 -- -- -- --)
163
164    punpckhbw   mmA, mmD                ; mmA=(00 02 04 06 10 12 14 16)
165    psllq       mmD, 4*BYTE_BIT         ; mmD=(-- -- -- -- 11 15 21 25)
166
167    punpcklbw   mmE, mmG                ; mmE=(20 22 24 26 01 03 05 07)
168    punpckhbw   mmD, mmG                ; mmD=(11 13 15 17 21 23 25 27)
169
    ; Widen every byte to a 16-bit word by interleaving with zero.
170    pxor        mmH, mmH
171
172    movq        mmC, mmA
173    punpcklbw   mmA, mmH                ; mmA=(00 02 04 06)
174    punpckhbw   mmC, mmH                ; mmC=(10 12 14 16)
175
176    movq        mmB, mmE
177    punpcklbw   mmE, mmH                ; mmE=(20 22 24 26)
178    punpckhbw   mmB, mmH                ; mmB=(01 03 05 07)
179
180    movq        mmF, mmD
181    punpcklbw   mmD, mmH                ; mmD=(11 13 15 17)
182    punpckhbw   mmF, mmH                ; mmF=(21 23 25 27)
183
184%else  ; RGB_PIXELSIZE == 4 ; -----------
185
    ; Partial group, 4 bytes per pixel.  ecx stays a column count; its low
    ; bits select which pieces to load, again from the end backwards.
186.column_ld1:
187    test        cl, SIZEOF_MMWORD/8
188    jz          short .column_ld2
189    sub         ecx, byte SIZEOF_MMWORD/8
190    movd        mmA, dword [esi+ecx*RGB_PIXELSIZE]   ; one dword-sized piece
191.column_ld2:
192    test        cl, SIZEOF_MMWORD/4
193    jz          short .column_ld4
194    sub         ecx, byte SIZEOF_MMWORD/4
195    movq        mmF, mmA                ; earlier piece moves up one register
196    movq        mmA, MMWORD [esi+ecx*RGB_PIXELSIZE]  ; one mmword-sized piece
197.column_ld4:
198    test        cl, SIZEOF_MMWORD/2
199    mov         ecx, SIZEOF_MMWORD      ; count this group as a full one; flags from the test survive the mov
200    jz          short .rgb_gray_cnv
201    movq        mmD, mmA                ; earlier pieces become the third and fourth mmwords
202    movq        mmC, mmF
203    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
204    movq        mmF, MMWORD [esi+1*SIZEOF_MMWORD]
205    jmp         short .rgb_gray_cnv
206    alignx      16, 7
207
    ; Full group: four mmwords of packed 4-byte pixels.
208.columnloop:
209    movq        mmA, MMWORD [esi+0*SIZEOF_MMWORD]
210    movq        mmF, MMWORD [esi+1*SIZEOF_MMWORD]
211    movq        mmD, MMWORD [esi+2*SIZEOF_MMWORD]
212    movq        mmC, MMWORD [esi+3*SIZEOF_MMWORD]
213
    ; De-interleave the packed pixels.  In the layout comments each two-digit
    ; entry is <byte index within pixel><pixel index>.
214.rgb_gray_cnv:
215    ; mmA=(00 10 20 30 01 11 21 31)
216    ; mmF=(02 12 22 32 03 13 23 33)
217    ; mmD=(04 14 24 34 05 15 25 35)
218    ; mmC=(06 16 26 36 07 17 27 37)
219
220    movq        mmB, mmA
221    punpcklbw   mmA, mmF                ; mmA=(00 02 10 12 20 22 30 32)
222    punpckhbw   mmB, mmF                ; mmB=(01 03 11 13 21 23 31 33)
223
224    movq        mmG, mmD
225    punpcklbw   mmD, mmC                ; mmD=(04 06 14 16 24 26 34 36)
226    punpckhbw   mmG, mmC                ; mmG=(05 07 15 17 25 27 35 37)
227
228    movq        mmE, mmA
229    punpcklwd   mmA, mmD                ; mmA=(00 02 04 06 10 12 14 16)
230    punpckhwd   mmE, mmD                ; mmE=(20 22 24 26 30 32 34 36)
231
232    movq        mmH, mmB
233    punpcklwd   mmB, mmG                ; mmB=(01 03 05 07 11 13 15 17)
234    punpckhwd   mmH, mmG                ; mmH=(21 23 25 27 31 33 35 37)
235
    ; Widen every byte to a 16-bit word by interleaving with zero (mmF).
236    pxor        mmF, mmF
237
238    movq        mmC, mmA
239    punpcklbw   mmA, mmF                ; mmA=(00 02 04 06)
240    punpckhbw   mmC, mmF                ; mmC=(10 12 14 16)
241
242    movq        mmD, mmB
243    punpcklbw   mmB, mmF                ; mmB=(01 03 05 07)
244    punpckhbw   mmD, mmF                ; mmD=(11 13 15 17)
245
246    movq        mmG, mmE
247    punpcklbw   mmE, mmF                ; mmE=(20 22 24 26)
248    punpckhbw   mmG, mmF                ; mmG=(30 32 34 36)
249
    ; mmF (the zero register) is itself a destination here, so the last pair
    ; is widened by placing the data in the high byte of each word and then
    ; shifting it down.
250    punpcklbw   mmF, mmH
251    punpckhbw   mmH, mmH
252    psrlw       mmF, BYTE_BIT           ; mmF=(21 23 25 27)
253    psrlw       mmH, BYTE_BIT           ; mmH=(31 33 35 37)
254
255%endif  ; RGB_PIXELSIZE ; ---------------
256
    ; From here on the physical register names are used; E/O = even/odd pixel
    ; columns, L/H = low/high half of a register.
257    ; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE
258    ; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO
259
260    ; (Original)
261    ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
262    ;
263    ; (This implementation)
264    ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
    ;
    ; The G coefficient is split in two so that each pmaddwd multiplies an
    ; interleaved (R,G) or (B,G) word pair by a constant pair and adds the two
    ; products into one dword.
265
    ; Odd columns: R*0.299 + G*0.337
266    movq        mm6, mm1
267    punpcklwd   mm1, mm3
268    punpckhwd   mm6, mm3
269    pmaddwd     mm1, [GOTOFF(eax,PW_F0299_F0337)]  ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337)
270    pmaddwd     mm6, [GOTOFF(eax,PW_F0299_F0337)]  ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337)
271
272    movq        mm7,  mm6               ; mm7=ROH*FIX(0.299)+GOH*FIX(0.337)
273
    ; Even columns: R*0.299 + G*0.337, parked in wk[] to free registers
274    movq        mm6, mm0
275    punpcklwd   mm0, mm2
276    punpckhwd   mm6, mm2
277    pmaddwd     mm0, [GOTOFF(eax,PW_F0299_F0337)]  ; mm0=REL*FIX(0.299)+GEL*FIX(0.337)
278    pmaddwd     mm6, [GOTOFF(eax,PW_F0299_F0337)]  ; mm6=REH*FIX(0.299)+GEH*FIX(0.337)
279
280    movq        MMWORD [wk(0)], mm0     ; wk(0)=REL*FIX(0.299)+GEL*FIX(0.337)
281    movq        MMWORD [wk(1)], mm6     ; wk(1)=REH*FIX(0.299)+GEH*FIX(0.337)
282
283    movq        mm0, mm5                ; mm0=BO
284    movq        mm6, mm4                ; mm6=BE
285
    ; Odd columns: B*0.114 + G*0.250, then sum, round and scale down
286    movq        mm4, mm0
287    punpcklwd   mm0, mm3
288    punpckhwd   mm4, mm3
289    pmaddwd     mm0, [GOTOFF(eax,PW_F0114_F0250)]  ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250)
290    pmaddwd     mm4, [GOTOFF(eax,PW_F0114_F0250)]  ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250)
291
292    movq        mm3, [GOTOFF(eax,PD_ONEHALF)]  ; mm3=[PD_ONEHALF]
293
294    paddd       mm0, mm1
295    paddd       mm4, mm7
296    paddd       mm0, mm3                ; add the rounding term before the shift
297    paddd       mm4, mm3
298    psrld       mm0, SCALEBITS          ; mm0=YOL
299    psrld       mm4, SCALEBITS          ; mm4=YOH
300    packssdw    mm0, mm4                ; mm0=YO
301
    ; Even columns: B*0.114 + G*0.250, plus the partial sums saved in wk[]
302    movq        mm4, mm6
303    punpcklwd   mm6, mm2
304    punpckhwd   mm4, mm2
305    pmaddwd     mm6, [GOTOFF(eax,PW_F0114_F0250)]  ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250)
306    pmaddwd     mm4, [GOTOFF(eax,PW_F0114_F0250)]  ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250)
307
308    movq        mm2, [GOTOFF(eax,PD_ONEHALF)]      ; mm2=[PD_ONEHALF]
309
310    paddd       mm6, MMWORD [wk(0)]
311    paddd       mm4, MMWORD [wk(1)]
312    paddd       mm6, mm2                ; add the rounding term before the shift
313    paddd       mm4, mm2
314    psrld       mm6, SCALEBITS          ; mm6=YEL
315    psrld       mm4, SCALEBITS          ; mm4=YEH
316    packssdw    mm6, mm4                ; mm6=YE
317
    ; Merge: odd results go to the high byte of each word, even results stay
    ; in the low byte, which restores the original column order byte by byte.
318    psllw       mm0, BYTE_BIT
319    por         mm6, mm0                ; mm6=Y
320    movq        MMWORD [edi], mm6       ; Save Y
321
    ; Advance by one group and pick the next path: another full group, a
    ; final partial group, or end of row.
322    sub         ecx, byte SIZEOF_MMWORD
323    add         esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD  ; inptr
324    add         edi, byte SIZEOF_MMWORD                ; outptr0
325    cmp         ecx, byte SIZEOF_MMWORD
326    jae         near .columnloop
327    test        ecx, ecx
328    jnz         near .column_ld1
329
    ; Row finished: restore the per-row state saved at .rowloop.
330    pop         ecx                     ; col
331    pop         esi
332    pop         edi
333    poppic      eax
334
335    add         esi, byte SIZEOF_JSAMPROW  ; input_buf
336    add         edi, byte SIZEOF_JSAMPROW  ; next output row pointer
337    dec         eax                        ; num_rows
338    jg          near .rowloop              ; continue while rows remain
339
340    emms                                ; empty MMX state
341
    ; --- Epilogue; also the target of the early exits in the setup code ---
342.return:
343    pop         edi
344    pop         esi
345;   pop         edx                     ; need not be preserved
346;   pop         ecx                     ; need not be preserved
347    pop         ebx
348    mov         esp, ebp                ; esp <- aligned ebp
349    pop         esp                     ; esp <- value stored at [ebp+0] by the prologue (points at saved ebp)
350    pop         ebp
351    ret
352
353; For some reason, the OS X linker does not honor the request to align the
354; segment unless we do this.
355    align       32
356