1;
2; jccolext.asm - colorspace conversion (MMX)
3;
4; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5;
6; Based on
7; x86 SIMD extension for IJG JPEG library
8; Copyright (C) 1999-2006, MIYASAKA Masaru.
9; For conditions of distribution and use, see copyright notice in jsimdext.inc
10;
11; This file should be assembled with NASM (Netwide Assembler); it
12; can *not* be assembled with Microsoft's MASM or any compatible
13; assembler (including Borland's Turbo Assembler).
14; NASM is available from http://nasm.sourceforge.net/ or
15; http://sourceforge.net/project/showfiles.php?group_id=6208
16;
17; [TAB8]
18
19%include "jcolsamp.inc"
20
21; --------------------------------------------------------------------------
22;
23; Convert some rows of samples to the output colorspace.
24;
25; GLOBAL(void)
26; jsimd_rgb_ycc_convert_mmx (JDIMENSION img_width,
27;                           JSAMPARRAY input_buf, JSAMPIMAGE output_buf,
28;                           JDIMENSION output_row, int num_rows);
29;
30
; Argument accessors.  'b' is a register pointing at the saved-ebp slot that
; the prologue creates with "push ebp" (the function uses eax for this), so
; the first stack argument sits at b+8 and each following 32-bit argument is
; 4 bytes further on.
31%define img_width(b)    (b)+8           ; JDIMENSION img_width
32%define input_buf(b)    (b)+12          ; JSAMPARRAY input_buf
33%define output_buf(b)   (b)+16          ; JSAMPIMAGE output_buf
34%define output_row(b)   (b)+20          ; JDIMENSION output_row
35%define num_rows(b)     (b)+24          ; int num_rows
36
; Local frame layout, addressed from the *aligned* ebp set up by the prologue:
;   [ebp+0]                      value stored by the prologue (pre-alignment
;                                stack pointer, restored by "pop esp" on exit)
;   wk(0) .. wk(WK_NUM-1)        WK_NUM mmword scratch slots just below ebp
;   gotptr                       one pointer-sized slot directly below wk(0)
; wk(i) refers to WK_NUM before WK_NUM is defined; this relies on NASM
; expanding %define bodies when the macro is used, not when it is defined.
37%define original_ebp    ebp+0
38%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]
39%define WK_NUM          8
40%define gotptr          wk(0)-SIZEOF_POINTER    ; void * gotptr
41
42        align   16
43        global  EXTN(jsimd_rgb_ycc_convert_mmx)
44
; --------------------------------------------------------------------------
; jsimd_rgb_ycc_convert_mmx
;
; Converts num_rows rows of packed RGB_PIXELSIZE-byte pixels from input_buf
; into three separate component rows (Y, Cb, Cr) taken from output_buf[0..2],
; starting at row index output_row.  Eight pixels are converted per inner
; loop iteration.
;
; In:    stack arguments, see img_width(b) .. num_rows(b)
; Out:   nothing is returned in a register by this code
; Saved: ebx, esi, edi, ebp (pushed/popped below); eax, ecx, edx and the
;        MMX registers are used as scratch
; Exit:  returns immediately if img_width == 0 or num_rows <= 0
;
; Register roles inside the column loop:
;   esi = input pointer (inptr)
;   edi = Y output pointer, ebx = Cb output pointer, edx = Cr output pointer
;   ecx = columns still to convert in this row
;   eax = base passed to GOTOFF() for the constant tables
;
; NOTE(review): every iteration stores a full MMWORD to each output row, also
; for the final partial group of a row, so the output rows presumably need
; room for the width rounded up to 8 samples -- confirm against the caller.
; NOTE(review): mmA..mmH, pushpic/poppic/movpic, get_GOT, GOTOFF, alignx and
; the PW_*/PD_* constants are not defined in this file; they presumably come
; from the included headers.
; --------------------------------------------------------------------------
45EXTN(jsimd_rgb_ycc_convert_mmx):
46        push    ebp
47        mov     eax,esp                         ; eax = pre-alignment esp (points at saved ebp; argument base)
48        sub     esp, byte 4
49        and     esp, byte (-SIZEOF_MMWORD)      ; align to 64 bits
50        mov     [esp],eax                       ; remember the pre-alignment esp for the epilogue
51        mov     ebp,esp                         ; ebp = aligned ebp
52        lea     esp, [wk(0)]                    ; reserve the wk[] scratch area below ebp
53        pushpic eax             ; make a room for GOT address
54        push    ebx
55;       push    ecx             ; need not be preserved
56;       push    edx             ; need not be preserved
57        push    esi
58        push    edi
59
60        get_GOT ebx                     ; get GOT address
61        movpic  POINTER [gotptr], ebx   ; save GOT address
62
63        mov     ecx, JDIMENSION [img_width(eax)]        ; num_cols
64        test    ecx,ecx
65        jz      near .return            ; nothing to do for a zero-width image
66
67        push    ecx                     ; ecx is borrowed for output_row below
68
        ; Fetch the three component row arrays from output_buf and advance
        ; each of them to row index output_row.
69        mov     esi, JSAMPIMAGE [output_buf(eax)]
70        mov     ecx, JDIMENSION [output_row(eax)]
71        mov     edi, JSAMPARRAY [esi+0*SIZEOF_JSAMPARRAY]
72        mov     ebx, JSAMPARRAY [esi+1*SIZEOF_JSAMPARRAY]
73        mov     edx, JSAMPARRAY [esi+2*SIZEOF_JSAMPARRAY]
74        lea     edi, [edi+ecx*SIZEOF_JSAMPROW]
75        lea     ebx, [ebx+ecx*SIZEOF_JSAMPROW]
76        lea     edx, [edx+ecx*SIZEOF_JSAMPROW]
77
78        pop     ecx                     ; ecx = num_cols again
79
80        mov     esi, JSAMPARRAY [input_buf(eax)]
81        mov     eax, INT [num_rows(eax)]        ; eax stops being the argument base here
82        test    eax,eax
83        jle     near .return            ; signed test: zero or negative row count
84        alignx  16,7
85.rowloop:
        ; Save the per-row state (row counter, row-array cursors, width);
        ; the registers are reused as element pointers inside the row.
86        pushpic eax
87        push    edx
88        push    ebx
89        push    edi
90        push    esi
91        push    ecx                     ; col
92
93        mov     esi, JSAMPROW [esi]     ; inptr
94        mov     edi, JSAMPROW [edi]     ; outptr0
95        mov     ebx, JSAMPROW [ebx]     ; outptr1
96        mov     edx, JSAMPROW [edx]     ; outptr2
97        movpic  eax, POINTER [gotptr]   ; load GOT address (eax)
98
99        cmp     ecx, byte SIZEOF_MMWORD
100        jae     short .columnloop       ; at least 8 columns: take the full-width load
101        alignx  16,7
102
103%if RGB_PIXELSIZE == 3 ; ---------------
104
        ; Partial load for fewer than 8 remaining pixels.  ecx becomes the
        ; remaining byte count (3 per pixel); its bits are tested from the
        ; lowest upwards and the matching 1/2/4/8/16-byte pieces are read
        ; from the end of the data towards its start.  Lanes beyond the
        ; loaded bytes are not initialised here.
105.column_ld1:
106        push    eax                     ; eax/edx are needed as scratch for the byte/word pieces
107        push    edx
108        lea     ecx,[ecx+ecx*2]         ; imul ecx,RGB_PIXELSIZE
109        test    cl, SIZEOF_BYTE
110        jz      short .column_ld2
111        sub     ecx, byte SIZEOF_BYTE
112        xor     eax,eax
113        mov     al, BYTE [esi+ecx]
114.column_ld2:
115        test    cl, SIZEOF_WORD
116        jz      short .column_ld4
117        sub     ecx, byte SIZEOF_WORD
118        xor     edx,edx
119        mov     dx, WORD [esi+ecx]
120        shl     eax, WORD_BIT           ; make room below the byte loaded above
121        or      eax,edx
122.column_ld4:
123        movd    mmA,eax
124        pop     edx
125        pop     eax
126        test    cl, SIZEOF_DWORD
127        jz      short .column_ld8
128        sub     ecx, byte SIZEOF_DWORD
129        movd    mmG, DWORD [esi+ecx]
130        psllq   mmA, DWORD_BIT
131        por     mmA,mmG
132.column_ld8:
133        test    cl, SIZEOF_MMWORD
134        jz      short .column_ld16
135        movq    mmG,mmA
136        movq    mmA, MMWORD [esi+0*SIZEOF_MMWORD]
137        mov     ecx, SIZEOF_MMWORD      ; makes the loop-end "sub ecx" leave 0, ending the row
138        jmp     short .rgb_ycc_cnv
139.column_ld16:
140        test    cl, 2*SIZEOF_MMWORD
141        mov     ecx, SIZEOF_MMWORD      ; mov leaves the flags from the test above intact
142        jz      short .rgb_ycc_cnv
143        movq    mmF,mmA
144        movq    mmA, MMWORD [esi+0*SIZEOF_MMWORD]
145        movq    mmG, MMWORD [esi+1*SIZEOF_MMWORD]
146        jmp     short .rgb_ycc_cnv
147        alignx  16,7
148
        ; Full-width load: 8 pixels * 3 bytes = three MMWORDs.
149.columnloop:
150        movq    mmA, MMWORD [esi+0*SIZEOF_MMWORD]
151        movq    mmG, MMWORD [esi+1*SIZEOF_MMWORD]
152        movq    mmF, MMWORD [esi+2*SIZEOF_MMWORD]
153
        ; De-interleave the packed pixels into per-component words.  In the
        ; lane comments, "cp" means component c of pixel p.
154.rgb_ycc_cnv:
155        ; mmA=(00 10 20 01 11 21 02 12)
156        ; mmG=(22 03 13 23 04 14 24 05)
157        ; mmF=(15 25 06 16 26 07 17 27)
158
159        movq      mmD,mmA
160        psllq     mmA,4*BYTE_BIT        ; mmA=(-- -- -- -- 00 10 20 01)
161        psrlq     mmD,4*BYTE_BIT        ; mmD=(11 21 02 12 -- -- -- --)
162
163        punpckhbw mmA,mmG               ; mmA=(00 04 10 14 20 24 01 05)
164        psllq     mmG,4*BYTE_BIT        ; mmG=(-- -- -- -- 22 03 13 23)
165
166        punpcklbw mmD,mmF               ; mmD=(11 15 21 25 02 06 12 16)
167        punpckhbw mmG,mmF               ; mmG=(22 26 03 07 13 17 23 27)
168
169        movq      mmE,mmA
170        psllq     mmA,4*BYTE_BIT        ; mmA=(-- -- -- -- 00 04 10 14)
171        psrlq     mmE,4*BYTE_BIT        ; mmE=(20 24 01 05 -- -- -- --)
172
173        punpckhbw mmA,mmD               ; mmA=(00 02 04 06 10 12 14 16)
174        psllq     mmD,4*BYTE_BIT        ; mmD=(-- -- -- -- 11 15 21 25)
175
176        punpcklbw mmE,mmG               ; mmE=(20 22 24 26 01 03 05 07)
177        punpckhbw mmD,mmG               ; mmD=(11 13 15 17 21 23 25 27)
178
179        pxor      mmH,mmH               ; zero, used to widen bytes to words
180
181        movq      mmC,mmA
182        punpcklbw mmA,mmH               ; mmA=(00 02 04 06)
183        punpckhbw mmC,mmH               ; mmC=(10 12 14 16)
184
185        movq      mmB,mmE
186        punpcklbw mmE,mmH               ; mmE=(20 22 24 26)
187        punpckhbw mmB,mmH               ; mmB=(01 03 05 07)
188
189        movq      mmF,mmD
190        punpcklbw mmD,mmH               ; mmD=(11 13 15 17)
191        punpckhbw mmF,mmH               ; mmF=(21 23 25 27)
192
193%else ; RGB_PIXELSIZE == 4 ; -----------
194
        ; Partial load for fewer than 8 remaining pixels.  Here ecx stays a
        ; pixel count; its bits select 1-, 2- and 4-pixel pieces, read from
        ; the end of the data towards its start.  Registers for pixels that
        ; are not present are not initialised here.
195.column_ld1:
196        test    cl, SIZEOF_MMWORD/8
197        jz      short .column_ld2
198        sub     ecx, byte SIZEOF_MMWORD/8
199        movd    mmA, DWORD [esi+ecx*RGB_PIXELSIZE]
200.column_ld2:
201        test    cl, SIZEOF_MMWORD/4
202        jz      short .column_ld4
203        sub     ecx, byte SIZEOF_MMWORD/4
204        movq    mmF,mmA
205        movq    mmA, MMWORD [esi+ecx*RGB_PIXELSIZE]
206.column_ld4:
207        test    cl, SIZEOF_MMWORD/2
208        mov     ecx, SIZEOF_MMWORD      ; mov leaves the flags intact; makes the loop-end "sub ecx" leave 0
209        jz      short .rgb_ycc_cnv
210        movq    mmD,mmA
211        movq    mmC,mmF
212        movq    mmA, MMWORD [esi+0*SIZEOF_MMWORD]
213        movq    mmF, MMWORD [esi+1*SIZEOF_MMWORD]
214        jmp     short .rgb_ycc_cnv
215        alignx  16,7
216
        ; Full-width load: 8 pixels * 4 bytes = four MMWORDs.
217.columnloop:
218        movq    mmA, MMWORD [esi+0*SIZEOF_MMWORD]
219        movq    mmF, MMWORD [esi+1*SIZEOF_MMWORD]
220        movq    mmD, MMWORD [esi+2*SIZEOF_MMWORD]
221        movq    mmC, MMWORD [esi+3*SIZEOF_MMWORD]
222
        ; De-interleave the packed pixels into per-component words.  In the
        ; lane comments, "cp" means component c of pixel p.
223.rgb_ycc_cnv:
224        ; mmA=(00 10 20 30 01 11 21 31)
225        ; mmF=(02 12 22 32 03 13 23 33)
226        ; mmD=(04 14 24 34 05 15 25 35)
227        ; mmC=(06 16 26 36 07 17 27 37)
228
229        movq      mmB,mmA
230        punpcklbw mmA,mmF               ; mmA=(00 02 10 12 20 22 30 32)
231        punpckhbw mmB,mmF               ; mmB=(01 03 11 13 21 23 31 33)
232
233        movq      mmG,mmD
234        punpcklbw mmD,mmC               ; mmD=(04 06 14 16 24 26 34 36)
235        punpckhbw mmG,mmC               ; mmG=(05 07 15 17 25 27 35 37)
236
237        movq      mmE,mmA
238        punpcklwd mmA,mmD               ; mmA=(00 02 04 06 10 12 14 16)
239        punpckhwd mmE,mmD               ; mmE=(20 22 24 26 30 32 34 36)
240
241        movq      mmH,mmB
242        punpcklwd mmB,mmG               ; mmB=(01 03 05 07 11 13 15 17)
243        punpckhwd mmH,mmG               ; mmH=(21 23 25 27 31 33 35 37)
244
245        pxor      mmF,mmF               ; zero, used to widen bytes to words
246
247        movq      mmC,mmA
248        punpcklbw mmA,mmF               ; mmA=(00 02 04 06)
249        punpckhbw mmC,mmF               ; mmC=(10 12 14 16)
250
251        movq      mmD,mmB
252        punpcklbw mmB,mmF               ; mmB=(01 03 05 07)
253        punpckhbw mmD,mmF               ; mmD=(11 13 15 17)
254
255        movq      mmG,mmE
256        punpcklbw mmE,mmF               ; mmE=(20 22 24 26)
257        punpckhbw mmG,mmF               ; mmG=(30 32 34 36)
258
        ; mmF (the zero register) is consumed here: each byte of mmH is
        ; placed in the high byte of a word and then shifted down, which
        ; widens it to a zero-extended word without another zero register.
259        punpcklbw mmF,mmH
260        punpckhbw mmH,mmH
261        psrlw     mmF,BYTE_BIT          ; mmF=(21 23 25 27)
262        psrlw     mmH,BYTE_BIT          ; mmH=(31 33 35 37)
263
264%endif ; RGB_PIXELSIZE ; ---------------
265
        ; From here on the code uses the physical register names.  E = even
        ; pixels, O = odd pixels; in the comments below a trailing L / H
        ; marks the low / high half of a register after widening to dwords.
266        ; mm0=(R0 R2 R4 R6)=RE, mm2=(G0 G2 G4 G6)=GE, mm4=(B0 B2 B4 B6)=BE
267        ; mm1=(R1 R3 R5 R7)=RO, mm3=(G1 G3 G5 G7)=GO, mm5=(B1 B3 B5 B7)=BO
268
269        ; (Original)
270        ; Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
271        ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
272        ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
273        ;
274        ; (This implementation)
275        ; Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
276        ; Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
277        ; Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
278
        ; Spill R and B to the scratch slots; all eight MMX registers are
        ; needed for the arithmetic below.
279        movq      MMWORD [wk(0)], mm0   ; wk(0)=RE
280        movq      MMWORD [wk(1)], mm1   ; wk(1)=RO
281        movq      MMWORD [wk(2)], mm4   ; wk(2)=BE
282        movq      MMWORD [wk(3)], mm5   ; wk(3)=BO
283
        ; ---- Odd pixels: R/G part of Y (kept in wk(4)/wk(5)) and Cb ----
284        movq      mm6,mm1
285        punpcklwd mm1,mm3
286        punpckhwd mm6,mm3
287        movq      mm7,mm1
288        movq      mm4,mm6
289        pmaddwd   mm1,[GOTOFF(eax,PW_F0299_F0337)] ; mm1=ROL*FIX(0.299)+GOL*FIX(0.337)
290        pmaddwd   mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=ROH*FIX(0.299)+GOH*FIX(0.337)
291        pmaddwd   mm7,[GOTOFF(eax,PW_MF016_MF033)] ; mm7=ROL*-FIX(0.168)+GOL*-FIX(0.331)
292        pmaddwd   mm4,[GOTOFF(eax,PW_MF016_MF033)] ; mm4=ROH*-FIX(0.168)+GOH*-FIX(0.331)
293
294        movq      MMWORD [wk(4)], mm1   ; wk(4)=ROL*FIX(0.299)+GOL*FIX(0.337)
295        movq      MMWORD [wk(5)], mm6   ; wk(5)=ROH*FIX(0.299)+GOH*FIX(0.337)
296
        ; Interleaving zero words below the samples leaves each sample in
        ; the high word of a dword (sample << 16); the shift right by one
        ; then gives sample << 15, used as the 0.5 * B term.
297        pxor      mm1,mm1
298        pxor      mm6,mm6
299        punpcklwd mm1,mm5               ; mm1=BOL
300        punpckhwd mm6,mm5               ; mm6=BOH
301        psrld     mm1,1                 ; mm1=BOL*FIX(0.500)
302        psrld     mm6,1                 ; mm6=BOH*FIX(0.500)
303
304        movq      mm5,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm5=[PD_ONEHALFM1_CJ]
305
306        paddd     mm7,mm1
307        paddd     mm4,mm6
308        paddd     mm7,mm5
309        paddd     mm4,mm5
310        psrld     mm7,SCALEBITS         ; mm7=CbOL
311        psrld     mm4,SCALEBITS         ; mm4=CbOH
312        packssdw  mm7,mm4               ; mm7=CbO
313
        ; ---- Even pixels: R/G part of Y (kept in wk(6)/wk(7)) and Cb ----
314        movq      mm1, MMWORD [wk(2)]   ; mm1=BE
315
316        movq      mm6,mm0
317        punpcklwd mm0,mm2
318        punpckhwd mm6,mm2
319        movq      mm5,mm0
320        movq      mm4,mm6
321        pmaddwd   mm0,[GOTOFF(eax,PW_F0299_F0337)] ; mm0=REL*FIX(0.299)+GEL*FIX(0.337)
322        pmaddwd   mm6,[GOTOFF(eax,PW_F0299_F0337)] ; mm6=REH*FIX(0.299)+GEH*FIX(0.337)
323        pmaddwd   mm5,[GOTOFF(eax,PW_MF016_MF033)] ; mm5=REL*-FIX(0.168)+GEL*-FIX(0.331)
324        pmaddwd   mm4,[GOTOFF(eax,PW_MF016_MF033)] ; mm4=REH*-FIX(0.168)+GEH*-FIX(0.331)
325
326        movq      MMWORD [wk(6)], mm0   ; wk(6)=REL*FIX(0.299)+GEL*FIX(0.337)
327        movq      MMWORD [wk(7)], mm6   ; wk(7)=REH*FIX(0.299)+GEH*FIX(0.337)
328
329        pxor      mm0,mm0
330        pxor      mm6,mm6
331        punpcklwd mm0,mm1               ; mm0=BEL
332        punpckhwd mm6,mm1               ; mm6=BEH
333        psrld     mm0,1                 ; mm0=BEL*FIX(0.500)
334        psrld     mm6,1                 ; mm6=BEH*FIX(0.500)
335
336        movq      mm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ]
337
338        paddd     mm5,mm0
339        paddd     mm4,mm6
340        paddd     mm5,mm1
341        paddd     mm4,mm1
342        psrld     mm5,SCALEBITS         ; mm5=CbEL
343        psrld     mm4,SCALEBITS         ; mm4=CbEH
344        packssdw  mm5,mm4               ; mm5=CbE
345
        ; Re-interleave: odd samples move to the high byte of each word,
        ; even samples stay in the low byte, giving 8 bytes in pixel order.
346        psllw     mm7,BYTE_BIT
347        por       mm5,mm7               ; mm5=Cb
348        movq      MMWORD [ebx], mm5     ; Save Cb
349
        ; ---- Odd pixels: finish Y (B/G part + wk(4)/wk(5)) and Cr ----
350        movq      mm0, MMWORD [wk(3)]   ; mm0=BO
351        movq      mm6, MMWORD [wk(2)]   ; mm6=BE
352        movq      mm1, MMWORD [wk(1)]   ; mm1=RO
353
354        movq      mm4,mm0
355        punpcklwd mm0,mm3
356        punpckhwd mm4,mm3
357        movq      mm7,mm0
358        movq      mm5,mm4
359        pmaddwd   mm0,[GOTOFF(eax,PW_F0114_F0250)] ; mm0=BOL*FIX(0.114)+GOL*FIX(0.250)
360        pmaddwd   mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BOH*FIX(0.114)+GOH*FIX(0.250)
361        pmaddwd   mm7,[GOTOFF(eax,PW_MF008_MF041)] ; mm7=BOL*-FIX(0.081)+GOL*-FIX(0.418)
362        pmaddwd   mm5,[GOTOFF(eax,PW_MF008_MF041)] ; mm5=BOH*-FIX(0.081)+GOH*-FIX(0.418)
363
364        movq      mm3,[GOTOFF(eax,PD_ONEHALF)]  ; mm3=[PD_ONEHALF]
365
366        paddd     mm0, MMWORD [wk(4)]
367        paddd     mm4, MMWORD [wk(5)]
368        paddd     mm0,mm3
369        paddd     mm4,mm3
370        psrld     mm0,SCALEBITS         ; mm0=YOL
371        psrld     mm4,SCALEBITS         ; mm4=YOH
372        packssdw  mm0,mm4               ; mm0=YO
373
374        pxor      mm3,mm3
375        pxor      mm4,mm4
376        punpcklwd mm3,mm1               ; mm3=ROL
377        punpckhwd mm4,mm1               ; mm4=ROH
378        psrld     mm3,1                 ; mm3=ROL*FIX(0.500)
379        psrld     mm4,1                 ; mm4=ROH*FIX(0.500)
380
381        movq      mm1,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm1=[PD_ONEHALFM1_CJ]
382
383        paddd     mm7,mm3
384        paddd     mm5,mm4
385        paddd     mm7,mm1
386        paddd     mm5,mm1
387        psrld     mm7,SCALEBITS         ; mm7=CrOL
388        psrld     mm5,SCALEBITS         ; mm5=CrOH
389        packssdw  mm7,mm5               ; mm7=CrO
390
        ; ---- Even pixels: finish Y (B/G part + wk(6)/wk(7)) and Cr ----
391        movq      mm3, MMWORD [wk(0)]   ; mm3=RE
392
393        movq      mm4,mm6
394        punpcklwd mm6,mm2
395        punpckhwd mm4,mm2
396        movq      mm1,mm6
397        movq      mm5,mm4
398        pmaddwd   mm6,[GOTOFF(eax,PW_F0114_F0250)] ; mm6=BEL*FIX(0.114)+GEL*FIX(0.250)
399        pmaddwd   mm4,[GOTOFF(eax,PW_F0114_F0250)] ; mm4=BEH*FIX(0.114)+GEH*FIX(0.250)
400        pmaddwd   mm1,[GOTOFF(eax,PW_MF008_MF041)] ; mm1=BEL*-FIX(0.081)+GEL*-FIX(0.418)
401        pmaddwd   mm5,[GOTOFF(eax,PW_MF008_MF041)] ; mm5=BEH*-FIX(0.081)+GEH*-FIX(0.418)
402
403        movq      mm2,[GOTOFF(eax,PD_ONEHALF)]  ; mm2=[PD_ONEHALF]
404
405        paddd     mm6, MMWORD [wk(6)]
406        paddd     mm4, MMWORD [wk(7)]
407        paddd     mm6,mm2
408        paddd     mm4,mm2
409        psrld     mm6,SCALEBITS         ; mm6=YEL
410        psrld     mm4,SCALEBITS         ; mm4=YEH
411        packssdw  mm6,mm4               ; mm6=YE
412
413        psllw     mm0,BYTE_BIT          ; odd Y samples into the high byte of each word
414        por       mm6,mm0               ; mm6=Y
415        movq      MMWORD [edi], mm6     ; Save Y
416
417        pxor      mm2,mm2
418        pxor      mm4,mm4
419        punpcklwd mm2,mm3               ; mm2=REL
420        punpckhwd mm4,mm3               ; mm4=REH
421        psrld     mm2,1                 ; mm2=REL*FIX(0.500)
422        psrld     mm4,1                 ; mm4=REH*FIX(0.500)
423
424        movq      mm0,[GOTOFF(eax,PD_ONEHALFM1_CJ)] ; mm0=[PD_ONEHALFM1_CJ]
425
426        paddd     mm1,mm2
427        paddd     mm5,mm4
428        paddd     mm1,mm0
429        paddd     mm5,mm0
430        psrld     mm1,SCALEBITS         ; mm1=CrEL
431        psrld     mm5,SCALEBITS         ; mm5=CrEH
432        packssdw  mm1,mm5               ; mm1=CrE
433
434        psllw     mm7,BYTE_BIT          ; odd Cr samples into the high byte of each word
435        por       mm1,mm7               ; mm1=Cr
436        movq      MMWORD [edx], mm1     ; Save Cr
437
        ; Advance by one group of 8 pixels.  Continue with a full load while
        ; at least 8 columns remain, take the partial-load path once for a
        ; non-zero remainder, otherwise the row is finished.
438        sub     ecx, byte SIZEOF_MMWORD
439        add     esi, byte RGB_PIXELSIZE*SIZEOF_MMWORD   ; inptr
440        add     edi, byte SIZEOF_MMWORD                 ; outptr0
441        add     ebx, byte SIZEOF_MMWORD                 ; outptr1
442        add     edx, byte SIZEOF_MMWORD                 ; outptr2
443        cmp     ecx, byte SIZEOF_MMWORD
444        jae     near .columnloop
445        test    ecx,ecx
446        jnz     near .column_ld1
447
        ; Row done: restore the per-row state saved at .rowloop (reverse
        ; push order) and step every row-pointer array to its next entry.
448        pop     ecx                     ; col
449        pop     esi
450        pop     edi
451        pop     ebx
452        pop     edx
453        poppic  eax
454
455        add     esi, byte SIZEOF_JSAMPROW       ; input_buf
456        add     edi, byte SIZEOF_JSAMPROW
457        add     ebx, byte SIZEOF_JSAMPROW
458        add     edx, byte SIZEOF_JSAMPROW
459        dec     eax                             ; num_rows
460        jg      near .rowloop                   ; loop while rows remain (signed > 0)
461
462        emms            ; empty MMX state
463
        ; Common exit.  The early-return jumps arrive here with exactly the
        ; prologue pushes on the stack and before any MMX instruction has
        ; been executed, so they skip the emms above.
464.return:
465        pop     edi
466        pop     esi
467;       pop     edx             ; need not be preserved
468;       pop     ecx             ; need not be preserved
469        pop     ebx
470        mov     esp,ebp         ; esp <- aligned ebp
471        pop     esp             ; esp <- pre-alignment esp saved at [ebp] by the prologue
472        pop     ebp
473        ret
474
475; For some reason, the OS X linker does not honor the request to align the
476; segment unless we do this.
477        align   16
478