1;
2; jquant.asm - sample data conversion and quantization (SSE & MMX)
3;
4; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5;
6; Based on
7; x86 SIMD extension for IJG JPEG library
8; Copyright (C) 1999-2006, MIYASAKA Masaru.
9; For conditions of distribution and use, see copyright notice in jsimdext.inc
10;
11; This file should be assembled with NASM (Netwide Assembler),
12; can *not* be assembled with Microsoft's MASM or any compatible
13; assembler (including Borland's Turbo Assembler).
14; NASM is available from http://nasm.sourceforge.net/ or
15; http://sourceforge.net/project/showfiles.php?group_id=6208
16;
17; [TAB8]
18
19%include "jsimdext.inc"
20%include "jdct.inc"
21
22; --------------------------------------------------------------------------
        ; Place the code that follows in the section named by SEG_TEXT
        ; (macro not defined in this file; presumably supplied by jsimdext.inc).
        SECTION SEG_TEXT
        ; Assemble the code that follows for 32-bit mode.
        BITS    32
25;
26; Load data into workspace, applying unsigned->signed conversion
27;
28; GLOBAL(void)
29; jsimd_convsamp_float_sse (JSAMPARRAY sample_data, JDIMENSION start_col,
30;                           FAST_FLOAT * workspace);
31;
32
%define sample_data     ebp+8           ; arg 1: JSAMPARRAY sample_data
%define start_col       ebp+12          ; arg 2: JDIMENSION start_col
%define workspace       ebp+16          ; arg 3: FAST_FLOAT * workspace

        align   16
        global  EXTN(jsimd_convsamp_float_sse)

; Register roles:
;   esi      cursor into the sample_data row-pointer array
;   eax      start_col, used as the index into each sample row
;   edi      output cursor into workspace
;   ecx      remaining passes (DCTSIZE/2; each pass handles two rows)
;   ebx/edx  row pointers of the current row pair
;   mm7      0x80 replicated in every byte (unsigned->signed bias)
; ecx and edx are used as scratch and are not saved.
;
; Garbage lanes: the punpck* instructions below interleave each sample with
; whatever the destination register held before.  Those stale bytes end up
; in the low part of every dword and are discarded by the arithmetic right
; shift, so the stored result depends only on the samples.

EXTN(jsimd_convsamp_float_sse):
        push    ebp
        mov     ebp, esp
        push    ebx
        push    esi
        push    edi

        mov     esi, JSAMPARRAY [sample_data]   ; esi -> array of row pointers
        mov     eax, JDIMENSION [start_col]     ; eax = first column to read
        mov     edi, POINTER [workspace]        ; edi -> FAST_FLOAT output
        mov     ecx, DCTSIZE/2                  ; pass counter

        ; Build mm7: all ones -> 0xFF80 per word -> saturating pack to 0x80
        pcmpeqw  mm7, mm7
        psllw    mm7, 7
        packsswb mm7, mm7

        alignx  16,7
.rowpair:
        ; ---- fetch 8 samples from each of the two rows, make them signed
        mov     ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]
        movq    mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]
        mov     edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]
        movq    mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]
        psubb   mm0, mm7                ; mm0 = first row,  samples 0-7
        psubb   mm1, mm7                ; mm1 = second row, samples 0-7

        ; ---- first row: widen bytes -> dwords -> floats, then store
        punpcklbw mm2, mm0              ; samples 0-3 in the high byte of each word
        punpckhbw mm0, mm0              ; samples 4-7 in the high byte of each word
        punpcklwd mm4, mm2              ; samples 0,1 in the top byte of each dword
        punpckhwd mm2, mm2              ; samples 2,3
        punpcklwd mm5, mm0              ; samples 4,5
        punpckhwd mm0, mm0              ; samples 6,7

        psrad     mm4, (DWORD_BIT-BYTE_BIT)     ; sign-extend top byte to dword
        psrad     mm2, (DWORD_BIT-BYTE_BIT)
        psrad     mm5, (DWORD_BIT-BYTE_BIT)
        psrad     mm0, (DWORD_BIT-BYTE_BIT)

        cvtpi2ps  xmm0, mm4             ; low half of xmm0 = floats 0,1
        cvtpi2ps  xmm1, mm2             ; low half of xmm1 = floats 2,3
        cvtpi2ps  xmm2, mm5             ; low half of xmm2 = floats 4,5
        cvtpi2ps  xmm3, mm0             ; low half of xmm3 = floats 6,7

        movlhps   xmm0, xmm1            ; xmm0 = floats 0-3
        movlhps   xmm2, xmm3            ; xmm2 = floats 4-7

        movaps  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm0
        movaps  XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm2

        ; ---- second row: same pipeline (mm4 is free again at this point)
        punpcklbw mm3, mm1              ; samples 0-3 in the high byte of each word
        punpckhbw mm1, mm1              ; samples 4-7 in the high byte of each word
        punpcklwd mm6, mm3              ; samples 0,1 in the top byte of each dword
        punpckhwd mm3, mm3              ; samples 2,3
        punpcklwd mm4, mm1              ; samples 4,5
        punpckhwd mm1, mm1              ; samples 6,7

        psrad     mm6, (DWORD_BIT-BYTE_BIT)
        psrad     mm3, (DWORD_BIT-BYTE_BIT)
        psrad     mm4, (DWORD_BIT-BYTE_BIT)
        psrad     mm1, (DWORD_BIT-BYTE_BIT)

        cvtpi2ps  xmm4, mm6             ; low half of xmm4 = floats 0,1
        cvtpi2ps  xmm5, mm3             ; low half of xmm5 = floats 2,3
        cvtpi2ps  xmm6, mm4             ; low half of xmm6 = floats 4,5
        cvtpi2ps  xmm7, mm1             ; low half of xmm7 = floats 6,7

        movlhps   xmm4, xmm5            ; xmm4 = floats 0-3
        movlhps   xmm6, xmm7            ; xmm6 = floats 4-7

        movaps  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm4
        movaps  XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm6

        ; ---- advance to the next pair of rows
        add     esi, byte 2*SIZEOF_JSAMPROW
        add     edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
        dec     ecx
        jnz     near .rowpair

        emms                            ; leave the MMX state empty

        pop     edi
        pop     esi
        pop     ebx
        pop     ebp
        ret
125
126
127; --------------------------------------------------------------------------
128;
129; Quantize/descale the coefficients, and store into coef_block
130;
131; GLOBAL(void)
132; jsimd_quantize_float_sse (JCOEFPTR coef_block, FAST_FLOAT * divisors,
133;                           FAST_FLOAT * workspace);
134;
135
%define coef_block      ebp+8           ; arg 1: JCOEFPTR coef_block
%define divisors        ebp+12          ; arg 2: FAST_FLOAT * divisors
%define workspace       ebp+16          ; arg 3: FAST_FLOAT * workspace

        align   16
        global  EXTN(jsimd_quantize_float_sse)

; Register roles:
;   esi  input cursor into workspace
;   edx  cursor into the divisors table (the values are multiplied in)
;   edi  output cursor into coef_block
;   eax  remaining passes (DCTSIZE2/16; each pass handles 16 values)
; eax and edx are used as scratch and are not saved; ebx and ecx are unused.

EXTN(jsimd_quantize_float_sse):
        push    ebp
        mov     ebp, esp
        push    esi
        push    edi

        mov     edi, JCOEFPTR [coef_block]
        mov     edx, POINTER [divisors]
        mov     esi, POINTER [workspace]
        mov     eax, DCTSIZE2/16                ; pass counter

        alignx  16,7
.next16:
        ; ---- load four vectors of four floats from the workspace
        movaps  xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
        movaps  xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
        movaps  xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
        movaps  xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]

        ; ---- scale each value by the matching divisors-table entry
        mulps   xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
        mulps   xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
        mulps   xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
        mulps   xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]

        ; ---- copy the upper two floats of each vector into a partner
        ;      register, because cvtps2pi converts only the lower two
        movhlps  xmm4, xmm0
        movhlps  xmm5, xmm1
        movhlps  xmm6, xmm2
        movhlps  xmm7, xmm3

        ; ---- float -> 32-bit integer, two values per MMX register
        cvtps2pi mm0, xmm0
        cvtps2pi mm4, xmm4
        cvtps2pi mm1, xmm1
        cvtps2pi mm5, xmm5
        cvtps2pi mm2, xmm2
        cvtps2pi mm6, xmm6
        cvtps2pi mm3, xmm3
        cvtps2pi mm7, xmm7

        ; ---- narrow to 16 bits with signed saturation, four values each
        packssdw mm0, mm4
        packssdw mm1, mm5
        packssdw mm2, mm6
        packssdw mm3, mm7

        ; ---- write the sixteen quantized coefficients
        movq    MMWORD [MMBLOCK(0,0,edi,SIZEOF_JCOEF)], mm0
        movq    MMWORD [MMBLOCK(0,1,edi,SIZEOF_JCOEF)], mm1
        movq    MMWORD [MMBLOCK(1,0,edi,SIZEOF_JCOEF)], mm2
        movq    MMWORD [MMBLOCK(1,1,edi,SIZEOF_JCOEF)], mm3

        ; ---- advance all three cursors by sixteen elements
        add     esi, byte 16*SIZEOF_FAST_FLOAT
        add     edx, byte 16*SIZEOF_FAST_FLOAT
        add     edi, byte 16*SIZEOF_JCOEF
        dec     eax
        jnz     short .next16

        emms                            ; leave the MMX state empty

        pop     edi
        pop     esi
        pop     ebp
        ret
208
; Pad the end of this section to a 16-byte boundary.
; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16
212