1;
2; jcsample.asm - downsampling (MMX)
3;
4; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5;
6; Based on
7; x86 SIMD extension for IJG JPEG library
8; Copyright (C) 1999-2006, MIYASAKA Masaru.
9; For conditions of distribution and use, see copyright notice in jsimdext.inc
10;
11; This file should be assembled with NASM (Netwide Assembler); it
12; can *not* be assembled with Microsoft's MASM or any compatible
13; assembler (including Borland's Turbo Assembler).
14; NASM is available from http://nasm.sourceforge.net/ or
15; http://sourceforge.net/project/showfiles.php?group_id=6208
16;
17; [TAB8]
18
19%include "jsimdext.inc"
20
21; --------------------------------------------------------------------------
22        SECTION SEG_TEXT                ; everything below is emitted into the section named by SEG_TEXT (not defined in this file; presumably supplied by jsimdext.inc)
23        BITS    32                      ; assemble the following instructions as 32-bit code
;
; jsimd_h2v1_downsample_mmx
;
; Downsample the pixel values of a single component for the common case of
; 2:1 horizontal and 1:1 vertical subsampling, without smoothing.
;
; GLOBAL(void)
; jsimd_h2v1_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor, JDIMENSION width_blocks,
;                            JSAMPARRAY input_data, JSAMPARRAY output_data);
;
; What the routine does, in order:
;   1. output_cols = width_blocks * 8.  If that is zero, return at once.
;   2. If output_cols*2 exceeds image_width, the first max_v_samp_factor
;      input rows are extended IN PLACE: the sample at column image_width-1
;      is copied into columns image_width .. output_cols*2-1.
;   3. For v_samp_factor rows, each pair of neighbouring input bytes (a, b)
;      becomes one output byte (a + b + bias) >> 1, where bias is 0 for
;      even output columns and 1 for odd ones.
;
; All arguments are read from the stack through ebp (ebp+8 .. ebp+28).
; ebp, esi and edi are saved and restored.  eax, ecx, edx, mm0-mm3, mm6 and
; mm7 are overwritten.  emms is executed after the averaging loop; the two
; early exits are taken before any MMX instruction has run.
;

%define img_width(b)    (b)+8           ; arg 1: JDIMENSION image_width
%define max_v_samp(b)   (b)+12          ; arg 2: int max_v_samp_factor
%define v_samp(b)       (b)+16          ; arg 3: JDIMENSION v_samp_factor
%define width_blks(b)   (b)+20          ; arg 4: JDIMENSION width_blocks
%define input_data(b)   (b)+24          ; arg 5: JSAMPARRAY input_data
%define output_data(b)  (b)+28          ; arg 6: JSAMPARRAY output_data

        align   16
        global  EXTN(jsimd_h2v1_downsample_mmx)

EXTN(jsimd_h2v1_downsample_mmx):
        push    ebp
        mov     ebp, esp
        push    esi                     ; restored at .done
        push    edi                     ; restored at .done
        ; ebx is never touched; ecx and edx are used without being saved.

        mov     ecx, JDIMENSION [width_blks(ebp)]
        shl     ecx, 3                  ; ecx = output_cols (width_blocks * 8)
        jz      near .done              ; zero blocks: nothing to do

        mov     edx, JDIMENSION [img_width(ebp)]        ; edx = image_width

        ; -- phase 1: replicate the last sample out to output_cols*2

        push    ecx                     ; keep output_cols for phase 2
        add     ecx, ecx                ; output_cols * 2
        sub     ecx, edx                ; ecx = bytes to append to each row
        jle     short .pad_done         ; rows are already wide enough

        mov     eax, INT [max_v_samp(ebp)]      ; eax = rows still to pad
        test    eax, eax
        jle     short .pad_done

        cld                             ; rep stosb has to walk upward
        mov     esi, JSAMPARRAY [input_data(ebp)]       ; esi -> row pointers
        alignx  16,7
.pad_row:
        mov     edi, JSAMPROW [esi]     ; edi = start of this row
        push    eax                     ; al is needed for the fill byte
        push    ecx                     ; rep stosb consumes ecx

        mov     al, JSAMPLE [edi+edx-1] ; sample at column image_width-1
        add     edi, edx                ; edi = column image_width
        rep stosb                       ; append ecx copies of al

        pop     ecx
        pop     eax

        add     esi, byte SIZEOF_JSAMPROW       ; next row pointer
        dec     eax
        jg      short .pad_row

.pad_done:
        pop     ecx                     ; ecx = output_cols again

        ; -- phase 2: average horizontal pairs

        mov     eax, JDIMENSION [v_samp(ebp)]   ; eax = rows still to produce
        test    eax, eax
        jle     near .done

        pcmpeqw   mm6, mm6              ; all bits set
        psrlw     mm6, BYTE_BIT         ; mm6 = {0xFF 0x00 0xFF 0x00 ..}
        mov       edx, 0x00010000       ; rounding bias, two words: 0 then 1
        movd      mm7, edx
        punpckldq mm7, mm7              ; mm7 = {0, 1, 0, 1}

        mov     edi, JSAMPARRAY [output_data(ebp)]      ; edi -> output row pointers
        mov     esi, JSAMPARRAY [input_data(ebp)]       ; esi -> input row pointers
        alignx  16,7
.next_row:
        push    esi                     ; row-pointer cursors and the column
        push    edi                     ; count are reused inside the row,
        push    ecx                     ; so park them on the stack

        mov     edi, JSAMPROW [edi]     ; outptr
        mov     esi, JSAMPROW [esi]     ; inptr
        alignx  16,7
.next_block:
        ; One pass reads 2*SIZEOF_MMWORD input bytes and writes SIZEOF_MMWORD
        ; output bytes.
        ; NOTE(review): the loop ends only when ecx hits exactly zero, so
        ; SIZEOF_MMWORD must divide width_blocks*8 - presumably it is 8.

        movq    mm0, MMWORD [esi+0*SIZEOF_MMWORD]
        movq    mm1, MMWORD [esi+1*SIZEOF_MMWORD]
        movq    mm2, mm0
        movq    mm3, mm1

        pand    mm0, mm6                ; mm0/mm1: low byte of each word
        psrlw   mm2, BYTE_BIT           ; mm2/mm3: high byte of each word
        pand    mm1, mm6
        psrlw   mm3, BYTE_BIT

        paddw   mm0, mm2                ; a + b for every pair
        paddw   mm1, mm3
        paddw   mm0, mm7                ; + bias
        paddw   mm1, mm7
        psrlw   mm0, 1                  ; / 2
        psrlw   mm1, 1

        packuswb mm0, mm1               ; words back to bytes

        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm0

        add     edi, byte 1*SIZEOF_MMWORD       ; outptr
        add     esi, byte 2*SIZEOF_MMWORD       ; inptr
        sub     ecx, byte SIZEOF_MMWORD         ; output columns remaining
        jnz     short .next_block

        pop     ecx                     ; output_cols
        pop     edi                     ; output row-pointer cursor
        pop     esi                     ; input row-pointer cursor

        add     edi, byte SIZEOF_JSAMPROW       ; next output row
        add     esi, byte SIZEOF_JSAMPROW       ; next input row
        dec     eax                             ; rows remaining
        jg      short .next_row

        emms                            ; leave MMX state empty

.done:
        pop     edi
        pop     esi
        pop     ebp
        ret
164
165; --------------------------------------------------------------------------
;
; jsimd_h2v2_downsample_mmx
;
; Downsample the pixel values of a single component for the standard case
; of 2:1 horizontal and 2:1 vertical subsampling, without smoothing.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor, JDIMENSION width_blocks,
;                            JSAMPARRAY input_data, JSAMPARRAY output_data);
;
; What the routine does, in order:
;   1. output_cols = width_blocks * 8.  If that is zero, return at once.
;   2. If output_cols*2 exceeds image_width, the first max_v_samp_factor
;      input rows are extended IN PLACE: the sample at column image_width-1
;      is copied into columns image_width .. output_cols*2-1.
;   3. For v_samp_factor output rows, two consecutive input rows are read
;      and every 2x2 group of input bytes becomes one output byte
;      (sum of the four + bias) >> 2, where bias is 1 for even output
;      columns and 2 for odd ones.
;
; All arguments are read from the stack through ebp (ebp+8 .. ebp+28).
; ebp, esi and edi are saved and restored.  eax, ecx, edx and mm0-mm7 are
; overwritten.  emms is executed after the averaging loop; the two early
; exits are taken before any MMX instruction has run.
;

%define img_width(b)    (b)+8           ; arg 1: JDIMENSION image_width
%define max_v_samp(b)   (b)+12          ; arg 2: int max_v_samp_factor
%define v_samp(b)       (b)+16          ; arg 3: JDIMENSION v_samp_factor
%define width_blks(b)   (b)+20          ; arg 4: JDIMENSION width_blocks
%define input_data(b)   (b)+24          ; arg 5: JSAMPARRAY input_data
%define output_data(b)  (b)+28          ; arg 6: JSAMPARRAY output_data

        align   16
        global  EXTN(jsimd_h2v2_downsample_mmx)

EXTN(jsimd_h2v2_downsample_mmx):
        push    ebp
        mov     ebp, esp
        push    esi                     ; restored at .done
        push    edi                     ; restored at .done
        ; ebx is never touched; ecx and edx are used without being saved.

        mov     ecx, JDIMENSION [width_blks(ebp)]
        shl     ecx, 3                  ; ecx = output_cols (width_blocks * 8)
        jz      near .done              ; zero blocks: nothing to do

        mov     edx, JDIMENSION [img_width(ebp)]        ; edx = image_width

        ; -- phase 1: replicate the last sample out to output_cols*2

        push    ecx                     ; keep output_cols for phase 2
        add     ecx, ecx                ; output_cols * 2
        sub     ecx, edx                ; ecx = bytes to append to each row
        jle     short .pad_done         ; rows are already wide enough

        mov     eax, INT [max_v_samp(ebp)]      ; eax = rows still to pad
        test    eax, eax
        jle     short .pad_done

        cld                             ; rep stosb has to walk upward
        mov     esi, JSAMPARRAY [input_data(ebp)]       ; esi -> row pointers
        alignx  16,7
.pad_row:
        mov     edi, JSAMPROW [esi]     ; edi = start of this row
        push    eax                     ; al is needed for the fill byte
        push    ecx                     ; rep stosb consumes ecx

        mov     al, JSAMPLE [edi+edx-1] ; sample at column image_width-1
        add     edi, edx                ; edi = column image_width
        rep stosb                       ; append ecx copies of al

        pop     ecx
        pop     eax

        add     esi, byte SIZEOF_JSAMPROW       ; next row pointer
        dec     eax
        jg      short .pad_row

.pad_done:
        pop     ecx                     ; ecx = output_cols again

        ; -- phase 2: average 2x2 groups

        mov     eax, JDIMENSION [v_samp(ebp)]   ; eax = output rows still to produce
        test    eax, eax
        jle     near .done

        pcmpeqw   mm6, mm6              ; all bits set
        psrlw     mm6, BYTE_BIT         ; mm6 = {0xFF 0x00 0xFF 0x00 ..}
        mov       edx, 0x00020001       ; rounding bias, two words: 1 then 2
        movd      mm7, edx
        punpckldq mm7, mm7              ; mm7 = {1, 2, 1, 2}

        mov     edi, JSAMPARRAY [output_data(ebp)]      ; edi -> output row pointers
        mov     esi, JSAMPARRAY [input_data(ebp)]       ; esi -> input row pointers
        alignx  16,7
.next_row:
        push    esi                     ; row-pointer cursors and the column
        push    edi                     ; count are reused inside the row,
        push    ecx                     ; so park them on the stack

        mov     edi, JSAMPROW [edi]                     ; outptr
        mov     edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; inptr0 (upper row)
        mov     esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; inptr1 (lower row); must be loaded last
        alignx  16,7
.next_block:
        ; One pass reads 2*SIZEOF_MMWORD bytes from each of the two input
        ; rows and writes SIZEOF_MMWORD output bytes.
        ; NOTE(review): the loop ends only when ecx hits exactly zero, so
        ; SIZEOF_MMWORD must divide width_blocks*8 - presumably it is 8.

        movq    mm0, MMWORD [edx+0*SIZEOF_MMWORD]
        movq    mm1, MMWORD [esi+0*SIZEOF_MMWORD]
        movq    mm2, MMWORD [edx+1*SIZEOF_MMWORD]
        movq    mm3, MMWORD [esi+1*SIZEOF_MMWORD]

        ; first quadword of each row: horizontal pair sums
        movq    mm4, mm0
        movq    mm5, mm1
        pand    mm0, mm6                ; low byte of each word
        psrlw   mm4, BYTE_BIT           ; high byte of each word
        pand    mm1, mm6
        psrlw   mm5, BYTE_BIT
        paddw   mm0, mm4                ; upper row: a + b
        paddw   mm1, mm5                ; lower row: a + b

        ; second quadword of each row: same treatment
        movq    mm4, mm2
        movq    mm5, mm3
        pand    mm2, mm6
        psrlw   mm4, BYTE_BIT
        pand    mm3, mm6
        psrlw   mm5, BYTE_BIT
        paddw   mm2, mm4
        paddw   mm3, mm5

        paddw   mm0, mm1                ; add the two rows: 2x2 sums
        paddw   mm2, mm3
        paddw   mm0, mm7                ; + bias
        paddw   mm2, mm7
        psrlw   mm0, 2                  ; / 4
        psrlw   mm2, 2

        packuswb mm0, mm2               ; words back to bytes

        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm0

        add     edx, byte 2*SIZEOF_MMWORD       ; inptr0
        add     esi, byte 2*SIZEOF_MMWORD       ; inptr1
        add     edi, byte 1*SIZEOF_MMWORD       ; outptr
        sub     ecx, byte SIZEOF_MMWORD         ; output columns remaining
        jnz     near .next_block

        pop     ecx                     ; output_cols
        pop     edi                     ; output row-pointer cursor
        pop     esi                     ; input row-pointer cursor

        add     edi, byte 1*SIZEOF_JSAMPROW     ; next output row
        add     esi, byte 2*SIZEOF_JSAMPROW     ; skip the two input rows just used
        dec     eax                             ; output rows remaining
        jg      near .next_row

        emms                            ; leave MMX state empty

.done:
        pop     edi
        pop     esi
        pop     ebp
        ret
321
322; The OS X linker does not honor the request to align the segment unless
323; the end of the section is explicitly aligned here as well.
324        align   16                      ; pad the end of this section to a 16-byte boundary
325