;
; jcsample.asm - downsampling (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jsimdext.inc"

; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    32
;
; Downsample one component: 2:1 horizontally, 1:1 vertically, no smoothing.
;
; GLOBAL(void)
; jsimd_h2v1_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor, JDIMENSION width_blocks,
;                            JSAMPARRAY input_data, JSAMPARRAY output_data);
;
; Outline:
;   * output_cols = width_blocks * 8; the routine returns at once when
;     that product is zero.
;   * Edge padding: when output_cols*2 exceeds image_width, each of the
;     first max_v_samp_factor input rows has its sample at column
;     image_width-1 copied into columns image_width .. output_cols*2-1.
;     The input rows are therefore written to, and must have room for
;     output_cols*2 samples.
;   * Downsampling: for each of v_samp_factor rows, every output sample is
;     (left + right + bias) >> 1, with bias 0 on even output columns and
;     1 on odd ones.  One MMX quadword of output is stored per inner
;     iteration.
;
; Registers: ebp, esi and edi are saved and restored.  eax, ecx, edx and
; mm0-mm3, mm6, mm7 are overwritten; ebx is never touched.  Arguments are
; read from the stack relative to ebp and the routine ends with a plain
; 'ret'.
;

%define img_width(b)    (b)+8           ; JDIMENSION image_width
%define max_v_samp(b)   (b)+12          ; int max_v_samp_factor
%define v_samp(b)       (b)+16          ; JDIMENSION v_samp_factor
%define width_blks(b)   (b)+20          ; JDIMENSION width_blocks
%define input_data(b)   (b)+24          ; JSAMPARRAY input_data
%define output_data(b)  (b)+28          ; JSAMPARRAY output_data

        align   16
        global  EXTN(jsimd_h2v1_downsample_mmx)

EXTN(jsimd_h2v1_downsample_mmx):
        push    ebp
        mov     ebp, esp
        push    esi
        push    edi

        mov     ecx, JDIMENSION [width_blks(ebp)]
        shl     ecx, 3                          ; ecx = output_cols
        jz      near .done                      ; no blocks: nothing to do

        mov     edx, JDIMENSION [img_width(ebp)]

        ; ---- pad every input row out to output_cols*2 samples ----

        push    ecx                             ; keep output_cols for later
        add     ecx, ecx                        ; ecx = output_cols * 2
        sub     ecx, edx                        ; ecx = samples to append per row
        jle     short .pad_done                 ; rows are already wide enough

        mov     eax, INT [max_v_samp(ebp)]      ; eax = rows left to pad
        test    eax, eax
        jle     short .pad_done

        mov     esi, JSAMPARRAY [input_data(ebp)]
        cld                                     ; stosb must advance edi upward
        alignx  16,7
.pad_row:
        push    eax                             ; al is reused for the fill sample
        push    ecx                             ; rep stosb runs ecx down to zero

        mov     edi, JSAMPROW [esi]             ; edi = start of this row
        mov     al, JSAMPLE [edi+edx-1]         ; sample at column image_width-1
        add     edi, edx                        ; edi -> column image_width
        rep stosb

        pop     ecx
        pop     eax

        add     esi, byte SIZEOF_JSAMPROW       ; next row pointer
        dec     eax
        jg      short .pad_row

.pad_done:
        pop     ecx                             ; ecx = output_cols again

        ; ---- average horizontal sample pairs ----

        mov     eax, JDIMENSION [v_samp(ebp)]   ; eax = rows left to produce
        test    eax, eax
        jle     near .done

        pcmpeqw   mm6, mm6                      ; every bit set
        mov       edx, 0x00010000               ; word pair {0, 1}
        psrlw     mm6, BYTE_BIT                 ; mm6={0xFF 0x00 0xFF 0x00 ..}
        movd      mm7, edx
        punpckldq mm7, mm7                      ; mm7={0, 1, 0, 1}

        mov     edi, JSAMPARRAY [output_data(ebp)]
        mov     esi, JSAMPARRAY [input_data(ebp)]
        alignx  16,7
.next_row:
        push    ecx                             ; inner loop consumes the count
        push    edi                             ; ... and both row-array cursors
        push    esi

        mov     esi, JSAMPROW [esi]             ; esi = input row
        mov     edi, JSAMPROW [edi]             ; edi = output row
        alignx  16,7
.next_group:

        movq    mm0, MMWORD [esi+0*SIZEOF_MMWORD]
        movq    mm1, MMWORD [esi+1*SIZEOF_MMWORD]
        movq    mm2, mm0
        movq    mm3, mm1

        psrlw   mm2, BYTE_BIT                   ; mm2 = odd-offset samples as words
        pand    mm0, mm6                        ; mm0 = even-offset samples as words
        psrlw   mm3, BYTE_BIT
        pand    mm1, mm6

        paddw   mm2, mm7                        ; fold the bias into one addend
        paddw   mm3, mm7
        paddw   mm0, mm2                        ; left + right + bias
        paddw   mm1, mm3
        psrlw   mm0, 1
        psrlw   mm1, 1

        packuswb mm0, mm1                       ; back to one byte per sample

        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm0

        add     edi, byte 1*SIZEOF_MMWORD       ; output row cursor
        add     esi, byte 2*SIZEOF_MMWORD       ; input row cursor
        sub     ecx, byte SIZEOF_MMWORD         ; output columns remaining
        jnz     short .next_group

        pop     esi
        pop     edi
        pop     ecx

        add     edi, byte SIZEOF_JSAMPROW       ; next output row pointer
        add     esi, byte SIZEOF_JSAMPROW       ; next input row pointer
        dec     eax                             ; rows remaining
        jg      short .next_row

        emms                                    ; empty MMX state

.done:
        pop     edi
        pop     esi
        pop     ebp
        ret
163
; --------------------------------------------------------------------------
;
; Downsample one component: 2:1 horizontally and 2:1 vertically, no
; smoothing.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_mmx (JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor, JDIMENSION width_blocks,
;                            JSAMPARRAY input_data, JSAMPARRAY output_data);
;
; Outline:
;   * output_cols = width_blocks * 8; the routine returns at once when
;     that product is zero.
;   * Edge padding: when output_cols*2 exceeds image_width, each of the
;     first max_v_samp_factor input rows has its sample at column
;     image_width-1 copied into columns image_width .. output_cols*2-1.
;     The input rows are therefore written to, and must have room for
;     output_cols*2 samples.
;   * Downsampling: each of the v_samp_factor output rows is built from
;     two consecutive input rows.  Every output sample is the sum of a
;     2x2 group of input samples plus a bias, shifted right by 2; the
;     bias is 1 on even output columns and 2 on odd ones.  One MMX
;     quadword of output is stored per inner iteration.
;
; Registers: ebp, esi and edi are saved and restored.  eax, ecx, edx and
; mm0-mm7 are overwritten; ebx is never touched.  Arguments are read from
; the stack relative to ebp and the routine ends with a plain 'ret'.
;

%define img_width(b)    (b)+8           ; JDIMENSION image_width
%define max_v_samp(b)   (b)+12          ; int max_v_samp_factor
%define v_samp(b)       (b)+16          ; JDIMENSION v_samp_factor
%define width_blks(b)   (b)+20          ; JDIMENSION width_blocks
%define input_data(b)   (b)+24          ; JSAMPARRAY input_data
%define output_data(b)  (b)+28          ; JSAMPARRAY output_data

        align   16
        global  EXTN(jsimd_h2v2_downsample_mmx)

EXTN(jsimd_h2v2_downsample_mmx):
        push    ebp
        mov     ebp, esp
        push    esi
        push    edi

        mov     ecx, JDIMENSION [width_blks(ebp)]
        shl     ecx, 3                          ; ecx = output_cols
        jz      near .done                      ; no blocks: nothing to do

        mov     edx, JDIMENSION [img_width(ebp)]

        ; ---- pad every input row out to output_cols*2 samples ----

        push    ecx                             ; keep output_cols for later
        add     ecx, ecx                        ; ecx = output_cols * 2
        sub     ecx, edx                        ; ecx = samples to append per row
        jle     short .pad_done                 ; rows are already wide enough

        mov     eax, INT [max_v_samp(ebp)]      ; eax = rows left to pad
        test    eax, eax
        jle     short .pad_done

        mov     esi, JSAMPARRAY [input_data(ebp)]
        cld                                     ; stosb must advance edi upward
        alignx  16,7
.pad_row:
        push    eax                             ; al is reused for the fill sample
        push    ecx                             ; rep stosb runs ecx down to zero

        mov     edi, JSAMPROW [esi]             ; edi = start of this row
        mov     al, JSAMPLE [edi+edx-1]         ; sample at column image_width-1
        add     edi, edx                        ; edi -> column image_width
        rep stosb

        pop     ecx
        pop     eax

        add     esi, byte SIZEOF_JSAMPROW       ; next row pointer
        dec     eax
        jg      short .pad_row

.pad_done:
        pop     ecx                             ; ecx = output_cols again

        ; ---- average 2x2 sample groups ----

        mov     eax, JDIMENSION [v_samp(ebp)]   ; eax = rows left to produce
        test    eax, eax
        jle     near .done

        pcmpeqw   mm6, mm6                      ; every bit set
        mov       edx, 0x00020001               ; word pair {1, 2}
        psrlw     mm6, BYTE_BIT                 ; mm6={0xFF 0x00 0xFF 0x00 ..}
        movd      mm7, edx
        punpckldq mm7, mm7                      ; mm7={1, 2, 1, 2}

        mov     edi, JSAMPARRAY [output_data(ebp)]
        mov     esi, JSAMPARRAY [input_data(ebp)]
        alignx  16,7
.next_row:
        push    ecx                             ; inner loop consumes the count
        push    edi                             ; ... and both row-array cursors
        push    esi

        mov     edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; edx = upper input row
        mov     esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; esi = lower input row
        mov     edi, JSAMPROW [edi]                     ; edi = output row
        alignx  16,7
.next_group:

        movq    mm0, MMWORD [edx+0*SIZEOF_MMWORD]
        movq    mm2, MMWORD [edx+1*SIZEOF_MMWORD]
        movq    mm1, MMWORD [esi+0*SIZEOF_MMWORD]
        movq    mm3, MMWORD [esi+1*SIZEOF_MMWORD]

        ; first quadword of each row: horizontal pair sums in mm0 / mm1
        movq    mm4, mm0
        movq    mm5, mm1
        psrlw   mm4, BYTE_BIT                   ; odd-offset samples as words
        pand    mm0, mm6                        ; even-offset samples as words
        psrlw   mm5, BYTE_BIT
        pand    mm1, mm6
        paddw   mm0, mm4
        paddw   mm1, mm5

        ; second quadword of each row: horizontal pair sums in mm2 / mm3
        movq    mm4, mm2
        movq    mm5, mm3
        psrlw   mm4, BYTE_BIT
        pand    mm2, mm6
        psrlw   mm5, BYTE_BIT
        pand    mm3, mm6
        paddw   mm2, mm4
        paddw   mm3, mm5

        paddw   mm0, mm7                        ; bias goes onto the upper-row sums
        paddw   mm2, mm7
        paddw   mm0, mm1                        ; add the lower-row sums
        paddw   mm2, mm3
        psrlw   mm0, 2
        psrlw   mm2, 2

        packuswb mm0, mm2                       ; back to one byte per sample

        movq    MMWORD [edi+0*SIZEOF_MMWORD], mm0

        add     edi, byte 1*SIZEOF_MMWORD       ; output row cursor
        add     edx, byte 2*SIZEOF_MMWORD       ; upper input row cursor
        add     esi, byte 2*SIZEOF_MMWORD       ; lower input row cursor
        sub     ecx, byte SIZEOF_MMWORD         ; output columns remaining
        jnz     near .next_group

        pop     esi
        pop     edi
        pop     ecx

        add     edi, byte 1*SIZEOF_JSAMPROW     ; next output row pointer
        add     esi, byte 2*SIZEOF_JSAMPROW     ; skip the two input rows just used
        dec     eax                             ; rows remaining
        jg      near .next_row

        emms                                    ; empty MMX state

.done:
        pop     edi
        pop     esi
        pop     ebp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16
324