1;
2; jcsample.asm - downsampling (SSE2)
3;
4; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5;
6; Based on
7; x86 SIMD extension for IJG JPEG library
8; Copyright (C) 1999-2006, MIYASAKA Masaru.
9; For conditions of distribution and use, see copyright notice in jsimdext.inc
10;
11; This file should be assembled with NASM (Netwide Assembler),
12; can *not* be assembled with Microsoft's MASM or any compatible
13; assembler (including Borland's Turbo Assembler).
14; NASM is available from http://nasm.sourceforge.net/ or
15; http://sourceforge.net/project/showfiles.php?group_id=6208
16;
17; [TAB8]
18
19%include "jsimdext.inc"
20
21; --------------------------------------------------------------------------
; Section setup for the routines in this file.  SEG_TEXT is not defined in
; this file; presumably it is provided by jsimdext.inc -- confirm.
22        SECTION SEG_TEXT
; Everything that follows is assembled as 32-bit code.
23        BITS    32
24;
25; Downsample pixel values of a single component.
26; This version handles the common case of 2:1 horizontal and 1:1 vertical,
27; without smoothing.
28;
29; GLOBAL(void)
30; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
31;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
32;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
33;
; Each output sample is the average of two horizontally adjacent input
; samples, (a + b + bias) >> 1, where the bias alternates 0, 1, 0, 1, ...
; across output columns.
;
; The routine works in two phases:
;   1. expand_right_edge: for each of max_v_samp_factor input rows, the last
;      real sample (column image_width-1) is replicated into columns
;      image_width .. 2*output_cols-1, where output_cols = width_blocks << 3.
;      This writes into the input rows in place.
;   2. h2v1_downsample: each of v_samp_factor rows is reduced 2:1
;      horizontally, producing one XMMWORD of output per loop iteration.
;
; Arguments are read from the stack relative to ebp (offsets defined below).
; esi, edi and ebp are saved and restored; eax, ecx, edx, xmm0-xmm3, xmm6
; and xmm7 are overwritten.
; All row loads/stores use movdqa, which requires 16-byte aligned addresses;
; the row pointers are therefore assumed to be 16-byte aligned -- TODO
; confirm against the caller.
;
34
35%define img_width(b)    (b)+8           ; JDIMENSION image_width
36%define max_v_samp(b)   (b)+12          ; int max_v_samp_factor
37%define v_samp(b)       (b)+16          ; JDIMENSION v_samp_factor
38%define width_blks(b)   (b)+20          ; JDIMENSION width_blocks
39%define input_data(b)   (b)+24          ; JSAMPARRAY input_data
40%define output_data(b)  (b)+28          ; JSAMPARRAY output_data
41
42        align   16
43        global  EXTN(jsimd_h2v1_downsample_sse2)
44
45EXTN(jsimd_h2v1_downsample_sse2):
46        push    ebp
47        mov     ebp,esp
48;       push    ebx             ; unused
49;       push    ecx             ; need not be preserved
50;       push    edx             ; need not be preserved
51        push    esi
52        push    edi
53
        ; ecx = output_cols.  The jz below tests the zero flag left by the
        ; shl: when output_cols is 0 there is nothing to do.
54        mov     ecx, JDIMENSION [width_blks(ebp)]
55        shl     ecx,3                   ; imul ecx,DCTSIZE (ecx = output_cols)
56        jz      near .return
57
58        mov     edx, JDIMENSION [img_width(ebp)]
59
60        ; -- expand_right_edge
        ; output_cols is saved on the stack while ecx is reused as the
        ; per-row padding count: ecx = 2*output_cols - image_width.
        ; Padding is skipped when that count is <= 0 (signed compare).
61
62        push    ecx
63        shl     ecx,1                           ; output_cols * 2
64        sub     ecx,edx
65        jle     short .expand_end
66
        ; eax = number of input rows to pad; skipped when <= 0.
67        mov     eax, INT [max_v_samp(ebp)]
68        test    eax,eax
69        jle     short .expand_end
70
        ; cld makes the rep stosb below store at ascending addresses.
71        cld
72        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
73        alignx  16,7
74.expandloop:
        ; eax (row counter) and ecx (padding count) are saved because al is
        ; loaded with the fill value and rep stosb counts ecx down to 0.
75        push    eax
76        push    ecx
77
        ; edi = address just past the last real sample of this row;
        ; al = that last real sample.
78        mov     edi, JSAMPROW [esi]
79        add     edi,edx
80        mov     al, JSAMPLE [edi-1]
81
        ; Store ecx copies of al starting at edi.
82        rep stosb
83
84        pop     ecx
85        pop     eax
86
        ; Advance to the next row pointer; loop while rows remain.
87        add     esi, byte SIZEOF_JSAMPROW
88        dec     eax
89        jg      short .expandloop
90
91.expand_end:
92        pop     ecx                             ; output_cols
93
94        ; -- h2v1_downsample
95
        ; Return immediately when v_samp_factor <= 0 (signed test).
96        mov     eax, JDIMENSION [v_samp(ebp)]   ; rowctr
97        test    eax,eax
98        jle     near .return
99
        ; Build the two constants used by the column loop.  edx no longer
        ; holds image_width from here on.
        ; pshufd with 0x00 copies the low dword of xmm7 into all four dwords.
        ; pcmpeqw of a register with itself yields all ones; the word shift
        ; right then leaves only the low byte of every word set.
100        mov     edx, 0x00010000         ; bias pattern
101        movd    xmm7,edx
102        pcmpeqw xmm6,xmm6
103        pshufd  xmm7,xmm7,0x00          ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
104        psrlw   xmm6,BYTE_BIT           ; xmm6={0xFF 0x00 0xFF 0x00 ..}
105
106        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
107        mov     edi, JSAMPARRAY [output_data(ebp)]      ; output_data
108        alignx  16,7
109.rowloop:
        ; Save output_cols and both row-array cursors; ecx, edi and esi are
        ; reused below as the column counter and the sample pointers.
110        push    ecx
111        push    edi
112        push    esi
113
114        mov     esi, JSAMPROW [esi]             ; inptr
115        mov     edi, JSAMPROW [edi]             ; outptr
116
        ; With at least one full XMMWORD of output columns, take the main
        ; loop; otherwise fall through to the short-tail path.
117        cmp     ecx, byte SIZEOF_XMMWORD
118        jae     short .columnloop
119        alignx  16,7
120
        ; Tail: fewer than SIZEOF_XMMWORD output columns remain.  Only one
        ; XMMWORD of input is loaded and the second source register is
        ; zeroed.  ecx is forced to SIZEOF_XMMWORD so that the subtraction
        ; after the store leaves 0 and the column loop terminates.
        ; NOTE(review): the shared store in .downsample still writes a full
        ; XMMWORD at outptr; presumably the output row has room for it --
        ; confirm with the caller's buffer allocation.
121.columnloop_r8:
122        movdqa  xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
123        pxor    xmm1,xmm1
124        mov     ecx, SIZEOF_XMMWORD
125        jmp     short .downsample
126        alignx  16,7
127
        ; Main path: two XMMWORDs of input produce one XMMWORD of output.
128.columnloop:
129        movdqa  xmm0, XMMWORD [esi+0*SIZEOF_XMMWORD]
130        movdqa  xmm1, XMMWORD [esi+1*SIZEOF_XMMWORD]
131
132.downsample:
133        movdqa  xmm2,xmm0
134        movdqa  xmm3,xmm1
135
        ; Split every 16-bit word into its two bytes: the mask keeps the low
        ; byte (even-numbered sample) and the shift brings the high byte
        ; (odd-numbered sample) down (assuming BYTE_BIT is 8, as the mask
        ; comment above implies).
136        pand    xmm0,xmm6
137        psrlw   xmm2,BYTE_BIT
138        pand    xmm1,xmm6
139        psrlw   xmm3,BYTE_BIT
140
        ; Per word: (even + odd + bias) >> 1.
141        paddw   xmm0,xmm2
142        paddw   xmm1,xmm3
143        paddw   xmm0,xmm7
144        paddw   xmm1,xmm7
145        psrlw   xmm0,1
146        psrlw   xmm1,1
147
        ; Narrow the words back to bytes: xmm0's results fill the low half
        ; of the output XMMWORD and xmm1's results fill the high half.
148        packuswb xmm0,xmm1
149
150        movdqa  XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
151
        ; Advance the counters, then loop: main path while a full XMMWORD of
        ; columns remains (unsigned compare), tail path for a non-zero
        ; remainder, otherwise this row is finished.
152        sub     ecx, byte SIZEOF_XMMWORD        ; outcol
153        add     esi, byte 2*SIZEOF_XMMWORD      ; inptr
154        add     edi, byte 1*SIZEOF_XMMWORD      ; outptr
155        cmp     ecx, byte SIZEOF_XMMWORD
156        jae     short .columnloop
157        test    ecx,ecx
158        jnz     short .columnloop_r8
159
        ; Restore the row-array cursors and output_cols saved at .rowloop.
160        pop     esi
161        pop     edi
162        pop     ecx
163
        ; One input row is consumed per output row.
164        add     esi, byte SIZEOF_JSAMPROW       ; input_data
165        add     edi, byte SIZEOF_JSAMPROW       ; output_data
166        dec     eax                             ; rowctr
167        jg      near .rowloop
168
        ; Common exit: restore the registers pushed in the prologue.
169.return:
170        pop     edi
171        pop     esi
172;       pop     edx             ; need not be preserved
173;       pop     ecx             ; need not be preserved
174;       pop     ebx             ; unused
175        pop     ebp
176        ret
177
178; --------------------------------------------------------------------------
179;
180; Downsample pixel values of a single component.
181; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
182; without smoothing.
183;
184; GLOBAL(void)
185; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
186;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
187;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
188;
; Each output sample is the average of a 2x2 block of input samples,
; (a + b + c + d + bias) >> 2, where the bias alternates 1, 2, 1, 2, ...
; across output columns.
;
; The routine works in two phases:
;   1. expand_right_edge: for each of max_v_samp_factor input rows, the last
;      real sample (column image_width-1) is replicated into columns
;      image_width .. 2*output_cols-1, where output_cols = width_blocks << 3.
;      This writes into the input rows in place.
;   2. h2v2_downsample: for each of v_samp_factor output rows, two
;      consecutive input rows are combined and reduced 2:1 horizontally,
;      producing one XMMWORD of output per loop iteration.
;
; Arguments are read from the stack relative to ebp (offsets defined below).
; esi, edi and ebp are saved and restored; eax, ecx, edx and xmm0-xmm7 are
; overwritten.
; All row loads/stores use movdqa, which requires 16-byte aligned addresses;
; the row pointers are therefore assumed to be 16-byte aligned -- TODO
; confirm against the caller.
;
189
190%define img_width(b)    (b)+8           ; JDIMENSION image_width
191%define max_v_samp(b)   (b)+12          ; int max_v_samp_factor
192%define v_samp(b)       (b)+16          ; JDIMENSION v_samp_factor
193%define width_blks(b)   (b)+20          ; JDIMENSION width_blocks
194%define input_data(b)   (b)+24          ; JSAMPARRAY input_data
195%define output_data(b)  (b)+28          ; JSAMPARRAY output_data
196
197        align   16
198        global  EXTN(jsimd_h2v2_downsample_sse2)
199
200EXTN(jsimd_h2v2_downsample_sse2):
201        push    ebp
202        mov     ebp,esp
203;       push    ebx             ; unused
204;       push    ecx             ; need not be preserved
205;       push    edx             ; need not be preserved
206        push    esi
207        push    edi
208
        ; ecx = output_cols.  The jz below tests the zero flag left by the
        ; shl: when output_cols is 0 there is nothing to do.
209        mov     ecx, JDIMENSION [width_blks(ebp)]
210        shl     ecx,3                   ; imul ecx,DCTSIZE (ecx = output_cols)
211        jz      near .return
212
213        mov     edx, JDIMENSION [img_width(ebp)]
214
215        ; -- expand_right_edge
        ; output_cols is saved on the stack while ecx is reused as the
        ; per-row padding count: ecx = 2*output_cols - image_width.
        ; Padding is skipped when that count is <= 0 (signed compare).
216
217        push    ecx
218        shl     ecx,1                           ; output_cols * 2
219        sub     ecx,edx
220        jle     short .expand_end
221
        ; eax = number of input rows to pad; skipped when <= 0.
222        mov     eax, INT [max_v_samp(ebp)]
223        test    eax,eax
224        jle     short .expand_end
225
        ; cld makes the rep stosb below store at ascending addresses.
226        cld
227        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
228        alignx  16,7
229.expandloop:
        ; eax (row counter) and ecx (padding count) are saved because al is
        ; loaded with the fill value and rep stosb counts ecx down to 0.
230        push    eax
231        push    ecx
232
        ; edi = address just past the last real sample of this row;
        ; al = that last real sample.
233        mov     edi, JSAMPROW [esi]
234        add     edi,edx
235        mov     al, JSAMPLE [edi-1]
236
        ; Store ecx copies of al starting at edi.
237        rep stosb
238
239        pop     ecx
240        pop     eax
241
        ; Advance to the next row pointer; loop while rows remain.
242        add     esi, byte SIZEOF_JSAMPROW
243        dec     eax
244        jg      short .expandloop
245
246.expand_end:
247        pop     ecx                             ; output_cols
248
249        ; -- h2v2_downsample
250
        ; Return immediately when v_samp_factor <= 0 (signed test).
251        mov     eax, JDIMENSION [v_samp(ebp)]   ; rowctr
252        test    eax,eax
253        jle     near .return
254
        ; Build the two constants used by the column loop.  edx no longer
        ; holds image_width from here on.
        ; pshufd with 0x00 copies the low dword of xmm7 into all four dwords.
        ; pcmpeqw of a register with itself yields all ones; the word shift
        ; right then leaves only the low byte of every word set.
255        mov     edx, 0x00020001         ; bias pattern
256        movd    xmm7,edx
257        pcmpeqw xmm6,xmm6
258        pshufd  xmm7,xmm7,0x00          ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
259        psrlw   xmm6,BYTE_BIT           ; xmm6={0xFF 0x00 0xFF 0x00 ..}
260
261        mov     esi, JSAMPARRAY [input_data(ebp)]       ; input_data
262        mov     edi, JSAMPARRAY [output_data(ebp)]      ; output_data
263        alignx  16,7
264.rowloop:
        ; Save output_cols and both row-array cursors; ecx, edi and esi are
        ; reused below as the column counter and sample pointers, and edx
        ; becomes the pointer into the first of the two input rows.
265        push    ecx
266        push    edi
267        push    esi
268
269        mov     edx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; inptr0
270        mov     esi, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; inptr1
271        mov     edi, JSAMPROW [edi]                     ; outptr
272
        ; With at least one full XMMWORD of output columns, take the main
        ; loop; otherwise fall through to the short-tail path.
273        cmp     ecx, byte SIZEOF_XMMWORD
274        jae     short .columnloop
275        alignx  16,7
276
        ; Tail: fewer than SIZEOF_XMMWORD output columns remain.  Only one
        ; XMMWORD is loaded from each input row and the second pair of source
        ; registers is zeroed.  ecx is forced to SIZEOF_XMMWORD so that the
        ; subtraction after the store leaves 0 and the column loop ends.
        ; NOTE(review): the shared store in .downsample still writes a full
        ; XMMWORD at outptr; presumably the output row has room for it --
        ; confirm with the caller's buffer allocation.
277.columnloop_r8:
278        movdqa  xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
279        movdqa  xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
280        pxor    xmm2,xmm2
281        pxor    xmm3,xmm3
282        mov     ecx, SIZEOF_XMMWORD
283        jmp     short .downsample
284        alignx  16,7
285
        ; Main path: two XMMWORDs from each of the two input rows produce one
        ; XMMWORD of output.  xmm0/xmm2 come from inptr0, xmm1/xmm3 from
        ; inptr1.
286.columnloop:
287        movdqa  xmm0, XMMWORD [edx+0*SIZEOF_XMMWORD]
288        movdqa  xmm1, XMMWORD [esi+0*SIZEOF_XMMWORD]
289        movdqa  xmm2, XMMWORD [edx+1*SIZEOF_XMMWORD]
290        movdqa  xmm3, XMMWORD [esi+1*SIZEOF_XMMWORD]
291
292.downsample:
        ; First XMMWORD of each row: per 16-bit word, add the low byte
        ; (kept by the mask) to the high byte (brought down by the shift),
        ; giving the horizontal pair sum for row 0 (xmm0) and row 1 (xmm1).
293        movdqa  xmm4,xmm0
294        movdqa  xmm5,xmm1
295        pand    xmm0,xmm6
296        psrlw   xmm4,BYTE_BIT
297        pand    xmm1,xmm6
298        psrlw   xmm5,BYTE_BIT
299        paddw   xmm0,xmm4
300        paddw   xmm1,xmm5
301
        ; Same horizontal pair sums for the second XMMWORD of each row
        ; (xmm2 = row 0, xmm3 = row 1); xmm4/xmm5 are reused as scratch.
302        movdqa  xmm4,xmm2
303        movdqa  xmm5,xmm3
304        pand    xmm2,xmm6
305        psrlw   xmm4,BYTE_BIT
306        pand    xmm3,xmm6
307        psrlw   xmm5,BYTE_BIT
308        paddw   xmm2,xmm4
309        paddw   xmm3,xmm5
310
        ; Combine the two rows, then per word: (sum of 4 + bias) >> 2.
311        paddw   xmm0,xmm1
312        paddw   xmm2,xmm3
313        paddw   xmm0,xmm7
314        paddw   xmm2,xmm7
315        psrlw   xmm0,2
316        psrlw   xmm2,2
317
        ; Narrow the words back to bytes: xmm0's results fill the low half
        ; of the output XMMWORD and xmm2's results fill the high half.
318        packuswb xmm0,xmm2
319
320        movdqa  XMMWORD [edi+0*SIZEOF_XMMWORD], xmm0
321
        ; Advance the counters, then loop: main path while a full XMMWORD of
        ; columns remains (unsigned compare), tail path for a non-zero
        ; remainder, otherwise this row pair is finished.
322        sub     ecx, byte SIZEOF_XMMWORD        ; outcol
323        add     edx, byte 2*SIZEOF_XMMWORD      ; inptr0
324        add     esi, byte 2*SIZEOF_XMMWORD      ; inptr1
325        add     edi, byte 1*SIZEOF_XMMWORD      ; outptr
326        cmp     ecx, byte SIZEOF_XMMWORD
327        jae     near .columnloop
328        test    ecx,ecx
329        jnz     near .columnloop_r8
330
        ; Restore the row-array cursors and output_cols saved at .rowloop.
331        pop     esi
332        pop     edi
333        pop     ecx
334
        ; Two input rows are consumed for every output row.
335        add     esi, byte 2*SIZEOF_JSAMPROW     ; input_data
336        add     edi, byte 1*SIZEOF_JSAMPROW     ; output_data
337        dec     eax                             ; rowctr
338        jg      near .rowloop
339
        ; Common exit: restore the registers pushed in the prologue.
340.return:
341        pop     edi
342        pop     esi
343;       pop     edx             ; need not be preserved
344;       pop     ecx             ; need not be preserved
345;       pop     ebx             ; unused
346        pop     ebp
347        ret
348
349; For some reason, the OS X linker does not honor the request to align the
350; segment unless we do this.
; (The directive pads the end of the code above to a 16-byte boundary.)
351        align   16
352