1;
2; jcsample.asm - downsampling (64-bit SSE2)
3;
4; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5; Copyright 2009 D. R. Commander
6;
7; Based on
8; x86 SIMD extension for IJG JPEG library
9; Copyright (C) 1999-2006, MIYASAKA Masaru.
10; For conditions of distribution and use, see copyright notice in jsimdext.inc
11;
12; This file should be assembled with NASM (Netwide Assembler),
13; can *not* be assembled with Microsoft's MASM or any compatible
14; assembler (including Borland's Turbo Assembler).
15; NASM is available from http://nasm.sourceforge.net/ or
16; http://sourceforge.net/project/showfiles.php?group_id=6208
17;
18; [TAB8]
19
20%include "jsimdext.inc"
21
22; --------------------------------------------------------------------------
23        SECTION SEG_TEXT
24        BITS    64
;
; Downsample pixel values of a single component.
; This version handles the common case of 2:1 horizontal and 1:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v1_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
;
; Operation:
;   1. expand_right_edge: for each of the first max_v_samp_factor input rows,
;      replicate the last real sample (column image_width-1) to the right
;      until the row is 2 * width_blocks * DCTSIZE samples wide.
;   2. h2v1_downsample: for each of v_samp_factor rows, every output sample
;      is (in[2k] + in[2k+1] + bias) >> 1, where bias alternates
;      0, 1, 0, 1, ... across output columns.
;
; Rows are accessed with movdqa in whole-XMMWORD units, including a full
; XMMWORD store for a trailing group of 8 output columns.
; NOTE(review): this assumes every row pointer is 16-byte aligned and that
; the row buffers are padded accordingly -- confirm against the allocator.
;
; Returns immediately when width_blocks == 0 or v_samp_factor is not
; positive (as a signed 32-bit value).
;

; Register assignment established by collect_args (macro defined outside
; this file, presumably in jsimdext.inc):
;
; r10 = JDIMENSION image_width
; r11 = int max_v_samp_factor
; r12 = JDIMENSION v_samp_factor
; r13 = JDIMENSION width_blocks
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data
;
; Only the low 32 bits of r10-r13 are meaningful: the x86-64 calling
; conventions leave the upper half of a register holding a 32-bit argument
; undefined, so each one is explicitly zero- or sign-extended before use.

        align   16
        global  EXTN(jsimd_h2v1_downsample_sse2)

EXTN(jsimd_h2v1_downsample_sse2):
        push    rbp
        mov     rax,rsp                 ; NOTE(review): rax is not read below
                                        ; before being overwritten; presumably
                                        ; collect_args consumes it -- confirm
        mov     rbp,rsp
        collect_args

        mov     ecx, r13d               ; zero-extend width_blocks
        shl     rcx,3                   ; imul rcx,DCTSIZE (rcx = output_cols)
        jz      near .return            ; nothing to do for zero blocks

        mov     edx, r10d               ; rdx = image_width (zero-extended)

        ; -- expand_right_edge

        push    rcx                     ; save output_cols across this phase
        shl     rcx,1                           ; output_cols * 2
        sub     rcx,rdx                 ; rcx = number of padding samples
        jle     short .expand_end       ; row is already wide enough

        ; max_v_samp_factor is a C int: sign-extend its low 32 bits rather
        ; than trusting the (undefined) upper half of r11.
        movsxd  rax, r11d               ; rax = row counter
        test    rax,rax
        jle     short .expand_end

        cld                             ; make rep stosb store upward
        mov     rsi, r14        ; input_data
.expandloop:
        push    rax                     ; al is about to hold the fill sample
        push    rcx                     ; rep stosb consumes rcx

        mov     rdi, JSAMPROW [rsi]     ; rdi = start of this row
        add     rdi,rdx                 ; rdi = first column past the image
        mov     al, JSAMPLE [rdi-1]     ; al = last real sample of the row

        rep stosb                       ; replicate it rcx times

        pop     rcx
        pop     rax

        add     rsi, byte SIZEOF_JSAMPROW       ; next row pointer
        dec     rax
        jg      short .expandloop

.expand_end:
        pop     rcx                             ; output_cols

        ; -- h2v1_downsample

        mov     eax, r12d        ; rowctr
        test    eax,eax
        jle     near .return

        mov     edx, 0x00010000         ; bias pattern (rdx is free here:
                                        ; image_width is no longer needed)
        movd    xmm7,edx
        pcmpeqw xmm6,xmm6
        pshufd  xmm7,xmm7,0x00          ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
        psrlw   xmm6,BYTE_BIT           ; xmm6={0xFF 0x00 0xFF 0x00 ..}

        mov     rsi, r14        ; input_data
        mov     rdi, r15        ; output_data
.rowloop:
        push    rcx                     ; output_cols, reloaded for each row
        push    rdi                     ; output_data cursor
        push    rsi                     ; input_data cursor

        mov     rsi, JSAMPROW [rsi]             ; inptr
        mov     rdi, JSAMPROW [rdi]             ; outptr

        cmp     rcx, byte SIZEOF_XMMWORD
        jae     short .columnloop

        ; Fewer than SIZEOF_XMMWORD output columns remain: read only one
        ; input XMMWORD and treat the second one as zero.  rcx is forced to
        ; SIZEOF_XMMWORD so that the subtraction below ends the row.
.columnloop_r8:
        movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        pxor    xmm1,xmm1
        mov     rcx, SIZEOF_XMMWORD
        jmp     short .downsample

.columnloop:
        movdqa  xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqa  xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]

.downsample:
        movdqa  xmm2,xmm0
        movdqa  xmm3,xmm1

        pand    xmm0,xmm6               ; even-indexed samples as words
        psrlw   xmm2,BYTE_BIT           ; odd-indexed samples as words
        pand    xmm1,xmm6
        psrlw   xmm3,BYTE_BIT

        paddw   xmm0,xmm2               ; even + odd
        paddw   xmm1,xmm3
        paddw   xmm0,xmm7               ; + alternating bias
        paddw   xmm1,xmm7
        psrlw   xmm0,1                  ; / 2
        psrlw   xmm1,1

        packuswb xmm0,xmm1              ; 16 words -> 16 output samples

        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0

        sub     rcx, byte SIZEOF_XMMWORD        ; outcol
        add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr
        add     rdi, byte 1*SIZEOF_XMMWORD      ; outptr
        cmp     rcx, byte SIZEOF_XMMWORD
        jae     short .columnloop
        test    rcx,rcx
        jnz     short .columnloop_r8    ; partial trailing group

        pop     rsi
        pop     rdi
        pop     rcx

        add     rsi, byte SIZEOF_JSAMPROW       ; input_data
        add     rdi, byte SIZEOF_JSAMPROW       ; output_data
        dec     rax                             ; rowctr
        jg      near .rowloop

.return:
        uncollect_args
        pop     rbp
        ret
167
168; --------------------------------------------------------------------------
;
; Downsample pixel values of a single component.
; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_sse2 (JDIMENSION image_width, int max_v_samp_factor,
;                             JDIMENSION v_samp_factor, JDIMENSION width_blocks,
;                             JSAMPARRAY input_data, JSAMPARRAY output_data);
;
; Operation:
;   1. expand_right_edge: for each of the first max_v_samp_factor input rows,
;      replicate the last real sample (column image_width-1) to the right
;      until the row is 2 * width_blocks * DCTSIZE samples wide.
;   2. h2v2_downsample: for each of v_samp_factor output rows, two
;      consecutive input rows are consumed and every output sample is
;      (r0[2k] + r0[2k+1] + r1[2k] + r1[2k+1] + bias) >> 2, where bias
;      alternates 1, 2, 1, 2, ... across output columns.
;
; Rows are accessed with movdqa in whole-XMMWORD units, including a full
; XMMWORD store for a trailing group of 8 output columns.
; NOTE(review): this assumes every row pointer is 16-byte aligned and that
; the row buffers are padded accordingly -- confirm against the allocator.
;
; Returns immediately when width_blocks == 0 or v_samp_factor == 0.
;

; Register assignment established by collect_args (macro defined outside
; this file, presumably in jsimdext.inc):
;
; r10 = JDIMENSION image_width
; r11 = int max_v_samp_factor
; r12 = JDIMENSION v_samp_factor
; r13 = JDIMENSION width_blocks
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data
;
; Only the low 32 bits of r10-r13 are meaningful: the x86-64 calling
; conventions leave the upper half of a register holding a 32-bit argument
; undefined, so each one is explicitly zero- or sign-extended before use.

        align   16
        global  EXTN(jsimd_h2v2_downsample_sse2)

EXTN(jsimd_h2v2_downsample_sse2):
        push    rbp
        mov     rax,rsp                 ; NOTE(review): rax is not read below
                                        ; before being overwritten; presumably
                                        ; collect_args consumes it -- confirm
        mov     rbp,rsp
        collect_args

        mov     ecx, r13d               ; zero-extend width_blocks
        shl     rcx,3                   ; imul rcx,DCTSIZE (rcx = output_cols)
        jz      near .return            ; nothing to do for zero blocks

        mov     edx, r10d               ; rdx = image_width (zero-extended)

        ; -- expand_right_edge

        push    rcx                     ; save output_cols across this phase
        shl     rcx,1                           ; output_cols * 2
        sub     rcx,rdx                 ; rcx = number of padding samples
        jle     short .expand_end       ; row is already wide enough

        ; max_v_samp_factor is a C int: sign-extend its low 32 bits rather
        ; than trusting the (undefined) upper half of r11.
        movsxd  rax, r11d               ; rax = row counter
        test    rax,rax
        jle     short .expand_end

        cld                             ; make rep stosb store upward
        mov     rsi, r14        ; input_data
.expandloop:
        push    rax                     ; al is about to hold the fill sample
        push    rcx                     ; rep stosb consumes rcx

        mov     rdi, JSAMPROW [rsi]     ; rdi = start of this row
        add     rdi,rdx                 ; rdi = first column past the image
        mov     al, JSAMPLE [rdi-1]     ; al = last real sample of the row

        rep stosb                       ; replicate it rcx times

        pop     rcx
        pop     rax

        add     rsi, byte SIZEOF_JSAMPROW       ; next row pointer
        dec     rax
        jg      short .expandloop

.expand_end:
        pop     rcx                             ; output_cols

        ; -- h2v2_downsample

        mov     eax, r12d        ; rowctr
        test    rax,rax                 ; rax is zero-extended, so this only
        jle     near .return            ; exits when v_samp_factor == 0

        mov     edx, 0x00020001         ; bias pattern (rdx is free here:
                                        ; image_width is no longer needed)
        movd    xmm7,edx
        pcmpeqw xmm6,xmm6
        pshufd  xmm7,xmm7,0x00          ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
        psrlw   xmm6,BYTE_BIT           ; xmm6={0xFF 0x00 0xFF 0x00 ..}

        mov     rsi, r14        ; input_data
        mov     rdi, r15        ; output_data
.rowloop:
        push    rcx                     ; output_cols, reloaded for each row
        push    rdi                     ; output_data cursor
        push    rsi                     ; input_data cursor

        mov     rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]   ; inptr0
        mov     rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]   ; inptr1
        mov     rdi, JSAMPROW [rdi]                     ; outptr

        cmp     rcx, byte SIZEOF_XMMWORD
        jae     short .columnloop

        ; Fewer than SIZEOF_XMMWORD output columns remain: read only one
        ; XMMWORD from each input row and treat the second pair as zero.
        ; rcx is forced to SIZEOF_XMMWORD so that the subtraction below
        ; ends the row.
.columnloop_r8:
        movdqa  xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
        movdqa  xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        pxor    xmm2,xmm2
        pxor    xmm3,xmm3
        mov     rcx, SIZEOF_XMMWORD
        jmp     short .downsample

.columnloop:
        movdqa  xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
        movdqa  xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
        movdqa  xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD]
        movdqa  xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]

.downsample:
        ; First XMMWORD of each row: horizontal pair sums as words.
        movdqa  xmm4,xmm0
        movdqa  xmm5,xmm1
        pand    xmm0,xmm6               ; even-indexed samples
        psrlw   xmm4,BYTE_BIT           ; odd-indexed samples
        pand    xmm1,xmm6
        psrlw   xmm5,BYTE_BIT
        paddw   xmm0,xmm4               ; row 0: even + odd
        paddw   xmm1,xmm5               ; row 1: even + odd

        ; Second XMMWORD of each row: same treatment.
        movdqa  xmm4,xmm2
        movdqa  xmm5,xmm3
        pand    xmm2,xmm6
        psrlw   xmm4,BYTE_BIT
        pand    xmm3,xmm6
        psrlw   xmm5,BYTE_BIT
        paddw   xmm2,xmm4
        paddw   xmm3,xmm5

        paddw   xmm0,xmm1               ; row 0 + row 1 (2x2 sum)
        paddw   xmm2,xmm3
        paddw   xmm0,xmm7               ; + alternating bias
        paddw   xmm2,xmm7
        psrlw   xmm0,2                  ; / 4
        psrlw   xmm2,2

        packuswb xmm0,xmm2              ; 16 words -> 16 output samples

        movdqa  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0

        sub     rcx, byte SIZEOF_XMMWORD        ; outcol
        add     rdx, byte 2*SIZEOF_XMMWORD      ; inptr0
        add     rsi, byte 2*SIZEOF_XMMWORD      ; inptr1
        add     rdi, byte 1*SIZEOF_XMMWORD      ; outptr
        cmp     rcx, byte SIZEOF_XMMWORD
        jae     near .columnloop
        test    rcx,rcx
        jnz     near .columnloop_r8     ; partial trailing group

        pop     rsi
        pop     rdi
        pop     rcx

        add     rsi, byte 2*SIZEOF_JSAMPROW     ; input_data
        add     rdi, byte 1*SIZEOF_JSAMPROW     ; output_data
        dec     rax                             ; rowctr
        jg      near .rowloop

.return:
        uncollect_args
        pop     rbp
        ret
327
328; For some reason, the OS X linker does not honor the request to align the
329; segment unless we do this.
330        align   16
331