1;
2; jcsample.asm - downsampling (64-bit SSE2)
3;
4; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5; Copyright (C) 2009, 2016, D. R. Commander.
6;
7; Based on the x86 SIMD extension for IJG JPEG library
8; Copyright (C) 1999-2006, MIYASAKA Masaru.
9; For conditions of distribution and use, see copyright notice in jsimdext.inc
10;
11; This file should be assembled with NASM (Netwide Assembler),
12; can *not* be assembled with Microsoft's MASM or any compatible
13; assembler (including Borland's Turbo Assembler).
14; NASM is available from http://nasm.sourceforge.net/ or
15; http://sourceforge.net/project/showfiles.php?group_id=6208
16;
17; [TAB8]
18
19%include "jsimdext.inc"
20
21; --------------------------------------------------------------------------
22    SECTION     SEG_TEXT
23    BITS        64
;
; Downsample pixel values of a single component.
; This version handles the common case of 2:1 horizontal and 1:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v1_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor,
;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
;                            JSAMPARRAY output_data);
;
; The routine works in two phases:
;
;   1. expand_right_edge -- with output_cols = width_in_blocks * 8, each of
;      the first max_v_samp_factor input rows is padded from image_width up
;      to output_cols * 2 samples by replicating its last real sample
;      (column image_width - 1).  Skipped when no padding is needed or when
;      max_v_samp_factor is not positive.
;
;   2. h2v1_downsample -- for each of v_samp_factor rows, every horizontal
;      pair of input samples (s0, s1) yields one output sample
;      (s0 + s1 + bias) / 2, where bias alternates 0, 1, 0, 1, ... across
;      output columns.
;
; Nothing is done when width_in_blocks is zero.
;
; NOTE(review): all row accesses use movdqa, so every row pointer is assumed
; to be 16-byte aligned.  When output_cols is an odd multiple of 8, the tail
; path still loads 16 input bytes and stores a full 16-byte XMMWORD (upper 8
; bytes zero), so rows are assumed to be padded accordingly -- confirm
; against the buffer allocator.
;

; Register roles after collect_args (defined in jsimdext.inc, not shown here):
;
; r10d = JDIMENSION image_width
; r11d = int max_v_samp_factor   (only the low 32 bits are read; the upper
;                                 half of a register carrying a 32-bit
;                                 argument is not guaranteed by the ABI)
; r12d = JDIMENSION v_samp_factor
; r13d = JDIMENSION width_in_blocks
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_downsample_sse2)

EXTN(jsimd_h2v1_downsample_sse2):
    push        rbp
    ; NOTE(review): rax is not read by any instruction in this function
    ; before it is overwritten, so this copy of rsp is presumably consumed
    ; by collect_args -- keep it directly ahead of the macro.
    mov         rax, rsp
    mov         rbp, rsp
    collect_args 6

    mov         ecx, r13d               ; zero-extend width_in_blocks
    shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
    jz          near .return            ; no output columns: nothing to do

    mov         edx, r10d               ; rdx = image_width (zero-extended)

    ; -- expand_right_edge

    push        rcx                     ; save output_cols for phase 2
    shl         rcx, 1                  ; output_cols * 2
    sub         rcx, rdx                ; rcx = number of samples to append
    jle         short .expand_end       ; rows are already wide enough

    ; Read max_v_samp_factor as a 32-bit int.  Writing eax zero-extends into
    ; rax, so once the value is known to be positive the 64-bit countdown
    ; below (dec rax / jg) is exact.
    mov         eax, r11d               ; rowctr
    test        eax, eax
    jle         short .expand_end       ; zero or negative: no rows to pad

    cld                                 ; rep stosb must walk upward
    mov         rsi, r14                ; input_data
.expandloop:
    push        rax                     ; al is reused as the fill value
    push        rcx                     ; rep stosb consumes rcx

    mov         rdi, JSAMPROW [rsi]     ; start of the current row
    add         rdi, rdx                ; first column past the real data
    mov         al, JSAMPLE [rdi-1]     ; last real sample of the row

    rep stosb                           ; replicate it rcx times

    pop         rcx
    pop         rax

    add         rsi, byte SIZEOF_JSAMPROW   ; next row pointer
    dec         rax
    jg          short .expandloop

.expand_end:
    pop         rcx                     ; output_cols

    ; -- h2v1_downsample

    mov         eax, r12d               ; rowctr
    test        eax, eax
    jle         near .return            ; taken when zero or when bit 31 is set

    mov         edx, 0x00010000         ; bias pattern (low word 0, high word 1)
    movd        xmm7, edx
    pcmpeqw     xmm6, xmm6              ; all ones
    pshufd      xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}

    mov         rsi, r14                ; input_data
    mov         rdi, r15                ; output_data
.rowloop:
    push        rcx                     ; output_cols is consumed per row
    push        rdi                     ; row-pointer cursors are reused below
    push        rsi

    mov         rsi, JSAMPROW [rsi]     ; inptr
    mov         rdi, JSAMPROW [rdi]     ; outptr

    cmp         rcx, byte SIZEOF_XMMWORD
    jae         short .columnloop

    ; Tail: fewer than SIZEOF_XMMWORD output columns remain.  Only one input
    ; vector is loaded; the second half of the result is forced to zero and
    ; rcx is set so that the subtraction below ends the column loop.
.columnloop_r8:
    movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    pxor        xmm1, xmm1
    mov         rcx, SIZEOF_XMMWORD
    jmp         short .downsample

    ; Main path: two input vectors produce one output vector.
.columnloop:
    movdqa      xmm0, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqa      xmm1, XMMWORD [rsi+1*SIZEOF_XMMWORD]

.downsample:
    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm1

    pand        xmm0, xmm6              ; even-indexed samples as words
    psrlw       xmm2, BYTE_BIT          ; odd-indexed samples as words
    pand        xmm1, xmm6
    psrlw       xmm3, BYTE_BIT

    paddw       xmm0, xmm2              ; s0 + s1 for each pair
    paddw       xmm1, xmm3
    paddw       xmm0, xmm7              ; add the alternating bias
    paddw       xmm1, xmm7
    psrlw       xmm0, 1                 ; divide by 2
    psrlw       xmm1, 1

    packuswb    xmm0, xmm1              ; words back to samples

    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0

    sub         rcx, byte SIZEOF_XMMWORD    ; outcol
    add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr
    add         rdi, byte 1*SIZEOF_XMMWORD  ; outptr
    cmp         rcx, byte SIZEOF_XMMWORD
    jae         short .columnloop
    test        rcx, rcx
    jnz         short .columnloop_r8    ; a partial vector is still pending

    pop         rsi
    pop         rdi
    pop         rcx

    add         rsi, byte SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte SIZEOF_JSAMPROW  ; output_data
    dec         rax                        ; rowctr
    jg          near .rowloop

.return:
    uncollect_args 6
    pop         rbp
    ret
167
168; --------------------------------------------------------------------------
;
; Downsample pixel values of a single component.
; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
; without smoothing.
;
; GLOBAL(void)
; jsimd_h2v2_downsample_sse2(JDIMENSION image_width, int max_v_samp_factor,
;                            JDIMENSION v_samp_factor,
;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
;                            JSAMPARRAY output_data);
;
; The routine works in two phases:
;
;   1. expand_right_edge -- with output_cols = width_in_blocks * 8, each of
;      the first max_v_samp_factor input rows is padded from image_width up
;      to output_cols * 2 samples by replicating its last real sample
;      (column image_width - 1).  Skipped when no padding is needed or when
;      max_v_samp_factor is not positive.
;
;   2. h2v2_downsample -- for each of v_samp_factor output rows, two
;      consecutive input rows are read and every 2x2 group of input samples
;      yields one output sample (sum of the four + bias) / 4, where bias
;      alternates 1, 2, 1, 2, ... across output columns.
;
; Nothing is done when width_in_blocks is zero.
;
; NOTE(review): all row accesses use movdqa, so every row pointer is assumed
; to be 16-byte aligned.  When output_cols is an odd multiple of 8, the tail
; path still loads 16 bytes from each input row and stores a full 16-byte
; XMMWORD (upper 8 bytes zero), so rows are assumed to be padded
; accordingly -- confirm against the buffer allocator.
;

; Register roles after collect_args (defined in jsimdext.inc, not shown here):
;
; r10d = JDIMENSION image_width
; r11d = int max_v_samp_factor   (only the low 32 bits are read; the upper
;                                 half of a register carrying a 32-bit
;                                 argument is not guaranteed by the ABI)
; r12d = JDIMENSION v_samp_factor
; r13d = JDIMENSION width_in_blocks
; r14 = JSAMPARRAY input_data
; r15 = JSAMPARRAY output_data

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_downsample_sse2)

EXTN(jsimd_h2v2_downsample_sse2):
    push        rbp
    ; NOTE(review): rax is not read by any instruction in this function
    ; before it is overwritten, so this copy of rsp is presumably consumed
    ; by collect_args -- keep it directly ahead of the macro.
    mov         rax, rsp
    mov         rbp, rsp
    collect_args 6

    mov         ecx, r13d               ; zero-extend width_in_blocks
    shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
    jz          near .return            ; no output columns: nothing to do

    mov         edx, r10d               ; rdx = image_width (zero-extended)

    ; -- expand_right_edge

    push        rcx                     ; save output_cols for phase 2
    shl         rcx, 1                  ; output_cols * 2
    sub         rcx, rdx                ; rcx = number of samples to append
    jle         short .expand_end       ; rows are already wide enough

    ; Read max_v_samp_factor as a 32-bit int.  Writing eax zero-extends into
    ; rax, so once the value is known to be positive the 64-bit countdown
    ; below (dec rax / jg) is exact.
    mov         eax, r11d               ; rowctr
    test        eax, eax
    jle         short .expand_end       ; zero or negative: no rows to pad

    cld                                 ; rep stosb must walk upward
    mov         rsi, r14                ; input_data
.expandloop:
    push        rax                     ; al is reused as the fill value
    push        rcx                     ; rep stosb consumes rcx

    mov         rdi, JSAMPROW [rsi]     ; start of the current row
    add         rdi, rdx                ; first column past the real data
    mov         al, JSAMPLE [rdi-1]     ; last real sample of the row

    rep stosb                           ; replicate it rcx times

    pop         rcx
    pop         rax

    add         rsi, byte SIZEOF_JSAMPROW   ; next row pointer
    dec         rax
    jg          short .expandloop

.expand_end:
    pop         rcx                     ; output_cols

    ; -- h2v2_downsample

    mov         eax, r12d               ; rowctr (zero-extended into rax)
    test        rax, rax
    jle         near .return            ; rax cannot be negative here, so
                                        ; this is taken only for zero

    mov         edx, 0x00020001         ; bias pattern (low word 1, high word 2)
    movd        xmm7, edx
    pcmpeqw     xmm6, xmm6              ; all ones
    pshufd      xmm7, xmm7, 0x00        ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
    psrlw       xmm6, BYTE_BIT          ; xmm6={0xFF 0x00 0xFF 0x00 ..}

    mov         rsi, r14                ; input_data
    mov         rdi, r15                ; output_data
.rowloop:
    push        rcx                     ; output_cols is consumed per row
    push        rdi                     ; row-pointer cursors are reused below
    push        rsi

    mov         rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
    mov         rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1
    mov         rdi, JSAMPROW [rdi]                    ; outptr

    cmp         rcx, byte SIZEOF_XMMWORD
    jae         short .columnloop

    ; Tail: fewer than SIZEOF_XMMWORD output columns remain.  Only one vector
    ; per input row is loaded; the second half of the result is forced to
    ; zero and rcx is set so that the subtraction below ends the column loop.
.columnloop_r8:
    movdqa      xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
    movdqa      xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    pxor        xmm2, xmm2
    pxor        xmm3, xmm3
    mov         rcx, SIZEOF_XMMWORD
    jmp         short .downsample

    ; Main path: two vectors from each input row produce one output vector.
.columnloop:
    movdqa      xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]
    movdqa      xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]
    movdqa      xmm2, XMMWORD [rdx+1*SIZEOF_XMMWORD]
    movdqa      xmm3, XMMWORD [rsi+1*SIZEOF_XMMWORD]

.downsample:
    ; First half: horizontal pair sums of row 0 (xmm0) and row 1 (xmm1).
    movdqa      xmm4, xmm0
    movdqa      xmm5, xmm1
    pand        xmm0, xmm6              ; even-indexed samples as words
    psrlw       xmm4, BYTE_BIT          ; odd-indexed samples as words
    pand        xmm1, xmm6
    psrlw       xmm5, BYTE_BIT
    paddw       xmm0, xmm4
    paddw       xmm1, xmm5

    ; Second half: the same for the next vector of row 0 (xmm2) and
    ; row 1 (xmm3).
    movdqa      xmm4, xmm2
    movdqa      xmm5, xmm3
    pand        xmm2, xmm6
    psrlw       xmm4, BYTE_BIT
    pand        xmm3, xmm6
    psrlw       xmm5, BYTE_BIT
    paddw       xmm2, xmm4
    paddw       xmm3, xmm5

    paddw       xmm0, xmm1              ; row 0 + row 1: sum of each 2x2 group
    paddw       xmm2, xmm3
    paddw       xmm0, xmm7              ; add the alternating bias
    paddw       xmm2, xmm7
    psrlw       xmm0, 2                 ; divide by 4
    psrlw       xmm2, 2

    packuswb    xmm0, xmm2              ; words back to samples

    movdqa      XMMWORD [rdi+0*SIZEOF_XMMWORD], xmm0

    sub         rcx, byte SIZEOF_XMMWORD    ; outcol
    add         rdx, byte 2*SIZEOF_XMMWORD  ; inptr0
    add         rsi, byte 2*SIZEOF_XMMWORD  ; inptr1
    add         rdi, byte 1*SIZEOF_XMMWORD  ; outptr
    cmp         rcx, byte SIZEOF_XMMWORD
    jae         near .columnloop
    test        rcx, rcx
    jnz         near .columnloop_r8     ; a partial vector is still pending

    pop         rsi
    pop         rdi
    pop         rcx

    add         rsi, byte 2*SIZEOF_JSAMPROW  ; input_data (two rows consumed)
    add         rdi, byte 1*SIZEOF_JSAMPROW  ; output_data
    dec         rax                          ; rowctr
    jg          near .rowloop

.return:
    uncollect_args 6
    pop         rbp
    ret
328
329; For some reason, the OS X linker does not honor the request to align the
330; segment unless we do this.
331    align       32
332