1;
2; jcsample.asm - downsampling (64-bit AVX2)
3;
4; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5; Copyright (C) 2009, 2016, D. R. Commander.
6; Copyright (C) 2015, Intel Corporation.
7;
8; Based on the x86 SIMD extension for IJG JPEG library
9; Copyright (C) 1999-2006, MIYASAKA Masaru.
10; For conditions of distribution and use, see copyright notice in jsimdext.inc
11;
12; This file should be assembled with NASM (Netwide Assembler),
13; can *not* be assembled with Microsoft's MASM or any compatible
14; assembler (including Borland's Turbo Assembler).
15; NASM is available from http://nasm.sourceforge.net/ or
16; http://sourceforge.net/project/showfiles.php?group_id=6208
17;
18; [TAB8]
19
20%include "jsimdext.inc"
21
22; --------------------------------------------------------------------------
23    SECTION     SEG_TEXT
24    BITS        64
25;
26; Downsample pixel values of a single component.
27; This version handles the common case of 2:1 horizontal and 1:1 vertical,
28; without smoothing.
29;
30; GLOBAL(void)
31; jsimd_h2v1_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
32;                            JDIMENSION v_samp_factor,
33;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
34;                            JSAMPARRAY output_data);
35;
36
37; r10d = JDIMENSION image_width
38; r11 = int max_v_samp_factor
39; r12d = JDIMENSION v_samp_factor
40; r13d = JDIMENSION width_in_blocks
41; r14 = JSAMPARRAY input_data
42; r15 = JSAMPARRAY output_data
43
; Implementation notes for jsimd_h2v1_downsample_avx2:
;
;   1. If output_cols * 2 exceeds image_width, every one of the first
;      max_v_samp_factor input rows is padded on the right by replicating its
;      last real sample, so that the averaging loop can read whole vectors.
;   2. Each output sample is (even + odd + bias) >> 1, where the bias
;      alternates 0, 1 from one output column to the next.
;
; Register roles in the main loop:
;   rax = remaining row count          rcx = remaining output columns
;   rsi = input row pointer / inptr    rdi = output row pointer / outptr
;   ymm6 = 0x00FF in every word        ymm7 = bias words {0, 1, 0, 1, ...}
;
; The store at the end of .downsample always writes a full YMMWORD (32 bytes),
; including for the 8/16/24-column tail, so output rows must have room for it.

    align       32
    GLOBAL_FUNCTION(jsimd_h2v1_downsample_avx2)

EXTN(jsimd_h2v1_downsample_avx2):
    push        rbp
    mov         rax, rsp                ; NOTE(review): not read in this body
                                        ; before being overwritten; presumably
                                        ; consumed by collect_args -- confirm
                                        ; in jsimdext.inc before removing
    mov         rbp, rsp
    collect_args 6

    mov         ecx, r13d
    shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
    jz          near .return            ; nothing to do for zero blocks

    mov         edx, r10d               ; rdx = image_width (zero-extended)

    ; -- expand_right_edge

    push        rcx                     ; keep output_cols across this section
    shl         rcx, 1                  ; output_cols * 2
    sub         rcx, rdx                ; rcx = number of padding bytes per row
    jle         short .expand_end       ; input is already wide enough

    ; max_v_samp_factor is a C int, so only the low 32 bits of r11 are
    ; meaningful.  Read r11d (which zero-extends into rax) and test it as a
    ; 32-bit signed value instead of trusting the upper half of the register.
    mov         eax, r11d               ; rax = number of rows to pad
    test        eax, eax
    jle         short .expand_end

    cld                                 ; rep stosb must store forward
    mov         rsi, r14                ; input_data
.expandloop:
    push        rax                     ; al is about to hold the fill value
    push        rcx                     ; rep stosb consumes rcx

    mov         rdi, JSAMPROW [rsi]
    add         rdi, rdx                ; rdi = first byte past the real samples
    mov         al, JSAMPLE [rdi-1]     ; al = last real sample of this row

    rep stosb                           ; replicate it rcx times

    pop         rcx
    pop         rax

    add         rsi, byte SIZEOF_JSAMPROW
    dec         rax
    jg          short .expandloop

.expand_end:
    pop         rcx                     ; output_cols

    ; -- h2v1_downsample

    mov         eax, r12d               ; rowctr
    test        eax, eax
    jle         near .return            ; zero (or sign bit set): no rows

    mov         edx, 0x00010000         ; bias pattern
    vmovd       xmm7, edx
    vpshufd     xmm7, xmm7, 0x00        ; xmm7={0, 1, 0, 1, 0, 1, 0, 1}
    vperm2i128  ymm7, ymm7, ymm7, 0     ; ymm7={xmm7, xmm7}
    vpcmpeqw    ymm6, ymm6, ymm6
    vpsrlw      ymm6, ymm6, BYTE_BIT    ; ymm6={0xFF 0x00 0xFF 0x00 ..}

    mov         rsi, r14                ; input_data
    mov         rdi, r15                ; output_data
.rowloop:
    push        rcx                     ; the column loop consumes rcx/rsi/rdi
    push        rdi
    push        rsi

    mov         rsi, JSAMPROW [rsi]     ; inptr
    mov         rdi, JSAMPROW [rdi]     ; outptr

    cmp         rcx, byte SIZEOF_YMMWORD
    jae         short .columnloop

    ; Tail handling: fewer than SIZEOF_YMMWORD output columns remain.  Load
    ; only the input that belongs to them, zero the rest, and force rcx to
    ; SIZEOF_YMMWORD so that the subtraction after .downsample leaves zero.
.columnloop_r24:
    ; rcx can possibly be 8, 16, 24
    cmp         rcx, 24
    jne         .columnloop_r16
    vmovdqu     ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     xmm1, XMMWORD [rsi+1*SIZEOF_YMMWORD]  ; upper half of ymm1 = 0
    mov         rcx, SIZEOF_YMMWORD
    jmp         short .downsample

.columnloop_r16:
    cmp         rcx, 16
    jne         .columnloop_r8
    vmovdqu     ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vpxor       ymm1, ymm1, ymm1
    mov         rcx, SIZEOF_YMMWORD
    jmp         short .downsample

.columnloop_r8:
    vmovdqu     xmm0, XMMWORD[rsi+0*SIZEOF_YMMWORD]   ; upper half of ymm0 = 0
    vpxor       ymm1, ymm1, ymm1
    mov         rcx, SIZEOF_YMMWORD
    jmp         short .downsample

.columnloop:
    vmovdqu     ymm0, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     ymm1, YMMWORD [rsi+1*SIZEOF_YMMWORD]

.downsample:
    vpsrlw      ymm2, ymm0, BYTE_BIT    ; ymm2 = odd-numbered samples (words)
    vpand       ymm0, ymm0, ymm6        ; ymm0 = even-numbered samples (words)
    vpsrlw      ymm3, ymm1, BYTE_BIT
    vpand       ymm1, ymm1, ymm6

    vpaddw      ymm0, ymm0, ymm2        ; even + odd
    vpaddw      ymm1, ymm1, ymm3
    vpaddw      ymm0, ymm0, ymm7        ; + alternating bias
    vpaddw      ymm1, ymm1, ymm7
    vpsrlw      ymm0, ymm0, 1           ; / 2
    vpsrlw      ymm1, ymm1, 1

    vpackuswb   ymm0, ymm0, ymm1        ; words -> bytes, per 128-bit lane
    vpermq      ymm0, ymm0, 0xd8        ; qwords 0,2,1,3: undo lane interleave

    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0

    sub         rcx, byte SIZEOF_YMMWORD    ; outcol
    add         rsi, byte 2*SIZEOF_YMMWORD  ; inptr
    add         rdi, byte 1*SIZEOF_YMMWORD  ; outptr
    cmp         rcx, byte SIZEOF_YMMWORD
    jae         short .columnloop
    test        rcx, rcx
    jnz         near .columnloop_r24        ; 8, 16 or 24 columns left

    pop         rsi
    pop         rdi
    pop         rcx

    add         rsi, byte SIZEOF_JSAMPROW  ; input_data
    add         rdi, byte SIZEOF_JSAMPROW  ; output_data
    dec         rax                        ; rowctr
    jg          near .rowloop

.return:
    vzeroupper                          ; clear upper YMM state before returning
    uncollect_args 6
    pop         rbp
    ret
185
186; --------------------------------------------------------------------------
187;
188; Downsample pixel values of a single component.
189; This version handles the standard case of 2:1 horizontal and 2:1 vertical,
190; without smoothing.
191;
192; GLOBAL(void)
193; jsimd_h2v2_downsample_avx2(JDIMENSION image_width, int max_v_samp_factor,
194;                            JDIMENSION v_samp_factor,
195;                            JDIMENSION width_in_blocks, JSAMPARRAY input_data,
196;                            JSAMPARRAY output_data);
197;
198
199; r10d = JDIMENSION image_width
200; r11 = int max_v_samp_factor
201; r12d = JDIMENSION v_samp_factor
202; r13d = JDIMENSION width_in_blocks
203; r14 = JSAMPARRAY input_data
204; r15 = JSAMPARRAY output_data
205
; Implementation notes for jsimd_h2v2_downsample_avx2:
;
;   1. If output_cols * 2 exceeds image_width, every one of the first
;      max_v_samp_factor input rows is padded on the right by replicating its
;      last real sample, so that the averaging loop can read whole vectors.
;   2. Each output sample is the sum of a 2x2 square of input samples (two
;      adjacent columns from two consecutive rows) plus a bias, shifted right
;      by 2.  The bias alternates 1, 2 from one output column to the next.
;
; Register roles in the main loop:
;   rax = remaining output row count   rcx = remaining output columns
;   rdx = inptr0 (upper input row)     rsi = input row pointer / inptr1
;   rdi = output row pointer / outptr
;   ymm6 = 0x00FF in every word        ymm7 = bias words {1, 2, 1, 2, ...}
;
; The store at the end of .downsample always writes a full YMMWORD (32 bytes),
; including for the 8/16/24-column tail, so output rows must have room for it.

    align       32
    GLOBAL_FUNCTION(jsimd_h2v2_downsample_avx2)

EXTN(jsimd_h2v2_downsample_avx2):
    push        rbp
    mov         rax, rsp                ; NOTE(review): not read in this body
                                        ; before being overwritten; presumably
                                        ; consumed by collect_args -- confirm
                                        ; in jsimdext.inc before removing
    mov         rbp, rsp
    collect_args 6

    mov         ecx, r13d
    shl         rcx, 3                  ; imul rcx,DCTSIZE (rcx = output_cols)
    jz          near .return            ; nothing to do for zero blocks

    mov         edx, r10d               ; rdx = image_width (zero-extended)

    ; -- expand_right_edge

    push        rcx                     ; keep output_cols across this section
    shl         rcx, 1                  ; output_cols * 2
    sub         rcx, rdx                ; rcx = number of padding bytes per row
    jle         short .expand_end       ; input is already wide enough

    ; max_v_samp_factor is a C int, so only the low 32 bits of r11 are
    ; meaningful.  Read r11d (which zero-extends into rax) and test it as a
    ; 32-bit signed value instead of trusting the upper half of the register.
    mov         eax, r11d               ; rax = number of rows to pad
    test        eax, eax
    jle         short .expand_end

    cld                                 ; rep stosb must store forward
    mov         rsi, r14                ; input_data
.expandloop:
    push        rax                     ; al is about to hold the fill value
    push        rcx                     ; rep stosb consumes rcx

    mov         rdi, JSAMPROW [rsi]
    add         rdi, rdx                ; rdi = first byte past the real samples
    mov         al, JSAMPLE [rdi-1]     ; al = last real sample of this row

    rep stosb                           ; replicate it rcx times

    pop         rcx
    pop         rax

    add         rsi, byte SIZEOF_JSAMPROW
    dec         rax
    jg          short .expandloop

.expand_end:
    pop         rcx                     ; output_cols

    ; -- h2v2_downsample

    mov         eax, r12d               ; rowctr (zero-extended into rax)
    test        rax, rax                ; rax cannot be negative here, so the
    jle         near .return            ; branch is taken only for rowctr == 0

    mov         edx, 0x00020001         ; bias pattern
    vmovd       xmm7, edx
    vpcmpeqw    ymm6, ymm6, ymm6
    vpshufd     xmm7, xmm7, 0x00        ; xmm7={1, 2, 1, 2, 1, 2, 1, 2}
    vperm2i128  ymm7, ymm7, ymm7, 0     ; ymm7={xmm7, xmm7}
    vpsrlw      ymm6, ymm6, BYTE_BIT    ; ymm6={0xFF 0x00 0xFF 0x00 ..}

    mov         rsi, r14                ; input_data
    mov         rdi, r15                ; output_data
.rowloop:
    push        rcx                     ; the column loop consumes rcx/rsi/rdi
    push        rdi
    push        rsi

    mov         rdx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; inptr0
    mov         rsi, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; inptr1
    mov         rdi, JSAMPROW [rdi]                    ; outptr

    cmp         rcx, byte SIZEOF_YMMWORD
    jae         short .columnloop

    ; Tail handling: fewer than SIZEOF_YMMWORD output columns remain (rcx is
    ; a multiple of 8, so it is 8, 16 or 24).  Load only the input that
    ; belongs to them, zero the rest, and force rcx to SIZEOF_YMMWORD so that
    ; the subtraction after .downsample leaves zero.
.columnloop_r24:
    cmp         rcx, 24
    jne         .columnloop_r16
    vmovdqu     ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD]
    vmovdqu     ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     xmm2, XMMWORD [rdx+1*SIZEOF_YMMWORD]  ; upper half of ymm2 = 0
    vmovdqu     xmm3, XMMWORD [rsi+1*SIZEOF_YMMWORD]  ; upper half of ymm3 = 0
    mov         rcx, SIZEOF_YMMWORD
    jmp         short .downsample

.columnloop_r16:
    cmp         rcx, 16
    jne         .columnloop_r8
    vmovdqu     ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD]
    vmovdqu     ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vpxor       ymm2, ymm2, ymm2
    vpxor       ymm3, ymm3, ymm3
    mov         rcx, SIZEOF_YMMWORD
    jmp         short .downsample

.columnloop_r8:
    vmovdqu     xmm0, XMMWORD [rdx+0*SIZEOF_XMMWORD]  ; upper half of ymm0 = 0
    vmovdqu     xmm1, XMMWORD [rsi+0*SIZEOF_XMMWORD]  ; upper half of ymm1 = 0
    vpxor       ymm2, ymm2, ymm2
    vpxor       ymm3, ymm3, ymm3
    mov         rcx, SIZEOF_YMMWORD
    jmp         short .downsample

.columnloop:
    vmovdqu     ymm0, YMMWORD [rdx+0*SIZEOF_YMMWORD]
    vmovdqu     ymm1, YMMWORD [rsi+0*SIZEOF_YMMWORD]
    vmovdqu     ymm2, YMMWORD [rdx+1*SIZEOF_YMMWORD]
    vmovdqu     ymm3, YMMWORD [rsi+1*SIZEOF_YMMWORD]

.downsample:
    ; First 32 input columns: horizontal pair sums for row 0 and row 1.
    vpand       ymm4, ymm0, ymm6        ; even-numbered samples of row 0
    vpsrlw      ymm0, ymm0, BYTE_BIT    ; odd-numbered samples of row 0
    vpand       ymm5, ymm1, ymm6        ; even-numbered samples of row 1
    vpsrlw      ymm1, ymm1, BYTE_BIT    ; odd-numbered samples of row 1
    vpaddw      ymm0, ymm0, ymm4
    vpaddw      ymm1, ymm1, ymm5

    ; Second 32 input columns, same treatment.
    vpand       ymm4, ymm2, ymm6
    vpsrlw      ymm2, ymm2, BYTE_BIT
    vpand       ymm5, ymm3, ymm6
    vpsrlw      ymm3, ymm3, BYTE_BIT
    vpaddw      ymm2, ymm2, ymm4
    vpaddw      ymm3, ymm3, ymm5

    vpaddw      ymm0, ymm0, ymm1        ; row 0 + row 1 = 2x2 sum
    vpaddw      ymm2, ymm2, ymm3
    vpaddw      ymm0, ymm0, ymm7        ; + alternating bias
    vpaddw      ymm2, ymm2, ymm7
    vpsrlw      ymm0, ymm0, 2           ; / 4
    vpsrlw      ymm2, ymm2, 2

    vpackuswb   ymm0, ymm0, ymm2        ; words -> bytes, per 128-bit lane
    vpermq      ymm0, ymm0, 0xd8        ; qwords 0,2,1,3: undo lane interleave

    vmovdqu     YMMWORD [rdi+0*SIZEOF_YMMWORD], ymm0

    sub         rcx, byte SIZEOF_YMMWORD    ; outcol
    add         rdx, byte 2*SIZEOF_YMMWORD  ; inptr0
    add         rsi, byte 2*SIZEOF_YMMWORD  ; inptr1
    add         rdi, byte 1*SIZEOF_YMMWORD  ; outptr
    cmp         rcx, byte SIZEOF_YMMWORD
    jae         near .columnloop
    test        rcx, rcx
    jnz         near .columnloop_r24        ; 8, 16 or 24 columns left

    pop         rsi
    pop         rdi
    pop         rcx

    add         rsi, byte 2*SIZEOF_JSAMPROW  ; input_data (two rows consumed)
    add         rdi, byte 1*SIZEOF_JSAMPROW  ; output_data
    dec         rax                          ; rowctr
    jg          near .rowloop

.return:
    vzeroupper                          ; clear upper YMM state before returning
    uncollect_args 6
    pop         rbp
    ret
365
366; For some reason, the OS X linker does not honor the request to align the
367; segment unless we do this.
368    align       32
369