1;
2; jquanti.asm - sample data conversion and quantization (AVX2)
3;
4; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5; Copyright (C) 2016, 2018, D. R. Commander.
6; Copyright (C) 2016, Matthieu Darbois.
7;
8; Based on the x86 SIMD extension for IJG JPEG library
9; Copyright (C) 1999-2006, MIYASAKA Masaru.
10; For conditions of distribution and use, see copyright notice in jsimdext.inc
11;
12; This file should be assembled with NASM (Netwide Assembler),
13; can *not* be assembled with Microsoft's MASM or any compatible
14; assembler (including Borland's Turbo Assembler).
15; NASM is available from http://nasm.sourceforge.net/ or
16; http://sourceforge.net/project/showfiles.php?group_id=6208
17;
18; [TAB8]
19
20%include "jsimdext.inc"
21%include "jdct.inc"
22
; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32
;
; Load data into workspace, applying unsigned->signed conversion
;
; GLOBAL(void)
; jsimd_convsamp_avx2(JSAMPARRAY sample_data, JDIMENSION start_col,
;                     DCTELEM *workspace);
;
; Args:   all three arguments are read from the stack relative to ebp; the
;         plain "ret" leaves their removal to the caller.
;   sample_data  array of row pointers; entries 0..7 are dereferenced
;   start_col    index of the first sample taken from every row
;   workspace    destination; written with unaligned stores, so no
;                particular alignment is required by this routine
;
; Operation: 8 bytes are fetched from each of the 8 rows, every byte is
;         zero-extended to a 16-bit word, 0xFF80 (-128) is added to each
;         word, and the result is written with four 32-byte stores (two
;         rows per store).
;
; Saved:  ebp, ebx, esi, edi
; Scratch: eax, edx, ymm0-ymm7 (upper halves cleared by vzeroupper on exit)
;

%define sample_data  ebp + 8            ; JSAMPARRAY sample_data
%define start_col    ebp + 12           ; JDIMENSION start_col
%define workspace    ebp + 16           ; DCTELEM *workspace

    align       32
    GLOBAL_FUNCTION(jsimd_convsamp_avx2)

EXTN(jsimd_convsamp_avx2):
    push        ebp
    mov         ebp, esp
    push        ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    mov         esi, JSAMPARRAY [sample_data]  ; (JSAMPROW *)
    mov         eax, JDIMENSION [start_col]
    mov         edi, POINTER [workspace]       ; (DCTELEM *)

    ; Fetch 8 bytes from each row.  The VEX-encoded vmovq is used instead of
    ; the legacy SSE movq so that no non-VEX instruction is mixed into this
    ; AVX2 routine: vmovq clears bits 64..255 of its destination, which makes
    ; the loads independent of whatever the caller left in the upper halves
    ; of the YMM registers.

    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    vmovq       xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]  ; xmm0=row 0
    vmovq       xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]  ; xmm1=row 1

    mov         ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    mov         edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    vmovq       xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]  ; xmm2=row 2
    vmovq       xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]  ; xmm3=row 3

    mov         ebx, JSAMPROW [esi+4*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    mov         edx, JSAMPROW [esi+5*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    vmovq       xmm4, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]  ; xmm4=row 4
    vmovq       xmm5, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]  ; xmm5=row 5

    mov         ebx, JSAMPROW [esi+6*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    mov         edx, JSAMPROW [esi+7*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    vmovq       xmm6, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]  ; xmm6=row 6
    vmovq       xmm7, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]  ; xmm7=row 7

    ; Pair the rows: even row in the low 128-bit lane, odd row in the high
    ; lane of the same YMM register.
    vinserti128 ymm0, ymm0, xmm1, 1     ; ymm0=(row 0 | row 1)
    vinserti128 ymm2, ymm2, xmm3, 1     ; ymm2=(row 2 | row 3)
    vinserti128 ymm4, ymm4, xmm5, 1     ; ymm4=(row 4 | row 5)
    vinserti128 ymm6, ymm6, xmm7, 1     ; ymm6=(row 6 | row 7)

    ; Zero-extend bytes to words.  vpunpcklbw works per 128-bit lane, so the
    ; low 8 bytes of each lane become 8 words in that lane.
    vpxor       ymm1, ymm1, ymm1        ; ymm1=(all 0's)
    vpunpcklbw  ymm0, ymm0, ymm1
    vpunpcklbw  ymm2, ymm2, ymm1
    vpunpcklbw  ymm4, ymm4, ymm1
    vpunpcklbw  ymm6, ymm6, ymm1

    ; Build the bias constant without touching memory: all ones shifted left
    ; by 7 leaves 0xFF80 (-128) in every word.
    vpcmpeqw    ymm7, ymm7, ymm7
    vpsllw      ymm7, ymm7, 7           ; ymm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}

    vpaddw      ymm0, ymm0, ymm7        ; sample - 128
    vpaddw      ymm2, ymm2, ymm7
    vpaddw      ymm4, ymm4, ymm7
    vpaddw      ymm6, ymm6, ymm7

    vmovdqu     YMMWORD [YMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], ymm0
    vmovdqu     YMMWORD [YMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], ymm2
    vmovdqu     YMMWORD [YMMBLOCK(4,0,edi,SIZEOF_DCTELEM)], ymm4
    vmovdqu     YMMWORD [YMMBLOCK(6,0,edi,SIZEOF_DCTELEM)], ymm6

    vzeroupper                          ; leave clean upper halves for callers
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    pop         ebx
    pop         ebp
    ret
106
; --------------------------------------------------------------------------
;
; Quantize/descale the coefficients, and store into coef_block
;
; This implementation is based on an algorithm described in
;   "How to optimize for the Pentium family of microprocessors"
;   (http://www.agner.org/assem/).
;
; GLOBAL(void)
; jsimd_quantize_avx2(JCOEFPTR coef_block, DCTELEM *divisors,
;                     DCTELEM *workspace);
;
; Args:   all three arguments are read from the stack relative to ebp; the
;         plain "ret" leaves their removal to the caller.
;   coef_block  destination of the quantized values (four 32-byte stores)
;   divisors    base of three tables addressed through the RECIPROCAL,
;               CORRECTION and SCALE macros below
;   workspace   source coefficients (four 32-byte loads)
;
; Per 16-bit element:
;   t      = |workspace| + correction
;   t      = high word of the unsigned product (t * reciprocal)
;   t      = high word of the unsigned product (t * scale)
;   result = t with the sign of the workspace element applied
;            (vpsignw also yields 0 where the workspace element is 0)
;
; Saved:  ebp, esi, edi
; Scratch: edx, ymm0-ymm7 (upper halves cleared by vzeroupper on exit)
;

; The three tables start DCTSIZE * 0, DCTSIZE * 1 and DCTSIZE * 2 rows into
; the divisors area, as expressed through YMMBLOCK's first argument.
%define RECIPROCAL(m, n, b) \
  YMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
%define CORRECTION(m, n, b) \
  YMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
%define SCALE(m, n, b) \
  YMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)

%define coef_block  ebp + 8             ; JCOEFPTR coef_block
%define divisors    ebp + 12            ; DCTELEM *divisors
%define workspace   ebp + 16            ; DCTELEM *workspace

    align       32
    GLOBAL_FUNCTION(jsimd_quantize_avx2)

EXTN(jsimd_quantize_avx2):
    push        ebp
    mov         ebp, esp
    push        esi                     ; ebx and ecx are never touched,
    push        edi                     ; edx need not be preserved

    mov         edi, JCOEFPTR [coef_block]      ; edi=output block
    mov         edx, POINTER [divisors]         ; edx=divisor tables
    mov         esi, POINTER [workspace]        ; esi=input coefficients

    ; Keep the signed inputs in ymm4-ymm7 for the sign fix-up at the end;
    ; ymm0-ymm3 carry the magnitudes through the arithmetic.
    vmovdqu     ymm4, YMMWORD [YMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
    vmovdqu     ymm5, YMMWORD [YMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
    vmovdqu     ymm6, YMMWORD [YMMBLOCK(4,0,esi,SIZEOF_DCTELEM)]
    vmovdqu     ymm7, YMMWORD [YMMBLOCK(6,0,esi,SIZEOF_DCTELEM)]
    vpabsw      ymm0, ymm4              ; ymm0=|in(0,1)|
    vpabsw      ymm1, ymm5              ; ymm1=|in(2,3)|
    vpabsw      ymm2, ymm6              ; ymm2=|in(4,5)|
    vpabsw      ymm3, ymm7              ; ymm3=|in(6,7)|

    ; magnitude += correction (the table entry already includes the
    ; round factor)
    vpaddw      ymm0, ymm0, YMMWORD [CORRECTION(0,0,edx)]
    vpaddw      ymm1, ymm1, YMMWORD [CORRECTION(2,0,edx)]
    vpaddw      ymm2, ymm2, YMMWORD [CORRECTION(4,0,edx)]
    vpaddw      ymm3, ymm3, YMMWORD [CORRECTION(6,0,edx)]

    ; magnitude = high 16 bits of (magnitude * reciprocal), unsigned
    vpmulhuw    ymm0, ymm0, YMMWORD [RECIPROCAL(0,0,edx)]
    vpmulhuw    ymm1, ymm1, YMMWORD [RECIPROCAL(2,0,edx)]
    vpmulhuw    ymm2, ymm2, YMMWORD [RECIPROCAL(4,0,edx)]
    vpmulhuw    ymm3, ymm3, YMMWORD [RECIPROCAL(6,0,edx)]

    ; magnitude = high 16 bits of (magnitude * scale), unsigned
    vpmulhuw    ymm0, ymm0, YMMWORD [SCALE(0,0,edx)]
    vpmulhuw    ymm1, ymm1, YMMWORD [SCALE(2,0,edx)]
    vpmulhuw    ymm2, ymm2, YMMWORD [SCALE(4,0,edx)]
    vpmulhuw    ymm3, ymm3, YMMWORD [SCALE(6,0,edx)]

    ; Give each result the sign of its original coefficient.
    vpsignw     ymm0, ymm0, ymm4
    vpsignw     ymm1, ymm1, ymm5
    vpsignw     ymm2, ymm2, ymm6
    vpsignw     ymm3, ymm3, ymm7

    vmovdqu     YMMWORD [YMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], ymm0
    vmovdqu     YMMWORD [YMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], ymm1
    vmovdqu     YMMWORD [YMMBLOCK(4,0,edi,SIZEOF_DCTELEM)], ymm2
    vmovdqu     YMMWORD [YMMBLOCK(6,0,edi,SIZEOF_DCTELEM)], ymm3

    vzeroupper                          ; leave clean upper halves for callers
    pop         edi
    pop         esi
    pop         ebp
    ret

; The OS X linker ignores the segment alignment request unless the segment
; also ends with an explicit alignment directive.
    align       32
191