;
; jquanti.asm - sample data conversion and quantization (AVX2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, 2018, D. R. Commander.
; Copyright (C) 2016, Matthieu Darbois.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) and
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32
;
; Load data into workspace, applying unsigned->signed conversion
;
; GLOBAL(void)
; jsimd_convsamp_avx2(JSAMPARRAY sample_data, JDIMENSION start_col,
;                     DCTELEM *workspace);
;
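; Roughly equivalent scalar C (a sketch for reference, not the library's
; actual code; CENTERJSAMPLE is libjpeg's sample center value, 128 for
; 8-bit samples):
;
;   for (int row = 0; row < DCTSIZE; row++) {
;     JSAMPROW p = sample_data[row] + start_col;
;     for (int col = 0; col < DCTSIZE; col++)
;       workspace[row * DCTSIZE + col] = (DCTELEM)p[col] - CENTERJSAMPLE;
;   }
;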

%define sample_data  ebp + 8            ; JSAMPARRAY sample_data
%define start_col    ebp + 12           ; JDIMENSION start_col
%define workspace    ebp + 16           ; DCTELEM *workspace

    align       32
    GLOBAL_FUNCTION(jsimd_convsamp_avx2)

EXTN(jsimd_convsamp_avx2):
    push        ebp
    mov         ebp, esp
    push        ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    mov         esi, JSAMPARRAY [sample_data]  ; (JSAMPROW *)
    mov         eax, JDIMENSION [start_col]
    mov         edi, POINTER [workspace]       ; (DCTELEM *)

    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    movq        xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
    movq        xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]

    mov         ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    mov         edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    movq        xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
    movq        xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]

    mov         ebx, JSAMPROW [esi+4*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    mov         edx, JSAMPROW [esi+5*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    movq        xmm4, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
    movq        xmm5, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]

    mov         ebx, JSAMPROW [esi+6*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    mov         edx, JSAMPROW [esi+7*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    movq        xmm6, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
    movq        xmm7, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]

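; gather pairs of rows: rows 0-1 into ymm0, 2-3 into ymm2, 4-5 into ymm4,
; and 6-7 into ymm6 (one 8-sample row per 128-bit lane)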
    vinserti128 ymm0, ymm0, xmm1, 1
    vinserti128 ymm2, ymm2, xmm3, 1
    vinserti128 ymm4, ymm4, xmm5, 1
    vinserti128 ymm6, ymm6, xmm7, 1

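; interleaving with zero widens each 8-bit sample to a 16-bit word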
    vpxor       ymm1, ymm1, ymm1        ; ymm1=(all 0's)
    vpunpcklbw  ymm0, ymm0, ymm1
    vpunpcklbw  ymm2, ymm2, ymm1
    vpunpcklbw  ymm4, ymm4, ymm1
    vpunpcklbw  ymm6, ymm6, ymm1

    vpcmpeqw    ymm7, ymm7, ymm7
    vpsllw      ymm7, ymm7, 7           ; ymm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
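; adding 0xFF80 (-128 as a 16-bit word) performs the unsigned->signed
; conversion, recentering the samples around zero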

    vpaddw      ymm0, ymm0, ymm7
    vpaddw      ymm2, ymm2, ymm7
    vpaddw      ymm4, ymm4, ymm7
    vpaddw      ymm6, ymm6, ymm7

    vmovdqu     YMMWORD [YMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], ymm0
    vmovdqu     YMMWORD [YMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], ymm2
    vmovdqu     YMMWORD [YMMBLOCK(4,0,edi,SIZEOF_DCTELEM)], ymm4
    vmovdqu     YMMWORD [YMMBLOCK(6,0,edi,SIZEOF_DCTELEM)], ymm6

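; zero the upper ymm halves to avoid AVX-SSE transition penalties in the
; caller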
    vzeroupper
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    pop         ebx
    pop         ebp
    ret

; --------------------------------------------------------------------------
;
; Quantize/descale the coefficients, and store into coef_block
;
; This implementation is based on an algorithm described in
;   "How to optimize for the Pentium family of microprocessors"
;   (http://www.agner.org/assem/).
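;
; In outline, each coefficient x is quantized using three per-coefficient
; table entries (reciprocal r, correction c, scale s) as
;
;   y = sign(x) * (((((|x| + c) * r) >> 16) * s) >> 16)
;
; so two unsigned high-word multiplies replace a per-coefficient integer
; division.  (This is a sketch of the idea; the divisor tables themselves
; are built by the library's C code.)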
;
; GLOBAL(void)
; jsimd_quantize_avx2(JCOEFPTR coef_block, DCTELEM *divisors,
;                     DCTELEM *workspace);
;

%define RECIPROCAL(m, n, b) \
  YMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
%define CORRECTION(m, n, b) \
  YMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
%define SCALE(m, n, b) \
  YMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)
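; divisors points to three consecutive 64-element DCTELEM tables
; (reciprocals, corrections with the rounding factor folded in, and
; scales), hence the DCTSIZE * 0/1/2 row offsets above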

%define coef_block  ebp + 8             ; JCOEFPTR coef_block
%define divisors    ebp + 12            ; DCTELEM *divisors
%define workspace   ebp + 16            ; DCTELEM *workspace

    align       32
    GLOBAL_FUNCTION(jsimd_quantize_avx2)

EXTN(jsimd_quantize_avx2):
    push        ebp
    mov         ebp, esp
;   push        ebx                     ; unused
;   push        ecx                     ; unused
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    mov         esi, POINTER [workspace]
    mov         edx, POINTER [divisors]
    mov         edi, JCOEFPTR [coef_block]

    vmovdqu     ymm4, YMMWORD [YMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
    vmovdqu     ymm5, YMMWORD [YMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
    vmovdqu     ymm6, YMMWORD [YMMBLOCK(4,0,esi,SIZEOF_DCTELEM)]
    vmovdqu     ymm7, YMMWORD [YMMBLOCK(6,0,esi,SIZEOF_DCTELEM)]
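; work on magnitudes; the original signs are reapplied below with vpsignw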
    vpabsw      ymm0, ymm4
    vpabsw      ymm1, ymm5
    vpabsw      ymm2, ymm6
    vpabsw      ymm3, ymm7

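; the two vpmulhuw stages below are the ">> 16" multiplies from the outline
; above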
    vpaddw      ymm0, ymm0, YMMWORD [CORRECTION(0,0,edx)]  ; correction + roundfactor
    vpaddw      ymm1, ymm1, YMMWORD [CORRECTION(2,0,edx)]
    vpaddw      ymm2, ymm2, YMMWORD [CORRECTION(4,0,edx)]
    vpaddw      ymm3, ymm3, YMMWORD [CORRECTION(6,0,edx)]
    vpmulhuw    ymm0, ymm0, YMMWORD [RECIPROCAL(0,0,edx)]  ; reciprocal
    vpmulhuw    ymm1, ymm1, YMMWORD [RECIPROCAL(2,0,edx)]
    vpmulhuw    ymm2, ymm2, YMMWORD [RECIPROCAL(4,0,edx)]
    vpmulhuw    ymm3, ymm3, YMMWORD [RECIPROCAL(6,0,edx)]
    vpmulhuw    ymm0, ymm0, YMMWORD [SCALE(0,0,edx)]       ; scale
    vpmulhuw    ymm1, ymm1, YMMWORD [SCALE(2,0,edx)]
    vpmulhuw    ymm2, ymm2, YMMWORD [SCALE(4,0,edx)]
    vpmulhuw    ymm3, ymm3, YMMWORD [SCALE(6,0,edx)]

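; restore the original signs (vpsignw also forces a result of zero wherever
; the source coefficient was zero)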
    vpsignw     ymm0, ymm0, ymm4
    vpsignw     ymm1, ymm1, ymm5
    vpsignw     ymm2, ymm2, ymm6
    vpsignw     ymm3, ymm3, ymm7

    vmovdqu     YMMWORD [YMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], ymm0
    vmovdqu     YMMWORD [YMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], ymm1
    vmovdqu     YMMWORD [YMMBLOCK(4,0,edi,SIZEOF_DCTELEM)], ymm2
    vmovdqu     YMMWORD [YMMBLOCK(6,0,edi,SIZEOF_DCTELEM)], ymm3

    vzeroupper
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; unused
;   pop         ebx                     ; unused
    pop         ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32
189