1;
2; jquanti.asm - sample data conversion and quantization (64-bit SSE2)
3;
4; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5; Copyright (C) 2009, 2016, D. R. Commander.
6;
7; Based on the x86 SIMD extension for IJG JPEG library
8; Copyright (C) 1999-2006, MIYASAKA Masaru.
9; For conditions of distribution and use, see copyright notice in jsimdext.inc
10;
11; This file should be assembled with NASM (Netwide Assembler); it
12; can *not* be assembled with Microsoft's MASM or any compatible
13; assembler (including Borland's Turbo Assembler).
14; NASM is available from http://nasm.sourceforge.net/ or
15; http://sourceforge.net/project/showfiles.php?group_id=6208
16;
17; [TAB8]
18
19%include "jsimdext.inc"
20%include "jdct.inc"
21
22; --------------------------------------------------------------------------
23    SECTION     SEG_TEXT
24    BITS        64
25; Load DCTSIZE rows of 8 one-byte samples into workspace, applying
26; unsigned->signed conversion: zero-extend each byte to a word, add 0xFF80.
27;
28; GLOBAL(void)
29; jsimd_convsamp_sse2(JSAMPARRAY sample_data, JDIMENSION start_col,
30;                     DCTELEM *workspace);
31;
32; Argument registers below are those in effect after "collect_args 3":
33; r10 = JSAMPARRAY sample_data
34; r11d = JDIMENSION start_col
35; r12 = DCTELEM *workspace
36; NOTE(review): output uses movdqa, so workspace presumably must be 16-byte aligned.
37    align       32
38    GLOBAL_FUNCTION(jsimd_convsamp_sse2)
39
40EXTN(jsimd_convsamp_sse2):
41    push        rbp
42    mov         rax, rsp                ; NOTE(review): presumably for collect_args; confirm
43    mov         rbp, rsp
44    collect_args 3
45    push        rbx                     ; saved: rbx is a scratch row pointer below
46; NOTE(review): xmm6/xmm7 are overwritten (callee-saved on Win64); confirm collect_args saves them.
47    pxor        xmm6, xmm6              ; xmm6=(all 0's)
48    pcmpeqw     xmm7, xmm7
49    psllw       xmm7, 7                 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
50
51    mov         rsi, r10                ; rsi = sample_data (array of row pointers)
52    mov         eax, r11d               ; rax = start_col (upper 32 bits zeroed)
53    mov         rdi, r12                ; rdi = workspace (output)
54    mov         rcx, DCTSIZE/4          ; rcx = loop count (4 rows per pass)
55.convloop:
56    mov         rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
57    mov         rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
58
59    movq        xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]  ; xmm0=(01234567)
60    movq        xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]  ; xmm1=(89ABCDEF)
61
62    mov         rbx, JSAMPROW [rsi+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
63    mov         rdx, JSAMPROW [rsi+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
64
65    movq        xmm2, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]  ; xmm2=(GHIJKLMN)
66    movq        xmm3, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]  ; xmm3=(OPQRSTUV)
67; Interleaving with the zero register widens each byte to a word; paddw then adds 0xFF80.
68    punpcklbw   xmm0, xmm6              ; xmm0=(01234567)
69    punpcklbw   xmm1, xmm6              ; xmm1=(89ABCDEF)
70    paddw       xmm0, xmm7
71    paddw       xmm1, xmm7
72    punpcklbw   xmm2, xmm6              ; xmm2=(GHIJKLMN)
73    punpcklbw   xmm3, xmm6              ; xmm3=(OPQRSTUV)
74    paddw       xmm2, xmm7
75    paddw       xmm3, xmm7
76; Store the four converted rows into the workspace.
77    movdqa      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
78    movdqa      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
79    movdqa      XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
80    movdqa      XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3
81; Step past the four row pointers just used and the four output rows just written.
82    add         rsi, byte 4*SIZEOF_JSAMPROW
83    add         rdi, byte 4*DCTSIZE*SIZEOF_DCTELEM
84    dec         rcx
85    jnz         short .convloop
86
87    pop         rbx
88    uncollect_args 3
89    pop         rbp
90    ret
91
92; --------------------------------------------------------------------------
93;
94; Quantize/descale the coefficients, and store into coef_block
95;
96; This implementation is based on an algorithm described in
97;   "How to optimize for the Pentium family of microprocessors"
98;   (http://www.agner.org/assem/).
99;
100; GLOBAL(void)
101; jsimd_quantize_sse2(JCOEFPTR coef_block, DCTELEM *divisors,
102;                     DCTELEM *workspace);
103;
104; Three sub-tables of one base b: XMMBLOCK's first argument is biased by DCTSIZE*0, *1, *2.
105%define RECIPROCAL(m, n, b) \
106  XMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
107%define CORRECTION(m, n, b) \
108  XMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
109%define SCALE(m, n, b) \
110  XMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)
111; Argument registers below are those in effect after "collect_args 3":
112; r10 = JCOEFPTR coef_block
113; r11 = DCTELEM *divisors
114; r12 = DCTELEM *workspace
115; Per word: |x| + correction, two pmulhuw (reciprocal, then scale), then x's sign is restored.
116    align       32
117    GLOBAL_FUNCTION(jsimd_quantize_sse2)
118
119EXTN(jsimd_quantize_sse2):
120    push        rbp
121    mov         rax, rsp                ; NOTE(review): presumably for collect_args; confirm
122    mov         rbp, rsp
123    collect_args 3
124; NOTE(review): xmm6/xmm7 are overwritten (callee-saved on Win64); confirm collect_args saves them.
125    mov         rsi, r12                ; rsi = workspace (input)
126    mov         rdx, r11                ; rdx = divisors
127    mov         rdi, r10                ; rdi = coef_block (output)
128    mov         rax, DCTSIZE2/32        ; rax = loop count (32 words per pass)
129.quantloop:
130    movdqa      xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_DCTELEM)]
131    movdqa      xmm5, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_DCTELEM)]
132    movdqa      xmm6, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_DCTELEM)]
133    movdqa      xmm7, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_DCTELEM)]
134    movdqa      xmm0, xmm4              ; xmm0-3 = working copies of the inputs
135    movdqa      xmm1, xmm5
136    movdqa      xmm2, xmm6
137    movdqa      xmm3, xmm7
138    psraw       xmm4, (WORD_BIT-1)      ; xmm4-7 = 0/-1 sign masks (assuming WORD_BIT is 16)
139    psraw       xmm5, (WORD_BIT-1)
140    psraw       xmm6, (WORD_BIT-1)
141    psraw       xmm7, (WORD_BIT-1)
142    pxor        xmm0, xmm4
143    pxor        xmm1, xmm5
144    pxor        xmm2, xmm6
145    pxor        xmm3, xmm7
146    psubw       xmm0, xmm4              ; if (xmm0 < 0) xmm0 = -xmm0;
147    psubw       xmm1, xmm5              ; if (xmm1 < 0) xmm1 = -xmm1;
148    psubw       xmm2, xmm6              ; if (xmm2 < 0) xmm2 = -xmm2;
149    psubw       xmm3, xmm7              ; if (xmm3 < 0) xmm3 = -xmm3;
150; Each pmulhuw keeps the high 16 bits of an unsigned 16x16 product.
151    paddw       xmm0, XMMWORD [CORRECTION(0,0,rdx)]  ; correction + roundfactor
152    paddw       xmm1, XMMWORD [CORRECTION(1,0,rdx)]
153    paddw       xmm2, XMMWORD [CORRECTION(2,0,rdx)]
154    paddw       xmm3, XMMWORD [CORRECTION(3,0,rdx)]
155    pmulhuw     xmm0, XMMWORD [RECIPROCAL(0,0,rdx)]  ; reciprocal
156    pmulhuw     xmm1, XMMWORD [RECIPROCAL(1,0,rdx)]
157    pmulhuw     xmm2, XMMWORD [RECIPROCAL(2,0,rdx)]
158    pmulhuw     xmm3, XMMWORD [RECIPROCAL(3,0,rdx)]
159    pmulhuw     xmm0, XMMWORD [SCALE(0,0,rdx)]       ; scale
160    pmulhuw     xmm1, XMMWORD [SCALE(1,0,rdx)]
161    pmulhuw     xmm2, XMMWORD [SCALE(2,0,rdx)]
162    pmulhuw     xmm3, XMMWORD [SCALE(3,0,rdx)]
163; Restore the sign: (x ^ mask) - mask negates x where mask is all ones, else leaves it.
164    pxor        xmm0, xmm4
165    pxor        xmm1, xmm5
166    pxor        xmm2, xmm6
167    pxor        xmm3, xmm7
168    psubw       xmm0, xmm4
169    psubw       xmm1, xmm5
170    psubw       xmm2, xmm6
171    psubw       xmm3, xmm7
172    movdqa      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
173    movdqa      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
174    movdqa      XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
175    movdqa      XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3
176; Advance input, divisor tables and output by 32 elements each.
177    add         rsi, byte 32*SIZEOF_DCTELEM
178    add         rdx, byte 32*SIZEOF_DCTELEM
179    add         rdi, byte 32*SIZEOF_JCOEF
180    dec         rax
181    jnz         near .quantloop
182
183    uncollect_args 3
184    pop         rbp
185    ret
186
187; For some reason, the OS X linker does not honor the request to align the
188; segment unless we do this.
189    align       32
190