;
; jquanti.asm - sample data conversion and quantization (64-bit SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, D. R. Commander.
; Copyright (C) 2018, Matthias Räncker.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler); it
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208

%include "jsimdext.inc"
%include "jdct.inc"
; The macros and constants used below (EXTN, GLOBAL_FUNCTION, collect_args,
; uncollect_args, XMMBLOCK, SEG_TEXT, the SIZEOF_* values, DCTSIZE, ...) are
; not defined in this file; presumably they come from the two includes above.

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        64
;
; Load data into workspace, applying unsigned->signed conversion
;
; GLOBAL(void)
; jsimd_convsamp_sse2(JSAMPARRAY sample_data, JDIMENSION start_col,
;                     DCTELEM *workspace);
;
; Each pass of the loop handles four sample rows, and the loop runs DCTSIZE/4
; times.  For every row, 8 samples starting at column start_col are loaded,
; widened from bytes to 16-bit words, offset by 0xFF80 (-128 as a signed
; word) and stored as one 16-byte row of the workspace.
;
; The stores use movdqa, which requires 16-byte-aligned addresses, so the
; workspace is presumably 16-byte aligned -- TODO confirm against callers.
;
; NOTE(review): this code overwrites rsi, rdi, xmm6 and xmm7, all of which
; are callee-saved in the Microsoft x64 ABI, and it receives an argument in
; r12, which is callee-saved in both the SysV and Microsoft ABIs.  Saving and
; restoring them is presumably done by collect_args/uncollect_args (not
; defined in this file) -- confirm there.
;

; Arguments after collect_args:
; r10 = JSAMPARRAY sample_data
; r11d = JDIMENSION start_col
; r12 = DCTELEM *workspace
;
; Working registers:
; rsi = cursor into the sample_data array of row pointers
; rax = start_col (upper half cleared by the 32-bit move into eax)
; rdi = workspace write cursor
; rcx = remaining loop passes
; rbx, rdx = pointers to the two rows currently being loaded
; xmm6 = all zeros, xmm7 = 0xFF80 in every word

    align       32
    GLOBAL_FUNCTION(jsimd_convsamp_sse2)

EXTN(jsimd_convsamp_sse2):
    push        rbp
    ; rax is overwritten below before this function ever reads it, so this
    ; copy of rsp is presumably there for collect_args -- TODO confirm.
    mov         rax, rsp
    mov         rbp, rsp
    collect_args 3
    push        rbx                     ; callee-saved; used as a row pointer

    pxor        xmm6, xmm6              ; xmm6=(all 0's)
    pcmpeqw     xmm7, xmm7              ; xmm7=(all 1's)
    psllw       xmm7, 7                 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}

    mov         rsi, r10
    mov         eax, r11d               ; zero-extends start_col into rax
    mov         rdi, r12
    mov         rcx, DCTSIZE/4
.convloop:
    ; Fetch row pointers 0 and 1 of this group and load 8 samples from each.
    mov         rbxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    mov         rdxp, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)

    movq        xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]  ; xmm0=(01234567)
    movq        xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]  ; xmm1=(89ABCDEF)

    ; Same for row pointers 2 and 3.
    mov         rbxp, JSAMPROW [rsi+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    mov         rdxp, JSAMPROW [rsi+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)

    movq        xmm2, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]  ; xmm2=(GHIJKLMN)
    movq        xmm3, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]  ; xmm3=(OPQRSTUV)

    ; Interleaving with the zero register widens each byte to a word; adding
    ; 0xFF80 then shifts the unsigned sample range down by 128.
    punpcklbw   xmm0, xmm6              ; xmm0=(01234567)
    punpcklbw   xmm1, xmm6              ; xmm1=(89ABCDEF)
    paddw       xmm0, xmm7
    paddw       xmm1, xmm7
    punpcklbw   xmm2, xmm6              ; xmm2=(GHIJKLMN)
    punpcklbw   xmm3, xmm6              ; xmm3=(OPQRSTUV)
    paddw       xmm2, xmm7
    paddw       xmm3, xmm7

    ; Write the four converted rows to the workspace (aligned stores).
    movdqa      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
    movdqa      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
    movdqa      XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
    movdqa      XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3

    add         rsi, byte 4*SIZEOF_JSAMPROW         ; next four row pointers
    add         rdi, byte 4*DCTSIZE*SIZEOF_DCTELEM  ; next four workspace rows
    dec         rcx
    jnz         short .convloop

    pop         rbx
    uncollect_args 3
    pop         rbp
    ret

; --------------------------------------------------------------------------
;
; Quantize/descale the coefficients, and store into coef_block
;
; This implementation is based on an algorithm described in
;   "How to optimize for the Pentium family of microprocessors"
;   (http://www.agner.org/assem/).
;
; GLOBAL(void)
; jsimd_quantize_sse2(JCOEFPTR coef_block, DCTELEM *divisors,
;                     DCTELEM *workspace);
;
; Per 16-bit coefficient x read from workspace, the loop computes:
;   s = x >> (WORD_BIT-1)       (arithmetic shift: 0, or all 1's if x < 0)
;   a = (x ^ s) - s             (absolute value of x)
;   a = a + correction          (16-bit wrapping add)
;   a = high 16 bits of the unsigned product a * reciprocal
;   a = high 16 bits of the unsigned product a * scale
;   result = (a ^ s) - s        (sign of x restored)
; and stores result into coef_block.  Each pass handles four XMM registers
; (32 words); the loop runs DCTSIZE2/32 times.
;
; All loads from workspace and stores to coef_block use movdqa, which
; requires 16-byte-aligned addresses, so both buffers are presumably 16-byte
; aligned -- TODO confirm against callers.
;
; NOTE(review): this code overwrites rsi, rdi, xmm6 and xmm7, all of which
; are callee-saved in the Microsoft x64 ABI, and it receives an argument in
; r12, which is callee-saved in both the SysV and Microsoft ABIs.  Saving and
; restoring them is presumably done by collect_args/uncollect_args (not
; defined in this file) -- confirm there.
;

; The divisors argument is addressed as three tables, selected by adding
; DCTSIZE * 0, 1 or 2 to the first XMMBLOCK index: reciprocals, then
; corrections, then scales.
%define RECIPROCAL(m, n, b) \
  XMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
%define CORRECTION(m, n, b) \
  XMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
%define SCALE(m, n, b) \
  XMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)

; Arguments after collect_args:
; r10 = JCOEFPTR coef_block
; r11 = DCTELEM *divisors
; r12 = DCTELEM *workspace
;
; Working registers:
; rsi = workspace read cursor
; rdx = divisors cursor (all three tables are addressed relative to it)
; rdi = coef_block write cursor
; rax = remaining loop passes
; xmm0-xmm3 = values being quantized, xmm4-xmm7 = their sign masks

    align       32
    GLOBAL_FUNCTION(jsimd_quantize_sse2)

EXTN(jsimd_quantize_sse2):
    push        rbp
    ; rax is overwritten below before this function ever reads it, so this
    ; copy of rsp is presumably there for collect_args -- TODO confirm.
    mov         rax, rsp
    mov         rbp, rsp
    collect_args 3

    mov         rsi, r12
    mov         rdx, r11
    mov         rdi, r10
    mov         rax, DCTSIZE2/32
.quantloop:
    ; Load four registers of coefficients and keep a second copy of each.
    movdqa      xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_DCTELEM)]
    movdqa      xmm5, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_DCTELEM)]
    movdqa      xmm6, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_DCTELEM)]
    movdqa      xmm7, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_DCTELEM)]
    movdqa      xmm0, xmm4
    movdqa      xmm1, xmm5
    movdqa      xmm2, xmm6
    movdqa      xmm3, xmm7
    ; Turn xmm4-xmm7 into per-word sign masks (WORD_BIT is presumably 16,
    ; giving a shift count of 15 -- TODO confirm).
    psraw       xmm4, (WORD_BIT-1)
    psraw       xmm5, (WORD_BIT-1)
    psraw       xmm6, (WORD_BIT-1)
    psraw       xmm7, (WORD_BIT-1)
    ; (x ^ mask) - mask negates exactly the words whose mask is all 1's.
    pxor        xmm0, xmm4
    pxor        xmm1, xmm5
    pxor        xmm2, xmm6
    pxor        xmm3, xmm7
    psubw       xmm0, xmm4              ; if (xmm0 < 0) xmm0 = -xmm0;
    psubw       xmm1, xmm5              ; if (xmm1 < 0) xmm1 = -xmm1;
    psubw       xmm2, xmm6              ; if (xmm2 < 0) xmm2 = -xmm2;
    psubw       xmm3, xmm7              ; if (xmm3 < 0) xmm3 = -xmm3;

    ; Divide by the quantizer using two unsigned high-half multiplies.
    paddw       xmm0, XMMWORD [CORRECTION(0,0,rdx)]  ; correction + roundfactor
    paddw       xmm1, XMMWORD [CORRECTION(1,0,rdx)]
    paddw       xmm2, XMMWORD [CORRECTION(2,0,rdx)]
    paddw       xmm3, XMMWORD [CORRECTION(3,0,rdx)]
    pmulhuw     xmm0, XMMWORD [RECIPROCAL(0,0,rdx)]  ; reciprocal
    pmulhuw     xmm1, XMMWORD [RECIPROCAL(1,0,rdx)]
    pmulhuw     xmm2, XMMWORD [RECIPROCAL(2,0,rdx)]
    pmulhuw     xmm3, XMMWORD [RECIPROCAL(3,0,rdx)]
    pmulhuw     xmm0, XMMWORD [SCALE(0,0,rdx)]       ; scale
    pmulhuw     xmm1, XMMWORD [SCALE(1,0,rdx)]
    pmulhuw     xmm2, XMMWORD [SCALE(2,0,rdx)]
    pmulhuw     xmm3, XMMWORD [SCALE(3,0,rdx)]

    ; Re-apply the original signs with the same xor/subtract pair.
    pxor        xmm0, xmm4
    pxor        xmm1, xmm5
    pxor        xmm2, xmm6
    pxor        xmm3, xmm7
    psubw       xmm0, xmm4
    psubw       xmm1, xmm5
    psubw       xmm2, xmm6
    psubw       xmm3, xmm7
    ; NOTE(review): these stores index coef_block with SIZEOF_DCTELEM while
    ; the cursor below advances by SIZEOF_JCOEF; the two are presumably
    ; equal -- confirm in the include files.
    movdqa      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
    movdqa      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
    movdqa      XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
    movdqa      XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3

    add         rsi, byte 32*SIZEOF_DCTELEM     ; next 32 input words
    add         rdx, byte 32*SIZEOF_DCTELEM     ; next 32 entries of each table
    add         rdi, byte 32*SIZEOF_JCOEF       ; next 32 output coefficients
    dec         rax
    jnz         near .quantloop

    uncollect_args 3
    pop         rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32
