;
; jquanti.asm - sample data conversion and quantization (64-bit SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

; Project include files.  The macros and constants used below (EXTN,
; collect_args/uncollect_args, XMMBLOCK, SEG_TEXT, DCTSIZE, SIZEOF_*, ...)
; are not defined in this file and are expected to come from these includes.
%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    64
;
; Load data into workspace, applying unsigned->signed conversion
;
; GLOBAL(void)
; jsimd_convsamp_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
;                      DCTELEM *workspace);
;
; Reads DCTSIZE sample rows (DCTSIZE/4 passes, four rows per pass), 8 bytes
; per row starting at column start_col.  Every byte is zero-extended to a
; 16-bit word, 0xFF80 (-128) is added to it, and the resulting words are
; stored to workspace, one XMMWORD per row.
;
; Arguments, as available after collect_args:
;   r10 = JSAMPARRAY sample_data
;   r11 = JDIMENSION start_col
;   r12 = DCTELEM *workspace
;
; Register use:
;   rsi      = cursor into sample_data (array of row pointers)
;   rax      = start_col, zero-extended from r11d
;   rdi      = cursor into workspace
;   rcx      = remaining passes
;   rbx, rdx = row pointers for the rows being loaded (rbx is saved and
;              restored by this routine)
;   xmm0-3   = the four rows being converted
;   xmm6     = all zero bits (source of the high bytes when widening)
;   xmm7     = 0xFF80 in every word
;
; workspace is written with movdqa, so it must be 16-byte aligned.
;
; NOTE(review): xmm6/xmm7 are modified here and are not saved by the code
; visible in this routine; if they need preserving on some ABI, that would
; have to happen inside collect_args/uncollect_args -- confirm in
; jsimdext.inc.
;

        align   16
        global  EXTN(jsimd_convsamp_sse2)

EXTN(jsimd_convsamp_sse2):
        push    rbp
        mov     rax, rsp                ; rax = rsp after the push; NOTE(review):
                                        ; presumably read by collect_args on
                                        ; some ABIs -- confirm in jsimdext.inc
        mov     rbp, rsp
        collect_args                    ; arguments are in r10-r12 from here on
        push    rbx                     ; rbx is used as a row pointer below

        pxor    xmm6, xmm6              ; xmm6=(all 0's)
        pcmpeqw xmm7, xmm7              ; xmm7=(all 1's)
        psllw   xmm7, 7                 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}

        mov     rsi, r10                ; rsi = sample_data
        mov     eax, r11d               ; rax = start_col (upper half cleared)
        mov     rdi, r12                ; rdi = workspace
        mov     rcx, DCTSIZE/4          ; four rows are handled per pass
.convloop:
        mov     rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
        mov     rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]   ; (JSAMPLE *)

        movq    xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]       ; xmm0=(01234567)
        movq    xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]       ; xmm1=(89ABCDEF)

        mov     rbx, JSAMPROW [rsi+2*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
        mov     rdx, JSAMPROW [rsi+3*SIZEOF_JSAMPROW]   ; (JSAMPLE *)

        movq    xmm2, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]       ; xmm2=(GHIJKLMN)
        movq    xmm3, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]       ; xmm3=(OPQRSTUV)

        ; Widen bytes to words (high byte taken from xmm6 = 0), then add
        ; 0xFF80 to every word.
        punpcklbw xmm0, xmm6            ; xmm0=(01234567)
        punpcklbw xmm1, xmm6            ; xmm1=(89ABCDEF)
        paddw     xmm0, xmm7
        paddw     xmm1, xmm7
        punpcklbw xmm2, xmm6            ; xmm2=(GHIJKLMN)
        punpcklbw xmm3, xmm6            ; xmm3=(OPQRSTUV)
        paddw     xmm2, xmm7
        paddw     xmm3, xmm7

        movdqa  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
        movdqa  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
        movdqa  XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
        movdqa  XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3

        add     rsi, byte 4*SIZEOF_JSAMPROW             ; next four row pointers
        add     rdi, byte 4*DCTSIZE*SIZEOF_DCTELEM      ; next four workspace rows
        dec     rcx
        jnz     short .convloop

        pop     rbx
        uncollect_args
        pop     rbp
        ret

; --------------------------------------------------------------------------
;
; Quantize/descale the coefficients, and store into coef_block
;
; This implementation is based on an algorithm described in
;   "How to optimize for the Pentium family of microprocessors"
;   (http://www.agner.org/assem/).
;
; GLOBAL(void)
; jsimd_quantize_sse2 (JCOEFPTR coef_block, DCTELEM *divisors,
;                      DCTELEM *workspace);
;
; For every 16-bit element of workspace:
;   1. split it into a sign mask (arithmetic shift by WORD_BIT-1) and its
;      absolute value ((x ^ mask) - mask),
;   2. add the matching CORRECTION entry (correction + roundfactor),
;   3. keep the high word of the unsigned product with the RECIPROCAL entry,
;   4. keep the high word of the unsigned product with the SCALE entry,
;   5. reapply the sign ((x ^ mask) - mask) and store to coef_block.
;
; The loop runs DCTSIZE2/32 times; each pass handles four XMMWORDs and
; advances all three pointers by 32 elements.
;
; The divisors table is addressed as three consecutive sub-tables, each
; DCTSIZE rows apart: reciprocals, corrections, scales (see the macros below).
;
; workspace, divisors and coef_block are accessed with movdqa or as 128-bit
; SSE memory operands, so all three must be 16-byte aligned.
;
; NOTE(review): xmm6/xmm7 are modified here and are not saved by the code
; visible in this routine; if they need preserving on some ABI, that would
; have to happen inside collect_args/uncollect_args -- confirm in
; jsimdext.inc.
;

%define RECIPROCAL(m,n,b) XMMBLOCK(DCTSIZE*0+(m),(n),(b),SIZEOF_DCTELEM)
%define CORRECTION(m,n,b) XMMBLOCK(DCTSIZE*1+(m),(n),(b),SIZEOF_DCTELEM)
%define SCALE(m,n,b)      XMMBLOCK(DCTSIZE*2+(m),(n),(b),SIZEOF_DCTELEM)

; Arguments, as available after collect_args:
;   r10 = JCOEFPTR coef_block
;   r11 = DCTELEM *divisors
;   r12 = DCTELEM *workspace
;
; Register use:
;   rsi    = cursor into workspace (input)
;   rdx    = cursor into divisors
;   rdi    = cursor into coef_block (output)
;   rax    = remaining passes
;   xmm0-3 = values being quantized
;   xmm4-7 = sign masks of the corresponding inputs

        align   16
        global  EXTN(jsimd_quantize_sse2)

EXTN(jsimd_quantize_sse2):
        push    rbp
        mov     rax, rsp                ; rax = rsp after the push; NOTE(review):
                                        ; presumably read by collect_args on
                                        ; some ABIs -- confirm in jsimdext.inc
        mov     rbp, rsp
        collect_args                    ; arguments are in r10-r12 from here on

        mov     rsi, r12                ; rsi = workspace
        mov     rdx, r11                ; rdx = divisors
        mov     rdi, r10                ; rdi = coef_block
        mov     rax, DCTSIZE2/32        ; 32 elements are handled per pass
.quantloop:
        movdqa  xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_DCTELEM)]
        movdqa  xmm5, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_DCTELEM)]
        movdqa  xmm6, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_DCTELEM)]
        movdqa  xmm7, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_DCTELEM)]
        movdqa  xmm0, xmm4
        movdqa  xmm1, xmm5
        movdqa  xmm2, xmm6
        movdqa  xmm3, xmm7
        psraw   xmm4, (WORD_BIT-1)      ; xmm4-7 = per-word sign masks
        psraw   xmm5, (WORD_BIT-1)
        psraw   xmm6, (WORD_BIT-1)
        psraw   xmm7, (WORD_BIT-1)
        pxor    xmm0, xmm4
        pxor    xmm1, xmm5
        pxor    xmm2, xmm6
        pxor    xmm3, xmm7
        psubw   xmm0, xmm4              ; if (xmm0 < 0) xmm0 = -xmm0;
        psubw   xmm1, xmm5              ; if (xmm1 < 0) xmm1 = -xmm1;
        psubw   xmm2, xmm6              ; if (xmm2 < 0) xmm2 = -xmm2;
        psubw   xmm3, xmm7              ; if (xmm3 < 0) xmm3 = -xmm3;

        paddw   xmm0, XMMWORD [CORRECTION(0,0,rdx)]     ; correction + roundfactor
        paddw   xmm1, XMMWORD [CORRECTION(1,0,rdx)]
        paddw   xmm2, XMMWORD [CORRECTION(2,0,rdx)]
        paddw   xmm3, XMMWORD [CORRECTION(3,0,rdx)]
        pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,rdx)]     ; reciprocal
        pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,rdx)]
        pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,rdx)]
        pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,rdx)]
        pmulhuw xmm0, XMMWORD [SCALE(0,0,rdx)]          ; scale
        pmulhuw xmm1, XMMWORD [SCALE(1,0,rdx)]
        pmulhuw xmm2, XMMWORD [SCALE(2,0,rdx)]
        pmulhuw xmm3, XMMWORD [SCALE(3,0,rdx)]

        pxor    xmm0, xmm4              ; reapply the original signs:
        pxor    xmm1, xmm5              ; (x ^ mask) - mask negates exactly the
        pxor    xmm2, xmm6              ; words whose mask is all ones
        pxor    xmm3, xmm7
        psubw   xmm0, xmm4
        psubw   xmm1, xmm5
        psubw   xmm2, xmm6
        psubw   xmm3, xmm7
        movdqa  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
        movdqa  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
        movdqa  XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
        movdqa  XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3

        add     rsi, byte 32*SIZEOF_DCTELEM
        add     rdx, byte 32*SIZEOF_DCTELEM
        add     rdi, byte 32*SIZEOF_JCOEF
        dec     rax
        jnz     near .quantloop

        uncollect_args
        pop     rbp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   16
