1;
2; jquanti.asm - sample data conversion and quantization (SSE2)
3;
4; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5; Copyright (C) 2016, D. R. Commander.
6;
7; Based on the x86 SIMD extension for IJG JPEG library
8; Copyright (C) 1999-2006, MIYASAKA Masaru.
9; For conditions of distribution and use, see copyright notice in jsimdext.inc
10;
11; This file should be assembled with NASM (Netwide Assembler),
12; can *not* be assembled with Microsoft's MASM or any compatible
13; assembler (including Borland's Turbo Assembler).
14; NASM is available from http://nasm.sourceforge.net/ or
15; http://sourceforge.net/project/showfiles.php?group_id=6208
16;
17; [TAB8]
18
19%include "jsimdext.inc"
20%include "jdct.inc"
21
22; --------------------------------------------------------------------------
23    SECTION     SEG_TEXT
24    BITS        32
;
; Load data into workspace, applying unsigned->signed conversion
;
; GLOBAL(void)
; jsimd_convsamp_sse2(JSAMPARRAY sample_data, JDIMENSION start_col,
;                     DCTELEM *workspace);
;
; Target: 32-bit x86 with SSE2; the three arguments are read from the stack
; relative to ebp once the frame has been set up.
;
; Every pass of the loop consumes four row pointers from sample_data.  From
; each row, 8 bytes starting at column start_col are fetched, each byte is
; zero-extended to a 16-bit word, 0xFF80 (-128) is added to every word, and
; the resulting 16-byte vector is written to the workspace.  The loop runs
; DCTSIZE/4 times.
;
; Register roles:
;   esi  = cursor into the array of row pointers
;   eax  = start_col (index applied to every row)
;   edi  = cursor into the workspace; written with movdqa, so it has to be
;          16-byte aligned
;   ecx  = number of 4-row groups still to process
;   ebx, edx = scratch row pointers
;   xmm6 = zeros (upper half for the byte->word unpack)
;   xmm7 = 0xFF80 in every word (the -128 bias)
;
; ebx, esi, edi and ebp are saved and restored.  eax, ecx, edx and
; xmm0-xmm3/xmm6/xmm7 are overwritten; the original author noted that ecx and
; edx need not be preserved.
;

%define sample_data  ebp + 8            ; JSAMPARRAY sample_data
%define start_col    ebp + 12           ; JDIMENSION start_col
%define workspace    ebp + 16           ; DCTELEM *workspace

    align       32
    GLOBAL_FUNCTION(jsimd_convsamp_sse2)

EXTN(jsimd_convsamp_sse2):
    push        ebp
    mov         ebp, esp
    push        ebx
    push        esi
    push        edi

    ; Build the loop-invariant vectors without touching memory.
    pcmpeqw     xmm7, xmm7              ; xmm7=(all 1's)
    psllw       xmm7, 7                 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}
    pxor        xmm6, xmm6              ; xmm6=(all 0's)

    ; Pick up the arguments and the iteration count.
    mov         edi, POINTER [workspace]       ; (DCTELEM *)
    mov         eax, JDIMENSION [start_col]
    mov         esi, JSAMPARRAY [sample_data]  ; (JSAMPROW *)
    mov         ecx, DCTSIZE/4                 ; four rows per iteration
    alignx      16, 7
.rowgroup:
    ; Fetch 8 bytes from each of the next four rows.  All loads are issued
    ; before any store of this iteration.
    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]      ; row 0 (JSAMPLE *)
    movq        xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]  ; xmm0=(01234567)
    mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]      ; row 1 (JSAMPLE *)
    movq        xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]  ; xmm1=(89ABCDEF)
    mov         ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]      ; row 2 (JSAMPLE *)
    movq        xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]  ; xmm2=(GHIJKLMN)
    mov         edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]      ; row 3 (JSAMPLE *)
    movq        xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]  ; xmm3=(OPQRSTUV)

    ; Widen bytes to words, then shift the range down by 128.
    punpcklbw   xmm0, xmm6              ; xmm0=(01234567) as words
    paddw       xmm0, xmm7              ; xmm0 -= 128
    punpcklbw   xmm1, xmm6              ; xmm1=(89ABCDEF) as words
    paddw       xmm1, xmm7              ; xmm1 -= 128
    punpcklbw   xmm2, xmm6              ; xmm2=(GHIJKLMN) as words
    paddw       xmm2, xmm7              ; xmm2 -= 128
    punpcklbw   xmm3, xmm6              ; xmm3=(OPQRSTUV) as words
    paddw       xmm3, xmm7              ; xmm3 -= 128

    ; Write the four converted rows to the workspace.
    movdqa      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
    movdqa      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
    movdqa      XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
    movdqa      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3

    ; Step both cursors past the four rows just handled.
    add         esi, byte 4*SIZEOF_JSAMPROW
    add         edi, byte 4*DCTSIZE*SIZEOF_DCTELEM
    dec         ecx
    jnz         short .rowgroup

    pop         edi
    pop         esi
    pop         ebx
    pop         ebp
    ret
97
; --------------------------------------------------------------------------
;
; Quantize/descale the coefficients, and store into coef_block
;
; This implementation is based on an algorithm described in
;   "How to optimize for the Pentium family of microprocessors"
;   (http://www.agner.org/assem/).
;
; GLOBAL(void)
; jsimd_quantize_sse2(JCOEFPTR coef_block, DCTELEM *divisors,
;                     DCTELEM *workspace);
;
; Target: 32-bit x86 with SSE2; the three arguments are read from the stack
; relative to ebp once the frame has been set up.
;
; For every 16-bit workspace element x the code computes, with unsigned
; 16x16->high-16 multiplies (pmulhuw):
;
;   s      = x >> (WORD_BIT-1)              (arithmetic shift: sign mask)
;   a      = (x ^ s) - s                    (absolute value)
;   q      = hi16(hi16((a + correction) * reciprocal) * scale)
;   result = (q ^ s) - s                    (sign put back)
;
; Four 16-byte rows are handled per loop pass and the loop runs DCTSIZE2/32
; times.  Every vector access uses movdqa or an SSE memory operand, so
; workspace, divisors and coef_block all have to be 16-byte aligned.
;
; Register roles:
;   esi = workspace cursor, edx = divisors cursor, edi = coef_block cursor
;   eax = loop passes remaining
;   xmm0-xmm3 = values being quantized, xmm4-xmm7 = their sign masks
;
; esi, edi and ebp are saved and restored.  eax, edx and xmm0-xmm7 are
; overwritten (the original author noted that edx need not be preserved);
; ebx and ecx are not used.
;

; The divisors argument is addressed as three tables, selected by adding
; DCTSIZE * 0, 1 or 2 to the first XMMBLOCK argument.  XMMBLOCK comes from an
; include file; that argument is presumably a row index -- confirm there.
%define RECIPROCAL(m, n, b) XMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
%define CORRECTION(m, n, b) XMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
%define SCALE(m, n, b)      XMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)

%define coef_block  ebp + 8             ; JCOEFPTR coef_block
%define divisors    ebp + 12            ; DCTELEM *divisors
%define workspace   ebp + 16            ; DCTELEM *workspace

    align       32
    GLOBAL_FUNCTION(jsimd_quantize_sse2)

EXTN(jsimd_quantize_sse2):
    push        ebp
    mov         ebp, esp
    push        esi
    push        edi

    mov         edi, JCOEFPTR [coef_block]
    mov         edx, POINTER [divisors]
    mov         esi, POINTER [workspace]
    mov         eax, DCTSIZE2/32        ; 32 coefficients per pass
    alignx      16, 7
.blockloop:
    ; ---- rows 0 and 1 ----------------------------------------------------
    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
    movdqa      xmm1, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_DCTELEM)]
    movdqa      xmm4, xmm0
    movdqa      xmm5, xmm1
    psraw       xmm4, (WORD_BIT-1)      ; xmm4 = sign mask of row 0
    psraw       xmm5, (WORD_BIT-1)      ; xmm5 = sign mask of row 1
    pxor        xmm0, xmm4
    pxor        xmm1, xmm5
    psubw       xmm0, xmm4              ; xmm0 = |row 0|
    psubw       xmm1, xmm5              ; xmm1 = |row 1|

    paddw       xmm0, XMMWORD [CORRECTION(0,0,edx)]  ; correction + roundfactor
    paddw       xmm1, XMMWORD [CORRECTION(1,0,edx)]
    pmulhuw     xmm0, XMMWORD [RECIPROCAL(0,0,edx)]  ; reciprocal
    pmulhuw     xmm1, XMMWORD [RECIPROCAL(1,0,edx)]
    pmulhuw     xmm0, XMMWORD [SCALE(0,0,edx)]       ; scale
    pmulhuw     xmm1, XMMWORD [SCALE(1,0,edx)]

    pxor        xmm0, xmm4
    pxor        xmm1, xmm5
    psubw       xmm0, xmm4              ; original sign restored on row 0
    psubw       xmm1, xmm5              ; original sign restored on row 1

    ; ---- rows 2 and 3 ----------------------------------------------------
    movdqa      xmm2, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
    movdqa      xmm3, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_DCTELEM)]
    movdqa      xmm6, xmm2
    movdqa      xmm7, xmm3
    psraw       xmm6, (WORD_BIT-1)      ; xmm6 = sign mask of row 2
    psraw       xmm7, (WORD_BIT-1)      ; xmm7 = sign mask of row 3
    pxor        xmm2, xmm6
    pxor        xmm3, xmm7
    psubw       xmm2, xmm6              ; xmm2 = |row 2|
    psubw       xmm3, xmm7              ; xmm3 = |row 3|

    paddw       xmm2, XMMWORD [CORRECTION(2,0,edx)]  ; correction + roundfactor
    paddw       xmm3, XMMWORD [CORRECTION(3,0,edx)]
    pmulhuw     xmm2, XMMWORD [RECIPROCAL(2,0,edx)]  ; reciprocal
    pmulhuw     xmm3, XMMWORD [RECIPROCAL(3,0,edx)]
    pmulhuw     xmm2, XMMWORD [SCALE(2,0,edx)]       ; scale
    pmulhuw     xmm3, XMMWORD [SCALE(3,0,edx)]

    pxor        xmm2, xmm6
    pxor        xmm3, xmm7
    psubw       xmm2, xmm6              ; original sign restored on row 2
    psubw       xmm3, xmm7              ; original sign restored on row 3

    ; ---- store: only after every load of this pass has been issued -------
    movdqa      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
    movdqa      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
    movdqa      XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
    movdqa      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3

    ; Advance all three cursors by the 32 elements just processed.
    add         esi, byte 32*SIZEOF_DCTELEM
    add         edx, byte 32*SIZEOF_DCTELEM
    add         edi, byte 32*SIZEOF_JCOEF
    dec         eax
    jnz         near .blockloop

    pop         edi
    pop         esi
    pop         ebp
    ret
200
201; For some reason, the OS X linker does not honor the request to align the
202; segment unless we do this.
203    align       32
204