1;
2; jquantf.asm - sample data conversion and quantization (64-bit SSE & SSE2)
3;
4; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5; Copyright (C) 2009, D. R. Commander.
6;
7; Based on the x86 SIMD extension for IJG JPEG library
8; Copyright (C) 1999-2006, MIYASAKA Masaru.
9; For conditions of distribution and use, see copyright notice in jsimdext.inc
10;
11; This file should be assembled with NASM (Netwide Assembler),
12; can *not* be assembled with Microsoft's MASM or any compatible
13; assembler (including Borland's Turbo Assembler).
14; NASM is available from http://nasm.sourceforge.net/ or
15; http://sourceforge.net/project/showfiles.php?group_id=6208
16;
17; [TAB8]
18
19%include "jsimdext.inc"
20%include "jdct.inc"
21
; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    64
;
; Load data into workspace, applying unsigned->signed conversion
;
; GLOBAL(void)
; jsimd_convsamp_float_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
;                            FAST_FLOAT *workspace);
;
; Each loop pass handles two sample rows; the loop runs DCTSIZE/2 times.
; From every row, 8 bytes starting at column start_col are read, 0x80 is
; subtracted from each byte (wrapping), the result is sign-extended to a
; dword, converted to single-precision float and written to the workspace.
; The stores use movaps, which faults on a misaligned address, so the
; workspace is presumably 16-byte aligned by the caller.
;
; Arguments as seen after collect_args:
;   r10 = JSAMPARRAY sample_data
;   r11 = JDIMENSION start_col
;   r12 = FAST_FLOAT *workspace
;
; Register roles inside the loop:
;   rsi = cursor into the row-pointer array   rdi = workspace cursor
;   rax = start_col (zero-extended)           rcx = row pairs left
;   rbx/rdx = sample pointers of the two rows being converted
;   xmm7 = 0x80 replicated into every byte
;

        align   16
        global  EXTN(jsimd_convsamp_float_sse2)

EXTN(jsimd_convsamp_float_sse2):
        push    rbp
        mov     rbp, rsp
        mov     rax, rsp                ; rax is overwritten with start_col
                                        ;  below; presumably it is consumed
                                        ;  by collect_args (jsimdext.inc)
        collect_args
        push    rbx                     ; used as a row pointer in the loop

        mov     rdi, r12                ; rdi = workspace
        mov     rsi, r10                ; rsi = sample_data
        mov     eax, r11d               ; rax = start_col; the 32-bit move
                                        ;  clears the upper half of rax
        mov     rcx, DCTSIZE/2          ; two rows are done per pass

        ; Build the per-byte bias without touching memory:
        ;   all-ones words -> 0xFF80 words -> saturating pack -> 0x80 bytes
        pcmpeqw  xmm7, xmm7
        psllw    xmm7, 7
        packsswb xmm7, xmm7             ; xmm7 = PB_CENTERJSAMPLE (0x808080..)

.convloop:
        ; Fetch both row pointers and both 8-byte sample groups up front.
        mov     rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
        mov     rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
        movq    xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]
        movq    xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]

        ; ---- first row: samples 0..7 ----
        psubb     xmm0, xmm7                    ; xmm0=(01234567), re-centered
        punpcklbw xmm0, xmm0                    ; xmm0=(*0*1*2*3*4*5*6*7)
        ; The low word of each xmm2 dword is whatever xmm2 held before; only
        ; the top byte of each dword survives the arithmetic shift below.
        punpcklwd xmm2, xmm0                    ; xmm2=(***0***1***2***3)
        punpckhwd xmm0, xmm0                    ; xmm0=(***4***5***6***7)
        psrad     xmm2, (DWORD_BIT-BYTE_BIT)    ; sign-extended dwords (0123)
        psrad     xmm0, (DWORD_BIT-BYTE_BIT)    ; sign-extended dwords (4567)
        cvtdq2ps  xmm2, xmm2                    ; xmm2=(0123) as floats
        cvtdq2ps  xmm0, xmm0                    ; xmm0=(4567) as floats

        ; ---- second row: samples 8..F ----
        psubb     xmm1, xmm7                    ; xmm1=(89ABCDEF), re-centered
        punpcklbw xmm1, xmm1                    ; xmm1=(*8*9*A*B*C*D*E*F)
        punpcklwd xmm3, xmm1                    ; xmm3=(***8***9***A***B)
        punpckhwd xmm1, xmm1                    ; xmm1=(***C***D***E***F)
        psrad     xmm3, (DWORD_BIT-BYTE_BIT)    ; sign-extended dwords (89AB)
        psrad     xmm1, (DWORD_BIT-BYTE_BIT)    ; sign-extended dwords (CDEF)
        cvtdq2ps  xmm3, xmm3                    ; xmm3=(89AB) as floats
        cvtdq2ps  xmm1, xmm1                    ; xmm1=(CDEF) as floats

        ; Write the sixteen floats out as four aligned vectors.
        movaps  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
        movaps  XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
        movaps  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
        movaps  XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1

        ; Step past the two rows just handled.
        add     rdi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
        add     rsi, byte 2*SIZEOF_JSAMPROW
        dec     rcx
        jnz     short .convloop

        pop     rbx
        uncollect_args
        pop     rbp
        ret
96
97
; --------------------------------------------------------------------------
;
; Quantize/descale the coefficients, and store into coef_block
;
; GLOBAL(void)
; jsimd_quantize_float_sse2 (JCOEFPTR coef_block, FAST_FLOAT *divisors,
;                         FAST_FLOAT *workspace);
;
; Each loop pass multiplies 16 workspace floats by the matching 16 entries
; of the divisors table (a multiply, so the table presumably holds
; reciprocals -- confirm against the code that fills it), converts the
; products to 32-bit integers with cvtps2dq (rounding follows the current
; MXCSR mode), narrows them to 16 bits with signed saturation and stores
; them to coef_block.  The loop runs DCTSIZE2/16 times.  All three buffers
; are accessed with aligned instructions (movaps / mulps with a memory
; operand / movdqa), so they are presumably 16-byte aligned.
;
; Arguments as seen after collect_args:
;   r10 = JCOEFPTR coef_block
;   r11 = FAST_FLOAT *divisors
;   r12 = FAST_FLOAT *workspace
;
; Register roles inside the loop:
;   rsi = workspace cursor   rdx = divisors cursor   rdi = coef_block cursor
;   rax = passes left
;

        align   16
        global  EXTN(jsimd_quantize_float_sse2)

EXTN(jsimd_quantize_float_sse2):
        push    rbp
        mov     rbp, rsp
        mov     rax, rsp                ; rax is overwritten with the pass
                                        ;  count below; presumably it is
                                        ;  consumed by collect_args
        collect_args

        mov     rdi, r10                ; rdi = coef_block
        mov     rdx, r11                ; rdx = divisors
        mov     rsi, r12                ; rsi = workspace
        mov     rax, DCTSIZE2/16        ; 16 coefficients per pass

.quantloop:
        ; Scale: four vectors of four floats, each times its divisor entry.
        movaps  xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
        mulps   xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
        movaps  xmm1, XMMWORD [XMMBLOCK(0,1,rsi,SIZEOF_FAST_FLOAT)]
        mulps   xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
        movaps  xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
        mulps   xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
        movaps  xmm3, XMMWORD [XMMBLOCK(1,1,rsi,SIZEOF_FAST_FLOAT)]
        mulps   xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]

        ; Float -> int32, then two int32 vectors -> one int16 vector.
        cvtps2dq xmm0, xmm0
        cvtps2dq xmm1, xmm1
        packssdw xmm0, xmm1             ; xmm0 = 8 coefficients (first group)

        cvtps2dq xmm2, xmm2
        cvtps2dq xmm3, xmm3
        packssdw xmm2, xmm3             ; xmm2 = 8 coefficients (second group)

        movdqa  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_JCOEF)], xmm0
        movdqa  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_JCOEF)], xmm2

        ; Advance all three cursors by the 16 elements just processed.
        add     rdi, byte 16*SIZEOF_JCOEF
        add     rdx, byte 16*SIZEOF_FAST_FLOAT
        add     rsi, byte 16*SIZEOF_FAST_FLOAT
        dec     rax
        jnz     short .quantloop

        uncollect_args
        pop     rbp
        ret

; The OS X linker does not honor the segment alignment request unless the
; segment itself ends with an explicit alignment directive, so emit one here.
        align   16
158