;
; jquanti.asm - sample data conversion and quantization (SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    32
;
; Load data into workspace, applying unsigned->signed conversion
;
; GLOBAL(void)
; jsimd_convsamp_sse2(JSAMPARRAY sample_data, JDIMENSION start_col,
;                     DCTELEM *workspace);
;
; Arguments are read from the stack through ebp (32-bit frame).
;
; Register roles inside the loop:
;   esi  = cursor into the array of row pointers (advances 4 rows per pass)
;   eax  = start_col, used as the column index into each row
;   edi  = output cursor into workspace
;   ecx  = remaining passes (DCTSIZE/4, four rows handled per pass)
;   ebx, edx = scratch row pointers
;   xmm6 = all zero bits (source of the zero high bytes when widening)
;   xmm7 = 0xFF80 in every word (the bias added to each widened sample)
;
; Saved/restored here: ebp, ebx, esi, edi.  eax, ecx, edx, xmm0-xmm3, xmm6,
; xmm7 and the flags are modified.  The stores use movdqa, so workspace has
; to be 16-byte aligned.
;

%define sample_data     ebp + 8         ; JSAMPARRAY sample_data
%define start_col       ebp + 12        ; JDIMENSION start_col
%define workspace       ebp + 16        ; DCTELEM *workspace

        align   32
        GLOBAL_FUNCTION(jsimd_convsamp_sse2)

EXTN(jsimd_convsamp_sse2):
        push    ebp
        mov     ebp, esp
        push    ebx
        ; ecx and edx are used as scratch but need not be preserved
        push    esi
        push    edi

        ; Fetch the three arguments and set the pass counter.
        mov     esi, JSAMPARRAY [sample_data]           ; (JSAMPROW *)
        mov     eax, JDIMENSION [start_col]
        mov     edi, POINTER [workspace]                ; (DCTELEM *)
        mov     ecx, DCTSIZE/4

        ; Build the two loop-invariant vectors.
        pcmpeqw xmm7, xmm7                              ; every bit set
        psllw   xmm7, 7                                 ; each word = 0xFF80
        pxor    xmm6, xmm6                              ; each word = 0

        alignx  16, 7
.next_rows:
        ; ---- rows 0 and 1 of this pass ----
        mov     ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
        mov     edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
        movq    xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]   ; 8 bytes of row 0
        movq    xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]   ; 8 bytes of row 1
        punpcklbw xmm0, xmm6                            ; bytes -> zero-extended words
        punpcklbw xmm1, xmm6
        paddw   xmm0, xmm7                              ; add 0xFF80 (i.e. subtract 128)
        paddw   xmm1, xmm7

        ; ---- rows 2 and 3 of this pass (ebx/edx are reused) ----
        mov     ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
        mov     edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
        movq    xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]   ; 8 bytes of row 2
        movq    xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]   ; 8 bytes of row 3
        punpcklbw xmm2, xmm6
        punpcklbw xmm3, xmm6
        paddw   xmm2, xmm7
        paddw   xmm3, xmm7

        ; ---- write the four converted rows (all loads are done by now) ----
        movdqa  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
        movdqa  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
        movdqa  XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
        movdqa  XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3

        ; Step both cursors by four rows; dec must stay last so that jnz
        ; tests the counter and not the result of an add.
        add     esi, byte 4*SIZEOF_JSAMPROW
        add     edi, byte 4*DCTSIZE*SIZEOF_DCTELEM
        dec     ecx
        jnz     short .next_rows

        pop     edi
        pop     esi
        ; edx and ecx were not saved, so there is nothing to restore for them
        pop     ebx
        pop     ebp
        ret

; --------------------------------------------------------------------------
;
; Quantize/descale the coefficients, and store into coef_block
;
; This implementation is based on an algorithm described in
;   "How to optimize for the Pentium family of microprocessors"
;   (http://www.agner.org/assem/).
;
; GLOBAL(void)
; jsimd_quantize_sse2(JCOEFPTR coef_block, DCTELEM *divisors,
;                     DCTELEM *workspace);
;
; Arguments are read from the stack through ebp (32-bit frame).
;
; Each pass of the loop handles 32 elements (four xmm registers of words)
; and the loop runs DCTSIZE2/32 times.  Per element the sequence is:
;   1. take the absolute value, remembering the sign as a mask
;   2. add the CORRECTION entry            (correction + roundfactor)
;   3. unsigned high-half multiply by the RECIPROCAL entry
;   4. unsigned high-half multiply by the SCALE entry
;   5. re-apply the remembered sign and store
;
; Register roles inside the loop:
;   esi  = input cursor (workspace)
;   edx  = cursor into the divisors tables
;   edi  = output cursor (coef_block)
;   eax  = remaining passes
;   xmm0-xmm3 = values being processed; xmm4-xmm7 = their sign masks
;
; Saved/restored here: ebp, esi, edi.  eax, edx, xmm0-xmm7 and the flags are
; modified.  Every memory operand below is a 16-byte SSE2 access (movdqa or
; a legacy-SSE memory source), so all three pointers must be 16-byte
; aligned.
;

; The three tables inside divisors are selected by a row offset of
; 0, DCTSIZE and 2*DCTSIZE passed as the first XMMBLOCK argument.
; NOTE(review): XMMBLOCK is defined in an include file not visible here;
; presumably its arguments are (row, xmmword-in-row, base, element size).
%define RECIPROCAL(m, n, b) \
  XMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
%define CORRECTION(m, n, b) \
  XMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
%define SCALE(m, n, b) \
  XMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)

%define coef_block      ebp + 8         ; JCOEFPTR coef_block
%define divisors        ebp + 12        ; DCTELEM *divisors
%define workspace       ebp + 16        ; DCTELEM *workspace

        align   32
        GLOBAL_FUNCTION(jsimd_quantize_sse2)

EXTN(jsimd_quantize_sse2):
        push    ebp
        mov     ebp, esp
        ; ebx and ecx are unused; edx need not be preserved
        push    esi
        push    edi

        mov     esi, POINTER [workspace]
        mov     edx, POINTER [divisors]
        mov     edi, JCOEFPTR [coef_block]
        mov     eax, DCTSIZE2/32

        alignx  16, 7
.next_chunk:
        ; ---- load four rows; xmm0-3 get a working copy of xmm4-7 ----
        movdqa  xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
        movdqa  xmm0, xmm4
        movdqa  xmm5, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_DCTELEM)]
        movdqa  xmm1, xmm5
        movdqa  xmm6, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
        movdqa  xmm2, xmm6
        movdqa  xmm7, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_DCTELEM)]
        movdqa  xmm3, xmm7

        ; ---- turn xmm4-7 into sign masks (arithmetic shift fills each
        ;      word with copies of its top bit) ----
        psraw   xmm4, (WORD_BIT-1)
        psraw   xmm5, (WORD_BIT-1)
        psraw   xmm6, (WORD_BIT-1)
        psraw   xmm7, (WORD_BIT-1)

        ; ---- absolute value: (x ^ mask) - mask negates only where x < 0 ----
        pxor    xmm0, xmm4
        psubw   xmm0, xmm4
        pxor    xmm1, xmm5
        psubw   xmm1, xmm5
        pxor    xmm2, xmm6
        psubw   xmm2, xmm6
        pxor    xmm3, xmm7
        psubw   xmm3, xmm7

        ; ---- add correction + roundfactor ----
        paddw   xmm0, XMMWORD [CORRECTION(0,0,edx)]
        paddw   xmm1, XMMWORD [CORRECTION(1,0,edx)]
        paddw   xmm2, XMMWORD [CORRECTION(2,0,edx)]
        paddw   xmm3, XMMWORD [CORRECTION(3,0,edx)]

        ; ---- multiply by the reciprocal, keeping the high 16 bits ----
        pmulhuw xmm0, XMMWORD [RECIPROCAL(0,0,edx)]
        pmulhuw xmm1, XMMWORD [RECIPROCAL(1,0,edx)]
        pmulhuw xmm2, XMMWORD [RECIPROCAL(2,0,edx)]
        pmulhuw xmm3, XMMWORD [RECIPROCAL(3,0,edx)]

        ; ---- multiply by the scale, keeping the high 16 bits ----
        pmulhuw xmm0, XMMWORD [SCALE(0,0,edx)]
        pmulhuw xmm1, XMMWORD [SCALE(1,0,edx)]
        pmulhuw xmm2, XMMWORD [SCALE(2,0,edx)]
        pmulhuw xmm3, XMMWORD [SCALE(3,0,edx)]

        ; ---- put the original sign back using the saved masks ----
        pxor    xmm0, xmm4
        psubw   xmm0, xmm4
        pxor    xmm1, xmm5
        psubw   xmm1, xmm5
        pxor    xmm2, xmm6
        psubw   xmm2, xmm6
        pxor    xmm3, xmm7
        psubw   xmm3, xmm7

        ; ---- write the four result rows ----
        movdqa  XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
        movdqa  XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
        movdqa  XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
        movdqa  XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3

        ; Advance all three cursors by 32 elements; dec must stay last so
        ; that jnz tests the counter and not the result of an add.
        add     esi, byte 32*SIZEOF_DCTELEM
        add     edx, byte 32*SIZEOF_DCTELEM
        add     edi, byte 32*SIZEOF_JCOEF
        dec     eax
        jnz     near .next_chunk

        pop     edi
        pop     esi
        ; edx, ecx and ebx were not saved, so there is nothing to restore
        pop     ebp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align   32