;
; jquant.asm - sample data conversion and quantization (MMX)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) and
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32
;
; Load data into workspace, applying unsigned->signed conversion
;
; GLOBAL(void)
; jsimd_convsamp_mmx(JSAMPARRAY sample_data, JDIMENSION start_col,
;                    DCTELEM *workspace);
;

%define sample_data  ebp + 8            ; JSAMPARRAY sample_data
%define start_col    ebp + 12           ; JDIMENSION start_col
%define workspace    ebp + 16           ; DCTELEM *workspace

    align       32
    GLOBAL_FUNCTION(jsimd_convsamp_mmx)

EXTN(jsimd_convsamp_mmx):
    push        ebp
    mov         ebp, esp
    push        ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    pxor        mm6, mm6                ; mm6=(all 0's)
    pcmpeqw     mm7, mm7
    psllw       mm7, 7                  ; mm7={0xFF80 0xFF80 0xFF80 0xFF80}

    mov         esi, JSAMPARRAY [sample_data]  ; (JSAMPROW *)
    mov         eax, JDIMENSION [start_col]
    mov         edi, POINTER [workspace]       ; (DCTELEM *)
    mov         ecx, DCTSIZE/4
    alignx      16, 7
.convloop:
    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)

    movq        mm0, MMWORD [ebx+eax*SIZEOF_JSAMPLE]   ; mm0=(01234567)
    movq        mm1, MMWORD [edx+eax*SIZEOF_JSAMPLE]   ; mm1=(89ABCDEF)

    mov         ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    mov         edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)

    movq        mm2, MMWORD [ebx+eax*SIZEOF_JSAMPLE]   ; mm2=(GHIJKLMN)
    movq        mm3, MMWORD [edx+eax*SIZEOF_JSAMPLE]   ; mm3=(OPQRSTUV)

    movq        mm4, mm0
    punpcklbw   mm0, mm6                ; mm0=(0123)
    punpckhbw   mm4, mm6                ; mm4=(4567)
    movq        mm5, mm1
    punpcklbw   mm1, mm6                ; mm1=(89AB)
    punpckhbw   mm5, mm6                ; mm5=(CDEF)

    paddw       mm0, mm7
    paddw       mm4, mm7
    paddw       mm1, mm7
    paddw       mm5, mm7

    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm4
    movq        MMWORD [MMBLOCK(1,0,edi,SIZEOF_DCTELEM)], mm1
    movq        MMWORD [MMBLOCK(1,1,edi,SIZEOF_DCTELEM)], mm5

    movq        mm0, mm2
    punpcklbw   mm2, mm6                ; mm2=(GHIJ)
    punpckhbw   mm0, mm6                ; mm0=(KLMN)
    movq        mm4, mm3
    punpcklbw   mm3, mm6                ; mm3=(OPQR)
    punpckhbw   mm4, mm6                ; mm4=(STUV)

    paddw       mm2, mm7
    paddw       mm0, mm7
    paddw       mm3, mm7
    paddw       mm4, mm7

    movq        MMWORD [MMBLOCK(2,0,edi,SIZEOF_DCTELEM)], mm2
    movq        MMWORD [MMBLOCK(2,1,edi,SIZEOF_DCTELEM)], mm0
    movq        MMWORD [MMBLOCK(3,0,edi,SIZEOF_DCTELEM)], mm3
    movq        MMWORD [MMBLOCK(3,1,edi,SIZEOF_DCTELEM)], mm4

    add         esi, byte 4*SIZEOF_JSAMPROW
    add         edi, byte 4*DCTSIZE*SIZEOF_DCTELEM
    dec         ecx
    jnz         short .convloop

    emms                                ; empty MMX state

    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    pop         ebx
    pop         ebp
    ret
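
; For reference, the conversion above is equivalent to the following scalar
; model (a sketch only; convsamp_model is a hypothetical name, not part of
; this library):
;
; void convsamp_model(JSAMPARRAY sample_data, JDIMENSION start_col,
;                     DCTELEM *workspace)
; {
;   int row, col;
;
;   for (row = 0; row < DCTSIZE; row++)
;     for (col = 0; col < DCTSIZE; col++)
;       /* widen the unsigned 8-bit sample to 16 bits; adding 0xFF80
;        * modulo 2^16 then equals subtracting CENTERJSAMPLE (128) */
;       workspace[row * DCTSIZE + col] =
;         (DCTELEM)sample_data[row][start_col + col] - CENTERJSAMPLE;
; }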

; --------------------------------------------------------------------------
;
; Quantize/descale the coefficients, and store into coef_block
;
; This implementation is based on an algorithm described in
; "How to optimize for the Pentium family of microprocessors"
; (http://www.agner.org/assem/).
;
; GLOBAL(void)
; jsimd_quantize_mmx(JCOEFPTR coef_block, DCTELEM *divisors,
;                    DCTELEM *workspace);
;

%define RECIPROCAL(m, n, b) \
  MMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
%define CORRECTION(m, n, b) \
  MMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
%define SCALE(m, n, b) \
  MMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)
%define SHIFT(m, n, b) \
  MMBLOCK(DCTSIZE * 3 + (m), (n), (b), SIZEOF_DCTELEM)

%define coef_block  ebp + 8             ; JCOEFPTR coef_block
%define divisors    ebp + 12            ; DCTELEM *divisors
%define workspace   ebp + 16            ; DCTELEM *workspace

    align       32
    GLOBAL_FUNCTION(jsimd_quantize_mmx)

EXTN(jsimd_quantize_mmx):
    push        ebp
    mov         ebp, esp
;   push        ebx                     ; unused
;   push        ecx                     ; unused
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    mov         esi, POINTER [workspace]
    mov         edx, POINTER [divisors]
    mov         edi, JCOEFPTR [coef_block]
    mov         ah, 2
    alignx      16, 7
.quantloop1:
    mov         al, DCTSIZE2/8/2
    alignx      16, 7
.quantloop2:
    movq        mm2, MMWORD [MMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
    movq        mm3, MMWORD [MMBLOCK(0,1,esi,SIZEOF_DCTELEM)]

    movq        mm0, mm2
    movq        mm1, mm3

    psraw       mm2, (WORD_BIT-1)       ; -1 if value < 0, 0 otherwise
    psraw       mm3, (WORD_BIT-1)

    pxor        mm0, mm2                ; val = -val, if val < 0
    pxor        mm1, mm3                ; (i.e., val = abs(val))
    psubw       mm0, mm2
    psubw       mm1, mm3

    ;
    ; MMX is an annoyingly crappy instruction set. It has two
    ; misfeatures that are causing problems here:
    ;
    ; - All multiplications are signed.
    ;
    ; - The second operand for the shifts is not treated as packed.
    ;
    ;
    ; We work around the first problem by implementing this algorithm:
    ;
    ; unsigned long unsigned_multiply(unsigned short x, unsigned short y)
    ; {
    ;   enum { SHORT_BIT = 16 };
    ;   signed short sx = (signed short)x;
    ;   signed short sy = (signed short)y;
    ;   signed long sz;
    ;
    ;   sz = (long)sx * (long)sy;       /* signed multiply */
    ;
    ;   if (sx < 0) sz += (long)sy << SHORT_BIT;
    ;   if (sy < 0) sz += (long)sx << SHORT_BIT;
    ;
    ;   return (unsigned long)sz;
    ; }
    ;
    ; (note that a negative sx adds _sy_ and vice versa)
    ;
    ; For the second problem, we replace the shift with a multiplication.
    ; Unfortunately that means we have to deal with the signed issue again.
    ;
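    ; As a rough C model of that replacement (a sketch only; shift_via_mul
    ; is a hypothetical name, not part of this library): for
    ; 1 <= shift <= 16, a logical right shift can be taken from the high
    ; word of a 16x16-bit multiplication:
    ;
    ; unsigned short shift_via_mul(unsigned short x, int shift)
    ; {
    ;   unsigned short scale = (unsigned short)(1 << (16 - shift));
    ;
    ;   /* (x * 2^(16-shift)) >> 16 == x >> shift */
    ;   return (unsigned short)(((unsigned long)x * scale) >> 16);
    ; }
    ;
    ; For shift == 1 the scale factor is 0x8000, which pmulhw treats as
    ; negative; hence the sign fixups in the code below.
    ;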

    paddw       mm0, MMWORD [CORRECTION(0,0,edx)]  ; correction + roundfactor
    paddw       mm1, MMWORD [CORRECTION(0,1,edx)]

    movq        mm4, mm0                ; store current value for later
    movq        mm5, mm1
    pmulhw      mm0, MMWORD [RECIPROCAL(0,0,edx)]  ; reciprocal
    pmulhw      mm1, MMWORD [RECIPROCAL(0,1,edx)]
    paddw       mm0, mm4                ; reciprocal is always negative
    paddw       mm1, mm5                ; (MSB=1), so we always need to add
                                        ; the initial value (the input value
                                        ; is never negative, as we inverted
                                        ; it at the start of this routine)

    ; here it gets a bit tricky, as both the scale factor
    ; and mm0/mm1 can be negative
    movq        mm6, MMWORD [SCALE(0,0,edx)]       ; scale
    movq        mm7, MMWORD [SCALE(0,1,edx)]
    movq        mm4, mm0
    movq        mm5, mm1
    pmulhw      mm0, mm6
    pmulhw      mm1, mm7

    psraw       mm6, (WORD_BIT-1)       ; determine if scale is negative
    psraw       mm7, (WORD_BIT-1)

    pand        mm6, mm4                ; and add the input if it is
    pand        mm7, mm5
    paddw       mm0, mm6
    paddw       mm1, mm7

    psraw       mm4, (WORD_BIT-1)       ; then check for a negative input
    psraw       mm5, (WORD_BIT-1)

    pand        mm4, MMWORD [SCALE(0,0,edx)]       ; and add the scale if it is
    pand        mm5, MMWORD [SCALE(0,1,edx)]
    paddw       mm0, mm4
    paddw       mm1, mm5

    pxor        mm0, mm2                ; val = -val, restoring the
    pxor        mm1, mm3                ; original sign
    psubw       mm0, mm2
    psubw       mm1, mm3

    movq        MMWORD [MMBLOCK(0,0,edi,SIZEOF_DCTELEM)], mm0
    movq        MMWORD [MMBLOCK(0,1,edi,SIZEOF_DCTELEM)], mm1

    add         esi, byte 8*SIZEOF_DCTELEM
    add         edx, byte 8*SIZEOF_DCTELEM
    add         edi, byte 8*SIZEOF_JCOEF
    dec         al
    jnz         near .quantloop2
    dec         ah
    jnz         near .quantloop1        ; to avoid branch misprediction

    emms                                ; empty MMX state

    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; unused
;   pop         ebx                     ; unused
    pop         ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32
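
;
; For reference, one pass of .quantloop2 corresponds to the following scalar
; model per coefficient (a sketch that ignores 16-bit wraparound;
; quantize_model is a hypothetical name, not part of this library):
;
; JCOEF quantize_model(DCTELEM x, unsigned short recip,
;                      unsigned short corr, unsigned short scale)
; {
;   unsigned long absval = (x < 0) ? -(long)x : x;
;   unsigned long v = ((absval + corr) * recip) >> 16;  /* reciprocal */
;   unsigned long r = (v * scale) >> 16;                /* descale */
;
;   return (JCOEF)((x < 0) ? -(long)r : (long)r);
; }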