;
; jquantf.asm - sample data conversion and quantization (SSE & SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) and
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32
;
; Load data into workspace, applying unsigned->signed conversion
;
; GLOBAL(void)
; jsimd_convsamp_float_sse2(JSAMPARRAY sample_data, JDIMENSION start_col,
;                           FAST_FLOAT *workspace);
;
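; The scalar C sketch below is editorial, not part of the library; it shows
; what this routine computes, assuming FAST_FLOAT is float and JSAMPLE is an
; unsigned 8-bit type, as in a default libjpeg-turbo build.  The helper name
; convsamp_float_ref is hypothetical.
;
;   typedef unsigned char JSAMPLE;
;   typedef JSAMPLE *JSAMPROW;
;   typedef JSAMPROW *JSAMPARRAY;
;   typedef unsigned int JDIMENSION;
;   typedef float FAST_FLOAT;           /* assumption: FAST_FLOAT == float */
;   #define DCTSIZE        8
;   #define CENTERJSAMPLE  128
;
;   /* Hypothetical scalar equivalent: center each sample around zero and
;      store it as a float in the 8x8 workspace (row-major order). */
;   static void convsamp_float_ref(JSAMPARRAY sample_data,
;                                  JDIMENSION start_col, FAST_FLOAT *workspace)
;   {
;     for (int row = 0; row < DCTSIZE; row++) {
;       JSAMPROW src = sample_data[row] + start_col;
;       for (int col = 0; col < DCTSIZE; col++)
;         *workspace++ = (FAST_FLOAT)(src[col] - CENTERJSAMPLE);
;     }
;   }
;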

%define sample_data  ebp + 8            ; JSAMPARRAY sample_data
%define start_col    ebp + 12           ; JDIMENSION start_col
%define workspace    ebp + 16           ; FAST_FLOAT *workspace

    align       32
    GLOBAL_FUNCTION(jsimd_convsamp_float_sse2)

EXTN(jsimd_convsamp_float_sse2):
    push        ebp
    mov         ebp, esp
    push        ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    pcmpeqw     xmm7, xmm7
    psllw       xmm7, 7
    packsswb    xmm7, xmm7              ; xmm7 = PB_CENTERJSAMPLE (0x808080..)

    mov         esi, JSAMPARRAY [sample_data]  ; (JSAMPROW *)
    mov         eax, JDIMENSION [start_col]
    mov         edi, POINTER [workspace]       ; (FAST_FLOAT *)
    mov         ecx, DCTSIZE/2
    alignx      16, 7
.convloop:
    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)

    movq        xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
    movq        xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]

    psubb       xmm0, xmm7              ; xmm0=(01234567)
    psubb       xmm1, xmm7              ; xmm1=(89ABCDEF)

    punpcklbw   xmm0, xmm0              ; xmm0=(*0*1*2*3*4*5*6*7)
    punpcklbw   xmm1, xmm1              ; xmm1=(*8*9*A*B*C*D*E*F)

    punpcklwd   xmm2, xmm0              ; xmm2=(***0***1***2***3)
    punpckhwd   xmm0, xmm0              ; xmm0=(***4***5***6***7)
    punpcklwd   xmm3, xmm1              ; xmm3=(***8***9***A***B)
    punpckhwd   xmm1, xmm1              ; xmm1=(***C***D***E***F)

    psrad       xmm2, (DWORD_BIT-BYTE_BIT)  ; xmm2=(0123)
    psrad       xmm0, (DWORD_BIT-BYTE_BIT)  ; xmm0=(4567)
    cvtdq2ps    xmm2, xmm2              ; xmm2=(0123)
    cvtdq2ps    xmm0, xmm0              ; xmm0=(4567)
    psrad       xmm3, (DWORD_BIT-BYTE_BIT)  ; xmm3=(89AB)
    psrad       xmm1, (DWORD_BIT-BYTE_BIT)  ; xmm1=(CDEF)
    cvtdq2ps    xmm3, xmm3              ; xmm3=(89AB)
    cvtdq2ps    xmm1, xmm1              ; xmm1=(CDEF)

    movaps      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_FAST_FLOAT)], xmm2
    movaps      XMMWORD [XMMBLOCK(0,1,edi,SIZEOF_FAST_FLOAT)], xmm0
    movaps      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_FAST_FLOAT)], xmm3
    movaps      XMMWORD [XMMBLOCK(1,1,edi,SIZEOF_FAST_FLOAT)], xmm1

    add         esi, byte 2*SIZEOF_JSAMPROW
    add         edi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
    dec         ecx
    jnz         short .convloop

    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    pop         ebx
    pop         ebp
    ret

; --------------------------------------------------------------------------
;
; Quantize/descale the coefficients, and store into coef_block
;
; GLOBAL(void)
; jsimd_quantize_float_sse2(JCOEFPTR coef_block, FAST_FLOAT *divisors,
;                           FAST_FLOAT *workspace);
;

%define coef_block  ebp + 8             ; JCOEFPTR coef_block
%define divisors    ebp + 12            ; FAST_FLOAT *divisors
%define workspace   ebp + 16            ; FAST_FLOAT *workspace

    align       32
    GLOBAL_FUNCTION(jsimd_quantize_float_sse2)

EXTN(jsimd_quantize_float_sse2):
    push        ebp
    mov         ebp, esp
;   push        ebx                     ; unused
;   push        ecx                     ; unused
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    mov         esi, POINTER [workspace]
    mov         edx, POINTER [divisors]
    mov         edi, JCOEFPTR [coef_block]
    mov         eax, DCTSIZE2/16
    alignx      16, 7
.quantloop:
    movaps      xmm0, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_FAST_FLOAT)]
    movaps      xmm1, XMMWORD [XMMBLOCK(0,1,esi,SIZEOF_FAST_FLOAT)]
    mulps       xmm0, XMMWORD [XMMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
    mulps       xmm1, XMMWORD [XMMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
    movaps      xmm2, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_FAST_FLOAT)]
    movaps      xmm3, XMMWORD [XMMBLOCK(1,1,esi,SIZEOF_FAST_FLOAT)]
    mulps       xmm2, XMMWORD [XMMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
    mulps       xmm3, XMMWORD [XMMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]

    cvtps2dq    xmm0, xmm0
    cvtps2dq    xmm1, xmm1
    cvtps2dq    xmm2, xmm2
    cvtps2dq    xmm3, xmm3

    packssdw    xmm0, xmm1
    packssdw    xmm2, xmm3

    movdqa      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_JCOEF)], xmm0
    movdqa      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_JCOEF)], xmm2

    add         esi, byte 16*SIZEOF_FAST_FLOAT
    add         edx, byte 16*SIZEOF_FAST_FLOAT
    add         edi, byte 16*SIZEOF_JCOEF
    dec         eax
    jnz         short .quantloop

    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; unused
;   pop         ebx                     ; unused
    pop         ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32
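
; The scalar C sketch below is editorial, not part of the library; it shows
; what jsimd_quantize_float_sse2() above computes, assuming FAST_FLOAT is
; float and JCOEF is a signed 16-bit type, as in a default libjpeg-turbo
; build.  divisors[] holds precomputed reciprocals of the quantization steps,
; so quantization is a multiply followed by rounding to the nearest integer
; (cvtps2dq rounds to nearest even; packssdw additionally saturates to the
; 16-bit range, which the sketch omits).  The helper name quantize_float_ref
; is hypothetical.
;
;   #include <math.h>
;
;   typedef short JCOEF;
;   typedef JCOEF *JCOEFPTR;
;   typedef float FAST_FLOAT;           /* assumption: FAST_FLOAT == float */
;   #define DCTSIZE2  64
;
;   /* Hypothetical scalar equivalent of the .quantloop above. */
;   static void quantize_float_ref(JCOEFPTR coef_block,
;                                  const FAST_FLOAT *divisors,
;                                  const FAST_FLOAT *workspace)
;   {
;     for (int i = 0; i < DCTSIZE2; i++) {
;       FAST_FLOAT temp = workspace[i] * divisors[i];
;       coef_block[i] = (JCOEF)nearbyintf(temp);  /* round to nearest (even) */
;     }
;   }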