;
; jquanti.asm - sample data conversion and quantization (AVX2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, 2018, D. R. Commander.
; Copyright (C) 2016, Matthieu Darbois.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler),
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32
;
; Load data into workspace, applying unsigned->signed conversion
;
; GLOBAL(void)
; jsimd_convsamp_avx2(JSAMPARRAY sample_data, JDIMENSION start_col,
;                     DCTELEM *workspace);
;

%define sample_data  ebp + 8            ; JSAMPARRAY sample_data
%define start_col    ebp + 12           ; JDIMENSION start_col
%define workspace    ebp + 16           ; DCTELEM *workspace

    align       32
    GLOBAL_FUNCTION(jsimd_convsamp_avx2)

EXTN(jsimd_convsamp_avx2):
    push        ebp
    mov         ebp, esp
    push        ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    mov         esi, JSAMPARRAY [sample_data]  ; (JSAMPROW *)
    mov         eax, JDIMENSION [start_col]
    mov         edi, POINTER [workspace]       ; (DCTELEM *)

    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    movq        xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
    movq        xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]

    mov         ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    mov         edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    movq        xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
    movq        xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]

    mov         ebx, JSAMPROW [esi+4*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    mov         edx, JSAMPROW [esi+5*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    movq        xmm4, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
    movq        xmm5, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]

    mov         ebx, JSAMPROW [esi+6*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    mov         edx, JSAMPROW [esi+7*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    movq        xmm6, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
    movq        xmm7, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]

    vinserti128 ymm0, ymm0, xmm1, 1
    vinserti128 ymm2, ymm2, xmm3, 1
    vinserti128 ymm4, ymm4, xmm5, 1
    vinserti128 ymm6, ymm6, xmm7, 1

    vpxor       ymm1, ymm1, ymm1        ; ymm1=(all 0's)
    vpunpcklbw  ymm0, ymm0, ymm1
    vpunpcklbw  ymm2, ymm2, ymm1
    vpunpcklbw  ymm4, ymm4, ymm1
    vpunpcklbw  ymm6, ymm6, ymm1

    vpcmpeqw    ymm7, ymm7, ymm7
    vpsllw      ymm7, ymm7, 7           ; ymm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}

    vpaddw      ymm0, ymm0, ymm7
    vpaddw      ymm2, ymm2, ymm7
    vpaddw      ymm4, ymm4, ymm7
    vpaddw      ymm6, ymm6, ymm7

    vmovdqu     YMMWORD [YMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], ymm0
    vmovdqu     YMMWORD [YMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], ymm2
    vmovdqu     YMMWORD [YMMBLOCK(4,0,edi,SIZEOF_DCTELEM)], ymm4
    vmovdqu     YMMWORD [YMMBLOCK(6,0,edi,SIZEOF_DCTELEM)], ymm6

    vzeroupper
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    pop         ebx
    pop         ebp
    ret

; --------------------------------------------------------------------------
;
; Quantize/descale the coefficients, and store into coef_block
;
; This implementation is based on an algorithm described in
;   "How to optimize for the Pentium family of microprocessors"
;   (http://www.agner.org/assem/).
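;
; Roughly, for each 16-bit coefficient c in the workspace, the code below
; computes
;
;   coef = SIGN(c) * MULHI(MULHI(ABS(c) + correction, reciprocal), scale)
;
; where MULHI(a, b) is the upper 16 bits of the unsigned 16x16-bit product
; (vpmulhuw), and correction, reciprocal, and scale are the three
; DCTSIZE2-element tables packed consecutively into the divisors array.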
;
; GLOBAL(void)
; jsimd_quantize_avx2(JCOEFPTR coef_block, DCTELEM *divisors,
;                     DCTELEM *workspace);
;

%define RECIPROCAL(m, n, b) \
  YMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
%define CORRECTION(m, n, b) \
  YMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
%define SCALE(m, n, b) \
  YMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)

%define coef_block  ebp + 8             ; JCOEFPTR coef_block
%define divisors    ebp + 12            ; DCTELEM *divisors
%define workspace   ebp + 16            ; DCTELEM *workspace

    align       32
    GLOBAL_FUNCTION(jsimd_quantize_avx2)

EXTN(jsimd_quantize_avx2):
    push        ebp
    mov         ebp, esp
;   push        ebx                     ; unused
;   push        ecx                     ; unused
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    mov         esi, POINTER [workspace]
    mov         edx, POINTER [divisors]
    mov         edi, JCOEFPTR [coef_block]

    vmovdqu     ymm4, [YMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
    vmovdqu     ymm5, [YMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
    vmovdqu     ymm6, [YMMBLOCK(4,0,esi,SIZEOF_DCTELEM)]
    vmovdqu     ymm7, [YMMBLOCK(6,0,esi,SIZEOF_DCTELEM)]
    vpabsw      ymm0, ymm4
    vpabsw      ymm1, ymm5
    vpabsw      ymm2, ymm6
    vpabsw      ymm3, ymm7

    vpaddw      ymm0, YMMWORD [CORRECTION(0,0,edx)]  ; correction + roundfactor
    vpaddw      ymm1, YMMWORD [CORRECTION(2,0,edx)]
    vpaddw      ymm2, YMMWORD [CORRECTION(4,0,edx)]
    vpaddw      ymm3, YMMWORD [CORRECTION(6,0,edx)]
    vpmulhuw    ymm0, YMMWORD [RECIPROCAL(0,0,edx)]  ; reciprocal
    vpmulhuw    ymm1, YMMWORD [RECIPROCAL(2,0,edx)]
    vpmulhuw    ymm2, YMMWORD [RECIPROCAL(4,0,edx)]
    vpmulhuw    ymm3, YMMWORD [RECIPROCAL(6,0,edx)]
    vpmulhuw    ymm0, YMMWORD [SCALE(0,0,edx)]       ; scale
    vpmulhuw    ymm1, YMMWORD [SCALE(2,0,edx)]
    vpmulhuw    ymm2, YMMWORD [SCALE(4,0,edx)]
    vpmulhuw    ymm3, YMMWORD [SCALE(6,0,edx)]

    vpsignw     ymm0, ymm0, ymm4
    vpsignw     ymm1, ymm1, ymm5
    vpsignw     ymm2, ymm2, ymm6
    vpsignw     ymm3, ymm3, ymm7

    vmovdqu     [YMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], ymm0
    vmovdqu     [YMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], ymm1
    vmovdqu     [YMMBLOCK(4,0,edi,SIZEOF_DCTELEM)], ymm2
    vmovdqu     [YMMBLOCK(6,0,edi,SIZEOF_DCTELEM)], ymm3

    vzeroupper
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; unused
;   pop         ebx                     ; unused
    pop         ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32