;
; jquanti.asm - sample data conversion and quantization (AVX2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, 2018, D. R. Commander.
; Copyright (C) 2016, Matthieu Darbois.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) and
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; [TAB8]

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32
;
; Load data into workspace, applying unsigned->signed conversion
;
; GLOBAL(void)
; jsimd_convsamp_avx2(JSAMPARRAY sample_data, JDIMENSION start_col,
;                     DCTELEM *workspace);
;

%define sample_data  ebp + 8            ; JSAMPARRAY sample_data
%define start_col    ebp + 12           ; JDIMENSION start_col
%define workspace    ebp + 16           ; DCTELEM *workspace

    align       32
    GLOBAL_FUNCTION(jsimd_convsamp_avx2)

EXTN(jsimd_convsamp_avx2):
    push        ebp
    mov         ebp, esp
    push        ebx
;   push        ecx                     ; need not be preserved
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    mov         esi, JSAMPARRAY [sample_data]  ; (JSAMPROW *)
    mov         eax, JDIMENSION [start_col]
    mov         edi, POINTER [workspace]       ; (DCTELEM *)

    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    movq        xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
    movq        xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]

    mov         ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    mov         edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    movq        xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
    movq        xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]

    mov         ebx, JSAMPROW [esi+4*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    mov         edx, JSAMPROW [esi+5*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    movq        xmm4, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
    movq        xmm5, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]

    mov         ebx, JSAMPROW [esi+6*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    mov         edx, JSAMPROW [esi+7*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    movq        xmm6, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
    movq        xmm7, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]

    vinserti128 ymm0, ymm0, xmm1, 1
    vinserti128 ymm2, ymm2, xmm3, 1
    vinserti128 ymm4, ymm4, xmm5, 1
    vinserti128 ymm6, ymm6, xmm7, 1

    vpxor       ymm1, ymm1, ymm1        ; ymm1=(all 0's)
    vpunpcklbw  ymm0, ymm0, ymm1
    vpunpcklbw  ymm2, ymm2, ymm1
    vpunpcklbw  ymm4, ymm4, ymm1
    vpunpcklbw  ymm6, ymm6, ymm1

    vpcmpeqw    ymm7, ymm7, ymm7
    vpsllw      ymm7, ymm7, 7           ; ymm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}

    vpaddw      ymm0, ymm0, ymm7
    vpaddw      ymm2, ymm2, ymm7
    vpaddw      ymm4, ymm4, ymm7
    vpaddw      ymm6, ymm6, ymm7

    vmovdqu     YMMWORD [YMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], ymm0
    vmovdqu     YMMWORD [YMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], ymm2
    vmovdqu     YMMWORD [YMMBLOCK(4,0,edi,SIZEOF_DCTELEM)], ymm4
    vmovdqu     YMMWORD [YMMBLOCK(6,0,edi,SIZEOF_DCTELEM)], ymm6

    vzeroupper
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; need not be preserved
    pop         ebx
    pop         ebp
    ret
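
; For reference, a scalar C sketch of what the routine above computes (an
; illustrative comment only, not part of the assembled output; DCTSIZE,
; CENTERJSAMPLE, and the JSAMPLE/JSAMPROW/DCTELEM types come from the
; library headers):
;
;   void convsamp(JSAMPARRAY sample_data, JDIMENSION start_col,
;                 DCTELEM *workspace)
;   {
;     int row, col;
;     for (row = 0; row < DCTSIZE; row++) {       /* 8 rows per block */
;       JSAMPROW p = sample_data[row] + start_col;
;       for (col = 0; col < DCTSIZE; col++)
;         /* adding 0xFF80 to a 16-bit word == subtracting CENTERJSAMPLE */
;         *workspace++ = (DCTELEM)p[col] - CENTERJSAMPLE;
;     }
;   }
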
; --------------------------------------------------------------------------
;
; Quantize/descale the coefficients, and store into coef_block
;
; This implementation is based on an algorithm described in
;   "How to optimize for the Pentium family of microprocessors"
;   (http://www.agner.org/assem/).
;
; GLOBAL(void)
; jsimd_quantize_avx2(JCOEFPTR coef_block, DCTELEM *divisors,
;                     DCTELEM *workspace);
;

%define RECIPROCAL(m, n, b) \
  YMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
%define CORRECTION(m, n, b) \
  YMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
%define SCALE(m, n, b) \
  YMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)

%define coef_block  ebp + 8             ; JCOEFPTR coef_block
%define divisors    ebp + 12            ; DCTELEM *divisors
%define workspace   ebp + 16            ; DCTELEM *workspace

    align       32
    GLOBAL_FUNCTION(jsimd_quantize_avx2)

EXTN(jsimd_quantize_avx2):
    push        ebp
    mov         ebp, esp
;   push        ebx                     ; unused
;   push        ecx                     ; unused
;   push        edx                     ; need not be preserved
    push        esi
    push        edi

    mov         esi, POINTER [workspace]
    mov         edx, POINTER [divisors]
    mov         edi, JCOEFPTR [coef_block]

    vmovdqu     ymm4, [YMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
    vmovdqu     ymm5, [YMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
    vmovdqu     ymm6, [YMMBLOCK(4,0,esi,SIZEOF_DCTELEM)]
    vmovdqu     ymm7, [YMMBLOCK(6,0,esi,SIZEOF_DCTELEM)]
    vpabsw      ymm0, ymm4
    vpabsw      ymm1, ymm5
    vpabsw      ymm2, ymm6
    vpabsw      ymm3, ymm7

    vpaddw      ymm0, YMMWORD [CORRECTION(0,0,edx)]  ; correction + roundfactor
    vpaddw      ymm1, YMMWORD [CORRECTION(2,0,edx)]
    vpaddw      ymm2, YMMWORD [CORRECTION(4,0,edx)]
    vpaddw      ymm3, YMMWORD [CORRECTION(6,0,edx)]
    vpmulhuw    ymm0, YMMWORD [RECIPROCAL(0,0,edx)]  ; reciprocal
    vpmulhuw    ymm1, YMMWORD [RECIPROCAL(2,0,edx)]
    vpmulhuw    ymm2, YMMWORD [RECIPROCAL(4,0,edx)]
    vpmulhuw    ymm3, YMMWORD [RECIPROCAL(6,0,edx)]
    vpmulhuw    ymm0, YMMWORD [SCALE(0,0,edx)]       ; scale
    vpmulhuw    ymm1, YMMWORD [SCALE(2,0,edx)]
    vpmulhuw    ymm2, YMMWORD [SCALE(4,0,edx)]
    vpmulhuw    ymm3, YMMWORD [SCALE(6,0,edx)]

    vpsignw     ymm0, ymm0, ymm4
    vpsignw     ymm1, ymm1, ymm5
    vpsignw     ymm2, ymm2, ymm6
    vpsignw     ymm3, ymm3, ymm7

    vmovdqu     [YMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], ymm0
    vmovdqu     [YMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], ymm1
    vmovdqu     [YMMBLOCK(4,0,edi,SIZEOF_DCTELEM)], ymm2
    vmovdqu     [YMMBLOCK(6,0,edi,SIZEOF_DCTELEM)], ymm3

    vzeroupper
    pop         edi
    pop         esi
;   pop         edx                     ; need not be preserved
;   pop         ecx                     ; unused
;   pop         ebx                     ; unused
    pop         ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32
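
; For reference, a scalar C sketch of the per-coefficient arithmetic that
; jsimd_quantize_avx2 above performs (an illustrative comment only, not part
; of the assembled output; it assumes <stdint.h>/<stdlib.h> and the library
; headers). The divisors pointer addresses three consecutive 64-entry DCTELEM
; tables (reciprocal, correction, scale), as indexed by the RECIPROCAL,
; CORRECTION, and SCALE macros above, so that division by the quantization
; step becomes two unsigned high-word multiplies:
;
;   for (i = 0; i < DCTSIZE2; i++) {                       /* 64 coefs */
;     int16_t x = workspace[i];
;     uint16_t t = (uint16_t)abs(x);                       /* vpabsw   */
;     t += (uint16_t)divisors[DCTSIZE2 * 1 + i];           /* vpaddw   */
;     t = (uint16_t)(((uint32_t)t *                        /* vpmulhuw */
;                     (uint16_t)divisors[DCTSIZE2 * 0 + i]) >> 16);
;     t = (uint16_t)(((uint32_t)t *                        /* vpmulhuw */
;                     (uint16_t)divisors[DCTSIZE2 * 2 + i]) >> 16);
;     coef_block[i] = (x > 0) ? (JCOEF)t :                 /* vpsignw  */
;                     ((x < 0) ? -(JCOEF)t : 0);
;   }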