;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS. All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%define private_prefix vp9

%include "third_party/x86inc/x86inc.asm"
%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"

SECTION_RODATA
; Eight words of 1. Not referenced anywhere in this file.
; NOTE(review): confirm nothing in the included files needs it before removing.
pw_1: times 8 dw 1

SECTION .text

;------------------------------------------------------------------------------
; QUANTIZE_FP suffix, gpr_count
;
;   suffix     appended to "quantize_" to form the function name handed to
;              cglobal; the value fp_32x32 additionally enables the scaling
;              steps guarded by the ifidn blocks in the body
;   gpr_count  number of general-purpose registers requested from cglobal
;
; The emitted function takes, in order:
;   coeff, ncoeff, skip, round, quant, qcoeff, dqcoeff, dequant, eob, scan,
;   iscan
; skip and scan are accepted but never read.
;
; Work is done in batches of 16 coefficients (two vectors of eight words).
; Per batch:
;   q  = sign(c) * (((|c| + round) * quant) >> 16)       stored via r3 (qcoeff)
;   dq = q * dequant (low 16 bits)                       stored via r4 (dqcoeff)
;        fp_32x32 variant: dq = sign(c) * ((|q| * dequant) >> 1)
;   running max of (iscan[i] + 1) over lanes with dq != 0
; After the last batch the maximum is written as a 16-bit value through eob.
;
; From the second batch on, a batch in which no |c| exceeds the threshold in
; m0 gets zeros stored to qcoeff and dqcoeff and leaves the maximum untouched.
;
; round, quant, dequant and iscan are read with mova (aligned loads).
; NOTE(review): the loop shape (first batch unconditional, then step by 16
; until the index reaches zero) assumes ncoeff is a positive multiple of 16;
; confirm against callers.
;
; Register roles once setup is complete:
;   coeffq, r3, r4, r5  coeff / qcoeff / dqcoeff / iscan pointers advanced by
;                       ncoeff elements (presumably the end of each array;
;                       INCREMENT_ELEMENTS_TRAN_LOW is defined in the include)
;   ncoeffq             negative element index stepped up toward zero
;   r2, r6              scratch (r2 first holds dequant, at the end eob)
;   m0                  skip threshold
;   m1 / m2 / m3        round / quant / dequant
;   m5                  all zeros
;   m7                  all ones while the eob update runs, otherwise scratch
;   m8                  running eob maximum (after the first batch)
;------------------------------------------------------------------------------
%macro QUANTIZE_FP 2
cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, round, quant, qcoeff, \
                                dqcoeff, dequant, eob, scan, iscan

  ; ---- setup: arguments and constant vectors ----
  movifnidn   coeffq, coeffmp
  movifnidn   ncoeffq, ncoeffmp
  mov         r2, dequantmp             ; r2 = dequant pointer
  movifnidn   roundq, roundmp
  movifnidn   quantq, quantmp
  mova        m1, [roundq]              ; m1 = round
  mova        m2, [quantq]              ; m2 = quant
%ifidn %1, fp_32x32
  pcmpeqw     m5, m5
  psrlw       m5, 15                    ; m5 = 1 in every word
  paddw       m1, m5
  psrlw       m1, 1                     ; round = (round + 1) >> 1
  psllw       m2, 1                     ; quant = quant << 1
%endif
  mova        m3, [r2q]                 ; m3 = dequant
  ; The next three moves overwrite the registers that carried round, quant
  ; and qcoeff, so they must stay after the loads above and in this order.
  mov         r3, qcoeffmp
  mov         r4, dqcoeffmp
  mov         r5, iscanmp
  pxor        m5, m5                    ; m5 = 0, kept for the whole function

  ; Advance every pointer by ncoeff elements, then negate the count so that
  ; [pointer + index] walks forward while the index climbs toward zero.
  INCREMENT_ELEMENTS_TRAN_LOW coeffq, ncoeffq
  lea         r5q, [r5q+ncoeffq*2]      ; iscan entries are 2 bytes each
  INCREMENT_ELEMENTS_TRAN_LOW r3q, ncoeffq
  INCREMENT_ELEMENTS_TRAN_LOW r4q, ncoeffq
  neg         ncoeffq

  ; ---- first batch ----
  ; The low vector uses round/quant/dequant as loaded. Before the high vector
  ; is processed each constant has its upper four words copied over its lower
  ; four; that broadcast form is what every later batch uses.
  LOAD_TRAN_LOW  9, coeffq, ncoeffq     ; m9  = c[0..7]
  LOAD_TRAN_LOW 10, coeffq, ncoeffq + 8 ; m10 = c[8..15]
  pabsw       m6, m9                    ; m6  = |m9|
  pabsw       m11, m10                  ; m11 = |m10|
  pcmpeqw     m7, m7                    ; m7  = -1 in every word

  paddsw      m6, m1                    ; saturating |c| + round
  punpckhqdq  m1, m1                    ; round: upper half broadcast
  paddsw      m11, m1
  pmulhw      m8, m6, m2                ; high 16 bits of (|c| + round) * quant
  punpckhqdq  m2, m2                    ; quant: upper half broadcast
  pmulhw      m13, m11, m2
  psignw      m8, m9                    ; give the result the sign of c
  psignw      m13, m10
  STORE_TRAN_LOW  8, r3q, ncoeffq,     6, 11, 12
  STORE_TRAN_LOW 13, r3q, ncoeffq + 8, 6, 11, 12
%ifidn %1, fp_32x32
  pabsw       m8, m8                    ; multiply magnitudes, sign comes later
  pabsw       m13, m13
%endif
  pmullw      m8, m3                    ; low 16 bits of q * dequant
  punpckhqdq  m3, m3                    ; dequant: upper half broadcast
  pmullw      m13, m3
%ifidn %1, fp_32x32
  psrlw       m8, 1                     ; halve the magnitude
  psrlw       m13, 1
  psignw      m8, m9                    ; restore the sign of c
  psignw      m13, m10
  psrlw       m0, m3, 2                 ; threshold = dequant >> 2
%else
  psrlw       m0, m3, 1                 ; threshold = dequant >> 1
%endif
  STORE_TRAN_LOW  8, r4q, ncoeffq,     6, 11, 12
  STORE_TRAN_LOW 13, r4q, ncoeffq + 8, 6, 11, 12
  pcmpeqw     m8, m5                    ; all ones where the dequantized word is 0
  pcmpeqw     m13, m5
  mova        m6, [r5q+ncoeffq*2+ 0]    ; m6  = iscan[0..7]
  mova        m11, [r5q+ncoeffq*2+16]   ; m11 = iscan[8..15]
  psubw       m6, m7                    ; minus -1, i.e. iscan + 1
  psubw       m11, m7
  pandn       m8, m6                    ; iscan + 1 on non-zero lanes, else 0
  pandn       m13, m11
  pmaxsw      m8, m13                   ; m8 starts the running eob maximum
  add         ncoeffq, mmsize           ; mmsize is 16 here: one batch of 16
  jz .reduce_eob                        ; a single batch was the whole block

.ac_batch:
  LOAD_TRAN_LOW  9, coeffq, ncoeffq     ; m9  = next eight coefficients
  LOAD_TRAN_LOW 10, coeffq, ncoeffq + 8 ; m10 = the eight after those
  pabsw       m6, m9
  pabsw       m11, m10

  ; Cheap reject: does any magnitude exceed the threshold in m0?
  pcmpgtw     m7, m6, m0
  pcmpgtw     m12, m11, m0
  pmovmskb    r6d, m7
  pmovmskb    r2d, m12

  or          r6, r2
  jz .zero_batch                        ; none does: store zeros instead

  pcmpeqw     m7, m7                    ; m7 was the compare mask; make it -1 again

  paddsw      m6, m1                    ; saturating |c| + round
  paddsw      m11, m1
  pmulhw      m14, m6, m2               ; high 16 bits of (|c| + round) * quant
  pmulhw      m13, m11, m2
  psignw      m14, m9                   ; give the result the sign of c
  psignw      m13, m10
  STORE_TRAN_LOW 14, r3q, ncoeffq,     6, 11, 12
  STORE_TRAN_LOW 13, r3q, ncoeffq + 8, 6, 11, 12
%ifidn %1, fp_32x32
  pabsw       m14, m14
  pabsw       m13, m13
%endif
  pmullw      m14, m3                   ; low 16 bits of q * dequant
  pmullw      m13, m3
%ifidn %1, fp_32x32
  psrlw       m14, 1
  psrlw       m13, 1
  psignw      m14, m9
  psignw      m13, m10
%endif
  STORE_TRAN_LOW 14, r4q, ncoeffq,     6, 11, 12
  STORE_TRAN_LOW 13, r4q, ncoeffq + 8, 6, 11, 12
  pcmpeqw     m14, m5                   ; all ones where the dequantized word is 0
  pcmpeqw     m13, m5
  mova        m6, [r5q+ncoeffq*2+ 0]    ; iscan for the low eight lanes
  mova        m11, [r5q+ncoeffq*2+16]   ; iscan for the high eight lanes
  psubw       m6, m7                    ; iscan + 1
  psubw       m11, m7
  pandn       m14, m6                   ; iscan + 1 on non-zero lanes, else 0
  pandn       m13, m11
  pmaxsw      m8, m14                   ; fold both vectors into the maximum
  pmaxsw      m8, m13
  add         ncoeffq, mmsize
  jl .ac_batch

  jmp .reduce_eob

.zero_batch:
  ; Nothing in this batch passed the threshold: qcoeff and dqcoeff get zeros
  ; from m5 and the eob maximum is left as it is.
  STORE_ZERO_TRAN_LOW 5, r3q, ncoeffq
  STORE_ZERO_TRAN_LOW 5, r3q, ncoeffq + 8
  STORE_ZERO_TRAN_LOW 5, r4q, ncoeffq
  STORE_ZERO_TRAN_LOW 5, r4q, ncoeffq + 8
  add         ncoeffq, mmsize
  jl .ac_batch

.reduce_eob:
  ; Reduce the eight word lanes of m8 to one maximum and store it via eob.
  mov         r2, eobmp
  pshufd      m7, m8, 0xe               ; upper four words onto the lower four
  pmaxsw      m8, m7
  pshuflw     m7, m8, 0xe               ; words 2 and 3 onto words 0 and 1
  pmaxsw      m8, m7
  pshuflw     m7, m8, 0x1               ; word 1 onto word 0
  pmaxsw      m8, m7
  pextrw      r6, m8, 0                 ; word 0 now holds the overall maximum
  mov         [r2], r6w
  RET
%endmacro

INIT_XMM ssse3
QUANTIZE_FP fp, 7
QUANTIZE_FP fp_32x32, 7