;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA
pw_1: times 8 dw 1

SECTION .text

;-----------------------------------------------------------------------------
; QUANTIZE_FN name_suffix, num_gprs
;
; Emits the function quantize_<name_suffix>.
;   %1  function name suffix; the value b_32x32 additionally enables the
;       %ifidn-guarded code paths below (halved zbin/round, one extra bit of
;       quantizer precision, halved dequantized output and an early-out for
;       groups of 16 coefficients that are all below zbin).
;   %2  number of general purpose registers handed to cglobal.
;
; Arguments, in order: coeff, ncoeff, zbin, round, quant, shift, qcoeff,
; dqcoeff, dequant, eob, scan, iscan.  None are auto-loaded by cglobal
; (arg count 0); each one is fetched explicitly through its *mp alias.
;
; Register roles inside the body:
;   m0 = zbin - 1      m1 = round        m2 = quant
;   m3 = dequant       m4 = shift        m5 = zero (also used as a scratch
;                                             register, then re-zeroed)
;   m8 = running per-lane maximum of (iscan[i] + 1) over non-zero outputs
;   ncoeffq = negative count of coefficients left; all array pointers are
;             pre-advanced to the array end so [ptr + ncoeffq*k] walks
;             forward as ncoeffq climbs towards zero.
;
; coeff/qcoeff/dqcoeff hold 32-bit elements and iscan holds 16-bit elements
; (see the *4 and *2 index scales).  Every iteration handles 16 coefficients.
; All vector accesses use mova, i.e. aligned loads and stores.
;-----------------------------------------------------------------------------
%macro QUANTIZE_FN 2
cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, zbin, round, quant, \
                                shift, qcoeff, dqcoeff, dequant, \
                                eob, scan, iscan

  ; actual quantize loop - setup pointers, rounders, etc.
  movifnidn   coeffq, coeffmp
  movifnidn   ncoeffq, ncoeffmp
  movifnidn   zbinq, zbinmp
  movifnidn   roundq, roundmp
  movifnidn   quantq, quantmp
  movifnidn   dequantq, dequantmp
  mova        m0, [zbinq]                 ; m0 = zbin
  mova        m1, [roundq]                ; m1 = round
  mova        m2, [quantq]                ; m2 = quant
%ifidn %1, b_32x32
  pcmpeqw     m5, m5                      ; m5 = all ones
  psrlw       m5, 15                      ; m5 = 1 in every word
  paddw       m0, m5
  paddw       m1, m5
  psrlw       m0, 1                       ; m0 = (m0 + 1) / 2
  psrlw       m1, 1                       ; m1 = (m1 + 1) / 2
%endif
  mova        m3, [dequantq]              ; m3 = dequant
  mov         r2, shiftmp
  psubw       m0, [GLOBAL(pw_1)]          ; zbin - 1, so "> m0" means ">= zbin"
  mova        m4, [r2]                    ; m4 = shift
  mov         r3, qcoeffmp
  mov         r4, dqcoeffmp
  mov         r5, iscanmp
  pxor        m5, m5                      ; m5 = dedicated zero
  ; Rename the registers to match what was just loaded into r3..r5.  eob
  ; stays the 10th name, as in the cglobal list, so eobmp is still valid.
  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, eob
  lea         coeffq, [coeffq+ncoeffq*4]
  lea         qcoeffq, [qcoeffq+ncoeffq*4]
  lea         dqcoeffq, [dqcoeffq+ncoeffq*4]
  lea         iscanq, [iscanq+ncoeffq*2]
  neg         ncoeffq

  ; get DC and first 15 AC coeffs
  ; coeff stored as 32bit numbers & require 16bit numbers
  mova        m9, [coeffq+ncoeffq*4+ 0]
  packssdw    m9, [coeffq+ncoeffq*4+16]
  mova        m10, [coeffq+ncoeffq*4+32]
  packssdw    m10, [coeffq+ncoeffq*4+48]

  ; Each parameter register is used once as loaded for the first 8
  ; coefficients, then its high qword is replicated (punpckhqdq x, x) and
  ; that form is used for everything that follows.
  pabsw       m6, m9                      ; m6 = abs(m9)
  pabsw       m11, m10                    ; m11 = abs(m10)
  pcmpgtw     m7, m6, m0                  ; m7 = c[i] >= zbin
  punpckhqdq  m0, m0
  pcmpgtw     m12, m11, m0                ; m12 = c[i] >= zbin
  paddsw      m6, m1                      ; m6 += round
  punpckhqdq  m1, m1
  paddsw      m11, m1                     ; m11 += round
  pmulhw      m8, m6, m2                  ; m8 = m6*q>>16
  punpckhqdq  m2, m2
  pmulhw      m13, m11, m2                ; m13 = m11*q>>16
  paddw       m8, m6                      ; m8 += m6
  paddw       m13, m11                    ; m13 += m11
%ifidn %1, b_32x32
  pmullw      m5, m8, m4                  ; store the lower 16 bits of m8*qsh
%endif
  pmulhw      m8, m4                      ; m8 = m8*qsh>>16
%ifidn %1, b_32x32
  psllw       m8, 1
  psrlw       m5, 15                      ; top bit of the low product half
  por         m8, m5                      ; net effect: m8 = m8*qsh>>15
%endif
  punpckhqdq  m4, m4
%ifidn %1, b_32x32
  pmullw      m5, m13, m4                 ; store the lower 16 bits of m13*qsh
%endif
  pmulhw      m13, m4                     ; m13 = m13*qsh>>16
%ifidn %1, b_32x32
  psllw       m13, 1
  psrlw       m5, 15
  por         m13, m5
  pxor        m5, m5                      ; reset m5 to zero register
%endif
  psignw      m8, m9                      ; m8 = reinsert sign
  psignw      m13, m10                    ; m13 = reinsert sign
  pand        m8, m7                      ; zero the lanes below zbin
  pand        m13, m12

  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
  mova        m11, m8
  mova        m6, m8
  pcmpgtw     m5, m8                      ; m5 = (0 > m8) = sign-extension mask
  punpcklwd   m11, m5
  punpckhwd   m6, m5
  mova        [qcoeffq+ncoeffq*4+ 0], m11
  mova        [qcoeffq+ncoeffq*4+16], m6
  pxor        m5, m5
  mova        m11, m13
  mova        m6, m13
  pcmpgtw     m5, m13
  punpcklwd   m11, m5
  punpckhwd   m6, m5
  mova        [qcoeffq+ncoeffq*4+32], m11
  mova        [qcoeffq+ncoeffq*4+48], m6
  pxor        m5, m5                      ; reset m5 to zero register

%ifidn %1, b_32x32
  pabsw       m8, m8
  pabsw       m13, m13
%endif
  pmullw      m8, m3                      ; dqc[i] = qc[i] * q
  punpckhqdq  m3, m3
  pmullw      m13, m3                     ; dqc[i] = qc[i] * q
%ifidn %1, b_32x32
  psrlw       m8, 1
  psrlw       m13, 1
  psignw      m8, m9
  psignw      m13, m10
%endif

  ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
  mova        m11, m8
  mova        m6, m8
  pcmpgtw     m5, m8
  punpcklwd   m11, m5
  punpckhwd   m6, m5
  mova        [dqcoeffq+ncoeffq*4+ 0], m11
  mova        [dqcoeffq+ncoeffq*4+16], m6
  pxor        m5, m5
  mova        m11, m13
  mova        m6, m13
  pcmpgtw     m5, m13
  punpcklwd   m11, m5
  punpckhwd   m6, m5
  mova        [dqcoeffq+ncoeffq*4+32], m11
  mova        [dqcoeffq+ncoeffq*4+48], m6
  pxor        m5, m5                      ; reset m5 to zero register

  pcmpeqw     m8, m5                      ; m8 = c[i] == 0
  pcmpeqw     m13, m5                     ; m13 = c[i] == 0
  mova        m6, [iscanq+ncoeffq*2+ 0]   ; m6 = scan[i]
  mova        m11, [iscanq+ncoeffq*2+16]  ; m11 = scan[i]
  psubw       m6, m7                      ; m6 = scan[i] + 1
  psubw       m11, m12                    ; m11 = scan[i] + 1
  pandn       m8, m6                      ; m8 = max(eob)
  pandn       m13, m11                    ; m13 = max(eob)
  pmaxsw      m8, m13
  add         ncoeffq, mmsize
  jz .accumulate_eob

.ac_only_loop:
  ; pack coeff from 32bit to 16bit array
  mova        m9, [coeffq+ncoeffq*4+ 0]
  packssdw    m9, [coeffq+ncoeffq*4+16]
  mova        m10, [coeffq+ncoeffq*4+32]
  packssdw    m10, [coeffq+ncoeffq*4+48]

  pabsw       m6, m9                      ; m6 = abs(m9)
  pabsw       m11, m10                    ; m11 = abs(m10)
  pcmpgtw     m7, m6, m0                  ; m7 = c[i] >= zbin
  pcmpgtw     m12, m11, m0                ; m12 = c[i] >= zbin
%ifidn %1, b_32x32
  ; Early out: if none of these 16 coefficients reaches zbin, just store
  ; zeros for the group.
  pmovmskb    r6d, m7
  pmovmskb    r2d, m12
  or          r6, r2
  jz .skip_iter
%endif
  paddsw      m6, m1                      ; m6 += round
  paddsw      m11, m1                     ; m11 += round
  pmulhw      m14, m6, m2                 ; m14 = m6*q>>16
  pmulhw      m13, m11, m2                ; m13 = m11*q>>16
  paddw       m14, m6                     ; m14 += m6
  paddw       m13, m11                    ; m13 += m11
%ifidn %1, b_32x32
  pmullw      m5, m14, m4                 ; store the lower 16 bits of m14*qsh
%endif
  pmulhw      m14, m4                     ; m14 = m14*qsh>>16
%ifidn %1, b_32x32
  psllw       m14, 1
  psrlw       m5, 15
  por         m14, m5
  pmullw      m5, m13, m4                 ; store the lower 16 bits of m13*qsh
%endif
  pmulhw      m13, m4                     ; m13 = m13*qsh>>16
%ifidn %1, b_32x32
  psllw       m13, 1
  psrlw       m5, 15
  por         m13, m5
  pxor        m5, m5                      ; reset m5 to zero register
%endif
  psignw      m14, m9                     ; m14 = reinsert sign
  psignw      m13, m10                    ; m13 = reinsert sign
  pand        m14, m7
  pand        m13, m12

  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
  mova        m11, m14
  mova        m6, m14
  pcmpgtw     m5, m14
  punpcklwd   m11, m5
  punpckhwd   m6, m5
  mova        [qcoeffq+ncoeffq*4+ 0], m11
  mova        [qcoeffq+ncoeffq*4+16], m6
  pxor        m5, m5
  mova        m11, m13
  mova        m6, m13
  pcmpgtw     m5, m13
  punpcklwd   m11, m5
  punpckhwd   m6, m5
  mova        [qcoeffq+ncoeffq*4+32], m11
  mova        [qcoeffq+ncoeffq*4+48], m6
  pxor        m5, m5                      ; reset m5 to zero register

%ifidn %1, b_32x32
  pabsw       m14, m14
  pabsw       m13, m13
%endif
  pmullw      m14, m3                     ; dqc[i] = qc[i] * q
  pmullw      m13, m3                     ; dqc[i] = qc[i] * q
%ifidn %1, b_32x32
  psrlw       m14, 1
  psrlw       m13, 1
  psignw      m14, m9
  psignw      m13, m10
%endif

  ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
  mova        m11, m14
  mova        m6, m14
  pcmpgtw     m5, m14
  punpcklwd   m11, m5
  punpckhwd   m6, m5
  mova        [dqcoeffq+ncoeffq*4+ 0], m11
  mova        [dqcoeffq+ncoeffq*4+16], m6
  pxor        m5, m5
  mova        m11, m13
  mova        m6, m13
  pcmpgtw     m5, m13
  punpcklwd   m11, m5
  punpckhwd   m6, m5
  mova        [dqcoeffq+ncoeffq*4+32], m11
  mova        [dqcoeffq+ncoeffq*4+48], m6
  pxor        m5, m5

  pcmpeqw     m14, m5                     ; m14 = c[i] == 0
  pcmpeqw     m13, m5                     ; m13 = c[i] == 0
  mova        m6, [iscanq+ncoeffq*2+ 0]   ; m6 = scan[i]
  mova        m11, [iscanq+ncoeffq*2+16]  ; m11 = scan[i]
  psubw       m6, m7                      ; m6 = scan[i] + 1
  psubw       m11, m12                    ; m11 = scan[i] + 1
  pandn       m14, m6                     ; m14 = max(eob)
  pandn       m13, m11                    ; m13 = max(eob)
  pmaxsw      m8, m14
  pmaxsw      m8, m13
  add         ncoeffq, mmsize
  jl .ac_only_loop

%ifidn %1, b_32x32
  jmp .accumulate_eob
.skip_iter:
  ; m5 is zero here: write 16 zero qcoeff and 16 zero dqcoeff entries.
  mova        [qcoeffq+ncoeffq*4+ 0], m5
  mova        [qcoeffq+ncoeffq*4+16], m5
  mova        [qcoeffq+ncoeffq*4+32], m5
  mova        [qcoeffq+ncoeffq*4+48], m5
  mova        [dqcoeffq+ncoeffq*4+ 0], m5
  mova        [dqcoeffq+ncoeffq*4+16], m5
  mova        [dqcoeffq+ncoeffq*4+32], m5
  mova        [dqcoeffq+ncoeffq*4+48], m5
  add         ncoeffq, mmsize
  jl .ac_only_loop
%endif

.accumulate_eob:
  ; horizontally accumulate/max eobs and write into [eob] memory pointer
  mov         r2, eobmp
  pshufd      m7, m8, 0xe
  pmaxsw      m8, m7
  pshuflw     m7, m8, 0xe
  pmaxsw      m8, m7
  pshuflw     m7, m8, 0x1
  pmaxsw      m8, m7
  pextrw      r6, m8, 0
  ; Store exactly one 16-bit word: only a single word was extracted, and a
  ; full-register store would overwrite the bytes following *eob.
  ; NOTE(review): assumes eob points at a 16-bit object (uint16_t * in the
  ; C prototype, which is not visible in this file) - confirm.
  mov         [r2], r6w
  RET
%endmacro

INIT_XMM ssse3
QUANTIZE_FN b, 9
QUANTIZE_FN b_32x32, 9