;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

;------------------------------------------------------------------------------
; QUANTIZE_FN name_suffix, num_gprs
;
; Emits the function quantize_<name_suffix> through x86inc's cglobal.
;   first parameter  : name suffix; the value b_32x32 selects the variant that
;                      halves zbin/round and rescales the quantized and
;                      dequantized values (see the b_32x32 conditionals below).
;   second parameter : forwarded to cglobal as the general-purpose register
;                      count.
;
; Named arguments, in cglobal order:
;   coeff, ncoeff, zbin, round, quant, shift, qcoeff, dqcoeff, dequant,
;   eob, scan, iscan
; cglobal is asked to preload zero of them; each one is fetched on demand
; through its <name>mp alias.
;
; What the code below does with them:
;   coeff    read as 32-bit values, saturate-packed to 16-bit words
;   zbin, round, quant, shift, dequant
;            one 16-byte vector load each; the full vector is applied to the
;            first 8 coefficients, then the high qword is replicated
;            (punpckhqdq) and applied to all remaining coefficients
;            (presumably lane 0 holds the DC value and the high half the AC
;            value - confirm against the caller)
;   qcoeff, dqcoeff
;            written as 32-bit values (sign-extended 16-bit results)
;   iscan    read as 16-bit words
;   eob      a single 16-bit word is stored: max over the nonzero quantized
;            coefficients of (iscan[i] + 1), or 0 if none
;   scan     not referenced by this code
;
; All vector loads/stores use mova (the aligned move), including 32-byte ymm
; stores of zeros to qcoeff/dqcoeff, so the pointers are presumably suitably
; aligned - TODO confirm against callers.
;
; Register roles:
;   m0  zbin - 1 (compare threshold)     m1  round
;   m2  quant                            m3  dequant
;   m4  all -1 during setup, then shift  m5  dedicated zero
;   m7/m12  "coefficient passes zbin" masks for the low/high 8 lanes
;   m8  running per-lane eob maximum (generic path)
;   m9/m10  packed 16-bit coefficients for the low/high 8 lanes
;   m6, m11, m13, m14  scratch
;------------------------------------------------------------------------------
%macro QUANTIZE_FN 2
cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, zbin, round, quant, \
                                shift, qcoeff, dqcoeff, dequant, \
                                eob, scan, iscan

  vzeroupper

%ifnidn %1, b_32x32

  ; Special case for ncoeff == 16, as it is frequent and we can save on
  ; not setting up a loop.
  cmp                       ncoeffmp, 16
  jne .generic

  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  ;; Special case of ncoeff == 16
  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

.single:

  movifnidn                 coeffq, coeffmp
  movifnidn                  zbinq, zbinmp
  mova                          m0, [zbinq]              ; m0 = zbin

  ; Get DC and first 15 AC coeffs - in this special case, that is all.
  ; coeff stored as 32bit numbers but we process them as 16 bit numbers
  mova                          m9, [coeffq]
  packssdw                      m9, [coeffq+16]          ; m9 = c[i]
  mova                         m10, [coeffq+32]
  packssdw                     m10, [coeffq+48]          ; m10 = c[i]

  ; From here on r0-r2 are reused as output pointers; the coefficients and
  ; zbin have already been loaded into vector registers above.
  mov                           r0, eobmp                ; Output pointer
  mov                           r1, qcoeffmp             ; Output pointer
  mov                           r2, dqcoeffmp            ; Output pointer

  pxor                          m5, m5                   ; m5 = dedicated zero

  pcmpeqw                       m4, m4                   ; All word lanes -1
  paddw                         m0, m4                   ; m0 = zbin - 1

  ; abs(c) > zbin - 1 is the same test as abs(c) >= zbin
  pabsw                         m6, m9                   ; m6 = abs(m9)
  pabsw                        m11, m10                  ; m11 = abs(m10)
  pcmpgtw                       m7, m6, m0               ; m7 = c[i] >= zbin
  punpckhqdq                    m0, m0                   ; replicate high qword
  pcmpgtw                      m12, m11, m0              ; m12 = c[i] >= zbin

  ; Check if all coeffs are less than zbin. If yes, we just write zeros
  ; to the outputs and we are done.
  por                          m14, m7, m12
  ptest                        m14, m14
  jnz .single_nonzero

  ; 16 coefficients * 4 bytes = 64 bytes of zeros per output array
  mova                        [r1], ymm5
  mova                     [r1+32], ymm5
  mova                        [r2], ymm5
  mova                     [r2+32], ymm5
  mov                         [r0], word 0               ; eob = 0

  vzeroupper
  RET

.single_nonzero:

  ; Actual quantization of size 16 block - setup pointers, rounders, etc.
  movifnidn                     r3, roundmp
  movifnidn                     r4, quantmp
  mov                           r6, dequantmp
  mov                           r5, shiftmp
  mova                          m1, [r3]                 ; m1 = round
  mova                          m2, [r4]                 ; m2 = quant
  mova                          m3, [r6]                 ; m3 = dequant
  mova                          m4, [r5]                 ; m4 = shift

  mov                           r3, iscanmp

  ; Rename r0-r3 to match the pointers loaded into them above.
  DEFINE_ARGS eob, qcoeff, dqcoeff, iscan

  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

  ; After each first use of m1/m2/m4/m3 the high qword is replicated so the
  ; second group of 8 lanes sees the high-half value in every lane.
  paddsw                        m6, m1                   ; m6 += round
  punpckhqdq                    m1, m1
  paddsw                       m11, m1                   ; m11 += round
  pmulhw                        m8, m6, m2               ; m8 = m6*q>>16
  punpckhqdq                    m2, m2
  pmulhw                       m13, m11, m2              ; m13 = m11*q>>16
  paddw                         m8, m6                   ; m8 += m6
  paddw                        m13, m11                  ; m13 += m11
  pmulhw                        m8, m4                   ; m8 = m8*qsh>>16
  punpckhqdq                    m4, m4
  pmulhw                       m13, m4                   ; m13 = m13*qsh>>16
  psignw                        m8, m9                   ; m8 = reinsert sign
  psignw                       m13, m10                  ; m13 = reinsert sign
  pand                          m8, m7                   ; zero lanes below zbin
  pand                         m13, m12                  ; zero lanes below zbin

  ; Store 16bit numbers as 32bit numbers in array pointed to by qcoeff
  ; (high 4 words: interleave with a sign mask; low 4 words: pmovsxwd)
  pcmpgtw                       m6, m5, m8
  punpckhwd                     m6, m8, m6
  pmovsxwd                     m11, m8
  mova                   [qcoeffq], m11
  mova                [qcoeffq+16], m6
  pcmpgtw                       m6, m5, m13
  punpckhwd                     m6, m13, m6
  pmovsxwd                     m11, m13
  mova                [qcoeffq+32], m11
  mova                [qcoeffq+48], m6

  pmullw                        m8, m3                   ; dqc[i] = qc[i] * q
  punpckhqdq                    m3, m3
  pmullw                       m13, m3                   ; dqc[i] = qc[i] * q

  ; Store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
  pcmpgtw                       m6, m5, m8
  punpckhwd                     m6, m8, m6
  pmovsxwd                     m11, m8
  mova                  [dqcoeffq], m11
  mova               [dqcoeffq+16], m6
  pcmpgtw                       m6, m5, m13
  punpckhwd                     m6, m13, m6
  pmovsxwd                     m11, m13
  mova               [dqcoeffq+32], m11
  mova               [dqcoeffq+48], m6

  mova                          m6, [iscanq]             ; m6 = scan[i]
  mova                         m11, [iscanq+16]          ; m11 = scan[i]

  ; Subtracting the all-ones mask adds 1 in every lane that passed zbin.
  pcmpeqw                       m8, m8, m5               ; m8 = c[i] == 0
  pcmpeqw                      m13, m13, m5              ; m13 = c[i] == 0
  psubw                         m6, m6, m7               ; m6 = scan[i] + 1
  psubw                        m11, m11, m12             ; m11 = scan[i] + 1
  pandn                         m8, m8, m6               ; m8 = max(eob)
  pandn                        m13, m13, m11             ; m13 = max(eob)
  pmaxsw                        m8, m8, m13

  ; Horizontally accumulate/max eobs and write into [eob] memory pointer
  pshufd                        m7, m8, 0xe
  pmaxsw                        m8, m7
  pshuflw                       m7, m8, 0xe
  pmaxsw                        m8, m7
  pshuflw                       m7, m8, 0x1
  pmaxsw                        m8, m7
  movq                         rax, m8
  mov                       [eobq], ax

  vzeroupper
  RET

  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  ;; Generic case of ncoeff != 16
  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

.generic:

%endif ; %ifnidn %1, b_32x32

DEFINE_ARGS coeff, ncoeff, zbin, round, quant, shift, \
            qcoeff, dqcoeff, dequant, eob, scan, iscan

  ; Actual quantization loop - setup pointers, rounders, etc.
  movifnidn                   coeffq, coeffmp
  movifnidn                  ncoeffq, ncoeffmp
  movifnidn                    zbinq, zbinmp
  movifnidn                   roundq, roundmp
  movifnidn                   quantq, quantmp
  movifnidn                 dequantq, dequantmp
  mova                            m0, [zbinq]              ; m0 = zbin
  mova                            m1, [roundq]             ; m1 = round
  mova                            m2, [quantq]             ; m2 = quant
  mova                            m3, [dequantq]           ; m3 = dequant
  pcmpeqw                         m4, m4                   ; All lanes -1
%ifidn %1, b_32x32
  ; Subtracting -1 adds 1; the logical shift then halves with rounding up.
  psubw                           m0, m4
  psubw                           m1, m4
  psrlw                           m0, 1                    ; m0 = (m0 + 1) / 2
  psrlw                           m1, 1                    ; m1 = (m1 + 1) / 2
%endif
  paddw                           m0, m4                   ; m0 = m0 - 1 (m4 is all -1)

  ; r2-r5 are reused below; the vectors they pointed to are already loaded,
  ; except shift, which is fetched through r2 right here.
  mov                             r2, shiftmp
  mov                             r3, qcoeffmp
  mova                            m4, [r2]                 ; m4 = shift
  mov                             r4, dqcoeffmp
  mov                             r5, iscanmp
  pxor                            m5, m5                   ; m5 = dedicated zero

  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, eob

  ; Point every array at its end and run ncoeffq from -ncoeff up to zero, so
  ; that the loop-closing add also produces the termination flag.
  lea                         coeffq, [  coeffq+ncoeffq*4]
  lea                        qcoeffq, [ qcoeffq+ncoeffq*4]
  lea                       dqcoeffq, [dqcoeffq+ncoeffq*4]

  lea                         iscanq, [  iscanq+ncoeffq*2]
  neg                        ncoeffq

  ; get DC and first 15 AC coeffs
  ; coeff stored as 32bit numbers & require 16bit numbers
  mova                            m9, [coeffq+ncoeffq*4+ 0]
  packssdw                        m9, [coeffq+ncoeffq*4+16]
  mova                           m10, [coeffq+ncoeffq*4+32]
  packssdw                       m10, [coeffq+ncoeffq*4+48]

  pabsw                           m6, m9                   ; m6 = abs(m9)
  pabsw                          m11, m10                  ; m11 = abs(m10)
  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
  punpckhqdq                      m0, m0                   ; replicate high qword
  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin

  ; Check if all coeffs are less than zbin. If yes, skip forward quickly.
  por                            m14, m7, m12
  ptest                          m14, m14
  jnz .first_nonzero

  mova        [qcoeffq+ncoeffq*4   ], ymm5
  mova        [qcoeffq+ncoeffq*4+32], ymm5
  mova       [dqcoeffq+ncoeffq*4   ], ymm5
  mova       [dqcoeffq+ncoeffq*4+32], ymm5
  add                        ncoeffq, mmsize               ; next 16 coefficients

  ; The loop below expects the high-qword value replicated in m1-m4 and a
  ; zeroed eob accumulator, exactly as .first_nonzero would have left them.
  punpckhqdq                      m1, m1
  punpckhqdq                      m2, m2
  punpckhqdq                      m3, m3
  punpckhqdq                      m4, m4
  pxor                            m8, m8                   ; eob accumulator = 0

  jmp .ac_only_loop

.first_nonzero:

  paddsw                          m6, m1                   ; m6 += round
  punpckhqdq                      m1, m1
  paddsw                         m11, m1                   ; m11 += round
  pmulhw                          m8, m6, m2               ; m8 = m6*q>>16
  punpckhqdq                      m2, m2
  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
  paddw                           m8, m6                   ; m8 += m6
  paddw                          m13, m11                  ; m13 += m11
  %ifidn %1, b_32x32
  pmullw                          m5, m8, m4               ; store the lower 16 bits of m8*qsh
  %endif
  pmulhw                          m8, m4                   ; m8 = m8*qsh>>16
  %ifidn %1, b_32x32
  ; Shift the high product left by one and pull in the top bit of the low
  ; product, i.e. take the product >> 15 instead of >> 16.
  psllw                           m8, 1
  psrlw                           m5, 15
  por                             m8, m5
  %endif
  punpckhqdq                      m4, m4
  %ifidn %1, b_32x32
  pmullw                          m5, m13, m4              ; store the lower 16 bits of m13*qsh
  %endif
  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
  %ifidn %1, b_32x32
  psllw                          m13, 1
  psrlw                           m5, 15
  por                            m13, m5
  pxor                            m5, m5                   ; reset m5 to zero register
  %endif
  psignw                          m8, m9                   ; m8 = reinsert sign
  psignw                         m13, m10                  ; m13 = reinsert sign
  pand                            m8, m7                   ; zero lanes below zbin
  pand                           m13, m12                  ; zero lanes below zbin

  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
  pcmpgtw                         m6, m5, m8
  punpckhwd                       m6, m8, m6
  pmovsxwd                       m11, m8
  mova        [qcoeffq+ncoeffq*4+ 0], m11
  mova        [qcoeffq+ncoeffq*4+16], m6
  pcmpgtw                         m6, m5, m13
  punpckhwd                       m6, m13, m6
  pmovsxwd                       m11, m13
  mova        [qcoeffq+ncoeffq*4+32], m11
  mova        [qcoeffq+ncoeffq*4+48], m6

%ifidn %1, b_32x32
  ; 32x32: dequantize the magnitude, halve it, then restore the sign below.
  pabsw                           m8, m8
  pabsw                          m13, m13
%endif
  pmullw                          m8, m3                   ; dqc[i] = qc[i] * q
  punpckhqdq                      m3, m3
  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
%ifidn %1, b_32x32
  psrlw                           m8, 1
  psrlw                          m13, 1
  psignw                          m8, m9
  psignw                         m13, m10
%endif

  ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
  pcmpgtw                         m6, m5, m8
  punpckhwd                       m6, m8, m6
  pmovsxwd                       m11, m8
  mova       [dqcoeffq+ncoeffq*4+ 0], m11
  mova       [dqcoeffq+ncoeffq*4+16], m6
  pcmpgtw                         m6, m5, m13
  punpckhwd                       m6, m13, m6
  pmovsxwd                       m11, m13
  mova       [dqcoeffq+ncoeffq*4+32], m11
  mova       [dqcoeffq+ncoeffq*4+48], m6

  pcmpeqw                         m8, m5                   ; m8 = c[i] == 0
  pcmpeqw                        m13, m5                   ; m13 = c[i] == 0
  mova                            m6, [iscanq+ncoeffq*2]   ; m6 = scan[i]
  mova                           m11, [iscanq+ncoeffq*2+16] ; m11 = scan[i]
  psubw                           m6, m7                   ; m6 = scan[i] + 1
  psubw                          m11, m12                  ; m11 = scan[i] + 1
  pandn                           m8, m6                   ; m8 = max(eob)
  pandn                          m13, m11                  ; m13 = max(eob)
  pmaxsw                          m8, m13
  add                        ncoeffq, mmsize               ; next 16 coefficients

.ac_only_loop:

  ; Each iteration handles 16 coefficients with the replicated high-qword
  ; parameters in m0-m4; m8 carries the per-lane eob maximum across
  ; iterations.

  ; pack coeff from 32bit to 16bit array
  mova                            m9, [coeffq+ncoeffq*4+ 0]
  packssdw                        m9, [coeffq+ncoeffq*4+16]
  mova                           m10, [coeffq+ncoeffq*4+32]
  packssdw                       m10, [coeffq+ncoeffq*4+48]

  pabsw                           m6, m9                   ; m6 = abs(m9)
  pabsw                          m11, m10                  ; m11 = abs(m10)
  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin

  ; Check if all coeffs are less than zbin. If yes, skip this iteration
  ; and just write zeros, which is what the result would be.
  por                            m14, m7, m12
  ptest                          m14, m14
  jnz .rest_nonzero

  mova        [qcoeffq+ncoeffq*4+ 0], ymm5
  mova        [qcoeffq+ncoeffq*4+32], ymm5
  mova       [dqcoeffq+ncoeffq*4+ 0], ymm5
  mova       [dqcoeffq+ncoeffq*4+32], ymm5

  add                        ncoeffq, mmsize
  jnz .ac_only_loop

  ; Horizontally accumulate/max eobs and write into [eob] memory pointer
  mov                             r2, eobmp
  pshufd                          m7, m8, 0xe
  pmaxsw                          m8, m7
  pshuflw                         m7, m8, 0xe
  pmaxsw                          m8, m7
  pshuflw                         m7, m8, 0x1
  pmaxsw                          m8, m7
  movq                           rax, m8
  mov                           [r2], ax
  vzeroupper
  RET

.rest_nonzero:
  ; Same arithmetic as .first_nonzero, but m14 takes the role of m8 because
  ; m8 holds the running eob maximum here.
  paddsw                          m6, m1                   ; m6 += round
  paddsw                         m11, m1                   ; m11 += round
  pmulhw                         m14, m6, m2               ; m14 = m6*q>>16
  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
  paddw                          m14, m6                   ; m14 += m6
  paddw                          m13, m11                  ; m13 += m11
  %ifidn %1, b_32x32
  pmullw                          m5, m14, m4              ; store the lower 16 bits of m14*qsh
  %endif
  pmulhw                         m14, m4                   ; m14 = m14*qsh>>16
  %ifidn %1, b_32x32
  psllw                          m14, 1
  psrlw                           m5, 15
  por                            m14, m5
  pmullw                          m5, m13, m4              ; store the lower 16 bits of m13*qsh
  %endif
  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
  %ifidn %1, b_32x32
  psllw                          m13, 1
  psrlw                           m5, 15
  por                            m13, m5
  pxor                            m5, m5                   ; reset m5 to zero register
  %endif
  psignw                         m14, m9                   ; m14 = reinsert sign
  psignw                         m13, m10                  ; m13 = reinsert sign
  pand                           m14, m7                   ; zero lanes below zbin
  pand                           m13, m12                  ; zero lanes below zbin

  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
  pcmpgtw                         m6, m5, m14
  punpckhwd                       m6, m14, m6
  pmovsxwd                       m11, m14
  mova        [qcoeffq+ncoeffq*4+ 0], m11
  mova        [qcoeffq+ncoeffq*4+16], m6
  pcmpgtw                         m6, m5, m13
  punpckhwd                       m6, m13, m6
  pmovsxwd                       m11, m13
  mova        [qcoeffq+ncoeffq*4+32], m11
  mova        [qcoeffq+ncoeffq*4+48], m6

%ifidn %1, b_32x32
  pabsw                          m14, m14
  pabsw                          m13, m13
%endif
  pmullw                         m14, m3                   ; dqc[i] = qc[i] * q
  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
%ifidn %1, b_32x32
  psrlw                          m14, 1
  psrlw                          m13, 1
  psignw                         m14, m9
  psignw                         m13, m10
%endif

  ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
  pcmpgtw                         m6, m5, m14
  punpckhwd                       m6, m14, m6
  pmovsxwd                       m11, m14
  mova       [dqcoeffq+ncoeffq*4+ 0], m11
  mova       [dqcoeffq+ncoeffq*4+16], m6
  pcmpgtw                         m6, m5, m13
  punpckhwd                       m6, m13, m6
  pmovsxwd                       m11, m13
  mova       [dqcoeffq+ncoeffq*4+32], m11
  mova       [dqcoeffq+ncoeffq*4+48], m6

  pcmpeqw                        m14, m5                   ; m14 = c[i] == 0
  pcmpeqw                        m13, m5                   ; m13 = c[i] == 0
  mova                            m6, [iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
  mova                           m11, [iscanq+ncoeffq*2+16] ; m11 = scan[i]
  psubw                           m6, m7                   ; m6 = scan[i] + 1
  psubw                          m11, m12                  ; m11 = scan[i] + 1
  pandn                          m14, m6                   ; m14 = max(eob)
  pandn                          m13, m11                  ; m13 = max(eob)
  pmaxsw                          m8, m14
  pmaxsw                          m8, m13
  add                        ncoeffq, mmsize
  jnz .ac_only_loop

  ; Horizontally accumulate/max eobs and write into [eob] memory pointer
  mov                             r2, eobmp
  pshufd                          m7, m8, 0xe
  pmaxsw                          m8, m7
  pshuflw                         m7, m8, 0xe
  pmaxsw                          m8, m7
  pshuflw                         m7, m8, 0x1
  pmaxsw                          m8, m7
  movq                           rax, m8
  mov                           [r2], ax
  vzeroupper
  RET
%endmacro

INIT_XMM avx
QUANTIZE_FN b, 9
QUANTIZE_FN b_32x32, 9