1;
2; Copyright (c) 2016, Alliance for Open Media. All rights reserved
3;
4; This source code is subject to the terms of the BSD 2 Clause License and
5; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6; was not distributed with this source code in the LICENSE file, you can
7; obtain it at www.aomedia.org/license/software. If the Alliance for Open
8; Media Patent License 1.0 was not distributed with this source code in the
9; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10;
11
12;
13
14%include "third_party/x86inc/x86inc.asm"
15
16SECTION .text
17
; QUANTIZE_FN suffix, num_regs
;
; Emits the function quantize_<suffix> through x86inc's cglobal.
;   %1  function-name suffix. The value b_32x32 selects the code guarded by
;       "%ifidn %1, b_32x32" and omits the ncoeff == 16 fast path.
;   %2  forwarded as cglobal's third parameter (presumably the number of
;       general-purpose registers used - see x86inc).
;
; Arguments, in cglobal order:
;   coeff    input; read as 32-bit values and packed to 16-bit words with
;            signed saturation (packssdw)
;   ncoeff   number of coefficients
;   zbin, round, quant, shift, dequant
;            each is loaded as one 16-byte vector of 16-bit words. The vector
;            is used as loaded for coefficients 0..7; for every later
;            coefficient its upper 64 bits are duplicated into both halves
;            (punpckhqdq). Presumably lane 0 is the DC entry and the other
;            lanes are AC entries - confirm against the caller's tables.
;   qcoeff   output; 16-bit results stored sign-extended to 32 bits
;   dqcoeff  output; 16-bit results stored sign-extended to 32 bits
;   eob      output; a single 16-bit word
;   scan     not referenced by this code
;   iscan    input; 16-bit words, one per coefficient
;
; Per 16-bit lane the code computes (a = abs(c)):
;   if a > zbin - 1:
;     t       = signed-saturating a + round
;     t       = ((t * quant) >> 16) + t
;     q       = (t * shift) >> 16
;               b_32x32: low 16 bits of (t * shift) >> 15
;     qcoeff  = q with the sign of c
;     dqcoeff = low 16 bits of qcoeff * dequant
;               b_32x32: (low 16 bits of q * dequant) >> 1, with the sign of c
;   else qcoeff = dqcoeff = 0
;   b_32x32 first replaces zbin and round by (x + 1) >> 1.
; eob = max over lanes with dqcoeff != 0 of iscan[i] + 1 (0 if there is none).
;
; NOTE(review): all vector accesses use mova, which presumably maps to an
; aligned move, and the zero fills store a full ymm register - confirm that
; callers provide suitably aligned buffers.
18%macro QUANTIZE_FN 2
19cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, zbin, round, quant, \
20                                shift, qcoeff, dqcoeff, dequant, \
21                                eob, scan, iscan
22
23  vzeroupper
24
25%ifnidn %1, b_32x32
26
27  ; Special case for ncoeff == 16, as it is frequent and we can save on
28  ; not setting up a loop.
29  cmp                       ncoeffmp, 16
30  jne .generic
31
32  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
33  ;; Special case of ncoeff == 16
34  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
35
36.single:
37
38  movifnidn                   coeffq, coeffmp
39  movifnidn                    zbinq, zbinmp
40  mova                            m0, [zbinq]              ; m0 = zbin
41
42  ; Get DC and first 15 AC coeffs - in this special case, that is all.
43  ; coeff stored as 32bit numbers but we process them as 16 bit numbers
44  mova                            m9, [coeffq]
45  packssdw                        m9, [coeffq+16]          ; m9 = c[0..7]
46  mova                           m10, [coeffq+32]
47  packssdw                       m10, [coeffq+48]          ; m10 = c[8..15]
48
  ; coeff and zbin have been loaded and ncoeff has been compared, so r0-r2
  ; are reused for the three output pointers.
49  mov                             r0, eobmp                ; Output pointer
50  mov                             r1, qcoeffmp             ; Output pointer
51  mov                             r2, dqcoeffmp            ; Output pointer
52
53  pxor                            m5, m5                   ; m5 = dedicated zero
54
55  pcmpeqw                         m4, m4                   ; All word lanes -1
56  paddw                           m0, m4                   ; m0 = zbin - 1
57
58  pabsw                           m6, m9                   ; m6 = abs(m9)
59  pabsw                          m11, m10                  ; m11 = abs(m10)
60  pcmpgtw                         m7, m6, m0               ; m7 = abs(c[i]) > zbin - 1
61  punpckhqdq                      m0, m0                   ; both halves = upper 4 words
62  pcmpgtw                        m12, m11, m0              ; m12 = abs(c[i]) > zbin - 1
63
64  ; Check if all coeffs are less than zbin. If yes, we just write zeros
65  ; to the outputs and we are done.
66  por                            m14, m7, m12
67  ptest                          m14, m14
68  jnz .single_nonzero
69
  ; Two 32-byte stores per array = 16 zeroed 32-bit outputs each.
  ; NOTE(review): relies on the upper half of ymm5 being zero; presumably the
  ; pxor above is VEX-encoded under INIT_XMM avx and clears it - confirm.
70  mova                       [r1   ], ymm5
71  mova                       [r1+32], ymm5
72  mova                       [r2   ], ymm5
73  mova                       [r2+32], ymm5
74  mov                           [r0], word 0               ; *eob = 0 (16-bit store)
75
76  vzeroupper
77  RET
78
79.single_nonzero:
80
81  ; Actual quantization of size 16 block - setup pointers, rounders, etc.
82  movifnidn                       r3, roundmp
83  movifnidn                       r4, quantmp
84  mov                             r6, dequantmp
85  mov                             r5, shiftmp
86  mova                            m1, [r3]              ; m1 = round
87  mova                            m2, [r4]              ; m2 = quant
88  mova                            m3, [r6]              ; m3 = dequant
89  mova                            m4, [r5]              ; m4 = shift
90
91  mov                             r3, iscanmp           ; round is loaded; r3 reused
92
  ; Rename r0..r3 to what they hold now (set up by the moves above).
93  DEFINE_ARGS eob, qcoeff, dqcoeff, iscan
94
95  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
96
  ; m6/m8 carry coefficients 0..7, m11/m13 carry coefficients 8..15. Each
  ; constant is used as loaded for the first half, then its upper 64 bits are
  ; duplicated (punpckhqdq) for the second half.
97  paddsw                          m6, m1                   ; m6 += round
98  punpckhqdq                      m1, m1
99  paddsw                         m11, m1                   ; m11 += round
100  pmulhw                          m8, m6, m2               ; m8 = m6*q>>16
101  punpckhqdq                      m2, m2
102  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
103  paddw                           m8, m6                   ; m8 += m6
104  paddw                          m13, m11                  ; m13 += m11
105  pmulhw                          m8, m4                   ; m8 = m8*qsh>>16
106  punpckhqdq                      m4, m4
107  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
108  psignw                          m8, m9                   ; m8 = reinsert sign
109  psignw                         m13, m10                  ; m13 = reinsert sign
110  pand                            m8, m7                   ; zero lanes below zbin
111  pand                           m13, m12                  ; zero lanes below zbin
112
113  ; Store 16bit numbers as 32bit numbers in array pointed to by qcoeff
  ; Low 4 words are widened with pmovsxwd; high 4 words are interleaved with
  ; their sign mask (0 > x) to form the sign-extended dwords.
114  pcmpgtw                         m6, m5, m8
115  punpckhwd                       m6, m8, m6
116  pmovsxwd                       m11, m8
117  mova                  [qcoeffq   ], m11
118  mova                  [qcoeffq+16], m6
119  pcmpgtw                         m6, m5, m13
120  punpckhwd                       m6, m13, m6
121  pmovsxwd                       m11, m13
122  mova                  [qcoeffq+32], m11
123  mova                  [qcoeffq+48], m6
124
125  pmullw                          m8, m3                   ; dqc[i] = qc[i] * q
126  punpckhqdq                      m3, m3
127  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
128
129  ; Store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
130  pcmpgtw                         m6, m5, m8
131  punpckhwd                       m6, m8, m6
132  pmovsxwd                       m11, m8
133  mova                 [dqcoeffq   ], m11
134  mova                 [dqcoeffq+16], m6
135  pcmpgtw                         m6, m5, m13
136  punpckhwd                       m6, m13, m6
137  pmovsxwd                       m11, m13
138  mova                 [dqcoeffq+32], m11
139  mova                 [dqcoeffq+48], m6
140
141  mova                            m6, [iscanq]            ; m6 = iscan[0..7]
142  mova                           m11, [iscanq+16]         ; m11 = iscan[8..15]
143
  ; m8/m13 hold dqc[i] at this point. m7/m12 are -1 in lanes that passed the
  ; zbin test, so subtracting them adds 1 in exactly those lanes.
144  pcmpeqw                         m8,  m8,  m5            ; m8 = (dqc[i] == 0)
145  pcmpeqw                        m13, m13,  m5            ; m13 = (dqc[i] == 0)
146  psubw                           m6,  m6,  m7            ; m6 = iscan[i] + 1 (passing lanes)
147  psubw                          m11, m11, m12            ; m11 = iscan[i] + 1 (passing lanes)
148  pandn                           m8,  m8,  m6            ; m8 = dqc[i] != 0 ? m6 : 0
149  pandn                          m13, m13, m11            ; m13 = dqc[i] != 0 ? m11 : 0
150  pmaxsw                          m8,  m8, m13            ; per-lane max of both halves
151
152  ; Horizontally accumulate/max eobs and write into [eob] memory pointer
153  pshufd                          m7, m8, 0xe              ; upper qword -> lower qword
154  pmaxsw                          m8, m7
155  pshuflw                         m7, m8, 0xe              ; words 2,3 -> words 0,1
156  pmaxsw                          m8, m7
157  pshuflw                         m7, m8, 0x1              ; word 1 -> word 0
158  pmaxsw                          m8, m7                   ; word 0 = overall max
159  movq                           rax, m8
160  mov                         [eobq], ax                   ; 16-bit store of the max
161
162  vzeroupper
163  RET
164
165  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
166  ;; Generic case of ncoeff != 16
167  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
168
169.generic:
170
171%endif ; %ifnidn %1, b_32x32
172
; Re-establish the full argument names; the ncoeff == 16 section above
; renamed r0..r3 with its own DEFINE_ARGS.
173DEFINE_ARGS coeff, ncoeff, zbin, round, quant, shift, \
174            qcoeff, dqcoeff, dequant, eob, scan, iscan
175
176  ; Actual quantization loop - setup pointers, rounders, etc.
177  movifnidn                   coeffq, coeffmp
178  movifnidn                  ncoeffq, ncoeffmp
179  movifnidn                    zbinq, zbinmp
180  movifnidn                   roundq, roundmp
181  movifnidn                   quantq, quantmp
182  movifnidn                 dequantq, dequantmp
183  mova                            m0, [zbinq]              ; m0 = zbin
184  mova                            m1, [roundq]             ; m1 = round
185  mova                            m2, [quantq]             ; m2 = quant
186  mova                            m3, [dequantq]           ; m3 = dequant
187  pcmpeqw                         m4, m4                   ; All lanes -1
188%ifidn %1, b_32x32
189  psubw                           m0, m4                   ; m0 = zbin + 1
190  psubw                           m1, m4                   ; m1 = round + 1
191  psrlw                           m0, 1                    ; m0 = (zbin + 1) >> 1
192  psrlw                           m1, 1                    ; m1 = (round + 1) >> 1
193%endif
194  paddw                           m0, m4                   ; m0 -= 1, so x > m0 tests x >= zbin
195
  ; zbin, round and quant are already in m0-m2, so r2-r4 are reused; m4 is
  ; overwritten with shift now that the -1 constant is no longer needed.
196  mov                             r2, shiftmp
197  mov                             r3, qcoeffmp
198  mova                            m4, [r2]            ; m4 = shift
199  mov                             r4, dqcoeffmp
200  mov                             r5, iscanmp
201  pxor                            m5, m5              ; m5 = dedicated zero
202
  ; Rename registers to what they hold now (r3 = qcoeff, r4 = dqcoeff,
  ; r5 = iscan). d1..d4 are placeholders; they keep eob at the same position
  ; as in the original list, presumably so eobmp still names that argument.
203  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, eob
204
205
  ; Point each array at its end and negate the count, so [ptr + ncoeffq*k]
  ; walks forward while ncoeffq counts up towards 0.
206  lea                         coeffq, [  coeffq+ncoeffq*4]
207  lea                        qcoeffq, [ qcoeffq+ncoeffq*4]
208  lea                       dqcoeffq, [dqcoeffq+ncoeffq*4]
209
210  lea                         iscanq, [  iscanq+ncoeffq*2]
211  neg                        ncoeffq
212
213  ; get DC and first 15 AC coeffs
214  ; coeff stored as 32bit numbers & require 16bit numbers
215  mova                            m9, [coeffq+ncoeffq*4+ 0]
216  packssdw                        m9, [coeffq+ncoeffq*4+16]
217  mova                           m10, [coeffq+ncoeffq*4+32]
218  packssdw                       m10, [coeffq+ncoeffq*4+48]
219
220  pabsw                           m6, m9                   ; m6 = abs(m9)
221  pabsw                          m11, m10                  ; m11 = abs(m10)
222  pcmpgtw                         m7, m6, m0               ; m7 = abs(c[i]) >= zbin
223  punpckhqdq                      m0, m0                   ; upper-half zbin from here on
224  pcmpgtw                        m12, m11, m0              ; m12 = abs(c[i]) >= zbin
225
226  ; Check if all coeffs are less than zbin. If yes, skip forward quickly.
227  por                            m14, m7, m12
228  ptest                          m14, m14
229  jnz .first_nonzero
230
  ; Zero the first 16 outputs of each array (2 x 32-byte stores each).
231  mova        [qcoeffq+ncoeffq*4   ], ymm5
232  mova        [qcoeffq+ncoeffq*4+32], ymm5
233  mova       [dqcoeffq+ncoeffq*4   ], ymm5
234  mova       [dqcoeffq+ncoeffq*4+32], ymm5
  ; Advance by the 16 coefficients just handled (mmsize is presumably 16
  ; under INIT_XMM).
235  add                        ncoeffq, mmsize
236
  ; Switch the remaining constants to their upper halves for the loop, and
  ; start the running eob maximum at 0.
237  punpckhqdq                      m1, m1
238  punpckhqdq                      m2, m2
239  punpckhqdq                      m3, m3
240  punpckhqdq                      m4, m4
241  pxor                            m8, m8
242
  ; NOTE(review): the loop is entered without testing ncoeffq, so this path
  ; appears to assume more than 16 coefficients - confirm with callers.
243  jmp .ac_only_loop
244
245.first_nonzero:
246
  ; Same arithmetic as the loop body below, except that each constant is
  ; used as loaded for coefficients 0..7 and then narrowed to its upper half.
247  paddsw                          m6, m1                   ; m6 += round
248  punpckhqdq                      m1, m1
249  paddsw                         m11, m1                   ; m11 += round
250  pmulhw                          m8, m6, m2               ; m8 = m6*q>>16
251  punpckhqdq                      m2, m2
252  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
253  paddw                           m8, m6                   ; m8 += m6
254  paddw                          m13, m11                  ; m13 += m11
255  %ifidn %1, b_32x32
256  pmullw                          m5, m8, m4               ; store the lower 16 bits of m8*qsh
257  %endif
258  pmulhw                          m8, m4                   ; m8 = m8*qsh>>16
259  %ifidn %1, b_32x32
  ; Combine high and low products: m8 = low 16 bits of (m8*qsh) >> 15.
260  psllw                           m8, 1
261  psrlw                           m5, 15
262  por                             m8, m5
263  %endif
264  punpckhqdq                      m4, m4
265  %ifidn %1, b_32x32
266  pmullw                          m5, m13, m4              ; store the lower 16 bits of m13*qsh
267  %endif
268  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
269  %ifidn %1, b_32x32
  ; m13 = low 16 bits of (m13*qsh) >> 15.
270  psllw                          m13, 1
271  psrlw                           m5, 15
272  por                            m13, m5
273  pxor                            m5, m5                   ; reset m5 to zero register
274  %endif
275  psignw                          m8, m9                   ; m8 = reinsert sign
276  psignw                         m13, m10                  ; m13 = reinsert sign
277  pand                            m8, m7                   ; zero lanes below zbin
278  pand                           m13, m12                  ; zero lanes below zbin
279
280  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
281  pcmpgtw                         m6, m5, m8
282  punpckhwd                       m6, m8, m6
283  pmovsxwd                       m11, m8
284  mova        [qcoeffq+ncoeffq*4+ 0], m11
285  mova        [qcoeffq+ncoeffq*4+16], m6
286  pcmpgtw                         m6, m5, m13
287  punpckhwd                       m6, m13, m6
288  pmovsxwd                       m11, m13
289  mova        [qcoeffq+ncoeffq*4+32], m11
290  mova        [qcoeffq+ncoeffq*4+48], m6
291
  ; b_32x32 dequantizes the magnitude, halves it with a logical shift, and
  ; then reapplies the sign of the input coefficient.
292%ifidn %1, b_32x32
293  pabsw                           m8, m8
294  pabsw                          m13, m13
295%endif
296  pmullw                          m8, m3                   ; dqc[i] = qc[i] * q
297  punpckhqdq                      m3, m3
298  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
299%ifidn %1, b_32x32
300  psrlw                           m8, 1
301  psrlw                          m13, 1
302  psignw                          m8, m9
303  psignw                         m13, m10
304%endif
305
306  ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
307  pcmpgtw                         m6, m5, m8
308  punpckhwd                       m6, m8, m6
309  pmovsxwd                       m11, m8
310  mova       [dqcoeffq+ncoeffq*4+ 0], m11
311  mova       [dqcoeffq+ncoeffq*4+16], m6
312  pcmpgtw                         m6, m5, m13
313  punpckhwd                       m6, m13, m6
314  pmovsxwd                       m11, m13
315  mova       [dqcoeffq+ncoeffq*4+32], m11
316  mova       [dqcoeffq+ncoeffq*4+48], m6
317
  ; m8/m13 hold dqc[i] here; after this sequence m8 is the running per-lane
  ; eob maximum that the loop keeps updating.
318  pcmpeqw                         m8, m5                    ; m8 = (dqc[i] == 0)
319  pcmpeqw                        m13, m5                    ; m13 = (dqc[i] == 0)
320  mova                            m6, [iscanq+ncoeffq*2]    ; m6 = iscan[i], first 8
321  mova                           m11, [iscanq+ncoeffq*2+16] ; m11 = iscan[i], next 8
322  psubw                           m6, m7                    ; m6 = iscan[i] + 1 (passing lanes)
323  psubw                          m11, m12                   ; m11 = iscan[i] + 1 (passing lanes)
324  pandn                           m8, m6                    ; m8 = dqc[i] != 0 ? m6 : 0
325  pandn                          m13, m11                   ; m13 = dqc[i] != 0 ? m11 : 0
326  pmaxsw                          m8, m13
  ; Advance by 16 coefficients and fall through into the loop.
  ; NOTE(review): ncoeffq is not tested here, so more than 16 coefficients
  ; appear to be assumed - confirm with callers.
327  add                        ncoeffq, mmsize
328
  ; Loop over the remaining coefficients, 16 per iteration. m0-m4 now hold
  ; the upper-half constants in both halves, m5 = 0, m8 = running eob max.
329.ac_only_loop:
330
331  ; pack coeff from 32bit to 16bit array
332  mova                            m9, [coeffq+ncoeffq*4+ 0]
333  packssdw                        m9, [coeffq+ncoeffq*4+16]
334  mova                           m10, [coeffq+ncoeffq*4+32]
335  packssdw                       m10, [coeffq+ncoeffq*4+48]
336
337  pabsw                           m6, m9                   ; m6 = abs(m9)
338  pabsw                          m11, m10                  ; m11 = abs(m10)
339  pcmpgtw                         m7, m6, m0               ; m7 = abs(c[i]) >= zbin
340  pcmpgtw                        m12, m11, m0              ; m12 = abs(c[i]) >= zbin
341
342  ; Check if all coeffs are less than zbin. If yes, skip this iteration
343  ; and just write zeros, as that is what the result would be.
344  por                            m14, m7, m12
345  ptest                          m14, m14
346  jnz .rest_nonzero
347
348  mova        [qcoeffq+ncoeffq*4+ 0], ymm5
349  mova        [qcoeffq+ncoeffq*4+32], ymm5
350  mova       [dqcoeffq+ncoeffq*4+ 0], ymm5
351  mova       [dqcoeffq+ncoeffq*4+32], ymm5
352
353  add                        ncoeffq, mmsize
354  jnz .ac_only_loop                                         ; until ncoeffq reaches 0
355
356  ; Horizontally accumulate/max eobs and write into [eob] memory pointer
357  mov                             r2, eobmp
358  pshufd                          m7, m8, 0xe              ; upper qword -> lower qword
359  pmaxsw                          m8, m7
360  pshuflw                         m7, m8, 0xe              ; words 2,3 -> words 0,1
361  pmaxsw                          m8, m7
362  pshuflw                         m7, m8, 0x1              ; word 1 -> word 0
363  pmaxsw                          m8, m7                   ; word 0 = overall max
364  movq                           rax, m8
365  mov                           [r2], ax                   ; 16-bit store of the max
366  vzeroupper
367  RET
368
  ; Loop body for a group with at least one lane at or above zbin. Results
  ; go through m14/m13 so that m8 keeps the running eob maximum.
369.rest_nonzero:
370  paddsw                          m6, m1                   ; m6 += round
371  paddsw                         m11, m1                   ; m11 += round
372  pmulhw                         m14, m6, m2               ; m14 = m6*q>>16
373  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
374  paddw                          m14, m6                   ; m14 += m6
375  paddw                          m13, m11                  ; m13 += m11
376  %ifidn %1, b_32x32
377  pmullw                          m5, m14, m4              ; store the lower 16 bits of m14*qsh
378  %endif
379  pmulhw                         m14, m4                   ; m14 = m14*qsh>>16
380  %ifidn %1, b_32x32
  ; m14 = low 16 bits of (m14*qsh) >> 15, then start the same for m13.
381  psllw                          m14, 1
382  psrlw                           m5, 15
383  por                            m14, m5
384  pmullw                          m5, m13, m4              ; store the lower 16 bits of m13*qsh
385  %endif
386  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
387  %ifidn %1, b_32x32
388  psllw                          m13, 1
389  psrlw                           m5, 15
390  por                            m13, m5
391  pxor                            m5, m5                   ; reset m5 to zero register
392  %endif
393  psignw                         m14, m9                   ; m14 = reinsert sign
394  psignw                         m13, m10                  ; m13 = reinsert sign
395  pand                           m14, m7                   ; zero lanes below zbin
396  pand                           m13, m12                  ; zero lanes below zbin
397
398  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
399  pcmpgtw                         m6, m5, m14
400  punpckhwd                       m6, m14, m6
401  pmovsxwd                       m11, m14
402  mova        [qcoeffq+ncoeffq*4+ 0], m11
403  mova        [qcoeffq+ncoeffq*4+16], m6
404  pcmpgtw                         m6, m5, m13
405  punpckhwd                       m6, m13, m6
406  pmovsxwd                       m11, m13
407  mova        [qcoeffq+ncoeffq*4+32], m11
408  mova        [qcoeffq+ncoeffq*4+48], m6
409
  ; b_32x32 dequantizes the magnitude, halves it with a logical shift, and
  ; then reapplies the sign of the input coefficient.
410%ifidn %1, b_32x32
411  pabsw                          m14, m14
412  pabsw                          m13, m13
413%endif
414  pmullw                         m14, m3                   ; dqc[i] = qc[i] * q
415  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
416%ifidn %1, b_32x32
417  psrlw                          m14, 1
418  psrlw                          m13, 1
419  psignw                         m14, m9
420  psignw                         m13, m10
421%endif
422
423  ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
424  pcmpgtw                         m6, m5, m14
425  punpckhwd                       m6, m14, m6
426  pmovsxwd                       m11, m14
427  mova       [dqcoeffq+ncoeffq*4+ 0], m11
428  mova       [dqcoeffq+ncoeffq*4+16], m6
429  pcmpgtw                         m6, m5, m13
430  punpckhwd                       m6, m13, m6
431  pmovsxwd                       m11, m13
432  mova       [dqcoeffq+ncoeffq*4+32], m11
433  mova       [dqcoeffq+ncoeffq*4+48], m6
434
435  pcmpeqw                        m14, m5                    ; m14 = (dqc[i] == 0)
436  pcmpeqw                        m13, m5                    ; m13 = (dqc[i] == 0)
437  mova                            m6, [iscanq+ncoeffq*2+ 0] ; m6 = iscan[i], first 8
438  mova                           m11, [iscanq+ncoeffq*2+16] ; m11 = iscan[i], next 8
439  psubw                           m6, m7                    ; m6 = iscan[i] + 1 (passing lanes)
440  psubw                          m11, m12                   ; m11 = iscan[i] + 1 (passing lanes)
441  pandn                          m14, m6                    ; m14 = dqc[i] != 0 ? m6 : 0
442  pandn                          m13, m11                   ; m13 = dqc[i] != 0 ? m11 : 0
443  pmaxsw                          m8, m14                   ; fold into running max
444  pmaxsw                          m8, m13                   ; fold into running max
445  add                        ncoeffq, mmsize
446  jnz .ac_only_loop                                         ; until ncoeffq reaches 0
447
448  ; Horizontally accumulate/max eobs and write into [eob] memory pointer
449  mov                             r2, eobmp
450  pshufd                          m7, m8, 0xe              ; upper qword -> lower qword
451  pmaxsw                          m8, m7
452  pshuflw                         m7, m8, 0xe              ; words 2,3 -> words 0,1
453  pmaxsw                          m8, m7
454  pshuflw                         m7, m8, 0x1              ; word 1 -> word 0
455  pmaxsw                          m8, m7                   ; word 0 = overall max
456  movq                           rax, m8
457  mov                           [r2], ax                   ; 16-bit store of the max
458  vzeroupper
459  RET
460%endmacro
461
462INIT_XMM avx                  ; x86inc setup: m0..m14 are xmm registers, avx instruction set
463QUANTIZE_FN b, 9              ; emits quantize_b, including the ncoeff == 16 fast path
464QUANTIZE_FN b_32x32, 9        ; emits quantize_b_32x32 (halved zbin/round, extra bit of precision)
465