1;
2; Copyright (c) 2016, Alliance for Open Media. All rights reserved
3;
4; This source code is subject to the terms of the BSD 2 Clause License and
5; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6; was not distributed with this source code in the LICENSE file, you can
7; obtain it at www.aomedia.org/license/software. If the Alliance for Open
8; Media Patent License 1.0 was not distributed with this source code in the
9; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10;
11
12;
13
14%define private_prefix av1
15
16%include "third_party/x86inc/x86inc.asm"
17
SECTION_RODATA
; pw_1: eight consecutive 16-bit words, each holding the value 1
; (16 bytes in total). No code visible in this file references it.
pw_1: dw 1, 1, 1, 1, 1, 1, 1, 1
20
21SECTION .text
22
;-----------------------------------------------------------------------------
; QUANTIZE_FP name, num_gprs
;
; Emits one function, quantize_<name>, declared through x86inc's cglobal with
; 0 auto-loaded arguments, <num_gprs> general-purpose registers and 15 vector
; registers.
;
;   %1  function-name suffix. When it is exactly `fp_32x32` the 32x32 variant
;       is assembled: round becomes (round + 1) >> 1, quant is doubled, the
;       dequantized value is halved, and the skip threshold is dequant >> 2
;       instead of dequant >> 1.
;   %2  number of general-purpose registers handed to cglobal.
;
; Named arguments (in order): coeff, ncoeff, skip, zbin, round, quant, shift,
; qcoeff, dqcoeff, dequant, eob, scan, iscan.
;   - zbin is loaded but never read; shift and scan are never referenced.
;   - coeff/qcoeff/dqcoeff/iscan are addressed as arrays of 16-bit words.
;   - round/quant/dequant are read as one 8-word vector each. The first eight
;     coefficients use the vector as loaded; every later coefficient uses the
;     upper four words (presumably the AC entries -- confirm with the caller).
;
; Requirements implied by the code:
;   - every vector access uses mova, so all of the arrays above are expected
;     to be 16-byte aligned;
;   - each iteration handles 16 coefficients and the loops end when the
;     negated counter reaches zero, so ncoeff must be a positive multiple
;     of 16.
;
; Outputs:
;   skip != 0 : qcoeff[] and dqcoeff[] are zero-filled and *eob = 0.
;   skip == 0 : per coefficient c
;                 q  = sign(c) * (((|c| + round) * quant) >> 16)
;                 dq = q * dequant                       (low 16 bits)
;               and *eob = max(iscan[i] + 1) over lanes whose stored dq is
;               non-zero (0 when there are none).
;   *eob is written as a 16-bit word on both paths.
;
; Register roles on the quantize path:
;   r2 = dequant pointer, later scratch / eob pointer
;   r3 = qcoeff, r4 = dqcoeff, r5 = iscan (all biased to the array end)
;   m0 = skip threshold, m1 = round, m2 = quant, m3 = dequant, m5 = zero,
;   m8 = running eob maximum
;-----------------------------------------------------------------------------
%macro QUANTIZE_FP 2
cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
                                shift, qcoeff, dqcoeff, dequant, \
                                eob, scan, iscan
  cmp                    dword skipm, 0
  jne .blank                                               ; skip block: zero-fill only

  ; Quantize path: fetch pointers and the round/quant/dequant vectors.
  movifnidn                   coeffq, coeffmp
  movifnidn                  ncoeffq, ncoeffmp
  mov                             r2, dequantmp
  movifnidn                    zbinq, zbinmp               ; loaded but never read below
  movifnidn                   roundq, roundmp
  movifnidn                   quantq, quantmp
  mova                            m1, [roundq]             ; m1 = round
  mova                            m2, [quantq]             ; m2 = quant
%ifidn %1, fp_32x32
  pcmpeqw                         m5, m5                   ; m5 = all ones
  psrlw                           m5, 15                   ; m5 = 1 in every word
  paddw                           m1, m5
  psrlw                           m1, 1                    ; m1 = (round + 1) >> 1
%endif
  mova                            m3, [r2q]                ; m3 = dequant
  mov                             r3, qcoeffmp
  mov                             r4, dqcoeffmp
  mov                             r5, iscanmp
%ifidn %1, fp_32x32
  psllw                           m2, 1                    ; m2 = quant * 2
%endif
  pxor                            m5, m5                   ; m5 = dedicated zero

  ; Point every array at its end and count ncoeffq up from -ncoeff to 0,
  ; so a single register both indexes and terminates the loops.
  lea                         coeffq, [  coeffq+ncoeffq*2]
  lea                            r5q, [  r5q+ncoeffq*2]
  lea                            r3q, [ r3q+ncoeffq*2]
  lea                            r4q, [r4q+ncoeffq*2]
  neg                        ncoeffq

  ; First 16 coefficients: always quantized, never threshold-skipped.
  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9  = c[i+0..7]
  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i+8..15]
  pabsw                           m6, m9                   ; m6  = abs(m9)
  pabsw                          m11, m10                  ; m11 = abs(m10)
  pcmpeqw                         m7, m7                   ; m7 = -1 per word (for iscan + 1)

  paddsw                          m6, m1                   ; m6 += round (saturating)
  punpckhqdq                      m1, m1                   ; keep only the upper round words
  paddsw                         m11, m1                   ; m11 += round (saturating)
  pmulhw                          m8, m6, m2               ; m8 = (m6 * quant) >> 16
  punpckhqdq                      m2, m2                   ; keep only the upper quant words
  pmulhw                         m13, m11, m2              ; m13 = (m11 * quant) >> 16
  psignw                          m8, m9                   ; m8  = reinsert sign of c
  psignw                         m13, m10                  ; m13 = reinsert sign of c
  mova            [r3q+ncoeffq*2+ 0], m8                   ; store qcoeff
  mova            [r3q+ncoeffq*2+16], m13
%ifidn %1, fp_32x32
  pabsw                           m8, m8                   ; work on magnitudes so the
  pabsw                          m13, m13                  ; shift below is unsigned
%endif
  pmullw                          m8, m3                   ; m8 = qcoeff * dequant
  punpckhqdq                      m3, m3                   ; keep only the upper dequant words
  pmullw                         m13, m3                   ; m13 = qcoeff * dequant
%ifidn %1, fp_32x32
  psrlw                           m8, 1                    ; halve the dequantized magnitude
  psrlw                          m13, 1
  psignw                          m8, m9                   ; reinsert sign of c
  psignw                         m13, m10
  psrlw                           m0, m3, 2                ; m0 = skip threshold = dequant >> 2
%else
  psrlw                           m0, m3, 1                ; m0 = skip threshold = dequant >> 1
%endif
  mova            [r4q+ncoeffq*2+ 0], m8                   ; store dqcoeff
  mova            [r4q+ncoeffq*2+16], m13
  pcmpeqw                         m8, m5                   ; m8  = (dqcoeff == 0)
  pcmpeqw                        m13, m5                   ; m13 = (dqcoeff == 0)
  mova                            m6, [  r5q+ncoeffq*2+ 0] ; m6  = iscan[i]
  mova                           m11, [  r5q+ncoeffq*2+16] ; m11 = iscan[i]
  psubw                           m6, m7                   ; m6  = iscan[i] + 1
  psubw                          m11, m7                   ; m11 = iscan[i] + 1
  pandn                           m8, m6                   ; keep iscan+1 where dqcoeff != 0
  pandn                          m13, m11
  pmaxsw                          m8, m13                  ; m8 = running eob maximum
  add                        ncoeffq, mmsize
  jz .accumulate_eob                                       ; only 16 coefficients in total

.ac_only_loop:
  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9  = c[i+0..7]
  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i+8..15]
  pabsw                           m6, m9                   ; m6  = abs(m9)
  pabsw                          m11, m10                  ; m11 = abs(m10)

  ; If none of the 16 magnitudes exceeds the threshold, take the
  ; zero-fill shortcut instead of quantizing.
  pcmpgtw                         m7, m6,  m0
  pcmpgtw                        m12, m11, m0
  pmovmskb                       r6d, m7
  pmovmskb                       r2d, m12

  or                              r6, r2
  jz .skip_iter

  pcmpeqw                         m7, m7                   ; m7 was clobbered above; back to -1

  paddsw                          m6, m1                   ; m6 += round (saturating)
  paddsw                         m11, m1                   ; m11 += round (saturating)
  pmulhw                         m14, m6, m2               ; m14 = (m6 * quant) >> 16
  pmulhw                         m13, m11, m2              ; m13 = (m11 * quant) >> 16
  psignw                         m14, m9                   ; m14 = reinsert sign of c
  psignw                         m13, m10                  ; m13 = reinsert sign of c
  mova            [r3q+ncoeffq*2+ 0], m14                  ; store qcoeff
  mova            [r3q+ncoeffq*2+16], m13
%ifidn %1, fp_32x32
  pabsw                          m14, m14
  pabsw                          m13, m13
%endif
  pmullw                         m14, m3                   ; m14 = qcoeff * dequant
  pmullw                         m13, m3                   ; m13 = qcoeff * dequant
%ifidn %1, fp_32x32
  psrlw                          m14, 1                    ; halve the dequantized magnitude
  psrlw                          m13, 1
  psignw                         m14, m9                   ; reinsert sign of c
  psignw                         m13, m10
%endif
  mova            [r4q+ncoeffq*2+ 0], m14                  ; store dqcoeff
  mova            [r4q+ncoeffq*2+16], m13
  pcmpeqw                        m14, m5                   ; m14 = (dqcoeff == 0)
  pcmpeqw                        m13, m5                   ; m13 = (dqcoeff == 0)
  mova                            m6, [  r5q+ncoeffq*2+ 0] ; m6  = iscan[i]
  mova                           m11, [  r5q+ncoeffq*2+16] ; m11 = iscan[i]
  psubw                           m6, m7                   ; m6  = iscan[i] + 1
  psubw                          m11, m7                   ; m11 = iscan[i] + 1
  pandn                          m14, m6                   ; keep iscan+1 where dqcoeff != 0
  pandn                          m13, m11
  pmaxsw                          m8, m14                  ; fold into running eob maximum
  pmaxsw                          m8, m13
  add                        ncoeffq, mmsize
  jl .ac_only_loop

  jmp .accumulate_eob
.skip_iter:
  ; All 16 magnitudes are at or below the threshold: emit zeros.
  mova            [r3q+ncoeffq*2+ 0], m5
  mova            [r3q+ncoeffq*2+16], m5
  mova            [r4q+ncoeffq*2+ 0], m5
  mova            [r4q+ncoeffq*2+16], m5
  add                        ncoeffq, mmsize
  jl .ac_only_loop

.accumulate_eob:
  ; Horizontal max of the eight eob candidates in m8, result in word 0.
  mov                             r2, eobmp
  pshufd                          m7, m8, 0xe              ; upper qword -> lower qword
  pmaxsw                          m8, m7
  pshuflw                         m7, m8, 0xe              ; words 2,3 -> words 0,1
  pmaxsw                          m8, m7
  pshuflw                         m7, m8, 0x1              ; word 1 -> word 0
  pmaxsw                          m8, m7
  pextrw                          r6, m8, 0
  ; Store exactly 16 bits, matching the `mov word` in .blank. A full-width
  ; store of r6 would write past the 2-byte eob slot.
  mov                           [r2], r6w
  RET

  ; Skip block: write zeros to dqcoeff/qcoeff and report eob = 0.
.blank:
  mov                             r0, dqcoeffmp
  movifnidn                  ncoeffq, ncoeffmp
  mov                             r2, qcoeffmp
  mov                             r3, eobmp

  lea                            r0q, [r0q+ncoeffq*2]
  lea                            r2q, [r2q+ncoeffq*2]
  neg                        ncoeffq
  pxor                            m7, m7
.blank_loop:
  mova            [r0q+ncoeffq*2+ 0], m7
  mova            [r0q+ncoeffq*2+16], m7
  mova            [r2q+ncoeffq*2+ 0], m7
  mova            [r2q+ncoeffq*2+16], m7
  add                        ncoeffq, mmsize
  jl .blank_loop
  mov                     word [r3q], 0
  RET
%endmacro

; Assemble both variants with XMM registers for the ssse3 target, each
; requesting 7 general-purpose registers.
INIT_XMM ssse3
QUANTIZE_FP fp, 7
QUANTIZE_FP fp_32x32, 7
205