1;
2; Copyright (c) 2016, Alliance for Open Media. All rights reserved
3;
4; This source code is subject to the terms of the BSD 2 Clause License and
5; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6; was not distributed with this source code in the LICENSE file, you can
7; obtain it at www.aomedia.org/license/software. If the Alliance for Open
8; Media Patent License 1.0 was not distributed with this source code in the
9; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10;
11
12;
13
14%include "third_party/x86inc/x86inc.asm"
15
SECTION_RODATA
; Eight 16-bit words, each equal to 1. Subtracted from the zbin vector in
; QUANTIZE_FN so that a signed greater-than compare acts as ">= zbin".
pw_1: dw 1, 1, 1, 1, 1, 1, 1, 1

SECTION .text
20
;-----------------------------------------------------------------------------
; QUANTIZE_FN suffix, num_gprs
;
; Emits the function quantize_<suffix> (cglobal adds the project prefix and
; the instruction-set suffix selected by the preceding INIT_* line).
;
;   suffix    function name suffix. The value b_32x32 additionally enables:
;               - zbin and round are halved with rounding: (x + 1) >> 1
;               - the quantizer shift keeps one extra bit: (x * shift) >> 15
;               - dequantized magnitudes are halved: (|qc| * dequant) >> 1
;               - groups of 16 coefficients entirely below zbin are zeroed
;                 without running the multiply chain
;   num_gprs  number of general-purpose registers requested from cglobal
;
; Arguments, in cglobal order:
;   coeff, ncoeff, zbin, round, quant, shift, qcoeff, dqcoeff, dequant,
;   eob, scan, iscan
;
;   coeff    input coefficients, read as 32-bit elements and saturated to
;            16 bits with packssdw
;   ncoeff   coefficient count; 16 coefficients are consumed per iteration
;            and the first group is processed unconditionally
;   zbin, round, quant, shift, dequant
;            one 16-byte vector (8 words) is loaded from each. All 8 words
;            apply to coefficients 0-7; the high 4 words are then broadcast
;            and apply to every later coefficient
;   qcoeff   output, quantized values stored sign-extended to 32 bits
;   dqcoeff  output, dequantized values stored sign-extended to 32 bits
;   eob      output, receives max(iscan[i] + 1) over lanes whose dequantized
;            value is non-zero, or 0 if there is none; written as 16 bits
;   scan     never read by this code
;   iscan    read as 16-bit elements
;
; Every vector access uses mova or a memory operand of an SSE instruction,
; so all tables and coefficient buffers must be 16-byte aligned.
;
; Register roles after setup:
;   m0 = zbin - 1, m1 = round, m2 = quant, m3 = dequant, m4 = shift,
;   m5 = zero (temporarily reused, always cleared again), m8 = running eob
;-----------------------------------------------------------------------------
%macro QUANTIZE_FN 2
cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, zbin, round, quant, \
                                shift, qcoeff, dqcoeff, dequant, \
                                eob, scan, iscan

  ; cglobal is asked to auto-load zero arguments, so fetch the ones that are
  ; not already sitting in their home register.
  movifnidn                   coeffq, coeffmp
  movifnidn                  ncoeffq, ncoeffmp
  movifnidn                    zbinq, zbinmp
  movifnidn                   roundq, roundmp
  movifnidn                   quantq, quantmp
  movifnidn                 dequantq, dequantmp
  mova                            m0, [zbinq]              ; m0 = zbin
  mova                            m1, [roundq]             ; m1 = round
  mova                            m2, [quantq]             ; m2 = quant
%ifidn %1, b_32x32
  pcmpeqw                         m5, m5                   ; m5 = all ones
  psrlw                           m5, 15                   ; m5 = 1 in every word
  paddw                           m0, m5
  paddw                           m1, m5
  psrlw                           m0, 1                    ; m0 = (zbin + 1) >> 1
  psrlw                           m1, 1                    ; m1 = (round + 1) >> 1
%endif
  mova                            m3, [dequantq]           ; m3 = dequant
  ; zbin, round, quant and dequant are consumed, so r2-r5 can be reused for
  ; the remaining pointers. They must be read through their original argument
  ; names before DEFINE_ARGS renames the registers below.
  mov                             r2, shiftmp
  psubw                           m0, [GLOBAL(pw_1)]       ; m0 = zbin - 1, so x > m0 means x >= zbin
  mova                            m4, [r2]                 ; m4 = shift
  mov                             r3, qcoeffmp
  mov                             r4, dqcoeffmp
  mov                             r5, iscanmp
  pxor                            m5, m5                   ; m5 = dedicated zero
  ; eob keeps its original argument position, so eobmp is still valid later.
  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, eob
  ; Point every array at its end and count ncoeff from -n up towards zero.
  lea                         coeffq, [  coeffq+ncoeffq*4]
  lea                        qcoeffq, [ qcoeffq+ncoeffq*4]
  lea                       dqcoeffq, [dqcoeffq+ncoeffq*4]
  lea                         iscanq, [  iscanq+ncoeffq*2]
  neg                        ncoeffq

  ; ---- first group: DC and first 15 AC coeffs ----
  ; coeff is stored as 32-bit numbers; pack to 16-bit with signed saturation
  mova                            m9, [  coeffq+ncoeffq*4+ 0]
  packssdw                        m9, [  coeffq+ncoeffq*4+16]
  mova                           m10, [  coeffq+ncoeffq*4+32]
  packssdw                       m10, [  coeffq+ncoeffq*4+48]
  pabsw                           m6, m9                   ; m6 = abs(m9)
  pabsw                          m11, m10                  ; m11 = abs(m10)
  pcmpgtw                         m7, m6, m0               ; m7 = mask: abs(c) >= zbin
  punpckhqdq                      m0, m0                   ; broadcast high half of zbin
  pcmpgtw                        m12, m11, m0              ; m12 = mask: abs(c) >= zbin
  paddsw                          m6, m1                   ; m6 += round (saturating)
  punpckhqdq                      m1, m1                   ; broadcast high half of round
  paddsw                         m11, m1                   ; m11 += round (saturating)
  pmulhw                          m8, m6, m2               ; m8 = m6*quant >> 16
  punpckhqdq                      m2, m2                   ; broadcast high half of quant
  pmulhw                         m13, m11, m2              ; m13 = m11*quant >> 16
  paddw                           m8, m6                   ; m8 += m6
  paddw                          m13, m11                  ; m13 += m11
%ifidn %1, b_32x32
  pmullw                          m5, m8, m4               ; low 16 bits of m8*shift
%endif
  pmulhw                          m8, m4                   ; m8 = m8*shift >> 16
%ifidn %1, b_32x32
  ; combine high and low products into (m8*shift) >> 15
  psllw                           m8, 1
  psrlw                           m5, 15
  por                             m8, m5
%endif
  punpckhqdq                      m4, m4                   ; broadcast high half of shift
%ifidn %1, b_32x32
  pmullw                          m5, m13, m4              ; low 16 bits of m13*shift
%endif
  pmulhw                         m13, m4                   ; m13 = m13*shift >> 16
%ifidn %1, b_32x32
  ; combine high and low products into (m13*shift) >> 15
  psllw                          m13, 1
  psrlw                           m5, 15
  por                            m13, m5
  pxor                            m5, m5                   ; reset m5 to zero register
%endif
  psignw                          m8, m9                   ; reinsert sign of the input
  psignw                         m13, m10                  ; reinsert sign of the input
  pand                            m8, m7                   ; zero lanes below zbin
  pand                           m13, m12                  ; zero lanes below zbin

  ; store 16-bit numbers as 32-bit numbers in the array pointed to by qcoeff.
  ; pcmpgtw against the zero in m5 yields the per-word sign mask, which is
  ; interleaved as the upper half of every dword (sign extension).
  mova                           m11, m8
  mova                            m6, m8
  pcmpgtw                         m5, m8
  punpcklwd                      m11, m5
  punpckhwd                       m6, m5
  mova        [qcoeffq+ncoeffq*4+ 0], m11
  mova        [qcoeffq+ncoeffq*4+16], m6
  pxor                            m5, m5
  mova                           m11, m13
  mova                            m6, m13
  pcmpgtw                         m5, m13
  punpcklwd                      m11, m5
  punpckhwd                       m6, m5
  mova        [qcoeffq+ncoeffq*4+32], m11
  mova        [qcoeffq+ncoeffq*4+48], m6
  pxor                            m5, m5                   ; reset m5 to zero register

%ifidn %1, b_32x32
  ; dequantize on magnitudes so the halving below is a plain logical shift
  pabsw                           m8, m8
  pabsw                          m13, m13
%endif
  pmullw                          m8, m3                   ; dqc[i] = qc[i] * dequant
  punpckhqdq                      m3, m3                   ; broadcast high half of dequant
  pmullw                         m13, m3                   ; dqc[i] = qc[i] * dequant
%ifidn %1, b_32x32
  psrlw                           m8, 1
  psrlw                          m13, 1
  psignw                          m8, m9                   ; reinsert sign of the input
  psignw                         m13, m10                  ; reinsert sign of the input
%endif
  ; store 16-bit numbers as 32-bit numbers in the array pointed to by dqcoeff
  mova                           m11, m8
  mova                            m6, m8
  pcmpgtw                         m5, m8
  punpcklwd                      m11, m5
  punpckhwd                       m6, m5
  mova       [dqcoeffq+ncoeffq*4+ 0], m11
  mova       [dqcoeffq+ncoeffq*4+16], m6
  pxor                            m5, m5
  mova                           m11, m13
  mova                            m6, m13
  pcmpgtw                         m5, m13
  punpcklwd                      m11, m5
  punpckhwd                       m6, m5
  mova       [dqcoeffq+ncoeffq*4+32], m11
  mova       [dqcoeffq+ncoeffq*4+48], m6
  pxor                            m5, m5                   ; reset m5 to zero register

  ; eob candidates for this group, kept in m8 as the running maximum
  pcmpeqw                         m8, m5                   ; m8 = mask: dqc[i] == 0
  pcmpeqw                        m13, m5                   ; m13 = mask: dqc[i] == 0
  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = iscan[i]
  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = iscan[i]
  psubw                           m6, m7                   ; m6 = iscan[i] + 1 where >= zbin
  psubw                          m11, m12                  ; m11 = iscan[i] + 1 where >= zbin
  pandn                           m8, m6                   ; keep candidates of non-zero lanes
  pandn                          m13, m11                  ; keep candidates of non-zero lanes
  pmaxsw                          m8, m13
  add                        ncoeffq, mmsize
  jz .accumulate_eob

  ; ---- remaining groups: m0-m4 now hold the broadcast high-half values ----
.ac_only_loop:
  ; pack coeff from 32-bit to 16-bit array
  mova                            m9, [  coeffq+ncoeffq*4+ 0]
  packssdw                        m9, [  coeffq+ncoeffq*4+16]
  mova                           m10, [  coeffq+ncoeffq*4+32]
  packssdw                       m10, [  coeffq+ncoeffq*4+48]

  pabsw                           m6, m9                   ; m6 = abs(m9)
  pabsw                          m11, m10                  ; m11 = abs(m10)
  pcmpgtw                         m7, m6, m0               ; m7 = mask: abs(c) >= zbin
  pcmpgtw                        m12, m11, m0              ; m12 = mask: abs(c) >= zbin
%ifidn %1, b_32x32
  ; nothing in this group reaches zbin: just write zeros
  pmovmskb                       r6d, m7
  pmovmskb                       r2d, m12
  or                              r6, r2
  jz .skip_iter
%endif
  paddsw                          m6, m1                   ; m6 += round (saturating)
  paddsw                         m11, m1                   ; m11 += round (saturating)
  pmulhw                         m14, m6, m2               ; m14 = m6*quant >> 16
  pmulhw                         m13, m11, m2              ; m13 = m11*quant >> 16
  paddw                          m14, m6                   ; m14 += m6
  paddw                          m13, m11                  ; m13 += m11
%ifidn %1, b_32x32
  pmullw                          m5, m14, m4              ; low 16 bits of m14*shift
%endif
  pmulhw                         m14, m4                   ; m14 = m14*shift >> 16
%ifidn %1, b_32x32
  ; combine high and low products into (m14*shift) >> 15
  psllw                          m14, 1
  psrlw                           m5, 15
  por                            m14, m5
  pmullw                          m5, m13, m4              ; low 16 bits of m13*shift
%endif
  pmulhw                         m13, m4                   ; m13 = m13*shift >> 16
%ifidn %1, b_32x32
  ; combine high and low products into (m13*shift) >> 15
  psllw                          m13, 1
  psrlw                           m5, 15
  por                            m13, m5
  pxor                            m5, m5                   ; reset m5 to zero register
%endif
  psignw                         m14, m9                   ; reinsert sign of the input
  psignw                         m13, m10                  ; reinsert sign of the input
  pand                           m14, m7                   ; zero lanes below zbin
  pand                           m13, m12                  ; zero lanes below zbin

  ; store 16-bit numbers as 32-bit numbers in the array pointed to by qcoeff
  mova                           m11, m14
  mova                            m6, m14
  pcmpgtw                         m5, m14                  ; m5 = sign mask of m14
  punpcklwd                      m11, m5
  punpckhwd                       m6, m5
  mova        [qcoeffq+ncoeffq*4+ 0], m11
  mova        [qcoeffq+ncoeffq*4+16], m6
  pxor                            m5, m5
  mova                           m11, m13
  mova                            m6, m13
  pcmpgtw                         m5, m13                  ; m5 = sign mask of m13
  punpcklwd                      m11, m5
  punpckhwd                       m6, m5
  mova        [qcoeffq+ncoeffq*4+32], m11
  mova        [qcoeffq+ncoeffq*4+48], m6
  pxor                            m5, m5                   ; reset m5 to zero register

%ifidn %1, b_32x32
  ; dequantize on magnitudes so the halving below is a plain logical shift
  pabsw                          m14, m14
  pabsw                          m13, m13
%endif
  pmullw                         m14, m3                   ; dqc[i] = qc[i] * dequant
  pmullw                         m13, m3                   ; dqc[i] = qc[i] * dequant
%ifidn %1, b_32x32
  psrlw                          m14, 1
  psrlw                          m13, 1
  psignw                         m14, m9                   ; reinsert sign of the input
  psignw                         m13, m10                  ; reinsert sign of the input
%endif

  ; store 16-bit numbers as 32-bit numbers in the array pointed to by dqcoeff
  mova                           m11, m14
  mova                            m6, m14
  pcmpgtw                         m5, m14                  ; m5 = sign mask of m14
  punpcklwd                      m11, m5
  punpckhwd                       m6, m5
  mova       [dqcoeffq+ncoeffq*4+ 0], m11
  mova       [dqcoeffq+ncoeffq*4+16], m6
  pxor                            m5, m5
  mova                           m11, m13
  mova                            m6, m13
  pcmpgtw                         m5, m13                  ; m5 = sign mask of m13
  punpcklwd                      m11, m5
  punpckhwd                       m6, m5
  mova       [dqcoeffq+ncoeffq*4+32], m11
  mova       [dqcoeffq+ncoeffq*4+48], m6
  pxor                            m5, m5                   ; reset m5 to zero register

  ; fold this group's eob candidates into the running maximum in m8
  pcmpeqw                        m14, m5                   ; m14 = mask: dqc[i] == 0
  pcmpeqw                        m13, m5                   ; m13 = mask: dqc[i] == 0
  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = iscan[i]
  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = iscan[i]
  psubw                           m6, m7                   ; m6 = iscan[i] + 1 where >= zbin
  psubw                          m11, m12                  ; m11 = iscan[i] + 1 where >= zbin
  pandn                          m14, m6                   ; keep candidates of non-zero lanes
  pandn                          m13, m11                  ; keep candidates of non-zero lanes
  pmaxsw                          m8, m14
  pmaxsw                          m8, m13
  add                        ncoeffq, mmsize
  jl .ac_only_loop

%ifidn %1, b_32x32
  jmp .accumulate_eob
.skip_iter:
  ; Whole group is below zbin: m5 is zero here, so both outputs are cleared
  ; and the running eob in m8 is left untouched.
  mova        [qcoeffq+ncoeffq*4+ 0], m5
  mova        [qcoeffq+ncoeffq*4+16], m5
  mova        [qcoeffq+ncoeffq*4+32], m5
  mova        [qcoeffq+ncoeffq*4+48], m5
  mova       [dqcoeffq+ncoeffq*4+ 0], m5
  mova       [dqcoeffq+ncoeffq*4+16], m5
  mova       [dqcoeffq+ncoeffq*4+32], m5
  mova       [dqcoeffq+ncoeffq*4+48], m5
  add                        ncoeffq, mmsize
  jl .ac_only_loop
%endif

.accumulate_eob:
  ; horizontal max of the 8 words in m8, then write the result through eob
  mov                             r2, eobmp
  pshufd                          m7, m8, 0xe              ; high 64 bits -> low
  pmaxsw                          m8, m7
  pshuflw                         m7, m8, 0xe              ; words 2,3 -> words 0,1
  pmaxsw                          m8, m7
  pshuflw                         m7, m8, 0x1              ; word 1 -> word 0
  pmaxsw                          m8, m7
  pextrw                          r6, m8, 0                ; r6 = eob, zero-extended
  ; Store exactly one 16-bit word. A full-register store would write 8 bytes
  ; on x86-64 and zero the 6 bytes that follow the destination.
  ; NOTE(review): assumes eob points at a 16-bit value, matching the 16-bit
  ; lane extracted above - confirm against the C prototype.
  mov                           [r2], r6w
  RET
%endmacro
299
; Instantiate both quantizers for 128-bit XMM registers with SSSE3 enabled
; (the macro body relies on pabsw and psignw). Each expansion requests
; 9 general-purpose registers from cglobal.
INIT_XMM ssse3
QUANTIZE_FN b,       9    ; quantize_b
QUANTIZE_FN b_32x32, 9    ; quantize_b_32x32, enables the 32x32 code paths
303