1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
; private_prefix is defined before x86inc.asm is included; presumably x86inc
; uses it as the symbol-name prefix for cglobal functions - TODO confirm.
11%define private_prefix vp9
12
13%include "third_party/x86inc/x86inc.asm"
14%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"
15
16SECTION_RODATA
; Eight 16-bit words, each equal to 1.
; NOTE(review): no instruction in the visible part of this file references
; pw_1 by name (the 32x32 path builds its vector of ones with pcmpeqw/psrlw);
; confirm whether an included macro needs it before removing.
17pw_1: times 8 dw 1
18
19SECTION .text
20
;-----------------------------------------------------------------------
; QUANTIZE_FP suffix, gpr_count
;   %1 = function-name suffix; the function is declared as quantize_%1
;        (instantiated at the bottom of this file as fp and fp_32x32).
;   %2 = forwarded as the third cglobal argument (number of general-purpose
;        registers used, per x86inc's cglobal convention - TODO confirm).
;
; Per 16-bit lane, for each group of 16 coefficients c[i]:
;   q  = sign(c) * (((abs(c) +sat round) * quant) >> 16)   -> stored via r3
;   dq = low16(q * dequant)                                -> stored via r4
;   eob candidate = (dq != 0) ? iscan[i] + 1 : 0
; The signed maximum of all eob candidates is written as a 16-bit value
; through the eob pointer.
;
; fp_32x32 differences (all guarded by %ifidn %1, fp_32x32):
;   round = (round + 1) >> 1, quant = quant << 1,
;   dq    = sign(c) * (low16(abs(q) * dequant) >> 1),
;   skip threshold = dequant >> 2 instead of dequant >> 1.
;
; Register roles once setup is complete:
;   coeffq = input pointer, r3 = qcoeff pointer, r4 = dqcoeff pointer,
;   r5 = iscan pointer, ncoeffq = negative index counting up towards zero,
;   r2 / r6 = scratch (r2: dequant pointer, then mask, then eob pointer).
;   m0 = skip threshold, m1 = round, m2 = quant, m3 = dequant,
;   m5 = zero, m7 = all-ones (-1 per word) while eob candidates are built,
;   m8 = running eob maximum (after the first group).
;
; The skip and scan arguments are declared but never read in this macro.
;-----------------------------------------------------------------------
21%macro QUANTIZE_FP 2
22cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, round, quant, \
23                                qcoeff, dqcoeff, dequant, \
24                                eob, scan, iscan
25
26  ; actual quantize loop - setup pointers, rounders, etc.
  ; The "mp" operand names come from x86inc; presumably they refer to each
  ; argument's home location (register or stack) - TODO confirm.
27  movifnidn                   coeffq, coeffmp
28  movifnidn                  ncoeffq, ncoeffmp
  ; r2 temporarily holds the dequant pointer (consumed by the m3 load below).
29  mov                             r2, dequantmp
30  movifnidn                   roundq, roundmp
31  movifnidn                   quantq, quantmp
32  mova                            m1, [roundq]             ; m1 = round
33  mova                            m2, [quantq]             ; m2 = quant
34%ifidn %1, fp_32x32
  ; Build a vector of 1s (all-ones >> 15) and halve round with rounding up.
35  pcmpeqw                         m5, m5
36  psrlw                           m5, 15
37  paddw                           m1, m5
38  psrlw                           m1, 1                    ; m1 = (m1 + 1) / 2
39%endif
40  mova                            m3, [r2q]                ; m3 = dequant
  ; From here on r3/r4/r5 are the qcoeff, dqcoeff and iscan pointers.
41  mov                             r3, qcoeffmp
42  mov                             r4, dqcoeffmp
43  mov                             r5, iscanmp
44%ifidn %1, fp_32x32
  ; 32x32 variant doubles quant.
45  psllw                           m2, 1
46%endif
47  pxor                            m5, m5                   ; m5 = dedicated zero
48
  ; Point every array at its end and index with a negative offset that counts
  ; up to zero. INCREMENT_ELEMENTS_TRAN_LOW is defined in the included
  ; bitdepth_conversion file; presumably it advances the pointer by ncoeff
  ; coefficient-sized elements - TODO confirm. iscan is advanced by
  ; ncoeff * 2 bytes (16-bit entries).
49  INCREMENT_ELEMENTS_TRAN_LOW coeffq, ncoeffq
50  lea                            r5q, [r5q+ncoeffq*2]
51  INCREMENT_ELEMENTS_TRAN_LOW    r3q, ncoeffq
52  INCREMENT_ELEMENTS_TRAN_LOW    r4q, ncoeffq
53  neg                        ncoeffq
54
55  ; get DC and first 15 AC coeffs
  ; First group of 16: the low halves of m1/m2/m3 are used for the first 8
  ; lanes, then punpckhqdq broadcasts each register's upper 64 bits to both
  ; halves and that value is used for every later lane.
56  LOAD_TRAN_LOW  9, coeffq, ncoeffq                        ; m9 = c[i]
57  LOAD_TRAN_LOW 10, coeffq, ncoeffq + 8                    ; m10 = c[i]
58  pabsw                           m6, m9                   ; m6 = abs(m9)
59  pabsw                          m11, m10                  ; m11 = abs(m10)
  ; m7 = all-ones (-1 per word); subtracting it below adds 1 to iscan.
60  pcmpeqw                         m7, m7
61
62  paddsw                          m6, m1                   ; m6 += round (saturating)
63  punpckhqdq                      m1, m1
64  paddsw                         m11, m1                   ; m11 += round (saturating)
65  pmulhw                          m8, m6, m2               ; m8 = m6*q>>16
66  punpckhqdq                      m2, m2
67  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
68  psignw                          m8, m9                   ; m8 = reinsert sign
69  psignw                         m13, m10                  ; m13 = reinsert sign
  ; Store quantized values through r3 (qcoeff). The trailing 6, 11, 12 are
  ; presumably scratch register numbers for the macro - TODO confirm.
70  STORE_TRAN_LOW  8, r3q, ncoeffq,     6, 11, 12
71  STORE_TRAN_LOW 13, r3q, ncoeffq + 8, 6, 11, 12
72%ifidn %1, fp_32x32
  ; 32x32: work on magnitudes so the logical >> 1 below is well defined.
73  pabsw                           m8, m8
74  pabsw                          m13, m13
75%endif
76  pmullw                          m8, m3                   ; m8 = low16(q * dequant)
77  punpckhqdq                      m3, m3
78  pmullw                         m13, m3                   ; m13 = low16(q * dequant)
  ; m0 = skip threshold for the loop below, derived from m3 after its upper
  ; half has been broadcast: dequant >> 2 for 32x32, dequant >> 1 otherwise.
79%ifidn %1, fp_32x32
80  psrlw                           m8, 1
81  psrlw                          m13, 1
82  psignw                          m8, m9
83  psignw                         m13, m10
84  psrlw                           m0, m3, 2
85%else
86  psrlw                           m0, m3, 1
87%endif
  ; Store dequantized values through r4 (dqcoeff).
88  STORE_TRAN_LOW  8, r4q, ncoeffq,     6, 11, 12
89  STORE_TRAN_LOW 13, r4q, ncoeffq + 8, 6, 11, 12
90  pcmpeqw                         m8, m5                   ; m8 = (dq lane == 0) mask
91  pcmpeqw                        m13, m5                   ; m13 = (dq lane == 0) mask
92  mova                            m6, [  r5q+ncoeffq*2+ 0] ; m6 = iscan[i]
93  mova                           m11, [  r5q+ncoeffq*2+16] ; m11 = iscan[i]
94  psubw                           m6, m7                   ; m6 = iscan[i] + 1
95  psubw                          m11, m7                   ; m11 = iscan[i] + 1
96  pandn                           m8, m6                   ; m8 = (dq != 0) ? iscan[i]+1 : 0
97  pandn                          m13, m11                  ; m13 = (dq != 0) ? iscan[i]+1 : 0
  ; m8 becomes the running per-lane eob maximum.
98  pmaxsw                          m8, m13
  ; Advance by 16 coefficients; finished if the index reached zero.
99  add                        ncoeffq, mmsize
100  jz .accumulate_eob
101
  ; Remaining groups of 16 coefficients; m1/m2/m3 now hold the broadcast
  ; upper-half values in every lane.
102.ac_only_loop:
103  LOAD_TRAN_LOW  9, coeffq, ncoeffq                        ; m9 = c[i]
104  LOAD_TRAN_LOW 10, coeffq, ncoeffq + 8                    ; m10 = c[i]
105  pabsw                           m6, m9                   ; m6 = abs(m9)
106  pabsw                          m11, m10                  ; m11 = abs(m10)
107
  ; Early out: if no abs(c) in this group exceeds the threshold m0 (signed
  ; compare), both byte masks are zero and the group is written as zeros.
  ; m7 and r2 are clobbered here as scratch.
108  pcmpgtw                         m7, m6,  m0
109  pcmpgtw                        m12, m11, m0
110  pmovmskb                       r6d, m7
111  pmovmskb                       r2d, m12
112
113  or                              r6, r2
114  jz .skip_iter
115
  ; Restore m7 = all-ones for the iscan + 1 computation below.
116  pcmpeqw                         m7, m7
117
118  paddsw                          m6, m1                   ; m6 += round (saturating)
119  paddsw                         m11, m1                   ; m11 += round (saturating)
120  pmulhw                         m14, m6, m2               ; m14 = m6*q>>16
121  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
122  psignw                         m14, m9                   ; m14 = reinsert sign
123  psignw                         m13, m10                  ; m13 = reinsert sign
  ; Store quantized values through r3 (qcoeff).
124  STORE_TRAN_LOW 14, r3q, ncoeffq,     6, 11, 12
125  STORE_TRAN_LOW 13, r3q, ncoeffq + 8, 6, 11, 12
126%ifidn %1, fp_32x32
127  pabsw                          m14, m14
128  pabsw                          m13, m13
129%endif
130  pmullw                         m14, m3                   ; m14 = low16(q * dequant)
131  pmullw                         m13, m3                   ; m13 = low16(q * dequant)
132%ifidn %1, fp_32x32
  ; 32x32: halve the magnitude, then reapply the sign of the input.
133  psrlw                          m14, 1
134  psrlw                          m13, 1
135  psignw                         m14, m9
136  psignw                         m13, m10
137%endif
  ; Store dequantized values through r4 (dqcoeff).
138  STORE_TRAN_LOW 14, r4q, ncoeffq,     6, 11, 12
139  STORE_TRAN_LOW 13, r4q, ncoeffq + 8, 6, 11, 12
140  pcmpeqw                        m14, m5                   ; m14 = (dq lane == 0) mask
141  pcmpeqw                        m13, m5                   ; m13 = (dq lane == 0) mask
142  mova                            m6, [  r5q+ncoeffq*2+ 0] ; m6 = iscan[i]
143  mova                           m11, [  r5q+ncoeffq*2+16] ; m11 = iscan[i]
144  psubw                           m6, m7                   ; m6 = iscan[i] + 1
145  psubw                          m11, m7                   ; m11 = iscan[i] + 1
146  pandn                          m14, m6                   ; m14 = (dq != 0) ? iscan[i]+1 : 0
147  pandn                          m13, m11                  ; m13 = (dq != 0) ? iscan[i]+1 : 0
  ; Fold both halves into the running maximum.
148  pmaxsw                          m8, m14
149  pmaxsw                          m8, m13
150  add                        ncoeffq, mmsize
151  jl .ac_only_loop
152
153  jmp .accumulate_eob
  ; Whole group is at or below the threshold: write zeros to both outputs and
  ; leave the running eob maximum untouched.
154.skip_iter:
155  STORE_ZERO_TRAN_LOW 5, r3q, ncoeffq
156  STORE_ZERO_TRAN_LOW 5, r3q, ncoeffq + 8
157  STORE_ZERO_TRAN_LOW 5, r4q, ncoeffq
158  STORE_ZERO_TRAN_LOW 5, r4q, ncoeffq + 8
159  add                        ncoeffq, mmsize
160  jl .ac_only_loop
161
162.accumulate_eob:
163  ; horizontally accumulate/max eobs and write into [eob] memory pointer
  ; Reduction: upper 64 bits onto lower, then words 2-3 onto words 0-1, then
  ; word 1 onto word 0; word 0 ends up holding the maximum of all 8 lanes.
164  mov                             r2, eobmp
165  pshufd                          m7, m8, 0xe
166  pmaxsw                          m8, m7
167  pshuflw                         m7, m8, 0xe
168  pmaxsw                          m8, m7
169  pshuflw                         m7, m8, 0x1
170  pmaxsw                          m8, m7
171  pextrw                          r6, m8, 0
  ; 16-bit store: only the low word of r6 is written through the eob pointer.
172  mov                           [r2], r6w
173  RET
174%endmacro
175
; Select XMM registers and the ssse3 instruction set for the functions
; emitted below (INIT_XMM comes from x86inc.asm).
176INIT_XMM ssse3
; Instantiate the regular and the 32x32 variants; both pass 7 as the macro's
; second parameter, which is forwarded to cglobal.
177QUANTIZE_FP fp, 7
178QUANTIZE_FP fp_32x32, 7
179