;
;  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%define private_prefix vp9

%include "third_party/x86inc/x86inc.asm"

SECTION .text
ALIGN 16

;
; int64_t vp9_highbd_block_error_8bit(int32_t *coeff, int32_t *dqcoeff,
;                                     intptr_t block_size, int64_t *ssz)
;

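; For reference, a rough C-level sketch of what this routine computes
; (variable names here are illustrative, not taken from the C fallback):
;
;   int64_t sse = 0, sqc = 0;
;   for (i = 0; i < block_size; i++) {
;     const int32_t diff = dqcoeff[i] - coeff[i];
;     sse += (int64_t)diff * diff;          // return value
;     sqc += (int64_t)coeff[i] * coeff[i];  // written to *ssz
;   }
;
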
INIT_XMM avx
cglobal highbd_block_error_8bit, 4, 5, 8, uqc, dqc, size, ssz
  vzeroupper

  ; If only one iteration is required, handle it as a special case. It is the
  ; most frequent case, so avoiding the loop and accumulator setup yields a
  ; significant gain.
  cmp    sizeq, 16
  jne   .generic

  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  ;; Common case of size == 16
  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

  ; Load input vectors
  mova      xm0, [dqcq]
  packssdw  xm0, [dqcq+16]
  mova      xm2, [uqcq]
  packssdw  xm2, [uqcq+16]

  mova      xm1, [dqcq+32]
  packssdw  xm1, [dqcq+48]
  mova      xm3, [uqcq+32]
  packssdw  xm3, [uqcq+48]

  ; Compute the errors.
  psubw     xm0, xm2
  psubw     xm1, xm3

  ; Individual errors are max 15bit+sign, so squares are 30bit, and
  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit).
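  ; Worked bound (assuming the 15bit+sign range above, i.e. |v| <= 32767):
  ; 32767^2 = 1,073,676,289 < 2^30 and 2 * 32767^2 = 2,147,352,578 < 2^31,
  ; so the dword results of the pmaddwd below cannot overflow.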
  pmaddwd   xm2, xm2
  pmaddwd   xm3, xm3

  pmaddwd   xm0, xm0
  pmaddwd   xm1, xm1

  ; Squares are always positive, so we can use unsigned arithmetic after
  ; squaring. As mentioned earlier, 2 sums fit in 31 bits, so 4 sums will
  ; fit in 32 bits.
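  ; Worked bound: 4 * 32767^2 = 4,294,705,156 < 2^32, so each dword lane of
  ; the sums below still fits in an unsigned 32 bit value.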
  paddd     xm2, xm3
  paddd     xm0, xm1

  ; Accumulate horizontally in 64 bits; there is no chance of overflow here.
  pxor      xm5, xm5

  pblendw   xm3, xm5, xm2, 0x33 ; Zero-extended low dword of each qword
  psrlq     xm2, 32             ; Zero-extended high dword of each qword

  pblendw   xm1, xm5, xm0, 0x33 ; Zero-extended low dword of each qword
  psrlq     xm0, 32             ; Zero-extended high dword of each qword

  paddq     xm2, xm3
  paddq     xm0, xm1

  psrldq    xm3, xm2, 8
  psrldq    xm1, xm0, 8

  paddq     xm2, xm3
  paddq     xm0, xm1

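  ; Sketch of the reduction above, with d0..d3 denoting the dword lanes of a
  ; register: pblendw/psrlq split {d0, d1, d2, d3} into qwords {d0, d2} and
  ; {d1, d3}, the first paddq forms {d0+d1, d2+d3}, and the psrldq/paddq pair
  ; leaves d0+d1+d2+d3 in the low qword of xm0 (sse) and xm2 (ssz).
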
  ; Store the return value
%if ARCH_X86_64
  movq      rax, xm0
  movq   [sszq], xm2
%else
  movd      eax, xm0            ; Return the 64 bit sse in edx:eax on x86-32
  pextrd    edx, xm0, 1
  movq   [sszd], xm2
%endif
  RET

  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  ;; Generic case of size != 16, speculative low precision
  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  ALIGN 16
.generic:
  pxor      xm4, xm4                ; sse accumulator
  pxor      xm5, xm5                ; overflow detection register for xm4
  pxor      xm6, xm6                ; ssz accumulator
  pxor      xm7, xm7                ; overflow detection register for xm6
  lea      uqcq, [uqcq+sizeq*4]
  lea      dqcq, [dqcq+sizeq*4]
  neg     sizeq

  ; Push the negative size as the high precision code might need it
  push    sizeq
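
  ; The loop below walks from -size up to 0: both pointers were advanced to
  ; the end of the buffers, so [dqcq+sizeq*4] starts at the first element.
  ; The "add sizeq, 16" inside the loop sets the flags consumed by the
  ; "jnz .loop" at the bottom of the body; the SSE instructions after it do
  ; not modify EFLAGS.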

.loop:
  ; Load input vectors
  mova      xm0, [dqcq+sizeq*4]
  packssdw  xm0, [dqcq+sizeq*4+16]
  mova      xm2, [uqcq+sizeq*4]
  packssdw  xm2, [uqcq+sizeq*4+16]

  mova      xm1, [dqcq+sizeq*4+32]
  packssdw  xm1, [dqcq+sizeq*4+48]
  mova      xm3, [uqcq+sizeq*4+32]
  packssdw  xm3, [uqcq+sizeq*4+48]

  add     sizeq, 16

  ; Compute the squared errors.
  ; Individual errors are max 15bit+sign, so squares are 30bit, and
  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit).
  psubw     xm0, xm2
  pmaddwd   xm2, xm2
  pmaddwd   xm0, xm0

  psubw     xm1, xm3
  pmaddwd   xm3, xm3
  pmaddwd   xm1, xm1

  ; Squares are always positive, so we can use unsigned arithmetic after
  ; squaring. As mentioned earlier, 2 sums fit in 31 bits, so 4 sums will
  ; fit in 32 bits.
  paddd     xm2, xm3
  paddd     xm0, xm1

  ; We accumulate using 32 bit arithmetic, but detect potential overflow
  ; by checking whether the MSB of any dword lane of the accumulators has
  ; ever been set. If so, we redo the whole computation at higher precision
  ; at the end, but this happens extremely rarely, so we still achieve a
  ; net gain.
  paddd     xm4, xm0
  paddd     xm6, xm2
  por       xm5, xm4  ; OR in the accumulator for overflow detection
  por       xm7, xm6  ; OR in the accumulator for overflow detection

  jnz .loop

  ; Add pairs horizontally (still only on 32 bits)
  phaddd    xm4, xm4
  por       xm5, xm4  ; OR in the accumulator for overflow detection
  phaddd    xm6, xm6
  por       xm7, xm6  ; OR in the accumulator for overflow detection

  ; Check for the possibility of overflow by testing whether bit 31 (the MSB)
  ; of any dword lane has ever been set. If it has not, there was no overflow
  ; and the final sums fit in 32 bits. Otherwise, redo the whole computation
  ; at higher precision. pmovmskb gathers the MSB of every byte, so the mask
  ; 0x8888 selects the top byte of each dword, i.e. bit 31 of each lane.
  por       xm7, xm5
  pmovmskb   r4, xm7
  test       r4, 0x8888
  jnz .highprec

  phaddd    xm4, xm4            ; Final 32 bit sums in dword 0
  phaddd    xm6, xm6
  pmovzxdq  xm4, xm4            ; Zero extend to 64 bits for the int64_t results
  pmovzxdq  xm6, xm6

  ; Restore the stack (drop the negative size pushed before the loop)
  pop     sizeq

  ; Store the return value
%if ARCH_X86_64
  movq      rax, xm4
  movq   [sszq], xm6
%else
  movd      eax, xm4
  pextrd    edx, xm4, 1
  movq   [sszd], xm6
%endif
  RET

  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  ;; Generic case of size != 16, high precision case
  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
.highprec:
  pxor      xm4, xm4                 ; sse accumulator
  pxor      xm5, xm5                 ; dedicated zero register
  pxor      xm6, xm6                 ; ssz accumulator
  pop     sizeq
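
  ; uqcq and dqcq still point one past the end of the buffers, so restoring
  ; the negative size restarts the walk from the first element. This loop is
  ; the same as the one above, except that every partial sum is widened to
  ; 64 bits before being accumulated, so the 32 bit partial sums cannot be
  ; lost to wraparound.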

.loophp:
  mova      xm0, [dqcq+sizeq*4]
  packssdw  xm0, [dqcq+sizeq*4+16]
  mova      xm2, [uqcq+sizeq*4]
  packssdw  xm2, [uqcq+sizeq*4+16]

  mova      xm1, [dqcq+sizeq*4+32]
  packssdw  xm1, [dqcq+sizeq*4+48]
  mova      xm3, [uqcq+sizeq*4+32]
  packssdw  xm3, [uqcq+sizeq*4+48]

  add     sizeq, 16

  ; Individual errors are max 15bit+sign, so squares are 30bit, and
  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit).

  psubw     xm0, xm2
  pmaddwd   xm2, xm2
  pmaddwd   xm0, xm0

  psubw     xm1, xm3
  pmaddwd   xm3, xm3
  pmaddwd   xm1, xm1

  ; Accumulate in 64 bits: interleave each dword with the zero register to
  ; zero extend it to a qword, then add with paddq.
  punpckldq xm7, xm0, xm5
  punpckhdq xm0, xm5
  paddq     xm4, xm7

  punpckldq xm7, xm2, xm5
  punpckhdq xm2, xm5
  paddq     xm6, xm7

  punpckldq xm7, xm1, xm5
  punpckhdq xm1, xm5
  paddq     xm4, xm7

  punpckldq xm7, xm3, xm5
  punpckhdq xm3, xm5
  paddq     xm6, xm7

  paddq     xm4, xm0
  paddq     xm4, xm1
  paddq     xm6, xm2
  paddq     xm6, xm3

  jnz .loophp

  ; Accumulate horizontally
  movhlps   xm5, xm4
  movhlps   xm7, xm6
  paddq     xm4, xm5
  paddq     xm6, xm7

  ; Store the return value
%if ARCH_X86_64
  movq      rax, xm4
  movq   [sszq], xm6
%else
  movd      eax, xm4
  pextrd    edx, xm4, 1
  movq   [sszd], xm6
%endif
  RET

END