1;
2;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
; Prefix applied by x86inc to every symbol declared with cglobal, so that
; "cglobal highbd_block_error_8bit" is exported as
; vp9_highbd_block_error_8bit (plus x86inc's CPU suffix).
%define private_prefix vp9

; Provides cglobal, INIT_XMM, the mN / <name>q / <name>m register aliases,
; mova, mmsize and RET used by the code in this file.
%include "third_party/x86inc/x86inc.asm"

SECTION .text
ALIGN 16

;
; int64_t vp9_highbd_block_error_8bit(int32_t *coeff, int32_t *dqcoeff,
;                                     intptr_t block_size, int64_t *ssz)
;
; Returns sum((dqcoeff[i] - coeff[i])^2) for i in [0, block_size) and stores
; sum(coeff[i]^2) to *ssz.
;
; Requirements that follow from the code below:
;  - coeff and dqcoeff must be 16-byte aligned: aligned loads (mova) and SSE2
;    memory operands (packssdw xmm, [mem]) are used.
;  - block_size must be a non-negative multiple of 16.  Every iteration
;    consumes 16 coefficients from each array and the loop only terminates
;    when the counter reaches exactly zero.  block_size == 0 is handled
;    explicitly and yields 0 for both sums.
;  - Coefficients are narrowed to signed 16 bit with saturation (packssdw),
;    so values outside the int16 range give a wrong result.  The squaring
;    step additionally assumes each value/difference is at most
;    15 bit + sign (see the comment inside the loop).
;
; Register use:
;  - Only three GPRs are requested from cglobal (uqc, dqc, size).  ssz is
;    still named as the 4th argument so that sszq / sszm exist: the x86-64
;    path stores through sszq directly (the 4th integer argument is passed in
;    a register by both x86-64 calling conventions), the x86-32 path reloads
;    the pointer from the stack via sszm only once the loop is finished.
;  - m0-m7 are used, matching the 8 XMM registers declared in cglobal
;    (x86inc uses that count to preserve callee-saved XMM registers on ABIs
;    that have them).
;

INIT_XMM sse2
cglobal highbd_block_error_8bit, 3, 3, 8, uqc, dqc, size, ssz
  pxor      m4, m4                 ; sse accumulator (2 x 64bit lanes)
  pxor      m6, m6                 ; ssz accumulator (2 x 64bit lanes)
  pxor      m5, m5                 ; dedicated zero register
  lea     uqcq, [uqcq+sizeq*4]     ; uqc -> one past the last coeff
  lea     dqcq, [dqcq+sizeq*4]     ; dqc -> one past the last dqcoeff
  neg    sizeq                     ; index counts up from -block_size to 0
  jz .done                         ; block_size == 0: skip the bottom-tested
                                   ; loop, both accumulators are still zero

  ALIGN 16

.loop:
  ; load 16 coefficients from each array (4 vectors of 4 dwords) and narrow
  ; them to 2 vectors of 8 signed words with saturation
  mova      m0, [dqcq+sizeq*4]
  packssdw  m0, [dqcq+sizeq*4+mmsize]
  mova      m2, [uqcq+sizeq*4]
  packssdw  m2, [uqcq+sizeq*4+mmsize]

  mova      m1, [dqcq+sizeq*4+mmsize*2]
  packssdw  m1, [dqcq+sizeq*4+mmsize*3]
  mova      m3, [uqcq+sizeq*4+mmsize*2]
  packssdw  m3, [uqcq+sizeq*4+mmsize*3]

  ; 4 vectors * mmsize bytes / 4 bytes per coefficient = mmsize coefficients.
  ; The flags set here are the ones tested by the jnz at the bottom of the
  ; loop; none of the SSE2 instructions in between modify EFLAGS.
  add    sizeq, mmsize

  ; individual errors are max. 15bit+sign, so squares are 30bit, and
  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)

  psubw     m0, m2                 ; m0 = dqcoeff - coeff      (words 0..7)
  pmaddwd   m2, m2                 ; m2 = pairwise sum of coeff^2
  pmaddwd   m0, m0                 ; m0 = pairwise sum of diff^2

  psubw     m1, m3                 ; same for words 8..15
  pmaddwd   m3, m3
  pmaddwd   m1, m1

  ; accumulate in 64bit: interleaving with the zero register zero-extends
  ; each non-negative 32bit sum to 64bit (low half goes to m7, high half
  ; stays in the source register and is added further down)
  punpckldq m7, m0, m5
  punpckhdq m0, m5
  paddq     m4, m7

  punpckldq m7, m2, m5
  punpckhdq m2, m5
  paddq     m6, m7

  punpckldq m7, m1, m5
  punpckhdq m1, m5
  paddq     m4, m7

  punpckldq m7, m3, m5
  punpckhdq m3, m5
  paddq     m6, m7

  paddq     m4, m0
  paddq     m4, m1
  paddq     m6, m2
  paddq     m6, m3

  jnz .loop                        ; ZF from "add sizeq, mmsize" above

.done:
  ; accumulate horizontally and store in return value
  ; (only the low 64bit lane of m4 / m6 is meaningful afterwards)
  movhlps   m5, m4
  movhlps   m7, m6
  paddq     m4, m5
  paddq     m6, m7

%if ARCH_X86_64
  movq    rax, m4                  ; return value = sse
  movq [sszq], m6                  ; *ssz = sum of squared coeff
%else
  mov     eax, sszm                ; ssz pointer was never loaded, fetch it
  pshufd   m5, m4, 0x1             ; m5[31:0] = bits 63:32 of sse
  movq  [eax], m6                  ; *ssz = sum of squared coeff
  movd    eax, m4                  ; 64bit return value in edx:eax
  movd    edx, m5
%endif
  RET
