1;
2;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3;
4;  Use of this source code is governed by a BSD-style license
5;  that can be found in the LICENSE file in the root of the source
6;  tree. An additional intellectual property rights grant can be found
7;  in the file PATENTS.  All contributing project authors may
8;  be found in the AUTHORS file in the root of the source tree.
9;
10
11%define private_prefix vp9
12
13%include "third_party/x86inc/x86inc.asm"
14
15; This file provides SSSE3 version of the forward transformation. Part
16; of the macro definitions are originally derived from the ffmpeg project.
17; The current version applies to x86 64-bit only.
18
19SECTION .text
20
21%if ARCH_X86_64
22; matrix transpose
; INTERLEAVE_2X granularity, a, b, tmp
;
; One transpose step: interleave registers m<a> and m<b> at the given
; element granularity (%1 is the punpck suffix: wd = 16-bit words,
; dq = 32-bit dwords, qdq = 64-bit qwords).
;
; On exit (after the SWAP renaming):
;   m<a>   = low-half interleave of the original m<a>/m<b>
;   m<b>   = high-half interleave of the original m<a>/m<b>
;   m<tmp> = scratch, contents undefined
;
; Note: the 3-operand punpckh form relies on x86inc's SSE emulation
; (it inserts a mov when AVX is unavailable); SWAP renames register
; numbers at assembly time rather than moving data.
%macro INTERLEAVE_2X 4
  punpckh%1          m%4, m%2, m%3     ; m%4 = high halves interleaved
  punpckl%1          m%2, m%3          ; m%2 = low halves interleaved
  SWAP               %3,  %4           ; rename so the pair is (%2, %3) again
%endmacro
28
; TRANSPOSE8X8 r0, r1, r2, r3, r4, r5, r6, r7, tmp
;
; Transpose an 8x8 matrix of 16-bit elements held one row per register
; in m%1..m%8, using m%9 as scratch.  Classic three-stage interleave:
; first on 16-bit words, then 32-bit dwords, then 64-bit qwords.
; After the qdq stage the rows land in a permuted register order, so
; the two final SWAPs (register renames, no data movement) restore
; natural row order in m%1..m%8.
%macro TRANSPOSE8X8 9
  ; stage 1: interleave 16-bit words of adjacent row pairs
  INTERLEAVE_2X  wd, %1, %2, %9
  INTERLEAVE_2X  wd, %3, %4, %9
  INTERLEAVE_2X  wd, %5, %6, %9
  INTERLEAVE_2X  wd, %7, %8, %9

  ; stage 2: interleave 32-bit dwords of pairs at distance 2
  INTERLEAVE_2X  dq, %1, %3, %9
  INTERLEAVE_2X  dq, %2, %4, %9
  INTERLEAVE_2X  dq, %5, %7, %9
  INTERLEAVE_2X  dq, %6, %8, %9

  ; stage 3: interleave 64-bit qwords of pairs at distance 4
  INTERLEAVE_2X  qdq, %1, %5, %9
  INTERLEAVE_2X  qdq, %3, %7, %9
  INTERLEAVE_2X  qdq, %2, %6, %9
  INTERLEAVE_2X  qdq, %4, %8, %9

  ; undo the row permutation left by stage 3
  SWAP  %2, %5
  SWAP  %4, %7
%endmacro
48
; HMD8_1D
;
; One 8-point Hadamard butterfly pass applied in parallel across the
; eight 16-bit lanes of m0..m7 (each register holds one row; the pass
; therefore performs eight 1-D 8-point transforms at once).
;
; Three butterfly stages at distances 1, 2 and 4.  Each stage computes
; the differences into scratch registers m8/m9 first (the 3-operand
; psubw form leaves the sources intact), accumulates the sums in place,
; then SWAPs the scratch register numbers back into the m0..m7 set.
; In/out: m0..m7.  Clobbers: m8, m9 (via renaming).
; Note: results are plain wrap-around 16-bit sums/differences; callers
; must ensure the input range cannot overflow int16 — TODO confirm.
%macro HMD8_1D 0
  ; stage 1: butterflies between neighbors (0,1) (2,3) (4,5) (6,7)
  psubw              m8, m0, m1        ; m8 = a0 - a1
  psubw              m9, m2, m3        ; m9 = a2 - a3
  paddw              m0, m1            ; m0 = a0 + a1
  paddw              m2, m3            ; m2 = a2 + a3
  SWAP               1, 8              ; m1 <- difference
  SWAP               3, 9              ; m3 <- difference
  psubw              m8, m4, m5
  psubw              m9, m6, m7
  paddw              m4, m5
  paddw              m6, m7
  SWAP               5, 8
  SWAP               7, 9

  ; stage 2: butterflies at distance 2: (0,2) (1,3) (4,6) (5,7)
  psubw              m8, m0, m2
  psubw              m9, m1, m3
  paddw              m0, m2
  paddw              m1, m3
  SWAP               2, 8
  SWAP               3, 9
  psubw              m8, m4, m6
  psubw              m9, m5, m7
  paddw              m4, m6
  paddw              m5, m7
  SWAP               6, 8
  SWAP               7, 9

  ; stage 3: butterflies at distance 4: (0,4) (1,5) (2,6) (3,7)
  psubw              m8, m0, m4
  psubw              m9, m1, m5
  paddw              m0, m4
  paddw              m1, m5
  SWAP               4, 8
  SWAP               5, 9
  psubw              m8, m2, m6
  psubw              m9, m3, m7
  paddw              m2, m6
  paddw              m3, m7
  SWAP               6, 8
  SWAP               7, 9
%endmacro
89
INIT_XMM ssse3
; vp9_hadamard_8x8_ssse3(input, stride, output)
;
; 8x8 Hadamard transform: load eight rows of 8 int16 samples, run a
; 1-D pass on the rows, transpose, run a second 1-D pass (now over the
; original columns), and store the 64 resulting int16 coefficients
; contiguously (eight 16-byte rows, 128 bytes total) at output.
;
; 3 args, 5 GPRs, 10 XMM regs (m0..m7 data, m8/m9 scratch).
; Addressing assumes stride is in int16_t elements, so one row is
; 2*stride bytes — TODO confirm against the C prototype.
; x86-64 only (needs m8/m9); guarded by ARCH_X86_64 above.
cglobal hadamard_8x8, 3, 5, 10, input, stride, output
  lea                r3, [2 * strideq]        ; r3 = one row, in bytes
  lea                r4, [4 * strideq]        ; r4 = two rows, in bytes

  ; load rows 0..7 two at a time, advancing input by two rows
  mova               m0, [inputq]
  mova               m1, [inputq + r3]
  lea                inputq, [inputq + r4]
  mova               m2, [inputq]
  mova               m3, [inputq + r3]
  lea                inputq, [inputq + r4]
  mova               m4, [inputq]
  mova               m5, [inputq + r3]
  lea                inputq, [inputq + r4]
  mova               m6, [inputq]
  mova               m7, [inputq + r3]

  HMD8_1D                                     ; 1-D pass over rows
  TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9     ; rows <-> columns
  HMD8_1D                                     ; 1-D pass over columns

  ; store the 8x8 int16 result contiguously (16 bytes per row)
  mova              [outputq +   0], m0
  mova              [outputq +  16], m1
  mova              [outputq +  32], m2
  mova              [outputq +  48], m3
  mova              [outputq +  64], m4
  mova              [outputq +  80], m5
  mova              [outputq +  96], m6
  mova              [outputq + 112], m7

  RET
121%endif
122