1 /*
2  * Copyright (c) 2017, Alliance for Open Media. All rights reserved
3  *
4  * This source code is subject to the terms of the BSD 2 Clause License and
5  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6  * was not distributed with this source code in the LICENSE file, you can
7  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8  * Media Patent License 1.0 was not distributed with this source code in the
9  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10  */
11 
12 #ifndef AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_
13 #define AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_
14 
15 #include <emmintrin.h>  // SSE2
16 
17 #include "config/aom_config.h"
18 
// Transposes a 6x6 block of 16-bit values.
//
// Inputs:  x0..x5 hold one row each; only lanes 0..5 are significant.
// Outputs: d0..d5 receive one column each; dc holds 0c 1c 2c 3c 4c 5c in
//          lanes 0..5. Lanes 6..7 of every output are filler and must not be
//          relied upon.
//
// All six inputs are loaded into locals before the first output is stored.
static INLINE void highbd_transpose6x6_sse2(__m128i *x0, __m128i *x1,
                                            __m128i *x2, __m128i *x3,
                                            __m128i *x4, __m128i *x5,
                                            __m128i *d0, __m128i *d1,
                                            __m128i *d2, __m128i *d3,
                                            __m128i *d4, __m128i *d5) {
  // 00 01 02 03 04 05 xx xx
  // 10 11 12 13 14 15 xx xx
  // 20 21 22 23 24 25 xx xx
  // 30 31 32 33 34 35 xx xx
  // 40 41 42 43 44 45 xx xx
  // 50 51 52 53 54 55 xx xx

  // Interleave adjacent rows. The low halves carry columns 0..3, the high
  // halves carry columns 4..5 (plus the unused lanes).
  const __m128i w0 = _mm_unpacklo_epi16(*x0, *x1);  // 00 10 01 11 02 12 03 13
  const __m128i w1 = _mm_unpacklo_epi16(*x2, *x3);  // 20 30 21 31 22 32 23 33
  const __m128i w2 = _mm_unpacklo_epi16(*x4, *x5);  // 40 50 41 51 42 52 43 53
  const __m128i w3 = _mm_unpackhi_epi16(*x0, *x1);  // 04 14 05 15 xx xx xx xx
  const __m128i w4 = _mm_unpackhi_epi16(*x2, *x3);  // 24 34 25 35 xx xx xx xx
  const __m128i w5 = _mm_unpackhi_epi16(*x4, *x5);  // 44 54 45 55 xx xx xx xx

  const __m128i ww01 = _mm_unpacklo_epi32(w0, w1);  // 00 10 20 30 01 11 21 31
  const __m128i ww23 = _mm_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
  const __m128i ww45 = _mm_unpacklo_epi32(w3, w4);  // 04 14 24 34 05 15 25 35

  // Columns 0 and 1.
  *d0 = _mm_unpacklo_epi64(ww01, w2);  // 00 10 20 30 40 50 41 51
  // unpackhi takes the HIGH qword of its second operand, so the "41 51" pair
  // has to be moved up from lanes 2..3 to lanes 4..5: shift LEFT by 4 bytes.
  // (A right shift would put "43 53" there and corrupt column 1.)
  *d1 = _mm_unpackhi_epi64(ww01,
                           _mm_slli_si128(w2, 4));  // 01 11 21 31 41 51 xx xx

  // Columns 2 and 3.
  *d2 = _mm_unpacklo_epi64(ww23,
                           _mm_srli_si128(w2, 8));  // 02 12 22 32 42 52 xx xx
  *d3 = _mm_unpackhi_epi64(ww23,
                           _mm_srli_si128(w2, 4));  // 03 13 23 33 43 53 xx xx

  // Columns 4 and 5.
  *d4 = _mm_unpacklo_epi64(ww45, w5);  // 04 14 24 34 44 54 45 55
  *d5 = _mm_unpackhi_epi64(ww45,
                           _mm_slli_si128(w5, 4));  // 05 15 25 35 45 55 xx xx
}
58 
// Transposes columns 0..3 of a 4-row block of 16-bit values.
//
// Inputs:  x0..x3 hold one row each (eight 16-bit lanes); only lanes 0..3 are
//          consumed here.
// Outputs: dc (c = 0..3) receives 0c 1c 2c 3c in lanes 0..3; lanes 4..7 are
//          filled with zero.
static INLINE void highbd_transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1,
                                                    __m128i *x2, __m128i *x3,
                                                    __m128i *d0, __m128i *d1,
                                                    __m128i *d2, __m128i *d3) {
  // Pair up rows at 16-bit granularity.
  const __m128i rows01 = _mm_unpacklo_epi16(*x0, *x1);  // 00 10 01 11 .. 03 13
  const __m128i rows23 = _mm_unpacklo_epi16(*x2, *x3);  // 20 30 21 31 .. 23 33

  // Merge the pairs at 32-bit granularity: two complete columns per register.
  const __m128i cols01 =
      _mm_unpacklo_epi32(rows01, rows23);  // 00 10 20 30 01 11 21 31
  const __m128i cols23 =
      _mm_unpackhi_epi32(rows01, rows23);  // 02 12 22 32 03 13 23 33

  // Split each register into its two columns, zero-extending to 128 bits.
  const __m128i zeros = _mm_setzero_si128();
  *d0 = _mm_unpacklo_epi64(cols01, zeros);  // 00 10 20 30 00 00 00 00
  *d1 = _mm_unpackhi_epi64(cols01, zeros);  // 01 11 21 31 00 00 00 00
  *d2 = _mm_unpacklo_epi64(cols23, zeros);  // 02 12 22 32 00 00 00 00
  *d3 = _mm_unpackhi_epi64(cols23, zeros);  // 03 13 23 33 00 00 00 00
}
77 
// Transposes columns 4..7 of a 4-row block of 16-bit values.
//
// Inputs:  x0..x3 hold one row each (eight 16-bit lanes); only lanes 4..7 are
//          consumed here.
// Outputs: d4..d7 receive columns 4..7 respectively, i.e. dc holds
//          0c 1c 2c 3c in lanes 0..3; lanes 4..7 are filled with zero.
static INLINE void highbd_transpose4x8_8x4_high_sse2(__m128i *x0, __m128i *x1,
                                                     __m128i *x2, __m128i *x3,
                                                     __m128i *d4, __m128i *d5,
                                                     __m128i *d6, __m128i *d7) {
  // Pair up the upper halves of the rows at 16-bit granularity.
  const __m128i rows01 = _mm_unpackhi_epi16(*x0, *x1);  // 04 14 05 15 .. 07 17
  const __m128i rows23 = _mm_unpackhi_epi16(*x2, *x3);  // 24 34 25 35 .. 27 37

  // Merge the pairs at 32-bit granularity: two complete columns per register.
  const __m128i cols45 =
      _mm_unpacklo_epi32(rows01, rows23);  // 04 14 24 34 05 15 25 35
  const __m128i cols67 =
      _mm_unpackhi_epi32(rows01, rows23);  // 06 16 26 36 07 17 27 37

  // Split each register into its two columns, zero-extending to 128 bits.
  const __m128i zeros = _mm_setzero_si128();
  *d4 = _mm_unpacklo_epi64(cols45, zeros);  // 04 14 24 34 00 00 00 00
  *d5 = _mm_unpackhi_epi64(cols45, zeros);  // 05 15 25 35 00 00 00 00
  *d6 = _mm_unpacklo_epi64(cols67, zeros);  // 06 16 26 36 00 00 00 00
  *d7 = _mm_unpackhi_epi64(cols67, zeros);  // 07 17 27 37 00 00 00 00
}
96 
// Transposes a 4-row by 8-column block of 16-bit values into eight 4-element
// columns.
//
//   input                          output (lanes 0..3; lanes 4..7 are zero)
//   x0  00 01 02 03 04 05 06 07    d0  00 10 20 30
//   x1  10 11 12 13 14 15 16 17    d1  01 11 21 31
//   x2  20 21 22 23 24 25 26 27    d2  02 12 22 32
//   x3  30 31 32 33 34 35 36 37    d3  03 13 23 33
//                                  d4  04 14 24 34
//                                  d5  05 15 25 35
//                                  d6  06 16 26 36
//                                  d7  07 17 27 37
//
// The output pointers must not alias the input pointers: the inputs are read
// a second time (for columns 4..7) after d0..d3 have already been stored, and
// no private copy of them is kept.
static INLINE void highbd_transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1,
                                                __m128i *x2, __m128i *x3,
                                                __m128i *d0, __m128i *d1,
                                                __m128i *d2, __m128i *d3,
                                                __m128i *d4, __m128i *d5,
                                                __m128i *d6, __m128i *d7) {
  // Columns 0..3 first ...
  highbd_transpose4x8_8x4_low_sse2(x0, x1, x2, x3, d0, d1, d2, d3);
  // ... then columns 4..7, from the same (still unmodified) inputs.
  highbd_transpose4x8_8x4_high_sse2(x0, x1, x2, x3, d4, d5, d6, d7);
}
122 
// Transposes columns 0..3 of an 8x8 block of 16-bit values.
//
// Inputs:  x0..x7 hold one row each; only lanes 0..3 are consumed here.
// Outputs: dc (c = 0..3) receives the full column 0c 1c 2c 3c 4c 5c 6c 7c.
static INLINE void highbd_transpose8x8_low_sse2(__m128i *x0, __m128i *x1,
                                                __m128i *x2, __m128i *x3,
                                                __m128i *x4, __m128i *x5,
                                                __m128i *x6, __m128i *x7,
                                                __m128i *d0, __m128i *d1,
                                                __m128i *d2, __m128i *d3) {
  // x0 00 01 02 03 04 05 06 07
  // x1 10 11 12 13 14 15 16 17
  // x2 20 21 22 23 24 25 26 27
  // x3 30 31 32 33 34 35 36 37
  // x4 40 41 42 43 44 45 46 47
  // x5 50 51 52 53 54 55 56 57
  // x6 60 61 62 63 64 65 66 67
  // x7 70 71 72 73 74 75 76 77

  // Stage 1: interleave adjacent rows (16-bit).
  const __m128i rows01 = _mm_unpacklo_epi16(*x0, *x1);  // 00 10 01 11 .. 03 13
  const __m128i rows23 = _mm_unpacklo_epi16(*x2, *x3);  // 20 30 21 31 .. 23 33
  const __m128i rows45 = _mm_unpacklo_epi16(*x4, *x5);  // 40 50 41 51 .. 43 53
  const __m128i rows67 = _mm_unpacklo_epi16(*x6, *x7);  // 60 70 61 71 .. 63 73

  // Stage 2: interleave row pairs (32-bit). "top" covers rows 0..3, "bot"
  // covers rows 4..7; each register holds two half-columns.
  const __m128i top_c01 =
      _mm_unpacklo_epi32(rows01, rows23);  // 00 10 20 30 01 11 21 31
  const __m128i bot_c01 =
      _mm_unpacklo_epi32(rows45, rows67);  // 40 50 60 70 41 51 61 71
  const __m128i top_c23 =
      _mm_unpackhi_epi32(rows01, rows23);  // 02 12 22 32 03 13 23 33
  const __m128i bot_c23 =
      _mm_unpackhi_epi32(rows45, rows67);  // 42 52 62 72 43 53 63 73

  // Stage 3: join top and bottom halves (64-bit) into complete columns.
  *d0 = _mm_unpacklo_epi64(top_c01, bot_c01);  // 00 10 20 30 40 50 60 70
  *d1 = _mm_unpackhi_epi64(top_c01, bot_c01);  // 01 11 21 31 41 51 61 71
  *d2 = _mm_unpacklo_epi64(top_c23, bot_c23);  // 02 12 22 32 42 52 62 72
  *d3 = _mm_unpackhi_epi64(top_c23, bot_c23);  // 03 13 23 33 43 53 63 73
}
156 
// Transposes columns 4..7 of an 8x8 block of 16-bit values.
//
// Inputs:  x0..x7 hold one row each; only lanes 4..7 are consumed here.
// Outputs: d4..d7 receive columns 4..7 respectively, i.e. dc holds
//          0c 1c 2c 3c 4c 5c 6c 7c.
static INLINE void highbd_transpose8x8_high_sse2(__m128i *x0, __m128i *x1,
                                                 __m128i *x2, __m128i *x3,
                                                 __m128i *x4, __m128i *x5,
                                                 __m128i *x6, __m128i *x7,
                                                 __m128i *d4, __m128i *d5,
                                                 __m128i *d6, __m128i *d7) {
  // x0 00 01 02 03 04 05 06 07
  // x1 10 11 12 13 14 15 16 17
  // x2 20 21 22 23 24 25 26 27
  // x3 30 31 32 33 34 35 36 37
  // x4 40 41 42 43 44 45 46 47
  // x5 50 51 52 53 54 55 56 57
  // x6 60 61 62 63 64 65 66 67
  // x7 70 71 72 73 74 75 76 77

  // Stage 1: interleave the upper halves of adjacent rows (16-bit).
  const __m128i rows01 = _mm_unpackhi_epi16(*x0, *x1);  // 04 14 05 15 .. 07 17
  const __m128i rows23 = _mm_unpackhi_epi16(*x2, *x3);  // 24 34 25 35 .. 27 37
  const __m128i rows45 = _mm_unpackhi_epi16(*x4, *x5);  // 44 54 45 55 .. 47 57
  const __m128i rows67 = _mm_unpackhi_epi16(*x6, *x7);  // 64 74 65 75 .. 67 77

  // Stage 2: interleave row pairs (32-bit). "top" covers rows 0..3, "bot"
  // covers rows 4..7; each register holds two half-columns.
  const __m128i top_c45 =
      _mm_unpacklo_epi32(rows01, rows23);  // 04 14 24 34 05 15 25 35
  const __m128i bot_c45 =
      _mm_unpacklo_epi32(rows45, rows67);  // 44 54 64 74 45 55 65 75
  const __m128i top_c67 =
      _mm_unpackhi_epi32(rows01, rows23);  // 06 16 26 36 07 17 27 37
  const __m128i bot_c67 =
      _mm_unpackhi_epi32(rows45, rows67);  // 46 56 66 76 47 57 67 77

  // Stage 3: join top and bottom halves (64-bit) into complete columns.
  *d4 = _mm_unpacklo_epi64(top_c45, bot_c45);  // 04 14 24 34 44 54 64 74
  *d5 = _mm_unpackhi_epi64(top_c45, bot_c45);  // 05 15 25 35 45 55 65 75
  *d6 = _mm_unpacklo_epi64(top_c67, bot_c67);  // 06 16 26 36 46 56 66 76
  *d7 = _mm_unpackhi_epi64(top_c67, bot_c67);  // 07 17 27 37 47 57 67 77
}
189 
// Transposes a full 8x8 block of 16-bit values: x0..x7 are the rows, d0..d7
// receive the columns (dc = 0c 1c 2c 3c 4c 5c 6c 7c).
//
// The output pointers must not alias the input pointers: the inputs are read
// a second time (for columns 4..7) after d0..d3 have already been stored, and
// no private copy of them is kept.
static INLINE void highbd_transpose8x8_sse2(
    __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
    __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1,
    __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6,
    __m128i *d7) {
  // Columns 0..3.
  highbd_transpose8x8_low_sse2(x0, x1, x2, x3, x4, x5, x6, x7,  //
                               d0, d1, d2, d3);
  // Columns 4..7.
  highbd_transpose8x8_high_sse2(x0, x1, x2, x3, x4, x5, x6, x7,  //
                                d4, d5, d6, d7);
}
200 
// Runs two independent 8x8 16-bit transposes. Every xN and dN must point to
// an array of at least two __m128i: element [0] of the inputs is transposed
// into element [0] of the outputs, then element [1] into element [1].
//
// The output arrays must not overlap the input arrays: each 8x8 transpose
// re-reads its inputs after it has started storing outputs, and no private
// copy of them is kept.
static INLINE void highbd_transpose8x16_sse2(
    __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
    __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1,
    __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6,
    __m128i *d7) {
  for (int half = 0; half < 2; ++half) {
    highbd_transpose8x8_sse2(x0 + half, x1 + half, x2 + half, x3 + half,
                             x4 + half, x5 + half, x6 + half, x7 + half,
                             d0 + half, d1 + half, d2 + half, d3 + half,
                             d4 + half, d5 + half, d6 + half, d7 + half);
  }
}
214 
215 // Low bit depth functions
// Transposes columns 0..3 of a 4-row block of 8-bit values.
//
//   input (low 8 bytes of each register)   output (low 4 bytes of each)
//   x0  00 01 02 03 04 05 06 07            d0  00 10 20 30
//   x1  10 11 12 13 14 15 16 17            d1  01 11 21 31
//   x2  20 21 22 23 24 25 26 27            d2  02 12 22 32
//   x3  30 31 32 33 34 35 36 37            d3  03 13 23 33
//
// Bytes 4..15 of the outputs are not part of the result: d0 still carries
// columns 1..3 there, and d1..d3 are byte-shifted copies of d0.
static INLINE void transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1,
                                             __m128i *x2, __m128i *x3,
                                             __m128i *d0, __m128i *d1,
                                             __m128i *d2, __m128i *d3) {
  // Byte-interleave adjacent rows.
  // rows01: 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
  // rows23: 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
  const __m128i rows01 = _mm_unpacklo_epi8(*x0, *x1);
  const __m128i rows23 = _mm_unpacklo_epi8(*x2, *x3);

  // All four columns packed back to back:
  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
  *d0 = _mm_unpacklo_epi16(rows01, rows23);

  // Slide column c down to byte 0 of dc (4 bytes per column).
  *d1 = _mm_srli_si128(*d0, 4);   // 01 11 21 31 ...
  *d2 = _mm_srli_si128(*d0, 8);   // 02 12 22 32 ...
  *d3 = _mm_srli_si128(*d0, 12);  // 03 13 23 33 ...
}
248 
// Transposes a 4-row by 8-column block of 8-bit values into eight 4-byte
// columns.
//
//   input (low 8 bytes of each register)   output (low 4 bytes of each)
//   x0  00 01 02 03 04 05 06 07            d0  00 10 20 30
//   x1  10 11 12 13 14 15 16 17            d1  01 11 21 31
//   x2  20 21 22 23 24 25 26 27            d2  02 12 22 32
//   x3  30 31 32 33 34 35 36 37            d3  03 13 23 33
//                                          d4  04 14 24 34
//                                          d5  05 15 25 35
//                                          d6  06 16 26 36
//                                          d7  07 17 27 37
//
// Bytes 4..15 of the outputs are not part of the result (they hold the
// following columns of the same group, or zeros shifted in).
static INLINE void transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
                                         __m128i *x3, __m128i *d0, __m128i *d1,
                                         __m128i *d2, __m128i *d3, __m128i *d4,
                                         __m128i *d5, __m128i *d6,
                                         __m128i *d7) {
  // Byte-interleave adjacent rows.
  // rows01: 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
  // rows23: 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
  const __m128i rows01 = _mm_unpacklo_epi8(*x0, *x1);
  const __m128i rows23 = _mm_unpacklo_epi8(*x2, *x3);

  // Four packed columns per register.
  // cols0123: 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
  // cols4567: 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
  const __m128i cols0123 = _mm_unpacklo_epi16(rows01, rows23);
  const __m128i cols4567 = _mm_unpackhi_epi16(rows01, rows23);

  // Slide each column down to byte 0 of its own output (4 bytes per column).
  *d0 = cols0123;                      // 00 10 20 30 ...
  *d1 = _mm_srli_si128(cols0123, 4);   // 01 11 21 31 ...
  *d2 = _mm_srli_si128(cols0123, 8);   // 02 12 22 32 ...
  *d3 = _mm_srli_si128(cols0123, 12);  // 03 13 23 33 ...

  *d4 = cols4567;                      // 04 14 24 34 ...
  *d5 = _mm_srli_si128(cols4567, 4);   // 05 15 25 35 ...
  *d6 = _mm_srli_si128(cols4567, 8);   // 06 16 26 36 ...
  *d7 = _mm_srli_si128(cols4567, 12);  // 07 17 27 37 ...
}
297 
// Transposes columns 0..3 of an 8x8 block of 8-bit values.
//
//   input (low 8 bytes of each register)   output (low 8 bytes of each)
//   x0  00 01 02 03 04 05 06 07            d0  00 10 20 30 40 50 60 70
//   x1  10 11 12 13 14 15 16 17            d1  01 11 21 31 41 51 61 71
//   x2  20 21 22 23 24 25 26 27            d2  02 12 22 32 42 52 62 72
//   x3  30 31 32 33 34 35 36 37            d3  03 13 23 33 43 53 63 73
//   x4  40 41 42 43 44 45 46 47
//   x5  50 51 52 53 54 55 56 57
//   x6  60 61 62 63 64 65 66 67
//   x7  70 71 72 73 74 75 76 77
//
// Bytes 8..15 of the outputs are not part of the result: d0 and d2 carry the
// next column there, d1 and d3 have zeros shifted in.
static INLINE void transpose8x8_low_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
                                         __m128i *x3, __m128i *x4, __m128i *x5,
                                         __m128i *x6, __m128i *x7, __m128i *d0,
                                         __m128i *d1, __m128i *d2,
                                         __m128i *d3) {
  // Stage 1: byte-interleave adjacent rows.
  // rows01: 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
  // rows23: 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
  // rows45: 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
  // rows67: 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
  const __m128i rows01 = _mm_unpacklo_epi8(*x0, *x1);
  const __m128i rows23 = _mm_unpacklo_epi8(*x2, *x3);
  const __m128i rows45 = _mm_unpacklo_epi8(*x4, *x5);
  const __m128i rows67 = _mm_unpacklo_epi8(*x6, *x7);

  // Stage 2: columns 0..3 of rows 0..3 ("top") and of rows 4..7 ("bot").
  // top: 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
  // bot: 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
  const __m128i top = _mm_unpacklo_epi16(rows01, rows23);
  const __m128i bot = _mm_unpacklo_epi16(rows45, rows67);

  // Stage 3: join top and bottom; each result holds two columns, and the odd
  // column is then slid down by 8 bytes into its own output.
  *d0 = _mm_unpacklo_epi32(top, bot);  // 00 10 .. 70 | 01 11 .. 71
  *d1 = _mm_srli_si128(*d0, 8);        // 01 11 .. 71 | zeros
  *d2 = _mm_unpackhi_epi32(top, bot);  // 02 12 .. 72 | 03 13 .. 73
  *d3 = _mm_srli_si128(*d2, 8);        // 03 13 .. 73 | zeros
}
344 
// Transposes an 8x8 block of 8-bit values, packing two output columns into
// each 128-bit result.
//
//   input (low 8 bytes of each register)
//   x0  00 01 02 03 04 05 06 07
//   x1  10 11 12 13 14 15 16 17
//   x2  20 21 22 23 24 25 26 27
//   x3  30 31 32 33 34 35 36 37
//   x4  40 41 42 43 44 45 46 47
//   x5  50 51 52 53 54 55 56 57
//   x6  60 61 62 63 64 65 66 67
//   x7  70 71 72 73 74 75 76 77
//
//   output (low 8 bytes | high 8 bytes)
//   d0d1  00 10 20 30 40 50 60 70 | 01 11 21 31 41 51 61 71
//   d2d3  02 12 22 32 42 52 62 72 | 03 13 23 33 43 53 63 73
//   d4d5  04 14 24 34 44 54 64 74 | 05 15 25 35 45 55 65 75
//   d6d7  06 16 26 36 46 56 66 76 | 07 17 27 37 47 57 67 77
static INLINE void transpose8x8_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
                                     __m128i *x3, __m128i *x4, __m128i *x5,
                                     __m128i *x6, __m128i *x7, __m128i *d0d1,
                                     __m128i *d2d3, __m128i *d4d5,
                                     __m128i *d6d7) {
  // Stage 1: byte-interleave adjacent rows.
  // rows01: 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
  // rows23: 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
  // rows45: 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
  // rows67: 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
  const __m128i rows01 = _mm_unpacklo_epi8(*x0, *x1);
  const __m128i rows23 = _mm_unpacklo_epi8(*x2, *x3);
  const __m128i rows45 = _mm_unpacklo_epi8(*x4, *x5);
  const __m128i rows67 = _mm_unpacklo_epi8(*x6, *x7);

  // Stage 2: four half-columns per register. "top" = rows 0..3,
  // "bot" = rows 4..7.
  // top_c0123: 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
  // bot_c0123: 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
  // top_c4567: 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
  // bot_c4567: 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
  const __m128i top_c0123 = _mm_unpacklo_epi16(rows01, rows23);
  const __m128i bot_c0123 = _mm_unpacklo_epi16(rows45, rows67);
  const __m128i top_c4567 = _mm_unpackhi_epi16(rows01, rows23);
  const __m128i bot_c4567 = _mm_unpackhi_epi16(rows45, rows67);

  // Stage 3: join top and bottom halves; two full columns per output.
  *d0d1 = _mm_unpacklo_epi32(top_c0123, bot_c0123);
  *d2d3 = _mm_unpackhi_epi32(top_c0123, bot_c0123);
  *d4d5 = _mm_unpacklo_epi32(top_c4567, bot_c4567);
  *d6d7 = _mm_unpackhi_epi32(top_c4567, bot_c4567);
}
391 
// Transposes a 16-row by 8-column block of 8-bit values.
//
// Inputs:  x0..x15 hold one row each; only the low 8 bytes of every register
//          are consumed.
// Outputs: dc (c = 0..7) receives column c as 16 bytes: byte r of dc is
//          byte c of xr.
static INLINE void transpose16x8_8x16_sse2(
    __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
    __m128i *x5, __m128i *x6, __m128i *x7, __m128i *x8, __m128i *x9,
    __m128i *x10, __m128i *x11, __m128i *x12, __m128i *x13, __m128i *x14,
    __m128i *x15, __m128i *d0, __m128i *d1, __m128i *d2, __m128i *d3,
    __m128i *d4, __m128i *d5, __m128i *d6, __m128i *d7) {
  // Stage 1: byte-interleave adjacent rows. "top" = rows 0..7,
  // "bot" = rows 8..15.
  const __m128i top01 = _mm_unpacklo_epi8(*x0, *x1);
  const __m128i top23 = _mm_unpacklo_epi8(*x2, *x3);
  const __m128i top45 = _mm_unpacklo_epi8(*x4, *x5);
  const __m128i top67 = _mm_unpacklo_epi8(*x6, *x7);
  const __m128i bot01 = _mm_unpacklo_epi8(*x8, *x9);
  const __m128i bot23 = _mm_unpacklo_epi8(*x10, *x11);
  const __m128i bot45 = _mm_unpacklo_epi8(*x12, *x13);
  const __m128i bot67 = _mm_unpacklo_epi8(*x14, *x15);

  // ---- Columns 0..3 ----
  // Stage 2: 16-bit interleave gives four rows of columns 0..3 per register.
  const __m128i top_lo_a = _mm_unpacklo_epi16(top01, top23);
  const __m128i top_lo_b = _mm_unpacklo_epi16(top45, top67);
  const __m128i bot_lo_a = _mm_unpacklo_epi16(bot01, bot23);
  const __m128i bot_lo_b = _mm_unpacklo_epi16(bot45, bot67);

  // Stage 3: 32-bit interleave gives eight rows of two columns per register.
  const __m128i top_c01 = _mm_unpacklo_epi32(top_lo_a, top_lo_b);
  const __m128i top_c23 = _mm_unpackhi_epi32(top_lo_a, top_lo_b);
  const __m128i bot_c01 = _mm_unpacklo_epi32(bot_lo_a, bot_lo_b);
  const __m128i bot_c23 = _mm_unpackhi_epi32(bot_lo_a, bot_lo_b);

  // Stage 4: 64-bit interleave stitches rows 0..7 and 8..15 of one column.
  *d0 = _mm_unpacklo_epi64(top_c01, bot_c01);
  *d1 = _mm_unpackhi_epi64(top_c01, bot_c01);
  *d2 = _mm_unpacklo_epi64(top_c23, bot_c23);
  *d3 = _mm_unpackhi_epi64(top_c23, bot_c23);

  // ---- Columns 4..7: same three stages on the high halves ----
  const __m128i top_hi_a = _mm_unpackhi_epi16(top01, top23);
  const __m128i top_hi_b = _mm_unpackhi_epi16(top45, top67);
  const __m128i bot_hi_a = _mm_unpackhi_epi16(bot01, bot23);
  const __m128i bot_hi_b = _mm_unpackhi_epi16(bot45, bot67);

  const __m128i top_c45 = _mm_unpacklo_epi32(top_hi_a, top_hi_b);
  const __m128i top_c67 = _mm_unpackhi_epi32(top_hi_a, top_hi_b);
  const __m128i bot_c45 = _mm_unpacklo_epi32(bot_hi_a, bot_hi_b);
  const __m128i bot_c67 = _mm_unpackhi_epi32(bot_hi_a, bot_hi_b);

  *d4 = _mm_unpacklo_epi64(top_c45, bot_c45);
  *d5 = _mm_unpackhi_epi64(top_c45, bot_c45);
  *d6 = _mm_unpacklo_epi64(top_c67, bot_c67);
  *d7 = _mm_unpackhi_epi64(top_c67, bot_c67);
}
443 
// Transposes an 8-row by 16-column block of 8-bit values.
//
// Inputs:  x0..x7 hold one 16-byte row each.
// Outputs: eight registers, each carrying two 8-byte columns. Note the
//          pairing actually produced: the low half holds column k and the
//          high half holds column k + 8 (not k + 1):
//
//            d0d1   = col 0 | col 8      d8d9   = col 4 | col 12
//            d2d3   = col 1 | col 9      d10d11 = col 5 | col 13
//            d4d5   = col 2 | col 10     d12d13 = col 6 | col 14
//            d6d7   = col 3 | col 11     d14d15 = col 7 | col 15
//
//          Within each half, byte r comes from row xr.
static INLINE void transpose8x16_16x8_sse2(
    __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
    __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0d1, __m128i *d2d3,
    __m128i *d4d5, __m128i *d6d7, __m128i *d8d9, __m128i *d10d11,
    __m128i *d12d13, __m128i *d14d15) {
  // Stage 1: byte-interleave adjacent rows. "left" = columns 0..7,
  // "right" = columns 8..15.
  const __m128i left01 = _mm_unpacklo_epi8(*x0, *x1);
  const __m128i left23 = _mm_unpacklo_epi8(*x2, *x3);
  const __m128i left45 = _mm_unpacklo_epi8(*x4, *x5);
  const __m128i left67 = _mm_unpacklo_epi8(*x6, *x7);
  const __m128i right01 = _mm_unpackhi_epi8(*x0, *x1);
  const __m128i right23 = _mm_unpackhi_epi8(*x2, *x3);
  const __m128i right45 = _mm_unpackhi_epi8(*x4, *x5);
  const __m128i right67 = _mm_unpackhi_epi8(*x6, *x7);

  // ---- Columns 0..3 and 8..11 ----
  // Stage 2: 16-bit interleave gives four rows of four columns per register.
  const __m128i left_lo_a = _mm_unpacklo_epi16(left01, left23);
  const __m128i left_lo_b = _mm_unpacklo_epi16(left45, left67);
  const __m128i right_lo_a = _mm_unpacklo_epi16(right01, right23);
  const __m128i right_lo_b = _mm_unpacklo_epi16(right45, right67);

  // Stage 3: 32-bit interleave gives eight rows of two columns per register.
  const __m128i c0_c1 = _mm_unpacklo_epi32(left_lo_a, left_lo_b);
  const __m128i c2_c3 = _mm_unpackhi_epi32(left_lo_a, left_lo_b);
  const __m128i c8_c9 = _mm_unpacklo_epi32(right_lo_a, right_lo_b);
  const __m128i c10_c11 = _mm_unpackhi_epi32(right_lo_a, right_lo_b);

  // Stage 4: 64-bit interleave pairs a left column with its right partner.
  *d0d1 = _mm_unpacklo_epi64(c0_c1, c8_c9);      // col 0 | col 8
  *d2d3 = _mm_unpackhi_epi64(c0_c1, c8_c9);      // col 1 | col 9
  *d4d5 = _mm_unpacklo_epi64(c2_c3, c10_c11);    // col 2 | col 10
  *d6d7 = _mm_unpackhi_epi64(c2_c3, c10_c11);    // col 3 | col 11

  // ---- Columns 4..7 and 12..15: same three stages on the high halves ----
  const __m128i left_hi_a = _mm_unpackhi_epi16(left01, left23);
  const __m128i left_hi_b = _mm_unpackhi_epi16(left45, left67);
  const __m128i right_hi_a = _mm_unpackhi_epi16(right01, right23);
  const __m128i right_hi_b = _mm_unpackhi_epi16(right45, right67);

  const __m128i c4_c5 = _mm_unpacklo_epi32(left_hi_a, left_hi_b);
  const __m128i c6_c7 = _mm_unpackhi_epi32(left_hi_a, left_hi_b);
  const __m128i c12_c13 = _mm_unpacklo_epi32(right_hi_a, right_hi_b);
  const __m128i c14_c15 = _mm_unpackhi_epi32(right_hi_a, right_hi_b);

  *d8d9 = _mm_unpacklo_epi64(c4_c5, c12_c13);    // col 4 | col 12
  *d10d11 = _mm_unpackhi_epi64(c4_c5, c12_c13);  // col 5 | col 13
  *d12d13 = _mm_unpacklo_epi64(c6_c7, c14_c15);  // col 6 | col 14
  *d14d15 = _mm_unpackhi_epi64(c6_c7, c14_c15);  // col 7 | col 15
}
494 
495 #endif  // AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_
496