1 /*
2  *  Copyright (c) 2018, Alliance for Open Media. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #ifndef AOM_AV1_COMMON_ARM_TRANSPOSE_NEON_H_
12 #define AOM_AV1_COMMON_ARM_TRANSPOSE_NEON_H_
13 
14 #include <arm_neon.h>
15 
// Transposes an 8x8 block of 8-bit elements in place.
//
// In the diagrams below "rc" denotes the element at row r, column c.
// On entry *a0..*a7 are rows 0..7:
//   *a0: 00 01 02 03 04 05 06 07
//   ...
//   *a7: 70 71 72 73 74 75 76 77
// On return *a0..*a7 are columns 0..7:
//   *a0: 00 10 20 30 40 50 60 70
//   ...
//   *a7: 07 17 27 37 47 57 67 77
static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
                                    uint8x8_t *a3, uint8x8_t *a4, uint8x8_t *a5,
                                    uint8x8_t *a6, uint8x8_t *a7) {
  // Put row k in the low half and row k + 4 in the high half of one 128-bit
  // vector so that both halves of the block are shuffled together.
  const uint8x16_t rows04 = vcombine_u8(*a0, *a4);
  const uint8x16_t rows15 = vcombine_u8(*a1, *a5);
  const uint8x16_t rows26 = vcombine_u8(*a2, *a6);
  const uint8x16_t rows37 = vcombine_u8(*a3, *a7);

  // 8-bit transpose step:
  // tr8_01.val[0]: 00 10 02 12 04 14 06 16  40 50 42 52 44 54 46 56
  // tr8_01.val[1]: 01 11 03 13 05 15 07 17  41 51 43 53 45 55 47 57
  // tr8_23.val[0]: 20 30 22 32 24 34 26 36  60 70 62 72 64 74 66 76
  // tr8_23.val[1]: 21 31 23 33 25 35 27 37  61 71 63 73 65 75 67 77
  const uint8x16x2_t tr8_01 = vtrnq_u8(rows04, rows15);
  const uint8x16x2_t tr8_23 = vtrnq_u8(rows26, rows37);

  // 16-bit transpose step:
  // tr16_even.val[0]: 00 10 20 30 04 14 24 34  40 50 60 70 44 54 64 74
  // tr16_even.val[1]: 02 12 22 32 06 16 26 36  42 52 62 72 46 56 66 76
  // tr16_odd.val[0]:  01 11 21 31 05 15 25 35  41 51 61 71 45 55 65 75
  // tr16_odd.val[1]:  03 13 23 33 07 17 27 37  43 53 63 73 47 57 67 77
  const uint16x8x2_t tr16_even =
      vtrnq_u16(vreinterpretq_u16_u8(tr8_01.val[0]),
                vreinterpretq_u16_u8(tr8_23.val[0]));
  const uint16x8x2_t tr16_odd =
      vtrnq_u16(vreinterpretq_u16_u8(tr8_01.val[1]),
                vreinterpretq_u16_u8(tr8_23.val[1]));

  // 32-bit unzip step; afterwards every 64-bit half is one finished column:
  // uz_a.val[0]: 00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71
  // uz_a.val[1]: 04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75
  // uz_b.val[0]: 02 12 22 32 42 52 62 72  03 13 23 33 43 53 63 73
  // uz_b.val[1]: 06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77
  const uint32x4x2_t uz_a =
      vuzpq_u32(vreinterpretq_u32_u16(tr16_even.val[0]),
                vreinterpretq_u32_u16(tr16_odd.val[0]));
  const uint32x4x2_t uz_b =
      vuzpq_u32(vreinterpretq_u32_u16(tr16_even.val[1]),
                vreinterpretq_u32_u16(tr16_odd.val[1]));

  // Columns 0, 1, 4, 5 come from uz_a; columns 2, 3, 6, 7 come from uz_b.
  const uint32x4_t cols01 = uz_a.val[0];
  const uint32x4_t cols23 = uz_b.val[0];
  const uint32x4_t cols45 = uz_a.val[1];
  const uint32x4_t cols67 = uz_b.val[1];

  *a0 = vreinterpret_u8_u32(vget_low_u32(cols01));
  *a1 = vreinterpret_u8_u32(vget_high_u32(cols01));
  *a2 = vreinterpret_u8_u32(vget_low_u32(cols23));
  *a3 = vreinterpret_u8_u32(vget_high_u32(cols23));
  *a4 = vreinterpret_u8_u32(vget_low_u32(cols45));
  *a5 = vreinterpret_u8_u32(vget_high_u32(cols45));
  *a6 = vreinterpret_u8_u32(vget_low_u32(cols67));
  *a7 = vreinterpret_u8_u32(vget_high_u32(cols67));
}
69 
// Transposes, in place, the two side-by-side 4x4 tiles held in four rows of
// eight 8-bit elements ("rc" = row r, column c).
//
// On entry:
//   *a0: 00 01 02 03 04 05 06 07
//   *a1: 10 11 12 13 14 15 16 17
//   *a2: 20 21 22 23 24 25 26 27
//   *a3: 30 31 32 33 34 35 36 37
// On return:
//   *a0: 00 10 20 30 04 14 24 34
//   *a1: 01 11 21 31 05 15 25 35
//   *a2: 02 12 22 32 06 16 26 36
//   *a3: 03 13 23 33 07 17 27 37
static INLINE void transpose_u8_8x4(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
                                    uint8x8_t *a3) {
  // 8-bit transpose step:
  // upper.val[0]: 00 10 02 12 04 14 06 16
  // upper.val[1]: 01 11 03 13 05 15 07 17
  // lower.val[0]: 20 30 22 32 24 34 26 36
  // lower.val[1]: 21 31 23 33 25 35 27 37
  const uint8x8x2_t upper = vtrn_u8(*a0, *a1);
  const uint8x8x2_t lower = vtrn_u8(*a2, *a3);

  const uint16x4_t upper_even = vreinterpret_u16_u8(upper.val[0]);
  const uint16x4_t lower_even = vreinterpret_u16_u8(lower.val[0]);
  const uint16x4_t upper_odd = vreinterpret_u16_u8(upper.val[1]);
  const uint16x4_t lower_odd = vreinterpret_u16_u8(lower.val[1]);

  // 16-bit transpose step:
  // even_cols.val[0]: 00 10 20 30 04 14 24 34
  // even_cols.val[1]: 02 12 22 32 06 16 26 36
  // odd_cols.val[0]:  01 11 21 31 05 15 25 35
  // odd_cols.val[1]:  03 13 23 33 07 17 27 37
  const uint16x4x2_t even_cols = vtrn_u16(upper_even, lower_even);
  const uint16x4x2_t odd_cols = vtrn_u16(upper_odd, lower_odd);

  *a0 = vreinterpret_u8_u16(even_cols.val[0]);
  *a1 = vreinterpret_u8_u16(odd_cols.val[0]);
  *a2 = vreinterpret_u8_u16(even_cols.val[1]);
  *a3 = vreinterpret_u8_u16(odd_cols.val[1]);
}
102 
// Transposes, in place, a 4x4 block of 8-bit elements stored two rows per
// vector ("rc" = row r, column c).
//
// On entry:
//   *a0: 00 01 02 03  10 11 12 13
//   *a1: 20 21 22 23  30 31 32 33
// On return (note the column order: 0 and 2 in *a0, 1 and 3 in *a1):
//   *a0: 00 10 20 30  02 12 22 32
//   *a1: 01 11 21 31  03 13 23 33
static INLINE void transpose_u8_4x4(uint8x8_t *a0, uint8x8_t *a1) {
  const uint16x4_t rows01 = vreinterpret_u16_u8(*a0);
  const uint16x4_t rows23 = vreinterpret_u16_u8(*a1);

  // 16-bit transpose step:
  // step16.val[0]: 00 01 20 21  10 11 30 31
  // step16.val[1]: 02 03 22 23  12 13 32 33
  const uint16x4x2_t step16 = vtrn_u16(rows01, rows23);

  // 32-bit transpose step:
  // step32.val[0]: 00 01 20 21  02 03 22 23
  // step32.val[1]: 10 11 30 31  12 13 32 33
  const uint32x2x2_t step32 = vtrn_u32(vreinterpret_u32_u16(step16.val[0]),
                                       vreinterpret_u32_u16(step16.val[1]));

  // 8-bit transpose step:
  // step8.val[0]: 00 10 20 30  02 12 22 32
  // step8.val[1]: 01 11 21 31  03 13 23 33
  const uint8x8x2_t step8 = vtrn_u8(vreinterpret_u8_u32(step32.val[0]),
                                    vreinterpret_u8_u32(step32.val[1]));

  *a0 = step8.val[0];
  *a1 = step8.val[1];
}
131 
// Transposes eight rows of four 8-bit elements into four rows of eight.
//
// Only the low four elements of each input vector are used ("XX" below is
// ignored). Rows 0..3 are read through a0..a3, rows 4..7 are passed by value
// in a4..a7, and the four result rows are written back through a0..a3.
//
// On entry ("rc" = row r, column c):
//   *a0: 00 01 02 03 XX XX XX XX
//   *a1: 10 11 12 13 XX XX XX XX
//   *a2: 20 21 22 23 XX XX XX XX
//   *a3: 30 31 32 33 XX XX XX XX
//    a4: 40 41 42 43 XX XX XX XX
//    a5: 50 51 52 53 XX XX XX XX
//    a6: 60 61 62 63 XX XX XX XX
//    a7: 70 71 72 73 XX XX XX XX
// On return:
//   *a0: 00 10 20 30 40 50 60 70
//   *a1: 01 11 21 31 41 51 61 71
//   *a2: 02 12 22 32 42 52 62 72
//   *a3: 03 13 23 33 43 53 63 73
static INLINE void transpose_u8_4x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
                                    uint8x8_t *a3, const uint8x8_t a4,
                                    const uint8x8_t a5, const uint8x8_t a6,
                                    const uint8x8_t a7) {
  // 32-bit transpose step; only val[0] of each result is needed:
  // pair04.val[0]: 00 01 02 03 40 41 42 43
  // pair15.val[0]: 10 11 12 13 50 51 52 53
  // pair26.val[0]: 20 21 22 23 60 61 62 63
  // pair37.val[0]: 30 31 32 33 70 71 72 73
  const uint32x2x2_t pair04 =
      vtrn_u32(vreinterpret_u32_u8(*a0), vreinterpret_u32_u8(a4));
  const uint32x2x2_t pair15 =
      vtrn_u32(vreinterpret_u32_u8(*a1), vreinterpret_u32_u8(a5));
  const uint32x2x2_t pair26 =
      vtrn_u32(vreinterpret_u32_u8(*a2), vreinterpret_u32_u8(a6));
  const uint32x2x2_t pair37 =
      vtrn_u32(vreinterpret_u32_u8(*a3), vreinterpret_u32_u8(a7));

  const uint16x4_t rows04 = vreinterpret_u16_u32(pair04.val[0]);
  const uint16x4_t rows15 = vreinterpret_u16_u32(pair15.val[0]);
  const uint16x4_t rows26 = vreinterpret_u16_u32(pair26.val[0]);
  const uint16x4_t rows37 = vreinterpret_u16_u32(pair37.val[0]);

  // 16-bit transpose step:
  // even_rows.val[0]: 00 01 20 21 40 41 60 61
  // even_rows.val[1]: 02 03 22 23 42 43 62 63
  // odd_rows.val[0]:  10 11 30 31 50 51 70 71
  // odd_rows.val[1]:  12 13 32 33 52 53 72 73
  const uint16x4x2_t even_rows = vtrn_u16(rows04, rows26);
  const uint16x4x2_t odd_rows = vtrn_u16(rows15, rows37);

  // 8-bit transpose step:
  // cols01.val[0]: 00 10 20 30 40 50 60 70
  // cols01.val[1]: 01 11 21 31 41 51 61 71
  // cols23.val[0]: 02 12 22 32 42 52 62 72
  // cols23.val[1]: 03 13 23 33 43 53 63 73
  const uint8x8x2_t cols01 = vtrn_u8(vreinterpret_u8_u16(even_rows.val[0]),
                                     vreinterpret_u8_u16(odd_rows.val[0]));
  const uint8x8x2_t cols23 = vtrn_u8(vreinterpret_u8_u16(even_rows.val[1]),
                                     vreinterpret_u8_u16(odd_rows.val[1]));

  *a0 = cols01.val[0];
  *a1 = cols01.val[1];
  *a2 = cols23.val[0];
  *a3 = cols23.val[1];
}
187 
// Transposes eight rows of four 16-bit elements into four rows of eight.
//
// The inputs *a0..*a7 are only read; the results are written to *o0..*o3.
//
// Inputs ("rc" = row r, column c):
//   *a0: 00 01 02 03
//   *a1: 10 11 12 13
//   ...
//   *a7: 70 71 72 73
// Outputs:
//   *o0: 00 10 20 30 40 50 60 70
//   *o1: 01 11 21 31 41 51 61 71
//   *o2: 02 12 22 32 42 52 62 72
//   *o3: 03 13 23 33 43 53 63 73
static INLINE void transpose_u16_4x8(uint16x4_t *a0, uint16x4_t *a1,
                                     uint16x4_t *a2, uint16x4_t *a3,
                                     uint16x4_t *a4, uint16x4_t *a5,
                                     uint16x4_t *a6, uint16x4_t *a7,
                                     uint16x8_t *o0, uint16x8_t *o1,
                                     uint16x8_t *o2, uint16x8_t *o3) {
  // 16-bit transpose step on each pair of adjacent rows:
  // t01.val[0]: 00 10 02 12      t45.val[0]: 40 50 42 52
  // t01.val[1]: 01 11 03 13      t45.val[1]: 41 51 43 53
  // t23.val[0]: 20 30 22 32      t67.val[0]: 60 70 62 72
  // t23.val[1]: 21 31 23 33      t67.val[1]: 61 71 63 73
  const uint16x4x2_t t01 = vtrn_u16(*a0, *a1);
  const uint16x4x2_t t23 = vtrn_u16(*a2, *a3);
  const uint16x4x2_t t45 = vtrn_u16(*a4, *a5);
  const uint16x4x2_t t67 = vtrn_u16(*a6, *a7);

  // 32-bit transpose step; "top" covers rows 0..3, "bot" covers rows 4..7:
  // top_even.val[0]: 00 10 20 30      bot_even.val[0]: 40 50 60 70
  // top_even.val[1]: 02 12 22 32      bot_even.val[1]: 42 52 62 72
  // top_odd.val[0]:  01 11 21 31      bot_odd.val[0]:  41 51 61 71
  // top_odd.val[1]:  03 13 23 33      bot_odd.val[1]:  43 53 63 73
  const uint32x2x2_t top_even = vtrn_u32(vreinterpret_u32_u16(t01.val[0]),
                                         vreinterpret_u32_u16(t23.val[0]));
  const uint32x2x2_t top_odd = vtrn_u32(vreinterpret_u32_u16(t01.val[1]),
                                        vreinterpret_u32_u16(t23.val[1]));
  const uint32x2x2_t bot_even = vtrn_u32(vreinterpret_u32_u16(t45.val[0]),
                                         vreinterpret_u32_u16(t67.val[0]));
  const uint32x2x2_t bot_odd = vtrn_u32(vreinterpret_u32_u16(t45.val[1]),
                                        vreinterpret_u32_u16(t67.val[1]));

  // Join the top and bottom half of each column into one 128-bit row.
  *o0 = vcombine_u16(vreinterpret_u16_u32(top_even.val[0]),
                     vreinterpret_u16_u32(bot_even.val[0]));
  *o1 = vcombine_u16(vreinterpret_u16_u32(top_odd.val[0]),
                     vreinterpret_u16_u32(bot_odd.val[0]));
  *o2 = vcombine_u16(vreinterpret_u16_u32(top_even.val[1]),
                     vreinterpret_u16_u32(bot_even.val[1]));
  *o3 = vcombine_u16(vreinterpret_u16_u32(top_odd.val[1]),
                     vreinterpret_u16_u32(bot_odd.val[1]));
}
252 
// Transposes an 8x8 block of unsigned 16-bit elements in place.
//
// On entry *a0..*a7 are rows 0..7 ("rc" = row r, column c):
//   *a0: 00 01 02 03 04 05 06 07
//   ...
//   *a7: 70 71 72 73 74 75 76 77
// On return *a0..*a7 are columns 0..7:
//   *a0: 00 10 20 30 40 50 60 70
//   ...
//   *a7: 07 17 27 37 47 57 67 77
static INLINE void transpose_u16_8x8(uint16x8_t *a0, uint16x8_t *a1,
                                     uint16x8_t *a2, uint16x8_t *a3,
                                     uint16x8_t *a4, uint16x8_t *a5,
                                     uint16x8_t *a6, uint16x8_t *a7) {
  // 16-bit transpose step on each pair of adjacent rows:
  // t01.val[0]: 00 10 02 12 04 14 06 16
  // t01.val[1]: 01 11 03 13 05 15 07 17
  // t23.val[0]: 20 30 22 32 24 34 26 36
  // t23.val[1]: 21 31 23 33 25 35 27 37
  // t45.val[0]: 40 50 42 52 44 54 46 56
  // t45.val[1]: 41 51 43 53 45 55 47 57
  // t67.val[0]: 60 70 62 72 64 74 66 76
  // t67.val[1]: 61 71 63 73 65 75 67 77
  const uint16x8x2_t t01 = vtrnq_u16(*a0, *a1);
  const uint16x8x2_t t23 = vtrnq_u16(*a2, *a3);
  const uint16x8x2_t t45 = vtrnq_u16(*a4, *a5);
  const uint16x8x2_t t67 = vtrnq_u16(*a6, *a7);

  // 32-bit transpose step; "top" covers rows 0..3, "bot" covers rows 4..7:
  // top_even.val[0]: 00 10 20 30 04 14 24 34
  // top_even.val[1]: 02 12 22 32 06 16 26 36
  // top_odd.val[0]:  01 11 21 31 05 15 25 35
  // top_odd.val[1]:  03 13 23 33 07 17 27 37
  // bot_even.val[0]: 40 50 60 70 44 54 64 74
  // bot_even.val[1]: 42 52 62 72 46 56 66 76
  // bot_odd.val[0]:  41 51 61 71 45 55 65 75
  // bot_odd.val[1]:  43 53 63 73 47 57 67 77
  const uint32x4x2_t top_even = vtrnq_u32(vreinterpretq_u32_u16(t01.val[0]),
                                          vreinterpretq_u32_u16(t23.val[0]));
  const uint32x4x2_t top_odd = vtrnq_u32(vreinterpretq_u32_u16(t01.val[1]),
                                         vreinterpretq_u32_u16(t23.val[1]));
  const uint32x4x2_t bot_even = vtrnq_u32(vreinterpretq_u32_u16(t45.val[0]),
                                          vreinterpretq_u32_u16(t67.val[0]));
  const uint32x4x2_t bot_odd = vtrnq_u32(vreinterpretq_u32_u16(t45.val[1]),
                                         vreinterpretq_u32_u16(t67.val[1]));

  // Name each vector after the two columns it carries (low half | high half).
  const uint16x8_t top_c04 = vreinterpretq_u16_u32(top_even.val[0]);
  const uint16x8_t bot_c04 = vreinterpretq_u16_u32(bot_even.val[0]);
  const uint16x8_t top_c26 = vreinterpretq_u16_u32(top_even.val[1]);
  const uint16x8_t bot_c26 = vreinterpretq_u16_u32(bot_even.val[1]);
  const uint16x8_t top_c15 = vreinterpretq_u16_u32(top_odd.val[0]);
  const uint16x8_t bot_c15 = vreinterpretq_u16_u32(bot_odd.val[0]);
  const uint16x8_t top_c37 = vreinterpretq_u16_u32(top_odd.val[1]);
  const uint16x8_t bot_c37 = vreinterpretq_u16_u32(bot_odd.val[1]);

  // Stitch the top and bottom halves of every column together.
  *a0 = vcombine_u16(vget_low_u16(top_c04), vget_low_u16(bot_c04));
  *a4 = vcombine_u16(vget_high_u16(top_c04), vget_high_u16(bot_c04));

  *a2 = vcombine_u16(vget_low_u16(top_c26), vget_low_u16(bot_c26));
  *a6 = vcombine_u16(vget_high_u16(top_c26), vget_high_u16(bot_c26));

  *a1 = vcombine_u16(vget_low_u16(top_c15), vget_low_u16(bot_c15));
  *a5 = vcombine_u16(vget_high_u16(top_c15), vget_high_u16(bot_c15));

  *a3 = vcombine_u16(vget_low_u16(top_c37), vget_low_u16(bot_c37));
  *a7 = vcombine_u16(vget_high_u16(top_c37), vget_high_u16(bot_c37));
}
320 
// Transposes an 8x8 block of signed 16-bit elements in place.
//
// On entry *a0..*a7 are rows 0..7 ("rc" = row r, column c):
//   *a0: 00 01 02 03 04 05 06 07
//   ...
//   *a7: 70 71 72 73 74 75 76 77
// On return *a0..*a7 are columns 0..7:
//   *a0: 00 10 20 30 40 50 60 70
//   ...
//   *a7: 07 17 27 37 47 57 67 77
static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1,
                                     int16x8_t *a2, int16x8_t *a3,
                                     int16x8_t *a4, int16x8_t *a5,
                                     int16x8_t *a6, int16x8_t *a7) {
  // 16-bit transpose step on each pair of adjacent rows:
  // t01.val[0]: 00 10 02 12 04 14 06 16
  // t01.val[1]: 01 11 03 13 05 15 07 17
  // t23.val[0]: 20 30 22 32 24 34 26 36
  // t23.val[1]: 21 31 23 33 25 35 27 37
  // t45.val[0]: 40 50 42 52 44 54 46 56
  // t45.val[1]: 41 51 43 53 45 55 47 57
  // t67.val[0]: 60 70 62 72 64 74 66 76
  // t67.val[1]: 61 71 63 73 65 75 67 77
  const int16x8x2_t t01 = vtrnq_s16(*a0, *a1);
  const int16x8x2_t t23 = vtrnq_s16(*a2, *a3);
  const int16x8x2_t t45 = vtrnq_s16(*a4, *a5);
  const int16x8x2_t t67 = vtrnq_s16(*a6, *a7);

  // 32-bit transpose step; "top" covers rows 0..3, "bot" covers rows 4..7:
  // top_even.val[0]: 00 10 20 30 04 14 24 34
  // top_even.val[1]: 02 12 22 32 06 16 26 36
  // top_odd.val[0]:  01 11 21 31 05 15 25 35
  // top_odd.val[1]:  03 13 23 33 07 17 27 37
  // bot_even.val[0]: 40 50 60 70 44 54 64 74
  // bot_even.val[1]: 42 52 62 72 46 56 66 76
  // bot_odd.val[0]:  41 51 61 71 45 55 65 75
  // bot_odd.val[1]:  43 53 63 73 47 57 67 77
  const int32x4x2_t top_even = vtrnq_s32(vreinterpretq_s32_s16(t01.val[0]),
                                         vreinterpretq_s32_s16(t23.val[0]));
  const int32x4x2_t top_odd = vtrnq_s32(vreinterpretq_s32_s16(t01.val[1]),
                                        vreinterpretq_s32_s16(t23.val[1]));
  const int32x4x2_t bot_even = vtrnq_s32(vreinterpretq_s32_s16(t45.val[0]),
                                         vreinterpretq_s32_s16(t67.val[0]));
  const int32x4x2_t bot_odd = vtrnq_s32(vreinterpretq_s32_s16(t45.val[1]),
                                        vreinterpretq_s32_s16(t67.val[1]));

  // Name each vector after the two columns it carries (low half | high half).
  const int16x8_t top_c04 = vreinterpretq_s16_s32(top_even.val[0]);
  const int16x8_t bot_c04 = vreinterpretq_s16_s32(bot_even.val[0]);
  const int16x8_t top_c26 = vreinterpretq_s16_s32(top_even.val[1]);
  const int16x8_t bot_c26 = vreinterpretq_s16_s32(bot_even.val[1]);
  const int16x8_t top_c15 = vreinterpretq_s16_s32(top_odd.val[0]);
  const int16x8_t bot_c15 = vreinterpretq_s16_s32(bot_odd.val[0]);
  const int16x8_t top_c37 = vreinterpretq_s16_s32(top_odd.val[1]);
  const int16x8_t bot_c37 = vreinterpretq_s16_s32(bot_odd.val[1]);

  // Stitch the top and bottom halves of every column together.
  *a0 = vcombine_s16(vget_low_s16(top_c04), vget_low_s16(bot_c04));
  *a4 = vcombine_s16(vget_high_s16(top_c04), vget_high_s16(bot_c04));

  *a2 = vcombine_s16(vget_low_s16(top_c26), vget_low_s16(bot_c26));
  *a6 = vcombine_s16(vget_high_s16(top_c26), vget_high_s16(bot_c26));

  *a1 = vcombine_s16(vget_low_s16(top_c15), vget_low_s16(bot_c15));
  *a5 = vcombine_s16(vget_high_s16(top_c15), vget_high_s16(bot_c15));

  *a3 = vcombine_s16(vget_low_s16(top_c37), vget_low_s16(bot_c37));
  *a7 = vcombine_s16(vget_high_s16(top_c37), vget_high_s16(bot_c37));
}
388 
// 64-bit transpose of two 128-bit vectors, returned as 16-bit lanes:
//   result.val[0] = { low half of a0,  low half of a1 }
//   result.val[1] = { high half of a0, high half of a1 }
static INLINE int16x8x2_t vpx_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) {
  const int16x4_t a0_lo = vreinterpret_s16_s32(vget_low_s32(a0));
  const int16x4_t a1_lo = vreinterpret_s16_s32(vget_low_s32(a1));
  const int16x4_t a0_hi = vreinterpret_s16_s32(vget_high_s32(a0));
  const int16x4_t a1_hi = vreinterpret_s16_s32(vget_high_s32(a1));

  int16x8x2_t result;
  result.val[0] = vcombine_s16(a0_lo, a1_lo);
  result.val[1] = vcombine_s16(a0_hi, a1_hi);
  return result;
}
397 
// Transposes an 8x8 block of signed 16-bit elements stored as an array of
// eight vectors.
//
// Reads rows 0..7 from a0[0..7] and writes columns 0..7 to out[0..7]. All
// eight inputs are loaded before the first output is stored.
//
// Input ("rc" = row r, column c):
//   a0[0]: 00 01 02 03 04 05 06 07
//   ...
//   a0[7]: 70 71 72 73 74 75 76 77
// Output:
//   out[0]: 00 10 20 30 40 50 60 70
//   ...
//   out[7]: 07 17 27 37 47 57 67 77
static INLINE void transpose_s16_8x8q(int16x8_t *a0, int16x8_t *out) {
  // 16-bit transpose step on each pair of adjacent rows:
  // t01.val[0]: 00 10 02 12 04 14 06 16
  // t01.val[1]: 01 11 03 13 05 15 07 17
  // t23.val[0]: 20 30 22 32 24 34 26 36
  // t23.val[1]: 21 31 23 33 25 35 27 37
  // t45.val[0]: 40 50 42 52 44 54 46 56
  // t45.val[1]: 41 51 43 53 45 55 47 57
  // t67.val[0]: 60 70 62 72 64 74 66 76
  // t67.val[1]: 61 71 63 73 65 75 67 77
  const int16x8x2_t t01 = vtrnq_s16(a0[0], a0[1]);
  const int16x8x2_t t23 = vtrnq_s16(a0[2], a0[3]);
  const int16x8x2_t t45 = vtrnq_s16(a0[4], a0[5]);
  const int16x8x2_t t67 = vtrnq_s16(a0[6], a0[7]);

  // 32-bit transpose step; "top" covers rows 0..3, "bot" covers rows 4..7:
  // top_even.val[0]: 00 10 20 30 04 14 24 34
  // top_even.val[1]: 02 12 22 32 06 16 26 36
  // top_odd.val[0]:  01 11 21 31 05 15 25 35
  // top_odd.val[1]:  03 13 23 33 07 17 27 37
  // bot_even.val[0]: 40 50 60 70 44 54 64 74
  // bot_even.val[1]: 42 52 62 72 46 56 66 76
  // bot_odd.val[0]:  41 51 61 71 45 55 65 75
  // bot_odd.val[1]:  43 53 63 73 47 57 67 77
  const int32x4x2_t top_even = vtrnq_s32(vreinterpretq_s32_s16(t01.val[0]),
                                         vreinterpretq_s32_s16(t23.val[0]));
  const int32x4x2_t top_odd = vtrnq_s32(vreinterpretq_s32_s16(t01.val[1]),
                                        vreinterpretq_s32_s16(t23.val[1]));
  const int32x4x2_t bot_even = vtrnq_s32(vreinterpretq_s32_s16(t45.val[0]),
                                         vreinterpretq_s32_s16(t67.val[0]));
  const int32x4x2_t bot_odd = vtrnq_s32(vreinterpretq_s32_s16(t45.val[1]),
                                        vreinterpretq_s32_s16(t67.val[1]));

  // 64-bit transpose step; colsXY.val[0] is column X, colsXY.val[1] column Y:
  // cols04.val[0]: 00 10 20 30 40 50 60 70
  // cols04.val[1]: 04 14 24 34 44 54 64 74
  // cols15.val[0]: 01 11 21 31 41 51 61 71
  // cols15.val[1]: 05 15 25 35 45 55 65 75
  // cols26.val[0]: 02 12 22 32 42 52 62 72
  // cols26.val[1]: 06 16 26 36 46 56 66 76
  // cols37.val[0]: 03 13 23 33 43 53 63 73
  // cols37.val[1]: 07 17 27 37 47 57 67 77
  const int16x8x2_t cols04 =
      vpx_vtrnq_s64_to_s16(top_even.val[0], bot_even.val[0]);
  const int16x8x2_t cols15 =
      vpx_vtrnq_s64_to_s16(top_odd.val[0], bot_odd.val[0]);
  const int16x8x2_t cols26 =
      vpx_vtrnq_s64_to_s16(top_even.val[1], bot_even.val[1]);
  const int16x8x2_t cols37 =
      vpx_vtrnq_s64_to_s16(top_odd.val[1], bot_odd.val[1]);

  out[0] = cols04.val[0];
  out[1] = cols15.val[0];
  out[2] = cols26.val[0];
  out[3] = cols37.val[0];
  out[4] = cols04.val[1];
  out[5] = cols15.val[1];
  out[6] = cols26.val[1];
  out[7] = cols37.val[1];
}
465 
// Transposes a 4x4 block of signed 16-bit elements in place, one row per
// 64-bit vector.
//
// On entry ("rc" = row r, column c):
//   *a0: 00 01 02 03
//   *a1: 10 11 12 13
//   *a2: 20 21 22 23
//   *a3: 30 31 32 33
// On return:
//   *a0: 00 10 20 30
//   *a1: 01 11 21 31
//   *a2: 02 12 22 32
//   *a3: 03 13 23 33
static INLINE void transpose_s16_4x4d(int16x4_t *a0, int16x4_t *a1,
                                      int16x4_t *a2, int16x4_t *a3) {
  // 16-bit transpose step:
  // t01.val[0]: 00 10 02 12
  // t01.val[1]: 01 11 03 13
  // t23.val[0]: 20 30 22 32
  // t23.val[1]: 21 31 23 33
  const int16x4x2_t t01 = vtrn_s16(*a0, *a1);
  const int16x4x2_t t23 = vtrn_s16(*a2, *a3);

  const int32x2_t t01_even = vreinterpret_s32_s16(t01.val[0]);
  const int32x2_t t23_even = vreinterpret_s32_s16(t23.val[0]);
  const int32x2_t t01_odd = vreinterpret_s32_s16(t01.val[1]);
  const int32x2_t t23_odd = vreinterpret_s32_s16(t23.val[1]);

  // 32-bit transpose step:
  // cols02.val[0]: 00 10 20 30
  // cols02.val[1]: 02 12 22 32
  // cols13.val[0]: 01 11 21 31
  // cols13.val[1]: 03 13 23 33
  const int32x2x2_t cols02 = vtrn_s32(t01_even, t23_even);
  const int32x2x2_t cols13 = vtrn_s32(t01_odd, t23_odd);

  *a0 = vreinterpret_s16_s32(cols02.val[0]);
  *a1 = vreinterpret_s16_s32(cols13.val[0]);
  *a2 = vreinterpret_s16_s32(cols02.val[1]);
  *a3 = vreinterpret_s16_s32(cols13.val[1]);
}
498 
// 64-bit transpose of two 128-bit vectors of 32-bit lanes:
//   result.val[0] = { low half of a0,  low half of a1 }
//   result.val[1] = { high half of a0, high half of a1 }
static INLINE int32x4x2_t aom_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) {
  const int32x2_t a0_lo = vget_low_s32(a0);
  const int32x2_t a1_lo = vget_low_s32(a1);
  const int32x2_t a0_hi = vget_high_s32(a0);
  const int32x2_t a1_hi = vget_high_s32(a1);

  int32x4x2_t result;
  result.val[0] = vcombine_s32(a0_lo, a1_lo);
  result.val[1] = vcombine_s32(a0_hi, a1_hi);
  return result;
}
505 
// Transposes a 4x4 block of signed 32-bit elements in place, one row per
// 128-bit vector.
//
// On entry ("rc" = row r, column c):
//   *a0: 00 01 02 03
//   *a1: 10 11 12 13
//   *a2: 20 21 22 23
//   *a3: 30 31 32 33
// On return:
//   *a0: 00 10 20 30
//   *a1: 01 11 21 31
//   *a2: 02 12 22 32
//   *a3: 03 13 23 33
static INLINE void transpose_s32_4x4(int32x4_t *a0, int32x4_t *a1,
                                     int32x4_t *a2, int32x4_t *a3) {
  // 32-bit transpose step:
  // t01.val[0]: 00 10 02 12
  // t01.val[1]: 01 11 03 13
  // t23.val[0]: 20 30 22 32
  // t23.val[1]: 21 31 23 33
  const int32x4x2_t t01 = vtrnq_s32(*a0, *a1);
  const int32x4x2_t t23 = vtrnq_s32(*a2, *a3);

  // 64-bit transpose step:
  // cols02.val[0]: 00 10 20 30
  // cols02.val[1]: 02 12 22 32
  // cols13.val[0]: 01 11 21 31
  // cols13.val[1]: 03 13 23 33
  const int32x4x2_t cols02 = aom_vtrnq_s64_to_s32(t01.val[0], t23.val[0]);
  const int32x4x2_t cols13 = aom_vtrnq_s64_to_s32(t01.val[1], t23.val[1]);

  *a0 = cols02.val[0];
  *a1 = cols13.val[0];
  *a2 = cols02.val[1];
  *a3 = cols13.val[1];
}
536 
537 #endif  // AOM_AV1_COMMON_ARM_TRANSPOSE_NEON_H_
538