1 /*
2 * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #ifndef VPX_VPX_DSP_ARM_TRANSPOSE_NEON_H_
12 #define VPX_VPX_DSP_ARM_TRANSPOSE_NEON_H_
13
14 #include <arm_neon.h>
15
16 #include "./vpx_config.h"
17
// Regroup the 64 bit halves of two q registers and return them viewed as
// 16 bit lanes:
//   a0: 00 01 02 03 04 05 06 07
//   a1: 16 17 18 19 20 21 22 23
// becomes
//   val[0]: 00 01 02 03 16 17 18 19  (low half of a0, low half of a1)
//   val[1]: 04 05 06 07 20 21 22 23  (high half of a0, high half of a1)
static INLINE int16x8x2_t vpx_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) {
  const int16x4_t a0_lo = vreinterpret_s16_s32(vget_low_s32(a0));
  const int16x4_t a0_hi = vreinterpret_s16_s32(vget_high_s32(a0));
  const int16x4_t a1_lo = vreinterpret_s16_s32(vget_low_s32(a1));
  const int16x4_t a1_hi = vreinterpret_s16_s32(vget_high_s32(a1));
  int16x8x2_t swapped;

  swapped.val[0] = vcombine_s16(a0_lo, a1_lo);
  swapped.val[1] = vcombine_s16(a0_hi, a1_hi);
  return swapped;
}
32
// Regroup the 64 bit halves of two q registers, keeping 32 bit lanes:
//   val[0] = { low half of a0, low half of a1 }
//   val[1] = { high half of a0, high half of a1 }
static INLINE int32x4x2_t vpx_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) {
  const int32x2_t a0_lo = vget_low_s32(a0);
  const int32x2_t a0_hi = vget_high_s32(a0);
  const int32x2_t a1_lo = vget_low_s32(a1);
  const int32x2_t a1_hi = vget_high_s32(a1);
  int32x4x2_t swapped;

  swapped.val[0] = vcombine_s32(a0_lo, a1_lo);
  swapped.val[1] = vcombine_s32(a0_hi, a1_hi);
  return swapped;
}
39
// Regroup the 64 bit halves of two q registers and return them as 64 bit
// lanes:
//   val[0] = { low half of a0, low half of a1 }
//   val[1] = { high half of a0, high half of a1 }
static INLINE int64x2x2_t vpx_vtrnq_s64(int32x4_t a0, int32x4_t a1) {
  const int64x1_t a0_lo = vreinterpret_s64_s32(vget_low_s32(a0));
  const int64x1_t a0_hi = vreinterpret_s64_s32(vget_high_s32(a0));
  const int64x1_t a1_lo = vreinterpret_s64_s32(vget_low_s32(a1));
  const int64x1_t a1_hi = vreinterpret_s64_s32(vget_high_s32(a1));
  int64x2x2_t swapped;

  swapped.val[0] = vcombine_s64(a0_lo, a1_lo);
  swapped.val[1] = vcombine_s64(a0_hi, a1_hi);
  return swapped;
}
48
// Regroup the 64 bit halves of two q registers and return them viewed as
// 8 bit lanes:
//   val[0] = { low half of a0, low half of a1 }
//   val[1] = { high half of a0, high half of a1 }
static INLINE uint8x16x2_t vpx_vtrnq_u64_to_u8(uint32x4_t a0, uint32x4_t a1) {
  const uint8x8_t a0_lo = vreinterpret_u8_u32(vget_low_u32(a0));
  const uint8x8_t a0_hi = vreinterpret_u8_u32(vget_high_u32(a0));
  const uint8x8_t a1_lo = vreinterpret_u8_u32(vget_low_u32(a1));
  const uint8x8_t a1_hi = vreinterpret_u8_u32(vget_high_u32(a1));
  uint8x16x2_t swapped;

  swapped.val[0] = vcombine_u8(a0_lo, a1_lo);
  swapped.val[1] = vcombine_u8(a0_hi, a1_hi);
  return swapped;
}
57
// Regroup the 64 bit halves of two q registers and return them viewed as
// 16 bit lanes:
//   val[0] = { low half of a0, low half of a1 }
//   val[1] = { high half of a0, high half of a1 }
static INLINE uint16x8x2_t vpx_vtrnq_u64_to_u16(uint32x4_t a0, uint32x4_t a1) {
  const uint16x4_t a0_lo = vreinterpret_u16_u32(vget_low_u32(a0));
  const uint16x4_t a0_hi = vreinterpret_u16_u32(vget_high_u32(a0));
  const uint16x4_t a1_lo = vreinterpret_u16_u32(vget_low_u32(a1));
  const uint16x4_t a1_hi = vreinterpret_u16_u32(vget_high_u32(a1));
  uint16x8x2_t swapped;

  swapped.val[0] = vcombine_u16(a0_lo, a1_lo);
  swapped.val[1] = vcombine_u16(a0_hi, a1_hi);
  return swapped;
}
66
// In-place transpose of a 4x4 block of bytes packed two rows per d register.
// Input:
//   a0: 00 01 02 03 10 11 12 13
//   a1: 20 21 22 23 30 31 32 33
// Output (two columns per register):
//   a0: 00 10 20 30 02 12 22 32
//   a1: 01 11 21 31 03 13 23 33
static INLINE void transpose_u8_4x4(uint8x8_t *a0, uint8x8_t *a1) {
  // Step 1, interleave 16 bit lanes:
  //   pairs.val[0]: 00 01 20 21 10 11 30 31
  //   pairs.val[1]: 02 03 22 23 12 13 32 33
  const uint16x4x2_t pairs =
      vtrn_u16(vreinterpret_u16_u8(*a0), vreinterpret_u16_u8(*a1));

  // Step 2, interleave 32 bit lanes:
  //   quads.val[0]: 00 01 20 21 02 03 22 23
  //   quads.val[1]: 10 11 30 31 12 13 32 33
  const uint32x2x2_t quads = vtrn_u32(vreinterpret_u32_u16(pairs.val[0]),
                                      vreinterpret_u32_u16(pairs.val[1]));

  // Step 3, interleave 8 bit lanes:
  //   cols.val[0]: 00 10 20 30 02 12 22 32
  //   cols.val[1]: 01 11 21 31 03 13 23 33
  const uint8x8x2_t cols = vtrn_u8(vreinterpret_u8_u32(quads.val[0]),
                                   vreinterpret_u8_u32(quads.val[1]));

  *a0 = cols.val[0];
  *a1 = cols.val[1];
}
95
// In-place transpose of a 4x4 block of int16 held one row per d register.
// Input:
//   a0: 00 01 02 03
//   a1: 10 11 12 13
//   a2: 20 21 22 23
//   a3: 30 31 32 33
// Output:
//   a0: 00 10 20 30
//   a1: 01 11 21 31
//   a2: 02 12 22 32
//   a3: 03 13 23 33
static INLINE void transpose_s16_4x4d(int16x4_t *a0, int16x4_t *a1,
                                      int16x4_t *a2, int16x4_t *a3) {
  // Interleave 16 bit lanes of adjacent rows:
  //   t01.val[0]: 00 10 02 12    t01.val[1]: 01 11 03 13
  //   t23.val[0]: 20 30 22 32    t23.val[1]: 21 31 23 33
  const int16x4x2_t t01 = vtrn_s16(*a0, *a1);
  const int16x4x2_t t23 = vtrn_s16(*a2, *a3);

  // Interleave 32 bit lanes:
  //   even.val[0]: 00 10 20 30   even.val[1]: 02 12 22 32
  //   odd.val[0]:  01 11 21 31   odd.val[1]:  03 13 23 33
  const int32x2x2_t even = vtrn_s32(vreinterpret_s32_s16(t01.val[0]),
                                    vreinterpret_s32_s16(t23.val[0]));
  const int32x2x2_t odd = vtrn_s32(vreinterpret_s32_s16(t01.val[1]),
                                   vreinterpret_s32_s16(t23.val[1]));

  *a0 = vreinterpret_s16_s32(even.val[0]);
  *a1 = vreinterpret_s16_s32(odd.val[0]);
  *a2 = vreinterpret_s16_s32(even.val[1]);
  *a3 = vreinterpret_s16_s32(odd.val[1]);
}
128
// In-place transpose of a 4x4 block of int16 packed two rows per q register.
// Input:
//   a0: 00 01 02 03 10 11 12 13
//   a1: 20 21 22 23 30 31 32 33
// Output (two columns per register):
//   a0: 00 10 20 30 02 12 22 32
//   a1: 01 11 21 31 03 13 23 33
static INLINE void transpose_s16_4x4q(int16x8_t *a0, int16x8_t *a1) {
  // Interleave 32 bit lanes:
  //   pairs.val[0]: 00 01 20 21 10 11 30 31
  //   pairs.val[1]: 02 03 22 23 12 13 32 33
  const int32x4x2_t pairs =
      vtrnq_s32(vreinterpretq_s32_s16(*a0), vreinterpretq_s32_s16(*a1));

  // Regroup 64 bit halves:
  //   lows:  00 01 20 21 02 03 22 23
  //   highs: 10 11 30 31 12 13 32 33
  const int32x4_t lows =
      vcombine_s32(vget_low_s32(pairs.val[0]), vget_low_s32(pairs.val[1]));
  const int32x4_t highs =
      vcombine_s32(vget_high_s32(pairs.val[0]), vget_high_s32(pairs.val[1]));

  // Interleave 16 bit lanes:
  //   cols.val[0]: 00 10 20 30 02 12 22 32
  //   cols.val[1]: 01 11 21 31 03 13 23 33
  const int16x8x2_t cols =
      vtrnq_s16(vreinterpretq_s16_s32(lows), vreinterpretq_s16_s32(highs));

  *a0 = cols.val[0];
  *a1 = cols.val[1];
}
159
// In-place transpose of a 4x4 block of uint16 packed two rows per q register.
// Input:
//   a0: 00 01 02 03 10 11 12 13
//   a1: 20 21 22 23 30 31 32 33
// Output (two columns per register):
//   a0: 00 10 20 30 02 12 22 32
//   a1: 01 11 21 31 03 13 23 33
static INLINE void transpose_u16_4x4q(uint16x8_t *a0, uint16x8_t *a1) {
  // Interleave 32 bit lanes:
  //   pairs.val[0]: 00 01 20 21 10 11 30 31
  //   pairs.val[1]: 02 03 22 23 12 13 32 33
  const uint32x4x2_t pairs =
      vtrnq_u32(vreinterpretq_u32_u16(*a0), vreinterpretq_u32_u16(*a1));

  // Regroup 64 bit halves:
  //   lows:  00 01 20 21 02 03 22 23
  //   highs: 10 11 30 31 12 13 32 33
  const uint32x4_t lows =
      vcombine_u32(vget_low_u32(pairs.val[0]), vget_low_u32(pairs.val[1]));
  const uint32x4_t highs =
      vcombine_u32(vget_high_u32(pairs.val[0]), vget_high_u32(pairs.val[1]));

  // Interleave 16 bit lanes:
  //   cols.val[0]: 00 10 20 30 02 12 22 32
  //   cols.val[1]: 01 11 21 31 03 13 23 33
  const uint16x8x2_t cols =
      vtrnq_u16(vreinterpretq_u16_u32(lows), vreinterpretq_u16_u32(highs));

  *a0 = cols.val[0];
  *a1 = cols.val[1];
}
190
// Transpose a block of 8 rows by 4 bytes into 4 rows by 8 bytes. Only the low
// four bytes of each input register contribute to the result; the upper four
// (XX below) are ignored. Rows 0-3 are passed by pointer and overwritten with
// the result, rows 4-7 are passed by value.
// Input:
//   a0: 00 01 02 03 XX XX XX XX
//   a1: 10 11 12 13 XX XX XX XX
//   a2: 20 21 22 23 XX XX XX XX
//   a3: 30 31 32 33 XX XX XX XX
//   a4: 40 41 42 43 XX XX XX XX
//   a5: 50 51 52 53 XX XX XX XX
//   a6: 60 61 62 63 XX XX XX XX
//   a7: 70 71 72 73 XX XX XX XX
// Output:
//   a0: 00 10 20 30 40 50 60 70
//   a1: 01 11 21 31 41 51 61 71
//   a2: 02 12 22 32 42 52 62 72
//   a3: 03 13 23 33 43 53 63 73
static INLINE void transpose_u8_4x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
                                    uint8x8_t *a3, const uint8x8_t a4,
                                    const uint8x8_t a5, const uint8x8_t a6,
                                    const uint8x8_t a7) {
  // Pair row n with row n + 4 on 32 bit lanes. Only val[0] is used later:
  //   r04.val[0]: 00 01 02 03 40 41 42 43
  //   r15.val[0]: 10 11 12 13 50 51 52 53
  //   r26.val[0]: 20 21 22 23 60 61 62 63
  //   r37.val[0]: 30 31 32 33 70 71 72 73
  const uint32x2x2_t r04 =
      vtrn_u32(vreinterpret_u32_u8(*a0), vreinterpret_u32_u8(a4));
  const uint32x2x2_t r15 =
      vtrn_u32(vreinterpret_u32_u8(*a1), vreinterpret_u32_u8(a5));
  const uint32x2x2_t r26 =
      vtrn_u32(vreinterpret_u32_u8(*a2), vreinterpret_u32_u8(a6));
  const uint32x2x2_t r37 =
      vtrn_u32(vreinterpret_u32_u8(*a3), vreinterpret_u32_u8(a7));

  // Interleave 16 bit lanes:
  //   even_rows.val[0]: 00 01 20 21 40 41 60 61
  //   even_rows.val[1]: 02 03 22 23 42 43 62 63
  //   odd_rows.val[0]:  10 11 30 31 50 51 70 71
  //   odd_rows.val[1]:  12 13 32 33 52 53 72 73
  const uint16x4x2_t even_rows = vtrn_u16(vreinterpret_u16_u32(r04.val[0]),
                                          vreinterpret_u16_u32(r26.val[0]));
  const uint16x4x2_t odd_rows = vtrn_u16(vreinterpret_u16_u32(r15.val[0]),
                                         vreinterpret_u16_u32(r37.val[0]));

  // Interleave 8 bit lanes:
  //   cols01.val[0]: 00 10 20 30 40 50 60 70
  //   cols01.val[1]: 01 11 21 31 41 51 61 71
  //   cols23.val[0]: 02 12 22 32 42 52 62 72
  //   cols23.val[1]: 03 13 23 33 43 53 63 73
  const uint8x8x2_t cols01 = vtrn_u8(vreinterpret_u8_u16(even_rows.val[0]),
                                     vreinterpret_u8_u16(odd_rows.val[0]));
  const uint8x8x2_t cols23 = vtrn_u8(vreinterpret_u8_u16(even_rows.val[1]),
                                     vreinterpret_u8_u16(odd_rows.val[1]));

  *a0 = cols01.val[0];
  *a1 = cols01.val[1];
  *a2 = cols23.val[0];
  *a3 = cols23.val[1];
}
246
// In-place transpose of a 4x4 block of int32 held one row per q register.
// Input:
//   a0: 00 01 02 03
//   a1: 10 11 12 13
//   a2: 20 21 22 23
//   a3: 30 31 32 33
// Output:
//   a0: 00 10 20 30
//   a1: 01 11 21 31
//   a2: 02 12 22 32
//   a3: 03 13 23 33
static INLINE void transpose_s32_4x4(int32x4_t *a0, int32x4_t *a1,
                                     int32x4_t *a2, int32x4_t *a3) {
  // Interleave 32 bit lanes of adjacent rows:
  //   t01.val[0]: 00 10 02 12    t01.val[1]: 01 11 03 13
  //   t23.val[0]: 20 30 22 32    t23.val[1]: 21 31 23 33
  const int32x4x2_t t01 = vtrnq_s32(*a0, *a1);
  const int32x4x2_t t23 = vtrnq_s32(*a2, *a3);

  // Regroup 64 bit halves:
  //   even.val[0]: 00 10 20 30   even.val[1]: 02 12 22 32
  //   odd.val[0]:  01 11 21 31   odd.val[1]:  03 13 23 33
  const int32x4x2_t even = vpx_vtrnq_s64_to_s32(t01.val[0], t23.val[0]);
  const int32x4x2_t odd = vpx_vtrnq_s64_to_s32(t01.val[1], t23.val[1]);

  *a0 = even.val[0];
  *a1 = odd.val[0];
  *a2 = even.val[1];
  *a3 = odd.val[1];
}
277
// Transpose a block of 8 rows by 4 int16 (a0..a7, one row per d register)
// into 4 rows by 8 int16 written through o0..o3.
// Input:
//   a0: 00 01 02 03
//   a1: 10 11 12 13
//   a2: 20 21 22 23
//   a3: 30 31 32 33
//   a4: 40 41 42 43
//   a5: 50 51 52 53
//   a6: 60 61 62 63
//   a7: 70 71 72 73
// Output:
//   o0: 00 10 20 30 40 50 60 70
//   o1: 01 11 21 31 41 51 61 71
//   o2: 02 12 22 32 42 52 62 72
//   o3: 03 13 23 33 43 53 63 73
static INLINE void transpose_s16_4x8(const int16x4_t a0, const int16x4_t a1,
                                     const int16x4_t a2, const int16x4_t a3,
                                     const int16x4_t a4, const int16x4_t a5,
                                     const int16x4_t a6, const int16x4_t a7,
                                     int16x8_t *const o0, int16x8_t *const o1,
                                     int16x8_t *const o2, int16x8_t *const o3) {
  // Interleave 16 bit lanes of adjacent rows:
  //   t01.val[0]: 00 10 02 12    t01.val[1]: 01 11 03 13
  //   t23.val[0]: 20 30 22 32    t23.val[1]: 21 31 23 33
  //   t45.val[0]: 40 50 42 52    t45.val[1]: 41 51 43 53
  //   t67.val[0]: 60 70 62 72    t67.val[1]: 61 71 63 73
  const int16x4x2_t t01 = vtrn_s16(a0, a1);
  const int16x4x2_t t23 = vtrn_s16(a2, a3);
  const int16x4x2_t t45 = vtrn_s16(a4, a5);
  const int16x4x2_t t67 = vtrn_s16(a6, a7);

  // Interleave 32 bit lanes. "top" covers rows 0-3, "bot" covers rows 4-7:
  //   even_top.val[0]: 00 10 20 30   even_top.val[1]: 02 12 22 32
  //   odd_top.val[0]:  01 11 21 31   odd_top.val[1]:  03 13 23 33
  //   even_bot.val[0]: 40 50 60 70   even_bot.val[1]: 42 52 62 72
  //   odd_bot.val[0]:  41 51 61 71   odd_bot.val[1]:  43 53 63 73
  const int32x2x2_t even_top = vtrn_s32(vreinterpret_s32_s16(t01.val[0]),
                                        vreinterpret_s32_s16(t23.val[0]));
  const int32x2x2_t odd_top = vtrn_s32(vreinterpret_s32_s16(t01.val[1]),
                                       vreinterpret_s32_s16(t23.val[1]));
  const int32x2x2_t even_bot = vtrn_s32(vreinterpret_s32_s16(t45.val[0]),
                                        vreinterpret_s32_s16(t67.val[0]));
  const int32x2x2_t odd_bot = vtrn_s32(vreinterpret_s32_s16(t45.val[1]),
                                       vreinterpret_s32_s16(t67.val[1]));

  // Join the top and bottom halves of every output row.
  *o0 = vcombine_s16(vreinterpret_s16_s32(even_top.val[0]),
                     vreinterpret_s16_s32(even_bot.val[0]));
  *o1 = vcombine_s16(vreinterpret_s16_s32(odd_top.val[0]),
                     vreinterpret_s16_s32(odd_bot.val[0]));
  *o2 = vcombine_s16(vreinterpret_s16_s32(even_top.val[1]),
                     vreinterpret_s16_s32(even_bot.val[1]));
  *o3 = vcombine_s16(vreinterpret_s16_s32(odd_top.val[1]),
                     vreinterpret_s16_s32(odd_bot.val[1]));
}
342
// In-place transpose of a block of 8 rows by 4 int32 (one row per q
// register). Each 8-wide output row is split across two consecutive
// registers.
// Input:
//   a0: 00 01 02 03
//   a1: 10 11 12 13
//   a2: 20 21 22 23
//   a3: 30 31 32 33
//   a4: 40 41 42 43
//   a5: 50 51 52 53
//   a6: 60 61 62 63
//   a7: 70 71 72 73
// Output:
//   a0: 00 10 20 30    a1: 40 50 60 70
//   a2: 01 11 21 31    a3: 41 51 61 71
//   a4: 02 12 22 32    a5: 42 52 62 72
//   a6: 03 13 23 33    a7: 43 53 63 73
static INLINE void transpose_s32_4x8(int32x4_t *const a0, int32x4_t *const a1,
                                     int32x4_t *const a2, int32x4_t *const a3,
                                     int32x4_t *const a4, int32x4_t *const a5,
                                     int32x4_t *const a6, int32x4_t *const a7) {
  // Interleave 32 bit lanes of adjacent rows:
  //   t01.val[0]: 00 10 02 12    t01.val[1]: 01 11 03 13
  //   t23.val[0]: 20 30 22 32    t23.val[1]: 21 31 23 33
  //   t45.val[0]: 40 50 42 52    t45.val[1]: 41 51 43 53
  //   t67.val[0]: 60 70 62 72    t67.val[1]: 61 71 63 73
  const int32x4x2_t t01 = vtrnq_s32(*a0, *a1);
  const int32x4x2_t t23 = vtrnq_s32(*a2, *a3);
  const int32x4x2_t t45 = vtrnq_s32(*a4, *a5);
  const int32x4x2_t t67 = vtrnq_s32(*a6, *a7);

  // Regroup 64 bit halves. "top" covers rows 0-3, "bot" covers rows 4-7:
  //   even_top.val[0]: 00 10 20 30   even_top.val[1]: 02 12 22 32
  //   odd_top.val[0]:  01 11 21 31   odd_top.val[1]:  03 13 23 33
  //   even_bot.val[0]: 40 50 60 70   even_bot.val[1]: 42 52 62 72
  //   odd_bot.val[0]:  41 51 61 71   odd_bot.val[1]:  43 53 63 73
  const int64x2x2_t even_top = vpx_vtrnq_s64(t01.val[0], t23.val[0]);
  const int64x2x2_t odd_top = vpx_vtrnq_s64(t01.val[1], t23.val[1]);
  const int64x2x2_t even_bot = vpx_vtrnq_s64(t45.val[0], t67.val[0]);
  const int64x2x2_t odd_bot = vpx_vtrnq_s64(t45.val[1], t67.val[1]);

  *a0 = vreinterpretq_s32_s64(even_top.val[0]);
  *a1 = vreinterpretq_s32_s64(even_bot.val[0]);
  *a2 = vreinterpretq_s32_s64(odd_top.val[0]);
  *a3 = vreinterpretq_s32_s64(odd_bot.val[0]);
  *a4 = vreinterpretq_s32_s64(even_top.val[1]);
  *a5 = vreinterpretq_s32_s64(even_bot.val[1]);
  *a6 = vreinterpretq_s32_s64(odd_top.val[1]);
  *a7 = vreinterpretq_s32_s64(odd_bot.val[1]);
}
395
// In-place shuffle of a block of 4 rows by 8 bytes, transposing each 4x4
// half so that every output register holds two columns.
// Input:
//   a0: 00 01 02 03 04 05 06 07
//   a1: 10 11 12 13 14 15 16 17
//   a2: 20 21 22 23 24 25 26 27
//   a3: 30 31 32 33 34 35 36 37
// Output:
//   a0: 00 10 20 30 04 14 24 34
//   a1: 01 11 21 31 05 15 25 35
//   a2: 02 12 22 32 06 16 26 36
//   a3: 03 13 23 33 07 17 27 37
static INLINE void transpose_u8_8x4(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
                                    uint8x8_t *a3) {
  // Interleave 8 bit lanes of adjacent rows:
  //   t01.val[0]: 00 10 02 12 04 14 06 16
  //   t01.val[1]: 01 11 03 13 05 15 07 17
  //   t23.val[0]: 20 30 22 32 24 34 26 36
  //   t23.val[1]: 21 31 23 33 25 35 27 37
  const uint8x8x2_t t01 = vtrn_u8(*a0, *a1);
  const uint8x8x2_t t23 = vtrn_u8(*a2, *a3);

  // Interleave 16 bit lanes:
  //   even.val[0]: 00 10 20 30 04 14 24 34
  //   even.val[1]: 02 12 22 32 06 16 26 36
  //   odd.val[0]:  01 11 21 31 05 15 25 35
  //   odd.val[1]:  03 13 23 33 07 17 27 37
  const uint16x4x2_t even = vtrn_u16(vreinterpret_u16_u8(t01.val[0]),
                                     vreinterpret_u16_u8(t23.val[0]));
  const uint16x4x2_t odd = vtrn_u16(vreinterpret_u16_u8(t01.val[1]),
                                    vreinterpret_u16_u8(t23.val[1]));

  *a0 = vreinterpret_u8_u16(even.val[0]);
  *a1 = vreinterpret_u8_u16(odd.val[0]);
  *a2 = vreinterpret_u8_u16(even.val[1]);
  *a3 = vreinterpret_u8_u16(odd.val[1]);
}
428
// In-place shuffle of a block of 4 rows by 8 uint16, transposing each 4x4
// half so that every output register holds two columns.
// Input:
//   a0: 00 01 02 03 04 05 06 07
//   a1: 10 11 12 13 14 15 16 17
//   a2: 20 21 22 23 24 25 26 27
//   a3: 30 31 32 33 34 35 36 37
// Output:
//   a0: 00 10 20 30 04 14 24 34
//   a1: 01 11 21 31 05 15 25 35
//   a2: 02 12 22 32 06 16 26 36
//   a3: 03 13 23 33 07 17 27 37
static INLINE void transpose_u16_8x4(uint16x8_t *a0, uint16x8_t *a1,
                                     uint16x8_t *a2, uint16x8_t *a3) {
  // Interleave 16 bit lanes of adjacent rows:
  //   t01.val[0]: 00 10 02 12 04 14 06 16
  //   t01.val[1]: 01 11 03 13 05 15 07 17
  //   t23.val[0]: 20 30 22 32 24 34 26 36
  //   t23.val[1]: 21 31 23 33 25 35 27 37
  const uint16x8x2_t t01 = vtrnq_u16(*a0, *a1);
  const uint16x8x2_t t23 = vtrnq_u16(*a2, *a3);

  // Interleave 32 bit lanes:
  //   even.val[0]: 00 10 20 30 04 14 24 34
  //   even.val[1]: 02 12 22 32 06 16 26 36
  //   odd.val[0]:  01 11 21 31 05 15 25 35
  //   odd.val[1]:  03 13 23 33 07 17 27 37
  const uint32x4x2_t even = vtrnq_u32(vreinterpretq_u32_u16(t01.val[0]),
                                      vreinterpretq_u32_u16(t23.val[0]));
  const uint32x4x2_t odd = vtrnq_u32(vreinterpretq_u32_u16(t01.val[1]),
                                     vreinterpretq_u32_u16(t23.val[1]));

  *a0 = vreinterpretq_u16_u32(even.val[0]);
  *a1 = vreinterpretq_u16_u32(odd.val[0]);
  *a2 = vreinterpretq_u16_u32(even.val[1]);
  *a3 = vreinterpretq_u16_u32(odd.val[1]);
}
461
// In-place transpose of a block of 4 rows by 8 int32, where each 8-wide
// input row is split across two consecutive q registers.
// Input:
//   a0: 00 01 02 03    a1: 04 05 06 07
//   a2: 10 11 12 13    a3: 14 15 16 17
//   a4: 20 21 22 23    a5: 24 25 26 27
//   a6: 30 31 32 33    a7: 34 35 36 37
// Output (one column per register):
//   a0: 00 10 20 30
//   a1: 01 11 21 31
//   a2: 02 12 22 32
//   a3: 03 13 23 33
//   a4: 04 14 24 34
//   a5: 05 15 25 35
//   a6: 06 16 26 36
//   a7: 07 17 27 37
static INLINE void transpose_s32_8x4(int32x4_t *const a0, int32x4_t *const a1,
                                     int32x4_t *const a2, int32x4_t *const a3,
                                     int32x4_t *const a4, int32x4_t *const a5,
                                     int32x4_t *const a6, int32x4_t *const a7) {
  // Interleave 32 bit lanes of adjacent rows. "_l" is the left half
  // (columns 0-3), "_r" the right half (columns 4-7):
  //   t01_l.val[0]: 00 10 02 12    t01_l.val[1]: 01 11 03 13
  //   t01_r.val[0]: 04 14 06 16    t01_r.val[1]: 05 15 07 17
  //   t23_l.val[0]: 20 30 22 32    t23_l.val[1]: 21 31 23 33
  //   t23_r.val[0]: 24 34 26 36    t23_r.val[1]: 25 35 27 37
  const int32x4x2_t t01_l = vtrnq_s32(*a0, *a2);
  const int32x4x2_t t01_r = vtrnq_s32(*a1, *a3);
  const int32x4x2_t t23_l = vtrnq_s32(*a4, *a6);
  const int32x4x2_t t23_r = vtrnq_s32(*a5, *a7);

  // Regroup 64 bit halves:
  //   even_l.val[0]: 00 10 20 30   even_l.val[1]: 02 12 22 32
  //   odd_l.val[0]:  01 11 21 31   odd_l.val[1]:  03 13 23 33
  //   even_r.val[0]: 04 14 24 34   even_r.val[1]: 06 16 26 36
  //   odd_r.val[0]:  05 15 25 35   odd_r.val[1]:  07 17 27 37
  const int64x2x2_t even_l = vpx_vtrnq_s64(t01_l.val[0], t23_l.val[0]);
  const int64x2x2_t odd_l = vpx_vtrnq_s64(t01_l.val[1], t23_l.val[1]);
  const int64x2x2_t even_r = vpx_vtrnq_s64(t01_r.val[0], t23_r.val[0]);
  const int64x2x2_t odd_r = vpx_vtrnq_s64(t01_r.val[1], t23_r.val[1]);

  *a0 = vreinterpretq_s32_s64(even_l.val[0]);
  *a1 = vreinterpretq_s32_s64(odd_l.val[0]);
  *a2 = vreinterpretq_s32_s64(even_l.val[1]);
  *a3 = vreinterpretq_s32_s64(odd_l.val[1]);
  *a4 = vreinterpretq_s32_s64(even_r.val[0]);
  *a5 = vreinterpretq_s32_s64(odd_r.val[0]);
  *a6 = vreinterpretq_s32_s64(even_r.val[1]);
  *a7 = vreinterpretq_s32_s64(odd_r.val[1]);
}
514
// In-place transpose of an 8x8 block of bytes held one row per d register.
// Note: Using 'd' registers or 'q' registers has almost identical speed. We
// use 'q' registers here to save some instructions.
// Input:
//   a0: 00 01 02 03 04 05 06 07
//   a1: 10 11 12 13 14 15 16 17
//   a2: 20 21 22 23 24 25 26 27
//   a3: 30 31 32 33 34 35 36 37
//   a4: 40 41 42 43 44 45 46 47
//   a5: 50 51 52 53 54 55 56 57
//   a6: 60 61 62 63 64 65 66 67
//   a7: 70 71 72 73 74 75 76 77
// Output: aN holds column N, i.e. N0 ... N7 read as 0N 1N 2N ... 7N.
static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2,
                                    uint8x8_t *a3, uint8x8_t *a4, uint8x8_t *a5,
                                    uint8x8_t *a6, uint8x8_t *a7) {
  // Pack row n with row n + 4 into a q register, then interleave 8 bit lanes:
  //   t01.val[0]: 00 10 02 12 04 14 06 16  40 50 42 52 44 54 46 56
  //   t01.val[1]: 01 11 03 13 05 15 07 17  41 51 43 53 45 55 47 57
  //   t23.val[0]: 20 30 22 32 24 34 26 36  60 70 62 72 64 74 66 76
  //   t23.val[1]: 21 31 23 33 25 35 27 37  61 71 63 73 65 75 67 77
  const uint8x16x2_t t01 =
      vtrnq_u8(vcombine_u8(*a0, *a4), vcombine_u8(*a1, *a5));
  const uint8x16x2_t t23 =
      vtrnq_u8(vcombine_u8(*a2, *a6), vcombine_u8(*a3, *a7));

  // Interleave 16 bit lanes:
  //   even.val[0]: 00 10 20 30 04 14 24 34  40 50 60 70 44 54 64 74
  //   even.val[1]: 02 12 22 32 06 16 26 36  42 52 62 72 46 56 66 76
  //   odd.val[0]:  01 11 21 31 05 15 25 35  41 51 61 71 45 55 65 75
  //   odd.val[1]:  03 13 23 33 07 17 27 37  43 53 63 73 47 57 67 77
  const uint16x8x2_t even = vtrnq_u16(vreinterpretq_u16_u8(t01.val[0]),
                                      vreinterpretq_u16_u8(t23.val[0]));
  const uint16x8x2_t odd = vtrnq_u16(vreinterpretq_u16_u8(t01.val[1]),
                                     vreinterpretq_u16_u8(t23.val[1]));

  // De-interleave 32 bit lanes so each 64 bit half is one finished column:
  //   cols0145.val[0]: column 0 | column 1
  //   cols0145.val[1]: column 4 | column 5
  //   cols2367.val[0]: column 2 | column 3
  //   cols2367.val[1]: column 6 | column 7
  const uint32x4x2_t cols0145 = vuzpq_u32(vreinterpretq_u32_u16(even.val[0]),
                                          vreinterpretq_u32_u16(odd.val[0]));
  const uint32x4x2_t cols2367 = vuzpq_u32(vreinterpretq_u32_u16(even.val[1]),
                                          vreinterpretq_u32_u16(odd.val[1]));

  *a0 = vreinterpret_u8_u32(vget_low_u32(cols0145.val[0]));
  *a1 = vreinterpret_u8_u32(vget_high_u32(cols0145.val[0]));
  *a2 = vreinterpret_u8_u32(vget_low_u32(cols2367.val[0]));
  *a3 = vreinterpret_u8_u32(vget_high_u32(cols2367.val[0]));
  *a4 = vreinterpret_u8_u32(vget_low_u32(cols0145.val[1]));
  *a5 = vreinterpret_u8_u32(vget_high_u32(cols0145.val[1]));
  *a6 = vreinterpret_u8_u32(vget_low_u32(cols2367.val[1]));
  *a7 = vreinterpret_u8_u32(vget_high_u32(cols2367.val[1]));
}
570
// In-place transpose of an 8x8 block of int16 held one row per q register.
// Input:
//   a0: 00 01 02 03 04 05 06 07
//   a1: 10 11 12 13 14 15 16 17
//   a2: 20 21 22 23 24 25 26 27
//   a3: 30 31 32 33 34 35 36 37
//   a4: 40 41 42 43 44 45 46 47
//   a5: 50 51 52 53 54 55 56 57
//   a6: 60 61 62 63 64 65 66 67
//   a7: 70 71 72 73 74 75 76 77
// Output: aN holds column N, i.e. 0N 1N 2N 3N 4N 5N 6N 7N.
static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1,
                                     int16x8_t *a2, int16x8_t *a3,
                                     int16x8_t *a4, int16x8_t *a5,
                                     int16x8_t *a6, int16x8_t *a7) {
  // Interleave 16 bit lanes of adjacent rows:
  //   t01.val[0]: 00 10 02 12 04 14 06 16
  //   t01.val[1]: 01 11 03 13 05 15 07 17
  //   t23.val[0]: 20 30 22 32 24 34 26 36
  //   t23.val[1]: 21 31 23 33 25 35 27 37
  //   t45.val[0]: 40 50 42 52 44 54 46 56
  //   t45.val[1]: 41 51 43 53 45 55 47 57
  //   t67.val[0]: 60 70 62 72 64 74 66 76
  //   t67.val[1]: 61 71 63 73 65 75 67 77
  const int16x8x2_t t01 = vtrnq_s16(*a0, *a1);
  const int16x8x2_t t23 = vtrnq_s16(*a2, *a3);
  const int16x8x2_t t45 = vtrnq_s16(*a4, *a5);
  const int16x8x2_t t67 = vtrnq_s16(*a6, *a7);

  // Interleave 32 bit lanes. "top" covers rows 0-3, "bot" covers rows 4-7:
  //   even_top.val[0]: 00 10 20 30 04 14 24 34
  //   even_top.val[1]: 02 12 22 32 06 16 26 36
  //   odd_top.val[0]:  01 11 21 31 05 15 25 35
  //   odd_top.val[1]:  03 13 23 33 07 17 27 37
  //   even_bot.val[0]: 40 50 60 70 44 54 64 74
  //   even_bot.val[1]: 42 52 62 72 46 56 66 76
  //   odd_bot.val[0]:  41 51 61 71 45 55 65 75
  //   odd_bot.val[1]:  43 53 63 73 47 57 67 77
  const int32x4x2_t even_top = vtrnq_s32(vreinterpretq_s32_s16(t01.val[0]),
                                         vreinterpretq_s32_s16(t23.val[0]));
  const int32x4x2_t odd_top = vtrnq_s32(vreinterpretq_s32_s16(t01.val[1]),
                                        vreinterpretq_s32_s16(t23.val[1]));
  const int32x4x2_t even_bot = vtrnq_s32(vreinterpretq_s32_s16(t45.val[0]),
                                         vreinterpretq_s32_s16(t67.val[0]));
  const int32x4x2_t odd_bot = vtrnq_s32(vreinterpretq_s32_s16(t45.val[1]),
                                        vreinterpretq_s32_s16(t67.val[1]));

  // Regroup 64 bit halves; colsXY.val[0] is column X, colsXY.val[1] is
  // column Y:
  //   cols04.val[0]: 00 10 20 30 40 50 60 70
  //   cols04.val[1]: 04 14 24 34 44 54 64 74
  //   cols15.val[0]: 01 11 21 31 41 51 61 71
  //   cols15.val[1]: 05 15 25 35 45 55 65 75
  //   cols26.val[0]: 02 12 22 32 42 52 62 72
  //   cols26.val[1]: 06 16 26 36 46 56 66 76
  //   cols37.val[0]: 03 13 23 33 43 53 63 73
  //   cols37.val[1]: 07 17 27 37 47 57 67 77
  const int16x8x2_t cols04 =
      vpx_vtrnq_s64_to_s16(even_top.val[0], even_bot.val[0]);
  const int16x8x2_t cols15 =
      vpx_vtrnq_s64_to_s16(odd_top.val[0], odd_bot.val[0]);
  const int16x8x2_t cols26 =
      vpx_vtrnq_s64_to_s16(even_top.val[1], even_bot.val[1]);
  const int16x8x2_t cols37 =
      vpx_vtrnq_s64_to_s16(odd_top.val[1], odd_bot.val[1]);

  *a0 = cols04.val[0];
  *a1 = cols15.val[0];
  *a2 = cols26.val[0];
  *a3 = cols37.val[0];
  *a4 = cols04.val[1];
  *a5 = cols15.val[1];
  *a6 = cols26.val[1];
  *a7 = cols37.val[1];
}
641
// In-place transpose of an 8x8 block of uint16 held one row per q register.
// Input:
//   a0: 00 01 02 03 04 05 06 07
//   a1: 10 11 12 13 14 15 16 17
//   a2: 20 21 22 23 24 25 26 27
//   a3: 30 31 32 33 34 35 36 37
//   a4: 40 41 42 43 44 45 46 47
//   a5: 50 51 52 53 54 55 56 57
//   a6: 60 61 62 63 64 65 66 67
//   a7: 70 71 72 73 74 75 76 77
// Output: aN holds column N, i.e. 0N 1N 2N 3N 4N 5N 6N 7N.
static INLINE void transpose_u16_8x8(uint16x8_t *a0, uint16x8_t *a1,
                                     uint16x8_t *a2, uint16x8_t *a3,
                                     uint16x8_t *a4, uint16x8_t *a5,
                                     uint16x8_t *a6, uint16x8_t *a7) {
  // Interleave 16 bit lanes of adjacent rows:
  //   t01.val[0]: 00 10 02 12 04 14 06 16
  //   t01.val[1]: 01 11 03 13 05 15 07 17
  //   t23.val[0]: 20 30 22 32 24 34 26 36
  //   t23.val[1]: 21 31 23 33 25 35 27 37
  //   t45.val[0]: 40 50 42 52 44 54 46 56
  //   t45.val[1]: 41 51 43 53 45 55 47 57
  //   t67.val[0]: 60 70 62 72 64 74 66 76
  //   t67.val[1]: 61 71 63 73 65 75 67 77
  const uint16x8x2_t t01 = vtrnq_u16(*a0, *a1);
  const uint16x8x2_t t23 = vtrnq_u16(*a2, *a3);
  const uint16x8x2_t t45 = vtrnq_u16(*a4, *a5);
  const uint16x8x2_t t67 = vtrnq_u16(*a6, *a7);

  // Interleave 32 bit lanes. "top" covers rows 0-3, "bot" covers rows 4-7:
  //   even_top.val[0]: 00 10 20 30 04 14 24 34
  //   even_top.val[1]: 02 12 22 32 06 16 26 36
  //   odd_top.val[0]:  01 11 21 31 05 15 25 35
  //   odd_top.val[1]:  03 13 23 33 07 17 27 37
  //   even_bot.val[0]: 40 50 60 70 44 54 64 74
  //   even_bot.val[1]: 42 52 62 72 46 56 66 76
  //   odd_bot.val[0]:  41 51 61 71 45 55 65 75
  //   odd_bot.val[1]:  43 53 63 73 47 57 67 77
  const uint32x4x2_t even_top = vtrnq_u32(vreinterpretq_u32_u16(t01.val[0]),
                                          vreinterpretq_u32_u16(t23.val[0]));
  const uint32x4x2_t odd_top = vtrnq_u32(vreinterpretq_u32_u16(t01.val[1]),
                                         vreinterpretq_u32_u16(t23.val[1]));
  const uint32x4x2_t even_bot = vtrnq_u32(vreinterpretq_u32_u16(t45.val[0]),
                                          vreinterpretq_u32_u16(t67.val[0]));
  const uint32x4x2_t odd_bot = vtrnq_u32(vreinterpretq_u32_u16(t45.val[1]),
                                         vreinterpretq_u32_u16(t67.val[1]));

  // Regroup 64 bit halves; colsXY.val[0] is column X, colsXY.val[1] is
  // column Y:
  //   cols04.val[0]: 00 10 20 30 40 50 60 70
  //   cols04.val[1]: 04 14 24 34 44 54 64 74
  //   cols15.val[0]: 01 11 21 31 41 51 61 71
  //   cols15.val[1]: 05 15 25 35 45 55 65 75
  //   cols26.val[0]: 02 12 22 32 42 52 62 72
  //   cols26.val[1]: 06 16 26 36 46 56 66 76
  //   cols37.val[0]: 03 13 23 33 43 53 63 73
  //   cols37.val[1]: 07 17 27 37 47 57 67 77
  const uint16x8x2_t cols04 =
      vpx_vtrnq_u64_to_u16(even_top.val[0], even_bot.val[0]);
  const uint16x8x2_t cols15 =
      vpx_vtrnq_u64_to_u16(odd_top.val[0], odd_bot.val[0]);
  const uint16x8x2_t cols26 =
      vpx_vtrnq_u64_to_u16(even_top.val[1], even_bot.val[1]);
  const uint16x8x2_t cols37 =
      vpx_vtrnq_u64_to_u16(odd_top.val[1], odd_bot.val[1]);

  *a0 = cols04.val[0];
  *a1 = cols15.val[0];
  *a2 = cols26.val[0];
  *a3 = cols37.val[0];
  *a4 = cols04.val[1];
  *a5 = cols15.val[1];
  *a6 = cols26.val[1];
  *a7 = cols37.val[1];
}
712
// In-place transpose of an 8x8 block of int32. Each row is an int32x4x2_t:
// val[0] holds columns 0-3 and val[1] holds columns 4-7.
// Input:
//   a0: 00 01 02 03 04 05 06 07
//   a1: 10 11 12 13 14 15 16 17
//   a2: 20 21 22 23 24 25 26 27
//   a3: 30 31 32 33 34 35 36 37
//   a4: 40 41 42 43 44 45 46 47
//   a5: 50 51 52 53 54 55 56 57
//   a6: 60 61 62 63 64 65 66 67
//   a7: 70 71 72 73 74 75 76 77
// Output:
//   a0: 00 10 20 30 40 50 60 70
//   a1: 01 11 21 31 41 51 61 71
//   a2: 02 12 22 32 42 52 62 72
//   a3: 03 13 23 33 43 53 63 73
//   a4: 04 14 24 34 44 54 64 74
//   a5: 05 15 25 35 45 55 65 75
//   a6: 06 16 26 36 46 56 66 76
//   a7: 07 17 27 37 47 57 67 77
static INLINE void transpose_s32_8x8(int32x4x2_t *a0, int32x4x2_t *a1,
                                     int32x4x2_t *a2, int32x4x2_t *a3,
                                     int32x4x2_t *a4, int32x4x2_t *a5,
                                     int32x4x2_t *a6, int32x4x2_t *a7) {
  // Interleave 32 bit lanes of adjacent rows, left half (columns 0-3) and
  // right half (columns 4-7) separately. Shown as val[0] | val[1]:
  //   l01: 00 10 02 12 | 01 11 03 13
  //   l23: 20 30 22 32 | 21 31 23 33
  //   l45: 40 50 42 52 | 41 51 43 53
  //   l67: 60 70 62 72 | 61 71 63 73
  //   r01: 04 14 06 16 | 05 15 07 17
  //   r23: 24 34 26 36 | 25 35 27 37
  //   r45: 44 54 46 56 | 45 55 47 57
  //   r67: 64 74 66 76 | 65 75 67 77
  const int32x4x2_t l01 = vtrnq_s32(a0->val[0], a1->val[0]);
  const int32x4x2_t l23 = vtrnq_s32(a2->val[0], a3->val[0]);
  const int32x4x2_t l45 = vtrnq_s32(a4->val[0], a5->val[0]);
  const int32x4x2_t l67 = vtrnq_s32(a6->val[0], a7->val[0]);
  const int32x4x2_t r01 = vtrnq_s32(a0->val[1], a1->val[1]);
  const int32x4x2_t r23 = vtrnq_s32(a2->val[1], a3->val[1]);
  const int32x4x2_t r45 = vtrnq_s32(a4->val[1], a5->val[1]);
  const int32x4x2_t r67 = vtrnq_s32(a6->val[1], a7->val[1]);

  // Regroup 64 bit halves. "top" covers rows 0-3, "bot" covers rows 4-7.
  // Shown as val[0] | val[1]:
  //   l_even_top: 00 10 20 30 | 02 12 22 32
  //   l_odd_top:  01 11 21 31 | 03 13 23 33
  //   l_even_bot: 40 50 60 70 | 42 52 62 72
  //   l_odd_bot:  41 51 61 71 | 43 53 63 73
  //   r_even_top: 04 14 24 34 | 06 16 26 36
  //   r_odd_top:  05 15 25 35 | 07 17 27 37
  //   r_even_bot: 44 54 64 74 | 46 56 66 76
  //   r_odd_bot:  45 55 65 75 | 47 57 67 77
  const int32x4x2_t l_even_top = vpx_vtrnq_s64_to_s32(l01.val[0], l23.val[0]);
  const int32x4x2_t l_odd_top = vpx_vtrnq_s64_to_s32(l01.val[1], l23.val[1]);
  const int32x4x2_t l_even_bot = vpx_vtrnq_s64_to_s32(l45.val[0], l67.val[0]);
  const int32x4x2_t l_odd_bot = vpx_vtrnq_s64_to_s32(l45.val[1], l67.val[1]);
  const int32x4x2_t r_even_top = vpx_vtrnq_s64_to_s32(r01.val[0], r23.val[0]);
  const int32x4x2_t r_odd_top = vpx_vtrnq_s64_to_s32(r01.val[1], r23.val[1]);
  const int32x4x2_t r_even_bot = vpx_vtrnq_s64_to_s32(r45.val[0], r67.val[0]);
  const int32x4x2_t r_odd_bot = vpx_vtrnq_s64_to_s32(r45.val[1], r67.val[1]);

  // Pair the top and bottom 128 bit pieces of every output row.
  a0->val[0] = l_even_top.val[0];
  a0->val[1] = l_even_bot.val[0];
  a1->val[0] = l_odd_top.val[0];
  a1->val[1] = l_odd_bot.val[0];
  a2->val[0] = l_even_top.val[1];
  a2->val[1] = l_even_bot.val[1];
  a3->val[0] = l_odd_top.val[1];
  a3->val[1] = l_odd_bot.val[1];
  a4->val[0] = r_even_top.val[0];
  a4->val[1] = r_even_bot.val[0];
  a5->val[0] = r_odd_top.val[0];
  a5->val[1] = r_odd_bot.val[0];
  a6->val[0] = r_even_top.val[1];
  a6->val[1] = r_even_bot.val[1];
  a7->val[0] = r_odd_top.val[1];
  a7->val[1] = r_odd_bot.val[1];
}
789
// Transposes a block of bytes that is 16 columns wide and 8 rows high.
//
// i0..i7 are the eight input rows (16 bytes each). On return *oN holds the
// eight bytes found in column N of the input, ordered from row 0 to row 7.
// In the lane diagrams below "RC" is the byte at row R, column C.
static INLINE void transpose_u8_16x8(
    const uint8x16_t i0, const uint8x16_t i1, const uint8x16_t i2,
    const uint8x16_t i3, const uint8x16_t i4, const uint8x16_t i5,
    const uint8x16_t i6, const uint8x16_t i7, uint8x8_t *o0, uint8x8_t *o1,
    uint8x8_t *o2, uint8x8_t *o3, uint8x8_t *o4, uint8x8_t *o5, uint8x8_t *o6,
    uint8x8_t *o7, uint8x8_t *o8, uint8x8_t *o9, uint8x8_t *o10, uint8x8_t *o11,
    uint8x8_t *o12, uint8x8_t *o13, uint8x8_t *o14, uint8x8_t *o15) {
  // Stage 1: 8-bit transpose of neighbouring rows. val[0] gathers the even
  // columns of the row pair and val[1] the odd columns:
  // pair_01.val[0]: 00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
  // pair_01.val[1]: 01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F
  // pair_23.val[0]: 20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E
  // pair_23.val[1]: 21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F
  // pair_45.val[0]: 40 50 42 52 44 54 46 56 48 58 4A 5A 4C 5C 4E 5E
  // pair_45.val[1]: 41 51 43 53 45 55 47 57 49 59 4B 5B 4D 5D 4F 5F
  // pair_67.val[0]: 60 70 62 72 64 74 66 76 68 78 6A 7A 6C 7C 6E 7E
  // pair_67.val[1]: 61 71 63 73 65 75 67 77 69 79 6B 7B 6D 7D 6F 7F
  const uint8x16x2_t pair_01 = vtrnq_u8(i0, i1);
  const uint8x16x2_t pair_23 = vtrnq_u8(i2, i3);
  const uint8x16x2_t pair_45 = vtrnq_u8(i4, i5);
  const uint8x16x2_t pair_67 = vtrnq_u8(i6, i7);

  // Stage 2: 16-bit transpose, merging two row pairs into a group of four
  // rows. "even"/"odd" says which stage 1 half the group was built from:
  // even_0123.val[0]: 00 10 20 30 04 14 24 34 08 18 28 38 0C 1C 2C 3C
  // even_0123.val[1]: 02 12 22 32 06 16 26 36 0A 1A 2A 3A 0E 1E 2E 3E
  // odd_0123.val[0]:  01 11 21 31 05 15 25 35 09 19 29 39 0D 1D 2D 3D
  // odd_0123.val[1]:  03 13 23 33 07 17 27 37 0B 1B 2B 3B 0F 1F 2F 3F
  // even_4567.val[0]: 40 50 60 70 44 54 64 74 48 58 68 78 4C 5C 6C 7C
  // even_4567.val[1]: 42 52 62 72 46 56 66 76 4A 5A 6A 7A 4E 5E 6E 7E
  // odd_4567.val[0]:  41 51 61 71 45 55 65 75 49 59 69 79 4D 5D 6D 7D
  // odd_4567.val[1]:  43 53 63 73 47 57 67 77 4B 5B 6B 7B 4F 5F 6F 7F
  const uint16x8x2_t even_0123 =
      vtrnq_u16(vreinterpretq_u16_u8(pair_01.val[0]),
                vreinterpretq_u16_u8(pair_23.val[0]));
  const uint16x8x2_t odd_0123 =
      vtrnq_u16(vreinterpretq_u16_u8(pair_01.val[1]),
                vreinterpretq_u16_u8(pair_23.val[1]));
  const uint16x8x2_t even_4567 =
      vtrnq_u16(vreinterpretq_u16_u8(pair_45.val[0]),
                vreinterpretq_u16_u8(pair_67.val[0]));
  const uint16x8x2_t odd_4567 =
      vtrnq_u16(vreinterpretq_u16_u8(pair_45.val[1]),
                vreinterpretq_u16_u8(pair_67.val[1]));

  // Stage 3: 32-bit transpose, merging rows 0-3 with rows 4-7. Every 64-bit
  // half is now one finished output row; cols_XY holds columns X and X+8 in
  // val[0] and columns Y and Y+8 in val[1]:
  // cols_04.val[0]: 00 10 20 30 40 50 60 70 08 18 28 38 48 58 68 78
  // cols_04.val[1]: 04 14 24 34 44 54 64 74 0C 1C 2C 3C 4C 5C 6C 7C
  // cols_26.val[0]: 02 12 22 32 42 52 62 72 0A 1A 2A 3A 4A 5A 6A 7A
  // cols_26.val[1]: 06 16 26 36 46 56 66 76 0E 1E 2E 3E 4E 5E 6E 7E
  // cols_15.val[0]: 01 11 21 31 41 51 61 71 09 19 29 39 49 59 69 79
  // cols_15.val[1]: 05 15 25 35 45 55 65 75 0D 1D 2D 3D 4D 5D 6D 7D
  // cols_37.val[0]: 03 13 23 33 43 53 63 73 0B 1B 2B 3B 4B 5B 6B 7B
  // cols_37.val[1]: 07 17 27 37 47 57 67 77 0F 1F 2F 3F 4F 5F 6F 7F
  const uint32x4x2_t cols_04 =
      vtrnq_u32(vreinterpretq_u32_u16(even_0123.val[0]),
                vreinterpretq_u32_u16(even_4567.val[0]));
  const uint32x4x2_t cols_26 =
      vtrnq_u32(vreinterpretq_u32_u16(even_0123.val[1]),
                vreinterpretq_u32_u16(even_4567.val[1]));
  const uint32x4x2_t cols_15 =
      vtrnq_u32(vreinterpretq_u32_u16(odd_0123.val[0]),
                vreinterpretq_u32_u16(odd_4567.val[0]));
  const uint32x4x2_t cols_37 =
      vtrnq_u32(vreinterpretq_u32_u16(odd_0123.val[1]),
                vreinterpretq_u32_u16(odd_4567.val[1]));

  // Byte views of the stage 3 vectors. col_N_M carries input column N in its
  // low half and input column M (= N + 8) in its high half.
  const uint8x16_t col_0_8 = vreinterpretq_u8_u32(cols_04.val[0]);
  const uint8x16_t col_1_9 = vreinterpretq_u8_u32(cols_15.val[0]);
  const uint8x16_t col_2_a = vreinterpretq_u8_u32(cols_26.val[0]);
  const uint8x16_t col_3_b = vreinterpretq_u8_u32(cols_37.val[0]);
  const uint8x16_t col_4_c = vreinterpretq_u8_u32(cols_04.val[1]);
  const uint8x16_t col_5_d = vreinterpretq_u8_u32(cols_15.val[1]);
  const uint8x16_t col_6_e = vreinterpretq_u8_u32(cols_26.val[1]);
  const uint8x16_t col_7_f = vreinterpretq_u8_u32(cols_37.val[1]);

  // Output: *oN = 0N 1N 2N 3N 4N 5N 6N 7N for N = 0..F.
  // Columns 0-7 come from the low halves.
  *o0 = vget_low_u8(col_0_8);
  *o1 = vget_low_u8(col_1_9);
  *o2 = vget_low_u8(col_2_a);
  *o3 = vget_low_u8(col_3_b);
  *o4 = vget_low_u8(col_4_c);
  *o5 = vget_low_u8(col_5_d);
  *o6 = vget_low_u8(col_6_e);
  *o7 = vget_low_u8(col_7_f);
  // Columns 8-F come from the high halves.
  *o8 = vget_high_u8(col_0_8);
  *o9 = vget_high_u8(col_1_9);
  *o10 = vget_high_u8(col_2_a);
  *o11 = vget_high_u8(col_3_b);
  *o12 = vget_high_u8(col_4_c);
  *o13 = vget_high_u8(col_5_d);
  *o14 = vget_high_u8(col_6_e);
  *o15 = vget_high_u8(col_7_f);
}
890
// Transposes a block of bytes that is 8 columns wide and 16 rows high.
//
// i0..i15 are the sixteen input rows (8 bytes each). On return *oN holds the
// sixteen bytes found in column N of the input, ordered from row 0 to row F.
// In the lane diagrams below "RC" is the byte at row R, column C.
static INLINE void transpose_u8_8x16(
    const uint8x8_t i0, const uint8x8_t i1, const uint8x8_t i2,
    const uint8x8_t i3, const uint8x8_t i4, const uint8x8_t i5,
    const uint8x8_t i6, const uint8x8_t i7, const uint8x8_t i8,
    const uint8x8_t i9, const uint8x8_t i10, const uint8x8_t i11,
    const uint8x8_t i12, const uint8x8_t i13, const uint8x8_t i14,
    const uint8x8_t i15, uint8x16_t *o0, uint8x16_t *o1, uint8x16_t *o2,
    uint8x16_t *o3, uint8x16_t *o4, uint8x16_t *o5, uint8x16_t *o6,
    uint8x16_t *o7) {
  // Stage 0: stack row N (low half) on top of row N + 8 (high half) so that
  // both halves of the block travel through the same shuffles:
  // rows_0_8: 00 01 02 03 04 05 06 07 80 81 82 83 84 85 86 87
  // rows_1_9: 10 11 12 13 14 15 16 17 90 91 92 93 94 95 96 97
  // rows_2_a: 20 21 22 23 24 25 26 27 A0 A1 A2 A3 A4 A5 A6 A7
  // rows_3_b: 30 31 32 33 34 35 36 37 B0 B1 B2 B3 B4 B5 B6 B7
  // rows_4_c: 40 41 42 43 44 45 46 47 C0 C1 C2 C3 C4 C5 C6 C7
  // rows_5_d: 50 51 52 53 54 55 56 57 D0 D1 D2 D3 D4 D5 D6 D7
  // rows_6_e: 60 61 62 63 64 65 66 67 E0 E1 E2 E3 E4 E5 E6 E7
  // rows_7_f: 70 71 72 73 74 75 76 77 F0 F1 F2 F3 F4 F5 F6 F7
  const uint8x16_t rows_0_8 = vcombine_u8(i0, i8);
  const uint8x16_t rows_1_9 = vcombine_u8(i1, i9);
  const uint8x16_t rows_2_a = vcombine_u8(i2, i10);
  const uint8x16_t rows_3_b = vcombine_u8(i3, i11);
  const uint8x16_t rows_4_c = vcombine_u8(i4, i12);
  const uint8x16_t rows_5_d = vcombine_u8(i5, i13);
  const uint8x16_t rows_6_e = vcombine_u8(i6, i14);
  const uint8x16_t rows_7_f = vcombine_u8(i7, i15);

  // Stage 1: 8-bit transpose of neighbouring stacked rows:
  // pair_01.val[0]: 00 10 02 12 04 14 06 16 80 90 82 92 84 94 86 96
  // pair_01.val[1]: 01 11 03 13 05 15 07 17 81 91 83 93 85 95 87 97
  // pair_23.val[0]: 20 30 22 32 24 34 26 36 A0 B0 A2 B2 A4 B4 A6 B6
  // pair_23.val[1]: 21 31 23 33 25 35 27 37 A1 B1 A3 B3 A5 B5 A7 B7
  // pair_45.val[0]: 40 50 42 52 44 54 46 56 C0 D0 C2 D2 C4 D4 C6 D6
  // pair_45.val[1]: 41 51 43 53 45 55 47 57 C1 D1 C3 D3 C5 D5 C7 D7
  // pair_67.val[0]: 60 70 62 72 64 74 66 76 E0 F0 E2 F2 E4 F4 E6 F6
  // pair_67.val[1]: 61 71 63 73 65 75 67 77 E1 F1 E3 F3 E5 F5 E7 F7
  const uint8x16x2_t pair_01 = vtrnq_u8(rows_0_8, rows_1_9);
  const uint8x16x2_t pair_23 = vtrnq_u8(rows_2_a, rows_3_b);
  const uint8x16x2_t pair_45 = vtrnq_u8(rows_4_c, rows_5_d);
  const uint8x16x2_t pair_67 = vtrnq_u8(rows_6_e, rows_7_f);

  // Stage 2: 16-bit transpose, merging two pairs into a group of four.
  // "even"/"odd" says which stage 1 half the group was built from:
  // even_0123.val[0]: 00 10 20 30 04 14 24 34 80 90 A0 B0 84 94 A4 B4
  // even_0123.val[1]: 02 12 22 32 06 16 26 36 82 92 A2 B2 86 96 A6 B6
  // odd_0123.val[0]:  01 11 21 31 05 15 25 35 81 91 A1 B1 85 95 A5 B5
  // odd_0123.val[1]:  03 13 23 33 07 17 27 37 83 93 A3 B3 87 97 A7 B7
  // even_4567.val[0]: 40 50 60 70 44 54 64 74 C0 D0 E0 F0 C4 D4 E4 F4
  // even_4567.val[1]: 42 52 62 72 46 56 66 76 C2 D2 E2 F2 C6 D6 E6 F6
  // odd_4567.val[0]:  41 51 61 71 45 55 65 75 C1 D1 E1 F1 C5 D5 E5 F5
  // odd_4567.val[1]:  43 53 63 73 47 57 67 77 C3 D3 E3 F3 C7 D7 E7 F7
  const uint16x8x2_t even_0123 =
      vtrnq_u16(vreinterpretq_u16_u8(pair_01.val[0]),
                vreinterpretq_u16_u8(pair_23.val[0]));
  const uint16x8x2_t odd_0123 =
      vtrnq_u16(vreinterpretq_u16_u8(pair_01.val[1]),
                vreinterpretq_u16_u8(pair_23.val[1]));
  const uint16x8x2_t even_4567 =
      vtrnq_u16(vreinterpretq_u16_u8(pair_45.val[0]),
                vreinterpretq_u16_u8(pair_67.val[0]));
  const uint16x8x2_t odd_4567 =
      vtrnq_u16(vreinterpretq_u16_u8(pair_45.val[1]),
                vreinterpretq_u16_u8(pair_67.val[1]));

  // Stage 3: 32-bit transpose, merging the two groups of four. cols_XY holds
  // the complete input column X in val[0] and column Y in val[1]:
  // cols_04.val[0]: 00 10 20 30 40 50 60 70 80 90 A0 B0 C0 D0 E0 F0
  // cols_04.val[1]: 04 14 24 34 44 54 64 74 84 94 A4 B4 C4 D4 E4 F4
  // cols_26.val[0]: 02 12 22 32 42 52 62 72 82 92 A2 B2 C2 D2 E2 F2
  // cols_26.val[1]: 06 16 26 36 46 56 66 76 86 96 A6 B6 C6 D6 E6 F6
  // cols_15.val[0]: 01 11 21 31 41 51 61 71 81 91 A1 B1 C1 D1 E1 F1
  // cols_15.val[1]: 05 15 25 35 45 55 65 75 85 95 A5 B5 C5 D5 E5 F5
  // cols_37.val[0]: 03 13 23 33 43 53 63 73 83 93 A3 B3 C3 D3 E3 F3
  // cols_37.val[1]: 07 17 27 37 47 57 67 77 87 97 A7 B7 C7 D7 E7 F7
  const uint32x4x2_t cols_04 =
      vtrnq_u32(vreinterpretq_u32_u16(even_0123.val[0]),
                vreinterpretq_u32_u16(even_4567.val[0]));
  const uint32x4x2_t cols_26 =
      vtrnq_u32(vreinterpretq_u32_u16(even_0123.val[1]),
                vreinterpretq_u32_u16(even_4567.val[1]));
  const uint32x4x2_t cols_15 =
      vtrnq_u32(vreinterpretq_u32_u16(odd_0123.val[0]),
                vreinterpretq_u32_u16(odd_4567.val[0]));
  const uint32x4x2_t cols_37 =
      vtrnq_u32(vreinterpretq_u32_u16(odd_0123.val[1]),
                vreinterpretq_u32_u16(odd_4567.val[1]));

  // Output: *oN = 0N 1N 2N 3N 4N 5N 6N 7N 8N 9N AN BN CN DN EN FN.
  *o0 = vreinterpretq_u8_u32(cols_04.val[0]);
  *o1 = vreinterpretq_u8_u32(cols_15.val[0]);
  *o2 = vreinterpretq_u8_u32(cols_26.val[0]);
  *o3 = vreinterpretq_u8_u32(cols_37.val[0]);
  *o4 = vreinterpretq_u8_u32(cols_04.val[1]);
  *o5 = vreinterpretq_u8_u32(cols_15.val[1]);
  *o6 = vreinterpretq_u8_u32(cols_26.val[1]);
  *o7 = vreinterpretq_u8_u32(cols_37.val[1]);
}
1003
// Transposes a 16x16 block of bytes.
//
// i0..i15 are the sixteen input rows (16 bytes each). On return *oN holds the
// sixteen bytes found in column N of the input, ordered from row 0 to row F.
// In the lane diagrams below "RC" is the byte at row R, column C.
static INLINE void transpose_u8_16x16(
    const uint8x16_t i0, const uint8x16_t i1, const uint8x16_t i2,
    const uint8x16_t i3, const uint8x16_t i4, const uint8x16_t i5,
    const uint8x16_t i6, const uint8x16_t i7, const uint8x16_t i8,
    const uint8x16_t i9, const uint8x16_t i10, const uint8x16_t i11,
    const uint8x16_t i12, const uint8x16_t i13, const uint8x16_t i14,
    const uint8x16_t i15, uint8x16_t *o0, uint8x16_t *o1, uint8x16_t *o2,
    uint8x16_t *o3, uint8x16_t *o4, uint8x16_t *o5, uint8x16_t *o6,
    uint8x16_t *o7, uint8x16_t *o8, uint8x16_t *o9, uint8x16_t *o10,
    uint8x16_t *o11, uint8x16_t *o12, uint8x16_t *o13, uint8x16_t *o14,
    uint8x16_t *o15) {
  // Stage 1: 8-bit transpose of neighbouring rows. val[0] gathers the even
  // columns of the row pair and val[1] the odd columns:
  // pair_01.val[0]: 00 10 02 12 04 14 06 16 08 18 0A 1A 0C 1C 0E 1E
  // pair_01.val[1]: 01 11 03 13 05 15 07 17 09 19 0B 1B 0D 1D 0F 1F
  // pair_23.val[0]: 20 30 22 32 24 34 26 36 28 38 2A 3A 2C 3C 2E 3E
  // pair_23.val[1]: 21 31 23 33 25 35 27 37 29 39 2B 3B 2D 3D 2F 3F
  // pair_45.val[0]: 40 50 42 52 44 54 46 56 48 58 4A 5A 4C 5C 4E 5E
  // pair_45.val[1]: 41 51 43 53 45 55 47 57 49 59 4B 5B 4D 5D 4F 5F
  // pair_67.val[0]: 60 70 62 72 64 74 66 76 68 78 6A 7A 6C 7C 6E 7E
  // pair_67.val[1]: 61 71 63 73 65 75 67 77 69 79 6B 7B 6D 7D 6F 7F
  // pair_89.val[0]: 80 90 82 92 84 94 86 96 88 98 8A 9A 8C 9C 8E 9E
  // pair_89.val[1]: 81 91 83 93 85 95 87 97 89 99 8B 9B 8D 9D 8F 9F
  // pair_ab.val[0]: A0 B0 A2 B2 A4 B4 A6 B6 A8 B8 AA BA AC BC AE BE
  // pair_ab.val[1]: A1 B1 A3 B3 A5 B5 A7 B7 A9 B9 AB BB AD BD AF BF
  // pair_cd.val[0]: C0 D0 C2 D2 C4 D4 C6 D6 C8 D8 CA DA CC DC CE DE
  // pair_cd.val[1]: C1 D1 C3 D3 C5 D5 C7 D7 C9 D9 CB DB CD DD CF DF
  // pair_ef.val[0]: E0 F0 E2 F2 E4 F4 E6 F6 E8 F8 EA FA EC FC EE FE
  // pair_ef.val[1]: E1 F1 E3 F3 E5 F5 E7 F7 E9 F9 EB FB ED FD EF FF
  const uint8x16x2_t pair_01 = vtrnq_u8(i0, i1);
  const uint8x16x2_t pair_23 = vtrnq_u8(i2, i3);
  const uint8x16x2_t pair_45 = vtrnq_u8(i4, i5);
  const uint8x16x2_t pair_67 = vtrnq_u8(i6, i7);
  const uint8x16x2_t pair_89 = vtrnq_u8(i8, i9);
  const uint8x16x2_t pair_ab = vtrnq_u8(i10, i11);
  const uint8x16x2_t pair_cd = vtrnq_u8(i12, i13);
  const uint8x16x2_t pair_ef = vtrnq_u8(i14, i15);

  // Stage 2: 16-bit transpose, merging two row pairs into a group of four
  // rows. "even"/"odd" says which stage 1 half the group was built from:
  // even_0123.val[0]: 00 10 20 30 04 14 24 34 08 18 28 38 0C 1C 2C 3C
  // even_0123.val[1]: 02 12 22 32 06 16 26 36 0A 1A 2A 3A 0E 1E 2E 3E
  // odd_0123.val[0]:  01 11 21 31 05 15 25 35 09 19 29 39 0D 1D 2D 3D
  // odd_0123.val[1]:  03 13 23 33 07 17 27 37 0B 1B 2B 3B 0F 1F 2F 3F
  // even_4567.val[0]: 40 50 60 70 44 54 64 74 48 58 68 78 4C 5C 6C 7C
  // even_4567.val[1]: 42 52 62 72 46 56 66 76 4A 5A 6A 7A 4E 5E 6E 7E
  // odd_4567.val[0]:  41 51 61 71 45 55 65 75 49 59 69 79 4D 5D 6D 7D
  // odd_4567.val[1]:  43 53 63 73 47 57 67 77 4B 5B 6B 7B 4F 5F 6F 7F
  // even_89ab.val[0]: 80 90 A0 B0 84 94 A4 B4 88 98 A8 B8 8C 9C AC BC
  // even_89ab.val[1]: 82 92 A2 B2 86 96 A6 B6 8A 9A AA BA 8E 9E AE BE
  // odd_89ab.val[0]:  81 91 A1 B1 85 95 A5 B5 89 99 A9 B9 8D 9D AD BD
  // odd_89ab.val[1]:  83 93 A3 B3 87 97 A7 B7 8B 9B AB BB 8F 9F AF BF
  // even_cdef.val[0]: C0 D0 E0 F0 C4 D4 E4 F4 C8 D8 E8 F8 CC DC EC FC
  // even_cdef.val[1]: C2 D2 E2 F2 C6 D6 E6 F6 CA DA EA FA CE DE EE FE
  // odd_cdef.val[0]:  C1 D1 E1 F1 C5 D5 E5 F5 C9 D9 E9 F9 CD DD ED FD
  // odd_cdef.val[1]:  C3 D3 E3 F3 C7 D7 E7 F7 CB DB EB FB CF DF EF FF
  const uint16x8x2_t even_0123 =
      vtrnq_u16(vreinterpretq_u16_u8(pair_01.val[0]),
                vreinterpretq_u16_u8(pair_23.val[0]));
  const uint16x8x2_t odd_0123 =
      vtrnq_u16(vreinterpretq_u16_u8(pair_01.val[1]),
                vreinterpretq_u16_u8(pair_23.val[1]));
  const uint16x8x2_t even_4567 =
      vtrnq_u16(vreinterpretq_u16_u8(pair_45.val[0]),
                vreinterpretq_u16_u8(pair_67.val[0]));
  const uint16x8x2_t odd_4567 =
      vtrnq_u16(vreinterpretq_u16_u8(pair_45.val[1]),
                vreinterpretq_u16_u8(pair_67.val[1]));
  const uint16x8x2_t even_89ab =
      vtrnq_u16(vreinterpretq_u16_u8(pair_89.val[0]),
                vreinterpretq_u16_u8(pair_ab.val[0]));
  const uint16x8x2_t odd_89ab =
      vtrnq_u16(vreinterpretq_u16_u8(pair_89.val[1]),
                vreinterpretq_u16_u8(pair_ab.val[1]));
  const uint16x8x2_t even_cdef =
      vtrnq_u16(vreinterpretq_u16_u8(pair_cd.val[0]),
                vreinterpretq_u16_u8(pair_ef.val[0]));
  const uint16x8x2_t odd_cdef =
      vtrnq_u16(vreinterpretq_u16_u8(pair_cd.val[1]),
                vreinterpretq_u16_u8(pair_ef.val[1]));

  // Stage 3: 32-bit transpose, merging groups of four rows into groups of
  // eight. "top" covers input rows 0-7 and "bot" rows 8-F; the suffix XY
  // means columns X and X+8 sit in val[0], columns Y and Y+8 in val[1]:
  // top_04.val[0]: 00 10 20 30 40 50 60 70 08 18 28 38 48 58 68 78
  // top_04.val[1]: 04 14 24 34 44 54 64 74 0C 1C 2C 3C 4C 5C 6C 7C
  // top_26.val[0]: 02 12 22 32 42 52 62 72 0A 1A 2A 3A 4A 5A 6A 7A
  // top_26.val[1]: 06 16 26 36 46 56 66 76 0E 1E 2E 3E 4E 5E 6E 7E
  // top_15.val[0]: 01 11 21 31 41 51 61 71 09 19 29 39 49 59 69 79
  // top_15.val[1]: 05 15 25 35 45 55 65 75 0D 1D 2D 3D 4D 5D 6D 7D
  // top_37.val[0]: 03 13 23 33 43 53 63 73 0B 1B 2B 3B 4B 5B 6B 7B
  // top_37.val[1]: 07 17 27 37 47 57 67 77 0F 1F 2F 3F 4F 5F 6F 7F
  // bot_04.val[0]: 80 90 A0 B0 C0 D0 E0 F0 88 98 A8 B8 C8 D8 E8 F8
  // bot_04.val[1]: 84 94 A4 B4 C4 D4 E4 F4 8C 9C AC BC CC DC EC FC
  // bot_26.val[0]: 82 92 A2 B2 C2 D2 E2 F2 8A 9A AA BA CA DA EA FA
  // bot_26.val[1]: 86 96 A6 B6 C6 D6 E6 F6 8E 9E AE BE CE DE EE FE
  // bot_15.val[0]: 81 91 A1 B1 C1 D1 E1 F1 89 99 A9 B9 C9 D9 E9 F9
  // bot_15.val[1]: 85 95 A5 B5 C5 D5 E5 F5 8D 9D AD BD CD DD ED FD
  // bot_37.val[0]: 83 93 A3 B3 C3 D3 E3 F3 8B 9B AB BB CB DB EB FB
  // bot_37.val[1]: 87 97 A7 B7 C7 D7 E7 F7 8F 9F AF BF CF DF EF FF
  const uint32x4x2_t top_04 =
      vtrnq_u32(vreinterpretq_u32_u16(even_0123.val[0]),
                vreinterpretq_u32_u16(even_4567.val[0]));
  const uint32x4x2_t top_26 =
      vtrnq_u32(vreinterpretq_u32_u16(even_0123.val[1]),
                vreinterpretq_u32_u16(even_4567.val[1]));
  const uint32x4x2_t top_15 =
      vtrnq_u32(vreinterpretq_u32_u16(odd_0123.val[0]),
                vreinterpretq_u32_u16(odd_4567.val[0]));
  const uint32x4x2_t top_37 =
      vtrnq_u32(vreinterpretq_u32_u16(odd_0123.val[1]),
                vreinterpretq_u32_u16(odd_4567.val[1]));
  const uint32x4x2_t bot_04 =
      vtrnq_u32(vreinterpretq_u32_u16(even_89ab.val[0]),
                vreinterpretq_u32_u16(even_cdef.val[0]));
  const uint32x4x2_t bot_26 =
      vtrnq_u32(vreinterpretq_u32_u16(even_89ab.val[1]),
                vreinterpretq_u32_u16(even_cdef.val[1]));
  const uint32x4x2_t bot_15 =
      vtrnq_u32(vreinterpretq_u32_u16(odd_89ab.val[0]),
                vreinterpretq_u32_u16(odd_cdef.val[0]));
  const uint32x4x2_t bot_37 =
      vtrnq_u32(vreinterpretq_u32_u16(odd_89ab.val[1]),
                vreinterpretq_u32_u16(odd_cdef.val[1]));

  // Stage 4: 64-bit transpose joining each "top" vector with the matching
  // "bot" vector. As the diagrams show, val[0] pairs the two low halves
  // (column N) and val[1] the two high halves (column N + 8):
  // cols_0_8.val[0]: 00 10 20 30 40 50 60 70 80 90 A0 B0 C0 D0 E0 F0
  // cols_0_8.val[1]: 08 18 28 38 48 58 68 78 88 98 A8 B8 C8 D8 E8 F8
  // cols_1_9.val[0]: 01 11 21 31 41 51 61 71 81 91 A1 B1 C1 D1 E1 F1
  // cols_1_9.val[1]: 09 19 29 39 49 59 69 79 89 99 A9 B9 C9 D9 E9 F9
  // cols_2_a.val[0]: 02 12 22 32 42 52 62 72 82 92 A2 B2 C2 D2 E2 F2
  // cols_2_a.val[1]: 0A 1A 2A 3A 4A 5A 6A 7A 8A 9A AA BA CA DA EA FA
  // cols_3_b.val[0]: 03 13 23 33 43 53 63 73 83 93 A3 B3 C3 D3 E3 F3
  // cols_3_b.val[1]: 0B 1B 2B 3B 4B 5B 6B 7B 8B 9B AB BB CB DB EB FB
  // cols_4_c.val[0]: 04 14 24 34 44 54 64 74 84 94 A4 B4 C4 D4 E4 F4
  // cols_4_c.val[1]: 0C 1C 2C 3C 4C 5C 6C 7C 8C 9C AC BC CC DC EC FC
  // cols_5_d.val[0]: 05 15 25 35 45 55 65 75 85 95 A5 B5 C5 D5 E5 F5
  // cols_5_d.val[1]: 0D 1D 2D 3D 4D 5D 6D 7D 8D 9D AD BD CD DD ED FD
  // cols_6_e.val[0]: 06 16 26 36 46 56 66 76 86 96 A6 B6 C6 D6 E6 F6
  // cols_6_e.val[1]: 0E 1E 2E 3E 4E 5E 6E 7E 8E 9E AE BE CE DE EE FE
  // cols_7_f.val[0]: 07 17 27 37 47 57 67 77 87 97 A7 B7 C7 D7 E7 F7
  // cols_7_f.val[1]: 0F 1F 2F 3F 4F 5F 6F 7F 8F 9F AF BF CF DF EF FF
  const uint8x16x2_t cols_0_8 =
      vpx_vtrnq_u64_to_u8(top_04.val[0], bot_04.val[0]);
  const uint8x16x2_t cols_1_9 =
      vpx_vtrnq_u64_to_u8(top_15.val[0], bot_15.val[0]);
  const uint8x16x2_t cols_2_a =
      vpx_vtrnq_u64_to_u8(top_26.val[0], bot_26.val[0]);
  const uint8x16x2_t cols_3_b =
      vpx_vtrnq_u64_to_u8(top_37.val[0], bot_37.val[0]);
  const uint8x16x2_t cols_4_c =
      vpx_vtrnq_u64_to_u8(top_04.val[1], bot_04.val[1]);
  const uint8x16x2_t cols_5_d =
      vpx_vtrnq_u64_to_u8(top_15.val[1], bot_15.val[1]);
  const uint8x16x2_t cols_6_e =
      vpx_vtrnq_u64_to_u8(top_26.val[1], bot_26.val[1]);
  const uint8x16x2_t cols_7_f =
      vpx_vtrnq_u64_to_u8(top_37.val[1], bot_37.val[1]);

  // Output: *oN = 0N 1N 2N 3N 4N 5N 6N 7N 8N 9N AN BN CN DN EN FN.
  // Columns 0-7 are the val[0] halves of stage 4.
  *o0 = cols_0_8.val[0];
  *o1 = cols_1_9.val[0];
  *o2 = cols_2_a.val[0];
  *o3 = cols_3_b.val[0];
  *o4 = cols_4_c.val[0];
  *o5 = cols_5_d.val[0];
  *o6 = cols_6_e.val[0];
  *o7 = cols_7_f.val[0];
  // Columns 8-F are the val[1] halves of stage 4.
  *o8 = cols_0_8.val[1];
  *o9 = cols_1_9.val[1];
  *o10 = cols_2_a.val[1];
  *o11 = cols_3_b.val[1];
  *o12 = cols_4_c.val[1];
  *o13 = cols_5_d.val[1];
  *o14 = cols_6_e.val[1];
  *o15 = cols_7_f.val[1];
}
1186
// Loads eight rows of eight bytes each, starting at `a` and spaced `a_stride`
// bytes apart, then passes them to transpose_u8_4x8().
//
// Rows 0-3 are loaded directly into *a0..*a3, which transpose_u8_4x8()
// receives by pointer. Rows 4-7 are kept in locals and passed by value.
static INLINE void load_and_transpose_u8_4x8(const uint8_t *a,
                                             const int a_stride, uint8x8_t *a0,
                                             uint8x8_t *a1, uint8x8_t *a2,
                                             uint8x8_t *a3) {
  const uint8_t *row = a;  // Read cursor, advanced one row per load.
  uint8x8_t row4, row5, row6, row7;

  *a0 = vld1_u8(row);
  row += a_stride;
  *a1 = vld1_u8(row);
  row += a_stride;
  *a2 = vld1_u8(row);
  row += a_stride;
  *a3 = vld1_u8(row);
  row += a_stride;
  row4 = vld1_u8(row);
  row += a_stride;
  row5 = vld1_u8(row);
  row += a_stride;
  row6 = vld1_u8(row);
  row += a_stride;
  row7 = vld1_u8(row);

  transpose_u8_4x8(a0, a1, a2, a3, row4, row5, row6, row7);
}
1210
// Loads eight rows of eight bytes each, starting at `a` and spaced `a_stride`
// bytes apart, into *a0..*a7 (row N goes to *aN), then runs
// transpose_u8_8x8() on those eight vectors in place.
static INLINE void load_and_transpose_u8_8x8(const uint8_t *a,
                                             const int a_stride, uint8x8_t *a0,
                                             uint8x8_t *a1, uint8x8_t *a2,
                                             uint8x8_t *a3, uint8x8_t *a4,
                                             uint8x8_t *a5, uint8x8_t *a6,
                                             uint8x8_t *a7) {
  const uint8_t *row = a;  // Read cursor, advanced one row per load.

  *a0 = vld1_u8(row);
  row += a_stride;
  *a1 = vld1_u8(row);
  row += a_stride;
  *a2 = vld1_u8(row);
  row += a_stride;
  *a3 = vld1_u8(row);
  row += a_stride;
  *a4 = vld1_u8(row);
  row += a_stride;
  *a5 = vld1_u8(row);
  row += a_stride;
  *a6 = vld1_u8(row);
  row += a_stride;
  *a7 = vld1_u8(row);

  transpose_u8_8x8(a0, a1, a2, a3, a4, a5, a6, a7);
}
1235
// Runs transpose_u8_8x8() on the eight vectors a0..a7 and writes the results
// as eight rows of eight bytes each, starting at `a` and spaced `a_stride`
// bytes apart (a0 first, a7 last).
//
// The vectors are passed by value, so the caller's copies are not modified.
static INLINE void transpose_and_store_u8_8x8(uint8_t *a, const int a_stride,
                                              uint8x8_t a0, uint8x8_t a1,
                                              uint8x8_t a2, uint8x8_t a3,
                                              uint8x8_t a4, uint8x8_t a5,
                                              uint8x8_t a6, uint8x8_t a7) {
  uint8_t *row = a;  // Write cursor, advanced one row per store.

  transpose_u8_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);

  vst1_u8(row, a0);
  row += a_stride;
  vst1_u8(row, a1);
  row += a_stride;
  vst1_u8(row, a2);
  row += a_stride;
  vst1_u8(row, a3);
  row += a_stride;
  vst1_u8(row, a4);
  row += a_stride;
  vst1_u8(row, a5);
  row += a_stride;
  vst1_u8(row, a6);
  row += a_stride;
  vst1_u8(row, a7);
}
1259
// Loads eight rows of eight int16_t values each into *a0..*a7 (row N goes to
// *aN), then runs transpose_s16_8x8() on those eight vectors in place.
//
// `a_stride` is the distance between rows counted in int16_t elements, not
// bytes, because it is added to an int16_t pointer.
static INLINE void load_and_transpose_s16_8x8(const int16_t *a,
                                              const int a_stride, int16x8_t *a0,
                                              int16x8_t *a1, int16x8_t *a2,
                                              int16x8_t *a3, int16x8_t *a4,
                                              int16x8_t *a5, int16x8_t *a6,
                                              int16x8_t *a7) {
  const int16_t *row = a;  // Read cursor, advanced one row per load.

  *a0 = vld1q_s16(row);
  row += a_stride;
  *a1 = vld1q_s16(row);
  row += a_stride;
  *a2 = vld1q_s16(row);
  row += a_stride;
  *a3 = vld1q_s16(row);
  row += a_stride;
  *a4 = vld1q_s16(row);
  row += a_stride;
  *a5 = vld1q_s16(row);
  row += a_stride;
  *a6 = vld1q_s16(row);
  row += a_stride;
  *a7 = vld1q_s16(row);

  transpose_s16_8x8(a0, a1, a2, a3, a4, a5, a6, a7);
}
1284
// Loads eight rows of eight int32_t values each into *a0..*a7 (row N goes to
// *aN), then runs transpose_s32_8x8() on those eight vector pairs in place.
//
// Each row is split across the two halves of an int32x4x2_t: val[0] receives
// elements 0-3 of the row and val[1] elements 4-7. `a_stride` is the distance
// between rows counted in int32_t elements, not bytes, because it is added to
// an int32_t pointer.
static INLINE void load_and_transpose_s32_8x8(
    const int32_t *a, const int a_stride, int32x4x2_t *const a0,
    int32x4x2_t *const a1, int32x4x2_t *const a2, int32x4x2_t *const a3,
    int32x4x2_t *const a4, int32x4x2_t *const a5, int32x4x2_t *const a6,
    int32x4x2_t *const a7) {
  const int32_t *row = a;  // Read cursor, advanced one row per pair of loads.

  a0->val[0] = vld1q_s32(row);
  a0->val[1] = vld1q_s32(row + 4);
  row += a_stride;
  a1->val[0] = vld1q_s32(row);
  a1->val[1] = vld1q_s32(row + 4);
  row += a_stride;
  a2->val[0] = vld1q_s32(row);
  a2->val[1] = vld1q_s32(row + 4);
  row += a_stride;
  a3->val[0] = vld1q_s32(row);
  a3->val[1] = vld1q_s32(row + 4);
  row += a_stride;
  a4->val[0] = vld1q_s32(row);
  a4->val[1] = vld1q_s32(row + 4);
  row += a_stride;
  a5->val[0] = vld1q_s32(row);
  a5->val[1] = vld1q_s32(row + 4);
  row += a_stride;
  a6->val[0] = vld1q_s32(row);
  a6->val[1] = vld1q_s32(row + 4);
  row += a_stride;
  a7->val[0] = vld1q_s32(row);
  a7->val[1] = vld1q_s32(row + 4);

  transpose_s32_8x8(a0, a1, a2, a3, a4, a5, a6, a7);
}
1316 #endif // VPX_VPX_DSP_ARM_TRANSPOSE_NEON_H_
1317