1 /*
2 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <arm_neon.h>
12
13 #include "./vpx_config.h"
14 #include "vpx_dsp/txfm_common.h"
15
/* Load one pair of 8-lane coefficient rows from the transposed scratch
 * buffer into the q14/q13 registers used by DO_BUTTERFLY_STD.
 * `prev` is unused: it is a scheduling hint carried over from the
 * hand-written assembly original, kept so call sites read the same. */
#define LOAD_FROM_TRANSPOSED(prev, first, second) \
  q14s16 = vld1q_s16(trans_buf + first * 8);      \
  q13s16 = vld1q_s16(trans_buf + second * 8);

/* Load rows `first` and `second` (stride 32 int16s) from the output
 * buffer into qA/qB.  `prev` is unused (see LOAD_FROM_TRANSPOSED). */
#define LOAD_FROM_OUTPUT(prev, first, second, qA, qB) \
  qA = vld1q_s16(out + first * 32);                   \
  qB = vld1q_s16(out + second * 32);

/* Store qA/qB to rows `first` and `second` of the output buffer.
 * `prev` is unused (see LOAD_FROM_TRANSPOSED). */
#define STORE_IN_OUTPUT(prev, first, second, qA, qB) \
  vst1q_s16(out + first * 32, qA);                   \
  vst1q_s16(out + second * 32, qB);

/* Second-pass shorthand: combine four center rows with the prediction.
 * Expands the implicit q6..q9/stride operands used at every call site. */
#define STORE_COMBINE_CENTER_RESULTS(r10, r9) \
  __STORE_COMBINE_CENTER_RESULTS(r10, r9, stride, q6s16, q7s16, q8s16, q9s16);
/* Round four residual rows, add them to the prediction pixels of the four
 * rows straddling the block center, saturate to uint8 and write back.
 * p1 walks downward (rows 16, 17), p2 walks upward (rows 15, 14).
 *
 * NOTE(review): the pixel loads/stores go through vld1_s16/vst1_s16 with a
 * pointer cast — 8 prediction bytes are reinterpreted as int16x4 lanes and
 * later widened by vaddw_u8.  This bit-level reinterpretation is inherited
 * from the assembly version; the casts assume the unaligned-access behavior
 * of the NEON load/store instructions.
 * NOTE(review): the leading double underscore makes this a reserved
 * identifier; renaming it would also require touching the wrapper macro. */
static INLINE void __STORE_COMBINE_CENTER_RESULTS(
    uint8_t *p1,
    uint8_t *p2,
    int stride,
    int16x8_t q6s16,
    int16x8_t q7s16,
    int16x8_t q8s16,
    int16x8_t q9s16) {
  int16x4_t d8s16, d9s16, d10s16, d11s16;

  // Load 8 prediction pixels from each of the four center rows.
  d8s16 = vld1_s16((int16_t *)p1);
  p1 += stride;
  d11s16 = vld1_s16((int16_t *)p2);
  p2 -= stride;
  d9s16 = vld1_s16((int16_t *)p1);
  d10s16 = vld1_s16((int16_t *)p2);

  // Final IDCT rounding: (x + 32) >> 6, rounding toward nearest.
  q7s16 = vrshrq_n_s16(q7s16, 6);
  q8s16 = vrshrq_n_s16(q8s16, 6);
  q9s16 = vrshrq_n_s16(q9s16, 6);
  q6s16 = vrshrq_n_s16(q6s16, 6);

  // Widen the prediction bytes to 16 bits and add the residual.
  q7s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q7s16),
                                         vreinterpret_u8_s16(d9s16)));
  q8s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q8s16),
                                         vreinterpret_u8_s16(d10s16)));
  q9s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q9s16),
                                         vreinterpret_u8_s16(d11s16)));
  q6s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q6s16),
                                         vreinterpret_u8_s16(d8s16)));

  // Saturating narrow back to unsigned 8-bit pixels ([0, 255]).
  d9s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16));
  d10s16 = vreinterpret_s16_u8(vqmovun_s16(q8s16));
  d11s16 = vreinterpret_s16_u8(vqmovun_s16(q9s16));
  d8s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16));

  // Store the reconstructed rows back, retracing the load order.
  vst1_s16((int16_t *)p1, d9s16);
  p1 -= stride;
  vst1_s16((int16_t *)p2, d10s16);
  p2 += stride;
  vst1_s16((int16_t *)p1, d8s16);
  vst1_s16((int16_t *)p2, d11s16);
}
75
/* Second-pass shorthand: combine the outermost rows with the prediction.
 * Expands the implicit q4..q7/stride operands used at every call site.
 * (A stray ';' after the parameter list in the original made every
 * expansion start with an empty statement; removed.) */
#define STORE_COMBINE_EXTREME_RESULTS(r7, r6) \
  __STORE_COMBINE_EXTREME_RESULTS(r7, r6, stride, q4s16, q5s16, q6s16, q7s16);
/* Round four residual rows, add them to the prediction pixels of the
 * outermost rows of the block, saturate to uint8 and write back.
 * p1 walks downward from the top (rows 0, 1), p2 walks upward from the
 * bottom (rows 31, 30).  Mirrors __STORE_COMBINE_CENTER_RESULTS; see the
 * notes there on the int16 reinterpretation of pixel bytes and the
 * reserved-identifier name. */
static INLINE void __STORE_COMBINE_EXTREME_RESULTS(
    uint8_t *p1,
    uint8_t *p2,
    int stride,
    int16x8_t q4s16,
    int16x8_t q5s16,
    int16x8_t q6s16,
    int16x8_t q7s16) {
  int16x4_t d4s16, d5s16, d6s16, d7s16;

  // Load 8 prediction pixels from each of the four extreme rows.
  d4s16 = vld1_s16((int16_t *)p1);
  p1 += stride;
  d7s16 = vld1_s16((int16_t *)p2);
  p2 -= stride;
  d5s16 = vld1_s16((int16_t *)p1);
  d6s16 = vld1_s16((int16_t *)p2);

  // Final IDCT rounding: (x + 32) >> 6.
  q5s16 = vrshrq_n_s16(q5s16, 6);
  q6s16 = vrshrq_n_s16(q6s16, 6);
  q7s16 = vrshrq_n_s16(q7s16, 6);
  q4s16 = vrshrq_n_s16(q4s16, 6);

  // Widen the prediction bytes to 16 bits and add the residual.
  q5s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q5s16),
                                         vreinterpret_u8_s16(d5s16)));
  q6s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q6s16),
                                         vreinterpret_u8_s16(d6s16)));
  q7s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q7s16),
                                         vreinterpret_u8_s16(d7s16)));
  q4s16 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q4s16),
                                         vreinterpret_u8_s16(d4s16)));

  // Saturating narrow back to unsigned 8-bit pixels ([0, 255]).
  d5s16 = vreinterpret_s16_u8(vqmovun_s16(q5s16));
  d6s16 = vreinterpret_s16_u8(vqmovun_s16(q6s16));
  d7s16 = vreinterpret_s16_u8(vqmovun_s16(q7s16));
  d4s16 = vreinterpret_s16_u8(vqmovun_s16(q4s16));

  // Store the reconstructed rows back, retracing the load order.
  vst1_s16((int16_t *)p1, d5s16);
  p1 -= stride;
  vst1_s16((int16_t *)p2, d6s16);
  p2 += stride;
  vst1_s16((int16_t *)p2, d7s16);
  vst1_s16((int16_t *)p1, d4s16);
}
123
/* Butterfly on the implicit q14/q13 inputs loaded by LOAD_FROM_TRANSPOSED
 * or computed in place; writes the two rotated outputs through qA/qB. */
#define DO_BUTTERFLY_STD(const_1, const_2, qA, qB) \
  DO_BUTTERFLY(q14s16, q13s16, const_1, const_2, qA, qB);
/* Fixed-point butterfly rotation used throughout the 32-point IDCT:
 *   *qAs16 = round((q14 * first_const - q13 * second_const) >> 14)
 *   *qBs16 = round((q14 * second_const + q13 * first_const) >> 14)
 * computed per 16-bit lane with 32-bit intermediates and saturating
 * rounded narrowing (vqrshrn_n_s32, DCT_CONST_BITS == 14). */
static INLINE void DO_BUTTERFLY(
    int16x8_t q14s16,
    int16x8_t q13s16,
    int16_t first_const,
    int16_t second_const,
    int16x8_t *qAs16,
    int16x8_t *qBs16) {
  int16x4_t d30s16, d31s16;
  int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q15s32;
  int16x4_t dCs16, dDs16, dAs16, dBs16;

  // Split the 8-lane inputs into low/high halves for 32-bit multiplies.
  dCs16 = vget_low_s16(q14s16);
  dDs16 = vget_high_s16(q14s16);
  dAs16 = vget_low_s16(q13s16);
  dBs16 = vget_high_s16(q13s16);

  // Broadcast the two rotation constants.
  d30s16 = vdup_n_s16(first_const);
  d31s16 = vdup_n_s16(second_const);

  // qA = q14 * c1 - q13 * c2 (low half in q8, high half in q9).
  q8s32 = vmull_s16(dCs16, d30s16);
  q10s32 = vmull_s16(dAs16, d31s16);
  q9s32 = vmull_s16(dDs16, d30s16);
  q11s32 = vmull_s16(dBs16, d31s16);
  q12s32 = vmull_s16(dCs16, d31s16);

  q8s32 = vsubq_s32(q8s32, q10s32);
  q9s32 = vsubq_s32(q9s32, q11s32);

  // qB = q14 * c2 + q13 * c1 (low half in q11, high half in q10).
  q10s32 = vmull_s16(dDs16, d31s16);
  q11s32 = vmull_s16(dAs16, d30s16);
  q15s32 = vmull_s16(dBs16, d30s16);

  q11s32 = vaddq_s32(q12s32, q11s32);
  q10s32 = vaddq_s32(q10s32, q15s32);

  // Rounded, saturating shift back to 16 bits (>> DCT_CONST_BITS).
  *qAs16 = vcombine_s16(vqrshrn_n_s32(q8s32, 14),
                        vqrshrn_n_s32(q9s32, 14));
  *qBs16 = vcombine_s16(vqrshrn_n_s32(q11s32, 14),
                        vqrshrn_n_s32(q10s32, 14));
}
167
/* Transpose a 32x8 slice of the input into t_buf as four transposed 8x8
 * tiles laid out back to back (t_buf must hold 32*8 int16s).  Each 8x8
 * tile is transposed with the classic vswp/vtrn ladder: combine low/high
 * halves to emulate d-register swaps, then 32-bit and 16-bit trn steps. */
static INLINE void idct32_transpose_pair(
    int16_t *input,
    int16_t *t_buf) {
  int16_t *in;
  int i;
  const int stride = 32;  // input rows are 32 coefficients apart
  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
  int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
  int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
  int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;

  for (i = 0; i < 4; i++, input += 8) {  // one 8x8 tile per iteration
    // Load the 8 rows of this tile.
    in = input;
    q8s16 = vld1q_s16(in);
    in += stride;
    q9s16 = vld1q_s16(in);
    in += stride;
    q10s16 = vld1q_s16(in);
    in += stride;
    q11s16 = vld1q_s16(in);
    in += stride;
    q12s16 = vld1q_s16(in);
    in += stride;
    q13s16 = vld1q_s16(in);
    in += stride;
    q14s16 = vld1q_s16(in);
    in += stride;
    q15s16 = vld1q_s16(in);

    d16s16 = vget_low_s16(q8s16);
    d17s16 = vget_high_s16(q8s16);
    d18s16 = vget_low_s16(q9s16);
    d19s16 = vget_high_s16(q9s16);
    d20s16 = vget_low_s16(q10s16);
    d21s16 = vget_high_s16(q10s16);
    d22s16 = vget_low_s16(q11s16);
    d23s16 = vget_high_s16(q11s16);
    d24s16 = vget_low_s16(q12s16);
    d25s16 = vget_high_s16(q12s16);
    d26s16 = vget_low_s16(q13s16);
    d27s16 = vget_high_s16(q13s16);
    d28s16 = vget_low_s16(q14s16);
    d29s16 = vget_high_s16(q14s16);
    d30s16 = vget_low_s16(q15s16);
    d31s16 = vget_high_s16(q15s16);

    // Swap 64-bit halves across the register pairs (assembly vswp).
    q8s16 = vcombine_s16(d16s16, d24s16);   // vswp d17, d24
    q9s16 = vcombine_s16(d18s16, d26s16);   // vswp d19, d26
    q10s16 = vcombine_s16(d20s16, d28s16);  // vswp d21, d28
    q11s16 = vcombine_s16(d22s16, d30s16);  // vswp d23, d30
    q12s16 = vcombine_s16(d17s16, d25s16);
    q13s16 = vcombine_s16(d19s16, d27s16);
    q14s16 = vcombine_s16(d21s16, d29s16);
    q15s16 = vcombine_s16(d23s16, d31s16);

    // Interleave 32-bit elements.
    q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16),
                        vreinterpretq_s32_s16(q10s16));
    q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q9s16),
                        vreinterpretq_s32_s16(q11s16));
    q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q12s16),
                        vreinterpretq_s32_s16(q14s16));
    q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q13s16),
                        vreinterpretq_s32_s16(q15s16));

    // Interleave 16-bit elements to complete the transpose.
    q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]),   // q8
                        vreinterpretq_s16_s32(q1x2s32.val[0]));  // q9
    q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]),   // q10
                        vreinterpretq_s16_s32(q1x2s32.val[1]));  // q11
    q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]),   // q12
                        vreinterpretq_s16_s32(q3x2s32.val[0]));  // q13
    q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]),   // q14
                        vreinterpretq_s16_s32(q3x2s32.val[1]));  // q15

    // Store the transposed tile contiguously.
    vst1q_s16(t_buf, q0x2s16.val[0]);
    t_buf += 8;
    vst1q_s16(t_buf, q0x2s16.val[1]);
    t_buf += 8;
    vst1q_s16(t_buf, q1x2s16.val[0]);
    t_buf += 8;
    vst1q_s16(t_buf, q1x2s16.val[1]);
    t_buf += 8;
    vst1q_s16(t_buf, q2x2s16.val[0]);
    t_buf += 8;
    vst1q_s16(t_buf, q2x2s16.val[1]);
    t_buf += 8;
    vst1q_s16(t_buf, q3x2s16.val[0]);
    t_buf += 8;
    vst1q_s16(t_buf, q3x2s16.val[1]);
    t_buf += 8;
  }
}
261
/* Final stage-7 combination for the first (column) pass: merges the
 * partial bands held in q2..q15 with rows already staged in `out`
 * (32 int16s apart) and writes all 32 rows back to `out`.
 * The STORE_IN_OUTPUT/LOAD_FROM_OUTPUT `prev` arguments are unused
 * scheduling hints from the assembly original. */
static INLINE void idct32_bands_end_1st_pass(
    int16_t *out,
    int16x8_t q2s16,
    int16x8_t q3s16,
    int16x8_t q6s16,
    int16x8_t q7s16,
    int16x8_t q8s16,
    int16x8_t q9s16,
    int16x8_t q10s16,
    int16x8_t q11s16,
    int16x8_t q12s16,
    int16x8_t q13s16,
    int16x8_t q14s16,
    int16x8_t q15s16) {
  int16x8_t q0s16, q1s16, q4s16, q5s16;

  STORE_IN_OUTPUT(17, 16, 17, q6s16, q7s16);
  STORE_IN_OUTPUT(17, 14, 15, q8s16, q9s16);

  // rows 0/1 and 30/31 from bands 30/31.
  LOAD_FROM_OUTPUT(15, 30, 31, q0s16, q1s16);
  q4s16 = vaddq_s16(q2s16, q1s16);
  q5s16 = vaddq_s16(q3s16, q0s16);
  q6s16 = vsubq_s16(q3s16, q0s16);
  q7s16 = vsubq_s16(q2s16, q1s16);
  STORE_IN_OUTPUT(31, 30, 31, q6s16, q7s16);
  STORE_IN_OUTPUT(31, 0, 1, q4s16, q5s16);

  // rows 12/13 and 18/19.
  LOAD_FROM_OUTPUT(1, 12, 13, q0s16, q1s16);
  q2s16 = vaddq_s16(q10s16, q1s16);
  q3s16 = vaddq_s16(q11s16, q0s16);
  q4s16 = vsubq_s16(q11s16, q0s16);
  q5s16 = vsubq_s16(q10s16, q1s16);

  LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16);
  q8s16 = vaddq_s16(q4s16, q1s16);
  q9s16 = vaddq_s16(q5s16, q0s16);
  q6s16 = vsubq_s16(q5s16, q0s16);
  q7s16 = vsubq_s16(q4s16, q1s16);
  STORE_IN_OUTPUT(19, 18, 19, q6s16, q7s16);
  STORE_IN_OUTPUT(19, 12, 13, q8s16, q9s16);

  // rows 2/3 and 28/29.
  LOAD_FROM_OUTPUT(13, 28, 29, q0s16, q1s16);
  q4s16 = vaddq_s16(q2s16, q1s16);
  q5s16 = vaddq_s16(q3s16, q0s16);
  q6s16 = vsubq_s16(q3s16, q0s16);
  q7s16 = vsubq_s16(q2s16, q1s16);
  STORE_IN_OUTPUT(29, 28, 29, q6s16, q7s16);
  STORE_IN_OUTPUT(29, 2, 3, q4s16, q5s16);

  // rows 10/11 and 20/21.
  LOAD_FROM_OUTPUT(3, 10, 11, q0s16, q1s16);
  q2s16 = vaddq_s16(q12s16, q1s16);
  q3s16 = vaddq_s16(q13s16, q0s16);
  q4s16 = vsubq_s16(q13s16, q0s16);
  q5s16 = vsubq_s16(q12s16, q1s16);

  LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16);
  q8s16 = vaddq_s16(q4s16, q1s16);
  q9s16 = vaddq_s16(q5s16, q0s16);
  q6s16 = vsubq_s16(q5s16, q0s16);
  q7s16 = vsubq_s16(q4s16, q1s16);
  STORE_IN_OUTPUT(21, 20, 21, q6s16, q7s16);
  STORE_IN_OUTPUT(21, 10, 11, q8s16, q9s16);

  // rows 4/5 and 26/27.
  LOAD_FROM_OUTPUT(11, 26, 27, q0s16, q1s16);
  q4s16 = vaddq_s16(q2s16, q1s16);
  q5s16 = vaddq_s16(q3s16, q0s16);
  q6s16 = vsubq_s16(q3s16, q0s16);
  q7s16 = vsubq_s16(q2s16, q1s16);
  STORE_IN_OUTPUT(27, 26, 27, q6s16, q7s16);
  STORE_IN_OUTPUT(27, 4, 5, q4s16, q5s16);

  // rows 8/9 and 22/23.
  LOAD_FROM_OUTPUT(5, 8, 9, q0s16, q1s16);
  q2s16 = vaddq_s16(q14s16, q1s16);
  q3s16 = vaddq_s16(q15s16, q0s16);
  q4s16 = vsubq_s16(q15s16, q0s16);
  q5s16 = vsubq_s16(q14s16, q1s16);

  LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16);
  q8s16 = vaddq_s16(q4s16, q1s16);
  q9s16 = vaddq_s16(q5s16, q0s16);
  q6s16 = vsubq_s16(q5s16, q0s16);
  q7s16 = vsubq_s16(q4s16, q1s16);
  STORE_IN_OUTPUT(23, 22, 23, q6s16, q7s16);
  STORE_IN_OUTPUT(23, 8, 9, q8s16, q9s16);

  // rows 6/7 and 24/25.
  LOAD_FROM_OUTPUT(9, 24, 25, q0s16, q1s16);
  q4s16 = vaddq_s16(q2s16, q1s16);
  q5s16 = vaddq_s16(q3s16, q0s16);
  q6s16 = vsubq_s16(q3s16, q0s16);
  q7s16 = vsubq_s16(q2s16, q1s16);
  STORE_IN_OUTPUT(25, 24, 25, q6s16, q7s16);
  STORE_IN_OUTPUT(25, 6, 7, q4s16, q5s16);
}
356
/* Final stage-7 combination for the second (row) pass: merges the partial
 * bands with the rows staged in `out`, then adds each result directly to
 * the prediction in `dest` via the STORE_COMBINE_* helpers.
 * r7/r6 track the extreme rows (top-down / bottom-up) and r10/r9 the
 * center rows (down from 16 / up from 15); each helper consumes two rows
 * from each pointer, so the pointers advance by 2*stride between calls. */
static INLINE void idct32_bands_end_2nd_pass(
    int16_t *out,
    uint8_t *dest,
    int stride,
    int16x8_t q2s16,
    int16x8_t q3s16,
    int16x8_t q6s16,
    int16x8_t q7s16,
    int16x8_t q8s16,
    int16x8_t q9s16,
    int16x8_t q10s16,
    int16x8_t q11s16,
    int16x8_t q12s16,
    int16x8_t q13s16,
    int16x8_t q14s16,
    int16x8_t q15s16) {
  uint8_t *r6 = dest + 31 * stride;   // bottom extreme row, moving up
  uint8_t *r7 = dest /* + 0 * stride */;  // top extreme row, moving down
  uint8_t *r9 = dest + 15 * stride;   // upper center row, moving up
  uint8_t *r10 = dest + 16 * stride;  // lower center row, moving down
  int str2 = stride << 1;             // two rows consumed per combine
  int16x8_t q0s16, q1s16, q4s16, q5s16;

  STORE_COMBINE_CENTER_RESULTS(r10, r9);
  r10 += str2; r9 -= str2;

  LOAD_FROM_OUTPUT(17, 30, 31, q0s16, q1s16)
  q4s16 = vaddq_s16(q2s16, q1s16);
  q5s16 = vaddq_s16(q3s16, q0s16);
  q6s16 = vsubq_s16(q3s16, q0s16);
  q7s16 = vsubq_s16(q2s16, q1s16);
  STORE_COMBINE_EXTREME_RESULTS(r7, r6);
  r7 += str2; r6 -= str2;

  LOAD_FROM_OUTPUT(31, 12, 13, q0s16, q1s16)
  q2s16 = vaddq_s16(q10s16, q1s16);
  q3s16 = vaddq_s16(q11s16, q0s16);
  q4s16 = vsubq_s16(q11s16, q0s16);
  q5s16 = vsubq_s16(q10s16, q1s16);

  LOAD_FROM_OUTPUT(13, 18, 19, q0s16, q1s16)
  q8s16 = vaddq_s16(q4s16, q1s16);
  q9s16 = vaddq_s16(q5s16, q0s16);
  q6s16 = vsubq_s16(q5s16, q0s16);
  q7s16 = vsubq_s16(q4s16, q1s16);
  STORE_COMBINE_CENTER_RESULTS(r10, r9);
  r10 += str2; r9 -= str2;

  LOAD_FROM_OUTPUT(19, 28, 29, q0s16, q1s16)
  q4s16 = vaddq_s16(q2s16, q1s16);
  q5s16 = vaddq_s16(q3s16, q0s16);
  q6s16 = vsubq_s16(q3s16, q0s16);
  q7s16 = vsubq_s16(q2s16, q1s16);
  STORE_COMBINE_EXTREME_RESULTS(r7, r6);
  r7 += str2; r6 -= str2;

  LOAD_FROM_OUTPUT(29, 10, 11, q0s16, q1s16)
  q2s16 = vaddq_s16(q12s16, q1s16);
  q3s16 = vaddq_s16(q13s16, q0s16);
  q4s16 = vsubq_s16(q13s16, q0s16);
  q5s16 = vsubq_s16(q12s16, q1s16);

  LOAD_FROM_OUTPUT(11, 20, 21, q0s16, q1s16)
  q8s16 = vaddq_s16(q4s16, q1s16);
  q9s16 = vaddq_s16(q5s16, q0s16);
  q6s16 = vsubq_s16(q5s16, q0s16);
  q7s16 = vsubq_s16(q4s16, q1s16);
  STORE_COMBINE_CENTER_RESULTS(r10, r9);
  r10 += str2; r9 -= str2;

  LOAD_FROM_OUTPUT(21, 26, 27, q0s16, q1s16)
  q4s16 = vaddq_s16(q2s16, q1s16);
  q5s16 = vaddq_s16(q3s16, q0s16);
  q6s16 = vsubq_s16(q3s16, q0s16);
  q7s16 = vsubq_s16(q2s16, q1s16);
  STORE_COMBINE_EXTREME_RESULTS(r7, r6);
  r7 += str2; r6 -= str2;

  LOAD_FROM_OUTPUT(27, 8, 9, q0s16, q1s16)
  q2s16 = vaddq_s16(q14s16, q1s16);
  q3s16 = vaddq_s16(q15s16, q0s16);
  q4s16 = vsubq_s16(q15s16, q0s16);
  q5s16 = vsubq_s16(q14s16, q1s16);

  LOAD_FROM_OUTPUT(9, 22, 23, q0s16, q1s16)
  q8s16 = vaddq_s16(q4s16, q1s16);
  q9s16 = vaddq_s16(q5s16, q0s16);
  q6s16 = vsubq_s16(q5s16, q0s16);
  q7s16 = vsubq_s16(q4s16, q1s16);
  STORE_COMBINE_CENTER_RESULTS(r10, r9);

  LOAD_FROM_OUTPUT(23, 24, 25, q0s16, q1s16)
  q4s16 = vaddq_s16(q2s16, q1s16);
  q5s16 = vaddq_s16(q3s16, q0s16);
  q6s16 = vsubq_s16(q3s16, q0s16);
  q7s16 = vsubq_s16(q2s16, q1s16);
  STORE_COMBINE_EXTREME_RESULTS(r7, r6);
}
456
/* Full 32x32 inverse DCT: applies the 32-point transform to columns
 * (pass 1) and then rows (pass 2), and adds the reconstructed residual
 * to the prediction in `dest` (row pitch `stride` bytes).
 *
 * Each pass processes the 32x32 block in four 32x8 slices: the slice is
 * transposed into trans_buf, the 32-point butterflies run in four blocks
 * (A: outputs 16-19/28-31, B: 20-27, C: 8-15, D: 0-7), and the final
 * recombination goes to pass1 (first pass) or straight into dest
 * (second pass).  Rotation constants cospi_*_64 come from txfm_common.h
 * and all rounding uses DCT_CONST_BITS == 14 via DO_BUTTERFLY. */
void vpx_idct32x32_1024_add_neon(
    int16_t *input,
    uint8_t *dest,
    int stride) {
  int i, idct32_pass_loop;
  int16_t trans_buf[32 * 8];  // one transposed 32x8 slice
  int16_t pass1[32 * 32];     // column-transform result
  int16_t pass2[32 * 32];     // row-transform staging (second pass)
  int16_t *out;
  int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;

  for (idct32_pass_loop = 0, out = pass1;
       idct32_pass_loop < 2;
       idct32_pass_loop++,
       input = pass1,  // the input of pass2 is the result of pass1
       out = pass2) {
    for (i = 0;
         i < 4; i++,
         input += 32 * 8, out += 8) {  // idct32_bands_loop
      idct32_transpose_pair(input, trans_buf);

      // -----------------------------------------
      // BLOCK A: 16-19,28-31
      // -----------------------------------------
      // generate 16,17,30,31
      // part of stage 1
      LOAD_FROM_TRANSPOSED(0, 1, 31)
      DO_BUTTERFLY_STD(cospi_31_64, cospi_1_64, &q0s16, &q2s16)
      LOAD_FROM_TRANSPOSED(31, 17, 15)
      DO_BUTTERFLY_STD(cospi_15_64, cospi_17_64, &q1s16, &q3s16)
      // part of stage 2
      q4s16 = vaddq_s16(q0s16, q1s16);
      q13s16 = vsubq_s16(q0s16, q1s16);
      q6s16 = vaddq_s16(q2s16, q3s16);
      q14s16 = vsubq_s16(q2s16, q3s16);
      // part of stage 3
      DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q5s16, &q7s16)

      // generate 18,19,28,29
      // part of stage 1
      LOAD_FROM_TRANSPOSED(15, 9, 23)
      DO_BUTTERFLY_STD(cospi_23_64, cospi_9_64, &q0s16, &q2s16)
      LOAD_FROM_TRANSPOSED(23, 25, 7)
      DO_BUTTERFLY_STD(cospi_7_64, cospi_25_64, &q1s16, &q3s16)
      // part of stage 2
      q13s16 = vsubq_s16(q3s16, q2s16);
      q3s16 = vaddq_s16(q3s16, q2s16);
      q14s16 = vsubq_s16(q1s16, q0s16);
      q2s16 = vaddq_s16(q1s16, q0s16);
      // part of stage 3
      DO_BUTTERFLY_STD(-cospi_4_64, -cospi_28_64, &q1s16, &q0s16)
      // part of stage 4
      q8s16 = vaddq_s16(q4s16, q2s16);
      q9s16 = vaddq_s16(q5s16, q0s16);
      q10s16 = vaddq_s16(q7s16, q1s16);
      q15s16 = vaddq_s16(q6s16, q3s16);
      q13s16 = vsubq_s16(q5s16, q0s16);
      q14s16 = vsubq_s16(q7s16, q1s16);
      STORE_IN_OUTPUT(0, 16, 31, q8s16, q15s16)
      STORE_IN_OUTPUT(31, 17, 30, q9s16, q10s16)
      // part of stage 5
      DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q0s16, &q1s16)
      STORE_IN_OUTPUT(30, 29, 18, q1s16, q0s16)
      // part of stage 4
      q13s16 = vsubq_s16(q4s16, q2s16);
      q14s16 = vsubq_s16(q6s16, q3s16);
      // part of stage 5
      DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q4s16, &q6s16)
      STORE_IN_OUTPUT(18, 19, 28, q4s16, q6s16)

      // -----------------------------------------
      // BLOCK B: 20-23,24-27
      // -----------------------------------------
      // generate 20,21,26,27
      // part of stage 1
      LOAD_FROM_TRANSPOSED(7, 5, 27)
      DO_BUTTERFLY_STD(cospi_27_64, cospi_5_64, &q0s16, &q2s16)
      LOAD_FROM_TRANSPOSED(27, 21, 11)
      DO_BUTTERFLY_STD(cospi_11_64, cospi_21_64, &q1s16, &q3s16)
      // part of stage 2
      q13s16 = vsubq_s16(q0s16, q1s16);
      q0s16 = vaddq_s16(q0s16, q1s16);
      q14s16 = vsubq_s16(q2s16, q3s16);
      q2s16 = vaddq_s16(q2s16, q3s16);
      // part of stage 3
      DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16)

      // generate 22,23,24,25
      // part of stage 1
      LOAD_FROM_TRANSPOSED(11, 13, 19)
      DO_BUTTERFLY_STD(cospi_19_64, cospi_13_64, &q5s16, &q7s16)
      LOAD_FROM_TRANSPOSED(19, 29, 3)
      DO_BUTTERFLY_STD(cospi_3_64, cospi_29_64, &q4s16, &q6s16)
      // part of stage 2
      q14s16 = vsubq_s16(q4s16, q5s16);
      q5s16 = vaddq_s16(q4s16, q5s16);
      q13s16 = vsubq_s16(q6s16, q7s16);
      q6s16 = vaddq_s16(q6s16, q7s16);
      // part of stage 3
      DO_BUTTERFLY_STD(-cospi_20_64, -cospi_12_64, &q4s16, &q7s16)
      // part of stage 4
      q10s16 = vaddq_s16(q7s16, q1s16);
      q11s16 = vaddq_s16(q5s16, q0s16);
      q12s16 = vaddq_s16(q6s16, q2s16);
      q15s16 = vaddq_s16(q4s16, q3s16);
      // part of stage 6
      LOAD_FROM_OUTPUT(28, 16, 17, q14s16, q13s16)
      q8s16 = vaddq_s16(q14s16, q11s16);
      q9s16 = vaddq_s16(q13s16, q10s16);
      q13s16 = vsubq_s16(q13s16, q10s16);
      q11s16 = vsubq_s16(q14s16, q11s16);
      STORE_IN_OUTPUT(17, 17, 16, q9s16, q8s16)
      LOAD_FROM_OUTPUT(16, 30, 31, q14s16, q9s16)
      q8s16 = vsubq_s16(q9s16, q12s16);
      q10s16 = vaddq_s16(q14s16, q15s16);
      q14s16 = vsubq_s16(q14s16, q15s16);
      q12s16 = vaddq_s16(q9s16, q12s16);
      STORE_IN_OUTPUT(31, 30, 31, q10s16, q12s16)
      // part of stage 7
      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
      STORE_IN_OUTPUT(31, 25, 22, q14s16, q13s16)
      q13s16 = q11s16;
      q14s16 = q8s16;
      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
      STORE_IN_OUTPUT(22, 24, 23, q14s16, q13s16)
      // part of stage 4
      q14s16 = vsubq_s16(q5s16, q0s16);
      q13s16 = vsubq_s16(q6s16, q2s16);
      DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q5s16, &q6s16);
      q14s16 = vsubq_s16(q7s16, q1s16);
      q13s16 = vsubq_s16(q4s16, q3s16);
      DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q0s16, &q1s16);
      // part of stage 6
      LOAD_FROM_OUTPUT(23, 18, 19, q14s16, q13s16)
      q8s16 = vaddq_s16(q14s16, q1s16);
      q9s16 = vaddq_s16(q13s16, q6s16);
      q13s16 = vsubq_s16(q13s16, q6s16);
      q1s16 = vsubq_s16(q14s16, q1s16);
      STORE_IN_OUTPUT(19, 18, 19, q8s16, q9s16)
      LOAD_FROM_OUTPUT(19, 28, 29, q8s16, q9s16)
      q14s16 = vsubq_s16(q8s16, q5s16);
      q10s16 = vaddq_s16(q8s16, q5s16);
      q11s16 = vaddq_s16(q9s16, q0s16);
      q0s16 = vsubq_s16(q9s16, q0s16);
      STORE_IN_OUTPUT(29, 28, 29, q10s16, q11s16)
      // part of stage 7
      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q13s16, &q14s16)
      STORE_IN_OUTPUT(29, 20, 27, q13s16, q14s16)
      DO_BUTTERFLY(q0s16, q1s16, cospi_16_64, cospi_16_64,
                   &q1s16, &q0s16);
      STORE_IN_OUTPUT(27, 21, 26, q1s16, q0s16)

      // -----------------------------------------
      // BLOCK C: 8-10,11-15
      // -----------------------------------------
      // generate 8,9,14,15
      // part of stage 2
      LOAD_FROM_TRANSPOSED(3, 2, 30)
      DO_BUTTERFLY_STD(cospi_30_64, cospi_2_64, &q0s16, &q2s16)
      LOAD_FROM_TRANSPOSED(30, 18, 14)
      DO_BUTTERFLY_STD(cospi_14_64, cospi_18_64, &q1s16, &q3s16)
      // part of stage 3
      q13s16 = vsubq_s16(q0s16, q1s16);
      q0s16 = vaddq_s16(q0s16, q1s16);
      q14s16 = vsubq_s16(q2s16, q3s16);
      q2s16 = vaddq_s16(q2s16, q3s16);
      // part of stage 4
      DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q1s16, &q3s16)

      // generate 10,11,12,13
      // part of stage 2
      LOAD_FROM_TRANSPOSED(14, 10, 22)
      DO_BUTTERFLY_STD(cospi_22_64, cospi_10_64, &q5s16, &q7s16)
      LOAD_FROM_TRANSPOSED(22, 26, 6)
      DO_BUTTERFLY_STD(cospi_6_64, cospi_26_64, &q4s16, &q6s16)
      // part of stage 3
      q14s16 = vsubq_s16(q4s16, q5s16);
      q5s16 = vaddq_s16(q4s16, q5s16);
      q13s16 = vsubq_s16(q6s16, q7s16);
      q6s16 = vaddq_s16(q6s16, q7s16);
      // part of stage 4
      DO_BUTTERFLY_STD(-cospi_8_64, -cospi_24_64, &q4s16, &q7s16)
      // part of stage 5
      q8s16 = vaddq_s16(q0s16, q5s16);
      q9s16 = vaddq_s16(q1s16, q7s16);
      q13s16 = vsubq_s16(q1s16, q7s16);
      q14s16 = vsubq_s16(q3s16, q4s16);
      q10s16 = vaddq_s16(q3s16, q4s16);
      q15s16 = vaddq_s16(q2s16, q6s16);
      STORE_IN_OUTPUT(26, 8, 15, q8s16, q15s16)
      STORE_IN_OUTPUT(15, 9, 14, q9s16, q10s16)
      // part of stage 6
      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
      STORE_IN_OUTPUT(14, 13, 10, q3s16, q1s16)
      q13s16 = vsubq_s16(q0s16, q5s16);
      q14s16 = vsubq_s16(q2s16, q6s16);
      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)
      STORE_IN_OUTPUT(10, 11, 12, q1s16, q3s16)

      // -----------------------------------------
      // BLOCK D: 0-3,4-7
      // -----------------------------------------
      // generate 4,5,6,7
      // part of stage 3
      LOAD_FROM_TRANSPOSED(6, 4, 28)
      DO_BUTTERFLY_STD(cospi_28_64, cospi_4_64, &q0s16, &q2s16)
      LOAD_FROM_TRANSPOSED(28, 20, 12)
      DO_BUTTERFLY_STD(cospi_12_64, cospi_20_64, &q1s16, &q3s16)
      // part of stage 4
      q13s16 = vsubq_s16(q0s16, q1s16);
      q0s16 = vaddq_s16(q0s16, q1s16);
      q14s16 = vsubq_s16(q2s16, q3s16);
      q2s16 = vaddq_s16(q2s16, q3s16);
      // part of stage 5
      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q1s16, &q3s16)

      // generate 0,1,2,3
      // part of stage 4
      LOAD_FROM_TRANSPOSED(12, 0, 16)
      DO_BUTTERFLY_STD(cospi_16_64, cospi_16_64, &q5s16, &q7s16)
      LOAD_FROM_TRANSPOSED(16, 8, 24)
      DO_BUTTERFLY_STD(cospi_24_64, cospi_8_64, &q14s16, &q6s16)
      // part of stage 5
      q4s16 = vaddq_s16(q7s16, q6s16);
      q7s16 = vsubq_s16(q7s16, q6s16);
      q6s16 = vsubq_s16(q5s16, q14s16);
      q5s16 = vaddq_s16(q5s16, q14s16);
      // part of stage 6
      q8s16 = vaddq_s16(q4s16, q2s16);
      q9s16 = vaddq_s16(q5s16, q3s16);
      q10s16 = vaddq_s16(q6s16, q1s16);
      q11s16 = vaddq_s16(q7s16, q0s16);
      q12s16 = vsubq_s16(q7s16, q0s16);
      q13s16 = vsubq_s16(q6s16, q1s16);
      q14s16 = vsubq_s16(q5s16, q3s16);
      q15s16 = vsubq_s16(q4s16, q2s16);
      // part of stage 7
      LOAD_FROM_OUTPUT(12, 14, 15, q0s16, q1s16)
      q2s16 = vaddq_s16(q8s16, q1s16);
      q3s16 = vaddq_s16(q9s16, q0s16);
      q4s16 = vsubq_s16(q9s16, q0s16);
      q5s16 = vsubq_s16(q8s16, q1s16);
      LOAD_FROM_OUTPUT(15, 16, 17, q0s16, q1s16)
      q8s16 = vaddq_s16(q4s16, q1s16);
      q9s16 = vaddq_s16(q5s16, q0s16);
      q6s16 = vsubq_s16(q5s16, q0s16);
      q7s16 = vsubq_s16(q4s16, q1s16);

      if (idct32_pass_loop == 0) {
        // Pass 1: stage the column-transform result in pass1.
        idct32_bands_end_1st_pass(out,
                                  q2s16, q3s16, q6s16, q7s16, q8s16, q9s16,
                                  q10s16, q11s16, q12s16, q13s16, q14s16,
                                  q15s16);
      } else {
        // Pass 2: combine with the prediction and write to dest.
        idct32_bands_end_2nd_pass(out, dest, stride,
                                  q2s16, q3s16, q6s16, q7s16, q8s16, q9s16,
                                  q10s16, q11s16, q12s16, q13s16, q14s16,
                                  q15s16);
        dest += 8;  // next 8-pixel-wide column group
      }
    }
  }
}
720