/*
 * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include <assert.h>

#include "./vp9_rtcd.h"
#include "./vpx_config.h"
#include "vp9/common/vp9_common.h"

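// Cosine constants: round(cos(k * pi / 64) * 2^14), i.e. Q14 fixed-point
// values used by the inverse DCT/ADST butterflies below.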
static int16_t cospi_2_64 = 16305;
static int16_t cospi_4_64 = 16069;
static int16_t cospi_6_64 = 15679;
static int16_t cospi_8_64 = 15137;
static int16_t cospi_10_64 = 14449;
static int16_t cospi_12_64 = 13623;
static int16_t cospi_14_64 = 12665;
static int16_t cospi_16_64 = 11585;
static int16_t cospi_18_64 = 10394;
static int16_t cospi_20_64 = 9102;
static int16_t cospi_22_64 = 7723;
static int16_t cospi_24_64 = 6270;
static int16_t cospi_26_64 = 4756;
static int16_t cospi_28_64 = 3196;
static int16_t cospi_30_64 = 1606;

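// Transpose an 8x8 block of int16 values held in registers q8-q15.
// The vcombine step swaps the 64-bit halves (mirroring the vswp of the
// original assembly, see the comments below), then the 32-bit and 16-bit
// vtrn passes complete the transpose.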
static INLINE void TRANSPOSE8X8(
    int16x8_t *q8s16,
    int16x8_t *q9s16,
    int16x8_t *q10s16,
    int16x8_t *q11s16,
    int16x8_t *q12s16,
    int16x8_t *q13s16,
    int16x8_t *q14s16,
    int16x8_t *q15s16) {
  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
  int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
  int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32;
  int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16;

  d16s16 = vget_low_s16(*q8s16);
  d17s16 = vget_high_s16(*q8s16);
  d18s16 = vget_low_s16(*q9s16);
  d19s16 = vget_high_s16(*q9s16);
  d20s16 = vget_low_s16(*q10s16);
  d21s16 = vget_high_s16(*q10s16);
  d22s16 = vget_low_s16(*q11s16);
  d23s16 = vget_high_s16(*q11s16);
  d24s16 = vget_low_s16(*q12s16);
  d25s16 = vget_high_s16(*q12s16);
  d26s16 = vget_low_s16(*q13s16);
  d27s16 = vget_high_s16(*q13s16);
  d28s16 = vget_low_s16(*q14s16);
  d29s16 = vget_high_s16(*q14s16);
  d30s16 = vget_low_s16(*q15s16);
  d31s16 = vget_high_s16(*q15s16);

  *q8s16 = vcombine_s16(d16s16, d24s16);  // vswp d17, d24
  *q9s16 = vcombine_s16(d18s16, d26s16);  // vswp d19, d26
  *q10s16 = vcombine_s16(d20s16, d28s16);  // vswp d21, d28
  *q11s16 = vcombine_s16(d22s16, d30s16);  // vswp d23, d30
  *q12s16 = vcombine_s16(d17s16, d25s16);
  *q13s16 = vcombine_s16(d19s16, d27s16);
  *q14s16 = vcombine_s16(d21s16, d29s16);
  *q15s16 = vcombine_s16(d23s16, d31s16);

  q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q8s16),
                      vreinterpretq_s32_s16(*q10s16));
  q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q9s16),
                      vreinterpretq_s32_s16(*q11s16));
  q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q12s16),
                      vreinterpretq_s32_s16(*q14s16));
  q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q13s16),
                      vreinterpretq_s32_s16(*q15s16));

  q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]),   // q8
                      vreinterpretq_s16_s32(q1x2s32.val[0]));  // q9
  q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]),   // q10
                      vreinterpretq_s16_s32(q1x2s32.val[1]));  // q11
  q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]),   // q12
                      vreinterpretq_s16_s32(q3x2s32.val[0]));  // q13
  q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]),   // q14
                      vreinterpretq_s16_s32(q3x2s32.val[1]));  // q15

  *q8s16 = q0x2s16.val[0];
  *q9s16 = q0x2s16.val[1];
  *q10s16 = q1x2s16.val[0];
  *q11s16 = q1x2s16.val[1];
  *q12s16 = q2x2s16.val[0];
  *q13s16 = q2x2s16.val[1];
  *q14s16 = q3x2s16.val[0];
  *q15s16 = q3x2s16.val[1];
  return;
}

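// One 1-D pass of the 8-point inverse DCT over registers q8-q15; the
// eight lanes of each register are transformed in parallel. Products
// are accumulated in 32 bits and narrowed back to 16 bits with vqrshrn
// by 14 (the Q14 constant scale).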
static INLINE void IDCT8x8_1D(
    int16x8_t *q8s16,
    int16x8_t *q9s16,
    int16x8_t *q10s16,
    int16x8_t *q11s16,
    int16x8_t *q12s16,
    int16x8_t *q13s16,
    int16x8_t *q14s16,
    int16x8_t *q15s16) {
  int16x4_t d0s16, d1s16, d2s16, d3s16;
  int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
  int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
  int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
  int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32;
  int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;

  d0s16 = vdup_n_s16(cospi_28_64);
  d1s16 = vdup_n_s16(cospi_4_64);
  d2s16 = vdup_n_s16(cospi_12_64);
  d3s16 = vdup_n_s16(cospi_20_64);

  d16s16 = vget_low_s16(*q8s16);
  d17s16 = vget_high_s16(*q8s16);
  d18s16 = vget_low_s16(*q9s16);
  d19s16 = vget_high_s16(*q9s16);
  d20s16 = vget_low_s16(*q10s16);
  d21s16 = vget_high_s16(*q10s16);
  d22s16 = vget_low_s16(*q11s16);
  d23s16 = vget_high_s16(*q11s16);
  d24s16 = vget_low_s16(*q12s16);
  d25s16 = vget_high_s16(*q12s16);
  d26s16 = vget_low_s16(*q13s16);
  d27s16 = vget_high_s16(*q13s16);
  d28s16 = vget_low_s16(*q14s16);
  d29s16 = vget_high_s16(*q14s16);
  d30s16 = vget_low_s16(*q15s16);
  d31s16 = vget_high_s16(*q15s16);

  q2s32 = vmull_s16(d18s16, d0s16);
  q3s32 = vmull_s16(d19s16, d0s16);
  q5s32 = vmull_s16(d26s16, d2s16);
  q6s32 = vmull_s16(d27s16, d2s16);

  q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
  q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
  q5s32 = vmlsl_s16(q5s32, d22s16, d3s16);
  q6s32 = vmlsl_s16(q6s32, d23s16, d3s16);

  d8s16 = vqrshrn_n_s32(q2s32, 14);
  d9s16 = vqrshrn_n_s32(q3s32, 14);
  d10s16 = vqrshrn_n_s32(q5s32, 14);
  d11s16 = vqrshrn_n_s32(q6s32, 14);
  q4s16 = vcombine_s16(d8s16, d9s16);
  q5s16 = vcombine_s16(d10s16, d11s16);

  q2s32 = vmull_s16(d18s16, d1s16);
  q3s32 = vmull_s16(d19s16, d1s16);
  q9s32 = vmull_s16(d26s16, d3s16);
  q13s32 = vmull_s16(d27s16, d3s16);

  q2s32 = vmlal_s16(q2s32, d30s16, d0s16);
  q3s32 = vmlal_s16(q3s32, d31s16, d0s16);
  q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
  q13s32 = vmlal_s16(q13s32, d23s16, d2s16);

  d14s16 = vqrshrn_n_s32(q2s32, 14);
  d15s16 = vqrshrn_n_s32(q3s32, 14);
  d12s16 = vqrshrn_n_s32(q9s32, 14);
  d13s16 = vqrshrn_n_s32(q13s32, 14);
  q6s16 = vcombine_s16(d12s16, d13s16);
  q7s16 = vcombine_s16(d14s16, d15s16);

  d0s16 = vdup_n_s16(cospi_16_64);

  q2s32 = vmull_s16(d16s16, d0s16);
  q3s32 = vmull_s16(d17s16, d0s16);
  q13s32 = vmull_s16(d16s16, d0s16);
  q15s32 = vmull_s16(d17s16, d0s16);

  q2s32 = vmlal_s16(q2s32, d24s16, d0s16);
  q3s32 = vmlal_s16(q3s32, d25s16, d0s16);
  q13s32 = vmlsl_s16(q13s32, d24s16, d0s16);
  q15s32 = vmlsl_s16(q15s32, d25s16, d0s16);

  d0s16 = vdup_n_s16(cospi_24_64);
  d1s16 = vdup_n_s16(cospi_8_64);

  d18s16 = vqrshrn_n_s32(q2s32, 14);
  d19s16 = vqrshrn_n_s32(q3s32, 14);
  d22s16 = vqrshrn_n_s32(q13s32, 14);
  d23s16 = vqrshrn_n_s32(q15s32, 14);
  *q9s16 = vcombine_s16(d18s16, d19s16);
  *q11s16 = vcombine_s16(d22s16, d23s16);

  q2s32 = vmull_s16(d20s16, d0s16);
  q3s32 = vmull_s16(d21s16, d0s16);
  q8s32 = vmull_s16(d20s16, d1s16);
  q12s32 = vmull_s16(d21s16, d1s16);

  q2s32 = vmlsl_s16(q2s32, d28s16, d1s16);
  q3s32 = vmlsl_s16(q3s32, d29s16, d1s16);
  q8s32 = vmlal_s16(q8s32, d28s16, d0s16);
  q12s32 = vmlal_s16(q12s32, d29s16, d0s16);

  d26s16 = vqrshrn_n_s32(q2s32, 14);
  d27s16 = vqrshrn_n_s32(q3s32, 14);
  d30s16 = vqrshrn_n_s32(q8s32, 14);
  d31s16 = vqrshrn_n_s32(q12s32, 14);
  *q13s16 = vcombine_s16(d26s16, d27s16);
  *q15s16 = vcombine_s16(d30s16, d31s16);

  q0s16 = vaddq_s16(*q9s16, *q15s16);
  q1s16 = vaddq_s16(*q11s16, *q13s16);
  q2s16 = vsubq_s16(*q11s16, *q13s16);
  q3s16 = vsubq_s16(*q9s16, *q15s16);

  *q13s16 = vsubq_s16(q4s16, q5s16);
  q4s16 = vaddq_s16(q4s16, q5s16);
  *q14s16 = vsubq_s16(q7s16, q6s16);
  q7s16 = vaddq_s16(q7s16, q6s16);
  d26s16 = vget_low_s16(*q13s16);
  d27s16 = vget_high_s16(*q13s16);
  d28s16 = vget_low_s16(*q14s16);
  d29s16 = vget_high_s16(*q14s16);

  d16s16 = vdup_n_s16(cospi_16_64);

  q9s32 = vmull_s16(d28s16, d16s16);
  q10s32 = vmull_s16(d29s16, d16s16);
  q11s32 = vmull_s16(d28s16, d16s16);
  q12s32 = vmull_s16(d29s16, d16s16);

  q9s32 = vmlsl_s16(q9s32, d26s16, d16s16);
  q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
  q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
  q12s32 = vmlal_s16(q12s32, d27s16, d16s16);

  d10s16 = vqrshrn_n_s32(q9s32, 14);
  d11s16 = vqrshrn_n_s32(q10s32, 14);
  d12s16 = vqrshrn_n_s32(q11s32, 14);
  d13s16 = vqrshrn_n_s32(q12s32, 14);
  q5s16 = vcombine_s16(d10s16, d11s16);
  q6s16 = vcombine_s16(d12s16, d13s16);

  *q8s16 = vaddq_s16(q0s16, q7s16);
  *q9s16 = vaddq_s16(q1s16, q6s16);
  *q10s16 = vaddq_s16(q2s16, q5s16);
  *q11s16 = vaddq_s16(q3s16, q4s16);
  *q12s16 = vsubq_s16(q3s16, q4s16);
  *q13s16 = vsubq_s16(q2s16, q5s16);
  *q14s16 = vsubq_s16(q1s16, q6s16);
  *q15s16 = vsubq_s16(q0s16, q7s16);
  return;
}

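// One 1-D pass of the 8-point inverse ADST, with the same register
// layout and Q14 rounding as IDCT8x8_1D.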
static INLINE void IADST8X8_1D(
    int16x8_t *q8s16,
    int16x8_t *q9s16,
    int16x8_t *q10s16,
    int16x8_t *q11s16,
    int16x8_t *q12s16,
    int16x8_t *q13s16,
    int16x8_t *q14s16,
    int16x8_t *q15s16) {
  int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
  int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
  int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
  int16x8_t q2s16, q4s16, q5s16, q6s16;
  int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q7s32, q8s32;
  int32x4_t q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32;

  d16s16 = vget_low_s16(*q8s16);
  d17s16 = vget_high_s16(*q8s16);
  d18s16 = vget_low_s16(*q9s16);
  d19s16 = vget_high_s16(*q9s16);
  d20s16 = vget_low_s16(*q10s16);
  d21s16 = vget_high_s16(*q10s16);
  d22s16 = vget_low_s16(*q11s16);
  d23s16 = vget_high_s16(*q11s16);
  d24s16 = vget_low_s16(*q12s16);
  d25s16 = vget_high_s16(*q12s16);
  d26s16 = vget_low_s16(*q13s16);
  d27s16 = vget_high_s16(*q13s16);
  d28s16 = vget_low_s16(*q14s16);
  d29s16 = vget_high_s16(*q14s16);
  d30s16 = vget_low_s16(*q15s16);
  d31s16 = vget_high_s16(*q15s16);

  d14s16 = vdup_n_s16(cospi_2_64);
  d15s16 = vdup_n_s16(cospi_30_64);

  q1s32 = vmull_s16(d30s16, d14s16);
  q2s32 = vmull_s16(d31s16, d14s16);
  q3s32 = vmull_s16(d30s16, d15s16);
  q4s32 = vmull_s16(d31s16, d15s16);

  d30s16 = vdup_n_s16(cospi_18_64);
  d31s16 = vdup_n_s16(cospi_14_64);

  q1s32 = vmlal_s16(q1s32, d16s16, d15s16);
  q2s32 = vmlal_s16(q2s32, d17s16, d15s16);
  q3s32 = vmlsl_s16(q3s32, d16s16, d14s16);
  q4s32 = vmlsl_s16(q4s32, d17s16, d14s16);

  q5s32 = vmull_s16(d22s16, d30s16);
  q6s32 = vmull_s16(d23s16, d30s16);
  q7s32 = vmull_s16(d22s16, d31s16);
  q8s32 = vmull_s16(d23s16, d31s16);

  q5s32 = vmlal_s16(q5s32, d24s16, d31s16);
  q6s32 = vmlal_s16(q6s32, d25s16, d31s16);
  q7s32 = vmlsl_s16(q7s32, d24s16, d30s16);
  q8s32 = vmlsl_s16(q8s32, d25s16, d30s16);

  q11s32 = vaddq_s32(q1s32, q5s32);
  q12s32 = vaddq_s32(q2s32, q6s32);
  q1s32 = vsubq_s32(q1s32, q5s32);
  q2s32 = vsubq_s32(q2s32, q6s32);

  d22s16 = vqrshrn_n_s32(q11s32, 14);
  d23s16 = vqrshrn_n_s32(q12s32, 14);
  *q11s16 = vcombine_s16(d22s16, d23s16);

  q12s32 = vaddq_s32(q3s32, q7s32);
  q15s32 = vaddq_s32(q4s32, q8s32);
  q3s32 = vsubq_s32(q3s32, q7s32);
  q4s32 = vsubq_s32(q4s32, q8s32);

  d2s16 = vqrshrn_n_s32(q1s32, 14);
  d3s16 = vqrshrn_n_s32(q2s32, 14);
  d24s16 = vqrshrn_n_s32(q12s32, 14);
  d25s16 = vqrshrn_n_s32(q15s32, 14);
  d6s16 = vqrshrn_n_s32(q3s32, 14);
  d7s16 = vqrshrn_n_s32(q4s32, 14);
  *q12s16 = vcombine_s16(d24s16, d25s16);

  d0s16 = vdup_n_s16(cospi_10_64);
  d1s16 = vdup_n_s16(cospi_22_64);
  q4s32 = vmull_s16(d26s16, d0s16);
  q5s32 = vmull_s16(d27s16, d0s16);
  q2s32 = vmull_s16(d26s16, d1s16);
  q6s32 = vmull_s16(d27s16, d1s16);

  d30s16 = vdup_n_s16(cospi_26_64);
  d31s16 = vdup_n_s16(cospi_6_64);

  q4s32 = vmlal_s16(q4s32, d20s16, d1s16);
  q5s32 = vmlal_s16(q5s32, d21s16, d1s16);
  q2s32 = vmlsl_s16(q2s32, d20s16, d0s16);
  q6s32 = vmlsl_s16(q6s32, d21s16, d0s16);

  q0s32 = vmull_s16(d18s16, d30s16);
  q13s32 = vmull_s16(d19s16, d30s16);

  q0s32 = vmlal_s16(q0s32, d28s16, d31s16);
  q13s32 = vmlal_s16(q13s32, d29s16, d31s16);

  q10s32 = vmull_s16(d18s16, d31s16);
  q9s32 = vmull_s16(d19s16, d31s16);

  q10s32 = vmlsl_s16(q10s32, d28s16, d30s16);
  q9s32 = vmlsl_s16(q9s32, d29s16, d30s16);

  q14s32 = vaddq_s32(q2s32, q10s32);
  q15s32 = vaddq_s32(q6s32, q9s32);
  q2s32 = vsubq_s32(q2s32, q10s32);
  q6s32 = vsubq_s32(q6s32, q9s32);

  d28s16 = vqrshrn_n_s32(q14s32, 14);
  d29s16 = vqrshrn_n_s32(q15s32, 14);
  d4s16 = vqrshrn_n_s32(q2s32, 14);
  d5s16 = vqrshrn_n_s32(q6s32, 14);
  *q14s16 = vcombine_s16(d28s16, d29s16);

  q9s32 = vaddq_s32(q4s32, q0s32);
  q10s32 = vaddq_s32(q5s32, q13s32);
  q4s32 = vsubq_s32(q4s32, q0s32);
  q5s32 = vsubq_s32(q5s32, q13s32);

  d30s16 = vdup_n_s16(cospi_8_64);
  d31s16 = vdup_n_s16(cospi_24_64);

  d18s16 = vqrshrn_n_s32(q9s32, 14);
  d19s16 = vqrshrn_n_s32(q10s32, 14);
  d8s16 = vqrshrn_n_s32(q4s32, 14);
  d9s16 = vqrshrn_n_s32(q5s32, 14);
  *q9s16 = vcombine_s16(d18s16, d19s16);

  q5s32 = vmull_s16(d2s16, d30s16);
  q6s32 = vmull_s16(d3s16, d30s16);
  q7s32 = vmull_s16(d2s16, d31s16);
  q0s32 = vmull_s16(d3s16, d31s16);

  q5s32 = vmlal_s16(q5s32, d6s16, d31s16);
  q6s32 = vmlal_s16(q6s32, d7s16, d31s16);
  q7s32 = vmlsl_s16(q7s32, d6s16, d30s16);
  q0s32 = vmlsl_s16(q0s32, d7s16, d30s16);

  q1s32 = vmull_s16(d4s16, d30s16);
  q3s32 = vmull_s16(d5s16, d30s16);
  q10s32 = vmull_s16(d4s16, d31s16);
  q2s32 = vmull_s16(d5s16, d31s16);

  q1s32 = vmlsl_s16(q1s32, d8s16, d31s16);
  q3s32 = vmlsl_s16(q3s32, d9s16, d31s16);
  q10s32 = vmlal_s16(q10s32, d8s16, d30s16);
  q2s32 = vmlal_s16(q2s32, d9s16, d30s16);

  *q8s16 = vaddq_s16(*q11s16, *q9s16);
  *q11s16 = vsubq_s16(*q11s16, *q9s16);
  q4s16 = vaddq_s16(*q12s16, *q14s16);
  *q12s16 = vsubq_s16(*q12s16, *q14s16);

  q14s32 = vaddq_s32(q5s32, q1s32);
  q15s32 = vaddq_s32(q6s32, q3s32);
  q5s32 = vsubq_s32(q5s32, q1s32);
  q6s32 = vsubq_s32(q6s32, q3s32);

  d18s16 = vqrshrn_n_s32(q14s32, 14);
  d19s16 = vqrshrn_n_s32(q15s32, 14);
  d10s16 = vqrshrn_n_s32(q5s32, 14);
  d11s16 = vqrshrn_n_s32(q6s32, 14);
  *q9s16 = vcombine_s16(d18s16, d19s16);

  q1s32 = vaddq_s32(q7s32, q10s32);
  q3s32 = vaddq_s32(q0s32, q2s32);
  q7s32 = vsubq_s32(q7s32, q10s32);
  q0s32 = vsubq_s32(q0s32, q2s32);

  d28s16 = vqrshrn_n_s32(q1s32, 14);
  d29s16 = vqrshrn_n_s32(q3s32, 14);
  d14s16 = vqrshrn_n_s32(q7s32, 14);
  d15s16 = vqrshrn_n_s32(q0s32, 14);
  *q14s16 = vcombine_s16(d28s16, d29s16);

  d30s16 = vdup_n_s16(cospi_16_64);

  d22s16 = vget_low_s16(*q11s16);
  d23s16 = vget_high_s16(*q11s16);
  q2s32 = vmull_s16(d22s16, d30s16);
  q3s32 = vmull_s16(d23s16, d30s16);
  q13s32 = vmull_s16(d22s16, d30s16);
  q1s32 = vmull_s16(d23s16, d30s16);

  d24s16 = vget_low_s16(*q12s16);
  d25s16 = vget_high_s16(*q12s16);
  q2s32 = vmlal_s16(q2s32, d24s16, d30s16);
  q3s32 = vmlal_s16(q3s32, d25s16, d30s16);
  q13s32 = vmlsl_s16(q13s32, d24s16, d30s16);
  q1s32 = vmlsl_s16(q1s32, d25s16, d30s16);

  d4s16 = vqrshrn_n_s32(q2s32, 14);
  d5s16 = vqrshrn_n_s32(q3s32, 14);
  d24s16 = vqrshrn_n_s32(q13s32, 14);
  d25s16 = vqrshrn_n_s32(q1s32, 14);
  q2s16 = vcombine_s16(d4s16, d5s16);
  *q12s16 = vcombine_s16(d24s16, d25s16);

  q13s32 = vmull_s16(d10s16, d30s16);
  q1s32 = vmull_s16(d11s16, d30s16);
  q11s32 = vmull_s16(d10s16, d30s16);
  q0s32 = vmull_s16(d11s16, d30s16);

  q13s32 = vmlal_s16(q13s32, d14s16, d30s16);
  q1s32 = vmlal_s16(q1s32, d15s16, d30s16);
  q11s32 = vmlsl_s16(q11s32, d14s16, d30s16);
  q0s32 = vmlsl_s16(q0s32, d15s16, d30s16);

  d20s16 = vqrshrn_n_s32(q13s32, 14);
  d21s16 = vqrshrn_n_s32(q1s32, 14);
  d12s16 = vqrshrn_n_s32(q11s32, 14);
  d13s16 = vqrshrn_n_s32(q0s32, 14);
  *q10s16 = vcombine_s16(d20s16, d21s16);
  q6s16 = vcombine_s16(d12s16, d13s16);

  q5s16 = vdupq_n_s16(0);

  *q9s16 = vsubq_s16(q5s16, *q9s16);
  *q11s16 = vsubq_s16(q5s16, q2s16);
  *q13s16 = vsubq_s16(q5s16, q6s16);
  *q15s16 = vsubq_s16(q5s16, q4s16);
  return;
}

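// 8x8 hybrid inverse transform: a row pass and a column pass (IDCT or
// IADST per dimension, selected by tx_type), separated by transposes,
// followed by a final rounding shift and addition of the result to dest.
// tx_type 0 (DCT in both dimensions) falls back to the C implementation.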
void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest,
                            int dest_stride, int tx_type) {
  int i;
  uint8_t *d1, *d2;
  uint8x8_t d0u8, d1u8, d2u8, d3u8;
  uint64x1_t d0u64, d1u64, d2u64, d3u64;
  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
  uint16x8_t q8u16, q9u16, q10u16, q11u16;

  q8s16 = vld1q_s16(input);
  q9s16 = vld1q_s16(input + 8);
  q10s16 = vld1q_s16(input + 8 * 2);
  q11s16 = vld1q_s16(input + 8 * 3);
  q12s16 = vld1q_s16(input + 8 * 4);
  q13s16 = vld1q_s16(input + 8 * 5);
  q14s16 = vld1q_s16(input + 8 * 6);
  q15s16 = vld1q_s16(input + 8 * 7);

  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
               &q12s16, &q13s16, &q14s16, &q15s16);

  switch (tx_type) {
    case 0:  // idct_idct is not supported. Fall back to C
      vp9_iht8x8_64_add_c(input, dest, dest_stride, tx_type);
      return;
      break;
    case 1:  // iadst_idct
      // generate IDCT constants
      // GENERATE_IDCT_CONSTANTS

      // first transform rows
      IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
                 &q12s16, &q13s16, &q14s16, &q15s16);

      // transpose the matrix
      TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
                   &q12s16, &q13s16, &q14s16, &q15s16);

      // generate IADST constants
      // GENERATE_IADST_CONSTANTS

      // then transform columns
      IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
                  &q12s16, &q13s16, &q14s16, &q15s16);
      break;
    case 2:  // idct_iadst
      // generate IADST constants
      // GENERATE_IADST_CONSTANTS

      // first transform rows
      IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
                  &q12s16, &q13s16, &q14s16, &q15s16);

      // transpose the matrix
      TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
                   &q12s16, &q13s16, &q14s16, &q15s16);

      // generate IDCT constants
      // GENERATE_IDCT_CONSTANTS

      // then transform columns
      IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
                 &q12s16, &q13s16, &q14s16, &q15s16);
      break;
    case 3:  // iadst_iadst
      // generate IADST constants
      // GENERATE_IADST_CONSTANTS

      // first transform rows
      IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
                  &q12s16, &q13s16, &q14s16, &q15s16);

      // transpose the matrix
      TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16,
                   &q12s16, &q13s16, &q14s16, &q15s16);

      // then transform columns
      IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16,
                  &q12s16, &q13s16, &q14s16, &q15s16);
      break;
    default:  // unreachable: tx_type is always in [0, 3]
      assert(0);
      break;
  }

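  // Final rounding of the 2-D transform output: rounded shift right by 5
  // before the result is added to the predictor.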
  q8s16 = vrshrq_n_s16(q8s16, 5);
  q9s16 = vrshrq_n_s16(q9s16, 5);
  q10s16 = vrshrq_n_s16(q10s16, 5);
  q11s16 = vrshrq_n_s16(q11s16, 5);
  q12s16 = vrshrq_n_s16(q12s16, 5);
  q13s16 = vrshrq_n_s16(q13s16, 5);
  q14s16 = vrshrq_n_s16(q14s16, 5);
  q15s16 = vrshrq_n_s16(q15s16, 5);

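  // Reconstruction in two passes of four 8-pixel rows: load the predictor
  // rows from dest, add the residual with a widening add, saturate back to
  // 8 bits, and store.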
  for (d1 = d2 = dest, i = 0; i < 2; i++) {
    if (i != 0) {
      q8s16 = q12s16;
      q9s16 = q13s16;
      q10s16 = q14s16;
      q11s16 = q15s16;
    }

    d0u64 = vld1_u64((uint64_t *)d1);
    d1 += dest_stride;
    d1u64 = vld1_u64((uint64_t *)d1);
    d1 += dest_stride;
    d2u64 = vld1_u64((uint64_t *)d1);
    d1 += dest_stride;
    d3u64 = vld1_u64((uint64_t *)d1);
    d1 += dest_stride;

    q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16),
                     vreinterpret_u8_u64(d0u64));
    q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16),
                     vreinterpret_u8_u64(d1u64));
    q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16),
                      vreinterpret_u8_u64(d2u64));
    q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16),
                      vreinterpret_u8_u64(d3u64));

    d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
    d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
    d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
    d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));

    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
    d2 += dest_stride;
    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
    d2 += dest_stride;
    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
    d2 += dest_stride;
    vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
    d2 += dest_stride;
  }
  return;
}