1 /*
2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <assert.h>
12 #include <stdio.h>
13
14 #include "./vpx_config.h"
15 #include "./vp9_rtcd.h"
16 #include "vp9/common/vp9_common.h"
17 #include "vp9/common/vp9_blockd.h"
18 #include "vp9/common/vp9_idct.h"
19 #include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
20
21 #if HAVE_DSPR2
/*
 * Row pass of the 16x16 inverse DCT, written with MIPS DSPr2 inline assembly.
 *
 * For each of no_rows rows this reads 16 consecutive int16_t coefficients
 * from input, runs the 16-point butterfly network below and stores the 16
 * results with a stride of 16 elements (32 bytes) into output, then advances
 * input by 16 elements and output by 1 element.  Result k of row r therefore
 * lands at output[k * 16 + r] relative to the initial pointer, i.e. the
 * block is written transposed.
 *
 * input   - coefficient rows, 16 int16_t per row; only read.
 * output  - destination buffer; the stores reach element 15 * 16 plus the
 *           row index, so it must hold 16 * 16 int16_t when no_rows is 16.
 * no_rows - number of rows to process.
 *
 * Every product is accumulated in a DSP accumulator preloaded with
 * const_2_power_13 (8192) and read back with "extp ..., 31".  extp takes its
 * bit position from the DSPControl pos field, which this function does not
 * set; vp9_idct16x16_256_add_dspr2 programs it to 45 with wrdsp before
 * calling.  NOTE(review): with pos = 45 this should amount to
 * (sum + 8192) >> 14 - confirm against the MIPS DSP ASE manual.
 *
 * NOTE(review): none of the asm statements declares clobbers although they
 * overwrite $ac0-$ac3 and the last two store through output with sh.
 * Confirm the compiler cannot keep HI/LO or output memory live across them.
 *
 * In the comments below inN denotes input[N] of the current row, cN denotes
 * cospi_N_64 and round() denotes the accumulate/extp sequence described
 * above.
 */
idct16_rows_dspr2(const int16_t * input,int16_t * output,uint32_t no_rows)22 static void idct16_rows_dspr2(const int16_t *input, int16_t *output,
23 uint32_t no_rows) {
24 int i;
25 int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
26 int step1_10, step1_11, step1_12, step1_13;
27 int step2_0, step2_1, step2_2, step2_3;
28 int step2_8, step2_9, step2_10, step2_11;
29 int step2_12, step2_13, step2_14, step2_15;
30 int load1, load2, load3, load4, load5, load6, load7, load8;
31 int result1, result2, result3, result4;
32 const int const_2_power_13 = 8192;
33
/* Counts down from no_rows; the body runs exactly no_rows times. */
34 for (i = no_rows; i--; ) {
35 /* prefetch row */
/* input + 16 is the start of the row that follows the current one. */
36 vp9_prefetch_load((const uint8_t *)(input + 16));
37
/*
 * Even part, coefficients in0, in8, in4, in12 (byte offsets 0, 16, 8, 24):
 *   step2_0 = round((in0 + in8) * c16)
 *   step2_1 = round((in0 - in8) * c16)
 *   step2_2 = round(in4 * c24 - in12 * c8)
 *   step2_3 = round(in4 * c8 + in12 * c24)
 * followed by the butterfly
 *   step1_0 = step2_0 + step2_3    step1_1 = step2_1 + step2_2
 *   step1_2 = step2_1 - step2_2    step1_3 = step2_0 - step2_3
 */
38 __asm__ __volatile__ (
39 "lh %[load1], 0(%[input]) \n\t"
40 "lh %[load2], 16(%[input]) \n\t"
41 "lh %[load3], 8(%[input]) \n\t"
42 "lh %[load4], 24(%[input]) \n\t"
43
44 "mtlo %[const_2_power_13], $ac1 \n\t"
45 "mthi $zero, $ac1 \n\t"
46 "mtlo %[const_2_power_13], $ac2 \n\t"
47 "mthi $zero, $ac2 \n\t"
48 "add %[result1], %[load1], %[load2] \n\t"
49 "sub %[result2], %[load1], %[load2] \n\t"
50 "madd $ac1, %[result1], %[cospi_16_64] \n\t"
51 "madd $ac2, %[result2], %[cospi_16_64] \n\t"
52 "extp %[step2_0], $ac1, 31 \n\t"
53 "extp %[step2_1], $ac2, 31 \n\t"
54
55 "mtlo %[const_2_power_13], $ac3 \n\t"
56 "mthi $zero, $ac3 \n\t"
57 "madd $ac3, %[load3], %[cospi_24_64] \n\t"
58 "msub $ac3, %[load4], %[cospi_8_64] \n\t"
59 "extp %[step2_2], $ac3, 31 \n\t"
60
61 "mtlo %[const_2_power_13], $ac1 \n\t"
62 "mthi $zero, $ac1 \n\t"
63 "madd $ac1, %[load3], %[cospi_8_64] \n\t"
64 "madd $ac1, %[load4], %[cospi_24_64] \n\t"
65 "extp %[step2_3], $ac1, 31 \n\t"
66
67 "add %[step1_0], %[step2_0], %[step2_3] \n\t"
68 "add %[step1_1], %[step2_1], %[step2_2] \n\t"
69 "sub %[step1_2], %[step2_1], %[step2_2] \n\t"
70 "sub %[step1_3], %[step2_0], %[step2_3] \n\t"
71
72 : [load1] "=&r" (load1), [load2] "=&r" (load2),
73 [load3] "=&r" (load3), [load4] "=&r" (load4),
74 [result1] "=&r" (result1), [result2] "=&r" (result2),
75 [step2_0] "=&r" (step2_0), [step2_1] "=&r" (step2_1),
76 [step2_2] "=&r" (step2_2), [step2_3] "=&r" (step2_3),
77 [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1),
78 [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3)
79 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
80 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64),
81 [cospi_16_64] "r" (cospi_16_64)
82 );
83
/*
 * Odd part, coefficients in1, in15, in9, in7 (byte offsets 2, 30, 18, 14):
 *   result1 = round(in1 * c30 - in15 * c2)
 *   result2 = round(in9 * c14 - in7 * c18)
 *   result3 = round(in9 * c18 + in7 * c14)
 *   result4 = round(in1 * c2 + in15 * c30)
 * then, with a = result1 - result2 and b = result4 - result3:
 *   step2_9  = round(b * c24 - a * c8)
 *   step2_14 = round(a * c24 + b * c8)
 *   step2_8  = result1 + result2
 *   step2_15 = result4 + result3
 */
84 __asm__ __volatile__ (
85 "lh %[load5], 2(%[input]) \n\t"
86 "lh %[load6], 30(%[input]) \n\t"
87 "lh %[load7], 18(%[input]) \n\t"
88 "lh %[load8], 14(%[input]) \n\t"
89
90 "mtlo %[const_2_power_13], $ac1 \n\t"
91 "mthi $zero, $ac1 \n\t"
92 "mtlo %[const_2_power_13], $ac3 \n\t"
93 "mthi $zero, $ac3 \n\t"
94
95 "madd $ac1, %[load5], %[cospi_30_64] \n\t"
96 "msub $ac1, %[load6], %[cospi_2_64] \n\t"
97 "extp %[result1], $ac1, 31 \n\t"
98
99 "madd $ac3, %[load7], %[cospi_14_64] \n\t"
100 "msub $ac3, %[load8], %[cospi_18_64] \n\t"
101 "extp %[result2], $ac3, 31 \n\t"
102
103 "mtlo %[const_2_power_13], $ac1 \n\t"
104 "mthi $zero, $ac1 \n\t"
105 "mtlo %[const_2_power_13], $ac2 \n\t"
106 "mthi $zero, $ac2 \n\t"
107
108 "madd $ac1, %[load7], %[cospi_18_64] \n\t"
109 "madd $ac1, %[load8], %[cospi_14_64] \n\t"
110 "extp %[result3], $ac1, 31 \n\t"
111
112 "madd $ac2, %[load5], %[cospi_2_64] \n\t"
113 "madd $ac2, %[load6], %[cospi_30_64] \n\t"
114 "extp %[result4], $ac2, 31 \n\t"
115
116 "sub %[load5], %[result1], %[result2] \n\t"
117 "sub %[load6], %[result4], %[result3] \n\t"
118
119 "mtlo %[const_2_power_13], $ac1 \n\t"
120 "mthi $zero, $ac1 \n\t"
121 "mtlo %[const_2_power_13], $ac3 \n\t"
122 "mthi $zero, $ac3 \n\t"
123
124 "madd $ac1, %[load6], %[cospi_24_64] \n\t"
125 "msub $ac1, %[load5], %[cospi_8_64] \n\t"
126 "madd $ac3, %[load5], %[cospi_24_64] \n\t"
127 "madd $ac3, %[load6], %[cospi_8_64] \n\t"
128
129 "extp %[step2_9], $ac1, 31 \n\t"
130 "extp %[step2_14], $ac3, 31 \n\t"
131 "add %[step2_8], %[result1], %[result2] \n\t"
132 "add %[step2_15], %[result4], %[result3] \n\t"
133
134 : [load5] "=&r" (load5), [load6] "=&r" (load6),
135 [load7] "=&r" (load7), [load8] "=&r" (load8),
136 [result1] "=&r" (result1), [result2] "=&r" (result2),
137 [result3] "=&r" (result3), [result4] "=&r" (result4),
138 [step2_8] "=r" (step2_8), [step2_15] "=r" (step2_15),
139 [step2_9] "=r" (step2_9), [step2_14] "=r" (step2_14)
140 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
141 [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64),
142 [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64),
143 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
144 );
145
/*
 * Odd part, coefficients in5, in11, in13, in3 (byte offsets 10, 22, 26, 6):
 *   result1 = round(in5 * c22 - in11 * c10)
 *   result2 = round(in13 * c6 - in3 * c26)
 *   result3 = round(in5 * c10 + in11 * c22)
 *   result4 = round(in13 * c26 + in3 * c6)
 * then, with a = result2 - result1 and b = result4 - result3:
 *   step2_10 = round(-a * c24 - b * c8)
 *   step2_13 = round(b * c24 - a * c8)
 *   step2_11 = result1 + result2
 *   step2_12 = result4 + result3
 */
146 __asm__ __volatile__ (
147 "lh %[load1], 10(%[input]) \n\t"
148 "lh %[load2], 22(%[input]) \n\t"
149 "lh %[load3], 26(%[input]) \n\t"
150 "lh %[load4], 6(%[input]) \n\t"
151
152 "mtlo %[const_2_power_13], $ac1 \n\t"
153 "mthi $zero, $ac1 \n\t"
154 "mtlo %[const_2_power_13], $ac3 \n\t"
155 "mthi $zero, $ac3 \n\t"
156
157 "madd $ac1, %[load1], %[cospi_22_64] \n\t"
158 "msub $ac1, %[load2], %[cospi_10_64] \n\t"
159 "extp %[result1], $ac1, 31 \n\t"
160
161 "madd $ac3, %[load3], %[cospi_6_64] \n\t"
162 "msub $ac3, %[load4], %[cospi_26_64] \n\t"
163 "extp %[result2], $ac3, 31 \n\t"
164
165 "mtlo %[const_2_power_13], $ac1 \n\t"
166 "mthi $zero, $ac1 \n\t"
167 "mtlo %[const_2_power_13], $ac2 \n\t"
168 "mthi $zero, $ac2 \n\t"
169
170 "madd $ac1, %[load1], %[cospi_10_64] \n\t"
171 "madd $ac1, %[load2], %[cospi_22_64] \n\t"
172 "extp %[result3], $ac1, 31 \n\t"
173
174 "madd $ac2, %[load3], %[cospi_26_64] \n\t"
175 "madd $ac2, %[load4], %[cospi_6_64] \n\t"
176 "extp %[result4], $ac2, 31 \n\t"
177
178 "mtlo %[const_2_power_13], $ac1 \n\t"
179 "mthi $zero, $ac1 \n\t"
180 "mtlo %[const_2_power_13], $ac3 \n\t"
181 "mthi $zero, $ac3 \n\t"
182
183 "sub %[load1], %[result2], %[result1] \n\t"
184 "sub %[load2], %[result4], %[result3] \n\t"
185
186 "msub $ac1, %[load1], %[cospi_24_64] \n\t"
187 "msub $ac1, %[load2], %[cospi_8_64] \n\t"
188 "madd $ac3, %[load2], %[cospi_24_64] \n\t"
189 "msub $ac3, %[load1], %[cospi_8_64] \n\t"
190
191 "extp %[step2_10], $ac1, 31 \n\t"
192 "extp %[step2_13], $ac3, 31 \n\t"
193 "add %[step2_11], %[result1], %[result2] \n\t"
194 "add %[step2_12], %[result4], %[result3] \n\t"
195
196 : [load1] "=&r" (load1), [load2] "=&r" (load2),
197 [load3] "=&r" (load3), [load4] "=&r" (load4),
198 [result1] "=&r" (result1), [result2] "=&r" (result2),
199 [result3] "=&r" (result3), [result4] "=&r" (result4),
200 [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11),
201 [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13)
202 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
203 [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64),
204 [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64),
205 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
206 );
207
/*
 * Coefficients in2, in14, in10, in6 (byte offsets 4, 28, 20, 12):
 *   result1 = round(in2 * c28 - in14 * c4)
 *   result2 = round(in10 * c12 - in6 * c20)
 *   result3 = round(in10 * c20 + in6 * c12)
 *   result4 = round(in2 * c4 + in14 * c28)
 *   step1_5 = round((result4 - result3 - result1 + result2) * c16)
 *   step1_6 = round((result1 - result2 - result3 + result4) * c16)
 *   step1_4 = result1 + result2
 *   step1_7 = result4 + result3
 */
208 __asm__ __volatile__ (
209 "lh %[load5], 4(%[input]) \n\t"
210 "lh %[load6], 28(%[input]) \n\t"
211 "lh %[load7], 20(%[input]) \n\t"
212 "lh %[load8], 12(%[input]) \n\t"
213
214 "mtlo %[const_2_power_13], $ac1 \n\t"
215 "mthi $zero, $ac1 \n\t"
216 "mtlo %[const_2_power_13], $ac3 \n\t"
217 "mthi $zero, $ac3 \n\t"
218
219 "madd $ac1, %[load5], %[cospi_28_64] \n\t"
220 "msub $ac1, %[load6], %[cospi_4_64] \n\t"
221 "extp %[result1], $ac1, 31 \n\t"
222
223 "madd $ac3, %[load7], %[cospi_12_64] \n\t"
224 "msub $ac3, %[load8], %[cospi_20_64] \n\t"
225 "extp %[result2], $ac3, 31 \n\t"
226
227 "mtlo %[const_2_power_13], $ac1 \n\t"
228 "mthi $zero, $ac1 \n\t"
229 "mtlo %[const_2_power_13], $ac2 \n\t"
230 "mthi $zero, $ac2 \n\t"
231
232 "madd $ac1, %[load7], %[cospi_20_64] \n\t"
233 "madd $ac1, %[load8], %[cospi_12_64] \n\t"
234 "extp %[result3], $ac1, 31 \n\t"
235
236 "madd $ac2, %[load5], %[cospi_4_64] \n\t"
237 "madd $ac2, %[load6], %[cospi_28_64] \n\t"
238 "extp %[result4], $ac2, 31 \n\t"
239
240 "mtlo %[const_2_power_13], $ac1 \n\t"
241 "mthi $zero, $ac1 \n\t"
242 "mtlo %[const_2_power_13], $ac3 \n\t"
243 "mthi $zero, $ac3 \n\t"
244
245 "sub %[load5], %[result4], %[result3] \n\t"
246 "sub %[load5], %[load5], %[result1] \n\t"
247 "add %[load5], %[load5], %[result2] \n\t"
248
249 "sub %[load6], %[result1], %[result2] \n\t"
250 "sub %[load6], %[load6], %[result3] \n\t"
251 "add %[load6], %[load6], %[result4] \n\t"
252
253 "madd $ac1, %[load5], %[cospi_16_64] \n\t"
254 "madd $ac3, %[load6], %[cospi_16_64] \n\t"
255
256 "extp %[step1_5], $ac1, 31 \n\t"
257 "extp %[step1_6], $ac3, 31 \n\t"
258 "add %[step1_4], %[result1], %[result2] \n\t"
259 "add %[step1_7], %[result4], %[result3] \n\t"
260
261 : [load5] "=&r" (load5), [load6] "=&r" (load6),
262 [load7] "=&r" (load7), [load8] "=&r" (load8),
263 [result1] "=&r" (result1), [result2] "=&r" (result2),
264 [result3] "=&r" (result3), [result4] "=&r" (result4),
265 [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5),
266 [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7)
267 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
268 [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64),
269 [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64),
270 [cospi_16_64] "r" (cospi_16_64)
271 );
272
/*
 * Combines the step2_8..step2_15 terms; this is the only statement that
 * also uses $ac0:
 *   step1_10 = round((step2_14 - step2_13 - step2_9 + step2_10) * c16)
 *   step1_13 = round((step2_14 - step2_13 - step2_10 + step2_9) * c16)
 *   step1_11 = round((step2_15 - step2_12 - step2_8 + step2_11) * c16)
 *   step1_12 = round((step2_15 - step2_12 - step2_11 + step2_8) * c16)
 */
273 __asm__ __volatile__ (
274 "mtlo %[const_2_power_13], $ac0 \n\t"
275 "mthi $zero, $ac0 \n\t"
276 "mtlo %[const_2_power_13], $ac1 \n\t"
277 "mthi $zero, $ac1 \n\t"
278
279 "sub %[load5], %[step2_14], %[step2_13] \n\t"
280 "sub %[load5], %[load5], %[step2_9] \n\t"
281 "add %[load5], %[load5], %[step2_10] \n\t"
282
283 "madd $ac0, %[load5], %[cospi_16_64] \n\t"
284
285 "sub %[load6], %[step2_14], %[step2_13] \n\t"
286 "sub %[load6], %[load6], %[step2_10] \n\t"
287 "add %[load6], %[load6], %[step2_9] \n\t"
288
289 "madd $ac1, %[load6], %[cospi_16_64] \n\t"
290
291 "mtlo %[const_2_power_13], $ac2 \n\t"
292 "mthi $zero, $ac2 \n\t"
293 "mtlo %[const_2_power_13], $ac3 \n\t"
294 "mthi $zero, $ac3 \n\t"
295
296 "sub %[load5], %[step2_15], %[step2_12] \n\t"
297 "sub %[load5], %[load5], %[step2_8] \n\t"
298 "add %[load5], %[load5], %[step2_11] \n\t"
299
300 "madd $ac2, %[load5], %[cospi_16_64] \n\t"
301
302 "sub %[load6], %[step2_15], %[step2_12] \n\t"
303 "sub %[load6], %[load6], %[step2_11] \n\t"
304 "add %[load6], %[load6], %[step2_8] \n\t"
305
306 "madd $ac3, %[load6], %[cospi_16_64] \n\t"
307
308 "extp %[step1_10], $ac0, 31 \n\t"
309 "extp %[step1_13], $ac1, 31 \n\t"
310 "extp %[step1_11], $ac2, 31 \n\t"
311 "extp %[step1_12], $ac3, 31 \n\t"
312
313 : [load5] "=&r" (load5), [load6] "=&r" (load6),
314 [step1_10] "=r" (step1_10), [step1_11] "=r" (step1_11),
315 [step1_12] "=r" (step1_12), [step1_13] "=r" (step1_13)
316 : [const_2_power_13] "r" (const_2_power_13),
317 [step2_14] "r" (step2_14), [step2_13] "r" (step2_13),
318 [step2_9] "r" (step2_9), [step2_10] "r" (step2_10),
319 [step2_15] "r" (step2_15), [step2_12] "r" (step2_12),
320 [step2_8] "r" (step2_8), [step2_11] "r" (step2_11),
321 [cospi_16_64] "r" (cospi_16_64)
322 );
323
/*
 * Stores results 0, 1, 6, 7, 8, 9, 14 and 15 of this row (byte offsets
 * 0, 32, 192, 224, 256, 288, 448, 480 from output):
 *   res0  = step1_0 + step1_7 + step2_12 + step2_15
 *   res1  = step1_1 + step1_6 + step2_13 + step2_14
 *   res6  = step1_1 - step1_6 + step2_9 + step2_10
 *   res7  = step1_0 - step1_7 + step2_8 + step2_11
 *   res8  = step1_0 - step1_7 - step2_8 - step2_11
 *   res9  = step1_1 - step1_6 - step2_9 - step2_10
 *   res14 = step1_1 + step1_6 - step2_13 - step2_14
 *   res15 = step1_0 + step1_7 - step2_12 - step2_15
 * sh keeps only the low 16 bits of each sum.
 */
324 __asm__ __volatile__ (
325 "add %[load5], %[step1_0], %[step1_7] \n\t"
326 "add %[load5], %[load5], %[step2_12] \n\t"
327 "add %[load5], %[load5], %[step2_15] \n\t"
328 "add %[load6], %[step1_1], %[step1_6] \n\t"
329 "add %[load6], %[load6], %[step2_13] \n\t"
330 "add %[load6], %[load6], %[step2_14] \n\t"
331 "sh %[load5], 0(%[output]) \n\t"
332 "sh %[load6], 32(%[output]) \n\t"
333 "sub %[load5], %[step1_1], %[step1_6] \n\t"
334 "add %[load5], %[load5], %[step2_9] \n\t"
335 "add %[load5], %[load5], %[step2_10] \n\t"
336 "sub %[load6], %[step1_0], %[step1_7] \n\t"
337 "add %[load6], %[load6], %[step2_8] \n\t"
338 "add %[load6], %[load6], %[step2_11] \n\t"
339 "sh %[load5], 192(%[output]) \n\t"
340 "sh %[load6], 224(%[output]) \n\t"
341 "sub %[load5], %[step1_0], %[step1_7] \n\t"
342 "sub %[load5], %[load5], %[step2_8] \n\t"
343 "sub %[load5], %[load5], %[step2_11] \n\t"
344 "sub %[load6], %[step1_1], %[step1_6] \n\t"
345 "sub %[load6], %[load6], %[step2_9] \n\t"
346 "sub %[load6], %[load6], %[step2_10] \n\t"
347 "sh %[load5], 256(%[output]) \n\t"
348 "sh %[load6], 288(%[output]) \n\t"
349 "add %[load5], %[step1_1], %[step1_6] \n\t"
350 "sub %[load5], %[load5], %[step2_13] \n\t"
351 "sub %[load5], %[load5], %[step2_14] \n\t"
352 "add %[load6], %[step1_0], %[step1_7] \n\t"
353 "sub %[load6], %[load6], %[step2_12] \n\t"
354 "sub %[load6], %[load6], %[step2_15] \n\t"
355 "sh %[load5], 448(%[output]) \n\t"
356 "sh %[load6], 480(%[output]) \n\t"
357
358 : [load5] "=&r" (load5), [load6] "=&r" (load6)
359 : [output] "r" (output),
360 [step1_0] "r" (step1_0), [step1_1] "r" (step1_1),
361 [step1_6] "r" (step1_6), [step1_7] "r" (step1_7),
362 [step2_8] "r" (step2_8), [step2_9] "r" (step2_9),
363 [step2_10] "r" (step2_10), [step2_11] "r" (step2_11),
364 [step2_12] "r" (step2_12), [step2_13] "r" (step2_13),
365 [step2_14] "r" (step2_14), [step2_15] "r" (step2_15)
366 );
367
/*
 * Stores results 2, 3, 4, 5, 10, 11, 12 and 13 of this row (byte offsets
 * 64, 96, 128, 160, 320, 352, 384, 416 from output):
 *   res2  = step1_2 + step1_5 + step1_13
 *   res3  = step1_3 + step1_4 + step1_12
 *   res4  = step1_3 - step1_4 + step1_11
 *   res5  = step1_2 - step1_5 + step1_10
 *   res10 = step1_2 - step1_5 - step1_10
 *   res11 = step1_3 - step1_4 - step1_11
 *   res12 = step1_3 + step1_4 - step1_12
 *   res13 = step1_2 + step1_5 - step1_13
 */
368 __asm__ __volatile__ (
369 "add %[load5], %[step1_2], %[step1_5] \n\t"
370 "add %[load5], %[load5], %[step1_13] \n\t"
371 "add %[load6], %[step1_3], %[step1_4] \n\t"
372 "add %[load6], %[load6], %[step1_12] \n\t"
373 "sh %[load5], 64(%[output]) \n\t"
374 "sh %[load6], 96(%[output]) \n\t"
375 "sub %[load5], %[step1_3], %[step1_4] \n\t"
376 "add %[load5], %[load5], %[step1_11] \n\t"
377 "sub %[load6], %[step1_2], %[step1_5] \n\t"
378 "add %[load6], %[load6], %[step1_10] \n\t"
379 "sh %[load5], 128(%[output]) \n\t"
380 "sh %[load6], 160(%[output]) \n\t"
381 "sub %[load5], %[step1_2], %[step1_5] \n\t"
382 "sub %[load5], %[load5], %[step1_10] \n\t"
383 "sub %[load6], %[step1_3], %[step1_4] \n\t"
384 "sub %[load6], %[load6], %[step1_11] \n\t"
385 "sh %[load5], 320(%[output]) \n\t"
386 "sh %[load6], 352(%[output]) \n\t"
387 "add %[load5], %[step1_3], %[step1_4] \n\t"
388 "sub %[load5], %[load5], %[step1_12] \n\t"
389 "add %[load6], %[step1_2], %[step1_5] \n\t"
390 "sub %[load6], %[load6], %[step1_13] \n\t"
391 "sh %[load5], 384(%[output]) \n\t"
392 "sh %[load6], 416(%[output]) \n\t"
393
394 : [load5] "=&r" (load5), [load6] "=&r" (load6)
395 : [output] "r" (output),
396 [step1_2] "r" (step1_2), [step1_3] "r" (step1_3),
397 [step1_4] "r" (step1_4), [step1_5] "r" (step1_5),
398 [step1_10] "r" (step1_10), [step1_11] "r" (step1_11),
399 [step1_12] "r" (step1_12), [step1_13] "r" (step1_13)
400 );
401
/*
 * Next input row is 16 coefficients further on; the next row's results go
 * one element further into output because the stores above already use a
 * 16-element stride.
 */
402 input += 16;
403 output += 1;
404 }
405 }
406
/*
 * Second pass of the 16x16 inverse DCT plus reconstruction, written with
 * MIPS DSPr2 inline assembly.
 *
 * The loop runs 16 times.  Iteration i reads 16 consecutive int16_t values
 * from input, applies the 16-point butterfly network, and for each of the 16
 * results computes (result + 32) >> 6, adds it to the byte currently stored
 * in dest, maps the sum through the vp9_ff_cropTbl lookup table and stores
 * the looked-up byte back.  The 16 bytes touched in iteration i are
 * dest[i], dest[i + dest_stride], ..., dest[i + 15 * dest_stride], i.e. one
 * pixel column.  input advances by 16 elements per iteration.
 *
 * input       - 16 * 16 int16_t intermediate values; only read here.
 *               NOTE(review): presumably the transposed buffer produced by
 *               idct16_rows_dspr2 - confirm against the caller.
 * dest        - pixel block that is read, updated and written in place.
 * dest_stride - byte distance between consecutive pixel rows of dest.
 *
 * As in the row pass, every product is accumulated in a DSP accumulator
 * preloaded with const_2_power_13 (8192) and read back with "extp ..., 31";
 * the extp bit position comes from the DSPControl pos field, which this
 * function does not set (vp9_idct16x16_256_add_dspr2 writes 45 with wrdsp).
 *
 * NOTE(review): vp9_ff_cropTbl is defined elsewhere; it is indexed with
 * pixel + residual, which can fall outside 0..255, so it presumably is a
 * clamping table that tolerates such indices - confirm.
 * NOTE(review): none of the asm statements declares clobbers although they
 * overwrite $ac0-$ac3 and the last one loads from and stores to dest.
 *
 * In the comments below inN denotes input[N] of the current group of 16,
 * cN denotes cospi_N_64 and round() denotes the accumulate/extp sequence.
 */
idct16_cols_add_blk_dspr2(int16_t * input,uint8_t * dest,int dest_stride)407 static void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest,
408 int dest_stride) {
409 int i;
410 int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
411 int step1_8, step1_9, step1_10, step1_11;
412 int step1_12, step1_13, step1_14, step1_15;
413 int step2_0, step2_1, step2_2, step2_3;
414 int step2_8, step2_9, step2_10, step2_11;
415 int step2_12, step2_13, step2_14, step2_15;
416 int load1, load2, load3, load4, load5, load6, load7, load8;
417 int result1, result2, result3, result4;
418 const int const_2_power_13 = 8192;
419 uint8_t *dest_pix;
420 uint8_t *cm = vp9_ff_cropTbl;
421
422 /* prefetch vp9_ff_cropTbl */
/* Eight prefetches at 32-byte steps, covering table offsets 0..224. */
423 vp9_prefetch_load(vp9_ff_cropTbl);
424 vp9_prefetch_load(vp9_ff_cropTbl + 32);
425 vp9_prefetch_load(vp9_ff_cropTbl + 64);
426 vp9_prefetch_load(vp9_ff_cropTbl + 96);
427 vp9_prefetch_load(vp9_ff_cropTbl + 128);
428 vp9_prefetch_load(vp9_ff_cropTbl + 160);
429 vp9_prefetch_load(vp9_ff_cropTbl + 192);
430 vp9_prefetch_load(vp9_ff_cropTbl + 224);
431
432 for (i = 0; i < 16; ++i) {
/* Top pixel of column i; walked downwards by the final asm statement. */
433 dest_pix = (dest + i);
/*
 * Even part, coefficients in0, in8, in4, in12 (byte offsets 0, 16, 8, 24):
 *   step2_0 = round((in0 + in8) * c16)
 *   step2_1 = round((in0 - in8) * c16)
 *   step2_2 = round(in4 * c24 - in12 * c8)
 *   step2_3 = round(in4 * c8 + in12 * c24)
 *   step1_0 = step2_0 + step2_3    step1_1 = step2_1 + step2_2
 *   step1_2 = step2_1 - step2_2    step1_3 = step2_0 - step2_3
 */
434 __asm__ __volatile__ (
435 "lh %[load1], 0(%[input]) \n\t"
436 "lh %[load2], 16(%[input]) \n\t"
437 "lh %[load3], 8(%[input]) \n\t"
438 "lh %[load4], 24(%[input]) \n\t"
439
440 "mtlo %[const_2_power_13], $ac1 \n\t"
441 "mthi $zero, $ac1 \n\t"
442 "mtlo %[const_2_power_13], $ac2 \n\t"
443 "mthi $zero, $ac2 \n\t"
444 "add %[result1], %[load1], %[load2] \n\t"
445 "sub %[result2], %[load1], %[load2] \n\t"
446 "madd $ac1, %[result1], %[cospi_16_64] \n\t"
447 "madd $ac2, %[result2], %[cospi_16_64] \n\t"
448 "extp %[step2_0], $ac1, 31 \n\t"
449 "extp %[step2_1], $ac2, 31 \n\t"
450
451 "mtlo %[const_2_power_13], $ac3 \n\t"
452 "mthi $zero, $ac3 \n\t"
453 "madd $ac3, %[load3], %[cospi_24_64] \n\t"
454 "msub $ac3, %[load4], %[cospi_8_64] \n\t"
455 "extp %[step2_2], $ac3, 31 \n\t"
456
457 "mtlo %[const_2_power_13], $ac1 \n\t"
458 "mthi $zero, $ac1 \n\t"
459 "madd $ac1, %[load3], %[cospi_8_64] \n\t"
460 "madd $ac1, %[load4], %[cospi_24_64] \n\t"
461 "extp %[step2_3], $ac1, 31 \n\t"
462
463 "add %[step1_0], %[step2_0], %[step2_3] \n\t"
464 "add %[step1_1], %[step2_1], %[step2_2] \n\t"
465 "sub %[step1_2], %[step2_1], %[step2_2] \n\t"
466 "sub %[step1_3], %[step2_0], %[step2_3] \n\t"
467
468 : [load1] "=&r" (load1), [load2] "=&r" (load2),
469 [load3] "=&r" (load3), [load4] "=&r" (load4),
470 [result1] "=&r" (result1), [result2] "=&r" (result2),
471 [step2_0] "=&r" (step2_0), [step2_1] "=&r" (step2_1),
472 [step2_2] "=&r" (step2_2), [step2_3] "=&r" (step2_3),
473 [step1_0] "=r" (step1_0), [step1_1] "=r" (step1_1),
474 [step1_2] "=r" (step1_2), [step1_3] "=r" (step1_3)
475 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
476 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64),
477 [cospi_16_64] "r" (cospi_16_64)
478 );
479
/*
 * Odd part, coefficients in1, in15, in9, in7 (byte offsets 2, 30, 18, 14):
 *   result1 = round(in1 * c30 - in15 * c2)
 *   result2 = round(in9 * c14 - in7 * c18)
 *   result3 = round(in9 * c18 + in7 * c14)
 *   result4 = round(in1 * c2 + in15 * c30)
 * then, with a = result1 - result2 and b = result4 - result3:
 *   step2_9  = round(b * c24 - a * c8)
 *   step2_14 = round(a * c24 + b * c8)
 *   step2_8  = result1 + result2
 *   step2_15 = result4 + result3
 */
480 __asm__ __volatile__ (
481 "lh %[load5], 2(%[input]) \n\t"
482 "lh %[load6], 30(%[input]) \n\t"
483 "lh %[load7], 18(%[input]) \n\t"
484 "lh %[load8], 14(%[input]) \n\t"
485
486 "mtlo %[const_2_power_13], $ac1 \n\t"
487 "mthi $zero, $ac1 \n\t"
488 "mtlo %[const_2_power_13], $ac3 \n\t"
489 "mthi $zero, $ac3 \n\t"
490
491 "madd $ac1, %[load5], %[cospi_30_64] \n\t"
492 "msub $ac1, %[load6], %[cospi_2_64] \n\t"
493 "extp %[result1], $ac1, 31 \n\t"
494
495 "madd $ac3, %[load7], %[cospi_14_64] \n\t"
496 "msub $ac3, %[load8], %[cospi_18_64] \n\t"
497 "extp %[result2], $ac3, 31 \n\t"
498
499 "mtlo %[const_2_power_13], $ac1 \n\t"
500 "mthi $zero, $ac1 \n\t"
501 "mtlo %[const_2_power_13], $ac2 \n\t"
502 "mthi $zero, $ac2 \n\t"
503
504 "madd $ac1, %[load7], %[cospi_18_64] \n\t"
505 "madd $ac1, %[load8], %[cospi_14_64] \n\t"
506 "extp %[result3], $ac1, 31 \n\t"
507
508 "madd $ac2, %[load5], %[cospi_2_64] \n\t"
509 "madd $ac2, %[load6], %[cospi_30_64] \n\t"
510 "extp %[result4], $ac2, 31 \n\t"
511
512 "sub %[load5], %[result1], %[result2] \n\t"
513 "sub %[load6], %[result4], %[result3] \n\t"
514
515 "mtlo %[const_2_power_13], $ac1 \n\t"
516 "mthi $zero, $ac1 \n\t"
517 "mtlo %[const_2_power_13], $ac3 \n\t"
518 "mthi $zero, $ac3 \n\t"
519
520 "madd $ac1, %[load6], %[cospi_24_64] \n\t"
521 "msub $ac1, %[load5], %[cospi_8_64] \n\t"
522 "madd $ac3, %[load5], %[cospi_24_64] \n\t"
523 "madd $ac3, %[load6], %[cospi_8_64] \n\t"
524
525 "extp %[step2_9], $ac1, 31 \n\t"
526 "extp %[step2_14], $ac3, 31 \n\t"
527 "add %[step2_8], %[result1], %[result2] \n\t"
528 "add %[step2_15], %[result4], %[result3] \n\t"
529
530 : [load5] "=&r" (load5), [load6] "=&r" (load6),
531 [load7] "=&r" (load7), [load8] "=&r" (load8),
532 [result1] "=&r" (result1), [result2] "=&r" (result2),
533 [result3] "=&r" (result3), [result4] "=&r" (result4),
534 [step2_8] "=r" (step2_8), [step2_15] "=r" (step2_15),
535 [step2_9] "=r" (step2_9), [step2_14] "=r" (step2_14)
536 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
537 [cospi_30_64] "r" (cospi_30_64), [cospi_2_64] "r" (cospi_2_64),
538 [cospi_14_64] "r" (cospi_14_64), [cospi_18_64] "r" (cospi_18_64),
539 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
540 );
541
/*
 * Odd part, coefficients in5, in11, in13, in3 (byte offsets 10, 22, 26, 6):
 *   result1 = round(in5 * c22 - in11 * c10)
 *   result2 = round(in13 * c6 - in3 * c26)
 *   result3 = round(in5 * c10 + in11 * c22)
 *   result4 = round(in13 * c26 + in3 * c6)
 * then, with a = result2 - result1 and b = result4 - result3:
 *   step2_10 = round(-a * c24 - b * c8)
 *   step2_13 = round(b * c24 - a * c8)
 *   step2_11 = result1 + result2
 *   step2_12 = result4 + result3
 */
542 __asm__ __volatile__ (
543 "lh %[load1], 10(%[input]) \n\t"
544 "lh %[load2], 22(%[input]) \n\t"
545 "lh %[load3], 26(%[input]) \n\t"
546 "lh %[load4], 6(%[input]) \n\t"
547
548 "mtlo %[const_2_power_13], $ac1 \n\t"
549 "mthi $zero, $ac1 \n\t"
550 "mtlo %[const_2_power_13], $ac3 \n\t"
551 "mthi $zero, $ac3 \n\t"
552
553 "madd $ac1, %[load1], %[cospi_22_64] \n\t"
554 "msub $ac1, %[load2], %[cospi_10_64] \n\t"
555 "extp %[result1], $ac1, 31 \n\t"
556
557 "madd $ac3, %[load3], %[cospi_6_64] \n\t"
558 "msub $ac3, %[load4], %[cospi_26_64] \n\t"
559 "extp %[result2], $ac3, 31 \n\t"
560
561 "mtlo %[const_2_power_13], $ac1 \n\t"
562 "mthi $zero, $ac1 \n\t"
563 "mtlo %[const_2_power_13], $ac2 \n\t"
564 "mthi $zero, $ac2 \n\t"
565
566 "madd $ac1, %[load1], %[cospi_10_64] \n\t"
567 "madd $ac1, %[load2], %[cospi_22_64] \n\t"
568 "extp %[result3], $ac1, 31 \n\t"
569
570 "madd $ac2, %[load3], %[cospi_26_64] \n\t"
571 "madd $ac2, %[load4], %[cospi_6_64] \n\t"
572 "extp %[result4], $ac2, 31 \n\t"
573
574 "mtlo %[const_2_power_13], $ac1 \n\t"
575 "mthi $zero, $ac1 \n\t"
576 "mtlo %[const_2_power_13], $ac3 \n\t"
577 "mthi $zero, $ac3 \n\t"
578
579 "sub %[load1], %[result2], %[result1] \n\t"
580 "sub %[load2], %[result4], %[result3] \n\t"
581
582 "msub $ac1, %[load1], %[cospi_24_64] \n\t"
583 "msub $ac1, %[load2], %[cospi_8_64] \n\t"
584 "madd $ac3, %[load2], %[cospi_24_64] \n\t"
585 "msub $ac3, %[load1], %[cospi_8_64] \n\t"
586
587 "extp %[step2_10], $ac1, 31 \n\t"
588 "extp %[step2_13], $ac3, 31 \n\t"
589 "add %[step2_11], %[result1], %[result2] \n\t"
590 "add %[step2_12], %[result4], %[result3] \n\t"
591
592 : [load1] "=&r" (load1), [load2] "=&r" (load2),
593 [load3] "=&r" (load3), [load4] "=&r" (load4),
594 [result1] "=&r" (result1), [result2] "=&r" (result2),
595 [result3] "=&r" (result3), [result4] "=&r" (result4),
596 [step2_10] "=r" (step2_10), [step2_11] "=r" (step2_11),
597 [step2_12] "=r" (step2_12), [step2_13] "=r" (step2_13)
598 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
599 [cospi_22_64] "r" (cospi_22_64), [cospi_10_64] "r" (cospi_10_64),
600 [cospi_6_64] "r" (cospi_6_64), [cospi_26_64] "r" (cospi_26_64),
601 [cospi_24_64] "r" (cospi_24_64), [cospi_8_64] "r" (cospi_8_64)
602 );
603
/*
 * Coefficients in2, in14, in10, in6 (byte offsets 4, 28, 20, 12):
 *   result1 = round(in2 * c28 - in14 * c4)
 *   result2 = round(in10 * c12 - in6 * c20)
 *   result3 = round(in10 * c20 + in6 * c12)
 *   result4 = round(in2 * c4 + in14 * c28)
 *   step1_5 = round((result4 - result3 - result1 + result2) * c16)
 *   step1_6 = round((result1 - result2 - result3 + result4) * c16)
 *   step1_4 = result1 + result2
 *   step1_7 = result4 + result3
 */
604 __asm__ __volatile__ (
605 "lh %[load5], 4(%[input]) \n\t"
606 "lh %[load6], 28(%[input]) \n\t"
607 "lh %[load7], 20(%[input]) \n\t"
608 "lh %[load8], 12(%[input]) \n\t"
609
610 "mtlo %[const_2_power_13], $ac1 \n\t"
611 "mthi $zero, $ac1 \n\t"
612 "mtlo %[const_2_power_13], $ac3 \n\t"
613 "mthi $zero, $ac3 \n\t"
614
615 "madd $ac1, %[load5], %[cospi_28_64] \n\t"
616 "msub $ac1, %[load6], %[cospi_4_64] \n\t"
617 "extp %[result1], $ac1, 31 \n\t"
618
619 "madd $ac3, %[load7], %[cospi_12_64] \n\t"
620 "msub $ac3, %[load8], %[cospi_20_64] \n\t"
621 "extp %[result2], $ac3, 31 \n\t"
622
623 "mtlo %[const_2_power_13], $ac1 \n\t"
624 "mthi $zero, $ac1 \n\t"
625 "mtlo %[const_2_power_13], $ac2 \n\t"
626 "mthi $zero, $ac2 \n\t"
627
628 "madd $ac1, %[load7], %[cospi_20_64] \n\t"
629 "madd $ac1, %[load8], %[cospi_12_64] \n\t"
630 "extp %[result3], $ac1, 31 \n\t"
631
632 "madd $ac2, %[load5], %[cospi_4_64] \n\t"
633 "madd $ac2, %[load6], %[cospi_28_64] \n\t"
634 "extp %[result4], $ac2, 31 \n\t"
635
636 "mtlo %[const_2_power_13], $ac1 \n\t"
637 "mthi $zero, $ac1 \n\t"
638 "mtlo %[const_2_power_13], $ac3 \n\t"
639 "mthi $zero, $ac3 \n\t"
640
641 "sub %[load5], %[result4], %[result3] \n\t"
642 "sub %[load5], %[load5], %[result1] \n\t"
643 "add %[load5], %[load5], %[result2] \n\t"
644
645 "sub %[load6], %[result1], %[result2] \n\t"
646 "sub %[load6], %[load6], %[result3] \n\t"
647 "add %[load6], %[load6], %[result4] \n\t"
648
649 "madd $ac1, %[load5], %[cospi_16_64] \n\t"
650 "madd $ac3, %[load6], %[cospi_16_64] \n\t"
651
652 "extp %[step1_5], $ac1, 31 \n\t"
653 "extp %[step1_6], $ac3, 31 \n\t"
654
655 "add %[step1_4], %[result1], %[result2] \n\t"
656 "add %[step1_7], %[result4], %[result3] \n\t"
657
658 : [load5] "=&r" (load5), [load6] "=&r" (load6),
659 [load7] "=&r" (load7), [load8] "=&r" (load8),
660 [result1] "=&r" (result1), [result2] "=&r" (result2),
661 [result3] "=&r" (result3), [result4] "=&r" (result4),
662 [step1_4] "=r" (step1_4), [step1_5] "=r" (step1_5),
663 [step1_6] "=r" (step1_6), [step1_7] "=r" (step1_7)
664 : [const_2_power_13] "r" (const_2_power_13), [input] "r" (input),
665 [cospi_20_64] "r" (cospi_20_64), [cospi_12_64] "r" (cospi_12_64),
666 [cospi_4_64] "r" (cospi_4_64), [cospi_28_64] "r" (cospi_28_64),
667 [cospi_16_64] "r" (cospi_16_64)
668 );
669
/*
 * Combines the step2_8..step2_15 terms; this is the only statement that
 * also uses $ac0:
 *   step1_10 = round((step2_14 - step2_13 - step2_9 + step2_10) * c16)
 *   step1_13 = round((step2_14 - step2_13 - step2_10 + step2_9) * c16)
 *   step1_11 = round((step2_15 - step2_12 - step2_8 + step2_11) * c16)
 *   step1_12 = round((step2_15 - step2_12 - step2_11 + step2_8) * c16)
 */
670 __asm__ __volatile__ (
671 "mtlo %[const_2_power_13], $ac0 \n\t"
672 "mthi $zero, $ac0 \n\t"
673 "mtlo %[const_2_power_13], $ac1 \n\t"
674 "mthi $zero, $ac1 \n\t"
675
676 "sub %[load5], %[step2_14], %[step2_13] \n\t"
677 "sub %[load5], %[load5], %[step2_9] \n\t"
678 "add %[load5], %[load5], %[step2_10] \n\t"
679
680 "madd $ac0, %[load5], %[cospi_16_64] \n\t"
681
682 "sub %[load6], %[step2_14], %[step2_13] \n\t"
683 "sub %[load6], %[load6], %[step2_10] \n\t"
684 "add %[load6], %[load6], %[step2_9] \n\t"
685
686 "madd $ac1, %[load6], %[cospi_16_64] \n\t"
687
688 "mtlo %[const_2_power_13], $ac2 \n\t"
689 "mthi $zero, $ac2 \n\t"
690 "mtlo %[const_2_power_13], $ac3 \n\t"
691 "mthi $zero, $ac3 \n\t"
692
693 "sub %[load5], %[step2_15], %[step2_12] \n\t"
694 "sub %[load5], %[load5], %[step2_8] \n\t"
695 "add %[load5], %[load5], %[step2_11] \n\t"
696
697 "madd $ac2, %[load5], %[cospi_16_64] \n\t"
698
699 "sub %[load6], %[step2_15], %[step2_12] \n\t"
700 "sub %[load6], %[load6], %[step2_11] \n\t"
701 "add %[load6], %[load6], %[step2_8] \n\t"
702
703 "madd $ac3, %[load6], %[cospi_16_64] \n\t"
704
705 "extp %[step1_10], $ac0, 31 \n\t"
706 "extp %[step1_13], $ac1, 31 \n\t"
707 "extp %[step1_11], $ac2, 31 \n\t"
708 "extp %[step1_12], $ac3, 31 \n\t"
709
710 : [load5] "=&r" (load5), [load6] "=&r" (load6),
711 [step1_10] "=r" (step1_10), [step1_11] "=r" (step1_11),
712 [step1_12] "=r" (step1_12), [step1_13] "=r" (step1_13)
713 : [const_2_power_13] "r" (const_2_power_13),
714 [step2_14] "r" (step2_14), [step2_13] "r" (step2_13),
715 [step2_9] "r" (step2_9), [step2_10] "r" (step2_10),
716 [step2_15] "r" (step2_15), [step2_12] "r" (step2_12),
717 [step2_8] "r" (step2_8), [step2_11] "r" (step2_11),
718 [cospi_16_64] "r" (cospi_16_64)
719 );
720
/* Remaining outer sums, done in C so the final asm can take them as inputs. */
721 step1_8 = step2_8 + step2_11;
722 step1_9 = step2_9 + step2_10;
723 step1_14 = step2_13 + step2_14;
724 step1_15 = step2_12 + step2_15;
725
/*
 * Reconstruction of the 16 pixels of column i, top to bottom.  For each
 * pixel: lbu loads the current byte, the residual is (value + 32) >> 6,
 * lbux looks up pixel + residual in cm, sb stores the result, and dest_pix
 * then advances by dest_stride (no advance after the last pixel).
 * Values in store order:
 *   row 0:  step1_0 + step1_7 + step1_15
 *   row 1:  step1_1 + step1_6 + step1_14
 *   row 2:  step1_2 + step1_5 + step1_13
 *   row 3:  step1_3 + step1_4 + step1_12
 *   row 4:  step1_3 - step1_4 + step1_11
 *   row 5:  step1_2 - step1_5 + step1_10
 *   row 6:  step1_1 - step1_6 + step1_9
 *   row 7:  step1_0 - step1_7 + step1_8
 *   row 8:  step1_0 - step1_7 - step1_8
 *   row 9:  step1_1 - step1_6 - step1_9
 *   row 10: step1_2 - step1_5 - step1_10
 *   row 11: step1_3 - step1_4 - step1_11
 *   row 12: step1_3 + step1_4 - step1_12
 *   row 13: step1_2 + step1_5 - step1_13
 *   row 14: step1_1 + step1_6 - step1_14
 *   row 15: step1_0 + step1_7 - step1_15
 * Pixels are handled in pairs: the arithmetic for the second pixel of a
 * pair is interleaved with the lookup and store of the first.
 */
726 __asm__ __volatile__ (
727 "lbu %[load7], 0(%[dest_pix]) \n\t"
728 "add %[load5], %[step1_0], %[step1_7] \n\t"
729 "add %[load5], %[load5], %[step1_15] \n\t"
730 "addi %[load5], %[load5], 32 \n\t"
731 "sra %[load5], %[load5], 6 \n\t"
732 "add %[load7], %[load7], %[load5] \n\t"
733 "lbux %[load5], %[load7](%[cm]) \n\t"
734 "add %[load6], %[step1_1], %[step1_6] \n\t"
735 "add %[load6], %[load6], %[step1_14] \n\t"
736 "sb %[load5], 0(%[dest_pix]) \n\t"
737 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
738 "lbu %[load8], 0(%[dest_pix]) \n\t"
739 "addi %[load6], %[load6], 32 \n\t"
740 "sra %[load6], %[load6], 6 \n\t"
741 "add %[load8], %[load8], %[load6] \n\t"
742 "lbux %[load6], %[load8](%[cm]) \n\t"
743 "sb %[load6], 0(%[dest_pix]) \n\t"
744 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
745
746 "lbu %[load7], 0(%[dest_pix]) \n\t"
747 "add %[load5], %[step1_2], %[step1_5] \n\t"
748 "add %[load5], %[load5], %[step1_13] \n\t"
749 "addi %[load5], %[load5], 32 \n\t"
750 "sra %[load5], %[load5], 6 \n\t"
751 "add %[load7], %[load7], %[load5] \n\t"
752 "lbux %[load5], %[load7](%[cm]) \n\t"
753 "add %[load6], %[step1_3], %[step1_4] \n\t"
754 "add %[load6], %[load6], %[step1_12] \n\t"
755 "sb %[load5], 0(%[dest_pix]) \n\t"
756 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
757 "lbu %[load8], 0(%[dest_pix]) \n\t"
758 "addi %[load6], %[load6], 32 \n\t"
759 "sra %[load6], %[load6], 6 \n\t"
760 "add %[load8], %[load8], %[load6] \n\t"
761 "lbux %[load6], %[load8](%[cm]) \n\t"
762 "sb %[load6], 0(%[dest_pix]) \n\t"
763 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
764
765 "lbu %[load7], 0(%[dest_pix]) \n\t"
766 "sub %[load5], %[step1_3], %[step1_4] \n\t"
767 "add %[load5], %[load5], %[step1_11] \n\t"
768 "addi %[load5], %[load5], 32 \n\t"
769 "sra %[load5], %[load5], 6 \n\t"
770 "add %[load7], %[load7], %[load5] \n\t"
771 "lbux %[load5], %[load7](%[cm]) \n\t"
772 "sub %[load6], %[step1_2], %[step1_5] \n\t"
773 "add %[load6], %[load6], %[step1_10] \n\t"
774 "sb %[load5], 0(%[dest_pix]) \n\t"
775 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
776 "lbu %[load8], 0(%[dest_pix]) \n\t"
777 "addi %[load6], %[load6], 32 \n\t"
778 "sra %[load6], %[load6], 6 \n\t"
779 "add %[load8], %[load8], %[load6] \n\t"
780 "lbux %[load6], %[load8](%[cm]) \n\t"
781 "sb %[load6], 0(%[dest_pix]) \n\t"
782 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
783
784 "sub %[load5], %[step1_1], %[step1_6] \n\t"
785 "lbu %[load7], 0(%[dest_pix]) \n\t"
786 "add %[load5], %[load5], %[step1_9] \n\t"
787 "addi %[load5], %[load5], 32 \n\t"
788 "sra %[load5], %[load5], 6 \n\t"
789 "add %[load7], %[load7], %[load5] \n\t"
790 "lbux %[load5], %[load7](%[cm]) \n\t"
791 "sub %[load6], %[step1_0], %[step1_7] \n\t"
792 "add %[load6], %[load6], %[step1_8] \n\t"
793 "sb %[load5], 0(%[dest_pix]) \n\t"
794 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
795 "lbu %[load8], 0(%[dest_pix]) \n\t"
796 "addi %[load6], %[load6], 32 \n\t"
797 "sra %[load6], %[load6], 6 \n\t"
798 "add %[load8], %[load8], %[load6] \n\t"
799 "lbux %[load6], %[load8](%[cm]) \n\t"
800 "sb %[load6], 0(%[dest_pix]) \n\t"
801 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
802
803 "lbu %[load7], 0(%[dest_pix]) \n\t"
804 "sub %[load5], %[step1_0], %[step1_7] \n\t"
805 "sub %[load5], %[load5], %[step1_8] \n\t"
806 "addi %[load5], %[load5], 32 \n\t"
807 "sra %[load5], %[load5], 6 \n\t"
808 "add %[load7], %[load7], %[load5] \n\t"
809 "lbux %[load5], %[load7](%[cm]) \n\t"
810 "sub %[load6], %[step1_1], %[step1_6] \n\t"
811 "sub %[load6], %[load6], %[step1_9] \n\t"
812 "sb %[load5], 0(%[dest_pix]) \n\t"
813 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
814 "lbu %[load8], 0(%[dest_pix]) \n\t"
815 "addi %[load6], %[load6], 32 \n\t"
816 "sra %[load6], %[load6], 6 \n\t"
817 "add %[load8], %[load8], %[load6] \n\t"
818 "lbux %[load6], %[load8](%[cm]) \n\t"
819 "sb %[load6], 0(%[dest_pix]) \n\t"
820 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
821
822 "lbu %[load7], 0(%[dest_pix]) \n\t"
823 "sub %[load5], %[step1_2], %[step1_5] \n\t"
824 "sub %[load5], %[load5], %[step1_10] \n\t"
825 "addi %[load5], %[load5], 32 \n\t"
826 "sra %[load5], %[load5], 6 \n\t"
827 "add %[load7], %[load7], %[load5] \n\t"
828 "lbux %[load5], %[load7](%[cm]) \n\t"
829 "sub %[load6], %[step1_3], %[step1_4] \n\t"
830 "sub %[load6], %[load6], %[step1_11] \n\t"
831 "sb %[load5], 0(%[dest_pix]) \n\t"
832 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
833 "lbu %[load8], 0(%[dest_pix]) \n\t"
834 "addi %[load6], %[load6], 32 \n\t"
835 "sra %[load6], %[load6], 6 \n\t"
836 "add %[load8], %[load8], %[load6] \n\t"
837 "lbux %[load6], %[load8](%[cm]) \n\t"
838 "sb %[load6], 0(%[dest_pix]) \n\t"
839 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
840
841 "lbu %[load7], 0(%[dest_pix]) \n\t"
842 "add %[load5], %[step1_3], %[step1_4] \n\t"
843 "sub %[load5], %[load5], %[step1_12] \n\t"
844 "addi %[load5], %[load5], 32 \n\t"
845 "sra %[load5], %[load5], 6 \n\t"
846 "add %[load7], %[load7], %[load5] \n\t"
847 "lbux %[load5], %[load7](%[cm]) \n\t"
848 "add %[load6], %[step1_2], %[step1_5] \n\t"
849 "sub %[load6], %[load6], %[step1_13] \n\t"
850 "sb %[load5], 0(%[dest_pix]) \n\t"
851 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
852 "lbu %[load8], 0(%[dest_pix]) \n\t"
853 "addi %[load6], %[load6], 32 \n\t"
854 "sra %[load6], %[load6], 6 \n\t"
855 "add %[load8], %[load8], %[load6] \n\t"
856 "lbux %[load6], %[load8](%[cm]) \n\t"
857 "sb %[load6], 0(%[dest_pix]) \n\t"
858 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
859
860 "lbu %[load7], 0(%[dest_pix]) \n\t"
861 "add %[load5], %[step1_1], %[step1_6] \n\t"
862 "sub %[load5], %[load5], %[step1_14] \n\t"
863 "addi %[load5], %[load5], 32 \n\t"
864 "sra %[load5], %[load5], 6 \n\t"
865 "add %[load7], %[load7], %[load5] \n\t"
866 "lbux %[load5], %[load7](%[cm]) \n\t"
867 "add %[load6], %[step1_0], %[step1_7] \n\t"
868 "sub %[load6], %[load6], %[step1_15] \n\t"
869 "sb %[load5], 0(%[dest_pix]) \n\t"
870 "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t"
871 "lbu %[load8], 0(%[dest_pix]) \n\t"
872 "addi %[load6], %[load6], 32 \n\t"
873 "sra %[load6], %[load6], 6 \n\t"
874 "add %[load8], %[load8], %[load6] \n\t"
875 "lbux %[load6], %[load8](%[cm]) \n\t"
876 "sb %[load6], 0(%[dest_pix]) \n\t"
877
878 : [load5] "=&r" (load5), [load6] "=&r" (load6), [load7] "=&r" (load7),
879 [load8] "=&r" (load8), [dest_pix] "+r" (dest_pix)
880 : [cm] "r" (cm), [dest_stride] "r" (dest_stride),
881 [step1_0] "r" (step1_0), [step1_1] "r" (step1_1),
882 [step1_2] "r" (step1_2), [step1_3] "r" (step1_3),
883 [step1_4] "r" (step1_4), [step1_5] "r" (step1_5),
884 [step1_6] "r" (step1_6), [step1_7] "r" (step1_7),
885 [step1_8] "r" (step1_8), [step1_9] "r" (step1_9),
886 [step1_10] "r" (step1_10), [step1_11] "r" (step1_11),
887 [step1_12] "r" (step1_12), [step1_13] "r" (step1_13),
888 [step1_14] "r" (step1_14), [step1_15] "r" (step1_15)
889 );
890
/* Move on to the next group of 16 intermediate values. */
891 input += 16;
892 }
893 }
894
vp9_idct16x16_256_add_dspr2(const int16_t * input,uint8_t * dest,int dest_stride)895 void vp9_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest,
896 int dest_stride) {
897 DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
898 uint32_t pos = 45;
899
900 /* bit positon for extract from acc */
901 __asm__ __volatile__ (
902 "wrdsp %[pos], 1 \n\t"
903 :
904 : [pos] "r" (pos)
905 );
906
907 // First transform rows
908 idct16_rows_dspr2(input, out, 16);
909
910 // Then transform columns and add to dest
911 idct16_cols_add_blk_dspr2(out, dest, dest_stride);
912 }
913
iadst16(const int16_t * input,int16_t * output)914 static void iadst16(const int16_t *input, int16_t *output) {
915 int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
916
917 int x0 = input[15];
918 int x1 = input[0];
919 int x2 = input[13];
920 int x3 = input[2];
921 int x4 = input[11];
922 int x5 = input[4];
923 int x6 = input[9];
924 int x7 = input[6];
925 int x8 = input[7];
926 int x9 = input[8];
927 int x10 = input[5];
928 int x11 = input[10];
929 int x12 = input[3];
930 int x13 = input[12];
931 int x14 = input[1];
932 int x15 = input[14];
933
934 if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
935 | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
936 output[0] = output[1] = output[2] = output[3] = output[4]
937 = output[5] = output[6] = output[7] = output[8]
938 = output[9] = output[10] = output[11] = output[12]
939 = output[13] = output[14] = output[15] = 0;
940 return;
941 }
942
943 // stage 1
944 s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
945 s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
946 s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
947 s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
948 s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
949 s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
950 s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
951 s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
952 s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
953 s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
954 s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
955 s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
956 s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
957 s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
958 s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
959 s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
960
961 x0 = dct_const_round_shift(s0 + s8);
962 x1 = dct_const_round_shift(s1 + s9);
963 x2 = dct_const_round_shift(s2 + s10);
964 x3 = dct_const_round_shift(s3 + s11);
965 x4 = dct_const_round_shift(s4 + s12);
966 x5 = dct_const_round_shift(s5 + s13);
967 x6 = dct_const_round_shift(s6 + s14);
968 x7 = dct_const_round_shift(s7 + s15);
969 x8 = dct_const_round_shift(s0 - s8);
970 x9 = dct_const_round_shift(s1 - s9);
971 x10 = dct_const_round_shift(s2 - s10);
972 x11 = dct_const_round_shift(s3 - s11);
973 x12 = dct_const_round_shift(s4 - s12);
974 x13 = dct_const_round_shift(s5 - s13);
975 x14 = dct_const_round_shift(s6 - s14);
976 x15 = dct_const_round_shift(s7 - s15);
977
978 // stage 2
979 s0 = x0;
980 s1 = x1;
981 s2 = x2;
982 s3 = x3;
983 s4 = x4;
984 s5 = x5;
985 s6 = x6;
986 s7 = x7;
987 s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
988 s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
989 s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
990 s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
991 s12 = - x12 * cospi_28_64 + x13 * cospi_4_64;
992 s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
993 s14 = - x14 * cospi_12_64 + x15 * cospi_20_64;
994 s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
995
996 x0 = s0 + s4;
997 x1 = s1 + s5;
998 x2 = s2 + s6;
999 x3 = s3 + s7;
1000 x4 = s0 - s4;
1001 x5 = s1 - s5;
1002 x6 = s2 - s6;
1003 x7 = s3 - s7;
1004 x8 = dct_const_round_shift(s8 + s12);
1005 x9 = dct_const_round_shift(s9 + s13);
1006 x10 = dct_const_round_shift(s10 + s14);
1007 x11 = dct_const_round_shift(s11 + s15);
1008 x12 = dct_const_round_shift(s8 - s12);
1009 x13 = dct_const_round_shift(s9 - s13);
1010 x14 = dct_const_round_shift(s10 - s14);
1011 x15 = dct_const_round_shift(s11 - s15);
1012
1013 // stage 3
1014 s0 = x0;
1015 s1 = x1;
1016 s2 = x2;
1017 s3 = x3;
1018 s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
1019 s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
1020 s6 = - x6 * cospi_24_64 + x7 * cospi_8_64;
1021 s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
1022 s8 = x8;
1023 s9 = x9;
1024 s10 = x10;
1025 s11 = x11;
1026 s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
1027 s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
1028 s14 = - x14 * cospi_24_64 + x15 * cospi_8_64;
1029 s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
1030
1031 x0 = s0 + s2;
1032 x1 = s1 + s3;
1033 x2 = s0 - s2;
1034 x3 = s1 - s3;
1035 x4 = dct_const_round_shift(s4 + s6);
1036 x5 = dct_const_round_shift(s5 + s7);
1037 x6 = dct_const_round_shift(s4 - s6);
1038 x7 = dct_const_round_shift(s5 - s7);
1039 x8 = s8 + s10;
1040 x9 = s9 + s11;
1041 x10 = s8 - s10;
1042 x11 = s9 - s11;
1043 x12 = dct_const_round_shift(s12 + s14);
1044 x13 = dct_const_round_shift(s13 + s15);
1045 x14 = dct_const_round_shift(s12 - s14);
1046 x15 = dct_const_round_shift(s13 - s15);
1047
1048 // stage 4
1049 s2 = (- cospi_16_64) * (x2 + x3);
1050 s3 = cospi_16_64 * (x2 - x3);
1051 s6 = cospi_16_64 * (x6 + x7);
1052 s7 = cospi_16_64 * (- x6 + x7);
1053 s10 = cospi_16_64 * (x10 + x11);
1054 s11 = cospi_16_64 * (- x10 + x11);
1055 s14 = (- cospi_16_64) * (x14 + x15);
1056 s15 = cospi_16_64 * (x14 - x15);
1057
1058 x2 = dct_const_round_shift(s2);
1059 x3 = dct_const_round_shift(s3);
1060 x6 = dct_const_round_shift(s6);
1061 x7 = dct_const_round_shift(s7);
1062 x10 = dct_const_round_shift(s10);
1063 x11 = dct_const_round_shift(s11);
1064 x14 = dct_const_round_shift(s14);
1065 x15 = dct_const_round_shift(s15);
1066
1067 output[0] = x0;
1068 output[1] = -x8;
1069 output[2] = x12;
1070 output[3] = -x4;
1071 output[4] = x6;
1072 output[5] = x14;
1073 output[6] = x10;
1074 output[7] = x2;
1075 output[8] = x3;
1076 output[9] = x11;
1077 output[10] = x15;
1078 output[11] = x7;
1079 output[12] = x5;
1080 output[13] = -x13;
1081 output[14] = x9;
1082 output[15] = -x1;
1083 }
1084
vp9_iht16x16_256_add_dspr2(const int16_t * input,uint8_t * dest,int pitch,int tx_type)1085 void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest,
1086 int pitch, int tx_type) {
1087 int i, j;
1088 DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
1089 int16_t *outptr = out;
1090 int16_t temp_out[16];
1091 uint32_t pos = 45;
1092
1093 /* bit positon for extract from acc */
1094 __asm__ __volatile__ (
1095 "wrdsp %[pos], 1 \n\t"
1096 :
1097 : [pos] "r" (pos)
1098 );
1099
1100 switch (tx_type) {
1101 case DCT_DCT: // DCT in both horizontal and vertical
1102 idct16_rows_dspr2(input, outptr, 16);
1103 idct16_cols_add_blk_dspr2(out, dest, pitch);
1104 break;
1105 case ADST_DCT: // ADST in vertical, DCT in horizontal
1106 idct16_rows_dspr2(input, outptr, 16);
1107
1108 outptr = out;
1109
1110 for (i = 0; i < 16; ++i) {
1111 iadst16(outptr, temp_out);
1112
1113 for (j = 0; j < 16; ++j)
1114 dest[j * pitch + i] =
1115 clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
1116 + dest[j * pitch + i]);
1117 outptr += 16;
1118 }
1119 break;
1120 case DCT_ADST: // DCT in vertical, ADST in horizontal
1121 {
1122 int16_t temp_in[16 * 16];
1123
1124 for (i = 0; i < 16; ++i) {
1125 /* prefetch row */
1126 vp9_prefetch_load((const uint8_t *)(input + 16));
1127
1128 iadst16(input, outptr);
1129 input += 16;
1130 outptr += 16;
1131 }
1132
1133 for (i = 0; i < 16; ++i)
1134 for (j = 0; j < 16; ++j)
1135 temp_in[j * 16 + i] = out[i * 16 + j];
1136
1137 idct16_cols_add_blk_dspr2(temp_in, dest, pitch);
1138 }
1139 break;
1140 case ADST_ADST: // ADST in both directions
1141 {
1142 int16_t temp_in[16];
1143
1144 for (i = 0; i < 16; ++i) {
1145 /* prefetch row */
1146 vp9_prefetch_load((const uint8_t *)(input + 16));
1147
1148 iadst16(input, outptr);
1149 input += 16;
1150 outptr += 16;
1151 }
1152
1153 for (i = 0; i < 16; ++i) {
1154 for (j = 0; j < 16; ++j)
1155 temp_in[j] = out[j * 16 + i];
1156 iadst16(temp_in, temp_out);
1157 for (j = 0; j < 16; ++j)
1158 dest[j * pitch + i] =
1159 clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
1160 + dest[j * pitch + i]);
1161 }
1162 }
1163 break;
1164 default:
1165 printf("vp9_short_iht16x16_add_dspr2 : Invalid tx_type\n");
1166 break;
1167 }
1168 }
1169
vp9_idct16x16_10_add_dspr2(const int16_t * input,uint8_t * dest,int dest_stride)1170 void vp9_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest,
1171 int dest_stride) {
1172 DECLARE_ALIGNED(32, int16_t, out[16 * 16]);
1173 int16_t *outptr = out;
1174 uint32_t i;
1175 uint32_t pos = 45;
1176
1177 /* bit positon for extract from acc */
1178 __asm__ __volatile__ (
1179 "wrdsp %[pos], 1 \n\t"
1180 :
1181 : [pos] "r" (pos)
1182 );
1183
1184 // First transform rows. Since all non-zero dct coefficients are in
1185 // upper-left 4x4 area, we only need to calculate first 4 rows here.
1186 idct16_rows_dspr2(input, outptr, 4);
1187
1188 outptr += 4;
1189 for (i = 0; i < 6; ++i) {
1190 __asm__ __volatile__ (
1191 "sw $zero, 0(%[outptr]) \n\t"
1192 "sw $zero, 32(%[outptr]) \n\t"
1193 "sw $zero, 64(%[outptr]) \n\t"
1194 "sw $zero, 96(%[outptr]) \n\t"
1195 "sw $zero, 128(%[outptr]) \n\t"
1196 "sw $zero, 160(%[outptr]) \n\t"
1197 "sw $zero, 192(%[outptr]) \n\t"
1198 "sw $zero, 224(%[outptr]) \n\t"
1199 "sw $zero, 256(%[outptr]) \n\t"
1200 "sw $zero, 288(%[outptr]) \n\t"
1201 "sw $zero, 320(%[outptr]) \n\t"
1202 "sw $zero, 352(%[outptr]) \n\t"
1203 "sw $zero, 384(%[outptr]) \n\t"
1204 "sw $zero, 416(%[outptr]) \n\t"
1205 "sw $zero, 448(%[outptr]) \n\t"
1206 "sw $zero, 480(%[outptr]) \n\t"
1207
1208 :
1209 : [outptr] "r" (outptr)
1210 );
1211
1212 outptr += 2;
1213 }
1214
1215 // Then transform columns
1216 idct16_cols_add_blk_dspr2(out, dest, dest_stride);
1217 }
1218
/* DC-only inverse 16x16 DCT: derives a single offset from input[0] and adds
 * it, with unsigned saturation, to every pixel of the 16x16 block at dest.
 *
 * input       - coefficient block; only input[0] is read.
 * dest        - destination pixels, updated in place. Rows are accessed with
 *               word loads/stores (lw/sw), so each row start must be
 *               four-byte aligned.
 * dest_stride - distance in bytes between consecutive rows of dest.
 */
void vp9_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest,
                               int dest_stride) {
  uint32_t pos = 45;
  int32_t out;
  int32_t r;
  int32_t a1, absa1;
  int32_t vector_a1;
  int32_t t1, t2, t3, t4;
  int32_t vector_1, vector_2, vector_3, vector_4;

  /* bit position for extract from acc */
  __asm__ __volatile__ (
    "wrdsp      %[pos],     1           \n\t"

    :
    : [pos] "r" (pos)
  );

  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);

  /* a1 = (out + 32) >> 6, i.e. round to nearest and drop 6 bits. */
  __asm__ __volatile__ (
      "addi     %[out],     %[out],     32      \n\t"
      "sra      %[a1],      %[out],     6       \n\t"

      : [out] "+r" (out), [a1] "=r" (a1)
      :
  );

  /* replv.qb below replicates only the low byte of its source, so a
   * magnitude above 255 would wrap (256 would turn into 0). Clamping to 255
   * loses nothing: a saturating byte add of 255 already yields 255 for every
   * pixel and a saturating subtract of 255 already yields 0. */
  if (a1 > 255) {
    a1 = 255;
  } else if (a1 < -255) {
    a1 = -255;
  }

  if (a1 < 0) {
    /* Negative offset: replicate |a1| into all four bytes and subtract.
     * use quad-byte
     * input and output memory are four byte aligned */
    __asm__ __volatile__ (
        "abs        %[absa1],       %[a1]       \n\t"
        "replv.qb   %[vector_a1],   %[absa1]    \n\t"

        : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
        : [a1] "r" (a1)
    );

    /* One row (16 pixels = four words) per iteration. */
    for (r = 16; r--;) {
      __asm__ __volatile__ (
          "lw             %[t1],          0(%[dest])                      \n\t"
          "lw             %[t2],          4(%[dest])                      \n\t"
          "lw             %[t3],          8(%[dest])                      \n\t"
          "lw             %[t4],          12(%[dest])                     \n\t"
          "subu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"
          "subu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"
          "subu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"
          "subu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"
          "sw             %[vector_1],    0(%[dest])                      \n\t"
          "sw             %[vector_2],    4(%[dest])                      \n\t"
          "sw             %[vector_3],    8(%[dest])                      \n\t"
          "sw             %[vector_4],    12(%[dest])                     \n\t"
          "add            %[dest],        %[dest],        %[dest_stride]  \n\t"

          : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),
            [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
            [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),
            [dest] "+&r" (dest)
          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
          /* dest memory is read and written through a register operand */
          : "memory"
      );
    }
  } else {
    /* Non-negative offset: replicate a1 into all four bytes and add.
     * use quad-byte
     * input and output memory are four byte aligned */
    __asm__ __volatile__ (
        "replv.qb       %[vector_a1],   %[a1]     \n\t"

        : [vector_a1] "=r" (vector_a1)
        : [a1] "r" (a1)
    );

    /* One row (16 pixels = four words) per iteration. */
    for (r = 16; r--;) {
      __asm__ __volatile__ (
          "lw             %[t1],          0(%[dest])                      \n\t"
          "lw             %[t2],          4(%[dest])                      \n\t"
          "lw             %[t3],          8(%[dest])                      \n\t"
          "lw             %[t4],          12(%[dest])                     \n\t"
          "addu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"
          "addu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"
          "addu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"
          "addu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"
          "sw             %[vector_1],    0(%[dest])                      \n\t"
          "sw             %[vector_2],    4(%[dest])                      \n\t"
          "sw             %[vector_3],    8(%[dest])                      \n\t"
          "sw             %[vector_4],    12(%[dest])                     \n\t"
          "add            %[dest],        %[dest],        %[dest_stride]  \n\t"

          : [t1] "=&r" (t1), [t2] "=&r" (t2), [t3] "=&r" (t3), [t4] "=&r" (t4),
            [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
            [vector_3] "=&r" (vector_3), [vector_4] "=&r" (vector_4),
            [dest] "+&r" (dest)
          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
          /* dest memory is read and written through a register operand */
          : "memory"
      );
    }
  }
}
1315 #endif // #if HAVE_DSPR2
1316