1 /*
2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <assert.h>
12 #include <stdio.h>
13
14 #include "./vpx_config.h"
15 #include "vpx_dsp/mips/inv_txfm_dspr2.h"
16 #include "vpx_dsp/txfm_common.h"
17
18 #if HAVE_DSPR2
idct32_rows_dspr2(const int16_t * input,int16_t * output,uint32_t no_rows)19 static void idct32_rows_dspr2(const int16_t *input, int16_t *output,
20 uint32_t no_rows) {
21 int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6;
22 int step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13;
23 int step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20;
24 int step1_21, step1_22, step1_23, step1_24, step1_25, step1_26, step1_27;
25 int step1_28, step1_29, step1_30, step1_31;
26 int step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
27 int step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13;
28 int step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20;
29 int step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27;
30 int step2_28, step2_29, step2_30, step2_31;
31 int step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14;
32 int step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21;
33 int step3_22, step3_23, step3_24, step3_25, step3_26, step3_27, step3_28;
34 int step3_29, step3_30, step3_31;
35 int temp0, temp1, temp2, temp3;
36 int load1, load2, load3, load4;
37 int result1, result2;
38 int i;
39 const int const_2_power_13 = 8192;
40 const int32_t *input_int;
41
42 for (i = no_rows; i--;) {
43 input_int = (const int32_t *)input;
44
45 if (!(input_int[0] | input_int[1] | input_int[2] | input_int[3] |
46 input_int[4] | input_int[5] | input_int[6] | input_int[7] |
47 input_int[8] | input_int[9] | input_int[10] | input_int[11] |
48 input_int[12] | input_int[13] | input_int[14] | input_int[15])) {
49 input += 32;
50
51 __asm__ __volatile__(
52 "sh $zero, 0(%[output]) \n\t"
53 "sh $zero, 64(%[output]) \n\t"
54 "sh $zero, 128(%[output]) \n\t"
55 "sh $zero, 192(%[output]) \n\t"
56 "sh $zero, 256(%[output]) \n\t"
57 "sh $zero, 320(%[output]) \n\t"
58 "sh $zero, 384(%[output]) \n\t"
59 "sh $zero, 448(%[output]) \n\t"
60 "sh $zero, 512(%[output]) \n\t"
61 "sh $zero, 576(%[output]) \n\t"
62 "sh $zero, 640(%[output]) \n\t"
63 "sh $zero, 704(%[output]) \n\t"
64 "sh $zero, 768(%[output]) \n\t"
65 "sh $zero, 832(%[output]) \n\t"
66 "sh $zero, 896(%[output]) \n\t"
67 "sh $zero, 960(%[output]) \n\t"
68 "sh $zero, 1024(%[output]) \n\t"
69 "sh $zero, 1088(%[output]) \n\t"
70 "sh $zero, 1152(%[output]) \n\t"
71 "sh $zero, 1216(%[output]) \n\t"
72 "sh $zero, 1280(%[output]) \n\t"
73 "sh $zero, 1344(%[output]) \n\t"
74 "sh $zero, 1408(%[output]) \n\t"
75 "sh $zero, 1472(%[output]) \n\t"
76 "sh $zero, 1536(%[output]) \n\t"
77 "sh $zero, 1600(%[output]) \n\t"
78 "sh $zero, 1664(%[output]) \n\t"
79 "sh $zero, 1728(%[output]) \n\t"
80 "sh $zero, 1792(%[output]) \n\t"
81 "sh $zero, 1856(%[output]) \n\t"
82 "sh $zero, 1920(%[output]) \n\t"
83 "sh $zero, 1984(%[output]) \n\t"
84
85 :
86 : [output] "r"(output));
87
88 output += 1;
89
90 continue;
91 }
92
93 /* prefetch row */
94 prefetch_load((const uint8_t *)(input + 32));
95 prefetch_load((const uint8_t *)(input + 48));
96
97 __asm__ __volatile__(
98 "lh %[load1], 2(%[input]) \n\t"
99 "lh %[load2], 62(%[input]) \n\t"
100 "lh %[load3], 34(%[input]) \n\t"
101 "lh %[load4], 30(%[input]) \n\t"
102
103 "mtlo %[const_2_power_13], $ac1 \n\t"
104 "mthi $zero, $ac1 \n\t"
105 "mtlo %[const_2_power_13], $ac3 \n\t"
106 "mthi $zero, $ac3 \n\t"
107
108 "madd $ac1, %[load1], %[cospi_31_64] \n\t"
109 "msub $ac1, %[load2], %[cospi_1_64] \n\t"
110 "extp %[temp0], $ac1, 31 \n\t"
111
112 "madd $ac3, %[load1], %[cospi_1_64] \n\t"
113 "madd $ac3, %[load2], %[cospi_31_64] \n\t"
114 "extp %[temp3], $ac3, 31 \n\t"
115
116 "mtlo %[const_2_power_13], $ac1 \n\t"
117 "mthi $zero, $ac1 \n\t"
118 "mtlo %[const_2_power_13], $ac2 \n\t"
119 "mthi $zero, $ac2 \n\t"
120
121 "madd $ac2, %[load3], %[cospi_15_64] \n\t"
122 "msub $ac2, %[load4], %[cospi_17_64] \n\t"
123 "extp %[temp1], $ac2, 31 \n\t"
124
125 "madd $ac1, %[load3], %[cospi_17_64] \n\t"
126 "madd $ac1, %[load4], %[cospi_15_64] \n\t"
127 "extp %[temp2], $ac1, 31 \n\t"
128
129 "mtlo %[const_2_power_13], $ac1 \n\t"
130 "mthi $zero, $ac1 \n\t"
131 "mtlo %[const_2_power_13], $ac3 \n\t"
132 "mthi $zero, $ac3 \n\t"
133
134 "sub %[load1], %[temp3], %[temp2] \n\t"
135 "sub %[load2], %[temp0], %[temp1] \n\t"
136
137 "madd $ac1, %[load1], %[cospi_28_64] \n\t"
138 "msub $ac1, %[load2], %[cospi_4_64] \n\t"
139 "madd $ac3, %[load1], %[cospi_4_64] \n\t"
140 "madd $ac3, %[load2], %[cospi_28_64] \n\t"
141
142 "extp %[step1_17], $ac1, 31 \n\t"
143 "extp %[step1_30], $ac3, 31 \n\t"
144 "add %[step1_16], %[temp0], %[temp1] \n\t"
145 "add %[step1_31], %[temp2], %[temp3] \n\t"
146
147 : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
148 [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
149 [temp2] "=&r"(temp2), [temp3] "=&r"(temp3),
150 [step1_16] "=&r"(step1_16), [step1_17] "=&r"(step1_17),
151 [step1_30] "=&r"(step1_30), [step1_31] "=&r"(step1_31)
152 : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
153 [cospi_31_64] "r"(cospi_31_64), [cospi_1_64] "r"(cospi_1_64),
154 [cospi_4_64] "r"(cospi_4_64), [cospi_17_64] "r"(cospi_17_64),
155 [cospi_15_64] "r"(cospi_15_64), [cospi_28_64] "r"(cospi_28_64));
156
157 __asm__ __volatile__(
158 "lh %[load1], 18(%[input]) \n\t"
159 "lh %[load2], 46(%[input]) \n\t"
160 "lh %[load3], 50(%[input]) \n\t"
161 "lh %[load4], 14(%[input]) \n\t"
162
163 "mtlo %[const_2_power_13], $ac1 \n\t"
164 "mthi $zero, $ac1 \n\t"
165 "mtlo %[const_2_power_13], $ac3 \n\t"
166 "mthi $zero, $ac3 \n\t"
167
168 "madd $ac1, %[load1], %[cospi_23_64] \n\t"
169 "msub $ac1, %[load2], %[cospi_9_64] \n\t"
170 "extp %[temp0], $ac1, 31 \n\t"
171
172 "madd $ac3, %[load1], %[cospi_9_64] \n\t"
173 "madd $ac3, %[load2], %[cospi_23_64] \n\t"
174 "extp %[temp3], $ac3, 31 \n\t"
175
176 "mtlo %[const_2_power_13], $ac1 \n\t"
177 "mthi $zero, $ac1 \n\t"
178 "mtlo %[const_2_power_13], $ac2 \n\t"
179 "mthi $zero, $ac2 \n\t"
180
181 "madd $ac2, %[load3], %[cospi_7_64] \n\t"
182 "msub $ac2, %[load4], %[cospi_25_64] \n\t"
183 "extp %[temp1], $ac2, 31 \n\t"
184
185 "madd $ac1, %[load3], %[cospi_25_64] \n\t"
186 "madd $ac1, %[load4], %[cospi_7_64] \n\t"
187 "extp %[temp2], $ac1, 31 \n\t"
188
189 "mtlo %[const_2_power_13], $ac1 \n\t"
190 "mthi $zero, $ac1 \n\t"
191 "mtlo %[const_2_power_13], $ac3 \n\t"
192 "mthi $zero, $ac3 \n\t"
193
194 "sub %[load1], %[temp1], %[temp0] \n\t"
195 "sub %[load2], %[temp2], %[temp3] \n\t"
196
197 "msub $ac1, %[load1], %[cospi_28_64] \n\t"
198 "msub $ac1, %[load2], %[cospi_4_64] \n\t"
199 "msub $ac3, %[load1], %[cospi_4_64] \n\t"
200 "madd $ac3, %[load2], %[cospi_28_64] \n\t"
201
202 "extp %[step1_18], $ac1, 31 \n\t"
203 "extp %[step1_29], $ac3, 31 \n\t"
204 "add %[step1_19], %[temp0], %[temp1] \n\t"
205 "add %[step1_28], %[temp2], %[temp3] \n\t"
206
207 : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
208 [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
209 [temp2] "=&r"(temp2), [temp3] "=&r"(temp3),
210 [step1_18] "=&r"(step1_18), [step1_19] "=&r"(step1_19),
211 [step1_28] "=&r"(step1_28), [step1_29] "=&r"(step1_29)
212 : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
213 [cospi_23_64] "r"(cospi_23_64), [cospi_9_64] "r"(cospi_9_64),
214 [cospi_4_64] "r"(cospi_4_64), [cospi_7_64] "r"(cospi_7_64),
215 [cospi_25_64] "r"(cospi_25_64), [cospi_28_64] "r"(cospi_28_64));
216
217 __asm__ __volatile__(
218 "lh %[load1], 10(%[input]) \n\t"
219 "lh %[load2], 54(%[input]) \n\t"
220 "lh %[load3], 42(%[input]) \n\t"
221 "lh %[load4], 22(%[input]) \n\t"
222
223 "mtlo %[const_2_power_13], $ac1 \n\t"
224 "mthi $zero, $ac1 \n\t"
225 "mtlo %[const_2_power_13], $ac3 \n\t"
226 "mthi $zero, $ac3 \n\t"
227
228 "madd $ac1, %[load1], %[cospi_27_64] \n\t"
229 "msub $ac1, %[load2], %[cospi_5_64] \n\t"
230 "extp %[temp0], $ac1, 31 \n\t"
231
232 "madd $ac3, %[load1], %[cospi_5_64] \n\t"
233 "madd $ac3, %[load2], %[cospi_27_64] \n\t"
234 "extp %[temp3], $ac3, 31 \n\t"
235
236 "mtlo %[const_2_power_13], $ac1 \n\t"
237 "mthi $zero, $ac1 \n\t"
238 "mtlo %[const_2_power_13], $ac2 \n\t"
239 "mthi $zero, $ac2 \n\t"
240
241 "madd $ac2, %[load3], %[cospi_11_64] \n\t"
242 "msub $ac2, %[load4], %[cospi_21_64] \n\t"
243 "extp %[temp1], $ac2, 31 \n\t"
244
245 "madd $ac1, %[load3], %[cospi_21_64] \n\t"
246 "madd $ac1, %[load4], %[cospi_11_64] \n\t"
247 "extp %[temp2], $ac1, 31 \n\t"
248
249 "mtlo %[const_2_power_13], $ac1 \n\t"
250 "mthi $zero, $ac1 \n\t"
251 "mtlo %[const_2_power_13], $ac3 \n\t"
252 "mthi $zero, $ac3 \n\t"
253
254 "sub %[load1], %[temp0], %[temp1] \n\t"
255 "sub %[load2], %[temp3], %[temp2] \n\t"
256
257 "madd $ac1, %[load2], %[cospi_12_64] \n\t"
258 "msub $ac1, %[load1], %[cospi_20_64] \n\t"
259 "madd $ac3, %[load1], %[cospi_12_64] \n\t"
260 "madd $ac3, %[load2], %[cospi_20_64] \n\t"
261
262 "extp %[step1_21], $ac1, 31 \n\t"
263 "extp %[step1_26], $ac3, 31 \n\t"
264 "add %[step1_20], %[temp0], %[temp1] \n\t"
265 "add %[step1_27], %[temp2], %[temp3] \n\t"
266
267 : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
268 [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
269 [temp2] "=&r"(temp2), [temp3] "=&r"(temp3),
270 [step1_20] "=&r"(step1_20), [step1_21] "=&r"(step1_21),
271 [step1_26] "=&r"(step1_26), [step1_27] "=&r"(step1_27)
272 : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
273 [cospi_27_64] "r"(cospi_27_64), [cospi_5_64] "r"(cospi_5_64),
274 [cospi_11_64] "r"(cospi_11_64), [cospi_21_64] "r"(cospi_21_64),
275 [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64));
276
277 __asm__ __volatile__(
278 "lh %[load1], 26(%[input]) \n\t"
279 "lh %[load2], 38(%[input]) \n\t"
280 "lh %[load3], 58(%[input]) \n\t"
281 "lh %[load4], 6(%[input]) \n\t"
282
283 "mtlo %[const_2_power_13], $ac1 \n\t"
284 "mthi $zero, $ac1 \n\t"
285 "mtlo %[const_2_power_13], $ac3 \n\t"
286 "mthi $zero, $ac3 \n\t"
287
288 "madd $ac1, %[load1], %[cospi_19_64] \n\t"
289 "msub $ac1, %[load2], %[cospi_13_64] \n\t"
290 "extp %[temp0], $ac1, 31 \n\t"
291 "madd $ac3, %[load1], %[cospi_13_64] \n\t"
292 "madd $ac3, %[load2], %[cospi_19_64] \n\t"
293 "extp %[temp3], $ac3, 31 \n\t"
294
295 "mtlo %[const_2_power_13], $ac1 \n\t"
296 "mthi $zero, $ac1 \n\t"
297 "mtlo %[const_2_power_13], $ac2 \n\t"
298 "mthi $zero, $ac2 \n\t"
299
300 "madd $ac2, %[load3], %[cospi_3_64] \n\t"
301 "msub $ac2, %[load4], %[cospi_29_64] \n\t"
302 "extp %[temp1], $ac2, 31 \n\t"
303 "madd $ac1, %[load3], %[cospi_29_64] \n\t"
304 "madd $ac1, %[load4], %[cospi_3_64] \n\t"
305 "extp %[temp2], $ac1, 31 \n\t"
306
307 "mtlo %[const_2_power_13], $ac1 \n\t"
308 "mthi $zero, $ac1 \n\t"
309 "mtlo %[const_2_power_13], $ac3 \n\t"
310 "mthi $zero, $ac3 \n\t"
311
312 "sub %[load1], %[temp1], %[temp0] \n\t"
313 "sub %[load2], %[temp2], %[temp3] \n\t"
314 "msub $ac1, %[load1], %[cospi_12_64] \n\t"
315 "msub $ac1, %[load2], %[cospi_20_64] \n\t"
316 "msub $ac3, %[load1], %[cospi_20_64] \n\t"
317 "madd $ac3, %[load2], %[cospi_12_64] \n\t"
318 "extp %[step1_22], $ac1, 31 \n\t"
319 "extp %[step1_25], $ac3, 31 \n\t"
320 "add %[step1_23], %[temp0], %[temp1] \n\t"
321 "add %[step1_24], %[temp2], %[temp3] \n\t"
322
323 : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
324 [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
325 [temp2] "=&r"(temp2), [temp3] "=&r"(temp3),
326 [step1_22] "=&r"(step1_22), [step1_23] "=&r"(step1_23),
327 [step1_24] "=&r"(step1_24), [step1_25] "=&r"(step1_25)
328 : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
329 [cospi_19_64] "r"(cospi_19_64), [cospi_13_64] "r"(cospi_13_64),
330 [cospi_3_64] "r"(cospi_3_64), [cospi_29_64] "r"(cospi_29_64),
331 [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64));
332
333 __asm__ __volatile__(
334 "lh %[load1], 4(%[input]) \n\t"
335 "lh %[load2], 60(%[input]) \n\t"
336 "lh %[load3], 36(%[input]) \n\t"
337 "lh %[load4], 28(%[input]) \n\t"
338
339 "mtlo %[const_2_power_13], $ac1 \n\t"
340 "mthi $zero, $ac1 \n\t"
341 "mtlo %[const_2_power_13], $ac3 \n\t"
342 "mthi $zero, $ac3 \n\t"
343
344 "madd $ac1, %[load1], %[cospi_30_64] \n\t"
345 "msub $ac1, %[load2], %[cospi_2_64] \n\t"
346 "extp %[temp0], $ac1, 31 \n\t"
347 "madd $ac3, %[load1], %[cospi_2_64] \n\t"
348 "madd $ac3, %[load2], %[cospi_30_64] \n\t"
349 "extp %[temp3], $ac3, 31 \n\t"
350
351 "mtlo %[const_2_power_13], $ac1 \n\t"
352 "mthi $zero, $ac1 \n\t"
353 "mtlo %[const_2_power_13], $ac2 \n\t"
354 "mthi $zero, $ac2 \n\t"
355
356 "madd $ac2, %[load3], %[cospi_14_64] \n\t"
357 "msub $ac2, %[load4], %[cospi_18_64] \n\t"
358 "extp %[temp1], $ac2, 31 \n\t"
359 "madd $ac1, %[load3], %[cospi_18_64] \n\t"
360 "madd $ac1, %[load4], %[cospi_14_64] \n\t"
361 "extp %[temp2], $ac1, 31 \n\t"
362
363 "mtlo %[const_2_power_13], $ac1 \n\t"
364 "mthi $zero, $ac1 \n\t"
365 "mtlo %[const_2_power_13], $ac3 \n\t"
366 "mthi $zero, $ac3 \n\t"
367
368 "sub %[load1], %[temp0], %[temp1] \n\t"
369 "sub %[load2], %[temp3], %[temp2] \n\t"
370 "msub $ac1, %[load1], %[cospi_8_64] \n\t"
371 "madd $ac1, %[load2], %[cospi_24_64] \n\t"
372 "madd $ac3, %[load1], %[cospi_24_64] \n\t"
373 "madd $ac3, %[load2], %[cospi_8_64] \n\t"
374 "extp %[step2_9], $ac1, 31 \n\t"
375 "extp %[step2_14], $ac3, 31 \n\t"
376 "add %[step2_8], %[temp0], %[temp1] \n\t"
377 "add %[step2_15], %[temp2], %[temp3] \n\t"
378
379 : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
380 [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
381 [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_8] "=&r"(step2_8),
382 [step2_9] "=&r"(step2_9), [step2_14] "=&r"(step2_14),
383 [step2_15] "=&r"(step2_15)
384 : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
385 [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64),
386 [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64),
387 [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64));
388
389 __asm__ __volatile__(
390 "lh %[load1], 20(%[input]) \n\t"
391 "lh %[load2], 44(%[input]) \n\t"
392 "lh %[load3], 52(%[input]) \n\t"
393 "lh %[load4], 12(%[input]) \n\t"
394
395 "mtlo %[const_2_power_13], $ac1 \n\t"
396 "mthi $zero, $ac1 \n\t"
397 "mtlo %[const_2_power_13], $ac3 \n\t"
398 "mthi $zero, $ac3 \n\t"
399
400 "madd $ac1, %[load1], %[cospi_22_64] \n\t"
401 "msub $ac1, %[load2], %[cospi_10_64] \n\t"
402 "extp %[temp0], $ac1, 31 \n\t"
403 "madd $ac3, %[load1], %[cospi_10_64] \n\t"
404 "madd $ac3, %[load2], %[cospi_22_64] \n\t"
405 "extp %[temp3], $ac3, 31 \n\t"
406
407 "mtlo %[const_2_power_13], $ac1 \n\t"
408 "mthi $zero, $ac1 \n\t"
409 "mtlo %[const_2_power_13], $ac2 \n\t"
410 "mthi $zero, $ac2 \n\t"
411
412 "madd $ac2, %[load3], %[cospi_6_64] \n\t"
413 "msub $ac2, %[load4], %[cospi_26_64] \n\t"
414 "extp %[temp1], $ac2, 31 \n\t"
415 "madd $ac1, %[load3], %[cospi_26_64] \n\t"
416 "madd $ac1, %[load4], %[cospi_6_64] \n\t"
417 "extp %[temp2], $ac1, 31 \n\t"
418
419 "mtlo %[const_2_power_13], $ac1 \n\t"
420 "mthi $zero, $ac1 \n\t"
421 "mtlo %[const_2_power_13], $ac3 \n\t"
422 "mthi $zero, $ac3 \n\t"
423
424 "sub %[load1], %[temp1], %[temp0] \n\t"
425 "sub %[load2], %[temp2], %[temp3] \n\t"
426 "msub $ac1, %[load1], %[cospi_24_64] \n\t"
427 "msub $ac1, %[load2], %[cospi_8_64] \n\t"
428 "madd $ac3, %[load2], %[cospi_24_64] \n\t"
429 "msub $ac3, %[load1], %[cospi_8_64] \n\t"
430 "extp %[step2_10], $ac1, 31 \n\t"
431 "extp %[step2_13], $ac3, 31 \n\t"
432 "add %[step2_11], %[temp0], %[temp1] \n\t"
433 "add %[step2_12], %[temp2], %[temp3] \n\t"
434
435 : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
436 [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
437 [temp2] "=&r"(temp2), [temp3] "=&r"(temp3),
438 [step2_10] "=&r"(step2_10), [step2_11] "=&r"(step2_11),
439 [step2_12] "=&r"(step2_12), [step2_13] "=&r"(step2_13)
440 : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
441 [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64),
442 [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64),
443 [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64));
444
445 __asm__ __volatile__(
446 "mtlo %[const_2_power_13], $ac0 \n\t"
447 "mthi $zero, $ac0 \n\t"
448 "sub %[temp0], %[step2_14], %[step2_13] \n\t"
449 "sub %[temp0], %[temp0], %[step2_9] \n\t"
450 "add %[temp0], %[temp0], %[step2_10] \n\t"
451 "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
452 "mtlo %[const_2_power_13], $ac1 \n\t"
453 "mthi $zero, $ac1 \n\t"
454 "sub %[temp1], %[step2_14], %[step2_13] \n\t"
455 "add %[temp1], %[temp1], %[step2_9] \n\t"
456 "sub %[temp1], %[temp1], %[step2_10] \n\t"
457 "madd $ac1, %[temp1], %[cospi_16_64] \n\t"
458 "mtlo %[const_2_power_13], $ac2 \n\t"
459 "mthi $zero, $ac2 \n\t"
460 "sub %[temp0], %[step2_15], %[step2_12] \n\t"
461 "sub %[temp0], %[temp0], %[step2_8] \n\t"
462 "add %[temp0], %[temp0], %[step2_11] \n\t"
463 "madd $ac2, %[temp0], %[cospi_16_64] \n\t"
464 "mtlo %[const_2_power_13], $ac3 \n\t"
465 "mthi $zero, $ac3 \n\t"
466 "sub %[temp1], %[step2_15], %[step2_12] \n\t"
467 "add %[temp1], %[temp1], %[step2_8] \n\t"
468 "sub %[temp1], %[temp1], %[step2_11] \n\t"
469 "madd $ac3, %[temp1], %[cospi_16_64] \n\t"
470
471 "add %[step3_8], %[step2_8], %[step2_11] \n\t"
472 "add %[step3_9], %[step2_9], %[step2_10] \n\t"
473 "add %[step3_14], %[step2_13], %[step2_14] \n\t"
474 "add %[step3_15], %[step2_12], %[step2_15] \n\t"
475 "extp %[step3_10], $ac0, 31 \n\t"
476 "extp %[step3_13], $ac1, 31 \n\t"
477 "extp %[step3_11], $ac2, 31 \n\t"
478 "extp %[step3_12], $ac3, 31 \n\t"
479
480 : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [step3_8] "=&r"(step3_8),
481 [step3_9] "=&r"(step3_9), [step3_10] "=&r"(step3_10),
482 [step3_11] "=&r"(step3_11), [step3_12] "=&r"(step3_12),
483 [step3_13] "=&r"(step3_13), [step3_14] "=&r"(step3_14),
484 [step3_15] "=&r"(step3_15)
485 : [const_2_power_13] "r"(const_2_power_13), [step2_8] "r"(step2_8),
486 [step2_9] "r"(step2_9), [step2_10] "r"(step2_10),
487 [step2_11] "r"(step2_11), [step2_12] "r"(step2_12),
488 [step2_13] "r"(step2_13), [step2_14] "r"(step2_14),
489 [step2_15] "r"(step2_15), [cospi_16_64] "r"(cospi_16_64));
490
491 __asm__ __volatile__(
492 "mtlo %[const_2_power_13], $ac0 \n\t"
493 "mthi $zero, $ac0 \n\t"
494 "mtlo %[const_2_power_13], $ac1 \n\t"
495 "mthi $zero, $ac1 \n\t"
496 "sub %[temp0], %[step1_17], %[step1_18] \n\t"
497 "sub %[temp1], %[step1_30], %[step1_29] \n\t"
498 "add %[step3_17], %[step1_17], %[step1_18] \n\t"
499 "add %[step3_30], %[step1_30], %[step1_29] \n\t"
500
501 "msub $ac0, %[temp0], %[cospi_8_64] \n\t"
502 "madd $ac0, %[temp1], %[cospi_24_64] \n\t"
503 "extp %[step3_18], $ac0, 31 \n\t"
504 "madd $ac1, %[temp0], %[cospi_24_64] \n\t"
505 "madd $ac1, %[temp1], %[cospi_8_64] \n\t"
506 "extp %[step3_29], $ac1, 31 \n\t"
507
508 : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
509 [step3_18] "=&r"(step3_18), [step3_29] "=&r"(step3_29),
510 [step3_17] "=&r"(step3_17), [step3_30] "=&r"(step3_30)
511 : [const_2_power_13] "r"(const_2_power_13), [step1_17] "r"(step1_17),
512 [step1_18] "r"(step1_18), [step1_30] "r"(step1_30),
513 [step1_29] "r"(step1_29), [cospi_24_64] "r"(cospi_24_64),
514 [cospi_8_64] "r"(cospi_8_64));
515
516 __asm__ __volatile__(
517 "mtlo %[const_2_power_13], $ac0 \n\t"
518 "mthi $zero, $ac0 \n\t"
519 "mtlo %[const_2_power_13], $ac1 \n\t"
520 "mthi $zero, $ac1 \n\t"
521 "sub %[temp0], %[step1_16], %[step1_19] \n\t"
522 "sub %[temp1], %[step1_31], %[step1_28] \n\t"
523 "add %[step3_16], %[step1_16], %[step1_19] \n\t"
524 "add %[step3_31], %[step1_31], %[step1_28] \n\t"
525
526 "msub $ac0, %[temp0], %[cospi_8_64] \n\t"
527 "madd $ac0, %[temp1], %[cospi_24_64] \n\t"
528 "extp %[step3_19], $ac0, 31 \n\t"
529 "madd $ac1, %[temp0], %[cospi_24_64] \n\t"
530 "madd $ac1, %[temp1], %[cospi_8_64] \n\t"
531 "extp %[step3_28], $ac1, 31 \n\t"
532
533 : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
534 [step3_16] "=&r"(step3_16), [step3_31] "=&r"(step3_31),
535 [step3_19] "=&r"(step3_19), [step3_28] "=&r"(step3_28)
536 : [const_2_power_13] "r"(const_2_power_13), [step1_16] "r"(step1_16),
537 [step1_19] "r"(step1_19), [step1_31] "r"(step1_31),
538 [step1_28] "r"(step1_28), [cospi_24_64] "r"(cospi_24_64),
539 [cospi_8_64] "r"(cospi_8_64));
540
541 __asm__ __volatile__(
542 "mtlo %[const_2_power_13], $ac0 \n\t"
543 "mthi $zero, $ac0 \n\t"
544 "mtlo %[const_2_power_13], $ac1 \n\t"
545 "mthi $zero, $ac1 \n\t"
546 "sub %[temp0], %[step1_23], %[step1_20] \n\t"
547 "sub %[temp1], %[step1_24], %[step1_27] \n\t"
548 "add %[step3_23], %[step1_23], %[step1_20] \n\t"
549 "add %[step3_24], %[step1_24], %[step1_27] \n\t"
550
551 "msub $ac0, %[temp0], %[cospi_8_64] \n\t"
552 "madd $ac0, %[temp1], %[cospi_24_64] \n\t"
553 "extp %[step3_27], $ac0, 31 \n\t"
554 "msub $ac1, %[temp0], %[cospi_24_64] \n\t"
555 "msub $ac1, %[temp1], %[cospi_8_64] \n\t"
556 "extp %[step3_20], $ac1, 31 \n\t"
557
558 : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
559 [step3_23] "=&r"(step3_23), [step3_24] "=&r"(step3_24),
560 [step3_20] "=&r"(step3_20), [step3_27] "=&r"(step3_27)
561 : [const_2_power_13] "r"(const_2_power_13), [step1_23] "r"(step1_23),
562 [step1_20] "r"(step1_20), [step1_24] "r"(step1_24),
563 [step1_27] "r"(step1_27), [cospi_24_64] "r"(cospi_24_64),
564 [cospi_8_64] "r"(cospi_8_64));
565
566 __asm__ __volatile__(
567 "mtlo %[const_2_power_13], $ac0 \n\t"
568 "mthi $zero, $ac0 \n\t"
569 "mtlo %[const_2_power_13], $ac1 \n\t"
570 "mthi $zero, $ac1 \n\t"
571 "sub %[temp0], %[step1_22], %[step1_21] \n\t"
572 "sub %[temp1], %[step1_25], %[step1_26] \n\t"
573 "add %[step3_22], %[step1_22], %[step1_21] \n\t"
574 "add %[step3_25], %[step1_25], %[step1_26] \n\t"
575
576 "msub $ac0, %[temp0], %[cospi_24_64] \n\t"
577 "msub $ac0, %[temp1], %[cospi_8_64] \n\t"
578 "extp %[step3_21], $ac0, 31 \n\t"
579 "msub $ac1, %[temp0], %[cospi_8_64] \n\t"
580 "madd $ac1, %[temp1], %[cospi_24_64] \n\t"
581 "extp %[step3_26], $ac1, 31 \n\t"
582
583 : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
584 [step3_22] "=&r"(step3_22), [step3_25] "=&r"(step3_25),
585 [step3_21] "=&r"(step3_21), [step3_26] "=&r"(step3_26)
586 : [const_2_power_13] "r"(const_2_power_13), [step1_22] "r"(step1_22),
587 [step1_21] "r"(step1_21), [step1_25] "r"(step1_25),
588 [step1_26] "r"(step1_26), [cospi_24_64] "r"(cospi_24_64),
589 [cospi_8_64] "r"(cospi_8_64));
590
591 __asm__ __volatile__(
592 "add %[step2_16], %[step3_16], %[step3_23] \n\t"
593 "add %[step2_17], %[step3_17], %[step3_22] \n\t"
594 "add %[step2_18], %[step3_18], %[step3_21] \n\t"
595 "add %[step2_19], %[step3_19], %[step3_20] \n\t"
596 "sub %[step2_20], %[step3_19], %[step3_20] \n\t"
597 "sub %[step2_21], %[step3_18], %[step3_21] \n\t"
598 "sub %[step2_22], %[step3_17], %[step3_22] \n\t"
599 "sub %[step2_23], %[step3_16], %[step3_23] \n\t"
600
601 : [step2_16] "=&r"(step2_16), [step2_17] "=&r"(step2_17),
602 [step2_18] "=&r"(step2_18), [step2_19] "=&r"(step2_19),
603 [step2_20] "=&r"(step2_20), [step2_21] "=&r"(step2_21),
604 [step2_22] "=&r"(step2_22), [step2_23] "=&r"(step2_23)
605 : [step3_16] "r"(step3_16), [step3_23] "r"(step3_23),
606 [step3_17] "r"(step3_17), [step3_22] "r"(step3_22),
607 [step3_18] "r"(step3_18), [step3_21] "r"(step3_21),
608 [step3_19] "r"(step3_19), [step3_20] "r"(step3_20));
609
610 __asm__ __volatile__(
611 "sub %[step2_24], %[step3_31], %[step3_24] \n\t"
612 "sub %[step2_25], %[step3_30], %[step3_25] \n\t"
613 "sub %[step2_26], %[step3_29], %[step3_26] \n\t"
614 "sub %[step2_27], %[step3_28], %[step3_27] \n\t"
615 "add %[step2_28], %[step3_28], %[step3_27] \n\t"
616 "add %[step2_29], %[step3_29], %[step3_26] \n\t"
617 "add %[step2_30], %[step3_30], %[step3_25] \n\t"
618 "add %[step2_31], %[step3_31], %[step3_24] \n\t"
619
620 : [step2_24] "=&r"(step2_24), [step2_28] "=&r"(step2_28),
621 [step2_25] "=&r"(step2_25), [step2_29] "=&r"(step2_29),
622 [step2_26] "=&r"(step2_26), [step2_30] "=&r"(step2_30),
623 [step2_27] "=&r"(step2_27), [step2_31] "=&r"(step2_31)
624 : [step3_31] "r"(step3_31), [step3_24] "r"(step3_24),
625 [step3_30] "r"(step3_30), [step3_25] "r"(step3_25),
626 [step3_29] "r"(step3_29), [step3_26] "r"(step3_26),
627 [step3_28] "r"(step3_28), [step3_27] "r"(step3_27));
628
629 __asm__ __volatile__(
630 "lh %[load1], 0(%[input]) \n\t"
631 "lh %[load2], 32(%[input]) \n\t"
632 "lh %[load3], 16(%[input]) \n\t"
633 "lh %[load4], 48(%[input]) \n\t"
634
635 "mtlo %[const_2_power_13], $ac1 \n\t"
636 "mthi $zero, $ac1 \n\t"
637 "mtlo %[const_2_power_13], $ac2 \n\t"
638 "mthi $zero, $ac2 \n\t"
639 "add %[result1], %[load1], %[load2] \n\t"
640 "sub %[result2], %[load1], %[load2] \n\t"
641 "madd $ac1, %[result1], %[cospi_16_64] \n\t"
642 "madd $ac2, %[result2], %[cospi_16_64] \n\t"
643 "extp %[temp0], $ac1, 31 \n\t"
644 "extp %[temp1], $ac2, 31 \n\t"
645
646 "mtlo %[const_2_power_13], $ac3 \n\t"
647 "mthi $zero, $ac3 \n\t"
648 "madd $ac3, %[load3], %[cospi_24_64] \n\t"
649 "msub $ac3, %[load4], %[cospi_8_64] \n\t"
650 "extp %[temp2], $ac3, 31 \n\t"
651 "mtlo %[const_2_power_13], $ac1 \n\t"
652 "mthi $zero, $ac1 \n\t"
653 "madd $ac1, %[load3], %[cospi_8_64] \n\t"
654 "madd $ac1, %[load4], %[cospi_24_64] \n\t"
655 "extp %[temp3], $ac1, 31 \n\t"
656 "add %[step1_0], %[temp0], %[temp3] \n\t"
657 "add %[step1_1], %[temp1], %[temp2] \n\t"
658 "sub %[step1_2], %[temp1], %[temp2] \n\t"
659 "sub %[step1_3], %[temp0], %[temp3] \n\t"
660
661 : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
662 [load4] "=&r"(load4), [result1] "=&r"(result1),
663 [result2] "=&r"(result2), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
664 [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_0] "=&r"(step1_0),
665 [step1_1] "=&r"(step1_1), [step1_2] "=&r"(step1_2),
666 [step1_3] "=&r"(step1_3)
667 : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
668 [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64),
669 [cospi_16_64] "r"(cospi_16_64));
670
671 __asm__ __volatile__(
672 "lh %[load1], 8(%[input]) \n\t"
673 "lh %[load2], 56(%[input]) \n\t"
674 "lh %[load3], 40(%[input]) \n\t"
675 "lh %[load4], 24(%[input]) \n\t"
676
677 "mtlo %[const_2_power_13], $ac1 \n\t"
678 "mthi $zero, $ac1 \n\t"
679 "mtlo %[const_2_power_13], $ac3 \n\t"
680 "mthi $zero, $ac3 \n\t"
681
682 "madd $ac1, %[load1], %[cospi_28_64] \n\t"
683 "msub $ac1, %[load2], %[cospi_4_64] \n\t"
684 "extp %[temp0], $ac1, 31 \n\t"
685 "madd $ac3, %[load1], %[cospi_4_64] \n\t"
686 "madd $ac3, %[load2], %[cospi_28_64] \n\t"
687 "extp %[temp3], $ac3, 31 \n\t"
688
689 "mtlo %[const_2_power_13], $ac1 \n\t"
690 "mthi $zero, $ac1 \n\t"
691 "mtlo %[const_2_power_13], $ac2 \n\t"
692 "mthi $zero, $ac2 \n\t"
693
694 "madd $ac2, %[load3], %[cospi_12_64] \n\t"
695 "msub $ac2, %[load4], %[cospi_20_64] \n\t"
696 "extp %[temp1], $ac2, 31 \n\t"
697 "madd $ac1, %[load3], %[cospi_20_64] \n\t"
698 "madd $ac1, %[load4], %[cospi_12_64] \n\t"
699 "extp %[temp2], $ac1, 31 \n\t"
700
701 "mtlo %[const_2_power_13], $ac1 \n\t"
702 "mthi $zero, $ac1 \n\t"
703 "mtlo %[const_2_power_13], $ac3 \n\t"
704 "mthi $zero, $ac3 \n\t"
705
706 "sub %[load1], %[temp3], %[temp2] \n\t"
707 "sub %[load1], %[load1], %[temp0] \n\t"
708 "add %[load1], %[load1], %[temp1] \n\t"
709 "sub %[load2], %[temp0], %[temp1] \n\t"
710 "sub %[load2], %[load2], %[temp2] \n\t"
711 "add %[load2], %[load2], %[temp3] \n\t"
712 "madd $ac1, %[load1], %[cospi_16_64] \n\t"
713 "madd $ac3, %[load2], %[cospi_16_64] \n\t"
714
715 "extp %[step1_5], $ac1, 31 \n\t"
716 "extp %[step1_6], $ac3, 31 \n\t"
717 "add %[step1_4], %[temp0], %[temp1] \n\t"
718 "add %[step1_7], %[temp3], %[temp2] \n\t"
719
720 : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
721 [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
722 [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_4] "=&r"(step1_4),
723 [step1_5] "=&r"(step1_5), [step1_6] "=&r"(step1_6),
724 [step1_7] "=&r"(step1_7)
725 : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
726 [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64),
727 [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64),
728 [cospi_16_64] "r"(cospi_16_64));
729
730 __asm__ __volatile__(
731 "add %[step2_0], %[step1_0], %[step1_7] \n\t"
732 "add %[step2_1], %[step1_1], %[step1_6] \n\t"
733 "add %[step2_2], %[step1_2], %[step1_5] \n\t"
734 "add %[step2_3], %[step1_3], %[step1_4] \n\t"
735 "sub %[step2_4], %[step1_3], %[step1_4] \n\t"
736 "sub %[step2_5], %[step1_2], %[step1_5] \n\t"
737 "sub %[step2_6], %[step1_1], %[step1_6] \n\t"
738 "sub %[step2_7], %[step1_0], %[step1_7] \n\t"
739
740 : [step2_0] "=&r"(step2_0), [step2_4] "=&r"(step2_4),
741 [step2_1] "=&r"(step2_1), [step2_5] "=&r"(step2_5),
742 [step2_2] "=&r"(step2_2), [step2_6] "=&r"(step2_6),
743 [step2_3] "=&r"(step2_3), [step2_7] "=&r"(step2_7)
744 : [step1_0] "r"(step1_0), [step1_7] "r"(step1_7),
745 [step1_1] "r"(step1_1), [step1_6] "r"(step1_6),
746 [step1_2] "r"(step1_2), [step1_5] "r"(step1_5),
747 [step1_3] "r"(step1_3), [step1_4] "r"(step1_4));
748
749 // stage 7
750 __asm__ __volatile__(
751 "add %[step1_0], %[step2_0], %[step3_15] \n\t"
752 "add %[step1_1], %[step2_1], %[step3_14] \n\t"
753 "add %[step1_2], %[step2_2], %[step3_13] \n\t"
754 "add %[step1_3], %[step2_3], %[step3_12] \n\t"
755 "sub %[step1_12], %[step2_3], %[step3_12] \n\t"
756 "sub %[step1_13], %[step2_2], %[step3_13] \n\t"
757 "sub %[step1_14], %[step2_1], %[step3_14] \n\t"
758 "sub %[step1_15], %[step2_0], %[step3_15] \n\t"
759
760 : [step1_0] "=&r"(step1_0), [step1_12] "=&r"(step1_12),
761 [step1_1] "=&r"(step1_1), [step1_13] "=&r"(step1_13),
762 [step1_2] "=&r"(step1_2), [step1_14] "=&r"(step1_14),
763 [step1_3] "=&r"(step1_3), [step1_15] "=&r"(step1_15)
764 : [step2_0] "r"(step2_0), [step3_15] "r"(step3_15),
765 [step2_1] "r"(step2_1), [step3_14] "r"(step3_14),
766 [step2_2] "r"(step2_2), [step3_13] "r"(step3_13),
767 [step2_3] "r"(step2_3), [step3_12] "r"(step3_12));
768
769 __asm__ __volatile__(
770 "add %[step1_4], %[step2_4], %[step3_11] \n\t"
771 "add %[step1_5], %[step2_5], %[step3_10] \n\t"
772 "add %[step1_6], %[step2_6], %[step3_9] \n\t"
773 "add %[step1_7], %[step2_7], %[step3_8] \n\t"
774 "sub %[step1_8], %[step2_7], %[step3_8] \n\t"
775 "sub %[step1_9], %[step2_6], %[step3_9] \n\t"
776 "sub %[step1_10], %[step2_5], %[step3_10] \n\t"
777 "sub %[step1_11], %[step2_4], %[step3_11] \n\t"
778
779 : [step1_4] "=&r"(step1_4), [step1_8] "=&r"(step1_8),
780 [step1_5] "=&r"(step1_5), [step1_9] "=&r"(step1_9),
781 [step1_6] "=&r"(step1_6), [step1_10] "=&r"(step1_10),
782 [step1_7] "=&r"(step1_7), [step1_11] "=&r"(step1_11)
783 : [step2_4] "r"(step2_4), [step3_11] "r"(step3_11),
784 [step2_5] "r"(step2_5), [step3_10] "r"(step3_10),
785 [step2_6] "r"(step2_6), [step3_9] "r"(step3_9),
786 [step2_7] "r"(step2_7), [step3_8] "r"(step3_8));
787
788 __asm__ __volatile__(
789 "sub %[temp0], %[step2_27], %[step2_20] \n\t"
790 "add %[temp1], %[step2_27], %[step2_20] \n\t"
791 "sub %[temp2], %[step2_26], %[step2_21] \n\t"
792 "add %[temp3], %[step2_26], %[step2_21] \n\t"
793
794 "mtlo %[const_2_power_13], $ac0 \n\t"
795 "mthi $zero, $ac0 \n\t"
796 "mtlo %[const_2_power_13], $ac1 \n\t"
797 "mthi $zero, $ac1 \n\t"
798 "mtlo %[const_2_power_13], $ac2 \n\t"
799 "mthi $zero, $ac2 \n\t"
800 "mtlo %[const_2_power_13], $ac3 \n\t"
801 "mthi $zero, $ac3 \n\t"
802
803 "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
804 "madd $ac1, %[temp1], %[cospi_16_64] \n\t"
805 "madd $ac2, %[temp2], %[cospi_16_64] \n\t"
806 "madd $ac3, %[temp3], %[cospi_16_64] \n\t"
807
808 "extp %[step1_20], $ac0, 31 \n\t"
809 "extp %[step1_27], $ac1, 31 \n\t"
810 "extp %[step1_21], $ac2, 31 \n\t"
811 "extp %[step1_26], $ac3, 31 \n\t"
812
813 : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
814 [temp3] "=&r"(temp3), [step1_20] "=&r"(step1_20),
815 [step1_27] "=&r"(step1_27), [step1_21] "=&r"(step1_21),
816 [step1_26] "=&r"(step1_26)
817 : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20),
818 [step2_27] "r"(step2_27), [step2_21] "r"(step2_21),
819 [step2_26] "r"(step2_26), [cospi_16_64] "r"(cospi_16_64));
820
821 __asm__ __volatile__(
822 "sub %[temp0], %[step2_25], %[step2_22] \n\t"
823 "add %[temp1], %[step2_25], %[step2_22] \n\t"
824 "sub %[temp2], %[step2_24], %[step2_23] \n\t"
825 "add %[temp3], %[step2_24], %[step2_23] \n\t"
826
827 "mtlo %[const_2_power_13], $ac0 \n\t"
828 "mthi $zero, $ac0 \n\t"
829 "mtlo %[const_2_power_13], $ac1 \n\t"
830 "mthi $zero, $ac1 \n\t"
831 "mtlo %[const_2_power_13], $ac2 \n\t"
832 "mthi $zero, $ac2 \n\t"
833 "mtlo %[const_2_power_13], $ac3 \n\t"
834 "mthi $zero, $ac3 \n\t"
835
836 "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
837 "madd $ac1, %[temp1], %[cospi_16_64] \n\t"
838 "madd $ac2, %[temp2], %[cospi_16_64] \n\t"
839 "madd $ac3, %[temp3], %[cospi_16_64] \n\t"
840
841 "extp %[step1_22], $ac0, 31 \n\t"
842 "extp %[step1_25], $ac1, 31 \n\t"
843 "extp %[step1_23], $ac2, 31 \n\t"
844 "extp %[step1_24], $ac3, 31 \n\t"
845
846 : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
847 [temp3] "=&r"(temp3), [step1_22] "=&r"(step1_22),
848 [step1_25] "=&r"(step1_25), [step1_23] "=&r"(step1_23),
849 [step1_24] "=&r"(step1_24)
850 : [const_2_power_13] "r"(const_2_power_13), [step2_22] "r"(step2_22),
851 [step2_25] "r"(step2_25), [step2_23] "r"(step2_23),
852 [step2_24] "r"(step2_24), [cospi_16_64] "r"(cospi_16_64));
853
854 // final stage
855 __asm__ __volatile__(
856 "add %[temp0], %[step1_0], %[step2_31] \n\t"
857 "add %[temp1], %[step1_1], %[step2_30] \n\t"
858 "add %[temp2], %[step1_2], %[step2_29] \n\t"
859 "add %[temp3], %[step1_3], %[step2_28] \n\t"
860 "sub %[load1], %[step1_3], %[step2_28] \n\t"
861 "sub %[load2], %[step1_2], %[step2_29] \n\t"
862 "sub %[load3], %[step1_1], %[step2_30] \n\t"
863 "sub %[load4], %[step1_0], %[step2_31] \n\t"
864 "sh %[temp0], 0(%[output]) \n\t"
865 "sh %[temp1], 64(%[output]) \n\t"
866 "sh %[temp2], 128(%[output]) \n\t"
867 "sh %[temp3], 192(%[output]) \n\t"
868 "sh %[load1], 1792(%[output]) \n\t"
869 "sh %[load2], 1856(%[output]) \n\t"
870 "sh %[load3], 1920(%[output]) \n\t"
871 "sh %[load4], 1984(%[output]) \n\t"
872
873 : [temp0] "=&r"(temp0), [load1] "=&r"(load1), [temp1] "=&r"(temp1),
874 [load2] "=&r"(load2), [temp2] "=&r"(temp2), [load3] "=&r"(load3),
875 [temp3] "=&r"(temp3), [load4] "=&r"(load4)
876 : [step1_0] "r"(step1_0), [step2_31] "r"(step2_31),
877 [step1_1] "r"(step1_1), [step2_30] "r"(step2_30),
878 [step1_2] "r"(step1_2), [step2_29] "r"(step2_29),
879 [step1_3] "r"(step1_3), [step2_28] "r"(step2_28),
880 [output] "r"(output));
881
882 __asm__ __volatile__(
883 "add %[temp0], %[step1_4], %[step1_27] \n\t"
884 "add %[temp1], %[step1_5], %[step1_26] \n\t"
885 "add %[temp2], %[step1_6], %[step1_25] \n\t"
886 "add %[temp3], %[step1_7], %[step1_24] \n\t"
887 "sub %[load1], %[step1_7], %[step1_24] \n\t"
888 "sub %[load2], %[step1_6], %[step1_25] \n\t"
889 "sub %[load3], %[step1_5], %[step1_26] \n\t"
890 "sub %[load4], %[step1_4], %[step1_27] \n\t"
891 "sh %[temp0], 256(%[output]) \n\t"
892 "sh %[temp1], 320(%[output]) \n\t"
893 "sh %[temp2], 384(%[output]) \n\t"
894 "sh %[temp3], 448(%[output]) \n\t"
895 "sh %[load1], 1536(%[output]) \n\t"
896 "sh %[load2], 1600(%[output]) \n\t"
897 "sh %[load3], 1664(%[output]) \n\t"
898 "sh %[load4], 1728(%[output]) \n\t"
899
900 : [temp0] "=&r"(temp0), [load1] "=&r"(load1), [temp1] "=&r"(temp1),
901 [load2] "=&r"(load2), [temp2] "=&r"(temp2), [load3] "=&r"(load3),
902 [temp3] "=&r"(temp3), [load4] "=&r"(load4)
903 : [step1_4] "r"(step1_4), [step1_27] "r"(step1_27),
904 [step1_5] "r"(step1_5), [step1_26] "r"(step1_26),
905 [step1_6] "r"(step1_6), [step1_25] "r"(step1_25),
906 [step1_7] "r"(step1_7), [step1_24] "r"(step1_24),
907 [output] "r"(output));
908
909 __asm__ __volatile__(
910 "add %[temp0], %[step1_8], %[step1_23] \n\t"
911 "add %[temp1], %[step1_9], %[step1_22] \n\t"
912 "add %[temp2], %[step1_10], %[step1_21] \n\t"
913 "add %[temp3], %[step1_11], %[step1_20] \n\t"
914 "sub %[load1], %[step1_11], %[step1_20] \n\t"
915 "sub %[load2], %[step1_10], %[step1_21] \n\t"
916 "sub %[load3], %[step1_9], %[step1_22] \n\t"
917 "sub %[load4], %[step1_8], %[step1_23] \n\t"
918 "sh %[temp0], 512(%[output]) \n\t"
919 "sh %[temp1], 576(%[output]) \n\t"
920 "sh %[temp2], 640(%[output]) \n\t"
921 "sh %[temp3], 704(%[output]) \n\t"
922 "sh %[load1], 1280(%[output]) \n\t"
923 "sh %[load2], 1344(%[output]) \n\t"
924 "sh %[load3], 1408(%[output]) \n\t"
925 "sh %[load4], 1472(%[output]) \n\t"
926
927 : [temp0] "=&r"(temp0), [load1] "=&r"(load1), [temp1] "=&r"(temp1),
928 [load2] "=&r"(load2), [temp2] "=&r"(temp2), [load3] "=&r"(load3),
929 [temp3] "=&r"(temp3), [load4] "=&r"(load4)
930 : [step1_8] "r"(step1_8), [step1_23] "r"(step1_23),
931 [step1_9] "r"(step1_9), [step1_22] "r"(step1_22),
932 [step1_10] "r"(step1_10), [step1_21] "r"(step1_21),
933 [step1_11] "r"(step1_11), [step1_20] "r"(step1_20),
934 [output] "r"(output));
935
936 __asm__ __volatile__(
937 "add %[temp0], %[step1_12], %[step2_19] \n\t"
938 "add %[temp1], %[step1_13], %[step2_18] \n\t"
939 "add %[temp2], %[step1_14], %[step2_17] \n\t"
940 "add %[temp3], %[step1_15], %[step2_16] \n\t"
941 "sub %[load1], %[step1_15], %[step2_16] \n\t"
942 "sub %[load2], %[step1_14], %[step2_17] \n\t"
943 "sub %[load3], %[step1_13], %[step2_18] \n\t"
944 "sub %[load4], %[step1_12], %[step2_19] \n\t"
945 "sh %[temp0], 768(%[output]) \n\t"
946 "sh %[temp1], 832(%[output]) \n\t"
947 "sh %[temp2], 896(%[output]) \n\t"
948 "sh %[temp3], 960(%[output]) \n\t"
949 "sh %[load1], 1024(%[output]) \n\t"
950 "sh %[load2], 1088(%[output]) \n\t"
951 "sh %[load3], 1152(%[output]) \n\t"
952 "sh %[load4], 1216(%[output]) \n\t"
953
954 : [temp0] "=&r"(temp0), [load1] "=&r"(load1), [temp1] "=&r"(temp1),
955 [load2] "=&r"(load2), [temp2] "=&r"(temp2), [load3] "=&r"(load3),
956 [temp3] "=&r"(temp3), [load4] "=&r"(load4)
957 : [step1_12] "r"(step1_12), [step2_19] "r"(step2_19),
958 [step1_13] "r"(step1_13), [step2_18] "r"(step2_18),
959 [step1_14] "r"(step1_14), [step2_17] "r"(step2_17),
960 [step1_15] "r"(step1_15), [step2_16] "r"(step2_16),
961 [output] "r"(output));
962
963 input += 32;
964 output += 1;
965 }
966 }
967
vpx_idct32x32_1024_add_dspr2(const int16_t * input,uint8_t * dest,int stride)968 void vpx_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest,
969 int stride) {
970 DECLARE_ALIGNED(32, int16_t, out[32 * 32]);
971 int16_t *outptr = out;
972 uint32_t pos = 45;
973
974 /* bit positon for extract from acc */
975 __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
976 :
977 : [pos] "r"(pos));
978
979 // Rows
980 idct32_rows_dspr2(input, outptr, 32);
981
982 // Columns
983 vpx_idct32_cols_add_blk_dspr2(out, dest, stride);
984 }
985
vpx_idct32x32_34_add_dspr2(const int16_t * input,uint8_t * dest,int stride)986 void vpx_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest,
987 int stride) {
988 DECLARE_ALIGNED(32, int16_t, out[32 * 32]);
989 int16_t *outptr = out;
990 uint32_t i;
991 uint32_t pos = 45;
992
993 /* bit positon for extract from acc */
994 __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
995 :
996 : [pos] "r"(pos));
997
998 // Rows
999 idct32_rows_dspr2(input, outptr, 8);
1000
1001 outptr += 8;
1002 __asm__ __volatile__(
1003 "sw $zero, 0(%[outptr]) \n\t"
1004 "sw $zero, 4(%[outptr]) \n\t"
1005 "sw $zero, 8(%[outptr]) \n\t"
1006 "sw $zero, 12(%[outptr]) \n\t"
1007 "sw $zero, 16(%[outptr]) \n\t"
1008 "sw $zero, 20(%[outptr]) \n\t"
1009 "sw $zero, 24(%[outptr]) \n\t"
1010 "sw $zero, 28(%[outptr]) \n\t"
1011 "sw $zero, 32(%[outptr]) \n\t"
1012 "sw $zero, 36(%[outptr]) \n\t"
1013 "sw $zero, 40(%[outptr]) \n\t"
1014 "sw $zero, 44(%[outptr]) \n\t"
1015
1016 :
1017 : [outptr] "r"(outptr));
1018
1019 for (i = 0; i < 31; ++i) {
1020 outptr += 32;
1021
1022 __asm__ __volatile__(
1023 "sw $zero, 0(%[outptr]) \n\t"
1024 "sw $zero, 4(%[outptr]) \n\t"
1025 "sw $zero, 8(%[outptr]) \n\t"
1026 "sw $zero, 12(%[outptr]) \n\t"
1027 "sw $zero, 16(%[outptr]) \n\t"
1028 "sw $zero, 20(%[outptr]) \n\t"
1029 "sw $zero, 24(%[outptr]) \n\t"
1030 "sw $zero, 28(%[outptr]) \n\t"
1031 "sw $zero, 32(%[outptr]) \n\t"
1032 "sw $zero, 36(%[outptr]) \n\t"
1033 "sw $zero, 40(%[outptr]) \n\t"
1034 "sw $zero, 44(%[outptr]) \n\t"
1035
1036 :
1037 : [outptr] "r"(outptr));
1038 }
1039
1040 // Columns
1041 vpx_idct32_cols_add_blk_dspr2(out, dest, stride);
1042 }
1043
/*
 * DC-only 32x32 inverse transform and add.
 *
 * A single offset a1 is derived from input[0] as
 *   a1 = (DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]) + 32) >> 6
 * and applied to 32 rows of 32 bytes starting at dest, advancing by stride
 * bytes per row. The update uses the quad-byte saturating instructions
 * (addu_s.qb / subu_s.qb), so each byte is clamped to 0..255.
 *
 * input  - coefficient block; only input[0] is read.
 * dest   - pixels updated in place. Rows are accessed with lw/sw, so dest and
 *          stride are assumed to keep every row four-byte aligned.
 * stride - distance in bytes between consecutive rows of dest.
 *
 * replv.qb replicates only the low 8 bits of its source register, so an
 * offset whose magnitude exceeds 255 cannot be replicated directly:
 *   - a1 > 255 is split into two byte-sized halves that are added in turn;
 *   - a1 < -255 has its magnitude clamped to 255. Subtracting 255 with
 *     saturation from any byte already yields 0, which is exactly what a
 *     larger magnitude would produce.
 */
void vpx_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest,
                               int stride) {
  int r, out;
  int32_t a1, absa1;
  int32_t vector_a1;
  int32_t t1, t2, t3, t4;
  int32_t vector_1, vector_2, vector_3, vector_4;
  uint32_t pos = 45;

  /* bit position for extract from acc */
  __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"

                       :
                       : [pos] "r"(pos));

  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
  /* a1 = (out + 32) >> 6 */
  __asm__ __volatile__(
      "addi     %[out],    %[out],    32      \n\t"
      "sra      %[a1],     %[out],    6       \n\t"

      : [out] "+r"(out), [a1] "=r"(a1)
      :);

  if (a1 < 0) {
    /* use quad-byte
     * input and output memory are four byte aligned */
    __asm__ __volatile__(
        "abs        %[absa1],       %[a1]       \n\t"

        : [absa1] "=&r"(absa1)
        : [a1] "r"(a1));

    /* Without this clamp a magnitude of 256 would be replicated as 0 and the
     * block would be left unchanged instead of being driven to 0. */
    if (absa1 > 255) absa1 = 255;

    __asm__ __volatile__(
        "replv.qb   %[vector_a1],   %[absa1]    \n\t"

        : [vector_a1] "=&r"(vector_a1)
        : [absa1] "r"(absa1));

    for (r = 32; r--;) {
      __asm__ __volatile__(
          "lw             %[t1],          0(%[dest])                      \n\t"
          "lw             %[t2],          4(%[dest])                      \n\t"
          "lw             %[t3],          8(%[dest])                      \n\t"
          "lw             %[t4],          12(%[dest])                     \n\t"
          "subu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"
          "subu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"
          "subu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"
          "subu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"
          "sw             %[vector_1],    0(%[dest])                      \n\t"
          "sw             %[vector_2],    4(%[dest])                      \n\t"
          "sw             %[vector_3],    8(%[dest])                      \n\t"
          "sw             %[vector_4],    12(%[dest])                     \n\t"

          "lw             %[t1],          16(%[dest])                     \n\t"
          "lw             %[t2],          20(%[dest])                     \n\t"
          "lw             %[t3],          24(%[dest])                     \n\t"
          "lw             %[t4],          28(%[dest])                     \n\t"
          "subu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"
          "subu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"
          "subu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"
          "subu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"
          "sw             %[vector_1],    16(%[dest])                     \n\t"
          "sw             %[vector_2],    20(%[dest])                     \n\t"
          "sw             %[vector_3],    24(%[dest])                     \n\t"
          "sw             %[vector_4],    28(%[dest])                     \n\t"

          "add            %[dest],        %[dest],        %[stride]       \n\t"

          : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4),
            [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
            [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
            [dest] "+&r"(dest)
          : [stride] "r"(stride), [vector_a1] "r"(vector_a1)
          /* dest is read and written through a register operand. */
          : "memory");
    }
  } else if (a1 > 255) {
    int32_t a11, a12, vector_a11, vector_a12;

    /* use quad-byte
     * input and output memory are four byte aligned */
    /* Split the offset into two parts that are applied one after the other
     * with saturating adds. */
    a11 = a1 >> 1;
    a12 = a1 - a11;
    __asm__ __volatile__(
        "replv.qb       %[vector_a11],  %[a11]     \n\t"
        "replv.qb       %[vector_a12],  %[a12]     \n\t"

        : [vector_a11] "=&r"(vector_a11), [vector_a12] "=&r"(vector_a12)
        : [a11] "r"(a11), [a12] "r"(a12));

    for (r = 32; r--;) {
      __asm__ __volatile__(
          "lw             %[t1],          0(%[dest])                      \n\t"
          "lw             %[t2],          4(%[dest])                      \n\t"
          "lw             %[t3],          8(%[dest])                      \n\t"
          "lw             %[t4],          12(%[dest])                     \n\t"
          "addu_s.qb      %[vector_1],    %[t1],          %[vector_a11]   \n\t"
          "addu_s.qb      %[vector_2],    %[t2],          %[vector_a11]   \n\t"
          "addu_s.qb      %[vector_3],    %[t3],          %[vector_a11]   \n\t"
          "addu_s.qb      %[vector_4],    %[t4],          %[vector_a11]   \n\t"
          "addu_s.qb      %[vector_1],    %[vector_1],    %[vector_a12]   \n\t"
          "addu_s.qb      %[vector_2],    %[vector_2],    %[vector_a12]   \n\t"
          "addu_s.qb      %[vector_3],    %[vector_3],    %[vector_a12]   \n\t"
          "addu_s.qb      %[vector_4],    %[vector_4],    %[vector_a12]   \n\t"
          "sw             %[vector_1],    0(%[dest])                      \n\t"
          "sw             %[vector_2],    4(%[dest])                      \n\t"
          "sw             %[vector_3],    8(%[dest])                      \n\t"
          "sw             %[vector_4],    12(%[dest])                     \n\t"

          "lw             %[t1],          16(%[dest])                     \n\t"
          "lw             %[t2],          20(%[dest])                     \n\t"
          "lw             %[t3],          24(%[dest])                     \n\t"
          "lw             %[t4],          28(%[dest])                     \n\t"
          "addu_s.qb      %[vector_1],    %[t1],          %[vector_a11]   \n\t"
          "addu_s.qb      %[vector_2],    %[t2],          %[vector_a11]   \n\t"
          "addu_s.qb      %[vector_3],    %[t3],          %[vector_a11]   \n\t"
          "addu_s.qb      %[vector_4],    %[t4],          %[vector_a11]   \n\t"
          "addu_s.qb      %[vector_1],    %[vector_1],    %[vector_a12]   \n\t"
          "addu_s.qb      %[vector_2],    %[vector_2],    %[vector_a12]   \n\t"
          "addu_s.qb      %[vector_3],    %[vector_3],    %[vector_a12]   \n\t"
          "addu_s.qb      %[vector_4],    %[vector_4],    %[vector_a12]   \n\t"
          "sw             %[vector_1],    16(%[dest])                     \n\t"
          "sw             %[vector_2],    20(%[dest])                     \n\t"
          "sw             %[vector_3],    24(%[dest])                     \n\t"
          "sw             %[vector_4],    28(%[dest])                     \n\t"

          "add            %[dest],        %[dest],        %[stride]       \n\t"

          : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4),
            [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
            [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
            [dest] "+&r"(dest)
          : [stride] "r"(stride), [vector_a11] "r"(vector_a11),
            [vector_a12] "r"(vector_a12)
          /* dest is read and written through a register operand. */
          : "memory");
    }
  } else {
    /* use quad-byte
     * input and output memory are four byte aligned */
    __asm__ __volatile__("replv.qb       %[vector_a1],   %[a1]     \n\t"

                         : [vector_a1] "=&r"(vector_a1)
                         : [a1] "r"(a1));

    for (r = 32; r--;) {
      __asm__ __volatile__(
          "lw             %[t1],          0(%[dest])                      \n\t"
          "lw             %[t2],          4(%[dest])                      \n\t"
          "lw             %[t3],          8(%[dest])                      \n\t"
          "lw             %[t4],          12(%[dest])                     \n\t"
          "addu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"
          "addu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"
          "addu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"
          "addu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"
          "sw             %[vector_1],    0(%[dest])                      \n\t"
          "sw             %[vector_2],    4(%[dest])                      \n\t"
          "sw             %[vector_3],    8(%[dest])                      \n\t"
          "sw             %[vector_4],    12(%[dest])                     \n\t"

          "lw             %[t1],          16(%[dest])                     \n\t"
          "lw             %[t2],          20(%[dest])                     \n\t"
          "lw             %[t3],          24(%[dest])                     \n\t"
          "lw             %[t4],          28(%[dest])                     \n\t"
          "addu_s.qb      %[vector_1],    %[t1],          %[vector_a1]    \n\t"
          "addu_s.qb      %[vector_2],    %[t2],          %[vector_a1]    \n\t"
          "addu_s.qb      %[vector_3],    %[t3],          %[vector_a1]    \n\t"
          "addu_s.qb      %[vector_4],    %[t4],          %[vector_a1]    \n\t"
          "sw             %[vector_1],    16(%[dest])                     \n\t"
          "sw             %[vector_2],    20(%[dest])                     \n\t"
          "sw             %[vector_3],    24(%[dest])                     \n\t"
          "sw             %[vector_4],    28(%[dest])                     \n\t"

          "add            %[dest],        %[dest],        %[stride]       \n\t"

          : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4),
            [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2),
            [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4),
            [dest] "+&r"(dest)
          : [stride] "r"(stride), [vector_a1] "r"(vector_a1)
          /* dest is read and written through a register operand. */
          : "memory");
    }
  }
}
1218 #endif // #if HAVE_DSPR2
1219