1 /*
2 * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "./vpx_config.h"
12 #include "vpx_dsp/mips/inv_txfm_dspr2.h"
13 #include "vpx_dsp/txfm_common.h"
14
15 #if HAVE_DSPR2
vpx_idct32_cols_add_blk_dspr2(int16_t * input,uint8_t * dest,int stride)16 void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride) {
17 int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6;
18 int step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13;
19 int step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20;
20 int step1_21, step1_22, step1_23, step1_24, step1_25, step1_26, step1_27;
21 int step1_28, step1_29, step1_30, step1_31;
22 int step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
23 int step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13;
24 int step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20;
25 int step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27;
26 int step2_28, step2_29, step2_30, step2_31;
27 int step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14;
28 int step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21;
29 int step3_22, step3_23, step3_24, step3_25, step3_26, step3_27, step3_28;
30 int step3_29, step3_30, step3_31;
31 int temp0, temp1, temp2, temp3;
32 int load1, load2, load3, load4;
33 int result1, result2;
34 int i;
35 uint8_t *dest_pix, *dest_pix1;
36 const int const_2_power_13 = 8192;
37 uint8_t *cm = vpx_ff_cropTbl;
38
39 /* prefetch vpx_ff_cropTbl */
40 prefetch_load(vpx_ff_cropTbl);
41 prefetch_load(vpx_ff_cropTbl + 32);
42 prefetch_load(vpx_ff_cropTbl + 64);
43 prefetch_load(vpx_ff_cropTbl + 96);
44 prefetch_load(vpx_ff_cropTbl + 128);
45 prefetch_load(vpx_ff_cropTbl + 160);
46 prefetch_load(vpx_ff_cropTbl + 192);
47 prefetch_load(vpx_ff_cropTbl + 224);
48
49 for (i = 0; i < 32; ++i) {
50 dest_pix = dest + i;
51 dest_pix1 = dest + i + 31 * stride;
52
53 __asm__ __volatile__(
54 "lh %[load1], 2(%[input]) \n\t"
55 "lh %[load2], 62(%[input]) \n\t"
56 "lh %[load3], 34(%[input]) \n\t"
57 "lh %[load4], 30(%[input]) \n\t"
58
59 "mtlo %[const_2_power_13], $ac1 \n\t"
60 "mthi $zero, $ac1 \n\t"
61 "mtlo %[const_2_power_13], $ac3 \n\t"
62 "mthi $zero, $ac3 \n\t"
63
64 "madd $ac1, %[load1], %[cospi_31_64] \n\t"
65 "msub $ac1, %[load2], %[cospi_1_64] \n\t"
66 "extp %[temp0], $ac1, 31 \n\t"
67
68 "madd $ac3, %[load1], %[cospi_1_64] \n\t"
69 "madd $ac3, %[load2], %[cospi_31_64] \n\t"
70 "extp %[temp3], $ac3, 31 \n\t"
71
72 "mtlo %[const_2_power_13], $ac1 \n\t"
73 "mthi $zero, $ac1 \n\t"
74 "mtlo %[const_2_power_13], $ac2 \n\t"
75 "mthi $zero, $ac2 \n\t"
76
77 "madd $ac2, %[load3], %[cospi_15_64] \n\t"
78 "msub $ac2, %[load4], %[cospi_17_64] \n\t"
79 "extp %[temp1], $ac2, 31 \n\t"
80
81 "madd $ac1, %[load3], %[cospi_17_64] \n\t"
82 "madd $ac1, %[load4], %[cospi_15_64] \n\t"
83 "extp %[temp2], $ac1, 31 \n\t"
84
85 "mtlo %[const_2_power_13], $ac1 \n\t"
86 "mthi $zero, $ac1 \n\t"
87 "mtlo %[const_2_power_13], $ac3 \n\t"
88 "mthi $zero, $ac3 \n\t"
89
90 "sub %[load1], %[temp3], %[temp2] \n\t"
91 "sub %[load2], %[temp0], %[temp1] \n\t"
92
93 "madd $ac1, %[load1], %[cospi_28_64] \n\t"
94 "msub $ac1, %[load2], %[cospi_4_64] \n\t"
95 "madd $ac3, %[load1], %[cospi_4_64] \n\t"
96 "madd $ac3, %[load2], %[cospi_28_64] \n\t"
97
98 "extp %[step1_17], $ac1, 31 \n\t"
99 "extp %[step1_30], $ac3, 31 \n\t"
100 "add %[step1_16], %[temp0], %[temp1] \n\t"
101 "add %[step1_31], %[temp2], %[temp3] \n\t"
102
103 : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
104 [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
105 [temp2] "=&r"(temp2), [temp3] "=&r"(temp3),
106 [step1_16] "=&r"(step1_16), [step1_17] "=&r"(step1_17),
107 [step1_30] "=&r"(step1_30), [step1_31] "=&r"(step1_31)
108 : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
109 [cospi_31_64] "r"(cospi_31_64), [cospi_1_64] "r"(cospi_1_64),
110 [cospi_4_64] "r"(cospi_4_64), [cospi_17_64] "r"(cospi_17_64),
111 [cospi_15_64] "r"(cospi_15_64), [cospi_28_64] "r"(cospi_28_64));
112
113 __asm__ __volatile__(
114 "lh %[load1], 18(%[input]) \n\t"
115 "lh %[load2], 46(%[input]) \n\t"
116 "lh %[load3], 50(%[input]) \n\t"
117 "lh %[load4], 14(%[input]) \n\t"
118
119 "mtlo %[const_2_power_13], $ac1 \n\t"
120 "mthi $zero, $ac1 \n\t"
121 "mtlo %[const_2_power_13], $ac3 \n\t"
122 "mthi $zero, $ac3 \n\t"
123
124 "madd $ac1, %[load1], %[cospi_23_64] \n\t"
125 "msub $ac1, %[load2], %[cospi_9_64] \n\t"
126 "extp %[temp0], $ac1, 31 \n\t"
127
128 "madd $ac3, %[load1], %[cospi_9_64] \n\t"
129 "madd $ac3, %[load2], %[cospi_23_64] \n\t"
130 "extp %[temp3], $ac3, 31 \n\t"
131
132 "mtlo %[const_2_power_13], $ac1 \n\t"
133 "mthi $zero, $ac1 \n\t"
134 "mtlo %[const_2_power_13], $ac2 \n\t"
135 "mthi $zero, $ac2 \n\t"
136
137 "madd $ac2, %[load3], %[cospi_7_64] \n\t"
138 "msub $ac2, %[load4], %[cospi_25_64] \n\t"
139 "extp %[temp1], $ac2, 31 \n\t"
140
141 "madd $ac1, %[load3], %[cospi_25_64] \n\t"
142 "madd $ac1, %[load4], %[cospi_7_64] \n\t"
143 "extp %[temp2], $ac1, 31 \n\t"
144
145 "mtlo %[const_2_power_13], $ac1 \n\t"
146 "mthi $zero, $ac1 \n\t"
147 "mtlo %[const_2_power_13], $ac3 \n\t"
148 "mthi $zero, $ac3 \n\t"
149
150 "sub %[load1], %[temp1], %[temp0] \n\t"
151 "sub %[load2], %[temp2], %[temp3] \n\t"
152
153 "msub $ac1, %[load1], %[cospi_28_64] \n\t"
154 "msub $ac1, %[load2], %[cospi_4_64] \n\t"
155 "msub $ac3, %[load1], %[cospi_4_64] \n\t"
156 "madd $ac3, %[load2], %[cospi_28_64] \n\t"
157
158 "extp %[step1_18], $ac1, 31 \n\t"
159 "extp %[step1_29], $ac3, 31 \n\t"
160 "add %[step1_19], %[temp0], %[temp1] \n\t"
161 "add %[step1_28], %[temp2], %[temp3] \n\t"
162
163 : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
164 [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
165 [temp2] "=&r"(temp2), [temp3] "=&r"(temp3),
166 [step1_18] "=&r"(step1_18), [step1_19] "=&r"(step1_19),
167 [step1_28] "=&r"(step1_28), [step1_29] "=&r"(step1_29)
168 : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
169 [cospi_23_64] "r"(cospi_23_64), [cospi_9_64] "r"(cospi_9_64),
170 [cospi_4_64] "r"(cospi_4_64), [cospi_7_64] "r"(cospi_7_64),
171 [cospi_25_64] "r"(cospi_25_64), [cospi_28_64] "r"(cospi_28_64));
172
173 __asm__ __volatile__(
174 "lh %[load1], 10(%[input]) \n\t"
175 "lh %[load2], 54(%[input]) \n\t"
176 "lh %[load3], 42(%[input]) \n\t"
177 "lh %[load4], 22(%[input]) \n\t"
178
179 "mtlo %[const_2_power_13], $ac1 \n\t"
180 "mthi $zero, $ac1 \n\t"
181 "mtlo %[const_2_power_13], $ac3 \n\t"
182 "mthi $zero, $ac3 \n\t"
183
184 "madd $ac1, %[load1], %[cospi_27_64] \n\t"
185 "msub $ac1, %[load2], %[cospi_5_64] \n\t"
186 "extp %[temp0], $ac1, 31 \n\t"
187
188 "madd $ac3, %[load1], %[cospi_5_64] \n\t"
189 "madd $ac3, %[load2], %[cospi_27_64] \n\t"
190 "extp %[temp3], $ac3, 31 \n\t"
191
192 "mtlo %[const_2_power_13], $ac1 \n\t"
193 "mthi $zero, $ac1 \n\t"
194 "mtlo %[const_2_power_13], $ac2 \n\t"
195 "mthi $zero, $ac2 \n\t"
196
197 "madd $ac2, %[load3], %[cospi_11_64] \n\t"
198 "msub $ac2, %[load4], %[cospi_21_64] \n\t"
199 "extp %[temp1], $ac2, 31 \n\t"
200
201 "madd $ac1, %[load3], %[cospi_21_64] \n\t"
202 "madd $ac1, %[load4], %[cospi_11_64] \n\t"
203 "extp %[temp2], $ac1, 31 \n\t"
204
205 "mtlo %[const_2_power_13], $ac1 \n\t"
206 "mthi $zero, $ac1 \n\t"
207 "mtlo %[const_2_power_13], $ac3 \n\t"
208 "mthi $zero, $ac3 \n\t"
209
210 "sub %[load1], %[temp0], %[temp1] \n\t"
211 "sub %[load2], %[temp3], %[temp2] \n\t"
212
213 "madd $ac1, %[load2], %[cospi_12_64] \n\t"
214 "msub $ac1, %[load1], %[cospi_20_64] \n\t"
215 "madd $ac3, %[load1], %[cospi_12_64] \n\t"
216 "madd $ac3, %[load2], %[cospi_20_64] \n\t"
217
218 "extp %[step1_21], $ac1, 31 \n\t"
219 "extp %[step1_26], $ac3, 31 \n\t"
220 "add %[step1_20], %[temp0], %[temp1] \n\t"
221 "add %[step1_27], %[temp2], %[temp3] \n\t"
222
223 : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
224 [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
225 [temp2] "=&r"(temp2), [temp3] "=&r"(temp3),
226 [step1_20] "=&r"(step1_20), [step1_21] "=&r"(step1_21),
227 [step1_26] "=&r"(step1_26), [step1_27] "=&r"(step1_27)
228 : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
229 [cospi_27_64] "r"(cospi_27_64), [cospi_5_64] "r"(cospi_5_64),
230 [cospi_11_64] "r"(cospi_11_64), [cospi_21_64] "r"(cospi_21_64),
231 [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64));
232
233 __asm__ __volatile__(
234 "lh %[load1], 26(%[input]) \n\t"
235 "lh %[load2], 38(%[input]) \n\t"
236 "lh %[load3], 58(%[input]) \n\t"
237 "lh %[load4], 6(%[input]) \n\t"
238
239 "mtlo %[const_2_power_13], $ac1 \n\t"
240 "mthi $zero, $ac1 \n\t"
241 "mtlo %[const_2_power_13], $ac3 \n\t"
242 "mthi $zero, $ac3 \n\t"
243
244 "madd $ac1, %[load1], %[cospi_19_64] \n\t"
245 "msub $ac1, %[load2], %[cospi_13_64] \n\t"
246 "extp %[temp0], $ac1, 31 \n\t"
247 "madd $ac3, %[load1], %[cospi_13_64] \n\t"
248 "madd $ac3, %[load2], %[cospi_19_64] \n\t"
249 "extp %[temp3], $ac3, 31 \n\t"
250
251 "mtlo %[const_2_power_13], $ac1 \n\t"
252 "mthi $zero, $ac1 \n\t"
253 "mtlo %[const_2_power_13], $ac2 \n\t"
254 "mthi $zero, $ac2 \n\t"
255
256 "madd $ac2, %[load3], %[cospi_3_64] \n\t"
257 "msub $ac2, %[load4], %[cospi_29_64] \n\t"
258 "extp %[temp1], $ac2, 31 \n\t"
259 "madd $ac1, %[load3], %[cospi_29_64] \n\t"
260 "madd $ac1, %[load4], %[cospi_3_64] \n\t"
261 "extp %[temp2], $ac1, 31 \n\t"
262
263 "mtlo %[const_2_power_13], $ac1 \n\t"
264 "mthi $zero, $ac1 \n\t"
265 "mtlo %[const_2_power_13], $ac3 \n\t"
266 "mthi $zero, $ac3 \n\t"
267
268 "sub %[load1], %[temp1], %[temp0] \n\t"
269 "sub %[load2], %[temp2], %[temp3] \n\t"
270 "msub $ac1, %[load1], %[cospi_12_64] \n\t"
271 "msub $ac1, %[load2], %[cospi_20_64] \n\t"
272 "msub $ac3, %[load1], %[cospi_20_64] \n\t"
273 "madd $ac3, %[load2], %[cospi_12_64] \n\t"
274 "extp %[step1_22], $ac1, 31 \n\t"
275 "extp %[step1_25], $ac3, 31 \n\t"
276 "add %[step1_23], %[temp0], %[temp1] \n\t"
277 "add %[step1_24], %[temp2], %[temp3] \n\t"
278
279 : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
280 [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
281 [temp2] "=&r"(temp2), [temp3] "=&r"(temp3),
282 [step1_22] "=&r"(step1_22), [step1_23] "=&r"(step1_23),
283 [step1_24] "=&r"(step1_24), [step1_25] "=&r"(step1_25)
284 : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
285 [cospi_19_64] "r"(cospi_19_64), [cospi_13_64] "r"(cospi_13_64),
286 [cospi_3_64] "r"(cospi_3_64), [cospi_29_64] "r"(cospi_29_64),
287 [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64));
288
289 __asm__ __volatile__(
290 "lh %[load1], 4(%[input]) \n\t"
291 "lh %[load2], 60(%[input]) \n\t"
292 "lh %[load3], 36(%[input]) \n\t"
293 "lh %[load4], 28(%[input]) \n\t"
294
295 "mtlo %[const_2_power_13], $ac1 \n\t"
296 "mthi $zero, $ac1 \n\t"
297 "mtlo %[const_2_power_13], $ac3 \n\t"
298 "mthi $zero, $ac3 \n\t"
299
300 "madd $ac1, %[load1], %[cospi_30_64] \n\t"
301 "msub $ac1, %[load2], %[cospi_2_64] \n\t"
302 "extp %[temp0], $ac1, 31 \n\t"
303 "madd $ac3, %[load1], %[cospi_2_64] \n\t"
304 "madd $ac3, %[load2], %[cospi_30_64] \n\t"
305 "extp %[temp3], $ac3, 31 \n\t"
306
307 "mtlo %[const_2_power_13], $ac1 \n\t"
308 "mthi $zero, $ac1 \n\t"
309 "mtlo %[const_2_power_13], $ac2 \n\t"
310 "mthi $zero, $ac2 \n\t"
311
312 "madd $ac2, %[load3], %[cospi_14_64] \n\t"
313 "msub $ac2, %[load4], %[cospi_18_64] \n\t"
314 "extp %[temp1], $ac2, 31 \n\t"
315 "madd $ac1, %[load3], %[cospi_18_64] \n\t"
316 "madd $ac1, %[load4], %[cospi_14_64] \n\t"
317 "extp %[temp2], $ac1, 31 \n\t"
318
319 "mtlo %[const_2_power_13], $ac1 \n\t"
320 "mthi $zero, $ac1 \n\t"
321 "mtlo %[const_2_power_13], $ac3 \n\t"
322 "mthi $zero, $ac3 \n\t"
323
324 "sub %[load1], %[temp0], %[temp1] \n\t"
325 "sub %[load2], %[temp3], %[temp2] \n\t"
326 "msub $ac1, %[load1], %[cospi_8_64] \n\t"
327 "madd $ac1, %[load2], %[cospi_24_64] \n\t"
328 "madd $ac3, %[load1], %[cospi_24_64] \n\t"
329 "madd $ac3, %[load2], %[cospi_8_64] \n\t"
330 "extp %[step2_9], $ac1, 31 \n\t"
331 "extp %[step2_14], $ac3, 31 \n\t"
332 "add %[step2_8], %[temp0], %[temp1] \n\t"
333 "add %[step2_15], %[temp2], %[temp3] \n\t"
334
335 : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
336 [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
337 [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_8] "=&r"(step2_8),
338 [step2_9] "=&r"(step2_9), [step2_14] "=&r"(step2_14),
339 [step2_15] "=&r"(step2_15)
340 : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
341 [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64),
342 [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64),
343 [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64));
344
345 __asm__ __volatile__(
346 "lh %[load1], 20(%[input]) \n\t"
347 "lh %[load2], 44(%[input]) \n\t"
348 "lh %[load3], 52(%[input]) \n\t"
349 "lh %[load4], 12(%[input]) \n\t"
350
351 "mtlo %[const_2_power_13], $ac1 \n\t"
352 "mthi $zero, $ac1 \n\t"
353 "mtlo %[const_2_power_13], $ac3 \n\t"
354 "mthi $zero, $ac3 \n\t"
355
356 "madd $ac1, %[load1], %[cospi_22_64] \n\t"
357 "msub $ac1, %[load2], %[cospi_10_64] \n\t"
358 "extp %[temp0], $ac1, 31 \n\t"
359 "madd $ac3, %[load1], %[cospi_10_64] \n\t"
360 "madd $ac3, %[load2], %[cospi_22_64] \n\t"
361 "extp %[temp3], $ac3, 31 \n\t"
362
363 "mtlo %[const_2_power_13], $ac1 \n\t"
364 "mthi $zero, $ac1 \n\t"
365 "mtlo %[const_2_power_13], $ac2 \n\t"
366 "mthi $zero, $ac2 \n\t"
367
368 "madd $ac2, %[load3], %[cospi_6_64] \n\t"
369 "msub $ac2, %[load4], %[cospi_26_64] \n\t"
370 "extp %[temp1], $ac2, 31 \n\t"
371 "madd $ac1, %[load3], %[cospi_26_64] \n\t"
372 "madd $ac1, %[load4], %[cospi_6_64] \n\t"
373 "extp %[temp2], $ac1, 31 \n\t"
374
375 "mtlo %[const_2_power_13], $ac1 \n\t"
376 "mthi $zero, $ac1 \n\t"
377 "mtlo %[const_2_power_13], $ac3 \n\t"
378 "mthi $zero, $ac3 \n\t"
379
380 "sub %[load1], %[temp1], %[temp0] \n\t"
381 "sub %[load2], %[temp2], %[temp3] \n\t"
382 "msub $ac1, %[load1], %[cospi_24_64] \n\t"
383 "msub $ac1, %[load2], %[cospi_8_64] \n\t"
384 "madd $ac3, %[load2], %[cospi_24_64] \n\t"
385 "msub $ac3, %[load1], %[cospi_8_64] \n\t"
386 "extp %[step2_10], $ac1, 31 \n\t"
387 "extp %[step2_13], $ac3, 31 \n\t"
388 "add %[step2_11], %[temp0], %[temp1] \n\t"
389 "add %[step2_12], %[temp2], %[temp3] \n\t"
390
391 : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
392 [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
393 [temp2] "=&r"(temp2), [temp3] "=&r"(temp3),
394 [step2_10] "=&r"(step2_10), [step2_11] "=&r"(step2_11),
395 [step2_12] "=&r"(step2_12), [step2_13] "=&r"(step2_13)
396 : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
397 [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64),
398 [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64),
399 [cospi_8_64] "r"(cospi_8_64), [cospi_24_64] "r"(cospi_24_64));
400
401 __asm__ __volatile__(
402 "mtlo %[const_2_power_13], $ac0 \n\t"
403 "mthi $zero, $ac0 \n\t"
404 "sub %[temp0], %[step2_14], %[step2_13] \n\t"
405 "sub %[temp0], %[temp0], %[step2_9] \n\t"
406 "add %[temp0], %[temp0], %[step2_10] \n\t"
407 "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
408 "mtlo %[const_2_power_13], $ac1 \n\t"
409 "mthi $zero, $ac1 \n\t"
410 "sub %[temp1], %[step2_14], %[step2_13] \n\t"
411 "add %[temp1], %[temp1], %[step2_9] \n\t"
412 "sub %[temp1], %[temp1], %[step2_10] \n\t"
413 "madd $ac1, %[temp1], %[cospi_16_64] \n\t"
414 "mtlo %[const_2_power_13], $ac2 \n\t"
415 "mthi $zero, $ac2 \n\t"
416 "sub %[temp0], %[step2_15], %[step2_12] \n\t"
417 "sub %[temp0], %[temp0], %[step2_8] \n\t"
418 "add %[temp0], %[temp0], %[step2_11] \n\t"
419 "madd $ac2, %[temp0], %[cospi_16_64] \n\t"
420 "mtlo %[const_2_power_13], $ac3 \n\t"
421 "mthi $zero, $ac3 \n\t"
422 "sub %[temp1], %[step2_15], %[step2_12] \n\t"
423 "add %[temp1], %[temp1], %[step2_8] \n\t"
424 "sub %[temp1], %[temp1], %[step2_11] \n\t"
425 "madd $ac3, %[temp1], %[cospi_16_64] \n\t"
426
427 "add %[step3_8], %[step2_8], %[step2_11] \n\t"
428 "add %[step3_9], %[step2_9], %[step2_10] \n\t"
429 "add %[step3_14], %[step2_13], %[step2_14] \n\t"
430 "add %[step3_15], %[step2_12], %[step2_15] \n\t"
431 "extp %[step3_10], $ac0, 31 \n\t"
432 "extp %[step3_13], $ac1, 31 \n\t"
433 "extp %[step3_11], $ac2, 31 \n\t"
434 "extp %[step3_12], $ac3, 31 \n\t"
435
436 : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [step3_8] "=&r"(step3_8),
437 [step3_9] "=&r"(step3_9), [step3_10] "=&r"(step3_10),
438 [step3_11] "=&r"(step3_11), [step3_12] "=&r"(step3_12),
439 [step3_13] "=&r"(step3_13), [step3_14] "=&r"(step3_14),
440 [step3_15] "=&r"(step3_15)
441 : [const_2_power_13] "r"(const_2_power_13), [step2_8] "r"(step2_8),
442 [step2_9] "r"(step2_9), [step2_10] "r"(step2_10),
443 [step2_11] "r"(step2_11), [step2_12] "r"(step2_12),
444 [step2_13] "r"(step2_13), [step2_14] "r"(step2_14),
445 [step2_15] "r"(step2_15), [cospi_16_64] "r"(cospi_16_64));
446
447 __asm__ __volatile__(
448 "mtlo %[const_2_power_13], $ac0 \n\t"
449 "mthi $zero, $ac0 \n\t"
450 "mtlo %[const_2_power_13], $ac1 \n\t"
451 "mthi $zero, $ac1 \n\t"
452 "sub %[temp0], %[step1_17], %[step1_18] \n\t"
453 "sub %[temp1], %[step1_30], %[step1_29] \n\t"
454 "add %[step3_17], %[step1_17], %[step1_18] \n\t"
455 "add %[step3_30], %[step1_30], %[step1_29] \n\t"
456
457 "msub $ac0, %[temp0], %[cospi_8_64] \n\t"
458 "madd $ac0, %[temp1], %[cospi_24_64] \n\t"
459 "extp %[step3_18], $ac0, 31 \n\t"
460 "madd $ac1, %[temp0], %[cospi_24_64] \n\t"
461 "madd $ac1, %[temp1], %[cospi_8_64] \n\t"
462 "extp %[step3_29], $ac1, 31 \n\t"
463
464 : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
465 [step3_18] "=&r"(step3_18), [step3_29] "=&r"(step3_29),
466 [step3_17] "=&r"(step3_17), [step3_30] "=&r"(step3_30)
467 : [const_2_power_13] "r"(const_2_power_13), [step1_17] "r"(step1_17),
468 [step1_18] "r"(step1_18), [step1_30] "r"(step1_30),
469 [step1_29] "r"(step1_29), [cospi_24_64] "r"(cospi_24_64),
470 [cospi_8_64] "r"(cospi_8_64));
471
472 __asm__ __volatile__(
473 "mtlo %[const_2_power_13], $ac0 \n\t"
474 "mthi $zero, $ac0 \n\t"
475 "mtlo %[const_2_power_13], $ac1 \n\t"
476 "mthi $zero, $ac1 \n\t"
477 "sub %[temp0], %[step1_16], %[step1_19] \n\t"
478 "sub %[temp1], %[step1_31], %[step1_28] \n\t"
479 "add %[step3_16], %[step1_16], %[step1_19] \n\t"
480 "add %[step3_31], %[step1_31], %[step1_28] \n\t"
481
482 "msub $ac0, %[temp0], %[cospi_8_64] \n\t"
483 "madd $ac0, %[temp1], %[cospi_24_64] \n\t"
484 "extp %[step3_19], $ac0, 31 \n\t"
485 "madd $ac1, %[temp0], %[cospi_24_64] \n\t"
486 "madd $ac1, %[temp1], %[cospi_8_64] \n\t"
487 "extp %[step3_28], $ac1, 31 \n\t"
488
489 : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
490 [step3_16] "=&r"(step3_16), [step3_31] "=&r"(step3_31),
491 [step3_19] "=&r"(step3_19), [step3_28] "=&r"(step3_28)
492 : [const_2_power_13] "r"(const_2_power_13), [step1_16] "r"(step1_16),
493 [step1_19] "r"(step1_19), [step1_31] "r"(step1_31),
494 [step1_28] "r"(step1_28), [cospi_24_64] "r"(cospi_24_64),
495 [cospi_8_64] "r"(cospi_8_64));
496
497 __asm__ __volatile__(
498 "mtlo %[const_2_power_13], $ac0 \n\t"
499 "mthi $zero, $ac0 \n\t"
500 "mtlo %[const_2_power_13], $ac1 \n\t"
501 "mthi $zero, $ac1 \n\t"
502 "sub %[temp0], %[step1_23], %[step1_20] \n\t"
503 "sub %[temp1], %[step1_24], %[step1_27] \n\t"
504 "add %[step3_23], %[step1_23], %[step1_20] \n\t"
505 "add %[step3_24], %[step1_24], %[step1_27] \n\t"
506
507 "msub $ac0, %[temp0], %[cospi_8_64] \n\t"
508 "madd $ac0, %[temp1], %[cospi_24_64] \n\t"
509 "extp %[step3_27], $ac0, 31 \n\t"
510 "msub $ac1, %[temp0], %[cospi_24_64] \n\t"
511 "msub $ac1, %[temp1], %[cospi_8_64] \n\t"
512 "extp %[step3_20], $ac1, 31 \n\t"
513
514 : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
515 [step3_23] "=&r"(step3_23), [step3_24] "=&r"(step3_24),
516 [step3_20] "=&r"(step3_20), [step3_27] "=&r"(step3_27)
517 : [const_2_power_13] "r"(const_2_power_13), [step1_23] "r"(step1_23),
518 [step1_20] "r"(step1_20), [step1_24] "r"(step1_24),
519 [step1_27] "r"(step1_27), [cospi_24_64] "r"(cospi_24_64),
520 [cospi_8_64] "r"(cospi_8_64));
521
522 __asm__ __volatile__(
523 "mtlo %[const_2_power_13], $ac0 \n\t"
524 "mthi $zero, $ac0 \n\t"
525 "mtlo %[const_2_power_13], $ac1 \n\t"
526 "mthi $zero, $ac1 \n\t"
527 "sub %[temp0], %[step1_22], %[step1_21] \n\t"
528 "sub %[temp1], %[step1_25], %[step1_26] \n\t"
529 "add %[step3_22], %[step1_22], %[step1_21] \n\t"
530 "add %[step3_25], %[step1_25], %[step1_26] \n\t"
531
532 "msub $ac0, %[temp0], %[cospi_24_64] \n\t"
533 "msub $ac0, %[temp1], %[cospi_8_64] \n\t"
534 "extp %[step3_21], $ac0, 31 \n\t"
535 "msub $ac1, %[temp0], %[cospi_8_64] \n\t"
536 "madd $ac1, %[temp1], %[cospi_24_64] \n\t"
537 "extp %[step3_26], $ac1, 31 \n\t"
538
539 : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
540 [step3_22] "=&r"(step3_22), [step3_25] "=&r"(step3_25),
541 [step3_21] "=&r"(step3_21), [step3_26] "=&r"(step3_26)
542 : [const_2_power_13] "r"(const_2_power_13), [step1_22] "r"(step1_22),
543 [step1_21] "r"(step1_21), [step1_25] "r"(step1_25),
544 [step1_26] "r"(step1_26), [cospi_24_64] "r"(cospi_24_64),
545 [cospi_8_64] "r"(cospi_8_64));
546
547 __asm__ __volatile__(
548 "add %[step2_16], %[step3_16], %[step3_23] \n\t"
549 "add %[step2_17], %[step3_17], %[step3_22] \n\t"
550 "add %[step2_18], %[step3_18], %[step3_21] \n\t"
551 "add %[step2_19], %[step3_19], %[step3_20] \n\t"
552 "sub %[step2_20], %[step3_19], %[step3_20] \n\t"
553 "sub %[step2_21], %[step3_18], %[step3_21] \n\t"
554 "sub %[step2_22], %[step3_17], %[step3_22] \n\t"
555 "sub %[step2_23], %[step3_16], %[step3_23] \n\t"
556
557 : [step2_16] "=&r"(step2_16), [step2_17] "=&r"(step2_17),
558 [step2_18] "=&r"(step2_18), [step2_19] "=&r"(step2_19),
559 [step2_20] "=&r"(step2_20), [step2_21] "=&r"(step2_21),
560 [step2_22] "=&r"(step2_22), [step2_23] "=&r"(step2_23)
561 : [step3_16] "r"(step3_16), [step3_23] "r"(step3_23),
562 [step3_17] "r"(step3_17), [step3_22] "r"(step3_22),
563 [step3_18] "r"(step3_18), [step3_21] "r"(step3_21),
564 [step3_19] "r"(step3_19), [step3_20] "r"(step3_20));
565
566 __asm__ __volatile__(
567 "sub %[step2_24], %[step3_31], %[step3_24] \n\t"
568 "sub %[step2_25], %[step3_30], %[step3_25] \n\t"
569 "sub %[step2_26], %[step3_29], %[step3_26] \n\t"
570 "sub %[step2_27], %[step3_28], %[step3_27] \n\t"
571 "add %[step2_28], %[step3_28], %[step3_27] \n\t"
572 "add %[step2_29], %[step3_29], %[step3_26] \n\t"
573 "add %[step2_30], %[step3_30], %[step3_25] \n\t"
574 "add %[step2_31], %[step3_31], %[step3_24] \n\t"
575
576 : [step2_24] "=&r"(step2_24), [step2_28] "=&r"(step2_28),
577 [step2_25] "=&r"(step2_25), [step2_29] "=&r"(step2_29),
578 [step2_26] "=&r"(step2_26), [step2_30] "=&r"(step2_30),
579 [step2_27] "=&r"(step2_27), [step2_31] "=&r"(step2_31)
580 : [step3_31] "r"(step3_31), [step3_24] "r"(step3_24),
581 [step3_30] "r"(step3_30), [step3_25] "r"(step3_25),
582 [step3_29] "r"(step3_29), [step3_26] "r"(step3_26),
583 [step3_28] "r"(step3_28), [step3_27] "r"(step3_27));
584
585 __asm__ __volatile__(
586 "lh %[load1], 0(%[input]) \n\t"
587 "lh %[load2], 32(%[input]) \n\t"
588 "lh %[load3], 16(%[input]) \n\t"
589 "lh %[load4], 48(%[input]) \n\t"
590
591 "mtlo %[const_2_power_13], $ac1 \n\t"
592 "mthi $zero, $ac1 \n\t"
593 "mtlo %[const_2_power_13], $ac2 \n\t"
594 "mthi $zero, $ac2 \n\t"
595 "add %[result1], %[load1], %[load2] \n\t"
596 "sub %[result2], %[load1], %[load2] \n\t"
597 "madd $ac1, %[result1], %[cospi_16_64] \n\t"
598 "madd $ac2, %[result2], %[cospi_16_64] \n\t"
599 "extp %[temp0], $ac1, 31 \n\t"
600 "extp %[temp1], $ac2, 31 \n\t"
601
602 "mtlo %[const_2_power_13], $ac3 \n\t"
603 "mthi $zero, $ac3 \n\t"
604 "madd $ac3, %[load3], %[cospi_24_64] \n\t"
605 "msub $ac3, %[load4], %[cospi_8_64] \n\t"
606 "extp %[temp2], $ac3, 31 \n\t"
607 "mtlo %[const_2_power_13], $ac1 \n\t"
608 "mthi $zero, $ac1 \n\t"
609 "madd $ac1, %[load3], %[cospi_8_64] \n\t"
610 "madd $ac1, %[load4], %[cospi_24_64] \n\t"
611 "extp %[temp3], $ac1, 31 \n\t"
612 "add %[step1_0], %[temp0], %[temp3] \n\t"
613 "add %[step1_1], %[temp1], %[temp2] \n\t"
614 "sub %[step1_2], %[temp1], %[temp2] \n\t"
615 "sub %[step1_3], %[temp0], %[temp3] \n\t"
616
617 : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
618 [load4] "=&r"(load4), [result1] "=&r"(result1),
619 [result2] "=&r"(result2), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
620 [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_0] "=&r"(step1_0),
621 [step1_1] "=&r"(step1_1), [step1_2] "=&r"(step1_2),
622 [step1_3] "=&r"(step1_3)
623 : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
624 [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64),
625 [cospi_16_64] "r"(cospi_16_64));
626
627 __asm__ __volatile__(
628 "lh %[load1], 8(%[input]) \n\t"
629 "lh %[load2], 56(%[input]) \n\t"
630 "lh %[load3], 40(%[input]) \n\t"
631 "lh %[load4], 24(%[input]) \n\t"
632
633 "mtlo %[const_2_power_13], $ac1 \n\t"
634 "mthi $zero, $ac1 \n\t"
635 "mtlo %[const_2_power_13], $ac3 \n\t"
636 "mthi $zero, $ac3 \n\t"
637
638 "madd $ac1, %[load1], %[cospi_28_64] \n\t"
639 "msub $ac1, %[load2], %[cospi_4_64] \n\t"
640 "extp %[temp0], $ac1, 31 \n\t"
641 "madd $ac3, %[load1], %[cospi_4_64] \n\t"
642 "madd $ac3, %[load2], %[cospi_28_64] \n\t"
643 "extp %[temp3], $ac3, 31 \n\t"
644
645 "mtlo %[const_2_power_13], $ac1 \n\t"
646 "mthi $zero, $ac1 \n\t"
647 "mtlo %[const_2_power_13], $ac2 \n\t"
648 "mthi $zero, $ac2 \n\t"
649
650 "madd $ac2, %[load3], %[cospi_12_64] \n\t"
651 "msub $ac2, %[load4], %[cospi_20_64] \n\t"
652 "extp %[temp1], $ac2, 31 \n\t"
653 "madd $ac1, %[load3], %[cospi_20_64] \n\t"
654 "madd $ac1, %[load4], %[cospi_12_64] \n\t"
655 "extp %[temp2], $ac1, 31 \n\t"
656
657 "mtlo %[const_2_power_13], $ac1 \n\t"
658 "mthi $zero, $ac1 \n\t"
659 "mtlo %[const_2_power_13], $ac3 \n\t"
660 "mthi $zero, $ac3 \n\t"
661
662 "sub %[load1], %[temp3], %[temp2] \n\t"
663 "sub %[load1], %[load1], %[temp0] \n\t"
664 "add %[load1], %[load1], %[temp1] \n\t"
665 "sub %[load2], %[temp0], %[temp1] \n\t"
666 "sub %[load2], %[load2], %[temp2] \n\t"
667 "add %[load2], %[load2], %[temp3] \n\t"
668 "madd $ac1, %[load1], %[cospi_16_64] \n\t"
669 "madd $ac3, %[load2], %[cospi_16_64] \n\t"
670
671 "extp %[step1_5], $ac1, 31 \n\t"
672 "extp %[step1_6], $ac3, 31 \n\t"
673 "add %[step1_4], %[temp0], %[temp1] \n\t"
674 "add %[step1_7], %[temp3], %[temp2] \n\t"
675
676 : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3),
677 [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1),
678 [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_4] "=&r"(step1_4),
679 [step1_5] "=&r"(step1_5), [step1_6] "=&r"(step1_6),
680 [step1_7] "=&r"(step1_7)
681 : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input),
682 [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64),
683 [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64),
684 [cospi_16_64] "r"(cospi_16_64));
685
686 __asm__ __volatile__(
687 "add %[step2_0], %[step1_0], %[step1_7] \n\t"
688 "add %[step2_1], %[step1_1], %[step1_6] \n\t"
689 "add %[step2_2], %[step1_2], %[step1_5] \n\t"
690 "add %[step2_3], %[step1_3], %[step1_4] \n\t"
691 "sub %[step2_4], %[step1_3], %[step1_4] \n\t"
692 "sub %[step2_5], %[step1_2], %[step1_5] \n\t"
693 "sub %[step2_6], %[step1_1], %[step1_6] \n\t"
694 "sub %[step2_7], %[step1_0], %[step1_7] \n\t"
695
696 : [step2_0] "=&r"(step2_0), [step2_4] "=&r"(step2_4),
697 [step2_1] "=&r"(step2_1), [step2_5] "=&r"(step2_5),
698 [step2_2] "=&r"(step2_2), [step2_6] "=&r"(step2_6),
699 [step2_3] "=&r"(step2_3), [step2_7] "=&r"(step2_7)
700 : [step1_0] "r"(step1_0), [step1_7] "r"(step1_7),
701 [step1_1] "r"(step1_1), [step1_6] "r"(step1_6),
702 [step1_2] "r"(step1_2), [step1_5] "r"(step1_5),
703 [step1_3] "r"(step1_3), [step1_4] "r"(step1_4));
704
705 // stage 7
706 __asm__ __volatile__(
707 "add %[step1_0], %[step2_0], %[step3_15] \n\t"
708 "add %[step1_1], %[step2_1], %[step3_14] \n\t"
709 "add %[step1_2], %[step2_2], %[step3_13] \n\t"
710 "add %[step1_3], %[step2_3], %[step3_12] \n\t"
711 "sub %[step1_12], %[step2_3], %[step3_12] \n\t"
712 "sub %[step1_13], %[step2_2], %[step3_13] \n\t"
713 "sub %[step1_14], %[step2_1], %[step3_14] \n\t"
714 "sub %[step1_15], %[step2_0], %[step3_15] \n\t"
715
716 : [step1_0] "=&r"(step1_0), [step1_12] "=&r"(step1_12),
717 [step1_1] "=&r"(step1_1), [step1_13] "=&r"(step1_13),
718 [step1_2] "=&r"(step1_2), [step1_14] "=&r"(step1_14),
719 [step1_3] "=&r"(step1_3), [step1_15] "=&r"(step1_15)
720 : [step2_0] "r"(step2_0), [step3_15] "r"(step3_15),
721 [step2_1] "r"(step2_1), [step3_14] "r"(step3_14),
722 [step2_2] "r"(step2_2), [step3_13] "r"(step3_13),
723 [step2_3] "r"(step2_3), [step3_12] "r"(step3_12));
724
725 __asm__ __volatile__(
726 "add %[step1_4], %[step2_4], %[step3_11] \n\t"
727 "add %[step1_5], %[step2_5], %[step3_10] \n\t"
728 "add %[step1_6], %[step2_6], %[step3_9] \n\t"
729 "add %[step1_7], %[step2_7], %[step3_8] \n\t"
730 "sub %[step1_8], %[step2_7], %[step3_8] \n\t"
731 "sub %[step1_9], %[step2_6], %[step3_9] \n\t"
732 "sub %[step1_10], %[step2_5], %[step3_10] \n\t"
733 "sub %[step1_11], %[step2_4], %[step3_11] \n\t"
734
735 : [step1_4] "=&r"(step1_4), [step1_8] "=&r"(step1_8),
736 [step1_5] "=&r"(step1_5), [step1_9] "=&r"(step1_9),
737 [step1_6] "=&r"(step1_6), [step1_10] "=&r"(step1_10),
738 [step1_7] "=&r"(step1_7), [step1_11] "=&r"(step1_11)
739 : [step2_4] "r"(step2_4), [step3_11] "r"(step3_11),
740 [step2_5] "r"(step2_5), [step3_10] "r"(step3_10),
741 [step2_6] "r"(step2_6), [step3_9] "r"(step3_9),
742 [step2_7] "r"(step2_7), [step3_8] "r"(step3_8));
743
744 __asm__ __volatile__(
745 "sub %[temp0], %[step2_27], %[step2_20] \n\t"
746 "add %[temp1], %[step2_27], %[step2_20] \n\t"
747 "sub %[temp2], %[step2_26], %[step2_21] \n\t"
748 "add %[temp3], %[step2_26], %[step2_21] \n\t"
749
750 "mtlo %[const_2_power_13], $ac0 \n\t"
751 "mthi $zero, $ac0 \n\t"
752 "mtlo %[const_2_power_13], $ac1 \n\t"
753 "mthi $zero, $ac1 \n\t"
754 "mtlo %[const_2_power_13], $ac2 \n\t"
755 "mthi $zero, $ac2 \n\t"
756 "mtlo %[const_2_power_13], $ac3 \n\t"
757 "mthi $zero, $ac3 \n\t"
758
759 "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
760 "madd $ac1, %[temp1], %[cospi_16_64] \n\t"
761 "madd $ac2, %[temp2], %[cospi_16_64] \n\t"
762 "madd $ac3, %[temp3], %[cospi_16_64] \n\t"
763
764 "extp %[step1_20], $ac0, 31 \n\t"
765 "extp %[step1_27], $ac1, 31 \n\t"
766 "extp %[step1_21], $ac2, 31 \n\t"
767 "extp %[step1_26], $ac3, 31 \n\t"
768
769 : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
770 [temp3] "=&r"(temp3), [step1_20] "=&r"(step1_20),
771 [step1_27] "=&r"(step1_27), [step1_21] "=&r"(step1_21),
772 [step1_26] "=&r"(step1_26)
773 : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20),
774 [step2_27] "r"(step2_27), [step2_21] "r"(step2_21),
775 [step2_26] "r"(step2_26), [cospi_16_64] "r"(cospi_16_64));
776
777 __asm__ __volatile__(
778 "sub %[temp0], %[step2_25], %[step2_22] \n\t"
779 "add %[temp1], %[step2_25], %[step2_22] \n\t"
780 "sub %[temp2], %[step2_24], %[step2_23] \n\t"
781 "add %[temp3], %[step2_24], %[step2_23] \n\t"
782
783 "mtlo %[const_2_power_13], $ac0 \n\t"
784 "mthi $zero, $ac0 \n\t"
785 "mtlo %[const_2_power_13], $ac1 \n\t"
786 "mthi $zero, $ac1 \n\t"
787 "mtlo %[const_2_power_13], $ac2 \n\t"
788 "mthi $zero, $ac2 \n\t"
789 "mtlo %[const_2_power_13], $ac3 \n\t"
790 "mthi $zero, $ac3 \n\t"
791
792 "madd $ac0, %[temp0], %[cospi_16_64] \n\t"
793 "madd $ac1, %[temp1], %[cospi_16_64] \n\t"
794 "madd $ac2, %[temp2], %[cospi_16_64] \n\t"
795 "madd $ac3, %[temp3], %[cospi_16_64] \n\t"
796
797 "extp %[step1_22], $ac0, 31 \n\t"
798 "extp %[step1_25], $ac1, 31 \n\t"
799 "extp %[step1_23], $ac2, 31 \n\t"
800 "extp %[step1_24], $ac3, 31 \n\t"
801
802 : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
803 [temp3] "=&r"(temp3), [step1_22] "=&r"(step1_22),
804 [step1_25] "=&r"(step1_25), [step1_23] "=&r"(step1_23),
805 [step1_24] "=&r"(step1_24)
806 : [const_2_power_13] "r"(const_2_power_13), [step2_22] "r"(step2_22),
807 [step2_25] "r"(step2_25), [step2_23] "r"(step2_23),
808 [step2_24] "r"(step2_24), [cospi_16_64] "r"(cospi_16_64));
809
810 __asm__ __volatile__(
811 "lbu %[temp2], 0(%[dest_pix]) \n\t"
812 "add %[temp0], %[step1_0], %[step2_31] \n\t"
813 "addi %[temp0], %[temp0], 32 \n\t"
814 "sra %[temp0], %[temp0], 6 \n\t"
815 "add %[temp2], %[temp2], %[temp0] \n\t"
816 "lbux %[temp0], %[temp2](%[cm]) \n\t"
817 "add %[temp1], %[step1_1], %[step2_30] \n\t"
818 "sb %[temp0], 0(%[dest_pix]) \n\t"
819 "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
820 "lbu %[temp3], 0(%[dest_pix]) \n\t"
821 "addi %[temp1], %[temp1], 32 \n\t"
822 "sra %[temp1], %[temp1], 6 \n\t"
823 "add %[temp3], %[temp3], %[temp1] \n\t"
824 "lbux %[temp1], %[temp3](%[cm]) \n\t"
825 "sb %[temp1], 0(%[dest_pix]) \n\t"
826 "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
827
828 "lbu %[temp2], 0(%[dest_pix]) \n\t"
829 "add %[temp0], %[step1_2], %[step2_29] \n\t"
830 "addi %[temp0], %[temp0], 32 \n\t"
831 "sra %[temp0], %[temp0], 6 \n\t"
832 "add %[temp2], %[temp2], %[temp0] \n\t"
833 "lbux %[temp0], %[temp2](%[cm]) \n\t"
834 "add %[temp1], %[step1_3], %[step2_28] \n\t"
835 "sb %[temp0], 0(%[dest_pix]) \n\t"
836 "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
837 "lbu %[temp3], 0(%[dest_pix]) \n\t"
838 "addi %[temp1], %[temp1], 32 \n\t"
839 "sra %[temp1], %[temp1], 6 \n\t"
840 "add %[temp3], %[temp3], %[temp1] \n\t"
841 "lbux %[temp1], %[temp3](%[cm]) \n\t"
842 "sb %[temp1], 0(%[dest_pix]) \n\t"
843 "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
844
845 : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
846 [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix)
847 : [cm] "r"(cm), [stride] "r"(stride), [step1_0] "r"(step1_0),
848 [step1_1] "r"(step1_1), [step1_2] "r"(step1_2),
849 [step1_3] "r"(step1_3), [step2_28] "r"(step2_28),
850 [step2_29] "r"(step2_29), [step2_30] "r"(step2_30),
851 [step2_31] "r"(step2_31));
852
853 step3_12 = ROUND_POWER_OF_TWO((step1_3 - step2_28), 6);
854 step3_13 = ROUND_POWER_OF_TWO((step1_2 - step2_29), 6);
855 step3_14 = ROUND_POWER_OF_TWO((step1_1 - step2_30), 6);
856 step3_15 = ROUND_POWER_OF_TWO((step1_0 - step2_31), 6);
857
858 __asm__ __volatile__(
859 "lbu %[temp2], 0(%[dest_pix1]) \n\t"
860 "add %[temp2], %[temp2], %[step3_15] \n\t"
861 "lbux %[temp0], %[temp2](%[cm]) \n\t"
862 "sb %[temp0], 0(%[dest_pix1]) \n\t"
863 "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t"
864 "lbu %[temp3], 0(%[dest_pix1]) \n\t"
865 "add %[temp3], %[temp3], %[step3_14] \n\t"
866 "lbux %[temp1], %[temp3](%[cm]) \n\t"
867 "sb %[temp1], 0(%[dest_pix1]) \n\t"
868 "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t"
869
870 "lbu %[temp2], 0(%[dest_pix1]) \n\t"
871 "add %[temp2], %[temp2], %[step3_13] \n\t"
872 "lbux %[temp0], %[temp2](%[cm]) \n\t"
873 "sb %[temp0], 0(%[dest_pix1]) \n\t"
874 "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t"
875 "lbu %[temp3], 0(%[dest_pix1]) \n\t"
876 "add %[temp3], %[temp3], %[step3_12] \n\t"
877 "lbux %[temp1], %[temp3](%[cm]) \n\t"
878 "sb %[temp1], 0(%[dest_pix1]) \n\t"
879 "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t"
880
881 : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
882 [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1)
883 : [cm] "r"(cm), [stride] "r"(stride), [step3_12] "r"(step3_12),
884 [step3_13] "r"(step3_13), [step3_14] "r"(step3_14),
885 [step3_15] "r"(step3_15));
886
887 __asm__ __volatile__(
888 "lbu %[temp2], 0(%[dest_pix]) \n\t"
889 "add %[temp0], %[step1_4], %[step1_27] \n\t"
890 "addi %[temp0], %[temp0], 32 \n\t"
891 "sra %[temp0], %[temp0], 6 \n\t"
892 "add %[temp2], %[temp2], %[temp0] \n\t"
893 "lbux %[temp0], %[temp2](%[cm]) \n\t"
894 "add %[temp1], %[step1_5], %[step1_26] \n\t"
895 "sb %[temp0], 0(%[dest_pix]) \n\t"
896 "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
897 "lbu %[temp3], 0(%[dest_pix]) \n\t"
898 "addi %[temp1], %[temp1], 32 \n\t"
899 "sra %[temp1], %[temp1], 6 \n\t"
900 "add %[temp3], %[temp3], %[temp1] \n\t"
901 "lbux %[temp1], %[temp3](%[cm]) \n\t"
902 "sb %[temp1], 0(%[dest_pix]) \n\t"
903 "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
904
905 "lbu %[temp2], 0(%[dest_pix]) \n\t"
906 "add %[temp0], %[step1_6], %[step1_25] \n\t"
907 "addi %[temp0], %[temp0], 32 \n\t"
908 "sra %[temp0], %[temp0], 6 \n\t"
909 "add %[temp2], %[temp2], %[temp0] \n\t"
910 "lbux %[temp0], %[temp2](%[cm]) \n\t"
911 "add %[temp1], %[step1_7], %[step1_24] \n\t"
912 "sb %[temp0], 0(%[dest_pix]) \n\t"
913 "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
914 "lbu %[temp3], 0(%[dest_pix]) \n\t"
915 "addi %[temp1], %[temp1], 32 \n\t"
916 "sra %[temp1], %[temp1], 6 \n\t"
917 "add %[temp3], %[temp3], %[temp1] \n\t"
918 "lbux %[temp1], %[temp3](%[cm]) \n\t"
919 "sb %[temp1], 0(%[dest_pix]) \n\t"
920 "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
921
922 : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
923 [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix)
924 : [cm] "r"(cm), [stride] "r"(stride), [step1_4] "r"(step1_4),
925 [step1_5] "r"(step1_5), [step1_6] "r"(step1_6),
926 [step1_7] "r"(step1_7), [step1_24] "r"(step1_24),
927 [step1_25] "r"(step1_25), [step1_26] "r"(step1_26),
928 [step1_27] "r"(step1_27));
929
930 step3_12 = ROUND_POWER_OF_TWO((step1_7 - step1_24), 6);
931 step3_13 = ROUND_POWER_OF_TWO((step1_6 - step1_25), 6);
932 step3_14 = ROUND_POWER_OF_TWO((step1_5 - step1_26), 6);
933 step3_15 = ROUND_POWER_OF_TWO((step1_4 - step1_27), 6);
934
935 __asm__ __volatile__(
936 "lbu %[temp2], 0(%[dest_pix1]) \n\t"
937 "add %[temp2], %[temp2], %[step3_15] \n\t"
938 "lbux %[temp0], %[temp2](%[cm]) \n\t"
939 "sb %[temp0], 0(%[dest_pix1]) \n\t"
940 "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t"
941 "lbu %[temp3], 0(%[dest_pix1]) \n\t"
942 "add %[temp3], %[temp3], %[step3_14] \n\t"
943 "lbux %[temp1], %[temp3](%[cm]) \n\t"
944 "sb %[temp1], 0(%[dest_pix1]) \n\t"
945 "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t"
946
947 "lbu %[temp2], 0(%[dest_pix1]) \n\t"
948 "add %[temp2], %[temp2], %[step3_13] \n\t"
949 "lbux %[temp0], %[temp2](%[cm]) \n\t"
950 "sb %[temp0], 0(%[dest_pix1]) \n\t"
951 "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t"
952 "lbu %[temp3], 0(%[dest_pix1]) \n\t"
953 "add %[temp3], %[temp3], %[step3_12] \n\t"
954 "lbux %[temp1], %[temp3](%[cm]) \n\t"
955 "sb %[temp1], 0(%[dest_pix1]) \n\t"
956 "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t"
957
958 : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
959 [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1)
960 : [cm] "r"(cm), [stride] "r"(stride), [step3_12] "r"(step3_12),
961 [step3_13] "r"(step3_13), [step3_14] "r"(step3_14),
962 [step3_15] "r"(step3_15));
963
964 __asm__ __volatile__(
965 "lbu %[temp2], 0(%[dest_pix]) \n\t"
966 "add %[temp0], %[step1_8], %[step1_23] \n\t"
967 "addi %[temp0], %[temp0], 32 \n\t"
968 "sra %[temp0], %[temp0], 6 \n\t"
969 "add %[temp2], %[temp2], %[temp0] \n\t"
970 "lbux %[temp0], %[temp2](%[cm]) \n\t"
971 "add %[temp1], %[step1_9], %[step1_22] \n\t"
972 "sb %[temp0], 0(%[dest_pix]) \n\t"
973 "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
974 "lbu %[temp3], 0(%[dest_pix]) \n\t"
975 "addi %[temp1], %[temp1], 32 \n\t"
976 "sra %[temp1], %[temp1], 6 \n\t"
977 "add %[temp3], %[temp3], %[temp1] \n\t"
978 "lbux %[temp1], %[temp3](%[cm]) \n\t"
979 "sb %[temp1], 0(%[dest_pix]) \n\t"
980 "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
981
982 "lbu %[temp2], 0(%[dest_pix]) \n\t"
983 "add %[temp0], %[step1_10], %[step1_21] \n\t"
984 "addi %[temp0], %[temp0], 32 \n\t"
985 "sra %[temp0], %[temp0], 6 \n\t"
986 "add %[temp2], %[temp2], %[temp0] \n\t"
987 "lbux %[temp0], %[temp2](%[cm]) \n\t"
988 "add %[temp1], %[step1_11], %[step1_20] \n\t"
989 "sb %[temp0], 0(%[dest_pix]) \n\t"
990 "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
991 "lbu %[temp3], 0(%[dest_pix]) \n\t"
992 "addi %[temp1], %[temp1], 32 \n\t"
993 "sra %[temp1], %[temp1], 6 \n\t"
994 "add %[temp3], %[temp3], %[temp1] \n\t"
995 "lbux %[temp1], %[temp3](%[cm]) \n\t"
996 "sb %[temp1], 0(%[dest_pix]) \n\t"
997 "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
998
999 : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
1000 [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix)
1001 : [cm] "r"(cm), [stride] "r"(stride), [step1_8] "r"(step1_8),
1002 [step1_9] "r"(step1_9), [step1_10] "r"(step1_10),
1003 [step1_11] "r"(step1_11), [step1_20] "r"(step1_20),
1004 [step1_21] "r"(step1_21), [step1_22] "r"(step1_22),
1005 [step1_23] "r"(step1_23));
1006
1007 step3_12 = ROUND_POWER_OF_TWO((step1_11 - step1_20), 6);
1008 step3_13 = ROUND_POWER_OF_TWO((step1_10 - step1_21), 6);
1009 step3_14 = ROUND_POWER_OF_TWO((step1_9 - step1_22), 6);
1010 step3_15 = ROUND_POWER_OF_TWO((step1_8 - step1_23), 6);
1011
1012 __asm__ __volatile__(
1013 "lbu %[temp2], 0(%[dest_pix1]) \n\t"
1014 "add %[temp2], %[temp2], %[step3_15] \n\t"
1015 "lbux %[temp0], %[temp2](%[cm]) \n\t"
1016 "sb %[temp0], 0(%[dest_pix1]) \n\t"
1017 "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t"
1018 "lbu %[temp3], 0(%[dest_pix1]) \n\t"
1019 "add %[temp3], %[temp3], %[step3_14] \n\t"
1020 "lbux %[temp1], %[temp3](%[cm]) \n\t"
1021 "sb %[temp1], 0(%[dest_pix1]) \n\t"
1022 "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t"
1023
1024 "lbu %[temp2], 0(%[dest_pix1]) \n\t"
1025 "add %[temp2], %[temp2], %[step3_13] \n\t"
1026 "lbux %[temp0], %[temp2](%[cm]) \n\t"
1027 "sb %[temp0], 0(%[dest_pix1]) \n\t"
1028 "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t"
1029 "lbu %[temp3], 0(%[dest_pix1]) \n\t"
1030 "add %[temp3], %[temp3], %[step3_12] \n\t"
1031 "lbux %[temp1], %[temp3](%[cm]) \n\t"
1032 "sb %[temp1], 0(%[dest_pix1]) \n\t"
1033 "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t"
1034
1035 : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
1036 [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1)
1037 : [cm] "r"(cm), [stride] "r"(stride), [step3_12] "r"(step3_12),
1038 [step3_13] "r"(step3_13), [step3_14] "r"(step3_14),
1039 [step3_15] "r"(step3_15));
1040
1041 __asm__ __volatile__(
1042 "lbu %[temp2], 0(%[dest_pix]) \n\t"
1043 "add %[temp0], %[step1_12], %[step2_19] \n\t"
1044 "addi %[temp0], %[temp0], 32 \n\t"
1045 "sra %[temp0], %[temp0], 6 \n\t"
1046 "add %[temp2], %[temp2], %[temp0] \n\t"
1047 "lbux %[temp0], %[temp2](%[cm]) \n\t"
1048 "add %[temp1], %[step1_13], %[step2_18] \n\t"
1049 "sb %[temp0], 0(%[dest_pix]) \n\t"
1050 "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
1051 "lbu %[temp3], 0(%[dest_pix]) \n\t"
1052 "addi %[temp1], %[temp1], 32 \n\t"
1053 "sra %[temp1], %[temp1], 6 \n\t"
1054 "add %[temp3], %[temp3], %[temp1] \n\t"
1055 "lbux %[temp1], %[temp3](%[cm]) \n\t"
1056 "sb %[temp1], 0(%[dest_pix]) \n\t"
1057 "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
1058
1059 "lbu %[temp2], 0(%[dest_pix]) \n\t"
1060 "add %[temp0], %[step1_14], %[step2_17] \n\t"
1061 "addi %[temp0], %[temp0], 32 \n\t"
1062 "sra %[temp0], %[temp0], 6 \n\t"
1063 "add %[temp2], %[temp2], %[temp0] \n\t"
1064 "lbux %[temp0], %[temp2](%[cm]) \n\t"
1065 "add %[temp1], %[step1_15], %[step2_16] \n\t"
1066 "sb %[temp0], 0(%[dest_pix]) \n\t"
1067 "addu %[dest_pix], %[dest_pix], %[stride] \n\t"
1068 "lbu %[temp3], 0(%[dest_pix]) \n\t"
1069 "addi %[temp1], %[temp1], 32 \n\t"
1070 "sra %[temp1], %[temp1], 6 \n\t"
1071 "add %[temp3], %[temp3], %[temp1] \n\t"
1072 "lbux %[temp1], %[temp3](%[cm]) \n\t"
1073 "sb %[temp1], 0(%[dest_pix]) \n\t"
1074
1075 : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
1076 [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix)
1077 : [cm] "r"(cm), [stride] "r"(stride), [step1_12] "r"(step1_12),
1078 [step1_13] "r"(step1_13), [step1_14] "r"(step1_14),
1079 [step1_15] "r"(step1_15), [step2_16] "r"(step2_16),
1080 [step2_17] "r"(step2_17), [step2_18] "r"(step2_18),
1081 [step2_19] "r"(step2_19));
1082
1083 step3_12 = ROUND_POWER_OF_TWO((step1_15 - step2_16), 6);
1084 step3_13 = ROUND_POWER_OF_TWO((step1_14 - step2_17), 6);
1085 step3_14 = ROUND_POWER_OF_TWO((step1_13 - step2_18), 6);
1086 step3_15 = ROUND_POWER_OF_TWO((step1_12 - step2_19), 6);
1087
1088 __asm__ __volatile__(
1089 "lbu %[temp2], 0(%[dest_pix1]) \n\t"
1090 "add %[temp2], %[temp2], %[step3_15] \n\t"
1091 "lbux %[temp0], %[temp2](%[cm]) \n\t"
1092 "sb %[temp0], 0(%[dest_pix1]) \n\t"
1093 "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t"
1094 "lbu %[temp3], 0(%[dest_pix1]) \n\t"
1095 "add %[temp3], %[temp3], %[step3_14] \n\t"
1096 "lbux %[temp1], %[temp3](%[cm]) \n\t"
1097 "sb %[temp1], 0(%[dest_pix1]) \n\t"
1098 "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t"
1099
1100 "lbu %[temp2], 0(%[dest_pix1]) \n\t"
1101 "add %[temp2], %[temp2], %[step3_13] \n\t"
1102 "lbux %[temp0], %[temp2](%[cm]) \n\t"
1103 "sb %[temp0], 0(%[dest_pix1]) \n\t"
1104 "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t"
1105 "lbu %[temp3], 0(%[dest_pix1]) \n\t"
1106 "add %[temp3], %[temp3], %[step3_12] \n\t"
1107 "lbux %[temp1], %[temp3](%[cm]) \n\t"
1108 "sb %[temp1], 0(%[dest_pix1]) \n\t"
1109
1110 : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2),
1111 [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1)
1112 : [cm] "r"(cm), [stride] "r"(stride), [step3_12] "r"(step3_12),
1113 [step3_13] "r"(step3_13), [step3_14] "r"(step3_14),
1114 [step3_15] "r"(step3_15));
1115
1116 input += 32;
1117 }
1118 }
1119 #endif // #if HAVE_DSPR2
1120