/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <stdio.h>

#include "./vpx_config.h"
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_idct.h"
#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"

#if HAVE_DSPR2
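/* All of the multiplies below follow one DSPr2 pattern: an accumulator is
 * preloaded with 8192 == 1 << (DCT_CONST_BITS - 1) via mtlo/mthi, products
 * are summed with madd/msub, and "extp ..., 31" pulls 32 bits out of the
 * accumulator ending at the bit position programmed by the callers
 * ("wrdsp" with pos = 45).  The net effect is dct_const_round_shift();
 * roughly, in C:
 *
 *   temp = in0 * c0 + in1 * c1;
 *   out  = (int16_t)((temp + (1 << 13)) >> 14);    // DCT_CONST_BITS == 14
 *
 * idct8_rows_dspr2() applies the 1-D 8-point IDCT to no_rows rows of input.
 * The results are stored transposed (note the stride of 8 int16_t between
 * the "sh" stores and the "output += 1" per row), so the column pass can
 * read its input contiguously.
 */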
static void idct8_rows_dspr2(const int16_t *input, int16_t *output,
                             uint32_t no_rows) {
  int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
  const int const_2_power_13 = 8192;
  int Temp0, Temp1, Temp2, Temp3, Temp4;
  int i;

  for (i = no_rows; i--; ) {
    __asm__ __volatile__ (
        /*
          temp_1 = (input[0] + input[4]) * cospi_16_64;
          step2_0 = dct_const_round_shift(temp_1);

          temp_2 = (input[0] - input[4]) * cospi_16_64;
          step2_1 = dct_const_round_shift(temp_2);
        */
        "lh       %[Temp0], 0(%[input])                 \n\t"
        "lh       %[Temp1], 8(%[input])                 \n\t"
        "mtlo     %[const_2_power_13],  $ac0            \n\t"
        "mthi     $zero,                $ac0            \n\t"
        "mtlo     %[const_2_power_13],  $ac1            \n\t"
        "mthi     $zero,                $ac1            \n\t"
        "add      %[Temp2], %[Temp0],   %[Temp1]        \n\t"
        "madd     $ac0,     %[Temp2],   %[cospi_16_64]  \n\t"
        "extp     %[Temp4], $ac0,       31              \n\t"

        "sub      %[Temp3], %[Temp0],   %[Temp1]        \n\t"
        "madd     $ac1,     %[Temp3],   %[cospi_16_64]  \n\t"
        "mtlo     %[const_2_power_13],  $ac0            \n\t"
        "mthi     $zero,                $ac0            \n\t"
        "extp     %[Temp2], $ac1,       31              \n\t"

        /*
          temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64;
          step2_2 = dct_const_round_shift(temp_1);
        */
        "lh       %[Temp0], 4(%[input])                 \n\t"
        "lh       %[Temp1], 12(%[input])                \n\t"
        "madd     $ac0,     %[Temp0],   %[cospi_24_64]  \n\t"
        "msub     $ac0,     %[Temp1],   %[cospi_8_64]   \n\t"
        "mtlo     %[const_2_power_13],  $ac1            \n\t"
        "mthi     $zero,                $ac1            \n\t"
        "extp     %[Temp3], $ac0,       31              \n\t"

        /*
          step1_1 = step2_1 + step2_2;
          step1_2 = step2_1 - step2_2;
        */
        "add      %[step1_1], %[Temp2], %[Temp3]        \n\t"
        "sub      %[step1_2], %[Temp2], %[Temp3]        \n\t"

        /*
          temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64;
          step2_3 = dct_const_round_shift(temp_2);
        */
        "madd     $ac1,     %[Temp0],   %[cospi_8_64]   \n\t"
        "madd     $ac1,     %[Temp1],   %[cospi_24_64]  \n\t"
        "extp     %[Temp1], $ac1,       31              \n\t"

        "mtlo     %[const_2_power_13],  $ac0            \n\t"
        "mthi     $zero,                $ac0            \n\t"

        /*
          step1_0 = step2_0 + step2_3;
          step1_3 = step2_0 - step2_3;
        */
        "add      %[step1_0], %[Temp4], %[Temp1]        \n\t"
        "sub      %[step1_3], %[Temp4], %[Temp1]        \n\t"

        /*
          temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
          step1_4 = dct_const_round_shift(temp_1);
        */
        "lh       %[Temp0], 2(%[input])                 \n\t"
        "madd     $ac0,     %[Temp0],   %[cospi_28_64]  \n\t"
        "mtlo     %[const_2_power_13],  $ac1            \n\t"
        "mthi     $zero,                $ac1            \n\t"
        "lh       %[Temp1], 14(%[input])                \n\t"
        "lh       %[Temp0], 2(%[input])                 \n\t"
        "msub     $ac0,     %[Temp1],   %[cospi_4_64]   \n\t"
        "extp     %[step1_4], $ac0,     31              \n\t"

        /*
          temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
          step1_7 = dct_const_round_shift(temp_2);
        */
        "madd     $ac1,     %[Temp0],   %[cospi_4_64]   \n\t"
        "madd     $ac1,     %[Temp1],   %[cospi_28_64]  \n\t"
        "extp     %[step1_7], $ac1,     31              \n\t"

        /*
          temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
          step1_5 = dct_const_round_shift(temp_1);
        */
        "mtlo     %[const_2_power_13],  $ac0            \n\t"
        "mthi     $zero,                $ac0            \n\t"
        "lh       %[Temp0], 10(%[input])                \n\t"
        "madd     $ac0,     %[Temp0],   %[cospi_12_64]  \n\t"
        "lh       %[Temp1], 6(%[input])                 \n\t"
        "msub     $ac0,     %[Temp1],   %[cospi_20_64]  \n\t"
        "extp     %[step1_5], $ac0,     31              \n\t"

        /*
          temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
          step1_6 = dct_const_round_shift(temp_2);
        */
        "mtlo     %[const_2_power_13],  $ac1            \n\t"
        "mthi     $zero,                $ac1            \n\t"
        "lh       %[Temp0], 10(%[input])                \n\t"
        "madd     $ac1,     %[Temp0],   %[cospi_20_64]  \n\t"
        "lh       %[Temp1], 6(%[input])                 \n\t"
        "madd     $ac1,     %[Temp1],   %[cospi_12_64]  \n\t"
        "extp     %[step1_6], $ac1,     31              \n\t"

        /*
          temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64;
          temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64;
        */
        "sub      %[Temp0], %[step1_7], %[step1_6]      \n\t"
        "sub      %[Temp0], %[Temp0],   %[step1_4]      \n\t"
        "add      %[Temp0], %[Temp0],   %[step1_5]      \n\t"
        "sub      %[Temp1], %[step1_4], %[step1_5]      \n\t"
        "sub      %[Temp1], %[Temp1],   %[step1_6]      \n\t"
        "add      %[Temp1], %[Temp1],   %[step1_7]      \n\t"

        "mtlo     %[const_2_power_13],  $ac0            \n\t"
        "mthi     $zero,                $ac0            \n\t"
        "mtlo     %[const_2_power_13],  $ac1            \n\t"
        "mthi     $zero,                $ac1            \n\t"

        "madd     $ac0,     %[Temp0],   %[cospi_16_64]  \n\t"
        "madd     $ac1,     %[Temp1],   %[cospi_16_64]  \n\t"

        /*
          step1_4 = step1_4 + step1_5;
          step1_7 = step1_6 + step1_7;
        */
        "add      %[step1_4], %[step1_4], %[step1_5]    \n\t"
        "add      %[step1_7], %[step1_7], %[step1_6]    \n\t"

        "extp     %[step1_5], $ac0,     31              \n\t"
        "extp     %[step1_6], $ac1,     31              \n\t"

        "add      %[Temp0], %[step1_0], %[step1_7]      \n\t"
        "sh       %[Temp0], 0(%[output])                \n\t"
        "add      %[Temp1], %[step1_1], %[step1_6]      \n\t"
        "sh       %[Temp1], 16(%[output])               \n\t"
        "add      %[Temp0], %[step1_2], %[step1_5]      \n\t"
        "sh       %[Temp0], 32(%[output])               \n\t"
        "add      %[Temp1], %[step1_3], %[step1_4]      \n\t"
        "sh       %[Temp1], 48(%[output])               \n\t"

        "sub      %[Temp0], %[step1_3], %[step1_4]      \n\t"
        "sh       %[Temp0], 64(%[output])               \n\t"
        "sub      %[Temp1], %[step1_2], %[step1_5]      \n\t"
        "sh       %[Temp1], 80(%[output])               \n\t"
        "sub      %[Temp0], %[step1_1], %[step1_6]      \n\t"
        "sh       %[Temp0], 96(%[output])               \n\t"
        "sub      %[Temp1], %[step1_0], %[step1_7]      \n\t"
        "sh       %[Temp1], 112(%[output])              \n\t"

        : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1),
          [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3),
          [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5),
          [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7),
          [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
          [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
          [Temp4] "=&r" (Temp4)
        : [const_2_power_13] "r" (const_2_power_13),
          [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64),
          [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64),
          [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64),
          [cospi_24_64] "r" (cospi_24_64),
          [output] "r" (output), [input] "r" (input)
    );

    input += 8;
    output += 1;
  }
}

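/* Column pass: runs the same 1-D 8-point IDCT over each column of the
 * transposed row-pass output, then reconstructs the block: every result is
 * rounded with ROUND_POWER_OF_TWO(x, 5) (the addi 16 / sra 5 pairs), added
 * to the prediction byte in dest, and clamped to [0, 255] by indexing the
 * vp9_ff_cropTbl clip table with lbux.
 */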
static void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
                                        int dest_stride) {
  int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
  int Temp0, Temp1, Temp2, Temp3;
  int i;
  const int const_2_power_13 = 8192;
  uint8_t *dest_pix;
  uint8_t *cm = vp9_ff_cropTbl;

  /* prefetch vp9_ff_cropTbl */
  vp9_prefetch_load(vp9_ff_cropTbl);
  vp9_prefetch_load(vp9_ff_cropTbl + 32);
  vp9_prefetch_load(vp9_ff_cropTbl + 64);
  vp9_prefetch_load(vp9_ff_cropTbl + 96);
  vp9_prefetch_load(vp9_ff_cropTbl + 128);
  vp9_prefetch_load(vp9_ff_cropTbl + 160);
  vp9_prefetch_load(vp9_ff_cropTbl + 192);
  vp9_prefetch_load(vp9_ff_cropTbl + 224);

  for (i = 0; i < 8; ++i) {
    dest_pix = (dest + i);

    __asm__ __volatile__ (
        /*
          temp_1 = (input[0] + input[4]) * cospi_16_64;
          step2_0 = dct_const_round_shift(temp_1);

          temp_2 = (input[0] - input[4]) * cospi_16_64;
          step2_1 = dct_const_round_shift(temp_2);
        */
        "lh       %[Temp0], 0(%[input])                 \n\t"
        "lh       %[Temp1], 8(%[input])                 \n\t"
        "mtlo     %[const_2_power_13],  $ac0            \n\t"
        "mthi     $zero,                $ac0            \n\t"
        "mtlo     %[const_2_power_13],  $ac1            \n\t"
        "mthi     $zero,                $ac1            \n\t"
        "add      %[Temp2], %[Temp0],   %[Temp1]        \n\t"
        "madd     $ac0,     %[Temp2],   %[cospi_16_64]  \n\t"
        "extp     %[step1_6], $ac0,     31              \n\t"

        "sub      %[Temp3], %[Temp0],   %[Temp1]        \n\t"
        "madd     $ac1,     %[Temp3],   %[cospi_16_64]  \n\t"
        "mtlo     %[const_2_power_13],  $ac0            \n\t"
        "mthi     $zero,                $ac0            \n\t"
        "extp     %[Temp2], $ac1,       31              \n\t"

        /*
          temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64;
          step2_2 = dct_const_round_shift(temp_1);
        */
        "lh       %[Temp0], 4(%[input])                 \n\t"
        "lh       %[Temp1], 12(%[input])                \n\t"
        "madd     $ac0,     %[Temp0],   %[cospi_24_64]  \n\t"
        "msub     $ac0,     %[Temp1],   %[cospi_8_64]   \n\t"
        "mtlo     %[const_2_power_13],  $ac1            \n\t"
        "mthi     $zero,                $ac1            \n\t"
        "extp     %[Temp3], $ac0,       31              \n\t"

        /*
          step1_1 = step2_1 + step2_2;
          step1_2 = step2_1 - step2_2;
        */
        "add      %[step1_1], %[Temp2], %[Temp3]        \n\t"
        "sub      %[step1_2], %[Temp2], %[Temp3]        \n\t"

        /*
          temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64;
          step2_3 = dct_const_round_shift(temp_2);
        */
        "madd     $ac1,     %[Temp0],   %[cospi_8_64]   \n\t"
        "madd     $ac1,     %[Temp1],   %[cospi_24_64]  \n\t"
        "extp     %[Temp1], $ac1,       31              \n\t"

        "mtlo     %[const_2_power_13],  $ac0            \n\t"
        "mthi     $zero,                $ac0            \n\t"

        /*
          step1_0 = step2_0 + step2_3;
          step1_3 = step2_0 - step2_3;
        */
        "add      %[step1_0], %[step1_6], %[Temp1]      \n\t"
        "sub      %[step1_3], %[step1_6], %[Temp1]      \n\t"

        /*
          temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
          step1_4 = dct_const_round_shift(temp_1);
        */
        "lh       %[Temp0], 2(%[input])                 \n\t"
        "madd     $ac0,     %[Temp0],   %[cospi_28_64]  \n\t"
        "mtlo     %[const_2_power_13],  $ac1            \n\t"
        "mthi     $zero,                $ac1            \n\t"
        "lh       %[Temp1], 14(%[input])                \n\t"
        "lh       %[Temp0], 2(%[input])                 \n\t"
        "msub     $ac0,     %[Temp1],   %[cospi_4_64]   \n\t"
        "extp     %[step1_4], $ac0,     31              \n\t"

        /*
          temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
          step1_7 = dct_const_round_shift(temp_2);
        */
        "madd     $ac1,     %[Temp0],   %[cospi_4_64]   \n\t"
        "madd     $ac1,     %[Temp1],   %[cospi_28_64]  \n\t"
        "extp     %[step1_7], $ac1,     31              \n\t"

        /*
          temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
          step1_5 = dct_const_round_shift(temp_1);
        */
        "mtlo     %[const_2_power_13],  $ac0            \n\t"
        "mthi     $zero,                $ac0            \n\t"
        "lh       %[Temp0], 10(%[input])                \n\t"
        "madd     $ac0,     %[Temp0],   %[cospi_12_64]  \n\t"
        "lh       %[Temp1], 6(%[input])                 \n\t"
        "msub     $ac0,     %[Temp1],   %[cospi_20_64]  \n\t"
        "extp     %[step1_5], $ac0,     31              \n\t"

        /*
          temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
          step1_6 = dct_const_round_shift(temp_2);
        */
        "mtlo     %[const_2_power_13],  $ac1            \n\t"
        "mthi     $zero,                $ac1            \n\t"
        "lh       %[Temp0], 10(%[input])                \n\t"
        "madd     $ac1,     %[Temp0],   %[cospi_20_64]  \n\t"
        "lh       %[Temp1], 6(%[input])                 \n\t"
        "madd     $ac1,     %[Temp1],   %[cospi_12_64]  \n\t"
        "extp     %[step1_6], $ac1,     31              \n\t"

        /*
          temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64;
          temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64;
        */
        "sub      %[Temp0], %[step1_7], %[step1_6]      \n\t"
        "sub      %[Temp0], %[Temp0],   %[step1_4]      \n\t"
        "add      %[Temp0], %[Temp0],   %[step1_5]      \n\t"
        "sub      %[Temp1], %[step1_4], %[step1_5]      \n\t"
        "sub      %[Temp1], %[Temp1],   %[step1_6]      \n\t"
        "add      %[Temp1], %[Temp1],   %[step1_7]      \n\t"

        "mtlo     %[const_2_power_13],  $ac0            \n\t"
        "mthi     $zero,                $ac0            \n\t"
        "mtlo     %[const_2_power_13],  $ac1            \n\t"
        "mthi     $zero,                $ac1            \n\t"

        "madd     $ac0,     %[Temp0],   %[cospi_16_64]  \n\t"
        "madd     $ac1,     %[Temp1],   %[cospi_16_64]  \n\t"

        /*
          step1_4 = step1_4 + step1_5;
          step1_7 = step1_6 + step1_7;
        */
        "add      %[step1_4], %[step1_4], %[step1_5]    \n\t"
        "add      %[step1_7], %[step1_7], %[step1_6]    \n\t"

        "extp     %[step1_5], $ac0,     31              \n\t"
        "extp     %[step1_6], $ac1,     31              \n\t"

        /* add block */
        "lbu      %[Temp1], 0(%[dest_pix])              \n\t"
        "add      %[Temp0], %[step1_0], %[step1_7]      \n\t"
        "addi     %[Temp0], %[Temp0],   16              \n\t"
        "sra      %[Temp0], %[Temp0],   5               \n\t"
        "add      %[Temp1], %[Temp1],   %[Temp0]        \n\t"
        "add      %[Temp0], %[step1_1], %[step1_6]      \n\t"
        "lbux     %[Temp2], %[Temp1](%[cm])             \n\t"
        "sb       %[Temp2], 0(%[dest_pix])              \n\t"
        "addu     %[dest_pix], %[dest_pix], %[dest_stride] \n\t"

        "lbu      %[Temp1], 0(%[dest_pix])              \n\t"
        "addi     %[Temp0], %[Temp0],   16              \n\t"
        "sra      %[Temp0], %[Temp0],   5               \n\t"
        "add      %[Temp1], %[Temp1],   %[Temp0]        \n\t"
        "add      %[Temp0], %[step1_2], %[step1_5]      \n\t"
        "lbux     %[Temp2], %[Temp1](%[cm])             \n\t"
        "sb       %[Temp2], 0(%[dest_pix])              \n\t"
        "addu     %[dest_pix], %[dest_pix], %[dest_stride] \n\t"

        "lbu      %[Temp1], 0(%[dest_pix])              \n\t"
        "addi     %[Temp0], %[Temp0],   16              \n\t"
        "sra      %[Temp0], %[Temp0],   5               \n\t"
        "add      %[Temp1], %[Temp1],   %[Temp0]        \n\t"
        "add      %[Temp0], %[step1_3], %[step1_4]      \n\t"
        "lbux     %[Temp2], %[Temp1](%[cm])             \n\t"
        "sb       %[Temp2], 0(%[dest_pix])              \n\t"
        "addu     %[dest_pix], %[dest_pix], %[dest_stride] \n\t"

        "lbu      %[Temp1], 0(%[dest_pix])              \n\t"
        "addi     %[Temp0], %[Temp0],   16              \n\t"
        "sra      %[Temp0], %[Temp0],   5               \n\t"
        "add      %[Temp1], %[Temp1],   %[Temp0]        \n\t"
        "sub      %[Temp0], %[step1_3], %[step1_4]      \n\t"
        "lbux     %[Temp2], %[Temp1](%[cm])             \n\t"
        "sb       %[Temp2], 0(%[dest_pix])              \n\t"
        "addu     %[dest_pix], %[dest_pix], %[dest_stride] \n\t"

        "lbu      %[Temp1], 0(%[dest_pix])              \n\t"
        "addi     %[Temp0], %[Temp0],   16              \n\t"
        "sra      %[Temp0], %[Temp0],   5               \n\t"
        "add      %[Temp1], %[Temp1],   %[Temp0]        \n\t"
        "sub      %[Temp0], %[step1_2], %[step1_5]      \n\t"
        "lbux     %[Temp2], %[Temp1](%[cm])             \n\t"
        "sb       %[Temp2], 0(%[dest_pix])              \n\t"
        "addu     %[dest_pix], %[dest_pix], %[dest_stride] \n\t"

        "lbu      %[Temp1], 0(%[dest_pix])              \n\t"
        "addi     %[Temp0], %[Temp0],   16              \n\t"
        "sra      %[Temp0], %[Temp0],   5               \n\t"
        "add      %[Temp1], %[Temp1],   %[Temp0]        \n\t"
        "sub      %[Temp0], %[step1_1], %[step1_6]      \n\t"
        "lbux     %[Temp2], %[Temp1](%[cm])             \n\t"
        "sb       %[Temp2], 0(%[dest_pix])              \n\t"
        "addu     %[dest_pix], %[dest_pix], %[dest_stride] \n\t"

        "lbu      %[Temp1], 0(%[dest_pix])              \n\t"
        "addi     %[Temp0], %[Temp0],   16              \n\t"
        "sra      %[Temp0], %[Temp0],   5               \n\t"
        "add      %[Temp1], %[Temp1],   %[Temp0]        \n\t"
        "sub      %[Temp0], %[step1_0], %[step1_7]      \n\t"
        "lbux     %[Temp2], %[Temp1](%[cm])             \n\t"
        "sb       %[Temp2], 0(%[dest_pix])              \n\t"
        "addu     %[dest_pix], %[dest_pix], %[dest_stride] \n\t"

        "lbu      %[Temp1], 0(%[dest_pix])              \n\t"
        "addi     %[Temp0], %[Temp0],   16              \n\t"
        "sra      %[Temp0], %[Temp0],   5               \n\t"
        "add      %[Temp1], %[Temp1],   %[Temp0]        \n\t"
        "lbux     %[Temp2], %[Temp1](%[cm])             \n\t"
        "sb       %[Temp2], 0(%[dest_pix])              \n\t"

        : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1),
          [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3),
          [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5),
          [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7),
          [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
          [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
          [dest_pix] "+r" (dest_pix)
        : [const_2_power_13] "r" (const_2_power_13),
          [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64),
          [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64),
          [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64),
          [cospi_24_64] "r" (cospi_24_64),
          [input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride)
    );

    input += 8;
  }
}

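/* Full 8x8 inverse DCT: all 64 coefficients may be non-zero.  The wrdsp
 * instruction sets the DSPControl pos field to bit 45 so that the extp
 * instructions in the helpers above extract (acc + 8192) >> 14.
 */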
void vp9_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
                              int dest_stride) {
  DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
  int16_t *outptr = out;
  uint32_t pos = 45;

  /* bit position for extract from acc */
  __asm__ __volatile__ (
    "wrdsp    %[pos],    1    \n\t"
    :
    : [pos] "r" (pos)
  );

  // First transform rows
  idct8_rows_dspr2(input, outptr, 8);

  // Then transform columns and add to dest
  idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride);
}

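/* 1-D 8-point inverse ADST in plain C, matching the generic vp9 iadst8
 * flow: three butterfly stages with dct_const_round_shift-style rounding
 * and an early exit when every input coefficient is zero.
 */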
static void iadst8_dspr2(const int16_t *input, int16_t *output) {
  int s0, s1, s2, s3, s4, s5, s6, s7;
  int x0, x1, x2, x3, x4, x5, x6, x7;

  x0 = input[7];
  x1 = input[0];
  x2 = input[5];
  x3 = input[2];
  x4 = input[3];
  x5 = input[4];
  x6 = input[1];
  x7 = input[6];

  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
    output[0] = output[1] = output[2] = output[3] = output[4]
              = output[5] = output[6] = output[7] = 0;
    return;
  }

  // stage 1
  s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
  s1 = cospi_30_64 * x0 - cospi_2_64  * x1;
  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
  s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
  s7 = cospi_6_64  * x6 - cospi_26_64 * x7;

  x0 = ROUND_POWER_OF_TWO((s0 + s4), DCT_CONST_BITS);
  x1 = ROUND_POWER_OF_TWO((s1 + s5), DCT_CONST_BITS);
  x2 = ROUND_POWER_OF_TWO((s2 + s6), DCT_CONST_BITS);
  x3 = ROUND_POWER_OF_TWO((s3 + s7), DCT_CONST_BITS);
  x4 = ROUND_POWER_OF_TWO((s0 - s4), DCT_CONST_BITS);
  x5 = ROUND_POWER_OF_TWO((s1 - s5), DCT_CONST_BITS);
  x6 = ROUND_POWER_OF_TWO((s2 - s6), DCT_CONST_BITS);
  x7 = ROUND_POWER_OF_TWO((s3 - s7), DCT_CONST_BITS);

  // stage 2
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 =  cospi_8_64  * x4 + cospi_24_64 * x5;
  s5 =  cospi_24_64 * x4 - cospi_8_64  * x5;
  s6 = -cospi_24_64 * x6 + cospi_8_64  * x7;
  s7 =  cospi_8_64  * x6 + cospi_24_64 * x7;

  x0 = s0 + s2;
  x1 = s1 + s3;
  x2 = s0 - s2;
  x3 = s1 - s3;
  x4 = ROUND_POWER_OF_TWO((s4 + s6), DCT_CONST_BITS);
  x5 = ROUND_POWER_OF_TWO((s5 + s7), DCT_CONST_BITS);
  x6 = ROUND_POWER_OF_TWO((s4 - s6), DCT_CONST_BITS);
  x7 = ROUND_POWER_OF_TWO((s5 - s7), DCT_CONST_BITS);

  // stage 3
  s2 = cospi_16_64 * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (x6 - x7);

  x2 = ROUND_POWER_OF_TWO((s2), DCT_CONST_BITS);
  x3 = ROUND_POWER_OF_TWO((s3), DCT_CONST_BITS);
  x6 = ROUND_POWER_OF_TWO((s6), DCT_CONST_BITS);
  x7 = ROUND_POWER_OF_TWO((s7), DCT_CONST_BITS);

  output[0] =  x0;
  output[1] = -x4;
  output[2] =  x6;
  output[3] = -x2;
  output[4] =  x3;
  output[5] = -x7;
  output[6] =  x5;
  output[7] = -x1;
}

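/* 8x8 hybrid inverse transform: tx_type picks DCT or ADST independently
 * per direction.  DCT passes use the DSPr2 helpers above; ADST passes go
 * through iadst8_dspr2() with an explicit transpose and round/clip step.
 */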
void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
                             int dest_stride, int tx_type) {
  int i, j;
  DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
  int16_t *outptr = out;
  int16_t temp_in[8 * 8], temp_out[8];
  uint32_t pos = 45;

  /* bit position for extract from acc */
  __asm__ __volatile__ (
    "wrdsp    %[pos],    1    \n\t"
    :
    : [pos] "r" (pos)
  );

  switch (tx_type) {
    case DCT_DCT:   // DCT in both horizontal and vertical
      idct8_rows_dspr2(input, outptr, 8);
      idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride);
      break;
    case ADST_DCT:  // ADST in vertical, DCT in horizontal
      idct8_rows_dspr2(input, outptr, 8);

      for (i = 0; i < 8; ++i) {
        iadst8_dspr2(&out[i * 8], temp_out);

        for (j = 0; j < 8; ++j)
          dest[j * dest_stride + i] =
              clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
                         + dest[j * dest_stride + i]);
      }
      break;
    case DCT_ADST:  // DCT in vertical, ADST in horizontal
      for (i = 0; i < 8; ++i) {
        iadst8_dspr2(input, outptr);
        input += 8;
        outptr += 8;
      }

      for (i = 0; i < 8; ++i) {
        for (j = 0; j < 8; ++j) {
          temp_in[i * 8 + j] = out[j * 8 + i];
        }
      }
      idct8_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride);
      break;
    case ADST_ADST:  // ADST in both directions
      for (i = 0; i < 8; ++i) {
        iadst8_dspr2(input, outptr);
        input += 8;
        outptr += 8;
      }

      for (i = 0; i < 8; ++i) {
        for (j = 0; j < 8; ++j)
          temp_in[j] = out[j * 8 + i];

        iadst8_dspr2(temp_in, temp_out);

        for (j = 0; j < 8; ++j)
          dest[j * dest_stride + i] =
              clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
                         + dest[j * dest_stride + i]);
      }
      break;
    default:
      printf("vp9_iht8x8_64_add_dspr2 : Invalid tx_type\n");
      break;
  }
}

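/* Abbreviated 8x8 IDCT for sparse blocks (at most ten non-zero
 * coefficients, all within the first four rows of input): only four rows
 * are transformed, and the half of the intermediate array they never
 * touch is cleared below before the column pass.
 */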
void vp9_idct8x8_10_add_dspr2(const int16_t *input, uint8_t *dest,
                              int dest_stride) {
  DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
  int16_t *outptr = out;
  uint32_t pos = 45;

  /* bit position for extract from acc */
  __asm__ __volatile__ (
    "wrdsp    %[pos],    1    \n\t"
    :
    : [pos] "r" (pos)
  );

  // First transform rows
  idct8_rows_dspr2(input, outptr, 4);

  outptr += 4;

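  /* idct8_rows_dspr2() stored its output transposed, so only the first
   * four int16_t of each row of 'out' were written.  Zero the remaining
   * four (two sw stores per 16-byte row) before running the column pass.
   */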
  __asm__ __volatile__ (
      "sw  $zero,   0(%[outptr])  \n\t"
      "sw  $zero,   4(%[outptr])  \n\t"
      "sw  $zero,  16(%[outptr])  \n\t"
      "sw  $zero,  20(%[outptr])  \n\t"
      "sw  $zero,  32(%[outptr])  \n\t"
      "sw  $zero,  36(%[outptr])  \n\t"
      "sw  $zero,  48(%[outptr])  \n\t"
      "sw  $zero,  52(%[outptr])  \n\t"
      "sw  $zero,  64(%[outptr])  \n\t"
      "sw  $zero,  68(%[outptr])  \n\t"
      "sw  $zero,  80(%[outptr])  \n\t"
      "sw  $zero,  84(%[outptr])  \n\t"
      "sw  $zero,  96(%[outptr])  \n\t"
      "sw  $zero, 100(%[outptr])  \n\t"
      "sw  $zero, 112(%[outptr])  \n\t"
      "sw  $zero, 116(%[outptr])  \n\t"

      :
      : [outptr] "r" (outptr)
  );

  // Then transform columns and add to dest
  idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride);
}

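/* DC-only 8x8 IDCT: the single coefficient is passed through the
 * cospi_16_64 round-shift twice (once per pass), rounded with
 * ROUND_POWER_OF_TWO(out, 5), replicated into all four bytes of a word
 * with replv.qb, and then added to (or, for a negative DC, subtracted
 * from) eight destination pixels per row with the saturating
 * addu_s.qb/subu_s.qb instructions.
 */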
void vp9_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest,
                             int dest_stride) {
  uint32_t pos = 45;
  int32_t out;
  int32_t r;
  int32_t a1, absa1;
  int32_t t1, t2, vector_a1, vector_1, vector_2;

  /* bit position for extract from acc */
  __asm__ __volatile__ (
    "wrdsp    %[pos],    1    \n\t"

    :
    : [pos] "r" (pos)
  );

  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);
  __asm__ __volatile__ (
      "addi     %[out],    %[out],    16      \n\t"
      "sra      %[a1],     %[out],    5       \n\t"

      : [out] "+r" (out), [a1] "=r" (a1)
      :
  );

  if (a1 < 0) {
    /* use quad-byte
     * input and output memory are four byte aligned */
    __asm__ __volatile__ (
        "abs        %[absa1],     %[a1]       \n\t"
        "replv.qb   %[vector_a1], %[absa1]    \n\t"

        : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
        : [a1] "r" (a1)
    );

    for (r = 8; r--;) {
      __asm__ __volatile__ (
          "lw          %[t1],        0(%[dest])                   \n\t"
          "lw          %[t2],        4(%[dest])                   \n\t"
          "subu_s.qb   %[vector_1],  %[t1],      %[vector_a1]     \n\t"
          "subu_s.qb   %[vector_2],  %[t2],      %[vector_a1]     \n\t"
          "sw          %[vector_1],  0(%[dest])                   \n\t"
          "sw          %[vector_2],  4(%[dest])                   \n\t"
          "add         %[dest],      %[dest],    %[dest_stride]   \n\t"

          : [t1] "=&r" (t1), [t2] "=&r" (t2),
            [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
            [dest] "+&r" (dest)
          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
      );
    }
  } else {
    /* use quad-byte
     * input and output memory are four byte aligned */
    __asm__ __volatile__ (
        "replv.qb   %[vector_a1], %[a1]     \n\t"

        : [vector_a1] "=r" (vector_a1)
        : [a1] "r" (a1)
    );

    for (r = 8; r--;) {
      __asm__ __volatile__ (
          "lw          %[t1],        0(%[dest])                   \n\t"
          "lw          %[t2],        4(%[dest])                   \n\t"
          "addu_s.qb   %[vector_1],  %[t1],      %[vector_a1]     \n\t"
          "addu_s.qb   %[vector_2],  %[t2],      %[vector_a1]     \n\t"
          "sw          %[vector_1],  0(%[dest])                   \n\t"
          "sw          %[vector_2],  4(%[dest])                   \n\t"
          "add         %[dest],      %[dest],    %[dest_stride]   \n\t"

          : [t1] "=&r" (t1), [t2] "=&r" (t2),
            [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
            [dest] "+r" (dest)
          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
      );
    }
  }
}
#endif  // #if HAVE_DSPR2