1 /*
2  *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <assert.h>
12 #include <stdio.h>
13 
14 #include "./vpx_config.h"
15 #include "./vp9_rtcd.h"
16 #include "vp9/common/vp9_common.h"
17 #include "vp9/common/vp9_blockd.h"
18 #include "vp9/common/vp9_idct.h"
19 #include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
20 
21 #if HAVE_DSPR2
idct8_rows_dspr2(const int16_t * input,int16_t * output,uint32_t no_rows)22 static void idct8_rows_dspr2(const int16_t *input, int16_t *output,
23                              uint32_t no_rows) {
24   int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
25   const int const_2_power_13 = 8192;
26   int Temp0, Temp1, Temp2, Temp3, Temp4;
27   int i;
28 
29   for (i = no_rows; i--; ) {
30     __asm__ __volatile__ (
31         /*
32           temp_1 = (input[0] + input[4]) * cospi_16_64;
33           step2_0 = dct_const_round_shift(temp_1);
34 
35           temp_2 = (input[0] - input[4]) * cospi_16_64;
36           step2_1 = dct_const_round_shift(temp_2);
37         */
38         "lh       %[Temp0],             0(%[input])                     \n\t"
39         "lh       %[Temp1],             8(%[input])                     \n\t"
40         "mtlo     %[const_2_power_13],  $ac0                            \n\t"
41         "mthi     $zero,                $ac0                            \n\t"
42         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
43         "mthi     $zero,                $ac1                            \n\t"
44         "add      %[Temp2],             %[Temp0],       %[Temp1]        \n\t"
45         "madd     $ac0,                 %[Temp2],       %[cospi_16_64]  \n\t"
46         "extp     %[Temp4],             $ac0,           31              \n\t"
47 
48         "sub      %[Temp3],             %[Temp0],       %[Temp1]        \n\t"
49         "madd     $ac1,                 %[Temp3],       %[cospi_16_64]  \n\t"
50         "mtlo     %[const_2_power_13],  $ac0                            \n\t"
51         "mthi     $zero,                $ac0                            \n\t"
52         "extp     %[Temp2],             $ac1,           31              \n\t"
53 
54         /*
55           temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64;
56           step2_2 = dct_const_round_shift(temp_1);
57         */
58         "lh       %[Temp0],             4(%[input])                     \n\t"
59         "lh       %[Temp1],             12(%[input])                    \n\t"
60         "madd     $ac0,                 %[Temp0],       %[cospi_24_64]  \n\t"
61         "msub     $ac0,                 %[Temp1],       %[cospi_8_64]   \n\t"
62         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
63         "mthi     $zero,                $ac1                            \n\t"
64         "extp     %[Temp3],             $ac0,           31              \n\t"
65 
66         /*
67           step1_1 = step2_1 + step2_2;
68           step1_2 = step2_1 - step2_2;
69         */
70         "add      %[step1_1],           %[Temp2],       %[Temp3]        \n\t"
71         "sub      %[step1_2],           %[Temp2],       %[Temp3]        \n\t"
72 
73         /*
74           temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64;
75           step2_3 = dct_const_round_shift(temp_2);
76         */
77         "madd     $ac1,                 %[Temp0],       %[cospi_8_64]   \n\t"
78         "madd     $ac1,                 %[Temp1],       %[cospi_24_64]  \n\t"
79         "extp     %[Temp1],             $ac1,           31              \n\t"
80 
81         "mtlo     %[const_2_power_13],  $ac0                            \n\t"
82         "mthi     $zero,                $ac0                            \n\t"
83 
84         /*
85           step1_0 = step2_0 + step2_3;
86           step1_3 = step2_0 - step2_3;
87         */
88         "add      %[step1_0],           %[Temp4],       %[Temp1]        \n\t"
89         "sub      %[step1_3],           %[Temp4],       %[Temp1]        \n\t"
90 
91         /*
92           temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
93           step1_4 = dct_const_round_shift(temp_1);
94         */
95         "lh       %[Temp0],             2(%[input])                     \n\t"
96         "madd     $ac0,                 %[Temp0],       %[cospi_28_64]  \n\t"
97         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
98         "mthi     $zero,                $ac1                            \n\t"
99         "lh       %[Temp1],             14(%[input])                    \n\t"
100         "lh       %[Temp0],             2(%[input])                     \n\t"
101         "msub     $ac0,                 %[Temp1],       %[cospi_4_64]   \n\t"
102         "extp     %[step1_4],           $ac0,           31              \n\t"
103 
104         /*
105           temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
106           step1_7 = dct_const_round_shift(temp_2);
107         */
108         "madd     $ac1,                 %[Temp0],       %[cospi_4_64]   \n\t"
109         "madd     $ac1,                 %[Temp1],       %[cospi_28_64]  \n\t"
110         "extp     %[step1_7],           $ac1,           31              \n\t"
111 
112         /*
113           temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
114           step1_5 = dct_const_round_shift(temp_1);
115         */
116         "mtlo     %[const_2_power_13],  $ac0                            \n\t"
117         "mthi     $zero,                $ac0                            \n\t"
118         "lh       %[Temp0],             10(%[input])                    \n\t"
119         "madd     $ac0,                 %[Temp0],       %[cospi_12_64]  \n\t"
120         "lh       %[Temp1],             6(%[input])                     \n\t"
121         "msub     $ac0,                 %[Temp1],       %[cospi_20_64]  \n\t"
122         "extp     %[step1_5],           $ac0,           31              \n\t"
123 
124         /*
125           temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
126           step1_6 = dct_const_round_shift(temp_2);
127         */
128         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
129         "mthi     $zero,                $ac1                            \n\t"
130         "lh       %[Temp0],             10(%[input])                    \n\t"
131         "madd     $ac1,                 %[Temp0],       %[cospi_20_64]  \n\t"
132         "lh       %[Temp1],             6(%[input])                     \n\t"
133         "madd     $ac1,                 %[Temp1],       %[cospi_12_64]  \n\t"
134         "extp     %[step1_6],           $ac1,           31              \n\t"
135 
136         /*
137           temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64;
138           temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64;
139         */
140         "sub      %[Temp0],             %[step1_7],     %[step1_6]      \n\t"
141         "sub      %[Temp0],             %[Temp0],       %[step1_4]      \n\t"
142         "add      %[Temp0],             %[Temp0],       %[step1_5]      \n\t"
143         "sub      %[Temp1],             %[step1_4],     %[step1_5]      \n\t"
144         "sub      %[Temp1],             %[Temp1],       %[step1_6]      \n\t"
145         "add      %[Temp1],             %[Temp1],       %[step1_7]      \n\t"
146 
147         "mtlo     %[const_2_power_13],  $ac0                            \n\t"
148         "mthi     $zero,                $ac0                            \n\t"
149         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
150         "mthi     $zero,                $ac1                            \n\t"
151 
152         "madd     $ac0,                 %[Temp0],       %[cospi_16_64]  \n\t"
153         "madd     $ac1,                 %[Temp1],       %[cospi_16_64]  \n\t"
154 
155         /*
156           step1_4 = step1_4 + step1_5;
157           step1_7 = step1_6 + step1_7;
158         */
159         "add      %[step1_4],           %[step1_4],     %[step1_5]      \n\t"
160         "add      %[step1_7],           %[step1_7],     %[step1_6]      \n\t"
161 
162         "extp     %[step1_5],           $ac0,           31              \n\t"
163         "extp     %[step1_6],           $ac1,           31              \n\t"
164 
165         "add      %[Temp0],             %[step1_0],     %[step1_7]      \n\t"
166         "sh       %[Temp0],             0(%[output])                    \n\t"
167         "add      %[Temp1],             %[step1_1],     %[step1_6]      \n\t"
168         "sh       %[Temp1],             16(%[output])                   \n\t"
169         "add      %[Temp0],             %[step1_2],     %[step1_5]      \n\t"
170         "sh       %[Temp0],             32(%[output])                   \n\t"
171         "add      %[Temp1],             %[step1_3],     %[step1_4]      \n\t"
172         "sh       %[Temp1],             48(%[output])                   \n\t"
173 
174         "sub      %[Temp0],             %[step1_3],     %[step1_4]      \n\t"
175         "sh       %[Temp0],             64(%[output])                   \n\t"
176         "sub      %[Temp1],             %[step1_2],     %[step1_5]      \n\t"
177         "sh       %[Temp1],             80(%[output])                   \n\t"
178         "sub      %[Temp0],             %[step1_1],     %[step1_6]      \n\t"
179         "sh       %[Temp0],             96(%[output])                   \n\t"
180         "sub      %[Temp1],             %[step1_0],     %[step1_7]      \n\t"
181         "sh       %[Temp1],             112(%[output])                  \n\t"
182 
183         : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1),
184           [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3),
185           [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5),
186           [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7),
187           [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
188           [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
189           [Temp4] "=&r" (Temp4)
190         : [const_2_power_13] "r" (const_2_power_13),
191           [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64),
192           [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64),
193           [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64),
194           [cospi_24_64] "r" (cospi_24_64),
195           [output] "r" (output), [input] "r" (input)
196     );
197 
198     input += 8;
199     output += 1;
200   }
201 }
202 
idct8_columns_add_blk_dspr2(int16_t * input,uint8_t * dest,int dest_stride)203 static void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
204                                         int dest_stride) {
205   int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7;
206   int Temp0, Temp1, Temp2, Temp3;
207   int i;
208   const int const_2_power_13 = 8192;
209   uint8_t *dest_pix;
210   uint8_t *cm = vp9_ff_cropTbl;
211 
212   /* prefetch vp9_ff_cropTbl */
213   vp9_prefetch_load(vp9_ff_cropTbl);
214   vp9_prefetch_load(vp9_ff_cropTbl +  32);
215   vp9_prefetch_load(vp9_ff_cropTbl +  64);
216   vp9_prefetch_load(vp9_ff_cropTbl +  96);
217   vp9_prefetch_load(vp9_ff_cropTbl + 128);
218   vp9_prefetch_load(vp9_ff_cropTbl + 160);
219   vp9_prefetch_load(vp9_ff_cropTbl + 192);
220   vp9_prefetch_load(vp9_ff_cropTbl + 224);
221 
222   for (i = 0; i < 8; ++i) {
223       dest_pix = (dest + i);
224 
225     __asm__ __volatile__ (
226         /*
227           temp_1 = (input[0] + input[4]) * cospi_16_64;
228           step2_0 = dct_const_round_shift(temp_1);
229 
230           temp_2 = (input[0] - input[4]) * cospi_16_64;
231           step2_1 = dct_const_round_shift(temp_2);
232         */
233         "lh       %[Temp0],             0(%[input])                     \n\t"
234         "lh       %[Temp1],             8(%[input])                     \n\t"
235         "mtlo     %[const_2_power_13],  $ac0                            \n\t"
236         "mthi     $zero,                $ac0                            \n\t"
237         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
238         "mthi     $zero,                $ac1                            \n\t"
239         "add      %[Temp2],             %[Temp0],       %[Temp1]        \n\t"
240         "madd     $ac0,                 %[Temp2],       %[cospi_16_64]  \n\t"
241         "extp     %[step1_6],           $ac0,           31              \n\t"
242 
243         "sub      %[Temp3],             %[Temp0],       %[Temp1]        \n\t"
244         "madd     $ac1,                 %[Temp3],       %[cospi_16_64]  \n\t"
245         "mtlo     %[const_2_power_13],  $ac0                            \n\t"
246         "mthi     $zero,                $ac0                            \n\t"
247         "extp     %[Temp2],             $ac1,           31              \n\t"
248 
249         /*
250           temp_1 = input[2] * cospi_24_64 - input[6] * cospi_8_64;
251           step2_2 = dct_const_round_shift(temp_1);
252         */
253         "lh       %[Temp0],             4(%[input])                     \n\t"
254         "lh       %[Temp1],             12(%[input])                    \n\t"
255         "madd     $ac0,                 %[Temp0],       %[cospi_24_64]  \n\t"
256         "msub     $ac0,                 %[Temp1],       %[cospi_8_64]   \n\t"
257         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
258         "mthi     $zero,                $ac1                            \n\t"
259         "extp     %[Temp3],             $ac0,           31              \n\t"
260 
261         /*
262           step1_1 = step2_1 + step2_2;
263           step1_2 = step2_1 - step2_2;
264         */
265         "add      %[step1_1],           %[Temp2],       %[Temp3]        \n\t"
266         "sub      %[step1_2],           %[Temp2],       %[Temp3]        \n\t"
267 
268         /*
269           temp_2 = input[2] * cospi_8_64 + input[6] * cospi_24_64;
270           step2_3 = dct_const_round_shift(temp_2);
271         */
272         "madd     $ac1,                 %[Temp0],       %[cospi_8_64]   \n\t"
273         "madd     $ac1,                 %[Temp1],       %[cospi_24_64]  \n\t"
274         "extp     %[Temp1],             $ac1,           31              \n\t"
275 
276         "mtlo     %[const_2_power_13],  $ac0                            \n\t"
277         "mthi     $zero,                $ac0                            \n\t"
278 
279         /*
280           step1_0 = step2_0 + step2_3;
281           step1_3 = step2_0 - step2_3;
282         */
283         "add      %[step1_0],           %[step1_6],     %[Temp1]        \n\t"
284         "sub      %[step1_3],           %[step1_6],     %[Temp1]        \n\t"
285 
286         /*
287           temp_1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
288           step1_4 = dct_const_round_shift(temp_1);
289         */
290         "lh       %[Temp0],             2(%[input])                     \n\t"
291         "madd     $ac0,                 %[Temp0],       %[cospi_28_64]  \n\t"
292         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
293         "mthi     $zero,                $ac1                            \n\t"
294         "lh       %[Temp1],             14(%[input])                    \n\t"
295         "lh       %[Temp0],             2(%[input])                     \n\t"
296         "msub     $ac0,                 %[Temp1],       %[cospi_4_64]   \n\t"
297         "extp     %[step1_4],           $ac0,           31              \n\t"
298 
299         /*
300           temp_2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
301           step1_7 = dct_const_round_shift(temp_2);
302         */
303         "madd     $ac1,                 %[Temp0],       %[cospi_4_64]   \n\t"
304         "madd     $ac1,                 %[Temp1],       %[cospi_28_64]  \n\t"
305         "extp     %[step1_7],           $ac1,           31              \n\t"
306 
307         /*
308           temp_1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
309           step1_5 = dct_const_round_shift(temp_1);
310         */
311         "mtlo     %[const_2_power_13],  $ac0                            \n\t"
312         "mthi     $zero,                $ac0                            \n\t"
313         "lh       %[Temp0],             10(%[input])                    \n\t"
314         "madd     $ac0,                 %[Temp0],       %[cospi_12_64]  \n\t"
315         "lh       %[Temp1],             6(%[input])                     \n\t"
316         "msub     $ac0,                 %[Temp1],       %[cospi_20_64]  \n\t"
317         "extp     %[step1_5],           $ac0,           31              \n\t"
318 
319         /*
320           temp_2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
321           step1_6 = dct_const_round_shift(temp_2);
322         */
323         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
324         "mthi     $zero,                $ac1                            \n\t"
325         "lh       %[Temp0],             10(%[input])                    \n\t"
326         "madd     $ac1,                 %[Temp0],       %[cospi_20_64]  \n\t"
327         "lh       %[Temp1],             6(%[input])                     \n\t"
328         "madd     $ac1,                 %[Temp1],       %[cospi_12_64]  \n\t"
329         "extp     %[step1_6],           $ac1,           31              \n\t"
330 
331         /*
332           temp_1 = (step1_7 - step1_6 - step1_4 + step1_5) * cospi_16_64;
333           temp_2 = (step1_4 - step1_5 - step1_6 + step1_7) * cospi_16_64;
334         */
335         "sub      %[Temp0],             %[step1_7],     %[step1_6]      \n\t"
336         "sub      %[Temp0],             %[Temp0],       %[step1_4]      \n\t"
337         "add      %[Temp0],             %[Temp0],       %[step1_5]      \n\t"
338         "sub      %[Temp1],             %[step1_4],     %[step1_5]      \n\t"
339         "sub      %[Temp1],             %[Temp1],       %[step1_6]      \n\t"
340         "add      %[Temp1],             %[Temp1],       %[step1_7]      \n\t"
341 
342         "mtlo     %[const_2_power_13],  $ac0                            \n\t"
343         "mthi     $zero,                $ac0                            \n\t"
344         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
345         "mthi     $zero,                $ac1                            \n\t"
346 
347         "madd     $ac0,                 %[Temp0],       %[cospi_16_64]  \n\t"
348         "madd     $ac1,                 %[Temp1],       %[cospi_16_64]  \n\t"
349 
350         /*
351           step1_4 = step1_4 + step1_5;
352           step1_7 = step1_6 + step1_7;
353         */
354         "add      %[step1_4],           %[step1_4],     %[step1_5]      \n\t"
355         "add      %[step1_7],           %[step1_7],     %[step1_6]      \n\t"
356 
357         "extp     %[step1_5],           $ac0,           31              \n\t"
358         "extp     %[step1_6],           $ac1,           31              \n\t"
359 
360         /* add block */
361         "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
362         "add      %[Temp0],             %[step1_0],     %[step1_7]      \n\t"
363         "addi     %[Temp0],             %[Temp0],       16              \n\t"
364         "sra      %[Temp0],             %[Temp0],       5               \n\t"
365         "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
366         "add      %[Temp0],             %[step1_1],     %[step1_6]      \n\t"
367         "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
368         "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
369         "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
370 
371         "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
372         "addi     %[Temp0],             %[Temp0],       16              \n\t"
373         "sra      %[Temp0],             %[Temp0],       5               \n\t"
374         "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
375         "add      %[Temp0],             %[step1_2],     %[step1_5]      \n\t"
376         "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
377         "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
378         "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
379 
380         "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
381         "addi     %[Temp0],             %[Temp0],       16              \n\t"
382         "sra      %[Temp0],             %[Temp0],       5               \n\t"
383         "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
384         "add      %[Temp0],             %[step1_3],     %[step1_4]      \n\t"
385         "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
386         "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
387         "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
388 
389         "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
390         "addi     %[Temp0],             %[Temp0],       16              \n\t"
391         "sra      %[Temp0],             %[Temp0],       5               \n\t"
392         "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
393         "sub      %[Temp0],             %[step1_3],     %[step1_4]      \n\t"
394         "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
395         "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
396         "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
397 
398         "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
399         "addi     %[Temp0],             %[Temp0],       16              \n\t"
400         "sra      %[Temp0],             %[Temp0],       5               \n\t"
401         "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
402         "sub      %[Temp0],             %[step1_2],     %[step1_5]      \n\t"
403         "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
404         "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
405         "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
406 
407         "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
408         "addi     %[Temp0],             %[Temp0],       16              \n\t"
409         "sra      %[Temp0],             %[Temp0],       5               \n\t"
410         "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
411         "sub      %[Temp0],             %[step1_1],     %[step1_6]      \n\t"
412         "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
413         "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
414         "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
415 
416         "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
417         "addi     %[Temp0],             %[Temp0],       16              \n\t"
418         "sra      %[Temp0],             %[Temp0],       5               \n\t"
419         "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
420         "sub      %[Temp0],             %[step1_0],     %[step1_7]      \n\t"
421         "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
422         "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
423         "addu     %[dest_pix],          %[dest_pix],    %[dest_stride]  \n\t"
424 
425         "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
426         "addi     %[Temp0],             %[Temp0],       16              \n\t"
427         "sra      %[Temp0],             %[Temp0],       5               \n\t"
428         "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
429         "lbux     %[Temp2],             %[Temp1](%[cm])                 \n\t"
430         "sb       %[Temp2],             0(%[dest_pix])                  \n\t"
431 
432         : [step1_0] "=&r" (step1_0), [step1_1] "=&r" (step1_1),
433           [step1_2] "=&r" (step1_2), [step1_3] "=&r" (step1_3),
434           [step1_4] "=&r" (step1_4), [step1_5] "=&r" (step1_5),
435           [step1_6] "=&r" (step1_6), [step1_7] "=&r" (step1_7),
436           [Temp0] "=&r" (Temp0), [Temp1] "=&r" (Temp1),
437           [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
438           [dest_pix] "+r" (dest_pix)
439         : [const_2_power_13] "r" (const_2_power_13),
440           [cospi_16_64] "r" (cospi_16_64), [cospi_28_64] "r" (cospi_28_64),
441           [cospi_4_64] "r" (cospi_4_64), [cospi_12_64] "r" (cospi_12_64),
442           [cospi_20_64] "r" (cospi_20_64), [cospi_8_64] "r" (cospi_8_64),
443           [cospi_24_64] "r" (cospi_24_64),
444           [input] "r" (input), [cm] "r" (cm), [dest_stride] "r" (dest_stride)
445     );
446 
447     input += 8;
448   }
449 }
450 
vp9_idct8x8_64_add_dspr2(const int16_t * input,uint8_t * dest,int dest_stride)451 void vp9_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
452                               int dest_stride) {
453   DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
454   int16_t *outptr = out;
455   uint32_t pos = 45;
456 
457   /* bit positon for extract from acc */
458   __asm__ __volatile__ (
459     "wrdsp    %[pos],    1    \n\t"
460     :
461     : [pos] "r" (pos)
462   );
463 
464   // First transform rows
465   idct8_rows_dspr2(input, outptr, 8);
466 
467   // Then transform columns and add to dest
468   idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride);
469 }
470 
iadst8_dspr2(const int16_t * input,int16_t * output)471 static void iadst8_dspr2(const int16_t *input, int16_t *output) {
472   int s0, s1, s2, s3, s4, s5, s6, s7;
473   int x0, x1, x2, x3, x4, x5, x6, x7;
474 
475   x0 = input[7];
476   x1 = input[0];
477   x2 = input[5];
478   x3 = input[2];
479   x4 = input[3];
480   x5 = input[4];
481   x6 = input[1];
482   x7 = input[6];
483 
484   if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
485     output[0] = output[1] = output[2] = output[3] = output[4]
486               = output[5] = output[6] = output[7] = 0;
487     return;
488   }
489 
490   // stage 1
491   s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
492   s1 = cospi_30_64 * x0 - cospi_2_64  * x1;
493   s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
494   s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
495   s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
496   s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
497   s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
498   s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
499 
500   x0 = ROUND_POWER_OF_TWO((s0 + s4), DCT_CONST_BITS);
501   x1 = ROUND_POWER_OF_TWO((s1 + s5), DCT_CONST_BITS);
502   x2 = ROUND_POWER_OF_TWO((s2 + s6), DCT_CONST_BITS);
503   x3 = ROUND_POWER_OF_TWO((s3 + s7), DCT_CONST_BITS);
504   x4 = ROUND_POWER_OF_TWO((s0 - s4), DCT_CONST_BITS);
505   x5 = ROUND_POWER_OF_TWO((s1 - s5), DCT_CONST_BITS);
506   x6 = ROUND_POWER_OF_TWO((s2 - s6), DCT_CONST_BITS);
507   x7 = ROUND_POWER_OF_TWO((s3 - s7), DCT_CONST_BITS);
508 
509   // stage 2
510   s0 = x0;
511   s1 = x1;
512   s2 = x2;
513   s3 = x3;
514   s4 =  cospi_8_64  * x4 + cospi_24_64 * x5;
515   s5 =  cospi_24_64 * x4 - cospi_8_64  * x5;
516   s6 = -cospi_24_64 * x6 + cospi_8_64  * x7;
517   s7 =  cospi_8_64  * x6 + cospi_24_64 * x7;
518 
519   x0 = s0 + s2;
520   x1 = s1 + s3;
521   x2 = s0 - s2;
522   x3 = s1 - s3;
523   x4 = ROUND_POWER_OF_TWO((s4 + s6), DCT_CONST_BITS);
524   x5 = ROUND_POWER_OF_TWO((s5 + s7), DCT_CONST_BITS);
525   x6 = ROUND_POWER_OF_TWO((s4 - s6), DCT_CONST_BITS);
526   x7 = ROUND_POWER_OF_TWO((s5 - s7), DCT_CONST_BITS);
527 
528   // stage 3
529   s2 = cospi_16_64 * (x2 + x3);
530   s3 = cospi_16_64 * (x2 - x3);
531   s6 = cospi_16_64 * (x6 + x7);
532   s7 = cospi_16_64 * (x6 - x7);
533 
534   x2 = ROUND_POWER_OF_TWO((s2), DCT_CONST_BITS);
535   x3 = ROUND_POWER_OF_TWO((s3), DCT_CONST_BITS);
536   x6 = ROUND_POWER_OF_TWO((s6), DCT_CONST_BITS);
537   x7 = ROUND_POWER_OF_TWO((s7), DCT_CONST_BITS);
538 
539   output[0] =  x0;
540   output[1] = -x4;
541   output[2] =  x6;
542   output[3] = -x2;
543   output[4] =  x3;
544   output[5] = -x7;
545   output[6] =  x5;
546   output[7] = -x1;
547 }
548 
vp9_iht8x8_64_add_dspr2(const int16_t * input,uint8_t * dest,int dest_stride,int tx_type)549 void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest,
550                              int dest_stride, int tx_type) {
551   int i, j;
552   DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
553   int16_t *outptr = out;
554   int16_t temp_in[8 * 8], temp_out[8];
555   uint32_t pos = 45;
556 
557   /* bit positon for extract from acc */
558   __asm__ __volatile__ (
559     "wrdsp    %[pos],    1    \n\t"
560     :
561     : [pos] "r" (pos)
562   );
563 
564   switch (tx_type) {
565     case DCT_DCT:     // DCT in both horizontal and vertical
566       idct8_rows_dspr2(input, outptr, 8);
567       idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride);
568       break;
569     case ADST_DCT:    // ADST in vertical, DCT in horizontal
570       idct8_rows_dspr2(input, outptr, 8);
571 
572       for (i = 0; i < 8; ++i) {
573         iadst8_dspr2(&out[i * 8], temp_out);
574 
575         for (j = 0; j < 8; ++j)
576           dest[j * dest_stride + i] =
577                     clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
578                                       + dest[j * dest_stride + i]);
579       }
580       break;
581     case DCT_ADST:    // DCT in vertical, ADST in horizontal
582       for (i = 0; i < 8; ++i) {
583         iadst8_dspr2(input, outptr);
584         input += 8;
585         outptr += 8;
586       }
587 
588       for (i = 0; i < 8; ++i) {
589         for (j = 0; j < 8; ++j) {
590           temp_in[i * 8 + j] = out[j * 8 + i];
591         }
592       }
593       idct8_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride);
594       break;
595     case ADST_ADST:   // ADST in both directions
596       for (i = 0; i < 8; ++i) {
597         iadst8_dspr2(input, outptr);
598         input += 8;
599         outptr += 8;
600       }
601 
602       for (i = 0; i < 8; ++i) {
603         for (j = 0; j < 8; ++j)
604           temp_in[j] = out[j * 8 + i];
605 
606         iadst8_dspr2(temp_in, temp_out);
607 
608         for (j = 0; j < 8; ++j)
609           dest[j * dest_stride + i] =
610                 clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
611                                       + dest[j * dest_stride + i]);
612       }
613       break;
614     default:
615       printf("vp9_short_iht8x8_add_dspr2 : Invalid tx_type\n");
616       break;
617   }
618 }
619 
vp9_idct8x8_10_add_dspr2(const int16_t * input,uint8_t * dest,int dest_stride)620 void vp9_idct8x8_10_add_dspr2(const int16_t *input, uint8_t *dest,
621                               int dest_stride) {
622   DECLARE_ALIGNED(32, int16_t, out[8 * 8]);
623   int16_t *outptr = out;
624   uint32_t pos = 45;
625 
626   /* bit positon for extract from acc */
627   __asm__ __volatile__ (
628     "wrdsp    %[pos],    1    \n\t"
629     :
630     : [pos] "r" (pos)
631   );
632 
633   // First transform rows
634   idct8_rows_dspr2(input, outptr, 4);
635 
636   outptr += 4;
637 
638   __asm__ __volatile__ (
639       "sw  $zero,   0(%[outptr])  \n\t"
640       "sw  $zero,   4(%[outptr])  \n\t"
641       "sw  $zero,  16(%[outptr])  \n\t"
642       "sw  $zero,  20(%[outptr])  \n\t"
643       "sw  $zero,  32(%[outptr])  \n\t"
644       "sw  $zero,  36(%[outptr])  \n\t"
645       "sw  $zero,  48(%[outptr])  \n\t"
646       "sw  $zero,  52(%[outptr])  \n\t"
647       "sw  $zero,  64(%[outptr])  \n\t"
648       "sw  $zero,  68(%[outptr])  \n\t"
649       "sw  $zero,  80(%[outptr])  \n\t"
650       "sw  $zero,  84(%[outptr])  \n\t"
651       "sw  $zero,  96(%[outptr])  \n\t"
652       "sw  $zero, 100(%[outptr])  \n\t"
653       "sw  $zero, 112(%[outptr])  \n\t"
654       "sw  $zero, 116(%[outptr])  \n\t"
655 
656       :
657       : [outptr] "r" (outptr)
658   );
659 
660 
661   // Then transform columns and add to dest
662   idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride);
663 }
664 
/*
 * DC-only 8x8 inverse transform: derives a single offset from input[0] and
 * adds it, with unsigned byte saturation, to every pixel of the 8x8 block.
 *
 * input        coefficient block; only input[0] is read
 * dest         destination pixels, updated in place; accessed with 32-bit
 *              loads/stores, so each row must be 4-byte aligned
 * dest_stride  distance in bytes between consecutive rows of dest
 */
void vp9_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest,
                             int dest_stride) {
  uint32_t pos = 45;
  int32_t out;
  int32_t r;
  int32_t a1, absa1;
  int32_t t1, t2, vector_a1, vector_1, vector_2;

  /* bit position for extract from acc */
  __asm__ __volatile__ (
    "wrdsp      %[pos],     1           \n\t"

    :
    : [pos] "r" (pos)
  );

  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]);

  /* a1 = ROUND_POWER_OF_TWO(out, 5) */
  __asm__ __volatile__ (
      "addi     %[out],     %[out],     16      \n\t"
      "sra      %[a1],      %[out],     5       \n\t"

      : [out] "+r" (out), [a1] "=r" (a1)
      :
  );

  /*
   * replv.qb replicates only the low 8 bits of its source register, so an
   * offset whose magnitude exceeds 255 would wrap to an unrelated byte value.
   * Clamping to +/-255 keeps the result identical to clip_pixel(dest + a1):
   * a saturating add (or subtract) of 255 already pins every 8-bit pixel to
   * 255 (or 0).
   */
  if (a1 > 255) {
    a1 = 255;
  } else if (a1 < -255) {
    a1 = -255;
  }

  if (a1 < 0) {
    /* use quad-byte
     * input and output memory are four byte aligned */
    __asm__ __volatile__ (
        "abs        %[absa1],       %[a1]       \n\t"
        "replv.qb   %[vector_a1],   %[absa1]    \n\t"

        : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
        : [a1] "r" (a1)
    );

    /* Negative offset: saturating subtract of |a1| from 8 pixels per row.
     * "memory" is clobbered because dest is read and written through a
     * register operand; addu is used for the pointer step since add traps
     * on signed overflow, which is meaningless for addresses. */
    for (r = 8; r--;) {
      __asm__ __volatile__ (
          "lw           %[t1],          0(%[dest])                      \n\t"
          "lw           %[t2],          4(%[dest])                      \n\t"
          "subu_s.qb    %[vector_1],    %[t1],          %[vector_a1]    \n\t"
          "subu_s.qb    %[vector_2],    %[t2],          %[vector_a1]    \n\t"
          "sw           %[vector_1],    0(%[dest])                      \n\t"
          "sw           %[vector_2],    4(%[dest])                      \n\t"
          "addu         %[dest],        %[dest],        %[dest_stride]  \n\t"

          : [t1] "=&r" (t1), [t2] "=&r" (t2),
            [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
            [dest] "+&r" (dest)
          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
          : "memory"
      );
    }
  } else {
    /* use quad-byte
     * input and output memory are four byte aligned */
    __asm__ __volatile__ (
        "replv.qb   %[vector_a1],   %[a1]   \n\t"

        : [vector_a1] "=r" (vector_a1)
        : [a1] "r" (a1)
    );

    /* Non-negative offset: saturating add of a1 to 8 pixels per row.
     * Same "memory" clobber and addu pointer step as the branch above. */
    for (r = 8; r--;) {
      __asm__ __volatile__ (
          "lw           %[t1],          0(%[dest])                      \n\t"
          "lw           %[t2],          4(%[dest])                      \n\t"
          "addu_s.qb    %[vector_1],    %[t1],          %[vector_a1]    \n\t"
          "addu_s.qb    %[vector_2],    %[t2],          %[vector_a1]    \n\t"
          "sw           %[vector_1],    0(%[dest])                      \n\t"
          "sw           %[vector_2],    4(%[dest])                      \n\t"
          "addu         %[dest],        %[dest],        %[dest_stride]  \n\t"

          : [t1] "=&r" (t1), [t2] "=&r" (t2),
            [vector_1] "=&r" (vector_1), [vector_2] "=&r" (vector_2),
            [dest] "+&r" (dest)
          : [dest_stride] "r" (dest_stride), [vector_a1] "r" (vector_a1)
          : "memory"
      );
    }
  }
}
745 #endif  // #if HAVE_DSPR2
746