1 /*
2  *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "./vpx_config.h"
12 #include "./vpx_dsp_rtcd.h"
13 #include "vpx_dsp/mips/inv_txfm_dspr2.h"
14 #include "vpx_dsp/txfm_common.h"
15 
16 #if HAVE_DSPR2
vpx_idct4_rows_dspr2(const int16_t * input,int16_t * output)17 void vpx_idct4_rows_dspr2(const int16_t *input, int16_t *output) {
18   int step_0, step_1, step_2, step_3;
19   int Temp0, Temp1, Temp2, Temp3;
20   const int const_2_power_13 = 8192;
21   int i;
22 
23   for (i = 4; i--;) {
24     __asm__ __volatile__(
25         /*
26           temp_1 = (input[0] + input[2]) * cospi_16_64;
27           step_0 = dct_const_round_shift(temp_1);
28 
29           temp_2 = (input[0] - input[2]) * cospi_16_64;
30           step_1 = dct_const_round_shift(temp_2);
31         */
32         "lh       %[Temp0],             0(%[input])                     \n\t"
33         "lh       %[Temp1],             4(%[input])                     \n\t"
34         "mtlo     %[const_2_power_13],  $ac0                            \n\t"
35         "mthi     $zero,                $ac0                            \n\t"
36         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
37         "mthi     $zero,                $ac1                            \n\t"
38         "add      %[Temp2],             %[Temp0],       %[Temp1]        \n\t"
39         "sub      %[Temp3],             %[Temp0],       %[Temp1]        \n\t"
40         "madd     $ac0,                 %[Temp2],       %[cospi_16_64]  \n\t"
41         "lh       %[Temp0],             2(%[input])                     \n\t"
42         "lh       %[Temp1],             6(%[input])                     \n\t"
43         "extp     %[step_0],            $ac0,           31              \n\t"
44         "mtlo     %[const_2_power_13],  $ac0                            \n\t"
45         "mthi     $zero,                $ac0                            \n\t"
46 
47         "madd     $ac1,                 %[Temp3],       %[cospi_16_64]  \n\t"
48         "extp     %[step_1],            $ac1,           31              \n\t"
49         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
50         "mthi     $zero,                $ac1                            \n\t"
51 
52         /*
53           temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
54           step_2 = dct_const_round_shift(temp1);
55         */
56         "madd     $ac0,                 %[Temp0],       %[cospi_24_64]  \n\t"
57         "msub     $ac0,                 %[Temp1],       %[cospi_8_64]   \n\t"
58         "extp     %[step_2],            $ac0,           31              \n\t"
59 
60         /*
61           temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
62           step_3 = dct_const_round_shift(temp2);
63         */
64         "madd     $ac1,                 %[Temp0],       %[cospi_8_64]   \n\t"
65         "madd     $ac1,                 %[Temp1],       %[cospi_24_64]  \n\t"
66         "extp     %[step_3],            $ac1,           31              \n\t"
67 
68         /*
69           output[0]  = step_0 + step_3;
70           output[4]  = step_1 + step_2;
71           output[8]  = step_1 - step_2;
72           output[12] = step_0 - step_3;
73         */
74         "add      %[Temp0],             %[step_0],      %[step_3]       \n\t"
75         "sh       %[Temp0],             0(%[output])                    \n\t"
76 
77         "add      %[Temp1],             %[step_1],      %[step_2]       \n\t"
78         "sh       %[Temp1],             8(%[output])                    \n\t"
79 
80         "sub      %[Temp2],             %[step_1],      %[step_2]       \n\t"
81         "sh       %[Temp2],             16(%[output])                   \n\t"
82 
83         "sub      %[Temp3],             %[step_0],      %[step_3]       \n\t"
84         "sh       %[Temp3],             24(%[output])                   \n\t"
85 
86         : [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
87           [Temp3] "=&r"(Temp3), [step_0] "=&r"(step_0), [step_1] "=&r"(step_1),
88           [step_2] "=&r"(step_2), [step_3] "=&r"(step_3), [output] "+r"(output)
89         : [const_2_power_13] "r"(const_2_power_13),
90           [cospi_8_64] "r"(cospi_8_64), [cospi_16_64] "r"(cospi_16_64),
91           [cospi_24_64] "r"(cospi_24_64), [input] "r"(input));
92 
93     input += 4;
94     output += 1;
95   }
96 }
97 
vpx_idct4_columns_add_blk_dspr2(int16_t * input,uint8_t * dest,int stride)98 void vpx_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest,
99                                      int stride) {
100   int step_0, step_1, step_2, step_3;
101   int Temp0, Temp1, Temp2, Temp3;
102   const int const_2_power_13 = 8192;
103   const int const_255 = 255;
104   int i;
105   uint8_t *dest_pix;
106 
107   for (i = 0; i < 4; ++i) {
108     dest_pix = (dest + i);
109 
110     __asm__ __volatile__(
111         /*
112           temp_1 = (input[0] + input[2]) * cospi_16_64;
113           step_0 = dct_const_round_shift(temp_1);
114 
115           temp_2 = (input[0] - input[2]) * cospi_16_64;
116           step_1 = dct_const_round_shift(temp_2);
117         */
118         "lh       %[Temp0],             0(%[input])                     \n\t"
119         "lh       %[Temp1],             4(%[input])                     \n\t"
120         "mtlo     %[const_2_power_13],  $ac0                            \n\t"
121         "mthi     $zero,                $ac0                            \n\t"
122         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
123         "mthi     $zero,                $ac1                            \n\t"
124         "add      %[Temp2],             %[Temp0],       %[Temp1]        \n\t"
125         "sub      %[Temp3],             %[Temp0],       %[Temp1]        \n\t"
126         "madd     $ac0,                 %[Temp2],       %[cospi_16_64]  \n\t"
127         "lh       %[Temp0],             2(%[input])                     \n\t"
128         "lh       %[Temp1],             6(%[input])                     \n\t"
129         "extp     %[step_0],            $ac0,           31              \n\t"
130         "mtlo     %[const_2_power_13],  $ac0                            \n\t"
131         "mthi     $zero,                $ac0                            \n\t"
132 
133         "madd     $ac1,                 %[Temp3],       %[cospi_16_64]  \n\t"
134         "extp     %[step_1],            $ac1,           31              \n\t"
135         "mtlo     %[const_2_power_13],  $ac1                            \n\t"
136         "mthi     $zero,                $ac1                            \n\t"
137 
138         /*
139           temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
140           step_2 = dct_const_round_shift(temp1);
141         */
142         "madd     $ac0,                 %[Temp0],       %[cospi_24_64]  \n\t"
143         "msub     $ac0,                 %[Temp1],       %[cospi_8_64]   \n\t"
144         "extp     %[step_2],            $ac0,           31              \n\t"
145 
146         /*
147           temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
148           step_3 = dct_const_round_shift(temp2);
149         */
150         "madd     $ac1,                 %[Temp0],       %[cospi_8_64]   \n\t"
151         "madd     $ac1,                 %[Temp1],       %[cospi_24_64]  \n\t"
152         "extp     %[step_3],            $ac1,           31              \n\t"
153 
154         /*
155           output[0]  = step_0 + step_3;
156           output[4]  = step_1 + step_2;
157           output[8]  = step_1 - step_2;
158           output[12] = step_0 - step_3;
159         */
160         "add      %[Temp0],             %[step_0],      %[step_3]       \n\t"
161         "addi     %[Temp0],             %[Temp0],       8               \n\t"
162         "sra      %[Temp0],             %[Temp0],       4               \n\t"
163         "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
164         "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
165         "slt      %[Temp2],             %[Temp1],       %[const_255]    \n\t"
166         "slt      %[Temp3],             $zero,          %[Temp1]        \n\t"
167         "movz     %[Temp1],             %[const_255],   %[Temp2]        \n\t"
168         "movz     %[Temp1],             $zero,          %[Temp3]        \n\t"
169         "sb       %[Temp1],             0(%[dest_pix])                  \n\t"
170         "addu     %[dest_pix],          %[dest_pix],    %[stride]       \n\t"
171 
172         "add      %[Temp0],             %[step_1],      %[step_2]       \n\t"
173         "addi     %[Temp0],             %[Temp0],       8               \n\t"
174         "sra      %[Temp0],             %[Temp0],       4               \n\t"
175         "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
176         "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
177         "slt      %[Temp2],             %[Temp1],       %[const_255]    \n\t"
178         "slt      %[Temp3],             $zero,          %[Temp1]        \n\t"
179         "movz     %[Temp1],             %[const_255],   %[Temp2]        \n\t"
180         "movz     %[Temp1],             $zero,          %[Temp3]        \n\t"
181         "sb       %[Temp1],             0(%[dest_pix])                  \n\t"
182         "addu     %[dest_pix],          %[dest_pix],    %[stride]       \n\t"
183 
184         "sub      %[Temp0],             %[step_1],      %[step_2]       \n\t"
185         "addi     %[Temp0],             %[Temp0],       8               \n\t"
186         "sra      %[Temp0],             %[Temp0],       4               \n\t"
187         "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
188         "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
189         "slt      %[Temp2],             %[Temp1],       %[const_255]    \n\t"
190         "slt      %[Temp3],             $zero,          %[Temp1]        \n\t"
191         "movz     %[Temp1],             %[const_255],   %[Temp2]        \n\t"
192         "movz     %[Temp1],             $zero,          %[Temp3]        \n\t"
193         "sb       %[Temp1],             0(%[dest_pix])                  \n\t"
194         "addu     %[dest_pix],          %[dest_pix],    %[stride]       \n\t"
195 
196         "sub      %[Temp0],             %[step_0],      %[step_3]       \n\t"
197         "addi     %[Temp0],             %[Temp0],       8               \n\t"
198         "sra      %[Temp0],             %[Temp0],       4               \n\t"
199         "lbu      %[Temp1],             0(%[dest_pix])                  \n\t"
200         "add      %[Temp1],             %[Temp1],       %[Temp0]        \n\t"
201         "slt      %[Temp2],             %[Temp1],       %[const_255]    \n\t"
202         "slt      %[Temp3],             $zero,          %[Temp1]        \n\t"
203         "movz     %[Temp1],             %[const_255],   %[Temp2]        \n\t"
204         "movz     %[Temp1],             $zero,          %[Temp3]        \n\t"
205         "sb       %[Temp1],             0(%[dest_pix])                  \n\t"
206 
207         : [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
208           [Temp3] "=&r"(Temp3), [step_0] "=&r"(step_0), [step_1] "=&r"(step_1),
209           [step_2] "=&r"(step_2), [step_3] "=&r"(step_3),
210           [dest_pix] "+r"(dest_pix)
211         : [const_2_power_13] "r"(const_2_power_13), [const_255] "r"(const_255),
212           [cospi_8_64] "r"(cospi_8_64), [cospi_16_64] "r"(cospi_16_64),
213           [cospi_24_64] "r"(cospi_24_64), [input] "r"(input),
214           [stride] "r"(stride));
215 
216     input += 4;
217   }
218 }
219 
vpx_idct4x4_16_add_dspr2(const int16_t * input,uint8_t * dest,int stride)220 void vpx_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, int stride) {
221   DECLARE_ALIGNED(32, int16_t, out[4 * 4]);
222   int16_t *outptr = out;
223   uint32_t pos = 45;
224 
225   /* bit positon for extract from acc */
226   __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
227                        :
228                        : [pos] "r"(pos));
229 
230   // Rows
231   vpx_idct4_rows_dspr2(input, outptr);
232 
233   // Columns
234   vpx_idct4_columns_add_blk_dspr2(&out[0], dest, stride);
235 }
236 
/*
 * DC-only 4x4 inverse DCT: derives a single offset a1 from input[0] and adds
 * it, with saturation to [0, 255], to every pixel of the 4x4 block at |dest|.
 *
 * a1 is (out + 8) >> 4, where out is the result of
 * DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input[0]) (defined outside this
 * file). Each of the four rows is processed as one 32-bit word using
 * quad-byte saturating add/subtract, so every row start (dest + k * stride)
 * must be suitable for "lw"/"sw", i.e. four-byte aligned.
 *
 * input:  coefficient block; only input[0] is read.
 * dest:   top-left pixel of the 4x4 block to update in place.
 * stride: distance in bytes between rows of |dest|.
 */
void vpx_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest, int stride) {
  int a1, absa1;
  int r;
  int32_t out;
  int t2, vector_a1, vector_a;
  uint32_t pos = 45;
  int16_t input_dc = input[0];

  /* bit position for extract from acc */
  __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"

                       :
                       : [pos] "r"(pos));

  out = DCT_CONST_ROUND_SHIFT_TWICE_COSPI_16_64(input_dc);
  /* a1 = (out + 8) >> 4 */
  __asm__ __volatile__(
      "addi     %[out],     %[out],    8       \n\t"
      "sra      %[a1],      %[out],    4       \n\t"

      : [out] "+r"(out), [a1] "=r"(a1)
      :);

  if (a1 < 0) {
    /* use quad-byte
     * input and output memory are four byte aligned */

    /* replv.qb replicates only the low 8 bits of its source, so a magnitude
     * above 255 would wrap to a smaller value. Subtracting 255 with
     * saturation already drives every pixel to 0, hence the clamp. */
    absa1 = -a1;
    if (absa1 > 255) absa1 = 255;

    __asm__ __volatile__("replv.qb   %[vector_a1], %[absa1]      \n\t"
                         : [vector_a1] "=r"(vector_a1)
                         : [absa1] "r"(absa1));

    for (r = 4; r--;) {
      __asm__ __volatile__(
          "lw             %[t2],          0(%[dest])                      \n\t"
          "subu_s.qb      %[vector_a],    %[t2],          %[vector_a1]    \n\t"
          "sw             %[vector_a],    0(%[dest])                      \n\t"
          "add            %[dest],        %[dest],        %[stride]       \n\t"

          : [t2] "=&r"(t2), [vector_a] "=&r"(vector_a), [dest] "+&r"(dest)
          : [stride] "r"(stride), [vector_a1] "r"(vector_a1)
          /* the asm reads and writes the pixels at |dest| */
          : "memory");
    }
  } else if (a1 > 255) {
    int32_t a11, a12, vector_a11, vector_a12;

    /* use quad-byte
     * input and output memory are four byte aligned */

    /* a1 does not fit in a byte, so it is applied as seven saturating adds
     * of a11 followed by one of a12, where 7 * a11 + a12 == a1. */
    a11 = a1 >> 3;
    a12 = a1 - (a11 * 7);

    __asm__ __volatile__(
        "replv.qb       %[vector_a11],  %[a11]     \n\t"
        "replv.qb       %[vector_a12],  %[a12]     \n\t"

        : [vector_a11] "=&r"(vector_a11), [vector_a12] "=&r"(vector_a12)
        : [a11] "r"(a11), [a12] "r"(a12));

    for (r = 4; r--;) {
      __asm__ __volatile__(
          /* Load the same word that is stored below. The load previously
           * used offset 4, which read the four bytes after the row. */
          "lw             %[t2],          0(%[dest])                      \n\t"
          "addu_s.qb      %[vector_a],    %[t2],          %[vector_a11]   \n\t"
          "addu_s.qb      %[vector_a],    %[vector_a],    %[vector_a11]   \n\t"
          "addu_s.qb      %[vector_a],    %[vector_a],    %[vector_a11]   \n\t"
          "addu_s.qb      %[vector_a],    %[vector_a],    %[vector_a11]   \n\t"
          "addu_s.qb      %[vector_a],    %[vector_a],    %[vector_a11]   \n\t"
          "addu_s.qb      %[vector_a],    %[vector_a],    %[vector_a11]   \n\t"
          "addu_s.qb      %[vector_a],    %[vector_a],    %[vector_a11]   \n\t"
          "addu_s.qb      %[vector_a],    %[vector_a],    %[vector_a12]   \n\t"
          "sw             %[vector_a],    0(%[dest])                      \n\t"
          "add            %[dest],        %[dest],        %[stride]       \n\t"

          : [t2] "=&r"(t2), [vector_a] "=&r"(vector_a), [dest] "+&r"(dest)
          : [stride] "r"(stride), [vector_a11] "r"(vector_a11),
            [vector_a12] "r"(vector_a12)
          /* the asm reads and writes the pixels at |dest| */
          : "memory");
    }
  } else {
    /* use quad-byte
     * input and output memory are four byte aligned */
    __asm__ __volatile__("replv.qb       %[vector_a1],   %[a1]     \n\t"
                         : [vector_a1] "=r"(vector_a1)
                         : [a1] "r"(a1));

    for (r = 4; r--;) {
      __asm__ __volatile__(
          "lw           %[t2],          0(%[dest])                        \n\t"
          "addu_s.qb    %[vector_a],    %[t2],            %[vector_a1]    \n\t"
          "sw           %[vector_a],    0(%[dest])                        \n\t"
          "add          %[dest],        %[dest],          %[stride]       \n\t"

          : [t2] "=&r"(t2), [vector_a] "=&r"(vector_a), [dest] "+&r"(dest)
          : [stride] "r"(stride), [vector_a1] "r"(vector_a1)
          /* the asm reads and writes the pixels at |dest| */
          : "memory");
    }
  }
}
331 
iadst4_dspr2(const int16_t * input,int16_t * output)332 void iadst4_dspr2(const int16_t *input, int16_t *output) {
333   int s0, s1, s2, s3, s4, s5, s6, s7;
334   int x0, x1, x2, x3;
335 
336   x0 = input[0];
337   x1 = input[1];
338   x2 = input[2];
339   x3 = input[3];
340 
341   if (!(x0 | x1 | x2 | x3)) {
342     output[0] = output[1] = output[2] = output[3] = 0;
343     return;
344   }
345 
346   // 32-bit result is enough for the following multiplications.
347   s0 = sinpi_1_9 * x0;
348   s1 = sinpi_2_9 * x0;
349   s2 = sinpi_3_9 * x1;
350   s3 = sinpi_4_9 * x2;
351   s4 = sinpi_1_9 * x2;
352   s5 = sinpi_2_9 * x3;
353   s6 = sinpi_4_9 * x3;
354   s7 = x0 - x2 + x3;
355 
356   x0 = s0 + s3 + s5;
357   x1 = s1 - s4 - s6;
358   x2 = sinpi_3_9 * s7;
359   x3 = s2;
360 
361   s0 = x0 + x3;
362   s1 = x1 + x3;
363   s2 = x2;
364   s3 = x0 + x1 - x3;
365 
366   // 1-D transform scaling factor is sqrt(2).
367   // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
368   // + 1b (addition) = 29b.
369   // Hence the output bit depth is 15b.
370   output[0] = dct_const_round_shift(s0);
371   output[1] = dct_const_round_shift(s1);
372   output[2] = dct_const_round_shift(s2);
373   output[3] = dct_const_round_shift(s3);
374 }
375 #endif  // #if HAVE_DSPR2
376