/*
 * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include "vp8_rtcd.h"

#if HAVE_DSPR2
#define CROP_WIDTH 256

/******************************************************************************
 * Notes:
 *
 * This implementation uses 16-bit fixed-point versions of two multiply
 * constants:
 * 1. sqrt(2) * cos(pi/8)
 * 2. sqrt(2) * sin(pi/8)
 * Because the first constant is greater than 1, it cannot be stored at the
 * same 16-bit fixed-point precision as the second one, so we use the
 * identity
 *     x * a = x + x * (a - 1)
 * and compute
 *     x * sqrt(2) * cos(pi/8) = x + x * (sqrt(2) * cos(pi/8) - 1).
 ****************************************************************************/
extern unsigned char ff_cropTbl[256 + 2 * CROP_WIDTH];
static const int cospi8sqrt2minus1 = 20091;
static const int sinpi8sqrt2 = 35468;
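
/*
 * For reference (an assumed derivation, consistent with the notes above),
 * both constants are Q16 roundings:
 *     (sqrt(2) * cos(pi/8) - 1) * 65536 = 0.30656... * 65536 ~= 20091
 *      sqrt(2) * sin(pi/8)      * 65536 = 0.54119... * 65536 ~= 35468
 * The identity from the notes appears in the code below as
 *     x + ((x * cospi8sqrt2minus1) >> 16)
 * wherever a multiply by sqrt(2) * cos(pi/8) is needed.
 */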

/* prefetch the cache line containing src (MIPS "pref" hint 0: load) */
inline void prefetch_load_short(short *src)
{
    __asm__ __volatile__ (
        "pref   0,  0(%[src])   \n\t"
        :
        : [src] "r" (src)
    );
}

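/*
 * 4x4 inverse DCT with prediction add: an unrolled column pass, an unrolled
 * row pass with final rounding ((x + 4) >> 3), then each result is added to
 * the predictor and clamped to [0, 255] through the crop table.
 */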
void vp8_short_idct4x4llm_dspr2(short *input, unsigned char *pred_ptr,
                                int pred_stride, unsigned char *dst_ptr,
                                int dst_stride)
{
    int r, c;
    int a1, b1, c1, d1;
    short output[16];
    short *ip = input;
    short *op = output;
    int temp1, temp2;
    int shortpitch = 4;

    int c2, d2;
    int temp3, temp4;
    unsigned char *cm = ff_cropTbl + CROP_WIDTH; /* cm[x] clamps x to [0, 255] */

    /* prepare data for load */
    prefetch_load_short(ip + 8);

    /* first (column) pass is unrolled */
    a1 = ip[0] + ip[8];
    b1 = ip[0] - ip[8];

    temp1 = (ip[4] * sinpi8sqrt2) >> 16;
    temp2 = ip[12] + ((ip[12] * cospi8sqrt2minus1) >> 16);
    c1 = temp1 - temp2;

    temp1 = ip[4] + ((ip[4] * cospi8sqrt2minus1) >> 16);
    temp2 = (ip[12] * sinpi8sqrt2) >> 16;
    d1 = temp1 + temp2;

    temp3 = (ip[5] * sinpi8sqrt2) >> 16;
    temp4 = ip[13] + ((ip[13] * cospi8sqrt2minus1) >> 16);
    c2 = temp3 - temp4;

    temp3 = ip[5] + ((ip[5] * cospi8sqrt2minus1) >> 16);
    temp4 = (ip[13] * sinpi8sqrt2) >> 16;
    d2 = temp3 + temp4;

    op[0] = a1 + d1;
    op[12] = a1 - d1;
    op[4] = b1 + c1;
    op[8] = b1 - c1;

    a1 = ip[1] + ip[9];
    b1 = ip[1] - ip[9];

    op[1] = a1 + d2;
    op[13] = a1 - d2;
    op[5] = b1 + c2;
    op[9] = b1 - c2;

    a1 = ip[2] + ip[10];
    b1 = ip[2] - ip[10];

    temp1 = (ip[6] * sinpi8sqrt2) >> 16;
    temp2 = ip[14] + ((ip[14] * cospi8sqrt2minus1) >> 16);
    c1 = temp1 - temp2;

    temp1 = ip[6] + ((ip[6] * cospi8sqrt2minus1) >> 16);
    temp2 = (ip[14] * sinpi8sqrt2) >> 16;
    d1 = temp1 + temp2;

    temp3 = (ip[7] * sinpi8sqrt2) >> 16;
    temp4 = ip[15] + ((ip[15] * cospi8sqrt2minus1) >> 16);
    c2 = temp3 - temp4;

    temp3 = ip[7] + ((ip[7] * cospi8sqrt2minus1) >> 16);
    temp4 = (ip[15] * sinpi8sqrt2) >> 16;
    d2 = temp3 + temp4;

    op[2] = a1 + d1;
    op[14] = a1 - d1;
    op[6] = b1 + c1;
    op[10] = b1 - c1;

    a1 = ip[3] + ip[11];
    b1 = ip[3] - ip[11];

    op[3] = a1 + d2;
    op[15] = a1 - d2;
    op[7] = b1 + c2;
    op[11] = b1 - c2;

    ip = output;

    /* prepare data for load */
    prefetch_load_short(ip + shortpitch);

    /* second (row) pass is unrolled and applies the final rounding */
    a1 = ip[0] + ip[2];
    b1 = ip[0] - ip[2];

    temp1 = (ip[1] * sinpi8sqrt2) >> 16;
    temp2 = ip[3] + ((ip[3] * cospi8sqrt2minus1) >> 16);
    c1 = temp1 - temp2;

    temp1 = ip[1] + ((ip[1] * cospi8sqrt2minus1) >> 16);
    temp2 = (ip[3] * sinpi8sqrt2) >> 16;
    d1 = temp1 + temp2;

    temp3 = (ip[5] * sinpi8sqrt2) >> 16;
    temp4 = ip[7] + ((ip[7] * cospi8sqrt2minus1) >> 16);
    c2 = temp3 - temp4;

    temp3 = ip[5] + ((ip[5] * cospi8sqrt2minus1) >> 16);
    temp4 = (ip[7] * sinpi8sqrt2) >> 16;
    d2 = temp3 + temp4;

    op[0] = (a1 + d1 + 4) >> 3;
    op[3] = (a1 - d1 + 4) >> 3;
    op[1] = (b1 + c1 + 4) >> 3;
    op[2] = (b1 - c1 + 4) >> 3;

    a1 = ip[4] + ip[6];
    b1 = ip[4] - ip[6];

    op[4] = (a1 + d2 + 4) >> 3;
    op[7] = (a1 - d2 + 4) >> 3;
    op[5] = (b1 + c2 + 4) >> 3;
    op[6] = (b1 - c2 + 4) >> 3;

    a1 = ip[8] + ip[10];
    b1 = ip[8] - ip[10];

    temp1 = (ip[9] * sinpi8sqrt2) >> 16;
    temp2 = ip[11] + ((ip[11] * cospi8sqrt2minus1) >> 16);
    c1 = temp1 - temp2;

    temp1 = ip[9] + ((ip[9] * cospi8sqrt2minus1) >> 16);
    temp2 = (ip[11] * sinpi8sqrt2) >> 16;
    d1 = temp1 + temp2;

    temp3 = (ip[13] * sinpi8sqrt2) >> 16;
    temp4 = ip[15] + ((ip[15] * cospi8sqrt2minus1) >> 16);
    c2 = temp3 - temp4;

    temp3 = ip[13] + ((ip[13] * cospi8sqrt2minus1) >> 16);
    temp4 = (ip[15] * sinpi8sqrt2) >> 16;
    d2 = temp3 + temp4;

    op[8] = (a1 + d1 + 4) >> 3;
    op[11] = (a1 - d1 + 4) >> 3;
    op[9] = (b1 + c1 + 4) >> 3;
    op[10] = (b1 - c1 + 4) >> 3;

    a1 = ip[12] + ip[14];
    b1 = ip[12] - ip[14];

    op[12] = (a1 + d2 + 4) >> 3;
    op[15] = (a1 - d2 + 4) >> 3;
    op[13] = (b1 + c2 + 4) >> 3;
    op[14] = (b1 - c2 + 4) >> 3;

    ip = output;

    /* add the residual to the prediction and clamp to [0, 255] */
    for (r = 0; r < 4; r++)
    {
        for (c = 0; c < 4; c++)
        {
            short a = ip[c] + pred_ptr[c];
            dst_ptr[c] = cm[a];
        }

        ip += 4;
        dst_ptr += dst_stride;
        pred_ptr += pred_stride;
    }
}

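/*
 * DC-only path: round the single DC coefficient, replicate it into all four
 * bytes of a register with replv.qb, then combine it with each 4-byte row of
 * the predictor using the DSPr2 saturating quad-byte adds/subtracts
 * (addu_s.qb / subu_s.qb).
 */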
void vp8_dc_only_idct_add_dspr2(short input_dc, unsigned char *pred_ptr,
                                int pred_stride, unsigned char *dst_ptr,
                                int dst_stride)
{
    int a1;
    int i, absa1;
    int t2, vector_a1, vector_a;

    /* a1 = ((input_dc + 4) >> 3); */
    __asm__ __volatile__ (
        "addi   %[a1],  %[input_dc],    4   \n\t"
        "sra    %[a1],  %[a1],          3   \n\t"
        : [a1] "=r" (a1)
        : [input_dc] "r" (input_dc)
    );

    if (a1 < 0)
    {
        /* use quad-byte operations;
         * input and output memory are four-byte aligned
         */
        __asm__ __volatile__ (
            "abs        %[absa1],       %[a1]       \n\t"
            "replv.qb   %[vector_a1],   %[absa1]    \n\t"
            : [absa1] "=r" (absa1), [vector_a1] "=r" (vector_a1)
            : [a1] "r" (a1)
        );

        /* use (a1 - pred_ptr[c]) instead of (a1 + pred_ptr[c]) */
        for (i = 4; i--;)
        {
            __asm__ __volatile__ (
                "lw         %[t2],          0(%[pred_ptr])                  \n\t"
                "add        %[pred_ptr],    %[pred_ptr],    %[pred_stride]  \n\t"
                "subu_s.qb  %[vector_a],    %[t2],          %[vector_a1]    \n\t"
                "sw         %[vector_a],    0(%[dst_ptr])                   \n\t"
                "add        %[dst_ptr],     %[dst_ptr],     %[dst_stride]   \n\t"
                : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a),
                  [dst_ptr] "+&r" (dst_ptr), [pred_ptr] "+&r" (pred_ptr)
                : [dst_stride] "r" (dst_stride), [pred_stride] "r" (pred_stride),
                  [vector_a1] "r" (vector_a1)
            );
        }
    }
    else
    {
        /* use quad-byte operations;
         * input and output memory are four-byte aligned
         */
        __asm__ __volatile__ (
            "replv.qb   %[vector_a1],   %[a1]   \n\t"
            : [vector_a1] "=r" (vector_a1)
            : [a1] "r" (a1)
        );

        for (i = 4; i--;)
        {
            __asm__ __volatile__ (
                "lw         %[t2],          0(%[pred_ptr])                  \n\t"
                "add        %[pred_ptr],    %[pred_ptr],    %[pred_stride]  \n\t"
                "addu_s.qb  %[vector_a],    %[vector_a1],   %[t2]           \n\t"
                "sw         %[vector_a],    0(%[dst_ptr])                   \n\t"
                "add        %[dst_ptr],     %[dst_ptr],     %[dst_stride]   \n\t"
                : [t2] "=&r" (t2), [vector_a] "=&r" (vector_a),
                  [dst_ptr] "+&r" (dst_ptr), [pred_ptr] "+&r" (pred_ptr)
                : [dst_stride] "r" (dst_stride), [pred_stride] "r" (pred_stride),
                  [vector_a1] "r" (vector_a1)
            );
        }
    }
}

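/*
 * 4x4 inverse Walsh-Hadamard transform of the second-order DC block. The 16
 * results are scattered to the DC position of each of the macroblock's 16
 * dequantized 4x4 blocks (mb_dqcoeff holds 16 coefficients per block).
 */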
void vp8_short_inv_walsh4x4_dspr2(short *input, short *mb_dqcoeff)
{
    short output[16];
    int i;
    int a1, b1, c1, d1;
    int a2, b2, c2, d2;
    short *ip = input;
    short *op = output;

    prefetch_load_short(ip);

    /* column pass */
    for (i = 4; i--;)
    {
        a1 = ip[0] + ip[12];
        b1 = ip[4] + ip[8];
        c1 = ip[4] - ip[8];
        d1 = ip[0] - ip[12];

        op[0] = a1 + b1;
        op[4] = c1 + d1;
        op[8] = a1 - b1;
        op[12] = d1 - c1;

        ip++;
        op++;
    }

    ip = output;
    op = output;

    prefetch_load_short(ip);

    /* row pass, with the +3 rounding folded into a1 and d1 */
    for (i = 4; i--;)
    {
        a1 = ip[0] + ip[3] + 3;
        b1 = ip[1] + ip[2];
        c1 = ip[1] - ip[2];
        d1 = ip[0] - ip[3] + 3;

        a2 = a1 + b1;
        b2 = d1 + c1;
        c2 = a1 - b1;
        d2 = d1 - c1;

        op[0] = a2 >> 3;
        op[1] = b2 >> 3;
        op[2] = c2 >> 3;
        op[3] = d2 >> 3;

        ip += 4;
        op += 4;
    }

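    /* scatter the results: output[i] becomes the DC term of block i */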
    for (i = 0; i < 16; i++)
    {
        mb_dqcoeff[i * 16] = output[i];
    }
}

void vp8_short_inv_walsh4x4_1_dspr2(short *input, short *mb_dqcoeff)
{
    int a1;

    a1 = ((input[0] + 3) >> 3);

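    /*
     * Store the rounded DC term at the DC position of all 16 blocks;
     * consecutive blocks are 16 shorts (32 bytes) apart in mb_dqcoeff,
     * hence the offsets 0, 32, 64, ..., 480.
     */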
    __asm__ __volatile__ (
        "sh     %[a1],  0(%[mb_dqcoeff])    \n\t"
        "sh     %[a1],  32(%[mb_dqcoeff])   \n\t"
        "sh     %[a1],  64(%[mb_dqcoeff])   \n\t"
        "sh     %[a1],  96(%[mb_dqcoeff])   \n\t"
        "sh     %[a1],  128(%[mb_dqcoeff])  \n\t"
        "sh     %[a1],  160(%[mb_dqcoeff])  \n\t"
        "sh     %[a1],  192(%[mb_dqcoeff])  \n\t"
        "sh     %[a1],  224(%[mb_dqcoeff])  \n\t"
        "sh     %[a1],  256(%[mb_dqcoeff])  \n\t"
        "sh     %[a1],  288(%[mb_dqcoeff])  \n\t"
        "sh     %[a1],  320(%[mb_dqcoeff])  \n\t"
        "sh     %[a1],  352(%[mb_dqcoeff])  \n\t"
        "sh     %[a1],  384(%[mb_dqcoeff])  \n\t"
        "sh     %[a1],  416(%[mb_dqcoeff])  \n\t"
        "sh     %[a1],  448(%[mb_dqcoeff])  \n\t"
        "sh     %[a1],  480(%[mb_dqcoeff])  \n\t"
        :
        : [a1] "r" (a1), [mb_dqcoeff] "r" (mb_dqcoeff)
    );
}

#endif