1 /*
2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11
12 #include <stdlib.h>
13 #include "vp8_rtcd.h"
14 #include "vpx_ports/mem.h"
15
#if HAVE_DSPR2
/* Width of each saturation guard band around the 256 in-range byte values. */
#define CROP_WIDTH 256

/* Clamping lookup table, filled by dsputil_static_init():
 * (ff_cropTbl + CROP_WIDTH)[x] == clamp(x, 0, 255) for x in
 * [-CROP_WIDTH, 255 + CROP_WIDTH).  Used via the "cm" pointer in the
 * filter functions below to saturate filter results to a byte. */
unsigned char ff_cropTbl[256 + 2 * CROP_WIDTH];

/* VP8 six-tap sub-pixel filter coefficient magnitudes, two 8-bit values
 * packed per 16-bit halfword (e.g. 0x7b0c = 123, 12), one row per
 * sub-pixel offset 0..7.  Rows whose first entry is 0 correspond to the
 * short (4-tap) filters. */
static const unsigned short sub_pel_filterss[8][3] =
{
    { 0,      0,      0     },
    { 0,      0x0601, 0x7b0c},
    { 0x0201, 0x0b08, 0x6c24},
    { 0,      0x0906, 0x5d32},
    { 0x0303, 0x1010, 0x4d4d},
    { 0,      0x0609, 0x325d},
    { 0x0102, 0x080b, 0x246c},
    { 0,      0x0106, 0x0c7b},
};
31
32
/* VP8 six-tap sub-pixel filter coefficients as signed 16-bit values packed
 * in pairs into 32-bit words (operands for dpa.w.ph), one row per
 * sub-pixel offset 0..7, taps in forward order.  E.g. row 1 packs the
 * taps {0, -6, 123, 12, -1, 0}. */
static const int sub_pel_filters_int[8][3] =
{
    { 0,          0,          0         },
    { 0x0000fffa, 0x007b000c, 0xffff0000},
    { 0x0002fff5, 0x006c0024, 0xfff80001},
    { 0x0000fff7, 0x005d0032, 0xfffa0000},
    { 0x0003fff0, 0x004d004d, 0xfff00003},
    { 0x0000fffa, 0x0032005d, 0xfff70000},
    { 0x0001fff8, 0x0024006c, 0xfff50002},
    { 0x0000ffff, 0x000c007b, 0xfffa0000},
};
44
45
/* Same six-tap coefficients as sub_pel_filters_int, but with the word
 * order reversed and the halfwords within each word swapped, matching the
 * operand layout expected by the dpa.w.ph sequences in the first-pass
 * (horizontal) filter functions below. */
static const int sub_pel_filters_inv[8][3] =
{
    { 0,          0,          0         },
    { 0xfffa0000, 0x000c007b, 0x0000ffff},
    { 0xfff50002, 0x0024006c, 0x0001fff8},
    { 0xfff70000, 0x0032005d, 0x0000fffa},
    { 0xfff00003, 0x004d004d, 0x0003fff0},
    { 0xfffa0000, 0x005d0032, 0x0000fff7},
    { 0xfff80001, 0x006c0024, 0x0002fff5},
    { 0xffff0000, 0x007b000c, 0x0000fffa},
};
57
58
/* Four-tap variants of the packed filters (the outermost taps of these
 * rows are zero, so only two 32-bit coefficient words are needed).  Only
 * the odd-numbered offsets have 4-tap filters; the even rows are unused
 * placeholders. */
static const int sub_pel_filters_int_tap_4[8][2] =
{
    { 0,          0         },
    { 0xfffa007b, 0x000cffff},
    { 0,          0         },
    { 0xfff7005d, 0x0032fffa},
    { 0,          0         },
    { 0xfffa0032, 0x005dfff7},
    { 0,          0         },
    { 0xffff000c, 0x007bfffa},
};
70
71
/* Halfword-swapped / reversed variant of sub_pel_filters_int_tap_4, used
 * by the 4-tap branches of the first-pass filters below.  As above, only
 * odd offsets carry real coefficients. */
static const int sub_pel_filters_inv_tap_4[8][2] =
{
    { 0,          0         },
    { 0x007bfffa, 0xffff000c},
    { 0,          0         },
    { 0x005dfff7, 0xfffa0032},
    { 0,          0         },
    { 0x0032fffa, 0xfff7005d},
    { 0,          0         },
    { 0x000cffff, 0xfffa007b},
};
83
/* Issue a MIPS "pref" with hint 0 (prefetch for load) for the cache line
 * containing src.  Pure performance hint; no architectural side effects. */
inline void prefetch_load(unsigned char *src)
{
    __asm__ __volatile__ (
        "pref   0,  0(%[src])   \n\t"
        :
        : [src] "r" (src)
    );
}
92
93
/* Issue a MIPS "pref" with hint 1 (prefetch for store) for the cache line
 * containing dst.  Pure performance hint; no architectural side effects. */
inline void prefetch_store(unsigned char *dst)
{
    __asm__ __volatile__ (
        "pref   1,  0(%[dst])   \n\t"
        :
        : [dst] "r" (dst)
    );
}
102
dsputil_static_init(void)103 void dsputil_static_init(void)
104 {
105 int i;
106
107 for (i = 0; i < 256; i++) ff_cropTbl[i + CROP_WIDTH] = i;
108
109 for (i = 0; i < CROP_WIDTH; i++)
110 {
111 ff_cropTbl[i] = 0;
112 ff_cropTbl[i + CROP_WIDTH + 256] = 255;
113 }
114 }
115
/* Horizontal (first) pass of VP8 sub-pixel interpolation for 4-pixel-wide
 * blocks, implemented with MIPS DSPR2 SIMD instructions.
 *
 *   src_ptr              unfiltered source pixels
 *   dst_ptr              destination for the horizontally filtered rows
 *   src_pixels_per_line  source stride in bytes
 *   output_height        number of rows to produce
 *   xoffset              horizontal sub-pixel position (0..7); selects the
 *                        filter row in sub_pel_filters_inv[_tap_4]
 *   pitch                destination stride in bytes
 *
 * xoffset == 0 degenerates to a straight 4-byte row copy.  Otherwise the
 * third packed coefficient word distinguishes 6-tap rows (high halfword
 * non-zero, i.e. value > 65536) from 4-tap rows.
 */
void vp8_filter_block2d_first_pass_4
(
    unsigned char *RESTRICT src_ptr,
    unsigned char *RESTRICT dst_ptr,
    unsigned int src_pixels_per_line,
    unsigned int output_height,
    int xoffset,
    int pitch
)
{
    unsigned int i;
    int Temp1, Temp2, Temp3, Temp4;

    unsigned int vector4a = 64;     /* rounding term, preloaded into the DSP accumulator */
    int vector1b, vector2b, vector3b;
    unsigned int tp1, tp2, tn1, tn2;
    unsigned int p1, p2, p3;
    unsigned int n1, n2, n3;
    unsigned char *cm = ff_cropTbl + CROP_WIDTH;    /* clamp-to-[0,255] table */

    vector3b = sub_pel_filters_inv[xoffset][2];

    /* if (xoffset == 0) we don't need any filtering */
    if (vector3b == 0)
    {
        for (i = 0; i < output_height; i++)
        {
            /* prefetch src_ptr data to cache memory */
            prefetch_load(src_ptr + src_pixels_per_line);
            dst_ptr[0] = src_ptr[0];
            dst_ptr[1] = src_ptr[1];
            dst_ptr[2] = src_ptr[2];
            dst_ptr[3] = src_ptr[3];

            /* next row... */
            src_ptr += src_pixels_per_line;
            dst_ptr += 4;
        }
    }
    else
    {
        if (vector3b > 65536)
        {
            /* 6 tap filter */

            vector1b = sub_pel_filters_inv[xoffset][0];
            vector2b = sub_pel_filters_inv[xoffset][1];

            /* prefetch src_ptr data to cache memory */
            prefetch_load(src_ptr + src_pixels_per_line);

            for (i = output_height; i--;)
            {
                /* apply filter with vectors pairs */
                __asm__ __volatile__ (
                    "ulw              %[tp1],      -2(%[src_ptr])             \n\t"
                    "ulw              %[tp2],      2(%[src_ptr])              \n\t"

                    /* even 1. pixel */
                    "mtlo             %[vector4a], $ac3                       \n\t"
                    "preceu.ph.qbr    %[p1],       %[tp1]                     \n\t"
                    "preceu.ph.qbl    %[p2],       %[tp1]                     \n\t"
                    "preceu.ph.qbr    %[p3],       %[tp2]                     \n\t"
                    "dpa.w.ph         $ac3,        %[p1],          %[vector1b] \n\t"
                    "dpa.w.ph         $ac3,        %[p2],          %[vector2b] \n\t"
                    "dpa.w.ph         $ac3,        %[p3],          %[vector3b] \n\t"

                    /* even 2. pixel */
                    "mtlo             %[vector4a], $ac2                       \n\t"
                    "preceu.ph.qbl    %[p1],       %[tp2]                     \n\t"
                    "balign           %[tp2],      %[tp1],         3          \n\t"
                    "extp             %[Temp1],    $ac3,           9          \n\t"
                    "dpa.w.ph         $ac2,        %[p2],          %[vector1b] \n\t"
                    "dpa.w.ph         $ac2,        %[p3],          %[vector2b] \n\t"
                    "dpa.w.ph         $ac2,        %[p1],          %[vector3b] \n\t"

                    /* odd 1. pixel */
                    "ulw              %[tn2],      3(%[src_ptr])              \n\t"
                    "mtlo             %[vector4a], $ac3                       \n\t"
                    "preceu.ph.qbr    %[n1],       %[tp2]                     \n\t"
                    "preceu.ph.qbl    %[n2],       %[tp2]                     \n\t"
                    "preceu.ph.qbr    %[n3],       %[tn2]                     \n\t"
                    "extp             %[Temp3],    $ac2,           9          \n\t"
                    "dpa.w.ph         $ac3,        %[n1],          %[vector1b] \n\t"
                    "dpa.w.ph         $ac3,        %[n2],          %[vector2b] \n\t"
                    "dpa.w.ph         $ac3,        %[n3],          %[vector3b] \n\t"

                    /* odd 2. pixel */
                    "mtlo             %[vector4a], $ac2                       \n\t"
                    "preceu.ph.qbl    %[n1],       %[tn2]                     \n\t"
                    "extp             %[Temp2],    $ac3,           9          \n\t"
                    "dpa.w.ph         $ac2,        %[n2],          %[vector1b] \n\t"
                    "dpa.w.ph         $ac2,        %[n3],          %[vector2b] \n\t"
                    "dpa.w.ph         $ac2,        %[n1],          %[vector3b] \n\t"
                    "extp             %[Temp4],    $ac2,           9          \n\t"

                    /* clamp */
                    "lbux             %[tp1],      %[Temp1](%[cm])            \n\t"
                    "lbux             %[tn1],      %[Temp2](%[cm])            \n\t"
                    "lbux             %[tp2],      %[Temp3](%[cm])            \n\t"
                    "lbux             %[n2],       %[Temp4](%[cm])            \n\t"

                    /* store bytes */
                    "sb               %[tp1],      0(%[dst_ptr])              \n\t"
                    "sb               %[tn1],      1(%[dst_ptr])              \n\t"
                    "sb               %[tp2],      2(%[dst_ptr])              \n\t"
                    "sb               %[n2],       3(%[dst_ptr])              \n\t"

                    : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn1] "=&r" (tn1),
                      [tn2] "=&r" (tn2), [p1] "=&r" (p1), [p2] "=&r" (p2),
                      [p3] "=&r" (p3), [n1] "=&r" (n1), [n2] "=&r" (n2),
                      [n3] "=&r" (n3), [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
                      [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
                    : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
                      [vector4a] "r" (vector4a), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr),
                      [vector3b] "r" (vector3b), [src_ptr] "r" (src_ptr)
                );

                /* Next row... */
                src_ptr += src_pixels_per_line;
                dst_ptr += pitch;
            }
        }
        else
        {
            /* 4 tap filter */

            vector1b = sub_pel_filters_inv_tap_4[xoffset][0];
            vector2b = sub_pel_filters_inv_tap_4[xoffset][1];

            for (i = output_height; i--;)
            {
                /* apply filter with vectors pairs */
                __asm__ __volatile__ (
                    "ulw              %[tp1],      -1(%[src_ptr])             \n\t"
                    "ulw              %[tp2],      3(%[src_ptr])              \n\t"

                    /* even 1. pixel */
                    "mtlo             %[vector4a], $ac3                       \n\t"
                    "preceu.ph.qbr    %[p1],       %[tp1]                     \n\t"
                    "preceu.ph.qbl    %[p2],       %[tp1]                     \n\t"
                    "preceu.ph.qbr    %[p3],       %[tp2]                     \n\t"
                    "dpa.w.ph         $ac3,        %[p1],          %[vector1b] \n\t"
                    "dpa.w.ph         $ac3,        %[p2],          %[vector2b] \n\t"

                    /* even 2. pixel */
                    "mtlo             %[vector4a], $ac2                       \n\t"
                    "dpa.w.ph         $ac2,        %[p2],          %[vector1b] \n\t"
                    "dpa.w.ph         $ac2,        %[p3],          %[vector2b] \n\t"
                    "extp             %[Temp1],    $ac3,           9          \n\t"

                    /* odd 1. pixel */
                    "srl              %[tn1],      %[tp2],         8          \n\t"
                    "balign           %[tp2],      %[tp1],         3          \n\t"
                    "mtlo             %[vector4a], $ac3                       \n\t"
                    "preceu.ph.qbr    %[n1],       %[tp2]                     \n\t"
                    "preceu.ph.qbl    %[n2],       %[tp2]                     \n\t"
                    "preceu.ph.qbr    %[n3],       %[tn1]                     \n\t"
                    "extp             %[Temp3],    $ac2,           9          \n\t"
                    "dpa.w.ph         $ac3,        %[n1],          %[vector1b] \n\t"
                    "dpa.w.ph         $ac3,        %[n2],          %[vector2b] \n\t"

                    /* odd 2. pixel */
                    "mtlo             %[vector4a], $ac2                       \n\t"
                    "extp             %[Temp2],    $ac3,           9          \n\t"
                    "dpa.w.ph         $ac2,        %[n2],          %[vector1b] \n\t"
                    "dpa.w.ph         $ac2,        %[n3],          %[vector2b] \n\t"
                    "extp             %[Temp4],    $ac2,           9          \n\t"

                    /* clamp and store results */
                    "lbux             %[tp1],      %[Temp1](%[cm])            \n\t"
                    "lbux             %[tn1],      %[Temp2](%[cm])            \n\t"
                    "lbux             %[tp2],      %[Temp3](%[cm])            \n\t"
                    "sb               %[tp1],      0(%[dst_ptr])              \n\t"
                    "sb               %[tn1],      1(%[dst_ptr])              \n\t"
                    "lbux             %[n2],       %[Temp4](%[cm])            \n\t"
                    "sb               %[tp2],      2(%[dst_ptr])              \n\t"
                    "sb               %[n2],       3(%[dst_ptr])              \n\t"

                    : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn1] "=&r" (tn1),
                      [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3),
                      [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3),
                      [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
                      [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
                    : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
                      [vector4a] "r" (vector4a), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr),
                      [src_ptr] "r" (src_ptr)
                );
                /* Next row... */
                src_ptr += src_pixels_per_line;
                dst_ptr += pitch;
            }
        }
    }
}
311
/* Horizontal (first) pass of VP8 sub-pixel interpolation for 8-pixel-wide
 * blocks, MIPS DSPR2 version; handles copy (xoffset == 0), 6-tap and
 * 4-tap cases.
 *
 *   src_ptr              unfiltered source pixels
 *   dst_ptr              destination for the horizontally filtered rows
 *   src_pixels_per_line  source stride in bytes
 *   output_height        number of rows to produce
 *   xoffset              horizontal sub-pixel position (0..7)
 *   pitch                destination stride in bytes
 *
 * NOTE(review): each row is computed by two consecutive asm statements;
 * the first one primes $ac3 (mtlo near its end) and the second one
 * continues accumulating into it.  This relies on the compiler not
 * emitting HI/LO-clobbering code between the two asm blocks — confirm
 * this is guaranteed by the build flags in use.
 */
void vp8_filter_block2d_first_pass_8_all
(
    unsigned char *RESTRICT src_ptr,
    unsigned char *RESTRICT dst_ptr,
    unsigned int src_pixels_per_line,
    unsigned int output_height,
    int xoffset,
    int pitch
)
{
    unsigned int i;
    int Temp1, Temp2, Temp3, Temp4;

    unsigned int vector4a = 64;     /* rounding term, preloaded into the DSP accumulator */
    unsigned int vector1b, vector2b, vector3b;
    unsigned int tp1, tp2, tn1, tn2;
    unsigned int p1, p2, p3, p4;
    unsigned int n1, n2, n3, n4;

    unsigned char *cm = ff_cropTbl + CROP_WIDTH;    /* clamp-to-[0,255] table */

    /* if (xoffset == 0) we don't need any filtering */
    if (xoffset == 0)
    {
        for (i = 0; i < output_height; i++)
        {
            /* prefetch src_ptr data to cache memory */
            prefetch_load(src_ptr + src_pixels_per_line);

            dst_ptr[0] = src_ptr[0];
            dst_ptr[1] = src_ptr[1];
            dst_ptr[2] = src_ptr[2];
            dst_ptr[3] = src_ptr[3];
            dst_ptr[4] = src_ptr[4];
            dst_ptr[5] = src_ptr[5];
            dst_ptr[6] = src_ptr[6];
            dst_ptr[7] = src_ptr[7];

            /* next row... */
            src_ptr += src_pixels_per_line;
            dst_ptr += 8;
        }
    }
    else
    {
        vector3b = sub_pel_filters_inv[xoffset][2];

        if (vector3b > 65536)
        {
            /* 6 tap filter */

            vector1b = sub_pel_filters_inv[xoffset][0];
            vector2b = sub_pel_filters_inv[xoffset][1];

            for (i = output_height; i--;)
            {
                /* prefetch src_ptr data to cache memory */
                prefetch_load(src_ptr + src_pixels_per_line);

                /* apply filter with vectors pairs */
                __asm__ __volatile__ (
                    "ulw              %[tp1],      -2(%[src_ptr])             \n\t"
                    "ulw              %[tp2],      2(%[src_ptr])              \n\t"

                    /* even 1. pixel */
                    "mtlo             %[vector4a], $ac3                       \n\t"
                    "preceu.ph.qbr    %[p1],       %[tp1]                     \n\t"
                    "preceu.ph.qbl    %[p2],       %[tp1]                     \n\t"
                    "preceu.ph.qbr    %[p3],       %[tp2]                     \n\t"
                    "dpa.w.ph         $ac3,        %[p1],          %[vector1b] \n\t"
                    "dpa.w.ph         $ac3,        %[p2],          %[vector2b] \n\t"
                    "dpa.w.ph         $ac3,        %[p3],          %[vector3b] \n\t"

                    /* even 2. pixel */
                    "mtlo             %[vector4a], $ac2                       \n\t"
                    "preceu.ph.qbl    %[p1],       %[tp2]                     \n\t"
                    "dpa.w.ph         $ac2,        %[p2],          %[vector1b] \n\t"
                    "dpa.w.ph         $ac2,        %[p3],          %[vector2b] \n\t"
                    "dpa.w.ph         $ac2,        %[p1],          %[vector3b] \n\t"

                    "balign           %[tp2],      %[tp1],         3          \n\t"
                    "extp             %[Temp1],    $ac3,           9          \n\t"
                    "ulw              %[tn2],      3(%[src_ptr])              \n\t"

                    /* odd 1. pixel */
                    "mtlo             %[vector4a], $ac3                       \n\t"
                    "preceu.ph.qbr    %[n1],       %[tp2]                     \n\t"
                    "preceu.ph.qbl    %[n2],       %[tp2]                     \n\t"
                    "preceu.ph.qbr    %[n3],       %[tn2]                     \n\t"
                    "extp             %[Temp3],    $ac2,           9          \n\t"
                    "dpa.w.ph         $ac3,        %[n1],          %[vector1b] \n\t"
                    "dpa.w.ph         $ac3,        %[n2],          %[vector2b] \n\t"
                    "dpa.w.ph         $ac3,        %[n3],          %[vector3b] \n\t"

                    /* odd 2. pixel */
                    "mtlo             %[vector4a], $ac2                       \n\t"
                    "preceu.ph.qbl    %[n1],       %[tn2]                     \n\t"
                    "dpa.w.ph         $ac2,        %[n2],          %[vector1b] \n\t"
                    "dpa.w.ph         $ac2,        %[n3],          %[vector2b] \n\t"
                    "dpa.w.ph         $ac2,        %[n1],          %[vector3b] \n\t"
                    "ulw              %[tp1],      6(%[src_ptr])              \n\t"
                    "extp             %[Temp2],    $ac3,           9          \n\t"
                    "mtlo             %[vector4a], $ac3                       \n\t"
                    "preceu.ph.qbr    %[p2],       %[tp1]                     \n\t"
                    "extp             %[Temp4],    $ac2,           9          \n\t"

                    : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn2] "=&r" (tn2),
                      [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3),
                      [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3),
                      [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
                      [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
                    : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
                      [vector4a] "r" (vector4a), [vector3b] "r" (vector3b),
                      [src_ptr] "r" (src_ptr)
                );

                /* clamp and store results */
                dst_ptr[0] = cm[Temp1];
                dst_ptr[1] = cm[Temp2];
                dst_ptr[2] = cm[Temp3];
                dst_ptr[3] = cm[Temp4];

                /* next 4 pixels; continues accumulating into the $ac3
                 * primed at the end of the previous asm statement */
                __asm__ __volatile__ (
                    /* even 3. pixel */
                    "dpa.w.ph         $ac3,        %[p3],          %[vector1b] \n\t"
                    "dpa.w.ph         $ac3,        %[p1],          %[vector2b] \n\t"
                    "dpa.w.ph         $ac3,        %[p2],          %[vector3b] \n\t"

                    /* even 4. pixel */
                    "mtlo             %[vector4a], $ac2                       \n\t"
                    "preceu.ph.qbl    %[p4],       %[tp1]                     \n\t"
                    "dpa.w.ph         $ac2,        %[p1],          %[vector1b] \n\t"
                    "dpa.w.ph         $ac2,        %[p2],          %[vector2b] \n\t"
                    "dpa.w.ph         $ac2,        %[p4],          %[vector3b] \n\t"

                    "ulw              %[tn1],      7(%[src_ptr])              \n\t"
                    "extp             %[Temp1],    $ac3,           9          \n\t"

                    /* odd 3. pixel */
                    "mtlo             %[vector4a], $ac3                       \n\t"
                    "preceu.ph.qbr    %[n2],       %[tn1]                     \n\t"
                    "dpa.w.ph         $ac3,        %[n3],          %[vector1b] \n\t"
                    "dpa.w.ph         $ac3,        %[n1],          %[vector2b] \n\t"
                    "dpa.w.ph         $ac3,        %[n2],          %[vector3b] \n\t"
                    "extp             %[Temp3],    $ac2,           9          \n\t"

                    /* odd 4. pixel */
                    "mtlo             %[vector4a], $ac2                       \n\t"
                    "preceu.ph.qbl    %[n4],       %[tn1]                     \n\t"
                    "dpa.w.ph         $ac2,        %[n1],          %[vector1b] \n\t"
                    "dpa.w.ph         $ac2,        %[n2],          %[vector2b] \n\t"
                    "dpa.w.ph         $ac2,        %[n4],          %[vector3b] \n\t"
                    "extp             %[Temp2],    $ac3,           9          \n\t"
                    "extp             %[Temp4],    $ac2,           9          \n\t"

                    : [tn1] "=&r" (tn1), [n2] "=&r" (n2),
                      [p4] "=&r" (p4), [n4] "=&r" (n4),
                      [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
                      [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
                    : [tp1] "r" (tp1), [vector1b] "r" (vector1b), [p2] "r" (p2),
                      [vector2b] "r" (vector2b), [n1] "r" (n1), [p1] "r" (p1),
                      [vector4a] "r" (vector4a), [vector3b] "r" (vector3b),
                      [p3] "r" (p3), [n3] "r" (n3), [src_ptr] "r" (src_ptr)
                );

                /* clamp and store results */
                dst_ptr[4] = cm[Temp1];
                dst_ptr[5] = cm[Temp2];
                dst_ptr[6] = cm[Temp3];
                dst_ptr[7] = cm[Temp4];

                src_ptr += src_pixels_per_line;
                dst_ptr += pitch;
            }
        }
        else
        {
            /* 4 tap filter */

            vector1b = sub_pel_filters_inv_tap_4[xoffset][0];
            vector2b = sub_pel_filters_inv_tap_4[xoffset][1];

            for (i = output_height; i--;)
            {
                /* prefetch src_ptr data to cache memory */
                prefetch_load(src_ptr + src_pixels_per_line);

                /* apply filter with vectors pairs */
                __asm__ __volatile__ (
                    "ulw              %[tp1],      -1(%[src_ptr])             \n\t"

                    /* even 1. pixel */
                    "mtlo             %[vector4a], $ac3                       \n\t"
                    "preceu.ph.qbr    %[p1],       %[tp1]                     \n\t"
                    "preceu.ph.qbl    %[p2],       %[tp1]                     \n\t"
                    "dpa.w.ph         $ac3,        %[p1],          %[vector1b] \n\t"
                    "dpa.w.ph         $ac3,        %[p2],          %[vector2b] \n\t"

                    "ulw              %[tp2],      3(%[src_ptr])              \n\t"

                    /* even 2. pixel */
                    "mtlo             %[vector4a], $ac2                       \n\t"
                    "preceu.ph.qbr    %[p3],       %[tp2]                     \n\t"
                    "preceu.ph.qbl    %[p4],       %[tp2]                     \n\t"
                    "dpa.w.ph         $ac2,        %[p2],          %[vector1b] \n\t"
                    "dpa.w.ph         $ac2,        %[p3],          %[vector2b] \n\t"
                    "extp             %[Temp1],    $ac3,           9          \n\t"

                    "balign           %[tp2],      %[tp1],         3          \n\t"

                    /* odd 1. pixel */
                    "mtlo             %[vector4a], $ac3                       \n\t"
                    "preceu.ph.qbr    %[n1],       %[tp2]                     \n\t"
                    "preceu.ph.qbl    %[n2],       %[tp2]                     \n\t"
                    "dpa.w.ph         $ac3,        %[n1],          %[vector1b] \n\t"
                    "dpa.w.ph         $ac3,        %[n2],          %[vector2b] \n\t"
                    "extp             %[Temp3],    $ac2,           9          \n\t"

                    "ulw              %[tn2],      4(%[src_ptr])              \n\t"

                    /* odd 2. pixel */
                    "mtlo             %[vector4a], $ac2                       \n\t"
                    "preceu.ph.qbr    %[n3],       %[tn2]                     \n\t"
                    "preceu.ph.qbl    %[n4],       %[tn2]                     \n\t"
                    "dpa.w.ph         $ac2,        %[n2],          %[vector1b] \n\t"
                    "dpa.w.ph         $ac2,        %[n3],          %[vector2b] \n\t"
                    "ulw              %[tp1],      7(%[src_ptr])              \n\t"
                    "extp             %[Temp2],    $ac3,           9          \n\t"
                    "mtlo             %[vector4a], $ac3                       \n\t"
                    "extp             %[Temp4],    $ac2,           9          \n\t"

                    : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
                      [tn2] "=&r" (tn2), [p1] "=&r" (p1), [p2] "=&r" (p2),
                      [p3] "=&r" (p3), [p4] "=&r" (p4), [n1] "=&r" (n1),
                      [n2] "=&r" (n2), [n3] "=&r" (n3), [n4] "=&r" (n4),
                      [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
                      [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
                    : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
                      [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr)
                );

                /* clamp and store results */
                dst_ptr[0] = cm[Temp1];
                dst_ptr[1] = cm[Temp2];
                dst_ptr[2] = cm[Temp3];
                dst_ptr[3] = cm[Temp4];

                /* next 4 pixels; continues accumulating into the $ac3
                 * primed at the end of the previous asm statement */
                __asm__ __volatile__ (
                    /* even 3. pixel */
                    "dpa.w.ph         $ac3,        %[p3],          %[vector1b] \n\t"
                    "dpa.w.ph         $ac3,        %[p4],          %[vector2b] \n\t"

                    /* even 4. pixel */
                    "mtlo             %[vector4a], $ac2                       \n\t"
                    "preceu.ph.qbr    %[p2],       %[tp1]                     \n\t"
                    "dpa.w.ph         $ac2,        %[p4],          %[vector1b] \n\t"
                    "dpa.w.ph         $ac2,        %[p2],          %[vector2b] \n\t"
                    "extp             %[Temp1],    $ac3,           9          \n\t"

                    /* odd 3. pixel */
                    "mtlo             %[vector4a], $ac3                       \n\t"
                    "dpa.w.ph         $ac3,        %[n3],          %[vector1b] \n\t"
                    "dpa.w.ph         $ac3,        %[n4],          %[vector2b] \n\t"
                    "ulw              %[tn1],      8(%[src_ptr])              \n\t"
                    "extp             %[Temp3],    $ac2,           9          \n\t"

                    /* odd 4. pixel */
                    "mtlo             %[vector4a], $ac2                       \n\t"
                    "preceu.ph.qbr    %[n2],       %[tn1]                     \n\t"
                    "dpa.w.ph         $ac2,        %[n4],          %[vector1b] \n\t"
                    "dpa.w.ph         $ac2,        %[n2],          %[vector2b] \n\t"
                    "extp             %[Temp2],    $ac3,           9          \n\t"
                    "extp             %[Temp4],    $ac2,           9          \n\t"

                    : [tn1] "=&r" (tn1), [p2] "=&r" (p2), [n2] "=&r" (n2),
                      [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
                      [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
                    : [tp1] "r" (tp1), [p3] "r" (p3), [p4] "r" (p4),
                      [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
                      [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr),
                      [n3] "r" (n3), [n4] "r" (n4)
                );

                /* clamp and store results */
                dst_ptr[4] = cm[Temp1];
                dst_ptr[5] = cm[Temp2];
                dst_ptr[6] = cm[Temp3];
                dst_ptr[7] = cm[Temp4];

                /* next row... */
                src_ptr += src_pixels_per_line;
                dst_ptr += pitch;
            }
        }
    }
}
610
611
/* Horizontal (first) pass of VP8 six-tap sub-pixel interpolation for
 * 16-pixel-wide blocks, MIPS DSPR2 version.  Each row is produced by four
 * consecutive asm statements covering 4 output pixels each.
 *
 *   src_ptr              unfiltered source pixels
 *   dst_ptr              destination for the horizontally filtered rows
 *   src_pixels_per_line  source stride in bytes
 *   output_height        number of rows to produce
 *   xoffset              horizontal sub-pixel position (0..7); selects the
 *                        filter row in sub_pel_filters_inv
 *   pitch                destination stride in bytes
 *
 * NOTE(review): as in the 8-wide version, each asm statement primes $ac3
 * for the next one (trailing mtlo), relying on HI/LO surviving between
 * asm blocks — confirm this holds for the build flags in use.
 */
void vp8_filter_block2d_first_pass16_6tap
(
    unsigned char *RESTRICT src_ptr,
    unsigned char *RESTRICT dst_ptr,
    unsigned int src_pixels_per_line,
    unsigned int output_height,
    int xoffset,
    int pitch
)
{
    unsigned int i;
    int Temp1, Temp2, Temp3, Temp4;

    unsigned int vector4a;          /* rounding term, preloaded into the DSP accumulator */
    unsigned int vector1b, vector2b, vector3b;
    unsigned int tp1, tp2, tn1, tn2;
    unsigned int p1, p2, p3, p4;
    unsigned int n1, n2, n3, n4;
    unsigned char *cm = ff_cropTbl + CROP_WIDTH;    /* clamp-to-[0,255] table */

    vector1b = sub_pel_filters_inv[xoffset][0];
    vector2b = sub_pel_filters_inv[xoffset][1];
    vector3b = sub_pel_filters_inv[xoffset][2];
    vector4a = 64;

    for (i = output_height; i--;)
    {
        /* prefetch src_ptr data to cache memory */
        prefetch_load(src_ptr + src_pixels_per_line);

        /* apply filter with vectors pairs */
        __asm__ __volatile__ (
            "ulw              %[tp1],      -2(%[src_ptr])             \n\t"
            "ulw              %[tp2],      2(%[src_ptr])              \n\t"

            /* even 1. pixel */
            "mtlo             %[vector4a], $ac3                       \n\t"
            "preceu.ph.qbr    %[p1],       %[tp1]                     \n\t"
            "preceu.ph.qbl    %[p2],       %[tp1]                     \n\t"
            "preceu.ph.qbr    %[p3],       %[tp2]                     \n\t"
            "dpa.w.ph         $ac3,        %[p1],          %[vector1b] \n\t"
            "dpa.w.ph         $ac3,        %[p2],          %[vector2b] \n\t"
            "dpa.w.ph         $ac3,        %[p3],          %[vector3b] \n\t"

            /* even 2. pixel */
            "mtlo             %[vector4a], $ac2                       \n\t"
            "preceu.ph.qbl    %[p1],       %[tp2]                     \n\t"
            "dpa.w.ph         $ac2,        %[p2],          %[vector1b] \n\t"
            "dpa.w.ph         $ac2,        %[p3],          %[vector2b] \n\t"
            "dpa.w.ph         $ac2,        %[p1],          %[vector3b] \n\t"

            "balign           %[tp2],      %[tp1],         3          \n\t"
            "ulw              %[tn2],      3(%[src_ptr])              \n\t"
            "extp             %[Temp1],    $ac3,           9          \n\t"

            /* odd 1. pixel */
            "mtlo             %[vector4a], $ac3                       \n\t"
            "preceu.ph.qbr    %[n1],       %[tp2]                     \n\t"
            "preceu.ph.qbl    %[n2],       %[tp2]                     \n\t"
            "preceu.ph.qbr    %[n3],       %[tn2]                     \n\t"
            "extp             %[Temp3],    $ac2,           9          \n\t"
            "dpa.w.ph         $ac3,        %[n1],          %[vector1b] \n\t"
            "dpa.w.ph         $ac3,        %[n2],          %[vector2b] \n\t"
            "dpa.w.ph         $ac3,        %[n3],          %[vector3b] \n\t"

            /* odd 2. pixel */
            "mtlo             %[vector4a], $ac2                       \n\t"
            "preceu.ph.qbl    %[n1],       %[tn2]                     \n\t"
            "dpa.w.ph         $ac2,        %[n2],          %[vector1b] \n\t"
            "dpa.w.ph         $ac2,        %[n3],          %[vector2b] \n\t"
            "dpa.w.ph         $ac2,        %[n1],          %[vector3b] \n\t"
            "ulw              %[tp1],      6(%[src_ptr])              \n\t"
            "extp             %[Temp2],    $ac3,           9          \n\t"
            "mtlo             %[vector4a], $ac3                       \n\t"
            "preceu.ph.qbr    %[p2],       %[tp1]                     \n\t"
            "extp             %[Temp4],    $ac2,           9          \n\t"

            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn2] "=&r" (tn2),
              [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3),
              [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3),
              [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
              [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
            : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
              [vector4a] "r" (vector4a), [vector3b] "r" (vector3b),
              [src_ptr] "r" (src_ptr)
        );

        /* clamp and store results */
        dst_ptr[0] = cm[Temp1];
        dst_ptr[1] = cm[Temp2];
        dst_ptr[2] = cm[Temp3];
        dst_ptr[3] = cm[Temp4];

        /* next 4 pixels */
        __asm__ __volatile__ (
            /* even 3. pixel */
            "dpa.w.ph         $ac3,        %[p3],          %[vector1b] \n\t"
            "dpa.w.ph         $ac3,        %[p1],          %[vector2b] \n\t"
            "dpa.w.ph         $ac3,        %[p2],          %[vector3b] \n\t"

            /* even 4. pixel */
            "mtlo             %[vector4a], $ac2                       \n\t"
            "preceu.ph.qbl    %[p4],       %[tp1]                     \n\t"
            "dpa.w.ph         $ac2,        %[p1],          %[vector1b] \n\t"
            "dpa.w.ph         $ac2,        %[p2],          %[vector2b] \n\t"
            "dpa.w.ph         $ac2,        %[p4],          %[vector3b] \n\t"
            "ulw              %[tn1],      7(%[src_ptr])              \n\t"
            "extp             %[Temp1],    $ac3,           9          \n\t"

            /* odd 3. pixel */
            "mtlo             %[vector4a], $ac3                       \n\t"
            "preceu.ph.qbr    %[n2],       %[tn1]                     \n\t"
            "dpa.w.ph         $ac3,        %[n3],          %[vector1b] \n\t"
            "dpa.w.ph         $ac3,        %[n1],          %[vector2b] \n\t"
            "dpa.w.ph         $ac3,        %[n2],          %[vector3b] \n\t"
            "extp             %[Temp3],    $ac2,           9          \n\t"

            /* odd 4. pixel */
            "mtlo             %[vector4a], $ac2                       \n\t"
            "preceu.ph.qbl    %[n4],       %[tn1]                     \n\t"
            "dpa.w.ph         $ac2,        %[n1],          %[vector1b] \n\t"
            "dpa.w.ph         $ac2,        %[n2],          %[vector2b] \n\t"
            "dpa.w.ph         $ac2,        %[n4],          %[vector3b] \n\t"
            "ulw              %[tp2],      10(%[src_ptr])             \n\t"
            "extp             %[Temp2],    $ac3,           9          \n\t"
            "mtlo             %[vector4a], $ac3                       \n\t"
            "preceu.ph.qbr    %[p1],       %[tp2]                     \n\t"
            "extp             %[Temp4],    $ac2,           9          \n\t"

            : [tn1] "=&r" (tn1), [tp2] "=&r" (tp2), [n2] "=&r" (n2),
              [p4] "=&r" (p4), [n4] "=&r" (n4),
              [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
              [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
            : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
              [tp1] "r" (tp1), [n1] "r" (n1), [p1] "r" (p1),
              [vector4a] "r" (vector4a), [p2] "r" (p2), [vector3b] "r" (vector3b),
              [p3] "r" (p3), [n3] "r" (n3), [src_ptr] "r" (src_ptr)
        );

        /* clamp and store results */
        dst_ptr[4] = cm[Temp1];
        dst_ptr[5] = cm[Temp2];
        dst_ptr[6] = cm[Temp3];
        dst_ptr[7] = cm[Temp4];

        /* next 4 pixels */
        __asm__ __volatile__ (
            /* even 5. pixel */
            "dpa.w.ph         $ac3,        %[p2],          %[vector1b] \n\t"
            "dpa.w.ph         $ac3,        %[p4],          %[vector2b] \n\t"
            "dpa.w.ph         $ac3,        %[p1],          %[vector3b] \n\t"

            /* even 6. pixel */
            "mtlo             %[vector4a], $ac2                       \n\t"
            "preceu.ph.qbl    %[p3],       %[tp2]                     \n\t"
            "dpa.w.ph         $ac2,        %[p4],          %[vector1b] \n\t"
            "dpa.w.ph         $ac2,        %[p1],          %[vector2b] \n\t"
            "dpa.w.ph         $ac2,        %[p3],          %[vector3b] \n\t"

            "ulw              %[tn1],      11(%[src_ptr])             \n\t"
            "extp             %[Temp1],    $ac3,           9          \n\t"

            /* odd 5. pixel */
            "mtlo             %[vector4a], $ac3                       \n\t"
            "preceu.ph.qbr    %[n1],       %[tn1]                     \n\t"
            "dpa.w.ph         $ac3,        %[n2],          %[vector1b] \n\t"
            "dpa.w.ph         $ac3,        %[n4],          %[vector2b] \n\t"
            "dpa.w.ph         $ac3,        %[n1],          %[vector3b] \n\t"
            "extp             %[Temp3],    $ac2,           9          \n\t"

            /* odd 6. pixel */
            "mtlo             %[vector4a], $ac2                       \n\t"
            "preceu.ph.qbl    %[n3],       %[tn1]                     \n\t"
            "dpa.w.ph         $ac2,        %[n4],          %[vector1b] \n\t"
            "dpa.w.ph         $ac2,        %[n1],          %[vector2b] \n\t"
            "dpa.w.ph         $ac2,        %[n3],          %[vector3b] \n\t"
            "ulw              %[tp1],      14(%[src_ptr])             \n\t"
            "extp             %[Temp2],    $ac3,           9          \n\t"
            "mtlo             %[vector4a], $ac3                       \n\t"
            "preceu.ph.qbr    %[p4],       %[tp1]                     \n\t"
            "extp             %[Temp4],    $ac2,           9          \n\t"

            : [tn1] "=&r" (tn1), [tp1] "=&r" (tp1),
              [n1] "=&r" (n1), [p3] "=&r" (p3), [n3] "=&r" (n3),
              [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
              [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
            : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
              [tp2] "r" (tp2), [p2] "r" (p2), [n2] "r" (n2),
              [p4] "r" (p4), [n4] "r" (n4), [p1] "r" (p1), [src_ptr] "r" (src_ptr),
              [vector4a] "r" (vector4a), [vector3b] "r" (vector3b)
        );

        /* clamp and store results */
        dst_ptr[8] = cm[Temp1];
        dst_ptr[9] = cm[Temp2];
        dst_ptr[10] = cm[Temp3];
        dst_ptr[11] = cm[Temp4];

        /* next 4 pixels */
        __asm__ __volatile__ (
            /* even 7. pixel */
            "dpa.w.ph         $ac3,        %[p1],          %[vector1b] \n\t"
            "dpa.w.ph         $ac3,        %[p3],          %[vector2b] \n\t"
            "dpa.w.ph         $ac3,        %[p4],          %[vector3b] \n\t"

            /* even 8. pixel */
            "mtlo             %[vector4a], $ac2                       \n\t"
            "preceu.ph.qbl    %[p2],       %[tp1]                     \n\t"
            "dpa.w.ph         $ac2,        %[p3],          %[vector1b] \n\t"
            "dpa.w.ph         $ac2,        %[p4],          %[vector2b] \n\t"
            "dpa.w.ph         $ac2,        %[p2],          %[vector3b] \n\t"
            "ulw              %[tn1],      15(%[src_ptr])             \n\t"
            "extp             %[Temp1],    $ac3,           9          \n\t"

            /* odd 7. pixel */
            "mtlo             %[vector4a], $ac3                       \n\t"
            "preceu.ph.qbr    %[n4],       %[tn1]                     \n\t"
            "dpa.w.ph         $ac3,        %[n1],          %[vector1b] \n\t"
            "dpa.w.ph         $ac3,        %[n3],          %[vector2b] \n\t"
            "dpa.w.ph         $ac3,        %[n4],          %[vector3b] \n\t"
            "extp             %[Temp3],    $ac2,           9          \n\t"

            /* odd 8. pixel */
            "mtlo             %[vector4a], $ac2                       \n\t"
            "preceu.ph.qbl    %[n2],       %[tn1]                     \n\t"
            "dpa.w.ph         $ac2,        %[n3],          %[vector1b] \n\t"
            "dpa.w.ph         $ac2,        %[n4],          %[vector2b] \n\t"
            "dpa.w.ph         $ac2,        %[n2],          %[vector3b] \n\t"
            "extp             %[Temp2],    $ac3,           9          \n\t"
            "extp             %[Temp4],    $ac2,           9          \n\t"

            /* clamp and store results */
            "lbux             %[tp1],      %[Temp1](%[cm])            \n\t"
            "lbux             %[tn1],      %[Temp2](%[cm])            \n\t"
            "lbux             %[p2],       %[Temp3](%[cm])            \n\t"
            "sb               %[tp1],      12(%[dst_ptr])             \n\t"
            "sb               %[tn1],      13(%[dst_ptr])             \n\t"
            "lbux             %[n2],       %[Temp4](%[cm])            \n\t"
            "sb               %[p2],       14(%[dst_ptr])             \n\t"
            "sb               %[n2],       15(%[dst_ptr])             \n\t"

            : [tn1] "=&r" (tn1), [p2] "=&r" (p2), [n2] "=&r" (n2), [n4] "=&r" (n4),
              [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
              [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
            : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
              [tp1] "r" (tp1), [p4] "r" (p4), [n1] "r" (n1), [p1] "r" (p1),
              [vector4a] "r" (vector4a), [vector3b] "r" (vector3b), [p3] "r" (p3),
              [n3] "r" (n3), [src_ptr] "r" (src_ptr),
              [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
        );

        src_ptr += src_pixels_per_line;
        dst_ptr += pitch;
    }
}
867
868
/* First-pass "filter" for the xoffset == 0 case of a 16-wide block: no
 * filtering is needed, so this copies 21 rows (7 iterations x 3 rows) of
 * 16 bytes each from src_ptr into the intermediate buffer, which has a
 * fixed pitch of 16 bytes (output_ptr advances 48 bytes per iteration).
 *
 *   src_ptr              source pixels
 *   output_ptr           intermediate buffer for the second pass
 *   src_pixels_per_line  source stride in bytes
 */
void vp8_filter_block2d_first_pass16_0
(
    unsigned char *RESTRICT src_ptr,
    unsigned char *RESTRICT output_ptr,
    unsigned int src_pixels_per_line
)
{
    int Temp1, Temp2, Temp3, Temp4;
    int i;

    /* prefetch the destination buffer for store */
    prefetch_store(output_ptr + 32);

    /* copy memory from src buffer to dst buffer */
    for (i = 0; i < 7; i++)
    {
        /* row 0 of this iteration: unaligned 16-byte load, aligned store */
        __asm__ __volatile__ (
            "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
            "ulw    %[Temp2],   4(%[src_ptr])                               \n\t"
            "ulw    %[Temp3],   8(%[src_ptr])                               \n\t"
            "ulw    %[Temp4],   12(%[src_ptr])                              \n\t"
            "sw     %[Temp1],   0(%[output_ptr])                            \n\t"
            "sw     %[Temp2],   4(%[output_ptr])                            \n\t"
            "sw     %[Temp3],   8(%[output_ptr])                            \n\t"
            "sw     %[Temp4],   12(%[output_ptr])                           \n\t"
            "addu   %[src_ptr], %[src_ptr],         %[src_pixels_per_line]  \n\t"

            : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
              [Temp4] "=&r" (Temp4), [src_ptr] "+r" (src_ptr)
            : [src_pixels_per_line] "r" (src_pixels_per_line),
              [output_ptr] "r" (output_ptr)
        );

        /* row 1 */
        __asm__ __volatile__ (
            "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
            "ulw    %[Temp2],   4(%[src_ptr])                               \n\t"
            "ulw    %[Temp3],   8(%[src_ptr])                               \n\t"
            "ulw    %[Temp4],   12(%[src_ptr])                              \n\t"
            "sw     %[Temp1],   16(%[output_ptr])                           \n\t"
            "sw     %[Temp2],   20(%[output_ptr])                           \n\t"
            "sw     %[Temp3],   24(%[output_ptr])                           \n\t"
            "sw     %[Temp4],   28(%[output_ptr])                           \n\t"
            "addu   %[src_ptr], %[src_ptr],         %[src_pixels_per_line]  \n\t"

            : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
              [Temp4] "=&r" (Temp4), [src_ptr] "+r" (src_ptr)
            : [src_pixels_per_line] "r" (src_pixels_per_line),
              [output_ptr] "r" (output_ptr)
        );

        /* row 2 */
        __asm__ __volatile__ (
            "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
            "ulw    %[Temp2],   4(%[src_ptr])                               \n\t"
            "ulw    %[Temp3],   8(%[src_ptr])                               \n\t"
            "ulw    %[Temp4],   12(%[src_ptr])                              \n\t"
            "sw     %[Temp1],   32(%[output_ptr])                           \n\t"
            "sw     %[Temp2],   36(%[output_ptr])                           \n\t"
            "sw     %[Temp3],   40(%[output_ptr])                           \n\t"
            "sw     %[Temp4],   44(%[output_ptr])                           \n\t"
            "addu   %[src_ptr], %[src_ptr],         %[src_pixels_per_line]  \n\t"

            : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
              [Temp4] "=&r" (Temp4), [src_ptr] "+r" (src_ptr)
            : [src_pixels_per_line] "r" (src_pixels_per_line),
              [output_ptr] "r" (output_ptr)
        );

        output_ptr += 48;
    }
}
939
940
vp8_filter_block2d_first_pass16_4tap(unsigned char * RESTRICT src_ptr,unsigned char * RESTRICT output_ptr,unsigned int src_pixels_per_line,unsigned int output_width,unsigned int output_height,int xoffset,int yoffset,unsigned char * RESTRICT dst_ptr,int pitch)941 void vp8_filter_block2d_first_pass16_4tap
942 (
943 unsigned char *RESTRICT src_ptr,
944 unsigned char *RESTRICT output_ptr,
945 unsigned int src_pixels_per_line,
946 unsigned int output_width,
947 unsigned int output_height,
948 int xoffset,
949 int yoffset,
950 unsigned char *RESTRICT dst_ptr,
951 int pitch
952 )
953 {
954 unsigned int i, j;
955 int Temp1, Temp2, Temp3, Temp4;
956
957 unsigned int vector4a;
958 int vector1b, vector2b;
959 unsigned int tp1, tp2, tp3, tn1;
960 unsigned int p1, p2, p3;
961 unsigned int n1, n2, n3;
962 unsigned char *cm = ff_cropTbl + CROP_WIDTH;
963
964 vector4a = 64;
965
966 vector1b = sub_pel_filters_inv_tap_4[xoffset][0];
967 vector2b = sub_pel_filters_inv_tap_4[xoffset][1];
968
969 /* if (yoffset == 0) don't need temp buffer, data will be stored in dst_ptr */
970 if (yoffset == 0)
971 {
972 output_height -= 5;
973 src_ptr += (src_pixels_per_line + src_pixels_per_line);
974
975 for (i = output_height; i--;)
976 {
977 __asm__ __volatile__ (
978 "ulw %[tp3], -1(%[src_ptr]) \n\t"
979 : [tp3] "=&r" (tp3)
980 : [src_ptr] "r" (src_ptr)
981 );
982
983 /* processing 4 adjacent pixels */
984 for (j = 0; j < 16; j += 4)
985 {
986 /* apply filter with vectors pairs */
987 __asm__ __volatile__ (
988 "ulw %[tp2], 3(%[src_ptr]) \n\t"
989 "move %[tp1], %[tp3] \n\t"
990
991 /* even 1. pixel */
992 "mtlo %[vector4a], $ac3 \n\t"
993 "mthi $0, $ac3 \n\t"
994 "move %[tp3], %[tp2] \n\t"
995 "preceu.ph.qbr %[p1], %[tp1] \n\t"
996 "preceu.ph.qbl %[p2], %[tp1] \n\t"
997 "preceu.ph.qbr %[p3], %[tp2] \n\t"
998 "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
999 "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
1000
1001 /* even 2. pixel */
1002 "mtlo %[vector4a], $ac2 \n\t"
1003 "mthi $0, $ac2 \n\t"
1004 "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
1005 "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
1006 "extr.w %[Temp1], $ac3, 7 \n\t"
1007
1008 /* odd 1. pixel */
1009 "ulw %[tn1], 4(%[src_ptr]) \n\t"
1010 "balign %[tp2], %[tp1], 3 \n\t"
1011 "mtlo %[vector4a], $ac3 \n\t"
1012 "mthi $0, $ac3 \n\t"
1013 "preceu.ph.qbr %[n1], %[tp2] \n\t"
1014 "preceu.ph.qbl %[n2], %[tp2] \n\t"
1015 "preceu.ph.qbr %[n3], %[tn1] \n\t"
1016 "extr.w %[Temp3], $ac2, 7 \n\t"
1017 "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
1018 "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
1019
1020 /* odd 2. pixel */
1021 "mtlo %[vector4a], $ac2 \n\t"
1022 "mthi $0, $ac2 \n\t"
1023 "extr.w %[Temp2], $ac3, 7 \n\t"
1024 "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t"
1025 "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t"
1026 "extr.w %[Temp4], $ac2, 7 \n\t"
1027
1028 /* clamp and store results */
1029 "lbux %[tp1], %[Temp1](%[cm]) \n\t"
1030 "lbux %[tn1], %[Temp2](%[cm]) \n\t"
1031 "lbux %[tp2], %[Temp3](%[cm]) \n\t"
1032 "sb %[tp1], 0(%[dst_ptr]) \n\t"
1033 "sb %[tn1], 1(%[dst_ptr]) \n\t"
1034 "lbux %[n2], %[Temp4](%[cm]) \n\t"
1035 "sb %[tp2], 2(%[dst_ptr]) \n\t"
1036 "sb %[n2], 3(%[dst_ptr]) \n\t"
1037
1038 : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3),
1039 [tn1] "=&r" (tn1), [p1] "=&r" (p1), [p2] "=&r" (p2),
1040 [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3),
1041 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [p3] "=&r" (p3),
1042 [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
1043 : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
1044 [vector4a] "r" (vector4a), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr),
1045 [src_ptr] "r" (src_ptr)
1046 );
1047
1048 src_ptr += 4;
1049 }
1050
1051 /* Next row... */
1052 src_ptr += src_pixels_per_line - 16;
1053 dst_ptr += pitch;
1054 }
1055 }
1056 else
1057 {
1058 for (i = output_height; i--;)
1059 {
1060 /* processing 4 adjacent pixels */
1061 for (j = 0; j < 16; j += 4)
1062 {
1063 /* apply filter with vectors pairs */
1064 __asm__ __volatile__ (
1065 "ulw %[tp1], -1(%[src_ptr]) \n\t"
1066 "ulw %[tp2], 3(%[src_ptr]) \n\t"
1067
1068 /* even 1. pixel */
1069 "mtlo %[vector4a], $ac3 \n\t"
1070 "mthi $0, $ac3 \n\t"
1071 "preceu.ph.qbr %[p1], %[tp1] \n\t"
1072 "preceu.ph.qbl %[p2], %[tp1] \n\t"
1073 "preceu.ph.qbr %[p3], %[tp2] \n\t"
1074 "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
1075 "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
1076
1077 /* even 2. pixel */
1078 "mtlo %[vector4a], $ac2 \n\t"
1079 "mthi $0, $ac2 \n\t"
1080 "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
1081 "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
1082 "extr.w %[Temp1], $ac3, 7 \n\t"
1083
1084 /* odd 1. pixel */
1085 "ulw %[tn1], 4(%[src_ptr]) \n\t"
1086 "balign %[tp2], %[tp1], 3 \n\t"
1087 "mtlo %[vector4a], $ac3 \n\t"
1088 "mthi $0, $ac3 \n\t"
1089 "preceu.ph.qbr %[n1], %[tp2] \n\t"
1090 "preceu.ph.qbl %[n2], %[tp2] \n\t"
1091 "preceu.ph.qbr %[n3], %[tn1] \n\t"
1092 "extr.w %[Temp3], $ac2, 7 \n\t"
1093 "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
1094 "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
1095
1096 /* odd 2. pixel */
1097 "mtlo %[vector4a], $ac2 \n\t"
1098 "mthi $0, $ac2 \n\t"
1099 "extr.w %[Temp2], $ac3, 7 \n\t"
1100 "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t"
1101 "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t"
1102 "extr.w %[Temp4], $ac2, 7 \n\t"
1103
1104 /* clamp and store results */
1105 "lbux %[tp1], %[Temp1](%[cm]) \n\t"
1106 "lbux %[tn1], %[Temp2](%[cm]) \n\t"
1107 "lbux %[tp2], %[Temp3](%[cm]) \n\t"
1108 "sb %[tp1], 0(%[output_ptr]) \n\t"
1109 "sb %[tn1], 1(%[output_ptr]) \n\t"
1110 "lbux %[n2], %[Temp4](%[cm]) \n\t"
1111 "sb %[tp2], 2(%[output_ptr]) \n\t"
1112 "sb %[n2], 3(%[output_ptr]) \n\t"
1113
1114 : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn1] "=&r" (tn1),
1115 [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3),
1116 [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3),
1117 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
1118 [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
1119 : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
1120 [vector4a] "r" (vector4a), [cm] "r" (cm),
1121 [output_ptr] "r" (output_ptr), [src_ptr] "r" (src_ptr)
1122 );
1123
1124 src_ptr += 4;
1125 }
1126
1127 /* next row... */
1128 src_ptr += src_pixels_per_line;
1129 output_ptr += output_width;
1130 }
1131 }
1132 }
1133
1134
/* Vertical (second) pass of the VP8 sub-pel interpolation filter for a
 * 4-pixel-wide block, MIPS DSPR2 version.
 *
 * src_ptr points into the intermediate buffer written by the horizontal
 * first pass; that buffer is 4 bytes wide, so vertically adjacent taps are
 * 4 bytes apart (hence the -8/-4/0/4/8/12 load offsets below).
 * output_ptr receives 4 clamped bytes per row, rows separated by
 * output_pitch.  yoffset selects the filter from sub_pel_filterss.
 *
 * Coefficients are packed two-per-halfword in sub_pel_filterss:
 *   [0] = pair of outer positive taps (zero => filter is only 4 taps),
 *   [2] = pair of center taps,
 *   [1] = magnitudes of the two negative taps (applied via dpsu, i.e.
 *         subtracted from the accumulator).
 */
void vp8_filter_block2d_second_pass4
(
    unsigned char *RESTRICT src_ptr,
    unsigned char *RESTRICT output_ptr,
    int output_pitch,
    int yoffset
)
{
    unsigned int i;

    int Temp1, Temp2, Temp3, Temp4;
    unsigned int vector1b, vector2b, vector3b, vector4a;

    /* one byte per vertical tap; pairs are packed into one register with
     * "append" so each dpau/dpsu multiplies two taps at once */
    unsigned char src_ptr_l2;
    unsigned char src_ptr_l1;
    unsigned char src_ptr_0;
    unsigned char src_ptr_r1;
    unsigned char src_ptr_r2;
    unsigned char src_ptr_r3;

    /* clamp table: cm sits CROP_WIDTH into ff_cropTbl so that negative or
     * >255 filter sums index into the saturated guard regions */
    unsigned char *cm = ff_cropTbl + CROP_WIDTH;

    vector4a = 64;  /* rounding bias, preloaded into each accumulator's lo */

    /* load filter coefficients (packed pairs, see header comment) */
    vector1b = sub_pel_filterss[yoffset][0];
    vector2b = sub_pel_filterss[yoffset][2];
    vector3b = sub_pel_filterss[yoffset][1];

    if (vector1b)
    {
        /* 6 tap filter */

        /* two iterations, two output rows per iteration (src_ptr advances
         * by 8 = two 4-byte rows of the intermediate buffer each time) */
        for (i = 2; i--;)
        {
            /* prefetch src_ptr data to cache memory */
            prefetch_load(src_ptr);

            /* do not allow compiler to reorder instructions */
            __asm__ __volatile__ (
                ".set noreorder \n\t"
                :
                :
            );

            /* first output row: four pixels, one DSP accumulator each
             * ($ac2/$ac3/$ac0/$ac1).  Each extp for pixel N is issued
             * while pixel N+1's loads are in flight to hide latency.
             * NOTE(review): extp's extract position comes from the DSP
             * control "pos" field; together with the 64 bias this yields
             * the rounded, downshifted filter sum — assumed configured
             * upstream, confirm against the toolchain/platform init. */
            /* apply filter with vectors pairs */
            __asm__ __volatile__ (
                "lbu %[src_ptr_l2], -8(%[src_ptr]) \n\t"
                "lbu %[src_ptr_l1], -4(%[src_ptr]) \n\t"
                "lbu %[src_ptr_0], 0(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r1], 4(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r2], 8(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r3], 12(%[src_ptr]) \n\t"
                "mtlo %[vector4a], $ac2 \n\t"

                "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t"
                "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"

                "lbu %[src_ptr_l2], -7(%[src_ptr]) \n\t"
                "lbu %[src_ptr_l1], -3(%[src_ptr]) \n\t"
                "lbu %[src_ptr_0], 1(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r1], 5(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r2], 9(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r3], 13(%[src_ptr]) \n\t"
                "mtlo %[vector4a], $ac3 \n\t"
                "extp %[Temp1], $ac2, 9 \n\t"

                "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t"
                "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"

                "lbu %[src_ptr_l2], -6(%[src_ptr]) \n\t"
                "lbu %[src_ptr_l1], -2(%[src_ptr]) \n\t"
                "lbu %[src_ptr_0], 2(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r1], 6(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r2], 10(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r3], 14(%[src_ptr]) \n\t"
                "mtlo %[vector4a], $ac0 \n\t"
                "extp %[Temp2], $ac3, 9 \n\t"

                "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t"
                "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"

                "lbu %[src_ptr_l2], -5(%[src_ptr]) \n\t"
                "lbu %[src_ptr_l1], -1(%[src_ptr]) \n\t"
                "lbu %[src_ptr_0], 3(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r1], 7(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r2], 11(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r3], 15(%[src_ptr]) \n\t"
                "mtlo %[vector4a], $ac1 \n\t"
                "extp %[Temp3], $ac0, 9 \n\t"

                "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t"
                "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
                "extp %[Temp4], $ac1, 9 \n\t"

                /* Temp4 is plain "=r" (not early-clobber): it is only
                 * written by the final instruction, after every input has
                 * been consumed, so sharing a register with an input is
                 * safe here. */
                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
                  [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4),
                  [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
                  [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2),
                  [src_ptr_l2] "=&r" (src_ptr_l2), [src_ptr_r3] "=&r" (src_ptr_r3)
                : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
                  [vector3b] "r" (vector3b), [vector4a] "r" (vector4a),
                  [src_ptr] "r" (src_ptr)
            );

            /* clamp and store results */
            output_ptr[0] = cm[Temp1];
            output_ptr[1] = cm[Temp2];
            output_ptr[2] = cm[Temp3];
            output_ptr[3] = cm[Temp4];

            output_ptr += output_pitch;

            /* second output row: same pattern with all tap offsets shifted
             * down one intermediate-buffer row (+4 bytes) */
            /* apply filter with vectors pairs */
            __asm__ __volatile__ (
                "lbu %[src_ptr_l2], -4(%[src_ptr]) \n\t"
                "lbu %[src_ptr_l1], 0(%[src_ptr]) \n\t"
                "lbu %[src_ptr_0], 4(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r1], 8(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r2], 12(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r3], 16(%[src_ptr]) \n\t"
                "mtlo %[vector4a], $ac2 \n\t"
                "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t"
                "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"

                "lbu %[src_ptr_l2], -3(%[src_ptr]) \n\t"
                "lbu %[src_ptr_l1], 1(%[src_ptr]) \n\t"
                "lbu %[src_ptr_0], 5(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r1], 9(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r2], 13(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r3], 17(%[src_ptr]) \n\t"
                "mtlo %[vector4a], $ac3 \n\t"
                "extp %[Temp1], $ac2, 9 \n\t"

                "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t"
                "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"

                "lbu %[src_ptr_l2], -2(%[src_ptr]) \n\t"
                "lbu %[src_ptr_l1], 2(%[src_ptr]) \n\t"
                "lbu %[src_ptr_0], 6(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r1], 10(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r2], 14(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r3], 18(%[src_ptr]) \n\t"
                "mtlo %[vector4a], $ac0 \n\t"
                "extp %[Temp2], $ac3, 9 \n\t"

                "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t"
                "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"

                "lbu %[src_ptr_l2], -1(%[src_ptr]) \n\t"
                "lbu %[src_ptr_l1], 3(%[src_ptr]) \n\t"
                "lbu %[src_ptr_0], 7(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r1], 11(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r2], 15(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r3], 19(%[src_ptr]) \n\t"
                "mtlo %[vector4a], $ac1 \n\t"
                "extp %[Temp3], $ac0, 9 \n\t"

                "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t"
                "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
                "extp %[Temp4], $ac1, 9 \n\t"

                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
                  [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4),
                  [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
                  [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2),
                  [src_ptr_l2] "=&r" (src_ptr_l2), [src_ptr_r3] "=&r" (src_ptr_r3)
                : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
                  [vector3b] "r" (vector3b), [vector4a] "r" (vector4a),
                  [src_ptr] "r" (src_ptr)
            );

            /* clamp and store results */
            output_ptr[0] = cm[Temp1];
            output_ptr[1] = cm[Temp2];
            output_ptr[2] = cm[Temp3];
            output_ptr[3] = cm[Temp4];

            src_ptr += 8;   /* advance two 4-byte intermediate rows */
            output_ptr += output_pitch;
        }
    }
    else
    {
        /* 4 tap filter: outer taps are zero, so vector1b/dpau on them is
         * skipped entirely and only center (vector2b) and negative
         * (vector3b) tap pairs are applied */

        /* prefetch src_ptr data to cache memory */
        prefetch_load(src_ptr);

        for (i = 2; i--;)
        {
            /* do not allow compiler to reorder instructions */
            __asm__ __volatile__ (
                ".set noreorder \n\t"
                :
                :
            );

            /* first output row of this iteration (taps at -4/0/4/8) */
            /* apply filter with vectors pairs */
            __asm__ __volatile__ (
                "lbu %[src_ptr_l1], -4(%[src_ptr]) \n\t"
                "lbu %[src_ptr_0], 0(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r1], 4(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r2], 8(%[src_ptr]) \n\t"
                "mtlo %[vector4a], $ac2 \n\t"
                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"

                "lbu %[src_ptr_l1], -3(%[src_ptr]) \n\t"
                "lbu %[src_ptr_0], 1(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r1], 5(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r2], 9(%[src_ptr]) \n\t"
                "mtlo %[vector4a], $ac3 \n\t"
                "extp %[Temp1], $ac2, 9 \n\t"

                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"

                "lbu %[src_ptr_l1], -2(%[src_ptr]) \n\t"
                "lbu %[src_ptr_0], 2(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r1], 6(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r2], 10(%[src_ptr]) \n\t"
                "mtlo %[vector4a], $ac0 \n\t"
                "extp %[Temp2], $ac3, 9 \n\t"

                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"

                "lbu %[src_ptr_l1], -1(%[src_ptr]) \n\t"
                "lbu %[src_ptr_0], 3(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r1], 7(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r2], 11(%[src_ptr]) \n\t"
                "mtlo %[vector4a], $ac1 \n\t"
                "extp %[Temp3], $ac0, 9 \n\t"
                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
                "extp %[Temp4], $ac1, 9 \n\t"

                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
                  [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4),
                  [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
                  [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2)
                : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
                  [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr)
            );

            /* clamp and store results */
            output_ptr[0] = cm[Temp1];
            output_ptr[1] = cm[Temp2];
            output_ptr[2] = cm[Temp3];
            output_ptr[3] = cm[Temp4];

            output_ptr += output_pitch;

            /* second output row (taps shifted one row: 0/4/8/12) */
            /* apply filter with vectors pairs */
            __asm__ __volatile__ (
                "lbu %[src_ptr_l1], 0(%[src_ptr]) \n\t"
                "lbu %[src_ptr_0], 4(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r1], 8(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r2], 12(%[src_ptr]) \n\t"
                "mtlo %[vector4a], $ac2 \n\t"
                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"

                "lbu %[src_ptr_l1], 1(%[src_ptr]) \n\t"
                "lbu %[src_ptr_0], 5(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r1], 9(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r2], 13(%[src_ptr]) \n\t"
                "mtlo %[vector4a], $ac3 \n\t"
                "extp %[Temp1], $ac2, 9 \n\t"

                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"

                "lbu %[src_ptr_l1], 2(%[src_ptr]) \n\t"
                "lbu %[src_ptr_0], 6(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r1], 10(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r2], 14(%[src_ptr]) \n\t"
                "mtlo %[vector4a], $ac0 \n\t"
                "extp %[Temp2], $ac3, 9 \n\t"

                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"

                "lbu %[src_ptr_l1], 3(%[src_ptr]) \n\t"
                "lbu %[src_ptr_0], 7(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r1], 11(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r2], 15(%[src_ptr]) \n\t"
                "mtlo %[vector4a], $ac1 \n\t"
                "extp %[Temp3], $ac0, 9 \n\t"
                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
                "extp %[Temp4], $ac1, 9 \n\t"

                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
                  [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4),
                  [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
                  [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2)
                : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
                  [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr)
            );

            /* clamp and store results */
            output_ptr[0] = cm[Temp1];
            output_ptr[1] = cm[Temp2];
            output_ptr[2] = cm[Temp3];
            output_ptr[3] = cm[Temp4];

            src_ptr += 8;   /* advance two 4-byte intermediate rows */
            output_ptr += output_pitch;
        }
    }
}
1496
1497
/* Vertical (second) pass of the VP8 sub-pel interpolation filter for an
 * 8-pixel-wide block, MIPS DSPR2 version.
 *
 * src_ptr points into the intermediate buffer from the horizontal first
 * pass; that buffer is 8 bytes wide, so vertically adjacent taps are
 * 8 bytes apart (load offsets -16/-8/0/8/16/24 below).  Each loop
 * iteration produces one 8-byte output row, clamped through cm and
 * written with stride output_pitch.
 *
 * yoffset selects the packed coefficient triple from sub_pel_filterss:
 *   [0] = outer positive tap pair (zero => 4-tap filter),
 *   [2] = center tap pair, [1] = negative tap magnitudes (dpsu).
 *
 * NOTE(review): output_width is not referenced anywhere in this body —
 * it appears to exist only for signature parity with sibling functions.
 */
void vp8_filter_block2d_second_pass_8
(
    unsigned char *RESTRICT src_ptr,
    unsigned char *RESTRICT output_ptr,
    int output_pitch,
    unsigned int output_height,
    unsigned int output_width,
    unsigned int yoffset
)
{
    unsigned int i;

    int Temp1, Temp2, Temp3, Temp4, Temp5, Temp6, Temp7, Temp8;
    unsigned int vector1b, vector2b, vector3b, vector4a;

    /* one byte per vertical tap; pairs packed via "append" so each
     * dpau/dpsu handles two taps at once */
    unsigned char src_ptr_l2;
    unsigned char src_ptr_l1;
    unsigned char src_ptr_0;
    unsigned char src_ptr_r1;
    unsigned char src_ptr_r2;
    unsigned char src_ptr_r3;
    /* clamp table offset so out-of-range sums index the saturated guards */
    unsigned char *cm = ff_cropTbl + CROP_WIDTH;

    vector4a = 64;  /* rounding bias, preloaded into each accumulator's lo */

    vector1b = sub_pel_filterss[yoffset][0];
    vector2b = sub_pel_filterss[yoffset][2];
    vector3b = sub_pel_filterss[yoffset][1];

    if (vector1b)
    {
        /* 6 tap filter */

        /* prefetch src_ptr data to cache memory */
        prefetch_load(src_ptr);

        for (i = output_height; i--;)
        {
            /* pixels 0-3 of the row, one DSP accumulator each
             * ($ac2/$ac3/$ac0/$ac1); pixel 3's sum is deliberately left
             * pending in $ac1 and extracted at the start of the next asm
             * block — the two asm statements form one pipeline and must
             * stay adjacent.
             * NOTE(review): extp's extract position comes from the DSP
             * control "pos" field, assumed configured upstream — with the
             * 64 bias it yields the rounded, downshifted filter sum. */
            /* apply filter with vectors pairs */
            __asm__ __volatile__ (
                "lbu %[src_ptr_l2], -16(%[src_ptr]) \n\t"
                "lbu %[src_ptr_l1], -8(%[src_ptr]) \n\t"
                "lbu %[src_ptr_0], 0(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r1], 8(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r2], 16(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r3], 24(%[src_ptr]) \n\t"
                "mtlo %[vector4a], $ac2 \n\t"

                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
                "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t"
                "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"

                "lbu %[src_ptr_l2], -15(%[src_ptr]) \n\t"
                "lbu %[src_ptr_l1], -7(%[src_ptr]) \n\t"
                "lbu %[src_ptr_0], 1(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r1], 9(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r2], 17(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r3], 25(%[src_ptr]) \n\t"
                "mtlo %[vector4a], $ac3 \n\t"
                "extp %[Temp1], $ac2, 9 \n\t"

                "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t"
                "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"

                "lbu %[src_ptr_l2], -14(%[src_ptr]) \n\t"
                "lbu %[src_ptr_l1], -6(%[src_ptr]) \n\t"
                "lbu %[src_ptr_0], 2(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r1], 10(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r2], 18(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r3], 26(%[src_ptr]) \n\t"
                "mtlo %[vector4a], $ac0 \n\t"
                "extp %[Temp2], $ac3, 9 \n\t"

                "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t"
                "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"

                "lbu %[src_ptr_l2], -13(%[src_ptr]) \n\t"
                "lbu %[src_ptr_l1], -5(%[src_ptr]) \n\t"
                "lbu %[src_ptr_0], 3(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r1], 11(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r2], 19(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r3], 27(%[src_ptr]) \n\t"
                "mtlo %[vector4a], $ac1 \n\t"
                "extp %[Temp3], $ac0, 9 \n\t"

                "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t"
                "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"

                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
                  [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
                  [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2),
                  [src_ptr_l2] "=&r" (src_ptr_l2), [src_ptr_r3] "=&r" (src_ptr_r3)
                : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
                  [vector3b] "r" (vector3b), [vector4a] "r" (vector4a),
                  [src_ptr] "r" (src_ptr)
            );

            /* pixels 4-7; first extracts pixel 3's pending $ac1 result
             * (Temp4) left over from the previous asm statement */
            /* apply filter with vectors pairs */
            __asm__ __volatile__ (
                "lbu %[src_ptr_l2], -12(%[src_ptr]) \n\t"
                "lbu %[src_ptr_l1], -4(%[src_ptr]) \n\t"
                "lbu %[src_ptr_0], 4(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r1], 12(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r2], 20(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r3], 28(%[src_ptr]) \n\t"
                "mtlo %[vector4a], $ac2 \n\t"

                "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t"
                "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"
                "extp %[Temp4], $ac1, 9 \n\t"

                "lbu %[src_ptr_l2], -11(%[src_ptr]) \n\t"
                "lbu %[src_ptr_l1], -3(%[src_ptr]) \n\t"
                "lbu %[src_ptr_0], 5(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r1], 13(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r2], 21(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r3], 29(%[src_ptr]) \n\t"
                "mtlo %[vector4a], $ac3 \n\t"
                "extp %[Temp5], $ac2, 9 \n\t"

                "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t"
                "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"

                "lbu %[src_ptr_l2], -10(%[src_ptr]) \n\t"
                "lbu %[src_ptr_l1], -2(%[src_ptr]) \n\t"
                "lbu %[src_ptr_0], 6(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r1], 14(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r2], 22(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r3], 30(%[src_ptr]) \n\t"
                "mtlo %[vector4a], $ac0 \n\t"
                "extp %[Temp6], $ac3, 9 \n\t"

                "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t"
                "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"

                "lbu %[src_ptr_l2], -9(%[src_ptr]) \n\t"
                "lbu %[src_ptr_l1], -1(%[src_ptr]) \n\t"
                "lbu %[src_ptr_0], 7(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r1], 15(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r2], 23(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r3], 31(%[src_ptr]) \n\t"
                "mtlo %[vector4a], $ac1 \n\t"
                "extp %[Temp7], $ac0, 9 \n\t"

                "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t"
                "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
                "extp %[Temp8], $ac1, 9 \n\t"

                : [Temp4] "=&r" (Temp4), [Temp5] "=&r" (Temp5),
                  [Temp6] "=&r" (Temp6), [Temp7] "=&r" (Temp7), [Temp8] "=r" (Temp8),
                  [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
                  [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2),
                  [src_ptr_l2] "=&r" (src_ptr_l2),[src_ptr_r3] "=&r" (src_ptr_r3)
                : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
                  [vector3b] "r" (vector3b), [vector4a] "r" (vector4a),
                  [src_ptr] "r" (src_ptr)
            );

            /* clamp and store results */
            output_ptr[0] = cm[Temp1];
            output_ptr[1] = cm[Temp2];
            output_ptr[2] = cm[Temp3];
            output_ptr[3] = cm[Temp4];
            output_ptr[4] = cm[Temp5];
            output_ptr[5] = cm[Temp6];
            output_ptr[6] = cm[Temp7];
            output_ptr[7] = cm[Temp8];

            src_ptr += 8;   /* next 8-byte intermediate row */
            output_ptr += output_pitch;
        }
    }
    else
    {
        /* 4 tap filter: outer taps (vector1b) are zero, so only the
         * center (vector2b) and negative (vector3b) pairs are applied.
         * Implemented as a chain of small asm statements; accumulator
         * state ($ac0-$ac3) is carried BETWEEN statements, so these must
         * not be separated or reordered. */

        /* prefetch src_ptr data to cache memory */
        prefetch_load(src_ptr);

        for (i = output_height; i--;)
        {
            /* pixel 0 -> $ac2 (taps at -8/0/8/16) */
            __asm__ __volatile__ (
                "lbu %[src_ptr_l1], -8(%[src_ptr]) \n\t"
                "lbu %[src_ptr_0], 0(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r1], 8(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r2], 16(%[src_ptr]) \n\t"
                "mtlo %[vector4a], $ac2 \n\t"
                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"

                : [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
                  [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2)
                : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
                  [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr)
            );

            /* pixel 1 -> $ac3; extracts pixel 0's pending $ac2 sum */
            __asm__ __volatile__ (
                "lbu %[src_ptr_l1], -7(%[src_ptr]) \n\t"
                "lbu %[src_ptr_0], 1(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r1], 9(%[src_ptr]) \n\t"
                "lbu %[src_ptr_r2], 17(%[src_ptr]) \n\t"
                "mtlo %[vector4a], $ac3 \n\t"
                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"
                "extp %[Temp1], $ac2, 9 \n\t"

                : [Temp1] "=r" (Temp1),
                  [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
                  [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2)
                : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
                  [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr)
            );

            /* pixel 2: taps loaded in C, fed as inputs to the asm */
            src_ptr_l1 = src_ptr[-6];
            src_ptr_0 = src_ptr[2];
            src_ptr_r1 = src_ptr[10];
            src_ptr_r2 = src_ptr[18];

            /* pixel 2 -> $ac0; extracts pixel 1's $ac3 sum */
            __asm__ __volatile__ (
                "mtlo %[vector4a], $ac0 \n\t"
                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"
                "extp %[Temp2], $ac3, 9 \n\t"

                : [Temp2] "=r" (Temp2)
                : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
                  [src_ptr_l1] "r" (src_ptr_l1), [src_ptr_0] "r" (src_ptr_0),
                  [src_ptr_r1] "r" (src_ptr_r1), [src_ptr_r2] "r" (src_ptr_r2),
                  [vector4a] "r" (vector4a)
            );

            src_ptr_l1 = src_ptr[-5];
            src_ptr_0 = src_ptr[3];
            src_ptr_r1 = src_ptr[11];
            src_ptr_r2 = src_ptr[19];

            /* pixel 3 -> $ac1; extracts pixel 2's $ac0 sum */
            __asm__ __volatile__ (
                "mtlo %[vector4a], $ac1 \n\t"
                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
                "extp %[Temp3], $ac0, 9 \n\t"

                : [Temp3] "=r" (Temp3)
                : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
                  [src_ptr_l1] "r" (src_ptr_l1), [src_ptr_0] "r" (src_ptr_0),
                  [src_ptr_r1] "r" (src_ptr_r1), [src_ptr_r2] "r" (src_ptr_r2),
                  [vector4a] "r" (vector4a)
            );

            src_ptr_l1 = src_ptr[-4];
            src_ptr_0 = src_ptr[4];
            src_ptr_r1 = src_ptr[12];
            src_ptr_r2 = src_ptr[20];

            /* pixel 4 -> $ac2 (now free); extracts pixel 3's $ac1 sum */
            __asm__ __volatile__ (
                "mtlo %[vector4a], $ac2 \n\t"
                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"
                "extp %[Temp4], $ac1, 9 \n\t"

                : [Temp4] "=r" (Temp4)
                : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
                  [src_ptr_l1] "r" (src_ptr_l1), [src_ptr_0] "r" (src_ptr_0),
                  [src_ptr_r1] "r" (src_ptr_r1), [src_ptr_r2] "r" (src_ptr_r2),
                  [vector4a] "r" (vector4a)
            );

            src_ptr_l1 = src_ptr[-3];
            src_ptr_0 = src_ptr[5];
            src_ptr_r1 = src_ptr[13];
            src_ptr_r2 = src_ptr[21];

            /* pixel 5 -> $ac3; extracts pixel 4's $ac2 sum */
            __asm__ __volatile__ (
                "mtlo %[vector4a], $ac3 \n\t"
                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"
                "extp %[Temp5], $ac2, 9 \n\t"

                : [Temp5] "=&r" (Temp5)
                : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
                  [src_ptr_l1] "r" (src_ptr_l1), [src_ptr_0] "r" (src_ptr_0),
                  [src_ptr_r1] "r" (src_ptr_r1), [src_ptr_r2] "r" (src_ptr_r2),
                  [vector4a] "r" (vector4a)
            );

            src_ptr_l1 = src_ptr[-2];
            src_ptr_0 = src_ptr[6];
            src_ptr_r1 = src_ptr[14];
            src_ptr_r2 = src_ptr[22];

            /* pixel 6 -> $ac0; extracts pixel 5's $ac3 sum */
            __asm__ __volatile__ (
                "mtlo %[vector4a], $ac0 \n\t"
                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"
                "extp %[Temp6], $ac3, 9 \n\t"

                : [Temp6] "=r" (Temp6)
                : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
                  [src_ptr_l1] "r" (src_ptr_l1), [src_ptr_0] "r" (src_ptr_0),
                  [src_ptr_r1] "r" (src_ptr_r1), [src_ptr_r2] "r" (src_ptr_r2),
                  [vector4a] "r" (vector4a)
            );

            src_ptr_l1 = src_ptr[-1];
            src_ptr_0 = src_ptr[7];
            src_ptr_r1 = src_ptr[15];
            src_ptr_r2 = src_ptr[23];

            /* pixel 7 -> $ac1; extracts pixels 6 and 7 ($ac0, $ac1) */
            __asm__ __volatile__ (
                "mtlo %[vector4a], $ac1 \n\t"
                "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
                "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
                "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
                "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
                "extp %[Temp7], $ac0, 9 \n\t"
                "extp %[Temp8], $ac1, 9 \n\t"

                : [Temp7] "=&r" (Temp7), [Temp8] "=r" (Temp8)
                : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
                  [src_ptr_l1] "r" (src_ptr_l1), [src_ptr_0] "r" (src_ptr_0),
                  [src_ptr_r1] "r" (src_ptr_r1), [src_ptr_r2] "r" (src_ptr_r2),
                  [vector4a] "r" (vector4a)
            );

            /* clamp and store results */
            output_ptr[0] = cm[Temp1];
            output_ptr[1] = cm[Temp2];
            output_ptr[2] = cm[Temp3];
            output_ptr[3] = cm[Temp4];
            output_ptr[4] = cm[Temp5];
            output_ptr[5] = cm[Temp6];
            output_ptr[6] = cm[Temp7];
            output_ptr[7] = cm[Temp8];

            src_ptr += 8;   /* next 8-byte intermediate row */
            output_ptr += output_pitch;
        }
    }
}
1882
1883
/*
 * Vertical (second) pass for the 16x16 six-tap predictor.
 *
 * Reads the intermediate first-pass buffer (row stride 16 bytes) and
 * writes 16 rows of 16 clamped output pixels.
 *
 *   src_ptr      - first-pass output, stride 16
 *   output_ptr   - destination block
 *   output_pitch - destination row stride in bytes
 *   vp8_filter   - packed byte-pair taps from sub_pel_filterss[yoffset]:
 *                  [0] outer (positive) pair, [1] negative pair, [2] inner pair
 */
void vp8_filter_block2d_second_pass161
(
    unsigned char *RESTRICT src_ptr,
    unsigned char *RESTRICT output_ptr,
    int output_pitch,
    const unsigned short *vp8_filter
)
{
    unsigned int i, j;

    int Temp1, Temp2, Temp3, Temp4, Temp5, Temp6, Temp7, Temp8;
    unsigned int vector4a;
    unsigned int vector1b, vector2b, vector3b;

    unsigned char src_ptr_l2;
    unsigned char src_ptr_l1;
    unsigned char src_ptr_0;
    unsigned char src_ptr_r1;
    unsigned char src_ptr_r2;
    unsigned char src_ptr_r3;
    /* clamp table: cm[x] saturates x into [0, 255] */
    unsigned char *cm = ff_cropTbl + CROP_WIDTH;

    vector4a = 64;   /* rounding constant seeded into each accumulator */

    vector1b = vp8_filter[0];
    vector2b = vp8_filter[2];
    vector3b = vp8_filter[1];

    if (vector1b == 0)
    {
        /* outer tap pair is zero: 4 tap filter */

        /* prefetch src_ptr data to cache memory */
        prefetch_load(src_ptr + 16);

        for (i = 16; i--;)
        {
            /* unrolling for loop: two asm batches of 8 pixels per row */
            for (j = 0; j < 16; j += 8)
            {
                /* apply filter with vectors pairs; rows are 16 bytes apart,
                   so vertical neighbors sit at offsets -16/0/16/32 */
                __asm__ __volatile__ (
                    "lbu            %[src_ptr_l1],  -16(%[src_ptr])                \n\t"
                    "lbu            %[src_ptr_0],   0(%[src_ptr])                  \n\t"
                    "lbu            %[src_ptr_r1],  16(%[src_ptr])                 \n\t"
                    "lbu            %[src_ptr_r2],  32(%[src_ptr])                 \n\t"
                    "mtlo           %[vector4a],    $ac2                           \n\t"
                    "append         %[src_ptr_0],   %[src_ptr_r1],  8              \n\t"
                    "append         %[src_ptr_l1],  %[src_ptr_r2],  8              \n\t"
                    "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]    \n\t"
                    "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]    \n\t"

                    "lbu            %[src_ptr_l1],  -15(%[src_ptr])                \n\t"
                    "lbu            %[src_ptr_0],   1(%[src_ptr])                  \n\t"
                    "lbu            %[src_ptr_r1],  17(%[src_ptr])                 \n\t"
                    "lbu            %[src_ptr_r2],  33(%[src_ptr])                 \n\t"
                    "mtlo           %[vector4a],    $ac3                           \n\t"
                    "extp           %[Temp1],       $ac2,           9              \n\t"

                    "append         %[src_ptr_0],   %[src_ptr_r1],  8              \n\t"
                    "append         %[src_ptr_l1],  %[src_ptr_r2],  8              \n\t"
                    "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]    \n\t"
                    "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]    \n\t"

                    "lbu            %[src_ptr_l1],  -14(%[src_ptr])                \n\t"
                    "lbu            %[src_ptr_0],   2(%[src_ptr])                  \n\t"
                    "lbu            %[src_ptr_r1],  18(%[src_ptr])                 \n\t"
                    "lbu            %[src_ptr_r2],  34(%[src_ptr])                 \n\t"
                    "mtlo           %[vector4a],    $ac1                           \n\t"
                    "extp           %[Temp2],       $ac3,           9              \n\t"

                    "append         %[src_ptr_0],   %[src_ptr_r1],  8              \n\t"
                    "append         %[src_ptr_l1],  %[src_ptr_r2],  8              \n\t"
                    "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]    \n\t"
                    "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]    \n\t"

                    "lbu            %[src_ptr_l1],  -13(%[src_ptr])                \n\t"
                    "lbu            %[src_ptr_0],   3(%[src_ptr])                  \n\t"
                    "lbu            %[src_ptr_r1],  19(%[src_ptr])                 \n\t"
                    "lbu            %[src_ptr_r2],  35(%[src_ptr])                 \n\t"
                    "mtlo           %[vector4a],    $ac3                           \n\t"
                    "extp           %[Temp3],       $ac1,           9              \n\t"

                    "append         %[src_ptr_0],   %[src_ptr_r1],  8              \n\t"
                    "append         %[src_ptr_l1],  %[src_ptr_r2],  8              \n\t"
                    "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]    \n\t"
                    "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]    \n\t"

                    "lbu            %[src_ptr_l1],  -12(%[src_ptr])                \n\t"
                    "lbu            %[src_ptr_0],   4(%[src_ptr])                  \n\t"
                    "lbu            %[src_ptr_r1],  20(%[src_ptr])                 \n\t"
                    "lbu            %[src_ptr_r2],  36(%[src_ptr])                 \n\t"
                    "mtlo           %[vector4a],    $ac2                           \n\t"
                    "extp           %[Temp4],       $ac3,           9              \n\t"

                    "append         %[src_ptr_0],   %[src_ptr_r1],  8              \n\t"
                    "append         %[src_ptr_l1],  %[src_ptr_r2],  8              \n\t"
                    "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]    \n\t"
                    "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]    \n\t"

                    "lbu            %[src_ptr_l1],  -11(%[src_ptr])                \n\t"
                    "lbu            %[src_ptr_0],   5(%[src_ptr])                  \n\t"
                    "lbu            %[src_ptr_r1],  21(%[src_ptr])                 \n\t"
                    "lbu            %[src_ptr_r2],  37(%[src_ptr])                 \n\t"
                    "mtlo           %[vector4a],    $ac3                           \n\t"
                    "extp           %[Temp5],       $ac2,           9              \n\t"

                    "append         %[src_ptr_0],   %[src_ptr_r1],  8              \n\t"
                    "append         %[src_ptr_l1],  %[src_ptr_r2],  8              \n\t"
                    "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]    \n\t"
                    "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]    \n\t"

                    "lbu            %[src_ptr_l1],  -10(%[src_ptr])                \n\t"
                    "lbu            %[src_ptr_0],   6(%[src_ptr])                  \n\t"
                    "lbu            %[src_ptr_r1],  22(%[src_ptr])                 \n\t"
                    "lbu            %[src_ptr_r2],  38(%[src_ptr])                 \n\t"
                    "mtlo           %[vector4a],    $ac1                           \n\t"
                    "extp           %[Temp6],       $ac3,           9              \n\t"

                    "append         %[src_ptr_0],   %[src_ptr_r1],  8              \n\t"
                    "append         %[src_ptr_l1],  %[src_ptr_r2],  8              \n\t"
                    "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]    \n\t"
                    "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]    \n\t"

                    "lbu            %[src_ptr_l1],  -9(%[src_ptr])                 \n\t"
                    "lbu            %[src_ptr_0],   7(%[src_ptr])                  \n\t"
                    "lbu            %[src_ptr_r1],  23(%[src_ptr])                 \n\t"
                    "lbu            %[src_ptr_r2],  39(%[src_ptr])                 \n\t"
                    "mtlo           %[vector4a],    $ac3                           \n\t"
                    "extp           %[Temp7],       $ac1,           9              \n\t"

                    "append         %[src_ptr_0],   %[src_ptr_r1],  8              \n\t"
                    "append         %[src_ptr_l1],  %[src_ptr_r2],  8              \n\t"
                    "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]    \n\t"
                    "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]    \n\t"
                    "extp           %[Temp8],       $ac3,           9              \n\t"

                    : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
                      [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4),
                      [Temp5] "=&r" (Temp5), [Temp6] "=&r" (Temp6),
                      [Temp7] "=&r" (Temp7), [Temp8] "=r" (Temp8),
                      [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
                      [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2)
                    : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
                      [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr)
                );

                /* clamp and store results */
                output_ptr[j] = cm[Temp1];
                output_ptr[j + 1] = cm[Temp2];
                output_ptr[j + 2] = cm[Temp3];
                output_ptr[j + 3] = cm[Temp4];
                output_ptr[j + 4] = cm[Temp5];
                output_ptr[j + 5] = cm[Temp6];
                output_ptr[j + 6] = cm[Temp7];
                output_ptr[j + 7] = cm[Temp8];

                src_ptr += 8;
            }

            output_ptr += output_pitch;
        }
    }
    else
    {
        /* 6 tap filter (all three tap pairs active) */

        /* prefetch src_ptr data to cache memory */
        prefetch_load(src_ptr + 16);

        /* unroll for loop: one iteration produces a full 16-pixel row in
           two asm batches of 8 */
        for (i = 16; i--;)
        {
            /* apply filter with vectors pairs; vertical neighbors are at
               offsets -32/-16/0/16/32/48 from the current sample */
            __asm__ __volatile__ (
                "lbu            %[src_ptr_l2],  -32(%[src_ptr])                \n\t"
                "lbu            %[src_ptr_l1],  -16(%[src_ptr])                \n\t"
                "lbu            %[src_ptr_0],   0(%[src_ptr])                  \n\t"
                "lbu            %[src_ptr_r1],  16(%[src_ptr])                 \n\t"
                "lbu            %[src_ptr_r2],  32(%[src_ptr])                 \n\t"
                "lbu            %[src_ptr_r3],  48(%[src_ptr])                 \n\t"
                "mtlo           %[vector4a],    $ac2                           \n\t"

                "append         %[src_ptr_l2],  %[src_ptr_r3],  8              \n\t"
                "append         %[src_ptr_0],   %[src_ptr_r1],  8              \n\t"
                "append         %[src_ptr_l1],  %[src_ptr_r2],  8              \n\t"
                "dpau.h.qbr     $ac2,           %[src_ptr_l2],  %[vector1b]    \n\t"
                "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]    \n\t"
                "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]    \n\t"

                "lbu            %[src_ptr_l2],  -31(%[src_ptr])                \n\t"
                "lbu            %[src_ptr_l1],  -15(%[src_ptr])                \n\t"
                "lbu            %[src_ptr_0],   1(%[src_ptr])                  \n\t"
                "lbu            %[src_ptr_r1],  17(%[src_ptr])                 \n\t"
                "lbu            %[src_ptr_r2],  33(%[src_ptr])                 \n\t"
                "lbu            %[src_ptr_r3],  49(%[src_ptr])                 \n\t"
                "mtlo           %[vector4a],    $ac0                           \n\t"
                "extp           %[Temp1],       $ac2,           9              \n\t"

                "append         %[src_ptr_l2],  %[src_ptr_r3],  8              \n\t"
                "append         %[src_ptr_0],   %[src_ptr_r1],  8              \n\t"
                "append         %[src_ptr_l1],  %[src_ptr_r2],  8              \n\t"
                "dpau.h.qbr     $ac0,           %[src_ptr_l2],  %[vector1b]    \n\t"
                "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]    \n\t"
                "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]    \n\t"

                "lbu            %[src_ptr_l2],  -30(%[src_ptr])                \n\t"
                "lbu            %[src_ptr_l1],  -14(%[src_ptr])                \n\t"
                "lbu            %[src_ptr_0],   2(%[src_ptr])                  \n\t"
                "lbu            %[src_ptr_r1],  18(%[src_ptr])                 \n\t"
                "lbu            %[src_ptr_r2],  34(%[src_ptr])                 \n\t"
                "lbu            %[src_ptr_r3],  50(%[src_ptr])                 \n\t"
                "mtlo           %[vector4a],    $ac1                           \n\t"
                "extp           %[Temp2],       $ac0,           9              \n\t"

                "append         %[src_ptr_l2],  %[src_ptr_r3],  8              \n\t"
                "append         %[src_ptr_0],   %[src_ptr_r1],  8              \n\t"
                "append         %[src_ptr_l1],  %[src_ptr_r2],  8              \n\t"
                "dpau.h.qbr     $ac1,           %[src_ptr_l2],  %[vector1b]    \n\t"
                "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]    \n\t"
                "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]    \n\t"

                "lbu            %[src_ptr_l2],  -29(%[src_ptr])                \n\t"
                "lbu            %[src_ptr_l1],  -13(%[src_ptr])                \n\t"
                "lbu            %[src_ptr_0],   3(%[src_ptr])                  \n\t"
                "lbu            %[src_ptr_r1],  19(%[src_ptr])                 \n\t"
                "lbu            %[src_ptr_r2],  35(%[src_ptr])                 \n\t"
                "lbu            %[src_ptr_r3],  51(%[src_ptr])                 \n\t"
                "mtlo           %[vector4a],    $ac3                           \n\t"
                "extp           %[Temp3],       $ac1,           9              \n\t"

                "append         %[src_ptr_l2],  %[src_ptr_r3],  8              \n\t"
                "append         %[src_ptr_0],   %[src_ptr_r1],  8              \n\t"
                "append         %[src_ptr_l1],  %[src_ptr_r2],  8              \n\t"
                "dpau.h.qbr     $ac3,           %[src_ptr_l2],  %[vector1b]    \n\t"
                "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]    \n\t"
                "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]    \n\t"

                "lbu            %[src_ptr_l2],  -28(%[src_ptr])                \n\t"
                "lbu            %[src_ptr_l1],  -12(%[src_ptr])                \n\t"
                "lbu            %[src_ptr_0],   4(%[src_ptr])                  \n\t"
                "lbu            %[src_ptr_r1],  20(%[src_ptr])                 \n\t"
                "lbu            %[src_ptr_r2],  36(%[src_ptr])                 \n\t"
                "lbu            %[src_ptr_r3],  52(%[src_ptr])                 \n\t"
                "mtlo           %[vector4a],    $ac2                           \n\t"
                "extp           %[Temp4],       $ac3,           9              \n\t"

                "append         %[src_ptr_l2],  %[src_ptr_r3],  8              \n\t"
                "append         %[src_ptr_0],   %[src_ptr_r1],  8              \n\t"
                "append         %[src_ptr_l1],  %[src_ptr_r2],  8              \n\t"
                "dpau.h.qbr     $ac2,           %[src_ptr_l2],  %[vector1b]    \n\t"
                "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]    \n\t"
                "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]    \n\t"

                "lbu            %[src_ptr_l2],  -27(%[src_ptr])                \n\t"
                "lbu            %[src_ptr_l1],  -11(%[src_ptr])                \n\t"
                "lbu            %[src_ptr_0],   5(%[src_ptr])                  \n\t"
                "lbu            %[src_ptr_r1],  21(%[src_ptr])                 \n\t"
                "lbu            %[src_ptr_r2],  37(%[src_ptr])                 \n\t"
                "lbu            %[src_ptr_r3],  53(%[src_ptr])                 \n\t"
                "mtlo           %[vector4a],    $ac0                           \n\t"
                "extp           %[Temp5],       $ac2,           9              \n\t"

                "append         %[src_ptr_l2],  %[src_ptr_r3],  8              \n\t"
                "append         %[src_ptr_0],   %[src_ptr_r1],  8              \n\t"
                "append         %[src_ptr_l1],  %[src_ptr_r2],  8              \n\t"
                "dpau.h.qbr     $ac0,           %[src_ptr_l2],  %[vector1b]    \n\t"
                "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]    \n\t"
                "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]    \n\t"

                "lbu            %[src_ptr_l2],  -26(%[src_ptr])                \n\t"
                "lbu            %[src_ptr_l1],  -10(%[src_ptr])                \n\t"
                "lbu            %[src_ptr_0],   6(%[src_ptr])                  \n\t"
                "lbu            %[src_ptr_r1],  22(%[src_ptr])                 \n\t"
                "lbu            %[src_ptr_r2],  38(%[src_ptr])                 \n\t"
                "lbu            %[src_ptr_r3],  54(%[src_ptr])                 \n\t"
                "mtlo           %[vector4a],    $ac1                           \n\t"
                "extp           %[Temp6],       $ac0,           9              \n\t"

                "append         %[src_ptr_l2],  %[src_ptr_r3],  8              \n\t"
                "append         %[src_ptr_0],   %[src_ptr_r1],  8              \n\t"
                "append         %[src_ptr_l1],  %[src_ptr_r2],  8              \n\t"
                "dpau.h.qbr     $ac1,           %[src_ptr_l2],  %[vector1b]    \n\t"
                "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]    \n\t"
                "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]    \n\t"

                "lbu            %[src_ptr_l2],  -25(%[src_ptr])                \n\t"
                "lbu            %[src_ptr_l1],  -9(%[src_ptr])                 \n\t"
                "lbu            %[src_ptr_0],   7(%[src_ptr])                  \n\t"
                "lbu            %[src_ptr_r1],  23(%[src_ptr])                 \n\t"
                "lbu            %[src_ptr_r2],  39(%[src_ptr])                 \n\t"
                "lbu            %[src_ptr_r3],  55(%[src_ptr])                 \n\t"
                "mtlo           %[vector4a],    $ac3                           \n\t"
                "extp           %[Temp7],       $ac1,           9              \n\t"

                "append         %[src_ptr_l2],  %[src_ptr_r3],  8              \n\t"
                "append         %[src_ptr_0],   %[src_ptr_r1],  8              \n\t"
                "append         %[src_ptr_l1],  %[src_ptr_r2],  8              \n\t"
                "dpau.h.qbr     $ac3,           %[src_ptr_l2],  %[vector1b]    \n\t"
                "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]    \n\t"
                "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]    \n\t"
                "extp           %[Temp8],       $ac3,           9              \n\t"

                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
                  [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4),
                  [Temp5] "=&r" (Temp5), [Temp6] "=&r" (Temp6),
                  [Temp7] "=&r" (Temp7), [Temp8] "=r" (Temp8),
                  [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
                  [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2),
                  [src_ptr_l2] "=&r" (src_ptr_l2),[src_ptr_r3] "=&r" (src_ptr_r3)
                : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
                  [vector3b] "r" (vector3b), [vector4a] "r" (vector4a),
                  [src_ptr] "r" (src_ptr)
            );

            /* clamp and store results (left half of the row) */
            output_ptr[0] = cm[Temp1];
            output_ptr[1] = cm[Temp2];
            output_ptr[2] = cm[Temp3];
            output_ptr[3] = cm[Temp4];
            output_ptr[4] = cm[Temp5];
            output_ptr[5] = cm[Temp6];
            output_ptr[6] = cm[Temp7];
            output_ptr[7] = cm[Temp8];

            /* apply filter with vectors pairs (columns 8..15) */
            __asm__ __volatile__ (
                "lbu            %[src_ptr_l2],  -24(%[src_ptr])                \n\t"
                "lbu            %[src_ptr_l1],  -8(%[src_ptr])                 \n\t"
                "lbu            %[src_ptr_0],   8(%[src_ptr])                  \n\t"
                "lbu            %[src_ptr_r1],  24(%[src_ptr])                 \n\t"
                "lbu            %[src_ptr_r2],  40(%[src_ptr])                 \n\t"
                "lbu            %[src_ptr_r3],  56(%[src_ptr])                 \n\t"
                "mtlo           %[vector4a],    $ac2                           \n\t"

                "append         %[src_ptr_l2],  %[src_ptr_r3],  8              \n\t"
                "append         %[src_ptr_0],   %[src_ptr_r1],  8              \n\t"
                "append         %[src_ptr_l1],  %[src_ptr_r2],  8              \n\t"
                "dpau.h.qbr     $ac2,           %[src_ptr_l2],  %[vector1b]    \n\t"
                "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]    \n\t"
                "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]    \n\t"

                "lbu            %[src_ptr_l2],  -23(%[src_ptr])                \n\t"
                "lbu            %[src_ptr_l1],  -7(%[src_ptr])                 \n\t"
                "lbu            %[src_ptr_0],   9(%[src_ptr])                  \n\t"
                "lbu            %[src_ptr_r1],  25(%[src_ptr])                 \n\t"
                "lbu            %[src_ptr_r2],  41(%[src_ptr])                 \n\t"
                "lbu            %[src_ptr_r3],  57(%[src_ptr])                 \n\t"
                "mtlo           %[vector4a],    $ac0                           \n\t"
                "extp           %[Temp1],       $ac2,           9              \n\t"

                "append         %[src_ptr_l2],  %[src_ptr_r3],  8              \n\t"
                "append         %[src_ptr_0],   %[src_ptr_r1],  8              \n\t"
                "append         %[src_ptr_l1],  %[src_ptr_r2],  8              \n\t"
                "dpau.h.qbr     $ac0,           %[src_ptr_l2],  %[vector1b]    \n\t"
                "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]    \n\t"
                "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]    \n\t"

                "lbu            %[src_ptr_l2],  -22(%[src_ptr])                \n\t"
                "lbu            %[src_ptr_l1],  -6(%[src_ptr])                 \n\t"
                "lbu            %[src_ptr_0],   10(%[src_ptr])                 \n\t"
                "lbu            %[src_ptr_r1],  26(%[src_ptr])                 \n\t"
                "lbu            %[src_ptr_r2],  42(%[src_ptr])                 \n\t"
                "lbu            %[src_ptr_r3],  58(%[src_ptr])                 \n\t"
                "mtlo           %[vector4a],    $ac1                           \n\t"
                "extp           %[Temp2],       $ac0,           9              \n\t"

                "append         %[src_ptr_l2],  %[src_ptr_r3],  8              \n\t"
                "append         %[src_ptr_0],   %[src_ptr_r1],  8              \n\t"
                "append         %[src_ptr_l1],  %[src_ptr_r2],  8              \n\t"
                "dpau.h.qbr     $ac1,           %[src_ptr_l2],  %[vector1b]    \n\t"
                "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]    \n\t"
                "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]    \n\t"

                "lbu            %[src_ptr_l2],  -21(%[src_ptr])                \n\t"
                "lbu            %[src_ptr_l1],  -5(%[src_ptr])                 \n\t"
                "lbu            %[src_ptr_0],   11(%[src_ptr])                 \n\t"
                "lbu            %[src_ptr_r1],  27(%[src_ptr])                 \n\t"
                "lbu            %[src_ptr_r2],  43(%[src_ptr])                 \n\t"
                "lbu            %[src_ptr_r3],  59(%[src_ptr])                 \n\t"
                "mtlo           %[vector4a],    $ac3                           \n\t"
                "extp           %[Temp3],       $ac1,           9              \n\t"

                "append         %[src_ptr_l2],  %[src_ptr_r3],  8              \n\t"
                "append         %[src_ptr_0],   %[src_ptr_r1],  8              \n\t"
                "append         %[src_ptr_l1],  %[src_ptr_r2],  8              \n\t"
                "dpau.h.qbr     $ac3,           %[src_ptr_l2],  %[vector1b]    \n\t"
                "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]    \n\t"
                "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]    \n\t"

                "lbu            %[src_ptr_l2],  -20(%[src_ptr])                \n\t"
                "lbu            %[src_ptr_l1],  -4(%[src_ptr])                 \n\t"
                "lbu            %[src_ptr_0],   12(%[src_ptr])                 \n\t"
                "lbu            %[src_ptr_r1],  28(%[src_ptr])                 \n\t"
                "lbu            %[src_ptr_r2],  44(%[src_ptr])                 \n\t"
                "lbu            %[src_ptr_r3],  60(%[src_ptr])                 \n\t"
                "mtlo           %[vector4a],    $ac2                           \n\t"
                "extp           %[Temp4],       $ac3,           9              \n\t"

                "append         %[src_ptr_l2],  %[src_ptr_r3],  8              \n\t"
                "append         %[src_ptr_0],   %[src_ptr_r1],  8              \n\t"
                "append         %[src_ptr_l1],  %[src_ptr_r2],  8              \n\t"
                "dpau.h.qbr     $ac2,           %[src_ptr_l2],  %[vector1b]    \n\t"
                "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]    \n\t"
                "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]    \n\t"

                "lbu            %[src_ptr_l2],  -19(%[src_ptr])                \n\t"
                "lbu            %[src_ptr_l1],  -3(%[src_ptr])                 \n\t"
                "lbu            %[src_ptr_0],   13(%[src_ptr])                 \n\t"
                "lbu            %[src_ptr_r1],  29(%[src_ptr])                 \n\t"
                "lbu            %[src_ptr_r2],  45(%[src_ptr])                 \n\t"
                "lbu            %[src_ptr_r3],  61(%[src_ptr])                 \n\t"
                "mtlo           %[vector4a],    $ac0                           \n\t"
                "extp           %[Temp5],       $ac2,           9              \n\t"

                "append         %[src_ptr_l2],  %[src_ptr_r3],  8              \n\t"
                "append         %[src_ptr_0],   %[src_ptr_r1],  8              \n\t"
                "append         %[src_ptr_l1],  %[src_ptr_r2],  8              \n\t"
                "dpau.h.qbr     $ac0,           %[src_ptr_l2],  %[vector1b]    \n\t"
                "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]    \n\t"
                "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]    \n\t"

                "lbu            %[src_ptr_l2],  -18(%[src_ptr])                \n\t"
                "lbu            %[src_ptr_l1],  -2(%[src_ptr])                 \n\t"
                "lbu            %[src_ptr_0],   14(%[src_ptr])                 \n\t"
                "lbu            %[src_ptr_r1],  30(%[src_ptr])                 \n\t"
                "lbu            %[src_ptr_r2],  46(%[src_ptr])                 \n\t"
                "lbu            %[src_ptr_r3],  62(%[src_ptr])                 \n\t"
                "mtlo           %[vector4a],    $ac1                           \n\t"
                "extp           %[Temp6],       $ac0,           9              \n\t"

                "append         %[src_ptr_l2],  %[src_ptr_r3],  8              \n\t"
                "append         %[src_ptr_0],   %[src_ptr_r1],  8              \n\t"
                "append         %[src_ptr_l1],  %[src_ptr_r2],  8              \n\t"
                "dpau.h.qbr     $ac1,           %[src_ptr_l2],  %[vector1b]    \n\t"
                "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]    \n\t"
                "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]    \n\t"

                "lbu            %[src_ptr_l2],  -17(%[src_ptr])                \n\t"
                "lbu            %[src_ptr_l1],  -1(%[src_ptr])                 \n\t"
                "lbu            %[src_ptr_0],   15(%[src_ptr])                 \n\t"
                "lbu            %[src_ptr_r1],  31(%[src_ptr])                 \n\t"
                "lbu            %[src_ptr_r2],  47(%[src_ptr])                 \n\t"
                "lbu            %[src_ptr_r3],  63(%[src_ptr])                 \n\t"
                "mtlo           %[vector4a],    $ac3                           \n\t"
                "extp           %[Temp7],       $ac1,           9              \n\t"

                "append         %[src_ptr_l2],  %[src_ptr_r3],  8              \n\t"
                "append         %[src_ptr_0],   %[src_ptr_r1],  8              \n\t"
                "append         %[src_ptr_l1],  %[src_ptr_r2],  8              \n\t"
                "dpau.h.qbr     $ac3,           %[src_ptr_l2],  %[vector1b]    \n\t"
                "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]    \n\t"
                "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]    \n\t"
                "extp           %[Temp8],       $ac3,           9              \n\t"

                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
                  [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4),
                  [Temp5] "=&r" (Temp5), [Temp6] "=&r" (Temp6),
                  [Temp7] "=&r" (Temp7), [Temp8] "=r" (Temp8),
                  [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
                  [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2),
                  [src_ptr_l2] "=&r" (src_ptr_l2), [src_ptr_r3] "=&r" (src_ptr_r3)
                : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
                  [vector3b] "r" (vector3b), [vector4a] "r" (vector4a),
                  [src_ptr] "r" (src_ptr)
            );

            src_ptr += 16;
            /* clamp and store results (right half of the row) */
            output_ptr[8] = cm[Temp1];
            output_ptr[9] = cm[Temp2];
            output_ptr[10] = cm[Temp3];
            output_ptr[11] = cm[Temp4];
            output_ptr[12] = cm[Temp5];
            output_ptr[13] = cm[Temp6];
            output_ptr[14] = cm[Temp7];
            output_ptr[15] = cm[Temp8];

            output_ptr += output_pitch;
        }
    }
}
2365
2366
/*
 * 4x4 six-tap sub-pel predictor (DSPr2).
 *
 * Runs the horizontal pass, then - when yoffset selects a vertical
 * phase - the vertical pass over an intermediate buffer.
 */
void vp8_sixtap_predict4x4_dspr2
(
    unsigned char *RESTRICT src_ptr,
    int src_pixels_per_line,
    int xoffset,
    int yoffset,
    unsigned char *RESTRICT dst_ptr,
    int dst_pitch
)
{
    /* intermediate buffer: 9 rows of 4 first-pass results */
    unsigned char temp_buf[9 * 4];
    unsigned int dsp_pos = 16;

    /* set accumulator extract position in the DSP control register */
    __asm__ __volatile__ (
        "wrdsp      %[pos],     1           \n\t"
        :
        : [pos] "r" (dsp_pos)
    );

    if (!yoffset)
    {
        /* horizontal-only: the first pass writes straight to dst_ptr */
        vp8_filter_block2d_first_pass_4(src_ptr, dst_ptr, src_pixels_per_line,
                                        4, xoffset, dst_pitch);
        return;
    }

    /* First filter 1-D horizontally into the temp buffer, starting two
       rows above the block to provide vertical filter context... */
    vp8_filter_block2d_first_pass_4(src_ptr - (2 * src_pixels_per_line),
                                    temp_buf, src_pixels_per_line, 9,
                                    xoffset, 4);
    /* ...then filter vertically into the destination (skip the two
       context rows: 2 rows * 4 bytes). */
    vp8_filter_block2d_second_pass4(temp_buf + 8, dst_ptr, dst_pitch, yoffset);
}
2400
2401
/*
 * 8x8 six-tap sub-pel predictor (DSPr2).
 *
 * Horizontal pass into FData (or a straight row copy when xoffset == 0),
 * followed by the vertical pass when yoffset != 0; otherwise the result
 * goes directly to dst_ptr.
 */
void vp8_sixtap_predict8x8_dspr2
(
    unsigned char *RESTRICT src_ptr,
    int src_pixels_per_line,
    int xoffset,
    int yoffset,
    unsigned char *RESTRICT dst_ptr,
    int dst_pitch
)
{

    unsigned char FData[13 * 8]; /* temp data buffer used in filtering */
    unsigned int pos, Temp1, Temp2;

    pos = 16;

    /* bit position for extract from acc */
    __asm__ __volatile__ (
        "wrdsp      %[pos],     1           \n\t"
        :
        : [pos] "r" (pos)
    );

    if (yoffset)
    {

        /* back up two rows to provide vertical filter context */
        src_ptr = src_ptr - (2 * src_pixels_per_line);

        if (xoffset)
            /* filter 1-D horizontally... */
            vp8_filter_block2d_first_pass_8_all(src_ptr, FData, src_pixels_per_line,
                                                13, xoffset, 8);

        else
        {
            /* xoffset == 0: copy 13 rows of 8 bytes into FData (stride 8) */

            /* prefetch src_ptr data to cache memory */
            prefetch_load(src_ptr + 2 * src_pixels_per_line);

            /* NOTE(review): the asm advances %[src_ptr], which is declared as
               an input-only operand; GCC requires modified operands to be
               outputs or clobbered — works in practice here but fragile. */
            __asm__ __volatile__ (
                "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                               \n\t"
                "sw     %[Temp1],   0(%[FData])                                 \n\t"
                "sw     %[Temp2],   4(%[FData])                                 \n\t"
                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]       \n\t"

                "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                               \n\t"
                "sw     %[Temp1],   8(%[FData])                                 \n\t"
                "sw     %[Temp2],   12(%[FData])                                \n\t"
                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]       \n\t"

                "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                               \n\t"
                "sw     %[Temp1],   16(%[FData])                                \n\t"
                "sw     %[Temp2],   20(%[FData])                                \n\t"
                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]       \n\t"

                "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                               \n\t"
                "sw     %[Temp1],   24(%[FData])                                \n\t"
                "sw     %[Temp2],   28(%[FData])                                \n\t"
                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]       \n\t"

                "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                               \n\t"
                "sw     %[Temp1],   32(%[FData])                                \n\t"
                "sw     %[Temp2],   36(%[FData])                                \n\t"
                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]       \n\t"

                "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                               \n\t"
                "sw     %[Temp1],   40(%[FData])                                \n\t"
                "sw     %[Temp2],   44(%[FData])                                \n\t"
                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]       \n\t"

                "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                               \n\t"
                "sw     %[Temp1],   48(%[FData])                                \n\t"
                "sw     %[Temp2],   52(%[FData])                                \n\t"
                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]       \n\t"

                "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                               \n\t"
                "sw     %[Temp1],   56(%[FData])                                \n\t"
                "sw     %[Temp2],   60(%[FData])                                \n\t"
                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]       \n\t"

                "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                               \n\t"
                "sw     %[Temp1],   64(%[FData])                                \n\t"
                "sw     %[Temp2],   68(%[FData])                                \n\t"
                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]       \n\t"

                "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                               \n\t"
                "sw     %[Temp1],   72(%[FData])                                \n\t"
                "sw     %[Temp2],   76(%[FData])                                \n\t"
                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]       \n\t"

                "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                               \n\t"
                "sw     %[Temp1],   80(%[FData])                                \n\t"
                "sw     %[Temp2],   84(%[FData])                                \n\t"
                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]       \n\t"

                "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                               \n\t"
                "sw     %[Temp1],   88(%[FData])                                \n\t"
                "sw     %[Temp2],   92(%[FData])                                \n\t"
                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]       \n\t"

                "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                               \n\t"
                "sw     %[Temp1],   96(%[FData])                                \n\t"
                "sw     %[Temp2],   100(%[FData])                               \n\t"

                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2)
                : [FData] "r" (FData), [src_ptr] "r" (src_ptr),
                  [src_pixels_per_line] "r" (src_pixels_per_line)
            );
        }

        /* filter vertically, skipping the two context rows (2 * 8 bytes) */
        vp8_filter_block2d_second_pass_8(FData + 16, dst_ptr, dst_pitch, 8, 8, yoffset);
    }

    /* if (yoffset == 0) the first pass saves data to dst_ptr directly */
    else
    {
        if (xoffset)
            vp8_filter_block2d_first_pass_8_all(src_ptr, dst_ptr, src_pixels_per_line,
                                                8, xoffset, dst_pitch);

        else
        {
            /* copy from src buffer to dst buffer */
            /* NOTE(review): rows are stored at dst offsets 0, 8, ..., 56,
               i.e. this path assumes dst_pitch == 8 — confirm against
               callers.  Same input-operand caveat as above for src_ptr. */
            __asm__ __volatile__ (
                "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                               \n\t"
                "sw     %[Temp1],   0(%[dst_ptr])                               \n\t"
                "sw     %[Temp2],   4(%[dst_ptr])                               \n\t"
                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]       \n\t"

                "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                               \n\t"
                "sw     %[Temp1],   8(%[dst_ptr])                               \n\t"
                "sw     %[Temp2],   12(%[dst_ptr])                              \n\t"
                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]       \n\t"

                "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                               \n\t"
                "sw     %[Temp1],   16(%[dst_ptr])                              \n\t"
                "sw     %[Temp2],   20(%[dst_ptr])                              \n\t"
                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]       \n\t"

                "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                               \n\t"
                "sw     %[Temp1],   24(%[dst_ptr])                              \n\t"
                "sw     %[Temp2],   28(%[dst_ptr])                              \n\t"
                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]       \n\t"

                "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                               \n\t"
                "sw     %[Temp1],   32(%[dst_ptr])                              \n\t"
                "sw     %[Temp2],   36(%[dst_ptr])                              \n\t"
                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]       \n\t"

                "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                               \n\t"
                "sw     %[Temp1],   40(%[dst_ptr])                              \n\t"
                "sw     %[Temp2],   44(%[dst_ptr])                              \n\t"
                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]       \n\t"

                "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                               \n\t"
                "sw     %[Temp1],   48(%[dst_ptr])                              \n\t"
                "sw     %[Temp2],   52(%[dst_ptr])                              \n\t"
                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]       \n\t"

                "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                               \n\t"
                "sw     %[Temp1],   56(%[dst_ptr])                              \n\t"
                "sw     %[Temp2],   60(%[dst_ptr])                              \n\t"

                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2)
                : [dst_ptr] "r" (dst_ptr), [src_ptr] "r" (src_ptr),
                  [src_pixels_per_line] "r" (src_pixels_per_line)
            );
        }
    }
}
2593
2594
/*
 * 8x4 six-tap sub-pel predictor (DSPr2).
 *
 * Same structure as the 8x8 variant, but filters/copies only 4 output
 * rows (9 intermediate rows for vertical filter context).
 */
void vp8_sixtap_predict8x4_dspr2
(
    unsigned char *RESTRICT src_ptr,
    int src_pixels_per_line,
    int xoffset,
    int yoffset,
    unsigned char *RESTRICT dst_ptr,
    int dst_pitch
)
{
    unsigned char FData[9 * 8]; /* temp data buffer used in filtering */
    unsigned int pos, Temp1, Temp2;

    pos = 16;

    /* bit position for extract from acc */
    __asm__ __volatile__ (
        "wrdsp      %[pos],     1           \n\t"
        :
        : [pos] "r" (pos)
    );

    if (yoffset)
    {

        /* back up two rows to provide vertical filter context */
        src_ptr = src_ptr - (2 * src_pixels_per_line);

        if (xoffset)
            /* filter 1-D horizontally... */
            vp8_filter_block2d_first_pass_8_all(src_ptr, FData, src_pixels_per_line,
                                                9, xoffset, 8);

        else
        {
            /* xoffset == 0: copy 9 rows of 8 bytes into FData (stride 8) */

            /* prefetch src_ptr data to cache memory */
            prefetch_load(src_ptr + 2 * src_pixels_per_line);

            /* NOTE(review): the asm advances %[src_ptr], which is declared as
               an input-only operand; GCC requires modified operands to be
               outputs or clobbered — works in practice here but fragile. */
            __asm__ __volatile__ (
                "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                               \n\t"
                "sw     %[Temp1],   0(%[FData])                                 \n\t"
                "sw     %[Temp2],   4(%[FData])                                 \n\t"
                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]       \n\t"

                "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                               \n\t"
                "sw     %[Temp1],   8(%[FData])                                 \n\t"
                "sw     %[Temp2],   12(%[FData])                                \n\t"
                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]       \n\t"

                "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                               \n\t"
                "sw     %[Temp1],   16(%[FData])                                \n\t"
                "sw     %[Temp2],   20(%[FData])                                \n\t"
                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]       \n\t"

                "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                               \n\t"
                "sw     %[Temp1],   24(%[FData])                                \n\t"
                "sw     %[Temp2],   28(%[FData])                                \n\t"
                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]       \n\t"

                "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                               \n\t"
                "sw     %[Temp1],   32(%[FData])                                \n\t"
                "sw     %[Temp2],   36(%[FData])                                \n\t"
                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]       \n\t"

                "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                               \n\t"
                "sw     %[Temp1],   40(%[FData])                                \n\t"
                "sw     %[Temp2],   44(%[FData])                                \n\t"
                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]       \n\t"

                "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                               \n\t"
                "sw     %[Temp1],   48(%[FData])                                \n\t"
                "sw     %[Temp2],   52(%[FData])                                \n\t"
                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]       \n\t"

                "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                               \n\t"
                "sw     %[Temp1],   56(%[FData])                                \n\t"
                "sw     %[Temp2],   60(%[FData])                                \n\t"
                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]       \n\t"

                "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                               \n\t"
                "sw     %[Temp1],   64(%[FData])                                \n\t"
                "sw     %[Temp2],   68(%[FData])                                \n\t"

                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2)
                : [FData] "r" (FData), [src_ptr] "r" (src_ptr),
                  [src_pixels_per_line] "r" (src_pixels_per_line)
            );
        }

        /* filter vertically, skipping the two context rows (2 * 8 bytes) */
        vp8_filter_block2d_second_pass_8(FData + 16, dst_ptr, dst_pitch, 4, 8, yoffset);
    }

    /* if (yoffset == 0) the first pass saves data to dst_ptr directly */
    else
    {
        if (xoffset)
            vp8_filter_block2d_first_pass_8_all(src_ptr, dst_ptr, src_pixels_per_line,
                                                4, xoffset, dst_pitch);

        else
        {
            /* copy from src buffer to dst buffer */
            /* NOTE(review): rows are stored at dst offsets 0, 8, 16, 24,
               i.e. this path assumes dst_pitch == 8 — confirm against
               callers.  Same input-operand caveat as above for src_ptr. */
            __asm__ __volatile__ (
                "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                               \n\t"
                "sw     %[Temp1],   0(%[dst_ptr])                               \n\t"
                "sw     %[Temp2],   4(%[dst_ptr])                               \n\t"
                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]       \n\t"

                "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                               \n\t"
                "sw     %[Temp1],   8(%[dst_ptr])                               \n\t"
                "sw     %[Temp2],   12(%[dst_ptr])                              \n\t"
                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]       \n\t"

                "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                               \n\t"
                "sw     %[Temp1],   16(%[dst_ptr])                              \n\t"
                "sw     %[Temp2],   20(%[dst_ptr])                              \n\t"
                "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]       \n\t"

                "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
                "ulw    %[Temp2],   4(%[src_ptr])                               \n\t"
                "sw     %[Temp1],   24(%[dst_ptr])                              \n\t"
                "sw     %[Temp2],   28(%[dst_ptr])                              \n\t"

                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2)
                : [dst_ptr] "r" (dst_ptr), [src_ptr] "r" (src_ptr),
                  [src_pixels_per_line] "r" (src_pixels_per_line)
            );
        }
    }
}
2737
2738
/*
 * 16x16 six-tap sub-pel predictor (DSPr2).
 *
 * Dispatches the horizontal pass on the parity of xoffset (odd sub-pel
 * phases use the 4-tap kernels, even non-zero phases the 6-tap kernels,
 * zero is a plain copy), then runs the vertical pass when yoffset != 0.
 * xoffset and yoffset are sub-pel phases in [0, 7].
 */
void vp8_sixtap_predict16x16_dspr2
(
    unsigned char *RESTRICT src_ptr,
    int src_pixels_per_line,
    int xoffset,
    int yoffset,
    unsigned char *RESTRICT dst_ptr,
    int dst_pitch
)
{
    /* 21 rows: 16 output rows plus 5 rows of vertical filter context */
    unsigned char first_pass_buf[21 * 16];
    const unsigned short *vfilter = sub_pel_filterss[yoffset];
    unsigned int dsp_pos = 16;

    /* set accumulator extract position in the DSP control register */
    __asm__ __volatile__ (
        "wrdsp      %[pos],     1           \n\t"
        :
        : [pos] "r" (dsp_pos)
    );

    if (yoffset)
    {
        /* back up two rows for vertical filter context */
        src_ptr -= 2 * src_pixels_per_line;

        if (xoffset & 1)
        {
            /* odd phase: 4 tap filter */
            vp8_filter_block2d_first_pass16_4tap(src_ptr, first_pass_buf,
                                                 src_pixels_per_line, 16, 21,
                                                 xoffset, yoffset,
                                                 dst_ptr, dst_pitch);
        }
        else if (xoffset)
        {
            /* even non-zero phase: 6 tap filter */
            vp8_filter_block2d_first_pass16_6tap(src_ptr, first_pass_buf,
                                                 src_pixels_per_line,
                                                 21, xoffset, 16);
        }
        else
        {
            /* zero phase: only copy buffer */
            vp8_filter_block2d_first_pass16_0(src_ptr, first_pass_buf,
                                              src_pixels_per_line);
        }

        /* vertical pass; skip the two context rows (2 * 16 bytes) */
        vp8_filter_block2d_second_pass161(first_pass_buf + 32, dst_ptr,
                                          dst_pitch, vfilter);
    }
    else
    {
        /* yoffset == 0: the first pass writes directly to dst_ptr
           (xoffset == 0 with yoffset == 0 is never dispatched here) */
        if (xoffset & 1)
        {
            /* 4 tap filter */
            vp8_filter_block2d_first_pass16_4tap(src_ptr, dst_ptr,
                                                 src_pixels_per_line, 16, 21,
                                                 xoffset, yoffset,
                                                 dst_ptr, dst_pitch);
        }
        else if (xoffset)
        {
            /* 6 tap filter */
            vp8_filter_block2d_first_pass16_6tap(src_ptr, dst_ptr,
                                                 src_pixels_per_line,
                                                 16, xoffset, dst_pitch);
        }
    }
}
2822
2823 #endif
2824