1 /*
2 * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include <stdlib.h>
12 #include "vp8_rtcd.h"
13 #include "vpx_ports/mem.h"
14
#if HAVE_DSPR2
#define CROP_WIDTH 256
/* Pixel clamp table filled by dsputil_static_init(): indexing at
 * (ff_cropTbl + CROP_WIDTH)[v] clips v in [-CROP_WIDTH, 255 + CROP_WIDTH)
 * to the [0, 255] byte range. */
unsigned char ff_cropTbl[256 + 2 * CROP_WIDTH];
18
/* VP8 subpel filter taps packed two unsigned 16-bit coefficients per
 * element for DSPR2 SIMD use; row index is the subpel offset (0-7).
 * Rows whose first element is 0 carry only four non-zero taps. */
static const unsigned short sub_pel_filterss[8][3] = {
  { 0, 0, 0 },
  { 0, 0x0601, 0x7b0c },
  { 0x0201, 0x0b08, 0x6c24 },
  { 0, 0x0906, 0x5d32 },
  { 0x0303, 0x1010, 0x4d4d },
  { 0, 0x0609, 0x325d },
  { 0x0102, 0x080b, 0x246c },
  { 0, 0x0106, 0x0c7b },
};
29
/* VP8 6-tap subpel filter coefficients packed as two signed 16-bit taps
 * per 32-bit word, as consumed by the DSPR2 "dpa.w.ph" instruction; row
 * index is the subpel offset (0-7). */
static const int sub_pel_filters_int[8][3] = {
  { 0, 0, 0 },
  { 0x0000fffa, 0x007b000c, 0xffff0000 },
  { 0x0002fff5, 0x006c0024, 0xfff80001 },
  { 0x0000fff7, 0x005d0032, 0xfffa0000 },
  { 0x0003fff0, 0x004d004d, 0xfff00003 },
  { 0x0000fffa, 0x0032005d, 0xfff70000 },
  { 0x0001fff8, 0x0024006c, 0xfff50002 },
  { 0x0000ffff, 0x000c007b, 0xfffa0000 },
};
40
/* Same taps as sub_pel_filters_int but with the two 16-bit halfwords of
 * every word swapped ("inv" ordering).  This is the table used by the
 * horizontal first-pass filters below; element [xoffset][2] also acts as
 * the 6-tap/4-tap selector (upper halfword non-zero => 6-tap). */
static const int sub_pel_filters_inv[8][3] = {
  { 0, 0, 0 },
  { 0xfffa0000, 0x000c007b, 0x0000ffff },
  { 0xfff50002, 0x0024006c, 0x0001fff8 },
  { 0xfff70000, 0x0032005d, 0x0000fffa },
  { 0xfff00003, 0x004d004d, 0x0003fff0 },
  { 0xfffa0000, 0x005d0032, 0x0000fff7 },
  { 0xfff80001, 0x006c0024, 0x0002fff5 },
  { 0xffff0000, 0x007b000c, 0x0000fffa },
};
51
52 /* clang-format off */
/* 4-tap variants of the packed filters: only the odd subpel offsets
 * (1, 3, 5, 7) are 4-tap positions, so the even rows are unused zeros. */
static const int sub_pel_filters_int_tap_4[8][2] = {
  { 0, 0},
  { 0xfffa007b, 0x000cffff},
  { 0, 0},
  { 0xfff7005d, 0x0032fffa},
  { 0, 0},
  { 0xfffa0032, 0x005dfff7},
  { 0, 0},
  { 0xffff000c, 0x007bfffa},
};
63
64
/* Halfword-swapped ("inv") ordering of sub_pel_filters_int_tap_4, used by
 * the horizontal 4-tap first-pass filters; even rows are unused zeros. */
static const int sub_pel_filters_inv_tap_4[8][2] = {
  { 0, 0},
  { 0x007bfffa, 0xffff000c},
  { 0, 0},
  { 0x005dfff7, 0xfffa0032},
  { 0, 0},
  { 0x0032fffa, 0xfff7005d},
  { 0, 0},
  { 0x000cffff, 0xfffa007b},
};
75 /* clang-format on */
76
/* Hint the cache to fetch the line containing src for a subsequent load
 * (MIPS "pref" with hint 0 = load). */
inline void prefetch_load(unsigned char *src) {
  __asm__ __volatile__("pref 0, 0(%[src]) \n\t" : : [src] "r"(src));
}
80
/* Hint the cache to prepare the line containing dst for a subsequent
 * store (MIPS "pref" with hint 1 = store). */
inline void prefetch_store(unsigned char *dst) {
  __asm__ __volatile__("pref 1, 0(%[dst]) \n\t" : : [dst] "r"(dst));
}
84
dsputil_static_init(void)85 void dsputil_static_init(void) {
86 int i;
87
88 for (i = 0; i < 256; ++i) ff_cropTbl[i + CROP_WIDTH] = i;
89
90 for (i = 0; i < CROP_WIDTH; ++i) {
91 ff_cropTbl[i] = 0;
92 ff_cropTbl[i + CROP_WIDTH + 256] = 255;
93 }
94 }
95
/* First (horizontal) pass of the VP8 subpixel filter for a 4-pixel-wide
 * block on MIPS DSPR2.
 *
 *   src_ptr              source pixels; the 6-tap path reads from
 *                        src_ptr - 2, the 4-tap path from src_ptr - 1
 *   dst_ptr              destination rows (4 bytes each)
 *   src_pixels_per_line  source stride in bytes
 *   output_height        number of rows to produce
 *   xoffset              horizontal subpel position (0-7); selects filter
 *   pitch                destination stride in bytes
 *
 * xoffset == 0 reduces to a plain 4-byte row copy.  Otherwise the packed
 * word sub_pel_filters_inv[xoffset][2] selects the 6-tap filters
 * (> 65536, i.e. non-zero upper halfword) versus the 4-tap ones.  Each
 * accumulator is preloaded with the rounding bias (64); the result is
 * extracted with "extp" and clamped to [0, 255] through the cm table. */
void vp8_filter_block2d_first_pass_4(unsigned char *RESTRICT src_ptr,
                                     unsigned char *RESTRICT dst_ptr,
                                     unsigned int src_pixels_per_line,
                                     unsigned int output_height, int xoffset,
                                     int pitch) {
  unsigned int i;
  int Temp1, Temp2, Temp3, Temp4;

  unsigned int vector4a = 64; /* rounding bias preloaded into the ACs */
  int vector1b, vector2b, vector3b;
  unsigned int tp1, tp2, tn1, tn2;
  unsigned int p1, p2, p3;
  unsigned int n1, n2, n3;
  unsigned char *cm = ff_cropTbl + CROP_WIDTH; /* clamp-to-byte LUT */

  vector3b = sub_pel_filters_inv[xoffset][2];

  /* if (xoffset == 0) we don't need any filtering */
  if (vector3b == 0) {
    for (i = 0; i < output_height; ++i) {
      /* prefetch src_ptr data to cache memory */
      prefetch_load(src_ptr + src_pixels_per_line);
      dst_ptr[0] = src_ptr[0];
      dst_ptr[1] = src_ptr[1];
      dst_ptr[2] = src_ptr[2];
      dst_ptr[3] = src_ptr[3];

      /* next row... */
      src_ptr += src_pixels_per_line;
      dst_ptr += 4;
    }
  } else {
    if (vector3b > 65536) {
      /* 6 tap filter */

      vector1b = sub_pel_filters_inv[xoffset][0];
      vector2b = sub_pel_filters_inv[xoffset][1];

      /* prefetch src_ptr data to cache memory */
      prefetch_load(src_ptr + src_pixels_per_line);

      for (i = output_height; i--;) {
        /* apply filter with vectors pairs */
        __asm__ __volatile__(
            "ulw              %[tp1],      -2(%[src_ptr])      \n\t"
            "ulw              %[tp2],      2(%[src_ptr])       \n\t"

            /* even 1. pixel */
            "mtlo             %[vector4a], $ac3                \n\t"
            "preceu.ph.qbr    %[p1],       %[tp1]              \n\t"
            "preceu.ph.qbl    %[p2],       %[tp1]              \n\t"
            "preceu.ph.qbr    %[p3],       %[tp2]              \n\t"
            "dpa.w.ph         $ac3,        %[p1], %[vector1b]  \n\t"
            "dpa.w.ph         $ac3,        %[p2], %[vector2b]  \n\t"
            "dpa.w.ph         $ac3,        %[p3], %[vector3b]  \n\t"

            /* even 2. pixel */
            "mtlo             %[vector4a], $ac2                \n\t"
            "preceu.ph.qbl    %[p1],       %[tp2]              \n\t"
            "balign           %[tp2],      %[tp1], 3           \n\t"
            "extp             %[Temp1],    $ac3, 9             \n\t"
            "dpa.w.ph         $ac2,        %[p2], %[vector1b]  \n\t"
            "dpa.w.ph         $ac2,        %[p3], %[vector2b]  \n\t"
            "dpa.w.ph         $ac2,        %[p1], %[vector3b]  \n\t"

            /* odd 1. pixel */
            "ulw              %[tn2],      3(%[src_ptr])       \n\t"
            "mtlo             %[vector4a], $ac3                \n\t"
            "preceu.ph.qbr    %[n1],       %[tp2]              \n\t"
            "preceu.ph.qbl    %[n2],       %[tp2]              \n\t"
            "preceu.ph.qbr    %[n3],       %[tn2]              \n\t"
            "extp             %[Temp3],    $ac2, 9             \n\t"
            "dpa.w.ph         $ac3,        %[n1], %[vector1b]  \n\t"
            "dpa.w.ph         $ac3,        %[n2], %[vector2b]  \n\t"
            "dpa.w.ph         $ac3,        %[n3], %[vector3b]  \n\t"

            /* odd 2. pixel */
            "mtlo             %[vector4a], $ac2                \n\t"
            "preceu.ph.qbl    %[n1],       %[tn2]              \n\t"
            "extp             %[Temp2],    $ac3, 9             \n\t"
            "dpa.w.ph         $ac2,        %[n2], %[vector1b]  \n\t"
            "dpa.w.ph         $ac2,        %[n3], %[vector2b]  \n\t"
            "dpa.w.ph         $ac2,        %[n1], %[vector3b]  \n\t"
            "extp             %[Temp4],    $ac2, 9             \n\t"

            /* clamp */
            "lbux             %[tp1],      %[Temp1](%[cm])     \n\t"
            "lbux             %[tn1],      %[Temp2](%[cm])     \n\t"
            "lbux             %[tp2],      %[Temp3](%[cm])     \n\t"
            "lbux             %[n2],       %[Temp4](%[cm])     \n\t"

            /* store bytes */
            "sb               %[tp1],      0(%[dst_ptr])       \n\t"
            "sb               %[tn1],      1(%[dst_ptr])       \n\t"
            "sb               %[tp2],      2(%[dst_ptr])       \n\t"
            "sb               %[n2],       3(%[dst_ptr])       \n\t"

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
              [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
              [n1] "=&r"(n1), [n2] "=&r"(n2), [n3] "=&r"(n3),
              [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
              [Temp4] "=&r"(Temp4)
            : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
              [vector4a] "r"(vector4a), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr),
              [vector3b] "r"(vector3b), [src_ptr] "r"(src_ptr));

        /* Next row... */
        src_ptr += src_pixels_per_line;
        dst_ptr += pitch;
      }
    } else {
      /* 4 tap filter */

      vector1b = sub_pel_filters_inv_tap_4[xoffset][0];
      vector2b = sub_pel_filters_inv_tap_4[xoffset][1];

      for (i = output_height; i--;) {
        /* apply filter with vectors pairs */
        __asm__ __volatile__(
            "ulw              %[tp1],      -1(%[src_ptr])      \n\t"
            "ulw              %[tp2],      3(%[src_ptr])       \n\t"

            /* even 1. pixel */
            "mtlo             %[vector4a], $ac3                \n\t"
            "preceu.ph.qbr    %[p1],       %[tp1]              \n\t"
            "preceu.ph.qbl    %[p2],       %[tp1]              \n\t"
            "preceu.ph.qbr    %[p3],       %[tp2]              \n\t"
            "dpa.w.ph         $ac3,        %[p1], %[vector1b]  \n\t"
            "dpa.w.ph         $ac3,        %[p2], %[vector2b]  \n\t"

            /* even 2. pixel */
            "mtlo             %[vector4a], $ac2                \n\t"
            "dpa.w.ph         $ac2,        %[p2], %[vector1b]  \n\t"
            "dpa.w.ph         $ac2,        %[p3], %[vector2b]  \n\t"
            "extp             %[Temp1],    $ac3, 9             \n\t"

            /* odd 1. pixel */
            "srl              %[tn1],      %[tp2], 8           \n\t"
            "balign           %[tp2],      %[tp1], 3           \n\t"
            "mtlo             %[vector4a], $ac3                \n\t"
            "preceu.ph.qbr    %[n1],       %[tp2]              \n\t"
            "preceu.ph.qbl    %[n2],       %[tp2]              \n\t"
            "preceu.ph.qbr    %[n3],       %[tn1]              \n\t"
            "extp             %[Temp3],    $ac2, 9             \n\t"
            "dpa.w.ph         $ac3,        %[n1], %[vector1b]  \n\t"
            "dpa.w.ph         $ac3,        %[n2], %[vector2b]  \n\t"

            /* odd 2. pixel */
            "mtlo             %[vector4a], $ac2                \n\t"
            "extp             %[Temp2],    $ac3, 9             \n\t"
            "dpa.w.ph         $ac2,        %[n2], %[vector1b]  \n\t"
            "dpa.w.ph         $ac2,        %[n3], %[vector2b]  \n\t"
            "extp             %[Temp4],    $ac2, 9             \n\t"

            /* clamp and store results */
            "lbux             %[tp1],      %[Temp1](%[cm])     \n\t"
            "lbux             %[tn1],      %[Temp2](%[cm])     \n\t"
            "lbux             %[tp2],      %[Temp3](%[cm])     \n\t"
            "sb               %[tp1],      0(%[dst_ptr])       \n\t"
            "sb               %[tn1],      1(%[dst_ptr])       \n\t"
            "lbux             %[n2],       %[Temp4](%[cm])     \n\t"
            "sb               %[tp2],      2(%[dst_ptr])       \n\t"
            "sb               %[n2],       3(%[dst_ptr])       \n\t"

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
              [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [n1] "=&r"(n1),
              [n2] "=&r"(n2), [n3] "=&r"(n3), [Temp1] "=&r"(Temp1),
              [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4)
            : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
              [vector4a] "r"(vector4a), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr),
              [src_ptr] "r"(src_ptr));
        /* Next row... */
        src_ptr += src_pixels_per_line;
        dst_ptr += pitch;
      }
    }
  }
}
274
/* First (horizontal) pass of the VP8 subpixel filter for 8-pixel-wide
 * blocks, handling all subpel offsets.  Structure mirrors the 4-wide
 * version: xoffset == 0 is a straight 8-byte row copy, otherwise a 6-tap
 * or 4-tap filter is selected via sub_pel_filters_inv[xoffset][2].
 *
 * Each filtered row is produced by two back-to-back asm statements: the
 * first computes pixels 0-3 and leaves $ac3 preloaded plus several live
 * registers (p*, n*, tp1) that the second statement consumes to finish
 * pixels 4-7.  NOTE(review): this accumulator/register handoff means the
 * two asm blocks must remain adjacent and in this order. */
void vp8_filter_block2d_first_pass_8_all(unsigned char *RESTRICT src_ptr,
                                         unsigned char *RESTRICT dst_ptr,
                                         unsigned int src_pixels_per_line,
                                         unsigned int output_height,
                                         int xoffset, int pitch) {
  unsigned int i;
  int Temp1, Temp2, Temp3, Temp4;

  unsigned int vector4a = 64; /* rounding bias preloaded into the ACs */
  unsigned int vector1b, vector2b, vector3b;
  unsigned int tp1, tp2, tn1, tn2;
  unsigned int p1, p2, p3, p4;
  unsigned int n1, n2, n3, n4;

  unsigned char *cm = ff_cropTbl + CROP_WIDTH; /* clamp-to-byte LUT */

  /* if (xoffset == 0) we don't need any filtering */
  if (xoffset == 0) {
    for (i = 0; i < output_height; ++i) {
      /* prefetch src_ptr data to cache memory */
      prefetch_load(src_ptr + src_pixels_per_line);

      dst_ptr[0] = src_ptr[0];
      dst_ptr[1] = src_ptr[1];
      dst_ptr[2] = src_ptr[2];
      dst_ptr[3] = src_ptr[3];
      dst_ptr[4] = src_ptr[4];
      dst_ptr[5] = src_ptr[5];
      dst_ptr[6] = src_ptr[6];
      dst_ptr[7] = src_ptr[7];

      /* next row... */
      src_ptr += src_pixels_per_line;
      dst_ptr += 8;
    }
  } else {
    vector3b = sub_pel_filters_inv[xoffset][2];

    if (vector3b > 65536) {
      /* 6 tap filter */

      vector1b = sub_pel_filters_inv[xoffset][0];
      vector2b = sub_pel_filters_inv[xoffset][1];

      for (i = output_height; i--;) {
        /* prefetch src_ptr data to cache memory */
        prefetch_load(src_ptr + src_pixels_per_line);

        /* apply filter with vectors pairs */
        __asm__ __volatile__(
            "ulw              %[tp1],      -2(%[src_ptr])      \n\t"
            "ulw              %[tp2],      2(%[src_ptr])       \n\t"

            /* even 1. pixel */
            "mtlo             %[vector4a], $ac3                \n\t"
            "preceu.ph.qbr    %[p1],       %[tp1]              \n\t"
            "preceu.ph.qbl    %[p2],       %[tp1]              \n\t"
            "preceu.ph.qbr    %[p3],       %[tp2]              \n\t"
            "dpa.w.ph         $ac3,        %[p1], %[vector1b]  \n\t"
            "dpa.w.ph         $ac3,        %[p2], %[vector2b]  \n\t"
            "dpa.w.ph         $ac3,        %[p3], %[vector3b]  \n\t"

            /* even 2. pixel */
            "mtlo             %[vector4a], $ac2                \n\t"
            "preceu.ph.qbl    %[p1],       %[tp2]              \n\t"
            "dpa.w.ph         $ac2,        %[p2], %[vector1b]  \n\t"
            "dpa.w.ph         $ac2,        %[p3], %[vector2b]  \n\t"
            "dpa.w.ph         $ac2,        %[p1], %[vector3b]  \n\t"

            "balign           %[tp2],      %[tp1], 3           \n\t"
            "extp             %[Temp1],    $ac3, 9             \n\t"
            "ulw              %[tn2],      3(%[src_ptr])       \n\t"

            /* odd 1. pixel */
            "mtlo             %[vector4a], $ac3                \n\t"
            "preceu.ph.qbr    %[n1],       %[tp2]              \n\t"
            "preceu.ph.qbl    %[n2],       %[tp2]              \n\t"
            "preceu.ph.qbr    %[n3],       %[tn2]              \n\t"
            "extp             %[Temp3],    $ac2, 9             \n\t"
            "dpa.w.ph         $ac3,        %[n1], %[vector1b]  \n\t"
            "dpa.w.ph         $ac3,        %[n2], %[vector2b]  \n\t"
            "dpa.w.ph         $ac3,        %[n3], %[vector3b]  \n\t"

            /* odd 2. pixel */
            "mtlo             %[vector4a], $ac2                \n\t"
            "preceu.ph.qbl    %[n1],       %[tn2]              \n\t"
            "dpa.w.ph         $ac2,        %[n2], %[vector1b]  \n\t"
            "dpa.w.ph         $ac2,        %[n3], %[vector2b]  \n\t"
            "dpa.w.ph         $ac2,        %[n1], %[vector3b]  \n\t"
            "ulw              %[tp1],      6(%[src_ptr])       \n\t"
            "extp             %[Temp2],    $ac3, 9             \n\t"
            "mtlo             %[vector4a], $ac3                \n\t"
            "preceu.ph.qbr    %[p2],       %[tp1]              \n\t"
            "extp             %[Temp4],    $ac2, 9             \n\t"

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn2] "=&r"(tn2),
              [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [n1] "=&r"(n1),
              [n2] "=&r"(n2), [n3] "=&r"(n3), [Temp1] "=&r"(Temp1),
              [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), [Temp4] "=r"(Temp4)
            : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
              [vector4a] "r"(vector4a), [vector3b] "r"(vector3b),
              [src_ptr] "r"(src_ptr));

        /* clamp and store results */
        dst_ptr[0] = cm[Temp1];
        dst_ptr[1] = cm[Temp2];
        dst_ptr[2] = cm[Temp3];
        dst_ptr[3] = cm[Temp4];

        /* next 4 pixels; continues with the $ac3/register state left by
         * the previous asm block */
        __asm__ __volatile__(
            /* even 3. pixel */
            "dpa.w.ph         $ac3,        %[p3], %[vector1b]  \n\t"
            "dpa.w.ph         $ac3,        %[p1], %[vector2b]  \n\t"
            "dpa.w.ph         $ac3,        %[p2], %[vector3b]  \n\t"

            /* even 4. pixel */
            "mtlo             %[vector4a], $ac2                \n\t"
            "preceu.ph.qbl    %[p4],       %[tp1]              \n\t"
            "dpa.w.ph         $ac2,        %[p1], %[vector1b]  \n\t"
            "dpa.w.ph         $ac2,        %[p2], %[vector2b]  \n\t"
            "dpa.w.ph         $ac2,        %[p4], %[vector3b]  \n\t"

            "ulw              %[tn1],      7(%[src_ptr])       \n\t"
            "extp             %[Temp1],    $ac3, 9             \n\t"

            /* odd 3. pixel */
            "mtlo             %[vector4a], $ac3                \n\t"
            "preceu.ph.qbr    %[n2],       %[tn1]              \n\t"
            "dpa.w.ph         $ac3,        %[n3], %[vector1b]  \n\t"
            "dpa.w.ph         $ac3,        %[n1], %[vector2b]  \n\t"
            "dpa.w.ph         $ac3,        %[n2], %[vector3b]  \n\t"
            "extp             %[Temp3],    $ac2, 9             \n\t"

            /* odd 4. pixel */
            "mtlo             %[vector4a], $ac2                \n\t"
            "preceu.ph.qbl    %[n4],       %[tn1]              \n\t"
            "dpa.w.ph         $ac2,        %[n1], %[vector1b]  \n\t"
            "dpa.w.ph         $ac2,        %[n2], %[vector2b]  \n\t"
            "dpa.w.ph         $ac2,        %[n4], %[vector3b]  \n\t"
            "extp             %[Temp2],    $ac3, 9             \n\t"
            "extp             %[Temp4],    $ac2, 9             \n\t"

            : [tn1] "=&r"(tn1), [n2] "=&r"(n2), [p4] "=&r"(p4), [n4] "=&r"(n4),
              [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
              [Temp4] "=r"(Temp4)
            : [tp1] "r"(tp1), [vector1b] "r"(vector1b), [p2] "r"(p2),
              [vector2b] "r"(vector2b), [n1] "r"(n1), [p1] "r"(p1),
              [vector4a] "r"(vector4a), [vector3b] "r"(vector3b), [p3] "r"(p3),
              [n3] "r"(n3), [src_ptr] "r"(src_ptr));

        /* clamp and store results */
        dst_ptr[4] = cm[Temp1];
        dst_ptr[5] = cm[Temp2];
        dst_ptr[6] = cm[Temp3];
        dst_ptr[7] = cm[Temp4];

        src_ptr += src_pixels_per_line;
        dst_ptr += pitch;
      }
    } else {
      /* 4 tap filter */

      vector1b = sub_pel_filters_inv_tap_4[xoffset][0];
      vector2b = sub_pel_filters_inv_tap_4[xoffset][1];

      for (i = output_height; i--;) {
        /* prefetch src_ptr data to cache memory */
        prefetch_load(src_ptr + src_pixels_per_line);

        /* apply filter with vectors pairs */
        __asm__ __volatile__(
            "ulw              %[tp1],      -1(%[src_ptr])      \n\t"

            /* even 1. pixel */
            "mtlo             %[vector4a], $ac3                \n\t"
            "preceu.ph.qbr    %[p1],       %[tp1]              \n\t"
            "preceu.ph.qbl    %[p2],       %[tp1]              \n\t"
            "dpa.w.ph         $ac3,        %[p1], %[vector1b]  \n\t"
            "dpa.w.ph         $ac3,        %[p2], %[vector2b]  \n\t"

            "ulw              %[tp2],      3(%[src_ptr])       \n\t"

            /* even 2. pixel */
            "mtlo             %[vector4a], $ac2                \n\t"
            "preceu.ph.qbr    %[p3],       %[tp2]              \n\t"
            "preceu.ph.qbl    %[p4],       %[tp2]              \n\t"
            "dpa.w.ph         $ac2,        %[p2], %[vector1b]  \n\t"
            "dpa.w.ph         $ac2,        %[p3], %[vector2b]  \n\t"
            "extp             %[Temp1],    $ac3, 9             \n\t"

            "balign           %[tp2],      %[tp1], 3           \n\t"

            /* odd 1. pixel */
            "mtlo             %[vector4a], $ac3                \n\t"
            "preceu.ph.qbr    %[n1],       %[tp2]              \n\t"
            "preceu.ph.qbl    %[n2],       %[tp2]              \n\t"
            "dpa.w.ph         $ac3,        %[n1], %[vector1b]  \n\t"
            "dpa.w.ph         $ac3,        %[n2], %[vector2b]  \n\t"
            "extp             %[Temp3],    $ac2, 9             \n\t"

            "ulw              %[tn2],      4(%[src_ptr])       \n\t"

            /* odd 2. pixel */
            "mtlo             %[vector4a], $ac2                \n\t"
            "preceu.ph.qbr    %[n3],       %[tn2]              \n\t"
            "preceu.ph.qbl    %[n4],       %[tn2]              \n\t"
            "dpa.w.ph         $ac2,        %[n2], %[vector1b]  \n\t"
            "dpa.w.ph         $ac2,        %[n3], %[vector2b]  \n\t"
            "ulw              %[tp1],      7(%[src_ptr])       \n\t"
            "extp             %[Temp2],    $ac3, 9             \n\t"
            "mtlo             %[vector4a], $ac3                \n\t"
            "extp             %[Temp4],    $ac2, 9             \n\t"

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn2] "=&r"(tn2),
              [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4),
              [n1] "=&r"(n1), [n2] "=&r"(n2), [n3] "=&r"(n3), [n4] "=&r"(n4),
              [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
              [Temp4] "=r"(Temp4)
            : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
              [vector4a] "r"(vector4a), [src_ptr] "r"(src_ptr));

        /* clamp and store results */
        dst_ptr[0] = cm[Temp1];
        dst_ptr[1] = cm[Temp2];
        dst_ptr[2] = cm[Temp3];
        dst_ptr[3] = cm[Temp4];

        /* next 4 pixels; continues with the $ac3/register state left by
         * the previous asm block */
        __asm__ __volatile__(
            /* even 3. pixel */
            "dpa.w.ph         $ac3,        %[p3], %[vector1b]  \n\t"
            "dpa.w.ph         $ac3,        %[p4], %[vector2b]  \n\t"

            /* even 4. pixel */
            "mtlo             %[vector4a], $ac2                \n\t"
            "preceu.ph.qbr    %[p2],       %[tp1]              \n\t"
            "dpa.w.ph         $ac2,        %[p4], %[vector1b]  \n\t"
            "dpa.w.ph         $ac2,        %[p2], %[vector2b]  \n\t"
            "extp             %[Temp1],    $ac3, 9             \n\t"

            /* odd 3. pixel */
            "mtlo             %[vector4a], $ac3                \n\t"
            "dpa.w.ph         $ac3,        %[n3], %[vector1b]  \n\t"
            "dpa.w.ph         $ac3,        %[n4], %[vector2b]  \n\t"
            "ulw              %[tn1],      8(%[src_ptr])       \n\t"
            "extp             %[Temp3],    $ac2, 9             \n\t"

            /* odd 4. pixel */
            "mtlo             %[vector4a], $ac2                \n\t"
            "preceu.ph.qbr    %[n2],       %[tn1]              \n\t"
            "dpa.w.ph         $ac2,        %[n4], %[vector1b]  \n\t"
            "dpa.w.ph         $ac2,        %[n2], %[vector2b]  \n\t"
            "extp             %[Temp2],    $ac3, 9             \n\t"
            "extp             %[Temp4],    $ac2, 9             \n\t"

            : [tn1] "=&r"(tn1), [p2] "=&r"(p2), [n2] "=&r"(n2),
              [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
              [Temp4] "=r"(Temp4)
            : [tp1] "r"(tp1), [p3] "r"(p3), [p4] "r"(p4),
              [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
              [vector4a] "r"(vector4a), [src_ptr] "r"(src_ptr), [n3] "r"(n3),
              [n4] "r"(n4));

        /* clamp and store results */
        dst_ptr[4] = cm[Temp1];
        dst_ptr[5] = cm[Temp2];
        dst_ptr[6] = cm[Temp3];
        dst_ptr[7] = cm[Temp4];

        /* next row... */
        src_ptr += src_pixels_per_line;
        dst_ptr += pitch;
      }
    }
  }
}
552
/* First (horizontal) pass of the VP8 6-tap subpixel filter for
 * 16-pixel-wide blocks.  Each row is produced by four chained asm
 * statements covering pixels 0-3, 4-7, 8-11 and 12-15; each statement
 * leaves $ac3 preloaded and live registers (p*, n*, tp*) that the next
 * one consumes.  NOTE(review): the four asm blocks therefore must stay
 * adjacent and in order.  Results are rounded (bias 64), extracted with
 * "extp" and clamped to [0, 255] through the cm lookup table. */
void vp8_filter_block2d_first_pass16_6tap(unsigned char *RESTRICT src_ptr,
                                          unsigned char *RESTRICT dst_ptr,
                                          unsigned int src_pixels_per_line,
                                          unsigned int output_height,
                                          int xoffset, int pitch) {
  unsigned int i;
  int Temp1, Temp2, Temp3, Temp4;

  unsigned int vector4a; /* rounding bias preloaded into the ACs */
  unsigned int vector1b, vector2b, vector3b;
  unsigned int tp1, tp2, tn1, tn2;
  unsigned int p1, p2, p3, p4;
  unsigned int n1, n2, n3, n4;
  unsigned char *cm = ff_cropTbl + CROP_WIDTH; /* clamp-to-byte LUT */

  vector1b = sub_pel_filters_inv[xoffset][0];
  vector2b = sub_pel_filters_inv[xoffset][1];
  vector3b = sub_pel_filters_inv[xoffset][2];
  vector4a = 64;

  for (i = output_height; i--;) {
    /* prefetch src_ptr data to cache memory */
    prefetch_load(src_ptr + src_pixels_per_line);

    /* apply filter with vectors pairs */
    __asm__ __volatile__(
        "ulw              %[tp1],      -2(%[src_ptr])      \n\t"
        "ulw              %[tp2],      2(%[src_ptr])       \n\t"

        /* even 1. pixel */
        "mtlo             %[vector4a], $ac3                \n\t"
        "preceu.ph.qbr    %[p1],       %[tp1]              \n\t"
        "preceu.ph.qbl    %[p2],       %[tp1]              \n\t"
        "preceu.ph.qbr    %[p3],       %[tp2]              \n\t"
        "dpa.w.ph         $ac3,        %[p1], %[vector1b]  \n\t"
        "dpa.w.ph         $ac3,        %[p2], %[vector2b]  \n\t"
        "dpa.w.ph         $ac3,        %[p3], %[vector3b]  \n\t"

        /* even 2. pixel */
        "mtlo             %[vector4a], $ac2                \n\t"
        "preceu.ph.qbl    %[p1],       %[tp2]              \n\t"
        "dpa.w.ph         $ac2,        %[p2], %[vector1b]  \n\t"
        "dpa.w.ph         $ac2,        %[p3], %[vector2b]  \n\t"
        "dpa.w.ph         $ac2,        %[p1], %[vector3b]  \n\t"

        "balign           %[tp2],      %[tp1], 3           \n\t"
        "ulw              %[tn2],      3(%[src_ptr])       \n\t"
        "extp             %[Temp1],    $ac3, 9             \n\t"

        /* odd 1. pixel */
        "mtlo             %[vector4a], $ac3                \n\t"
        "preceu.ph.qbr    %[n1],       %[tp2]              \n\t"
        "preceu.ph.qbl    %[n2],       %[tp2]              \n\t"
        "preceu.ph.qbr    %[n3],       %[tn2]              \n\t"
        "extp             %[Temp3],    $ac2, 9             \n\t"
        "dpa.w.ph         $ac3,        %[n1], %[vector1b]  \n\t"
        "dpa.w.ph         $ac3,        %[n2], %[vector2b]  \n\t"
        "dpa.w.ph         $ac3,        %[n3], %[vector3b]  \n\t"

        /* odd 2. pixel */
        "mtlo             %[vector4a], $ac2                \n\t"
        "preceu.ph.qbl    %[n1],       %[tn2]              \n\t"
        "dpa.w.ph         $ac2,        %[n2], %[vector1b]  \n\t"
        "dpa.w.ph         $ac2,        %[n3], %[vector2b]  \n\t"
        "dpa.w.ph         $ac2,        %[n1], %[vector3b]  \n\t"
        "ulw              %[tp1],      6(%[src_ptr])       \n\t"
        "extp             %[Temp2],    $ac3, 9             \n\t"
        "mtlo             %[vector4a], $ac3                \n\t"
        "preceu.ph.qbr    %[p2],       %[tp1]              \n\t"
        "extp             %[Temp4],    $ac2, 9             \n\t"

        : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn2] "=&r"(tn2), [p1] "=&r"(p1),
          [p2] "=&r"(p2), [p3] "=&r"(p3), [n1] "=&r"(n1), [n2] "=&r"(n2),
          [n3] "=&r"(n3), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
          [Temp3] "=&r"(Temp3), [Temp4] "=r"(Temp4)
        : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
          [vector4a] "r"(vector4a), [vector3b] "r"(vector3b),
          [src_ptr] "r"(src_ptr));

    /* clamp and store results */
    dst_ptr[0] = cm[Temp1];
    dst_ptr[1] = cm[Temp2];
    dst_ptr[2] = cm[Temp3];
    dst_ptr[3] = cm[Temp4];

    /* next 4 pixels; consumes $ac3/register state from the block above */
    __asm__ __volatile__(
        /* even 3. pixel */
        "dpa.w.ph         $ac3,        %[p3], %[vector1b]  \n\t"
        "dpa.w.ph         $ac3,        %[p1], %[vector2b]  \n\t"
        "dpa.w.ph         $ac3,        %[p2], %[vector3b]  \n\t"

        /* even 4. pixel */
        "mtlo             %[vector4a], $ac2                \n\t"
        "preceu.ph.qbl    %[p4],       %[tp1]              \n\t"
        "dpa.w.ph         $ac2,        %[p1], %[vector1b]  \n\t"
        "dpa.w.ph         $ac2,        %[p2], %[vector2b]  \n\t"
        "dpa.w.ph         $ac2,        %[p4], %[vector3b]  \n\t"
        "ulw              %[tn1],      7(%[src_ptr])       \n\t"
        "extp             %[Temp1],    $ac3, 9             \n\t"

        /* odd 3. pixel */
        "mtlo             %[vector4a], $ac3                \n\t"
        "preceu.ph.qbr    %[n2],       %[tn1]              \n\t"
        "dpa.w.ph         $ac3,        %[n3], %[vector1b]  \n\t"
        "dpa.w.ph         $ac3,        %[n1], %[vector2b]  \n\t"
        "dpa.w.ph         $ac3,        %[n2], %[vector3b]  \n\t"
        "extp             %[Temp3],    $ac2, 9             \n\t"

        /* odd 4. pixel */
        "mtlo             %[vector4a], $ac2                \n\t"
        "preceu.ph.qbl    %[n4],       %[tn1]              \n\t"
        "dpa.w.ph         $ac2,        %[n1], %[vector1b]  \n\t"
        "dpa.w.ph         $ac2,        %[n2], %[vector2b]  \n\t"
        "dpa.w.ph         $ac2,        %[n4], %[vector3b]  \n\t"
        "ulw              %[tp2],      10(%[src_ptr])      \n\t"
        "extp             %[Temp2],    $ac3, 9             \n\t"
        "mtlo             %[vector4a], $ac3                \n\t"
        "preceu.ph.qbr    %[p1],       %[tp2]              \n\t"
        "extp             %[Temp4],    $ac2, 9             \n\t"

        : [tn1] "=&r"(tn1), [tp2] "=&r"(tp2), [n2] "=&r"(n2), [p4] "=&r"(p4),
          [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
          [Temp3] "=&r"(Temp3), [Temp4] "=r"(Temp4), [p1] "+r"(p1)
        : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), [tp1] "r"(tp1),
          [n1] "r"(n1), [vector4a] "r"(vector4a), [p2] "r"(p2),
          [vector3b] "r"(vector3b), [p3] "r"(p3), [n3] "r"(n3),
          [src_ptr] "r"(src_ptr));

    /* clamp and store results */
    dst_ptr[4] = cm[Temp1];
    dst_ptr[5] = cm[Temp2];
    dst_ptr[6] = cm[Temp3];
    dst_ptr[7] = cm[Temp4];

    /* next 4 pixels; consumes $ac3/register state from the block above */
    __asm__ __volatile__(
        /* even 5. pixel */
        "dpa.w.ph         $ac3,        %[p2], %[vector1b]  \n\t"
        "dpa.w.ph         $ac3,        %[p4], %[vector2b]  \n\t"
        "dpa.w.ph         $ac3,        %[p1], %[vector3b]  \n\t"

        /* even 6. pixel */
        "mtlo             %[vector4a], $ac2                \n\t"
        "preceu.ph.qbl    %[p3],       %[tp2]              \n\t"
        "dpa.w.ph         $ac2,        %[p4], %[vector1b]  \n\t"
        "dpa.w.ph         $ac2,        %[p1], %[vector2b]  \n\t"
        "dpa.w.ph         $ac2,        %[p3], %[vector3b]  \n\t"

        "ulw              %[tn1],      11(%[src_ptr])      \n\t"
        "extp             %[Temp1],    $ac3, 9             \n\t"

        /* odd 5. pixel */
        "mtlo             %[vector4a], $ac3                \n\t"
        "preceu.ph.qbr    %[n1],       %[tn1]              \n\t"
        "dpa.w.ph         $ac3,        %[n2], %[vector1b]  \n\t"
        "dpa.w.ph         $ac3,        %[n4], %[vector2b]  \n\t"
        "dpa.w.ph         $ac3,        %[n1], %[vector3b]  \n\t"
        "extp             %[Temp3],    $ac2, 9             \n\t"

        /* odd 6. pixel */
        "mtlo             %[vector4a], $ac2                \n\t"
        "preceu.ph.qbl    %[n3],       %[tn1]              \n\t"
        "dpa.w.ph         $ac2,        %[n4], %[vector1b]  \n\t"
        "dpa.w.ph         $ac2,        %[n1], %[vector2b]  \n\t"
        "dpa.w.ph         $ac2,        %[n3], %[vector3b]  \n\t"
        "ulw              %[tp1],      14(%[src_ptr])      \n\t"
        "extp             %[Temp2],    $ac3, 9             \n\t"
        "mtlo             %[vector4a], $ac3                \n\t"
        "preceu.ph.qbr    %[p4],       %[tp1]              \n\t"
        "extp             %[Temp4],    $ac2, 9             \n\t"

        : [tn1] "=&r"(tn1), [tp1] "=&r"(tp1), [n1] "=&r"(n1), [p3] "=&r"(p3),
          [n3] "=&r"(n3), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
          [Temp3] "=&r"(Temp3), [Temp4] "=r"(Temp4), [p4] "+r"(p4)
        : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), [tp2] "r"(tp2),
          [p2] "r"(p2), [n2] "r"(n2), [n4] "r"(n4), [p1] "r"(p1),
          [src_ptr] "r"(src_ptr), [vector4a] "r"(vector4a),
          [vector3b] "r"(vector3b));

    /* clamp and store results */
    dst_ptr[8] = cm[Temp1];
    dst_ptr[9] = cm[Temp2];
    dst_ptr[10] = cm[Temp3];
    dst_ptr[11] = cm[Temp4];

    /* next 4 pixels; consumes $ac3/register state from the block above */
    __asm__ __volatile__(
        /* even 7. pixel */
        "dpa.w.ph         $ac3,        %[p1], %[vector1b]  \n\t"
        "dpa.w.ph         $ac3,        %[p3], %[vector2b]  \n\t"
        "dpa.w.ph         $ac3,        %[p4], %[vector3b]  \n\t"

        /* even 8. pixel */
        "mtlo             %[vector4a], $ac2                \n\t"
        "preceu.ph.qbl    %[p2],       %[tp1]              \n\t"
        "dpa.w.ph         $ac2,        %[p3], %[vector1b]  \n\t"
        "dpa.w.ph         $ac2,        %[p4], %[vector2b]  \n\t"
        "dpa.w.ph         $ac2,        %[p2], %[vector3b]  \n\t"
        "ulw              %[tn1],      15(%[src_ptr])      \n\t"
        "extp             %[Temp1],    $ac3, 9             \n\t"

        /* odd 7. pixel */
        "mtlo             %[vector4a], $ac3                \n\t"
        "preceu.ph.qbr    %[n4],       %[tn1]              \n\t"
        "dpa.w.ph         $ac3,        %[n1], %[vector1b]  \n\t"
        "dpa.w.ph         $ac3,        %[n3], %[vector2b]  \n\t"
        "dpa.w.ph         $ac3,        %[n4], %[vector3b]  \n\t"
        "extp             %[Temp3],    $ac2, 9             \n\t"

        /* odd 8. pixel */
        "mtlo             %[vector4a], $ac2                \n\t"
        "preceu.ph.qbl    %[n2],       %[tn1]              \n\t"
        "dpa.w.ph         $ac2,        %[n3], %[vector1b]  \n\t"
        "dpa.w.ph         $ac2,        %[n4], %[vector2b]  \n\t"
        "dpa.w.ph         $ac2,        %[n2], %[vector3b]  \n\t"
        "extp             %[Temp2],    $ac3, 9             \n\t"
        "extp             %[Temp4],    $ac2, 9             \n\t"

        /* clamp and store results */
        "lbux             %[tp1],      %[Temp1](%[cm])     \n\t"
        "lbux             %[tn1],      %[Temp2](%[cm])     \n\t"
        "lbux             %[p2],       %[Temp3](%[cm])     \n\t"
        "sb               %[tp1],      12(%[dst_ptr])      \n\t"
        "sb               %[tn1],      13(%[dst_ptr])      \n\t"
        "lbux             %[n2],       %[Temp4](%[cm])     \n\t"
        "sb               %[p2],       14(%[dst_ptr])      \n\t"
        "sb               %[n2],       15(%[dst_ptr])      \n\t"

        : [tn1] "=&r"(tn1), [p2] "=&r"(p2), [n2] "=&r"(n2), [n4] "=&r"(n4),
          [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
          [Temp4] "=r"(Temp4), [tp1] "+r"(tp1)
        : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), [p4] "r"(p4),
          [n1] "r"(n1), [p1] "r"(p1), [vector4a] "r"(vector4a),
          [vector3b] "r"(vector3b), [p3] "r"(p3), [n3] "r"(n3),
          [src_ptr] "r"(src_ptr), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));

    src_ptr += src_pixels_per_line;
    dst_ptr += pitch;
  }
}
794
/* No-filter fast path for the 16-wide first pass (presumably the
 * xoffset == 0 case — confirm with callers): copies 21 rows of 16 pixels
 * (7 loop iterations, 3 rows each) from src_ptr into the output buffer,
 * which is written densely with a 16-byte row stride (48 bytes per
 * iteration). */
void vp8_filter_block2d_first_pass16_0(unsigned char *RESTRICT src_ptr,
                                       unsigned char *RESTRICT output_ptr,
                                       unsigned int src_pixels_per_line) {
  int Temp1, Temp2, Temp3, Temp4;
  int i;

  /* prefetch destination data to cache memory (store hint) */
  prefetch_store(output_ptr + 32);

  /* copy memory from src buffer to dst buffer */
  for (i = 0; i < 7; ++i) {
    /* row 3*i: 16 bytes via four unaligned loads / aligned stores */
    __asm__ __volatile__(
        "ulw    %[Temp1],   0(%[src_ptr])                         \n\t"
        "ulw    %[Temp2],   4(%[src_ptr])                         \n\t"
        "ulw    %[Temp3],   8(%[src_ptr])                         \n\t"
        "ulw    %[Temp4],   12(%[src_ptr])                        \n\t"
        "sw     %[Temp1],   0(%[output_ptr])                      \n\t"
        "sw     %[Temp2],   4(%[output_ptr])                      \n\t"
        "sw     %[Temp3],   8(%[output_ptr])                      \n\t"
        "sw     %[Temp4],   12(%[output_ptr])                     \n\t"
        "addu   %[src_ptr], %[src_ptr], %[src_pixels_per_line]    \n\t"

        : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
          [Temp4] "=&r"(Temp4), [src_ptr] "+r"(src_ptr)
        : [src_pixels_per_line] "r"(src_pixels_per_line),
          [output_ptr] "r"(output_ptr));

    /* row 3*i + 1 */
    __asm__ __volatile__(
        "ulw    %[Temp1],   0(%[src_ptr])                         \n\t"
        "ulw    %[Temp2],   4(%[src_ptr])                         \n\t"
        "ulw    %[Temp3],   8(%[src_ptr])                         \n\t"
        "ulw    %[Temp4],   12(%[src_ptr])                        \n\t"
        "sw     %[Temp1],   16(%[output_ptr])                     \n\t"
        "sw     %[Temp2],   20(%[output_ptr])                     \n\t"
        "sw     %[Temp3],   24(%[output_ptr])                     \n\t"
        "sw     %[Temp4],   28(%[output_ptr])                     \n\t"
        "addu   %[src_ptr], %[src_ptr], %[src_pixels_per_line]    \n\t"

        : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
          [Temp4] "=&r"(Temp4), [src_ptr] "+r"(src_ptr)
        : [src_pixels_per_line] "r"(src_pixels_per_line),
          [output_ptr] "r"(output_ptr));

    /* row 3*i + 2 */
    __asm__ __volatile__(
        "ulw    %[Temp1],   0(%[src_ptr])                         \n\t"
        "ulw    %[Temp2],   4(%[src_ptr])                         \n\t"
        "ulw    %[Temp3],   8(%[src_ptr])                         \n\t"
        "ulw    %[Temp4],   12(%[src_ptr])                        \n\t"
        "sw     %[Temp1],   32(%[output_ptr])                     \n\t"
        "sw     %[Temp2],   36(%[output_ptr])                     \n\t"
        "sw     %[Temp3],   40(%[output_ptr])                     \n\t"
        "sw     %[Temp4],   44(%[output_ptr])                     \n\t"
        "addu   %[src_ptr], %[src_ptr], %[src_pixels_per_line]    \n\t"

        : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
          [Temp4] "=&r"(Temp4), [src_ptr] "+r"(src_ptr)
        : [src_pixels_per_line] "r"(src_pixels_per_line),
          [output_ptr] "r"(output_ptr));

    output_ptr += 48;
  }
}
857
vp8_filter_block2d_first_pass16_4tap(unsigned char * RESTRICT src_ptr,unsigned char * RESTRICT output_ptr,unsigned int src_pixels_per_line,unsigned int output_width,unsigned int output_height,int xoffset,int yoffset,unsigned char * RESTRICT dst_ptr,int pitch)858 void vp8_filter_block2d_first_pass16_4tap(
859 unsigned char *RESTRICT src_ptr, unsigned char *RESTRICT output_ptr,
860 unsigned int src_pixels_per_line, unsigned int output_width,
861 unsigned int output_height, int xoffset, int yoffset,
862 unsigned char *RESTRICT dst_ptr, int pitch) {
863 unsigned int i, j;
864 int Temp1, Temp2, Temp3, Temp4;
865
866 unsigned int vector4a;
867 int vector1b, vector2b;
868 unsigned int tp1, tp2, tp3, tn1;
869 unsigned int p1, p2, p3;
870 unsigned int n1, n2, n3;
871 unsigned char *cm = ff_cropTbl + CROP_WIDTH;
872
873 vector4a = 64;
874
875 vector1b = sub_pel_filters_inv_tap_4[xoffset][0];
876 vector2b = sub_pel_filters_inv_tap_4[xoffset][1];
877
878 /* if (yoffset == 0) don't need temp buffer, data will be stored in dst_ptr */
879 if (yoffset == 0) {
880 output_height -= 5;
881 src_ptr += (src_pixels_per_line + src_pixels_per_line);
882
883 for (i = output_height; i--;) {
884 __asm__ __volatile__("ulw %[tp3], -1(%[src_ptr]) \n\t"
885 : [tp3] "=&r"(tp3)
886 : [src_ptr] "r"(src_ptr));
887
888 /* processing 4 adjacent pixels */
889 for (j = 0; j < 16; j += 4) {
890 /* apply filter with vectors pairs */
891 __asm__ __volatile__(
892 "ulw %[tp2], 3(%[src_ptr]) "
893 "\n\t"
894 "move %[tp1], %[tp3] "
895 "\n\t"
896
897 /* even 1. pixel */
898 "mtlo %[vector4a], $ac3 "
899 "\n\t"
900 "mthi $0, $ac3 "
901 "\n\t"
902 "move %[tp3], %[tp2] "
903 "\n\t"
904 "preceu.ph.qbr %[p1], %[tp1] "
905 "\n\t"
906 "preceu.ph.qbl %[p2], %[tp1] "
907 "\n\t"
908 "preceu.ph.qbr %[p3], %[tp2] "
909 "\n\t"
910 "dpa.w.ph $ac3, %[p1], %[vector1b] "
911 "\n\t"
912 "dpa.w.ph $ac3, %[p2], %[vector2b] "
913 "\n\t"
914
915 /* even 2. pixel */
916 "mtlo %[vector4a], $ac2 "
917 "\n\t"
918 "mthi $0, $ac2 "
919 "\n\t"
920 "dpa.w.ph $ac2, %[p2], %[vector1b] "
921 "\n\t"
922 "dpa.w.ph $ac2, %[p3], %[vector2b] "
923 "\n\t"
924 "extr.w %[Temp1], $ac3, 7 "
925 "\n\t"
926
927 /* odd 1. pixel */
928 "ulw %[tn1], 4(%[src_ptr]) "
929 "\n\t"
930 "balign %[tp2], %[tp1], 3 "
931 "\n\t"
932 "mtlo %[vector4a], $ac3 "
933 "\n\t"
934 "mthi $0, $ac3 "
935 "\n\t"
936 "preceu.ph.qbr %[n1], %[tp2] "
937 "\n\t"
938 "preceu.ph.qbl %[n2], %[tp2] "
939 "\n\t"
940 "preceu.ph.qbr %[n3], %[tn1] "
941 "\n\t"
942 "extr.w %[Temp3], $ac2, 7 "
943 "\n\t"
944 "dpa.w.ph $ac3, %[n1], %[vector1b] "
945 "\n\t"
946 "dpa.w.ph $ac3, %[n2], %[vector2b] "
947 "\n\t"
948
949 /* odd 2. pixel */
950 "mtlo %[vector4a], $ac2 "
951 "\n\t"
952 "mthi $0, $ac2 "
953 "\n\t"
954 "extr.w %[Temp2], $ac3, 7 "
955 "\n\t"
956 "dpa.w.ph $ac2, %[n2], %[vector1b] "
957 "\n\t"
958 "dpa.w.ph $ac2, %[n3], %[vector2b] "
959 "\n\t"
960 "extr.w %[Temp4], $ac2, 7 "
961 "\n\t"
962
963 /* clamp and store results */
964 "lbux %[tp1], %[Temp1](%[cm]) "
965 "\n\t"
966 "lbux %[tn1], %[Temp2](%[cm]) "
967 "\n\t"
968 "lbux %[tp2], %[Temp3](%[cm]) "
969 "\n\t"
970 "sb %[tp1], 0(%[dst_ptr]) "
971 "\n\t"
972 "sb %[tn1], 1(%[dst_ptr]) "
973 "\n\t"
974 "lbux %[n2], %[Temp4](%[cm]) "
975 "\n\t"
976 "sb %[tp2], 2(%[dst_ptr]) "
977 "\n\t"
978 "sb %[n2], 3(%[dst_ptr]) "
979 "\n\t"
980
981 : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "=&r"(tp3),
982 [tn1] "=&r"(tn1), [p1] "=&r"(p1), [p2] "=&r"(p2), [n1] "=&r"(n1),
983 [n2] "=&r"(n2), [n3] "=&r"(n3), [Temp1] "=&r"(Temp1),
984 [Temp2] "=&r"(Temp2), [p3] "=&r"(p3), [Temp3] "=&r"(Temp3),
985 [Temp4] "=&r"(Temp4)
986 : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
987 [vector4a] "r"(vector4a), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr),
988 [src_ptr] "r"(src_ptr));
989
990 src_ptr += 4;
991 }
992
993 /* Next row... */
994 src_ptr += src_pixels_per_line - 16;
995 dst_ptr += pitch;
996 }
997 } else {
998 for (i = output_height; i--;) {
999 /* processing 4 adjacent pixels */
1000 for (j = 0; j < 16; j += 4) {
1001 /* apply filter with vectors pairs */
1002 __asm__ __volatile__(
1003 "ulw %[tp1], -1(%[src_ptr]) "
1004 "\n\t"
1005 "ulw %[tp2], 3(%[src_ptr]) "
1006 "\n\t"
1007
1008 /* even 1. pixel */
1009 "mtlo %[vector4a], $ac3 "
1010 "\n\t"
1011 "mthi $0, $ac3 "
1012 "\n\t"
1013 "preceu.ph.qbr %[p1], %[tp1] "
1014 "\n\t"
1015 "preceu.ph.qbl %[p2], %[tp1] "
1016 "\n\t"
1017 "preceu.ph.qbr %[p3], %[tp2] "
1018 "\n\t"
1019 "dpa.w.ph $ac3, %[p1], %[vector1b] "
1020 "\n\t"
1021 "dpa.w.ph $ac3, %[p2], %[vector2b] "
1022 "\n\t"
1023
1024 /* even 2. pixel */
1025 "mtlo %[vector4a], $ac2 "
1026 "\n\t"
1027 "mthi $0, $ac2 "
1028 "\n\t"
1029 "dpa.w.ph $ac2, %[p2], %[vector1b] "
1030 "\n\t"
1031 "dpa.w.ph $ac2, %[p3], %[vector2b] "
1032 "\n\t"
1033 "extr.w %[Temp1], $ac3, 7 "
1034 "\n\t"
1035
1036 /* odd 1. pixel */
1037 "ulw %[tn1], 4(%[src_ptr]) "
1038 "\n\t"
1039 "balign %[tp2], %[tp1], 3 "
1040 "\n\t"
1041 "mtlo %[vector4a], $ac3 "
1042 "\n\t"
1043 "mthi $0, $ac3 "
1044 "\n\t"
1045 "preceu.ph.qbr %[n1], %[tp2] "
1046 "\n\t"
1047 "preceu.ph.qbl %[n2], %[tp2] "
1048 "\n\t"
1049 "preceu.ph.qbr %[n3], %[tn1] "
1050 "\n\t"
1051 "extr.w %[Temp3], $ac2, 7 "
1052 "\n\t"
1053 "dpa.w.ph $ac3, %[n1], %[vector1b] "
1054 "\n\t"
1055 "dpa.w.ph $ac3, %[n2], %[vector2b] "
1056 "\n\t"
1057
1058 /* odd 2. pixel */
1059 "mtlo %[vector4a], $ac2 "
1060 "\n\t"
1061 "mthi $0, $ac2 "
1062 "\n\t"
1063 "extr.w %[Temp2], $ac3, 7 "
1064 "\n\t"
1065 "dpa.w.ph $ac2, %[n2], %[vector1b] "
1066 "\n\t"
1067 "dpa.w.ph $ac2, %[n3], %[vector2b] "
1068 "\n\t"
1069 "extr.w %[Temp4], $ac2, 7 "
1070 "\n\t"
1071
1072 /* clamp and store results */
1073 "lbux %[tp1], %[Temp1](%[cm]) "
1074 "\n\t"
1075 "lbux %[tn1], %[Temp2](%[cm]) "
1076 "\n\t"
1077 "lbux %[tp2], %[Temp3](%[cm]) "
1078 "\n\t"
1079 "sb %[tp1], 0(%[output_ptr]) "
1080 "\n\t"
1081 "sb %[tn1], 1(%[output_ptr]) "
1082 "\n\t"
1083 "lbux %[n2], %[Temp4](%[cm]) "
1084 "\n\t"
1085 "sb %[tp2], 2(%[output_ptr]) "
1086 "\n\t"
1087 "sb %[n2], 3(%[output_ptr]) "
1088 "\n\t"
1089
1090 : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
1091 [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [n1] "=&r"(n1),
1092 [n2] "=&r"(n2), [n3] "=&r"(n3), [Temp1] "=&r"(Temp1),
1093 [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4)
1094 : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
1095 [vector4a] "r"(vector4a), [cm] "r"(cm),
1096 [output_ptr] "r"(output_ptr), [src_ptr] "r"(src_ptr));
1097
1098 src_ptr += 4;
1099 }
1100
1101 /* next row... */
1102 src_ptr += src_pixels_per_line;
1103 output_ptr += output_width;
1104 }
1105 }
1106 }
1107
/*
 * Second (vertical) pass of the VP8 sub-pixel interpolation filter for a
 * 4-pixel-wide block, implemented with MIPS DSPr2 inline assembly.
 *
 * src_ptr points into the first-pass intermediate buffer, which is read
 * with a fixed stride of 4 bytes (the tap offsets below step by +/-4).
 * yoffset indexes sub_pel_filterss[] to pick the packed coefficient
 * pairs; a non-zero first word means the full 6-tap filter is applied,
 * otherwise the cheaper 4-tap path runs.  Four output rows of 4 pixels
 * are produced (the i = 2 loop stores two rows per iteration), each
 * clamped to [0, 255] via the ff_cropTbl lookup and written
 * output_pitch bytes apart.
 */
vp8_filter_block2d_second_pass4(unsigned char * RESTRICT src_ptr,unsigned char * RESTRICT output_ptr,int output_pitch,int yoffset)1108 void vp8_filter_block2d_second_pass4(unsigned char *RESTRICT src_ptr,
1109 unsigned char *RESTRICT output_ptr,
1110 int output_pitch, int yoffset) {
1111 unsigned int i;
1112
/* per-pixel filtered sums, later clamped through cm[] */
1113 int Temp1, Temp2, Temp3, Temp4;
1114 unsigned int vector1b, vector2b, vector3b, vector4a;
1115
1116 unsigned char src_ptr_l2;
1117 unsigned char src_ptr_l1;
1118 unsigned char src_ptr_0;
1119 unsigned char src_ptr_r1;
1120 unsigned char src_ptr_r2;
1121 unsigned char src_ptr_r3;
1122
/* clamp table centred at CROP_WIDTH so out-of-range sums index safely */
1123 unsigned char *cm = ff_cropTbl + CROP_WIDTH;
1124
/* rounding value seeded into each DSP accumulator (mtlo below) */
1125 vector4a = 64;
1126
1127 /* load filter coefficients */
/* NOTE: words are packed coefficient pairs; selection order is [0], [2], [1] */
1128 vector1b = sub_pel_filterss[yoffset][0];
1129 vector2b = sub_pel_filterss[yoffset][2];
1130 vector3b = sub_pel_filterss[yoffset][1];
1131
1132 if (vector1b) {
1133 /* 6 tap filter */
1134
/* each iteration filters and stores two output rows (4 rows total) */
1135 for (i = 2; i--;) {
1136 /* prefetch src_ptr data to cache memory */
1137 prefetch_load(src_ptr);
1138
1139 /* do not allow compiler to reorder instructions */
1140 __asm__ __volatile__(
1141 ".set noreorder \n\t"
1142 :
1143 :);
1144
1145 /* apply filter with vectors pairs */
1146 __asm__ __volatile__(
1147 "lbu %[src_ptr_l2], -8(%[src_ptr]) \n\t"
1148 "lbu %[src_ptr_l1], -4(%[src_ptr]) \n\t"
1149 "lbu %[src_ptr_0], 0(%[src_ptr]) \n\t"
1150 "lbu %[src_ptr_r1], 4(%[src_ptr]) \n\t"
1151 "lbu %[src_ptr_r2], 8(%[src_ptr]) \n\t"
1152 "lbu %[src_ptr_r3], 12(%[src_ptr]) \n\t"
1153 "mtlo %[vector4a], $ac2 \n\t"
1154
1155 "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
1156 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
1157 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
1158 "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t"
1159 "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
1160 "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"
1161
1162 "lbu %[src_ptr_l2], -7(%[src_ptr]) \n\t"
1163 "lbu %[src_ptr_l1], -3(%[src_ptr]) \n\t"
1164 "lbu %[src_ptr_0], 1(%[src_ptr]) \n\t"
1165 "lbu %[src_ptr_r1], 5(%[src_ptr]) \n\t"
1166 "lbu %[src_ptr_r2], 9(%[src_ptr]) \n\t"
1167 "lbu %[src_ptr_r3], 13(%[src_ptr]) \n\t"
1168 "mtlo %[vector4a], $ac3 \n\t"
1169 "extp %[Temp1], $ac2, 9 \n\t"
1170
1171 "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
1172 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
1173 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
1174 "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t"
1175 "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
1176 "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"
1177
1178 "lbu %[src_ptr_l2], -6(%[src_ptr]) \n\t"
1179 "lbu %[src_ptr_l1], -2(%[src_ptr]) \n\t"
1180 "lbu %[src_ptr_0], 2(%[src_ptr]) \n\t"
1181 "lbu %[src_ptr_r1], 6(%[src_ptr]) \n\t"
1182 "lbu %[src_ptr_r2], 10(%[src_ptr]) \n\t"
1183 "lbu %[src_ptr_r3], 14(%[src_ptr]) \n\t"
1184 "mtlo %[vector4a], $ac0 \n\t"
1185 "extp %[Temp2], $ac3, 9 \n\t"
1186
1187 "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
1188 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
1189 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
1190 "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t"
1191 "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
1192 "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"
1193
1194 "lbu %[src_ptr_l2], -5(%[src_ptr]) \n\t"
1195 "lbu %[src_ptr_l1], -1(%[src_ptr]) \n\t"
1196 "lbu %[src_ptr_0], 3(%[src_ptr]) \n\t"
1197 "lbu %[src_ptr_r1], 7(%[src_ptr]) \n\t"
1198 "lbu %[src_ptr_r2], 11(%[src_ptr]) \n\t"
1199 "lbu %[src_ptr_r3], 15(%[src_ptr]) \n\t"
1200 "mtlo %[vector4a], $ac1 \n\t"
1201 "extp %[Temp3], $ac0, 9 \n\t"
1202
1203 "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
1204 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
1205 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
1206 "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t"
1207 "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
1208 "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
1209 "extp %[Temp4], $ac1, 9 \n\t"
1210
1211 : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
1212 [Temp4] "=r"(Temp4), [src_ptr_l1] "=&r"(src_ptr_l1),
1213 [src_ptr_0] "=&r"(src_ptr_0), [src_ptr_r1] "=&r"(src_ptr_r1),
1214 [src_ptr_r2] "=&r"(src_ptr_r2), [src_ptr_l2] "=&r"(src_ptr_l2),
1215 [src_ptr_r3] "=&r"(src_ptr_r3)
1216 : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
1217 [vector3b] "r"(vector3b), [vector4a] "r"(vector4a),
1218 [src_ptr] "r"(src_ptr));
1219
1220 /* clamp and store results */
1221 output_ptr[0] = cm[Temp1];
1222 output_ptr[1] = cm[Temp2];
1223 output_ptr[2] = cm[Temp3];
1224 output_ptr[3] = cm[Temp4];
1225
1226 output_ptr += output_pitch;
1227
1228 /* apply filter with vectors pairs */
/* second output row: same taps, offsets advanced by one 4-byte row */
1229 __asm__ __volatile__(
1230 "lbu %[src_ptr_l2], -4(%[src_ptr]) \n\t"
1231 "lbu %[src_ptr_l1], 0(%[src_ptr]) \n\t"
1232 "lbu %[src_ptr_0], 4(%[src_ptr]) \n\t"
1233 "lbu %[src_ptr_r1], 8(%[src_ptr]) \n\t"
1234 "lbu %[src_ptr_r2], 12(%[src_ptr]) \n\t"
1235 "lbu %[src_ptr_r3], 16(%[src_ptr]) \n\t"
1236 "mtlo %[vector4a], $ac2 \n\t"
1237 "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
1238 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
1239 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
1240 "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t"
1241 "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
1242 "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"
1243
1244 "lbu %[src_ptr_l2], -3(%[src_ptr]) \n\t"
1245 "lbu %[src_ptr_l1], 1(%[src_ptr]) \n\t"
1246 "lbu %[src_ptr_0], 5(%[src_ptr]) \n\t"
1247 "lbu %[src_ptr_r1], 9(%[src_ptr]) \n\t"
1248 "lbu %[src_ptr_r2], 13(%[src_ptr]) \n\t"
1249 "lbu %[src_ptr_r3], 17(%[src_ptr]) \n\t"
1250 "mtlo %[vector4a], $ac3 \n\t"
1251 "extp %[Temp1], $ac2, 9 \n\t"
1252
1253 "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
1254 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
1255 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
1256 "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t"
1257 "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
1258 "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"
1259
1260 "lbu %[src_ptr_l2], -2(%[src_ptr]) \n\t"
1261 "lbu %[src_ptr_l1], 2(%[src_ptr]) \n\t"
1262 "lbu %[src_ptr_0], 6(%[src_ptr]) \n\t"
1263 "lbu %[src_ptr_r1], 10(%[src_ptr]) \n\t"
1264 "lbu %[src_ptr_r2], 14(%[src_ptr]) \n\t"
1265 "lbu %[src_ptr_r3], 18(%[src_ptr]) \n\t"
1266 "mtlo %[vector4a], $ac0 \n\t"
1267 "extp %[Temp2], $ac3, 9 \n\t"
1268
1269 "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
1270 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
1271 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
1272 "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t"
1273 "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
1274 "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"
1275
1276 "lbu %[src_ptr_l2], -1(%[src_ptr]) \n\t"
1277 "lbu %[src_ptr_l1], 3(%[src_ptr]) \n\t"
1278 "lbu %[src_ptr_0], 7(%[src_ptr]) \n\t"
1279 "lbu %[src_ptr_r1], 11(%[src_ptr]) \n\t"
1280 "lbu %[src_ptr_r2], 15(%[src_ptr]) \n\t"
1281 "lbu %[src_ptr_r3], 19(%[src_ptr]) \n\t"
1282 "mtlo %[vector4a], $ac1 \n\t"
1283 "extp %[Temp3], $ac0, 9 \n\t"
1284
1285 "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
1286 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
1287 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
1288 "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t"
1289 "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
1290 "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
1291 "extp %[Temp4], $ac1, 9 \n\t"
1292
1293 : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
1294 [Temp4] "=r"(Temp4), [src_ptr_l1] "=&r"(src_ptr_l1),
1295 [src_ptr_0] "=&r"(src_ptr_0), [src_ptr_r1] "=&r"(src_ptr_r1),
1296 [src_ptr_r2] "=&r"(src_ptr_r2), [src_ptr_l2] "=&r"(src_ptr_l2),
1297 [src_ptr_r3] "=&r"(src_ptr_r3)
1298 : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
1299 [vector3b] "r"(vector3b), [vector4a] "r"(vector4a),
1300 [src_ptr] "r"(src_ptr));
1301
1302 /* clamp and store results */
1303 output_ptr[0] = cm[Temp1];
1304 output_ptr[1] = cm[Temp2];
1305 output_ptr[2] = cm[Temp3];
1306 output_ptr[3] = cm[Temp4];
1307
1308 src_ptr += 8;
1309 output_ptr += output_pitch;
1310 }
1311 } else {
1312 /* 4 tap filter */
1313
1314 /* prefetch src_ptr data to cache memory */
1315 prefetch_load(src_ptr);
1316
/* two rows per iteration, as in the 6-tap path */
1317 for (i = 2; i--;) {
1318 /* do not allow compiler to reorder instructions */
1319 __asm__ __volatile__(
1320 ".set noreorder \n\t"
1321 :
1322 :);
1323
1324 /* apply filter with vectors pairs */
1325 __asm__ __volatile__(
1326 "lbu %[src_ptr_l1], -4(%[src_ptr]) \n\t"
1327 "lbu %[src_ptr_0], 0(%[src_ptr]) \n\t"
1328 "lbu %[src_ptr_r1], 4(%[src_ptr]) \n\t"
1329 "lbu %[src_ptr_r2], 8(%[src_ptr]) \n\t"
1330 "mtlo %[vector4a], $ac2 \n\t"
1331 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
1332 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
1333 "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
1334 "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"
1335
1336 "lbu %[src_ptr_l1], -3(%[src_ptr]) \n\t"
1337 "lbu %[src_ptr_0], 1(%[src_ptr]) \n\t"
1338 "lbu %[src_ptr_r1], 5(%[src_ptr]) \n\t"
1339 "lbu %[src_ptr_r2], 9(%[src_ptr]) \n\t"
1340 "mtlo %[vector4a], $ac3 \n\t"
1341 "extp %[Temp1], $ac2, 9 \n\t"
1342
1343 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
1344 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
1345 "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
1346 "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"
1347
1348 "lbu %[src_ptr_l1], -2(%[src_ptr]) \n\t"
1349 "lbu %[src_ptr_0], 2(%[src_ptr]) \n\t"
1350 "lbu %[src_ptr_r1], 6(%[src_ptr]) \n\t"
1351 "lbu %[src_ptr_r2], 10(%[src_ptr]) \n\t"
1352 "mtlo %[vector4a], $ac0 \n\t"
1353 "extp %[Temp2], $ac3, 9 \n\t"
1354
1355 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
1356 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
1357 "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
1358 "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"
1359
1360 "lbu %[src_ptr_l1], -1(%[src_ptr]) \n\t"
1361 "lbu %[src_ptr_0], 3(%[src_ptr]) \n\t"
1362 "lbu %[src_ptr_r1], 7(%[src_ptr]) \n\t"
1363 "lbu %[src_ptr_r2], 11(%[src_ptr]) \n\t"
1364 "mtlo %[vector4a], $ac1 \n\t"
1365 "extp %[Temp3], $ac0, 9 \n\t"
1366 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
1367 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
1368 "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
1369 "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
1370 "extp %[Temp4], $ac1, 9 \n\t"
1371
1372 : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
1373 [Temp4] "=r"(Temp4), [src_ptr_l1] "=&r"(src_ptr_l1),
1374 [src_ptr_0] "=&r"(src_ptr_0), [src_ptr_r1] "=&r"(src_ptr_r1),
1375 [src_ptr_r2] "=&r"(src_ptr_r2)
1376 : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b),
1377 [vector4a] "r"(vector4a), [src_ptr] "r"(src_ptr));
1378
1379 /* clamp and store results */
1380 output_ptr[0] = cm[Temp1];
1381 output_ptr[1] = cm[Temp2];
1382 output_ptr[2] = cm[Temp3];
1383 output_ptr[3] = cm[Temp4];
1384
1385 output_ptr += output_pitch;
1386
1387 /* apply filter with vectors pairs */
/* second output row: tap offsets advanced by one 4-byte row */
1388 __asm__ __volatile__(
1389 "lbu %[src_ptr_l1], 0(%[src_ptr]) \n\t"
1390 "lbu %[src_ptr_0], 4(%[src_ptr]) \n\t"
1391 "lbu %[src_ptr_r1], 8(%[src_ptr]) \n\t"
1392 "lbu %[src_ptr_r2], 12(%[src_ptr]) \n\t"
1393 "mtlo %[vector4a], $ac2 \n\t"
1394 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
1395 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
1396 "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
1397 "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"
1398
1399 "lbu %[src_ptr_l1], 1(%[src_ptr]) \n\t"
1400 "lbu %[src_ptr_0], 5(%[src_ptr]) \n\t"
1401 "lbu %[src_ptr_r1], 9(%[src_ptr]) \n\t"
1402 "lbu %[src_ptr_r2], 13(%[src_ptr]) \n\t"
1403 "mtlo %[vector4a], $ac3 \n\t"
1404 "extp %[Temp1], $ac2, 9 \n\t"
1405
1406 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
1407 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
1408 "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
1409 "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"
1410
1411 "lbu %[src_ptr_l1], 2(%[src_ptr]) \n\t"
1412 "lbu %[src_ptr_0], 6(%[src_ptr]) \n\t"
1413 "lbu %[src_ptr_r1], 10(%[src_ptr]) \n\t"
1414 "lbu %[src_ptr_r2], 14(%[src_ptr]) \n\t"
1415 "mtlo %[vector4a], $ac0 \n\t"
1416 "extp %[Temp2], $ac3, 9 \n\t"
1417
1418 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
1419 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
1420 "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
1421 "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"
1422
1423 "lbu %[src_ptr_l1], 3(%[src_ptr]) \n\t"
1424 "lbu %[src_ptr_0], 7(%[src_ptr]) \n\t"
1425 "lbu %[src_ptr_r1], 11(%[src_ptr]) \n\t"
1426 "lbu %[src_ptr_r2], 15(%[src_ptr]) \n\t"
1427 "mtlo %[vector4a], $ac1 \n\t"
1428 "extp %[Temp3], $ac0, 9 \n\t"
1429 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
1430 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
1431 "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
1432 "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
1433 "extp %[Temp4], $ac1, 9 \n\t"
1434
1435 : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
1436 [Temp4] "=r"(Temp4), [src_ptr_l1] "=&r"(src_ptr_l1),
1437 [src_ptr_0] "=&r"(src_ptr_0), [src_ptr_r1] "=&r"(src_ptr_r1),
1438 [src_ptr_r2] "=&r"(src_ptr_r2)
1439 : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b),
1440 [vector4a] "r"(vector4a), [src_ptr] "r"(src_ptr));
1441
1442 /* clamp and store results */
1443 output_ptr[0] = cm[Temp1];
1444 output_ptr[1] = cm[Temp2];
1445 output_ptr[2] = cm[Temp3];
1446 output_ptr[3] = cm[Temp4];
1447
1448 src_ptr += 8;
1449 output_ptr += output_pitch;
1450 }
1451 }
1452 }
1453
/*
 * Second (vertical) pass of the VP8 sub-pixel interpolation filter for
 * 8-pixel-wide blocks (MIPS DSPr2 inline assembly).
 *
 * The first-pass intermediate buffer at src_ptr is read with a fixed
 * stride of 8 bytes (tap offsets step by +/-8).  yoffset selects the
 * packed coefficient pairs from sub_pel_filterss[]; a non-zero first
 * word selects the 6-tap filter, otherwise the 4-tap path runs.
 * output_height rows of 8 pixels are written, output_pitch bytes
 * apart, each clamped to [0, 255] through the ff_cropTbl lookup.
 * output_width is accepted for interface compatibility but unused.
 */
vp8_filter_block2d_second_pass_8(unsigned char * RESTRICT src_ptr,unsigned char * RESTRICT output_ptr,int output_pitch,unsigned int output_height,unsigned int output_width,unsigned int yoffset)1454 void vp8_filter_block2d_second_pass_8(unsigned char *RESTRICT src_ptr,
1455 unsigned char *RESTRICT output_ptr,
1456 int output_pitch,
1457 unsigned int output_height,
1458 unsigned int output_width,
1459 unsigned int yoffset) {
1460 unsigned int i;
1461
/* eight filtered sums per row, clamped through cm[] below */
1462 int Temp1, Temp2, Temp3, Temp4, Temp5, Temp6, Temp7, Temp8;
1463 unsigned int vector1b, vector2b, vector3b, vector4a;
1464
1465 unsigned char src_ptr_l2;
1466 unsigned char src_ptr_l1;
1467 unsigned char src_ptr_0;
1468 unsigned char src_ptr_r1;
1469 unsigned char src_ptr_r2;
1470 unsigned char src_ptr_r3;
1471 unsigned char *cm = ff_cropTbl + CROP_WIDTH;
/* output_width is not needed by this implementation */
1472 (void)output_width;
1473
/* rounding value seeded into each DSP accumulator (mtlo below) */
1474 vector4a = 64;
1475
/* packed coefficient pairs; note the [0], [2], [1] selection order */
1476 vector1b = sub_pel_filterss[yoffset][0];
1477 vector2b = sub_pel_filterss[yoffset][2];
1478 vector3b = sub_pel_filterss[yoffset][1];
1479
1480 if (vector1b) {
1481 /* 6 tap filter */
1482
1483 /* prefetch src_ptr data to cache memory */
1484 prefetch_load(src_ptr);
1485
/* one 8-pixel output row per iteration */
1486 for (i = output_height; i--;) {
1487 /* apply filter with vectors pairs */
1488 __asm__ __volatile__(
1489 "lbu %[src_ptr_l2], -16(%[src_ptr]) \n\t"
1490 "lbu %[src_ptr_l1], -8(%[src_ptr]) \n\t"
1491 "lbu %[src_ptr_0], 0(%[src_ptr]) \n\t"
1492 "lbu %[src_ptr_r1], 8(%[src_ptr]) \n\t"
1493 "lbu %[src_ptr_r2], 16(%[src_ptr]) \n\t"
1494 "lbu %[src_ptr_r3], 24(%[src_ptr]) \n\t"
1495 "mtlo %[vector4a], $ac2 \n\t"
1496
1497 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
1498 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
1499 "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
1500 "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t"
1501 "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
1502 "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"
1503
1504 "lbu %[src_ptr_l2], -15(%[src_ptr]) \n\t"
1505 "lbu %[src_ptr_l1], -7(%[src_ptr]) \n\t"
1506 "lbu %[src_ptr_0], 1(%[src_ptr]) \n\t"
1507 "lbu %[src_ptr_r1], 9(%[src_ptr]) \n\t"
1508 "lbu %[src_ptr_r2], 17(%[src_ptr]) \n\t"
1509 "lbu %[src_ptr_r3], 25(%[src_ptr]) \n\t"
1510 "mtlo %[vector4a], $ac3 \n\t"
1511 "extp %[Temp1], $ac2, 9 \n\t"
1512
1513 "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
1514 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
1515 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
1516 "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t"
1517 "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
1518 "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"
1519
1520 "lbu %[src_ptr_l2], -14(%[src_ptr]) \n\t"
1521 "lbu %[src_ptr_l1], -6(%[src_ptr]) \n\t"
1522 "lbu %[src_ptr_0], 2(%[src_ptr]) \n\t"
1523 "lbu %[src_ptr_r1], 10(%[src_ptr]) \n\t"
1524 "lbu %[src_ptr_r2], 18(%[src_ptr]) \n\t"
1525 "lbu %[src_ptr_r3], 26(%[src_ptr]) \n\t"
1526 "mtlo %[vector4a], $ac0 \n\t"
1527 "extp %[Temp2], $ac3, 9 \n\t"
1528
1529 "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
1530 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
1531 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
1532 "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t"
1533 "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
1534 "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"
1535
1536 "lbu %[src_ptr_l2], -13(%[src_ptr]) \n\t"
1537 "lbu %[src_ptr_l1], -5(%[src_ptr]) \n\t"
1538 "lbu %[src_ptr_0], 3(%[src_ptr]) \n\t"
1539 "lbu %[src_ptr_r1], 11(%[src_ptr]) \n\t"
1540 "lbu %[src_ptr_r2], 19(%[src_ptr]) \n\t"
1541 "lbu %[src_ptr_r3], 27(%[src_ptr]) \n\t"
1542 "mtlo %[vector4a], $ac1 \n\t"
1543 "extp %[Temp3], $ac0, 9 \n\t"
1544
1545 "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
1546 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
1547 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
1548 "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t"
1549 "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
1550 "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
1551
1552 : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
1553 [src_ptr_l1] "=&r"(src_ptr_l1), [src_ptr_0] "=&r"(src_ptr_0),
1554 [src_ptr_r1] "=&r"(src_ptr_r1), [src_ptr_r2] "=&r"(src_ptr_r2),
1555 [src_ptr_l2] "=&r"(src_ptr_l2), [src_ptr_r3] "=&r"(src_ptr_r3)
1556 : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
1557 [vector3b] "r"(vector3b), [vector4a] "r"(vector4a),
1558 [src_ptr] "r"(src_ptr));
1559
1560 /* apply filter with vectors pairs */
/* second half of the row: pixel columns 4..7 (Temp4 drains $ac1 here) */
1561 __asm__ __volatile__(
1562 "lbu %[src_ptr_l2], -12(%[src_ptr]) \n\t"
1563 "lbu %[src_ptr_l1], -4(%[src_ptr]) \n\t"
1564 "lbu %[src_ptr_0], 4(%[src_ptr]) \n\t"
1565 "lbu %[src_ptr_r1], 12(%[src_ptr]) \n\t"
1566 "lbu %[src_ptr_r2], 20(%[src_ptr]) \n\t"
1567 "lbu %[src_ptr_r3], 28(%[src_ptr]) \n\t"
1568 "mtlo %[vector4a], $ac2 \n\t"
1569
1570 "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
1571 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
1572 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
1573 "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t"
1574 "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
1575 "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"
1576 "extp %[Temp4], $ac1, 9 \n\t"
1577
1578 "lbu %[src_ptr_l2], -11(%[src_ptr]) \n\t"
1579 "lbu %[src_ptr_l1], -3(%[src_ptr]) \n\t"
1580 "lbu %[src_ptr_0], 5(%[src_ptr]) \n\t"
1581 "lbu %[src_ptr_r1], 13(%[src_ptr]) \n\t"
1582 "lbu %[src_ptr_r2], 21(%[src_ptr]) \n\t"
1583 "lbu %[src_ptr_r3], 29(%[src_ptr]) \n\t"
1584 "mtlo %[vector4a], $ac3 \n\t"
1585 "extp %[Temp5], $ac2, 9 \n\t"
1586
1587 "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
1588 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
1589 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
1590 "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t"
1591 "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
1592 "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"
1593
1594 "lbu %[src_ptr_l2], -10(%[src_ptr]) \n\t"
1595 "lbu %[src_ptr_l1], -2(%[src_ptr]) \n\t"
1596 "lbu %[src_ptr_0], 6(%[src_ptr]) \n\t"
1597 "lbu %[src_ptr_r1], 14(%[src_ptr]) \n\t"
1598 "lbu %[src_ptr_r2], 22(%[src_ptr]) \n\t"
1599 "lbu %[src_ptr_r3], 30(%[src_ptr]) \n\t"
1600 "mtlo %[vector4a], $ac0 \n\t"
1601 "extp %[Temp6], $ac3, 9 \n\t"
1602
1603 "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
1604 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
1605 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
1606 "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t"
1607 "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
1608 "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"
1609
1610 "lbu %[src_ptr_l2], -9(%[src_ptr]) \n\t"
1611 "lbu %[src_ptr_l1], -1(%[src_ptr]) \n\t"
1612 "lbu %[src_ptr_0], 7(%[src_ptr]) \n\t"
1613 "lbu %[src_ptr_r1], 15(%[src_ptr]) \n\t"
1614 "lbu %[src_ptr_r2], 23(%[src_ptr]) \n\t"
1615 "lbu %[src_ptr_r3], 31(%[src_ptr]) \n\t"
1616 "mtlo %[vector4a], $ac1 \n\t"
1617 "extp %[Temp7], $ac0, 9 \n\t"
1618
1619 "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
1620 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
1621 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
1622 "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t"
1623 "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
1624 "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
1625 "extp %[Temp8], $ac1, 9 \n\t"
1626
1627 : [Temp4] "=&r"(Temp4), [Temp5] "=&r"(Temp5), [Temp6] "=&r"(Temp6),
1628 [Temp7] "=&r"(Temp7), [Temp8] "=r"(Temp8),
1629 [src_ptr_l1] "=&r"(src_ptr_l1), [src_ptr_0] "=&r"(src_ptr_0),
1630 [src_ptr_r1] "=&r"(src_ptr_r1), [src_ptr_r2] "=&r"(src_ptr_r2),
1631 [src_ptr_l2] "=&r"(src_ptr_l2), [src_ptr_r3] "=&r"(src_ptr_r3)
1632 : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
1633 [vector3b] "r"(vector3b), [vector4a] "r"(vector4a),
1634 [src_ptr] "r"(src_ptr));
1635
1636 /* clamp and store results */
1637 output_ptr[0] = cm[Temp1];
1638 output_ptr[1] = cm[Temp2];
1639 output_ptr[2] = cm[Temp3];
1640 output_ptr[3] = cm[Temp4];
1641 output_ptr[4] = cm[Temp5];
1642 output_ptr[5] = cm[Temp6];
1643 output_ptr[6] = cm[Temp7];
1644 output_ptr[7] = cm[Temp8];
1645
1646 src_ptr += 8;
1647 output_ptr += output_pitch;
1648 }
1649 } else {
1650 /* 4 tap filter */
1651
1652 /* prefetch src_ptr data to cache memory */
1653 prefetch_load(src_ptr);
1654
1655 for (i = output_height; i--;) {
/* columns 0 and 1 of the row: all tap bytes loaded inside the asm */
1656 __asm__ __volatile__(
1657 "lbu %[src_ptr_l1], -8(%[src_ptr]) \n\t"
1658 "lbu %[src_ptr_0], 0(%[src_ptr]) \n\t"
1659 "lbu %[src_ptr_r1], 8(%[src_ptr]) \n\t"
1660 "lbu %[src_ptr_r2], 16(%[src_ptr]) \n\t"
1661 "mtlo %[vector4a], $ac2 \n\t"
1662 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
1663 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
1664 "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
1665 "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"
1666
1667 : [src_ptr_l1] "=&r"(src_ptr_l1), [src_ptr_0] "=&r"(src_ptr_0),
1668 [src_ptr_r1] "=&r"(src_ptr_r1), [src_ptr_r2] "=&r"(src_ptr_r2)
1669 : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b),
1670 [vector4a] "r"(vector4a), [src_ptr] "r"(src_ptr));
1671
1672 __asm__ __volatile__(
1673 "lbu %[src_ptr_l1], -7(%[src_ptr]) \n\t"
1674 "lbu %[src_ptr_0], 1(%[src_ptr]) \n\t"
1675 "lbu %[src_ptr_r1], 9(%[src_ptr]) \n\t"
1676 "lbu %[src_ptr_r2], 17(%[src_ptr]) \n\t"
1677 "mtlo %[vector4a], $ac3 \n\t"
1678 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
1679 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
1680 "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
1681 "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"
1682 "extp %[Temp1], $ac2, 9 \n\t"
1683
1684 : [Temp1] "=r"(Temp1), [src_ptr_l1] "=&r"(src_ptr_l1),
1685 [src_ptr_0] "=&r"(src_ptr_0), [src_ptr_r1] "=&r"(src_ptr_r1),
1686 [src_ptr_r2] "=&r"(src_ptr_r2)
1687 : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b),
1688 [vector4a] "r"(vector4a), [src_ptr] "r"(src_ptr));
1689
/* remaining columns: C byte loads feed the accumulators one pixel at a time */
1690 src_ptr_l1 = src_ptr[-6];
1691 src_ptr_0 = src_ptr[2];
1692 src_ptr_r1 = src_ptr[10];
1693 src_ptr_r2 = src_ptr[18];
1694
1695 __asm__ __volatile__(
1696 "mtlo %[vector4a], $ac0 \n\t"
1697 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
1698 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
1699 "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
1700 "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"
1701 "extp %[Temp2], $ac3, 9 \n\t"
1702
1703 : [Temp2] "=r"(Temp2)
1704 : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b),
1705 [src_ptr_l1] "r"(src_ptr_l1), [src_ptr_0] "r"(src_ptr_0),
1706 [src_ptr_r1] "r"(src_ptr_r1), [src_ptr_r2] "r"(src_ptr_r2),
1707 [vector4a] "r"(vector4a));
1708
1709 src_ptr_l1 = src_ptr[-5];
1710 src_ptr_0 = src_ptr[3];
1711 src_ptr_r1 = src_ptr[11];
1712 src_ptr_r2 = src_ptr[19];
1713
1714 __asm__ __volatile__(
1715 "mtlo %[vector4a], $ac1 \n\t"
1716 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
1717 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
1718 "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
1719 "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
1720 "extp %[Temp3], $ac0, 9 \n\t"
1721
1722 : [Temp3] "=r"(Temp3)
1723 : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b),
1724 [src_ptr_l1] "r"(src_ptr_l1), [src_ptr_0] "r"(src_ptr_0),
1725 [src_ptr_r1] "r"(src_ptr_r1), [src_ptr_r2] "r"(src_ptr_r2),
1726 [vector4a] "r"(vector4a));
1727
1728 src_ptr_l1 = src_ptr[-4];
1729 src_ptr_0 = src_ptr[4];
1730 src_ptr_r1 = src_ptr[12];
1731 src_ptr_r2 = src_ptr[20];
1732
1733 __asm__ __volatile__(
1734 "mtlo %[vector4a], $ac2 \n\t"
1735 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
1736 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
1737 "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
1738 "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"
1739 "extp %[Temp4], $ac1, 9 \n\t"
1740
1741 : [Temp4] "=r"(Temp4)
1742 : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b),
1743 [src_ptr_l1] "r"(src_ptr_l1), [src_ptr_0] "r"(src_ptr_0),
1744 [src_ptr_r1] "r"(src_ptr_r1), [src_ptr_r2] "r"(src_ptr_r2),
1745 [vector4a] "r"(vector4a));
1746
1747 src_ptr_l1 = src_ptr[-3];
1748 src_ptr_0 = src_ptr[5];
1749 src_ptr_r1 = src_ptr[13];
1750 src_ptr_r2 = src_ptr[21];
1751
1752 __asm__ __volatile__(
1753 "mtlo %[vector4a], $ac3 \n\t"
1754 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
1755 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
1756 "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
1757 "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"
1758 "extp %[Temp5], $ac2, 9 \n\t"
1759
1760 : [Temp5] "=&r"(Temp5)
1761 : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b),
1762 [src_ptr_l1] "r"(src_ptr_l1), [src_ptr_0] "r"(src_ptr_0),
1763 [src_ptr_r1] "r"(src_ptr_r1), [src_ptr_r2] "r"(src_ptr_r2),
1764 [vector4a] "r"(vector4a));
1765
1766 src_ptr_l1 = src_ptr[-2];
1767 src_ptr_0 = src_ptr[6];
1768 src_ptr_r1 = src_ptr[14];
1769 src_ptr_r2 = src_ptr[22];
1770
1771 __asm__ __volatile__(
1772 "mtlo %[vector4a], $ac0 \n\t"
1773 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
1774 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
1775 "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
1776 "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"
1777 "extp %[Temp6], $ac3, 9 \n\t"
1778
1779 : [Temp6] "=r"(Temp6)
1780 : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b),
1781 [src_ptr_l1] "r"(src_ptr_l1), [src_ptr_0] "r"(src_ptr_0),
1782 [src_ptr_r1] "r"(src_ptr_r1), [src_ptr_r2] "r"(src_ptr_r2),
1783 [vector4a] "r"(vector4a));
1784
1785 src_ptr_l1 = src_ptr[-1];
1786 src_ptr_0 = src_ptr[7];
1787 src_ptr_r1 = src_ptr[15];
1788 src_ptr_r2 = src_ptr[23];
1789
1790 __asm__ __volatile__(
1791 "mtlo %[vector4a], $ac1 \n\t"
1792 "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
1793 "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
1794 "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
1795 "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"
1796 "extp %[Temp7], $ac0, 9 \n\t"
1797 "extp %[Temp8], $ac1, 9 \n\t"
1798
1799 : [Temp7] "=&r"(Temp7), [Temp8] "=r"(Temp8)
1800 : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b),
1801 [src_ptr_l1] "r"(src_ptr_l1), [src_ptr_0] "r"(src_ptr_0),
1802 [src_ptr_r1] "r"(src_ptr_r1), [src_ptr_r2] "r"(src_ptr_r2),
1803 [vector4a] "r"(vector4a));
1804
1805 /* clamp and store results */
1806 output_ptr[0] = cm[Temp1];
1807 output_ptr[1] = cm[Temp2];
1808 output_ptr[2] = cm[Temp3];
1809 output_ptr[3] = cm[Temp4];
1810 output_ptr[4] = cm[Temp5];
1811 output_ptr[5] = cm[Temp6];
1812 output_ptr[6] = cm[Temp7];
1813 output_ptr[7] = cm[Temp8];
1814
1815 src_ptr += 8;
1816 output_ptr += output_pitch;
1817 }
1818 }
1819 }
1820
/* Vertical (second) pass of the two-pass sub-pel filter for 16x16 blocks.
 * src_ptr points into the 16-wide intermediate buffer written by the
 * horizontal first pass, so vertically adjacent samples are 16 bytes apart.
 * Produces 16 rows of 16 pixels, clamps each through the crop table and
 * stores them with stride output_pitch.
 *
 * vp8_filter is one row of sub_pel_filterss[]: each halfword packs two
 * 8-bit taps for the dual dpau.h.qbr / dpsu.h.qbr multiply-accumulates;
 * vp8_filter[0] == 0 selects the 4-tap path, otherwise 6-tap.
 *
 * NOTE(review): the DSP accumulators $ac0-$ac3 are written by the asm but
 * not declared as clobbers; the DSPr2 toolchain this was written for
 * tolerates that — confirm before building with a different compiler. */
void vp8_filter_block2d_second_pass161(unsigned char *RESTRICT src_ptr,
                                       unsigned char *RESTRICT output_ptr,
                                       int output_pitch,
                                       const unsigned short *vp8_filter) {
  unsigned int i, j;

  int Temp1, Temp2, Temp3, Temp4, Temp5, Temp6, Temp7, Temp8;
  unsigned int vector4a;                     /* rounding constant (64) preloaded into each acc */
  unsigned int vector1b, vector2b, vector3b; /* packed filter tap pairs */

  unsigned char src_ptr_l2; /* sample 2 rows above */
  unsigned char src_ptr_l1; /* sample 1 row above */
  unsigned char src_ptr_0;  /* centre sample */
  unsigned char src_ptr_r1; /* sample 1 row below */
  unsigned char src_ptr_r2; /* sample 2 rows below */
  unsigned char src_ptr_r3; /* sample 3 rows below */
  unsigned char *cm = ff_cropTbl + CROP_WIDTH; /* clamp-to-[0,255] table */

  vector4a = 64;

  vector1b = vp8_filter[0]; /* outer tap pair; zero for 4-tap filters */
  vector2b = vp8_filter[2]; /* positive tap pair (accumulated with dpau) */
  vector3b = vp8_filter[1]; /* negative tap pair (subtracted with dpsu) */

  if (vector1b == 0) {
    /* 4 tap filter: only rows -1, 0, +1, +2 contribute */

    /* prefetch src_ptr data to cache memory */
    prefetch_load(src_ptr + 16);

    for (i = 16; i--;) {
      /* unrolling for loop: 8 output pixels per asm block, 2 blocks per row */
      for (j = 0; j < 16; j += 8) {
        /* apply filter with vectors pairs; accumulators rotate so each
           extp reads a result while the next column is already summing */
        __asm__ __volatile__(
            "lbu %[src_ptr_l1], -16(%[src_ptr]) "
            "\n\t"
            "lbu %[src_ptr_0], 0(%[src_ptr]) "
            "\n\t"
            "lbu %[src_ptr_r1], 16(%[src_ptr]) "
            "\n\t"
            "lbu %[src_ptr_r2], 32(%[src_ptr]) "
            "\n\t"
            "mtlo %[vector4a], $ac2 "
            "\n\t"
            "append %[src_ptr_0], %[src_ptr_r1], 8 "
            "\n\t"
            "append %[src_ptr_l1], %[src_ptr_r2], 8 "
            "\n\t"
            "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] "
            "\n\t"
            "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] "
            "\n\t"

            "lbu %[src_ptr_l1], -15(%[src_ptr]) "
            "\n\t"
            "lbu %[src_ptr_0], 1(%[src_ptr]) "
            "\n\t"
            "lbu %[src_ptr_r1], 17(%[src_ptr]) "
            "\n\t"
            "lbu %[src_ptr_r2], 33(%[src_ptr]) "
            "\n\t"
            "mtlo %[vector4a], $ac3 "
            "\n\t"
            "extp %[Temp1], $ac2, 9 "
            "\n\t"

            "append %[src_ptr_0], %[src_ptr_r1], 8 "
            "\n\t"
            "append %[src_ptr_l1], %[src_ptr_r2], 8 "
            "\n\t"
            "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] "
            "\n\t"
            "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] "
            "\n\t"

            "lbu %[src_ptr_l1], -14(%[src_ptr]) "
            "\n\t"
            "lbu %[src_ptr_0], 2(%[src_ptr]) "
            "\n\t"
            "lbu %[src_ptr_r1], 18(%[src_ptr]) "
            "\n\t"
            "lbu %[src_ptr_r2], 34(%[src_ptr]) "
            "\n\t"
            "mtlo %[vector4a], $ac1 "
            "\n\t"
            "extp %[Temp2], $ac3, 9 "
            "\n\t"

            "append %[src_ptr_0], %[src_ptr_r1], 8 "
            "\n\t"
            "append %[src_ptr_l1], %[src_ptr_r2], 8 "
            "\n\t"
            "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] "
            "\n\t"
            "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] "
            "\n\t"

            "lbu %[src_ptr_l1], -13(%[src_ptr]) "
            "\n\t"
            "lbu %[src_ptr_0], 3(%[src_ptr]) "
            "\n\t"
            "lbu %[src_ptr_r1], 19(%[src_ptr]) "
            "\n\t"
            "lbu %[src_ptr_r2], 35(%[src_ptr]) "
            "\n\t"
            "mtlo %[vector4a], $ac3 "
            "\n\t"
            "extp %[Temp3], $ac1, 9 "
            "\n\t"

            "append %[src_ptr_0], %[src_ptr_r1], 8 "
            "\n\t"
            "append %[src_ptr_l1], %[src_ptr_r2], 8 "
            "\n\t"
            "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] "
            "\n\t"
            "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] "
            "\n\t"

            "lbu %[src_ptr_l1], -12(%[src_ptr]) "
            "\n\t"
            "lbu %[src_ptr_0], 4(%[src_ptr]) "
            "\n\t"
            "lbu %[src_ptr_r1], 20(%[src_ptr]) "
            "\n\t"
            "lbu %[src_ptr_r2], 36(%[src_ptr]) "
            "\n\t"
            "mtlo %[vector4a], $ac2 "
            "\n\t"
            "extp %[Temp4], $ac3, 9 "
            "\n\t"

            "append %[src_ptr_0], %[src_ptr_r1], 8 "
            "\n\t"
            "append %[src_ptr_l1], %[src_ptr_r2], 8 "
            "\n\t"
            "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] "
            "\n\t"
            "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] "
            "\n\t"

            "lbu %[src_ptr_l1], -11(%[src_ptr]) "
            "\n\t"
            "lbu %[src_ptr_0], 5(%[src_ptr]) "
            "\n\t"
            "lbu %[src_ptr_r1], 21(%[src_ptr]) "
            "\n\t"
            "lbu %[src_ptr_r2], 37(%[src_ptr]) "
            "\n\t"
            "mtlo %[vector4a], $ac3 "
            "\n\t"
            "extp %[Temp5], $ac2, 9 "
            "\n\t"

            "append %[src_ptr_0], %[src_ptr_r1], 8 "
            "\n\t"
            "append %[src_ptr_l1], %[src_ptr_r2], 8 "
            "\n\t"
            "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] "
            "\n\t"
            "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] "
            "\n\t"

            "lbu %[src_ptr_l1], -10(%[src_ptr]) "
            "\n\t"
            "lbu %[src_ptr_0], 6(%[src_ptr]) "
            "\n\t"
            "lbu %[src_ptr_r1], 22(%[src_ptr]) "
            "\n\t"
            "lbu %[src_ptr_r2], 38(%[src_ptr]) "
            "\n\t"
            "mtlo %[vector4a], $ac1 "
            "\n\t"
            "extp %[Temp6], $ac3, 9 "
            "\n\t"

            "append %[src_ptr_0], %[src_ptr_r1], 8 "
            "\n\t"
            "append %[src_ptr_l1], %[src_ptr_r2], 8 "
            "\n\t"
            "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] "
            "\n\t"
            "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] "
            "\n\t"

            "lbu %[src_ptr_l1], -9(%[src_ptr]) "
            "\n\t"
            "lbu %[src_ptr_0], 7(%[src_ptr]) "
            "\n\t"
            "lbu %[src_ptr_r1], 23(%[src_ptr]) "
            "\n\t"
            "lbu %[src_ptr_r2], 39(%[src_ptr]) "
            "\n\t"
            "mtlo %[vector4a], $ac3 "
            "\n\t"
            "extp %[Temp7], $ac1, 9 "
            "\n\t"

            "append %[src_ptr_0], %[src_ptr_r1], 8 "
            "\n\t"
            "append %[src_ptr_l1], %[src_ptr_r2], 8 "
            "\n\t"
            "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] "
            "\n\t"
            "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] "
            "\n\t"
            "extp %[Temp8], $ac3, 9 "
            "\n\t"

            : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
              [Temp4] "=&r"(Temp4), [Temp5] "=&r"(Temp5), [Temp6] "=&r"(Temp6),
              [Temp7] "=&r"(Temp7), [Temp8] "=r"(Temp8),
              [src_ptr_l1] "=&r"(src_ptr_l1), [src_ptr_0] "=&r"(src_ptr_0),
              [src_ptr_r1] "=&r"(src_ptr_r1), [src_ptr_r2] "=&r"(src_ptr_r2)
            : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b),
              [vector4a] "r"(vector4a), [src_ptr] "r"(src_ptr));

        /* clamp and store results */
        output_ptr[j] = cm[Temp1];
        output_ptr[j + 1] = cm[Temp2];
        output_ptr[j + 2] = cm[Temp3];
        output_ptr[j + 3] = cm[Temp4];
        output_ptr[j + 4] = cm[Temp5];
        output_ptr[j + 5] = cm[Temp6];
        output_ptr[j + 6] = cm[Temp7];
        output_ptr[j + 7] = cm[Temp8];

        src_ptr += 8;
      }

      output_ptr += output_pitch;
    }
  } else {
    /* 6 tap filter: rows -2 .. +3 all contribute (comment previously
       mislabelled this branch as "4 tap") */

    /* prefetch src_ptr data to cache memory */
    prefetch_load(src_ptr + 16);

    /* unroll for loop: each iteration produces one 16-pixel output row,
       split into two asm blocks of 8 columns each */
    for (i = 16; i--;) {
      /* apply filter with vectors pairs: columns 0-7 */
      __asm__ __volatile__(
          "lbu %[src_ptr_l2], -32(%[src_ptr]) \n\t"
          "lbu %[src_ptr_l1], -16(%[src_ptr]) \n\t"
          "lbu %[src_ptr_0], 0(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r1], 16(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r2], 32(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r3], 48(%[src_ptr]) \n\t"
          "mtlo %[vector4a], $ac2 \n\t"

          "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
          "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
          "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
          "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t"
          "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
          "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"

          "lbu %[src_ptr_l2], -31(%[src_ptr]) \n\t"
          "lbu %[src_ptr_l1], -15(%[src_ptr]) \n\t"
          "lbu %[src_ptr_0], 1(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r1], 17(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r2], 33(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r3], 49(%[src_ptr]) \n\t"
          "mtlo %[vector4a], $ac0 \n\t"
          "extp %[Temp1], $ac2, 9 \n\t"

          "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
          "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
          "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
          "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t"
          "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
          "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"

          "lbu %[src_ptr_l2], -30(%[src_ptr]) \n\t"
          "lbu %[src_ptr_l1], -14(%[src_ptr]) \n\t"
          "lbu %[src_ptr_0], 2(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r1], 18(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r2], 34(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r3], 50(%[src_ptr]) \n\t"
          "mtlo %[vector4a], $ac1 \n\t"
          "extp %[Temp2], $ac0, 9 \n\t"

          "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
          "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
          "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
          "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t"
          "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
          "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"

          "lbu %[src_ptr_l2], -29(%[src_ptr]) \n\t"
          "lbu %[src_ptr_l1], -13(%[src_ptr]) \n\t"
          "lbu %[src_ptr_0], 3(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r1], 19(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r2], 35(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r3], 51(%[src_ptr]) \n\t"
          "mtlo %[vector4a], $ac3 \n\t"
          "extp %[Temp3], $ac1, 9 \n\t"

          "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
          "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
          "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
          "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t"
          "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
          "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"

          "lbu %[src_ptr_l2], -28(%[src_ptr]) \n\t"
          "lbu %[src_ptr_l1], -12(%[src_ptr]) \n\t"
          "lbu %[src_ptr_0], 4(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r1], 20(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r2], 36(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r3], 52(%[src_ptr]) \n\t"
          "mtlo %[vector4a], $ac2 \n\t"
          "extp %[Temp4], $ac3, 9 \n\t"

          "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
          "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
          "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
          "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t"
          "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
          "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"

          "lbu %[src_ptr_l2], -27(%[src_ptr]) \n\t"
          "lbu %[src_ptr_l1], -11(%[src_ptr]) \n\t"
          "lbu %[src_ptr_0], 5(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r1], 21(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r2], 37(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r3], 53(%[src_ptr]) \n\t"
          "mtlo %[vector4a], $ac0 \n\t"
          "extp %[Temp5], $ac2, 9 \n\t"

          "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
          "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
          "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
          "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t"
          "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
          "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"

          "lbu %[src_ptr_l2], -26(%[src_ptr]) \n\t"
          "lbu %[src_ptr_l1], -10(%[src_ptr]) \n\t"
          "lbu %[src_ptr_0], 6(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r1], 22(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r2], 38(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r3], 54(%[src_ptr]) \n\t"
          "mtlo %[vector4a], $ac1 \n\t"
          "extp %[Temp6], $ac0, 9 \n\t"

          "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
          "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
          "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
          "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t"
          "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
          "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"

          "lbu %[src_ptr_l2], -25(%[src_ptr]) \n\t"
          "lbu %[src_ptr_l1], -9(%[src_ptr]) \n\t"
          "lbu %[src_ptr_0], 7(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r1], 23(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r2], 39(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r3], 55(%[src_ptr]) \n\t"
          "mtlo %[vector4a], $ac3 \n\t"
          "extp %[Temp7], $ac1, 9 \n\t"

          "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
          "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
          "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
          "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t"
          "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
          "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"
          "extp %[Temp8], $ac3, 9 \n\t"

          : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
            [Temp4] "=&r"(Temp4), [Temp5] "=&r"(Temp5), [Temp6] "=&r"(Temp6),
            [Temp7] "=&r"(Temp7), [Temp8] "=r"(Temp8),
            [src_ptr_l1] "=&r"(src_ptr_l1), [src_ptr_0] "=&r"(src_ptr_0),
            [src_ptr_r1] "=&r"(src_ptr_r1), [src_ptr_r2] "=&r"(src_ptr_r2),
            [src_ptr_l2] "=&r"(src_ptr_l2), [src_ptr_r3] "=&r"(src_ptr_r3)
          : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
            [vector3b] "r"(vector3b), [vector4a] "r"(vector4a),
            [src_ptr] "r"(src_ptr));

      /* clamp and store results */
      output_ptr[0] = cm[Temp1];
      output_ptr[1] = cm[Temp2];
      output_ptr[2] = cm[Temp3];
      output_ptr[3] = cm[Temp4];
      output_ptr[4] = cm[Temp5];
      output_ptr[5] = cm[Temp6];
      output_ptr[6] = cm[Temp7];
      output_ptr[7] = cm[Temp8];

      /* apply filter with vectors pairs: columns 8-15 */
      __asm__ __volatile__(
          "lbu %[src_ptr_l2], -24(%[src_ptr]) \n\t"
          "lbu %[src_ptr_l1], -8(%[src_ptr]) \n\t"
          "lbu %[src_ptr_0], 8(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r1], 24(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r2], 40(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r3], 56(%[src_ptr]) \n\t"
          "mtlo %[vector4a], $ac2 \n\t"

          "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
          "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
          "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
          "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t"
          "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
          "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"

          "lbu %[src_ptr_l2], -23(%[src_ptr]) \n\t"
          "lbu %[src_ptr_l1], -7(%[src_ptr]) \n\t"
          "lbu %[src_ptr_0], 9(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r1], 25(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r2], 41(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r3], 57(%[src_ptr]) \n\t"
          "mtlo %[vector4a], $ac0 \n\t"
          "extp %[Temp1], $ac2, 9 \n\t"

          "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
          "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
          "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
          "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t"
          "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
          "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"

          "lbu %[src_ptr_l2], -22(%[src_ptr]) \n\t"
          "lbu %[src_ptr_l1], -6(%[src_ptr]) \n\t"
          "lbu %[src_ptr_0], 10(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r1], 26(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r2], 42(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r3], 58(%[src_ptr]) \n\t"
          "mtlo %[vector4a], $ac1 \n\t"
          "extp %[Temp2], $ac0, 9 \n\t"

          "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
          "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
          "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
          "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t"
          "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
          "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"

          "lbu %[src_ptr_l2], -21(%[src_ptr]) \n\t"
          "lbu %[src_ptr_l1], -5(%[src_ptr]) \n\t"
          "lbu %[src_ptr_0], 11(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r1], 27(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r2], 43(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r3], 59(%[src_ptr]) \n\t"
          "mtlo %[vector4a], $ac3 \n\t"
          "extp %[Temp3], $ac1, 9 \n\t"

          "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
          "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
          "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
          "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t"
          "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
          "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"

          "lbu %[src_ptr_l2], -20(%[src_ptr]) \n\t"
          "lbu %[src_ptr_l1], -4(%[src_ptr]) \n\t"
          "lbu %[src_ptr_0], 12(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r1], 28(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r2], 44(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r3], 60(%[src_ptr]) \n\t"
          "mtlo %[vector4a], $ac2 \n\t"
          "extp %[Temp4], $ac3, 9 \n\t"

          "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
          "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
          "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
          "dpau.h.qbr $ac2, %[src_ptr_l2], %[vector1b] \n\t"
          "dpau.h.qbr $ac2, %[src_ptr_0], %[vector2b] \n\t"
          "dpsu.h.qbr $ac2, %[src_ptr_l1], %[vector3b] \n\t"

          "lbu %[src_ptr_l2], -19(%[src_ptr]) \n\t"
          "lbu %[src_ptr_l1], -3(%[src_ptr]) \n\t"
          "lbu %[src_ptr_0], 13(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r1], 29(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r2], 45(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r3], 61(%[src_ptr]) \n\t"
          "mtlo %[vector4a], $ac0 \n\t"
          "extp %[Temp5], $ac2, 9 \n\t"

          "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
          "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
          "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
          "dpau.h.qbr $ac0, %[src_ptr_l2], %[vector1b] \n\t"
          "dpau.h.qbr $ac0, %[src_ptr_0], %[vector2b] \n\t"
          "dpsu.h.qbr $ac0, %[src_ptr_l1], %[vector3b] \n\t"

          "lbu %[src_ptr_l2], -18(%[src_ptr]) \n\t"
          "lbu %[src_ptr_l1], -2(%[src_ptr]) \n\t"
          "lbu %[src_ptr_0], 14(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r1], 30(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r2], 46(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r3], 62(%[src_ptr]) \n\t"
          "mtlo %[vector4a], $ac1 \n\t"
          "extp %[Temp6], $ac0, 9 \n\t"

          "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
          "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
          "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
          "dpau.h.qbr $ac1, %[src_ptr_l2], %[vector1b] \n\t"
          "dpau.h.qbr $ac1, %[src_ptr_0], %[vector2b] \n\t"
          "dpsu.h.qbr $ac1, %[src_ptr_l1], %[vector3b] \n\t"

          "lbu %[src_ptr_l2], -17(%[src_ptr]) \n\t"
          "lbu %[src_ptr_l1], -1(%[src_ptr]) \n\t"
          "lbu %[src_ptr_0], 15(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r1], 31(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r2], 47(%[src_ptr]) \n\t"
          "lbu %[src_ptr_r3], 63(%[src_ptr]) \n\t"
          "mtlo %[vector4a], $ac3 \n\t"
          "extp %[Temp7], $ac1, 9 \n\t"

          "append %[src_ptr_l2], %[src_ptr_r3], 8 \n\t"
          "append %[src_ptr_0], %[src_ptr_r1], 8 \n\t"
          "append %[src_ptr_l1], %[src_ptr_r2], 8 \n\t"
          "dpau.h.qbr $ac3, %[src_ptr_l2], %[vector1b] \n\t"
          "dpau.h.qbr $ac3, %[src_ptr_0], %[vector2b] \n\t"
          "dpsu.h.qbr $ac3, %[src_ptr_l1], %[vector3b] \n\t"
          "extp %[Temp8], $ac3, 9 \n\t"

          : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
            [Temp4] "=&r"(Temp4), [Temp5] "=&r"(Temp5), [Temp6] "=&r"(Temp6),
            [Temp7] "=&r"(Temp7), [Temp8] "=r"(Temp8),
            [src_ptr_l1] "=&r"(src_ptr_l1), [src_ptr_0] "=&r"(src_ptr_0),
            [src_ptr_r1] "=&r"(src_ptr_r1), [src_ptr_r2] "=&r"(src_ptr_r2),
            [src_ptr_l2] "=&r"(src_ptr_l2), [src_ptr_r3] "=&r"(src_ptr_r3)
          : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
            [vector3b] "r"(vector3b), [vector4a] "r"(vector4a),
            [src_ptr] "r"(src_ptr));

      /* advance to next intermediate row, clamp and store results */
      src_ptr += 16;
      output_ptr[8] = cm[Temp1];
      output_ptr[9] = cm[Temp2];
      output_ptr[10] = cm[Temp3];
      output_ptr[11] = cm[Temp4];
      output_ptr[12] = cm[Temp5];
      output_ptr[13] = cm[Temp6];
      output_ptr[14] = cm[Temp7];
      output_ptr[15] = cm[Temp8];

      output_ptr += output_pitch;
    }
  }
}
2366
/* 4x4 sub-pel prediction: horizontal first pass, then (when yoffset != 0)
 * a vertical second pass over the intermediate buffer. */
void vp8_sixtap_predict4x4_dspr2(unsigned char *RESTRICT src_ptr,
                                 int src_pixels_per_line, int xoffset,
                                 int yoffset, unsigned char *RESTRICT dst_ptr,
                                 int dst_pitch) {
  unsigned char temp[9 * 4]; /* intermediate rows for two-pass filtering */
  unsigned int pos = 16;

  /* program the accumulator extract position used by extp */
  __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
                       :
                       : [pos] "r"(pos));

  if (!yoffset) {
    /* horizontal-only case: the first pass writes straight to dst_ptr */
    vp8_filter_block2d_first_pass_4(src_ptr, dst_ptr, src_pixels_per_line, 4,
                                    xoffset, dst_pitch);
    return;
  }

  /* filter 1-D horizontally into temp, starting two rows above the block
     to provide vertical filter context... */
  vp8_filter_block2d_first_pass_4(src_ptr - (2 * src_pixels_per_line), temp,
                                  src_pixels_per_line, 9, xoffset, 4);
  /* ...then filter vertically into the destination */
  vp8_filter_block2d_second_pass4(temp + 8, dst_ptr, dst_pitch, yoffset);
}
2390
/* 8x8 sub-pel prediction. Dispatches on (xoffset, yoffset):
 *  - both non-zero: horizontal pass into FData (13 rows), vertical pass out;
 *  - yoffset only: raw 13-row copy into FData, then vertical pass;
 *  - xoffset only: horizontal pass straight to dst_ptr;
 *  - neither: plain 8x8 copy.
 *
 * Fix: the copy asm blocks advance %[src_ptr] with addu, so src_ptr must be
 * an in/out ("+r") operand — declaring a modified register as input-only is
 * invalid extended asm and lets the compiler assume the register is
 * unchanged. */
void vp8_sixtap_predict8x8_dspr2(unsigned char *RESTRICT src_ptr,
                                 int src_pixels_per_line, int xoffset,
                                 int yoffset, unsigned char *RESTRICT dst_ptr,
                                 int dst_pitch) {
  unsigned char FData[13 * 8]; /* Temp data buffer used in filtering */
  unsigned int pos, Temp1, Temp2;

  pos = 16;

  /* bit position for extract from acc */
  __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
                       :
                       : [pos] "r"(pos));

  if (yoffset) {
    src_ptr = src_ptr - (2 * src_pixels_per_line);

    if (xoffset) /* filter 1-D horizontally... */
      vp8_filter_block2d_first_pass_8_all(src_ptr, FData, src_pixels_per_line,
                                          13, xoffset, 8);

    else {
      /* prefetch src_ptr data to cache memory */
      prefetch_load(src_ptr + 2 * src_pixels_per_line);

      /* copy 13 source rows into FData (unaligned loads, aligned stores) */
      __asm__ __volatile__(
          "ulw %[Temp1], 0(%[src_ptr]) \n\t"
          "ulw %[Temp2], 4(%[src_ptr]) \n\t"
          "sw %[Temp1], 0(%[FData]) \n\t"
          "sw %[Temp2], 4(%[FData]) \n\t"
          "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"

          "ulw %[Temp1], 0(%[src_ptr]) \n\t"
          "ulw %[Temp2], 4(%[src_ptr]) \n\t"
          "sw %[Temp1], 8(%[FData]) \n\t"
          "sw %[Temp2], 12(%[FData]) \n\t"
          "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"

          "ulw %[Temp1], 0(%[src_ptr]) \n\t"
          "ulw %[Temp2], 4(%[src_ptr]) \n\t"
          "sw %[Temp1], 16(%[FData]) \n\t"
          "sw %[Temp2], 20(%[FData]) \n\t"
          "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"

          "ulw %[Temp1], 0(%[src_ptr]) \n\t"
          "ulw %[Temp2], 4(%[src_ptr]) \n\t"
          "sw %[Temp1], 24(%[FData]) \n\t"
          "sw %[Temp2], 28(%[FData]) \n\t"
          "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"

          "ulw %[Temp1], 0(%[src_ptr]) \n\t"
          "ulw %[Temp2], 4(%[src_ptr]) \n\t"
          "sw %[Temp1], 32(%[FData]) \n\t"
          "sw %[Temp2], 36(%[FData]) \n\t"
          "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"

          "ulw %[Temp1], 0(%[src_ptr]) \n\t"
          "ulw %[Temp2], 4(%[src_ptr]) \n\t"
          "sw %[Temp1], 40(%[FData]) \n\t"
          "sw %[Temp2], 44(%[FData]) \n\t"
          "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"

          "ulw %[Temp1], 0(%[src_ptr]) \n\t"
          "ulw %[Temp2], 4(%[src_ptr]) \n\t"
          "sw %[Temp1], 48(%[FData]) \n\t"
          "sw %[Temp2], 52(%[FData]) \n\t"
          "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"

          "ulw %[Temp1], 0(%[src_ptr]) \n\t"
          "ulw %[Temp2], 4(%[src_ptr]) \n\t"
          "sw %[Temp1], 56(%[FData]) \n\t"
          "sw %[Temp2], 60(%[FData]) \n\t"
          "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"

          "ulw %[Temp1], 0(%[src_ptr]) \n\t"
          "ulw %[Temp2], 4(%[src_ptr]) \n\t"
          "sw %[Temp1], 64(%[FData]) \n\t"
          "sw %[Temp2], 68(%[FData]) \n\t"
          "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"

          "ulw %[Temp1], 0(%[src_ptr]) \n\t"
          "ulw %[Temp2], 4(%[src_ptr]) \n\t"
          "sw %[Temp1], 72(%[FData]) \n\t"
          "sw %[Temp2], 76(%[FData]) \n\t"
          "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"

          "ulw %[Temp1], 0(%[src_ptr]) \n\t"
          "ulw %[Temp2], 4(%[src_ptr]) \n\t"
          "sw %[Temp1], 80(%[FData]) \n\t"
          "sw %[Temp2], 84(%[FData]) \n\t"
          "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"

          "ulw %[Temp1], 0(%[src_ptr]) \n\t"
          "ulw %[Temp2], 4(%[src_ptr]) \n\t"
          "sw %[Temp1], 88(%[FData]) \n\t"
          "sw %[Temp2], 92(%[FData]) \n\t"
          "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"

          "ulw %[Temp1], 0(%[src_ptr]) \n\t"
          "ulw %[Temp2], 4(%[src_ptr]) \n\t"
          "sw %[Temp1], 96(%[FData]) \n\t"
          "sw %[Temp2], 100(%[FData]) \n\t"

          /* src_ptr is advanced by the addu instructions above, so it must
             be a read-write operand */
          : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
            [src_ptr] "+r"(src_ptr)
          : [FData] "r"(FData),
            [src_pixels_per_line] "r"(src_pixels_per_line));
    }

    /* filter vertically... */
    vp8_filter_block2d_second_pass_8(FData + 16, dst_ptr, dst_pitch, 8, 8,
                                     yoffset);
  }

  /* if (yoffset == 0) vp8_filter_block2d_first_pass saves data to dst_ptr */
  else {
    if (xoffset)
      vp8_filter_block2d_first_pass_8_all(src_ptr, dst_ptr, src_pixels_per_line,
                                          8, xoffset, dst_pitch);

    else {
      /* copy from src buffer to dst buffer */
      __asm__ __volatile__(
          "ulw %[Temp1], 0(%[src_ptr]) \n\t"
          "ulw %[Temp2], 4(%[src_ptr]) \n\t"
          "sw %[Temp1], 0(%[dst_ptr]) \n\t"
          "sw %[Temp2], 4(%[dst_ptr]) \n\t"
          "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"

          "ulw %[Temp1], 0(%[src_ptr]) \n\t"
          "ulw %[Temp2], 4(%[src_ptr]) \n\t"
          "sw %[Temp1], 8(%[dst_ptr]) \n\t"
          "sw %[Temp2], 12(%[dst_ptr]) \n\t"
          "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"

          "ulw %[Temp1], 0(%[src_ptr]) \n\t"
          "ulw %[Temp2], 4(%[src_ptr]) \n\t"
          "sw %[Temp1], 16(%[dst_ptr]) \n\t"
          "sw %[Temp2], 20(%[dst_ptr]) \n\t"
          "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"

          "ulw %[Temp1], 0(%[src_ptr]) \n\t"
          "ulw %[Temp2], 4(%[src_ptr]) \n\t"
          "sw %[Temp1], 24(%[dst_ptr]) \n\t"
          "sw %[Temp2], 28(%[dst_ptr]) \n\t"
          "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"

          "ulw %[Temp1], 0(%[src_ptr]) \n\t"
          "ulw %[Temp2], 4(%[src_ptr]) \n\t"
          "sw %[Temp1], 32(%[dst_ptr]) \n\t"
          "sw %[Temp2], 36(%[dst_ptr]) \n\t"
          "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"

          "ulw %[Temp1], 0(%[src_ptr]) \n\t"
          "ulw %[Temp2], 4(%[src_ptr]) \n\t"
          "sw %[Temp1], 40(%[dst_ptr]) \n\t"
          "sw %[Temp2], 44(%[dst_ptr]) \n\t"
          "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"

          "ulw %[Temp1], 0(%[src_ptr]) \n\t"
          "ulw %[Temp2], 4(%[src_ptr]) \n\t"
          "sw %[Temp1], 48(%[dst_ptr]) \n\t"
          "sw %[Temp2], 52(%[dst_ptr]) \n\t"
          "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"

          "ulw %[Temp1], 0(%[src_ptr]) \n\t"
          "ulw %[Temp2], 4(%[src_ptr]) \n\t"
          "sw %[Temp1], 56(%[dst_ptr]) \n\t"
          "sw %[Temp2], 60(%[dst_ptr]) \n\t"

          /* src_ptr is advanced by the addu instructions above */
          : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
            [src_ptr] "+r"(src_ptr)
          : [dst_ptr] "r"(dst_ptr),
            [src_pixels_per_line] "r"(src_pixels_per_line));
    }
  }
}
2566
/* 8x4 sub-pel prediction; same dispatch structure as the 8x8 variant but
 * with 9 intermediate rows and 4 output rows.
 *
 * Fix: the copy asm blocks advance %[src_ptr] with addu, so src_ptr must be
 * an in/out ("+r") operand — declaring a modified register as input-only is
 * invalid extended asm and lets the compiler assume the register is
 * unchanged. */
void vp8_sixtap_predict8x4_dspr2(unsigned char *RESTRICT src_ptr,
                                 int src_pixels_per_line, int xoffset,
                                 int yoffset, unsigned char *RESTRICT dst_ptr,
                                 int dst_pitch) {
  unsigned char FData[9 * 8]; /* Temp data buffer used in filtering */
  unsigned int pos, Temp1, Temp2;

  pos = 16;

  /* bit position for extract from acc */
  __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
                       :
                       : [pos] "r"(pos));

  if (yoffset) {
    src_ptr = src_ptr - (2 * src_pixels_per_line);

    if (xoffset) /* filter 1-D horizontally... */
      vp8_filter_block2d_first_pass_8_all(src_ptr, FData, src_pixels_per_line,
                                          9, xoffset, 8);

    else {
      /* prefetch src_ptr data to cache memory */
      prefetch_load(src_ptr + 2 * src_pixels_per_line);

      /* copy 9 source rows into FData (unaligned loads, aligned stores) */
      __asm__ __volatile__(
          "ulw %[Temp1], 0(%[src_ptr]) \n\t"
          "ulw %[Temp2], 4(%[src_ptr]) \n\t"
          "sw %[Temp1], 0(%[FData]) \n\t"
          "sw %[Temp2], 4(%[FData]) \n\t"
          "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"

          "ulw %[Temp1], 0(%[src_ptr]) \n\t"
          "ulw %[Temp2], 4(%[src_ptr]) \n\t"
          "sw %[Temp1], 8(%[FData]) \n\t"
          "sw %[Temp2], 12(%[FData]) \n\t"
          "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"

          "ulw %[Temp1], 0(%[src_ptr]) \n\t"
          "ulw %[Temp2], 4(%[src_ptr]) \n\t"
          "sw %[Temp1], 16(%[FData]) \n\t"
          "sw %[Temp2], 20(%[FData]) \n\t"
          "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"

          "ulw %[Temp1], 0(%[src_ptr]) \n\t"
          "ulw %[Temp2], 4(%[src_ptr]) \n\t"
          "sw %[Temp1], 24(%[FData]) \n\t"
          "sw %[Temp2], 28(%[FData]) \n\t"
          "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"

          "ulw %[Temp1], 0(%[src_ptr]) \n\t"
          "ulw %[Temp2], 4(%[src_ptr]) \n\t"
          "sw %[Temp1], 32(%[FData]) \n\t"
          "sw %[Temp2], 36(%[FData]) \n\t"
          "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"

          "ulw %[Temp1], 0(%[src_ptr]) \n\t"
          "ulw %[Temp2], 4(%[src_ptr]) \n\t"
          "sw %[Temp1], 40(%[FData]) \n\t"
          "sw %[Temp2], 44(%[FData]) \n\t"
          "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"

          "ulw %[Temp1], 0(%[src_ptr]) \n\t"
          "ulw %[Temp2], 4(%[src_ptr]) \n\t"
          "sw %[Temp1], 48(%[FData]) \n\t"
          "sw %[Temp2], 52(%[FData]) \n\t"
          "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"

          "ulw %[Temp1], 0(%[src_ptr]) \n\t"
          "ulw %[Temp2], 4(%[src_ptr]) \n\t"
          "sw %[Temp1], 56(%[FData]) \n\t"
          "sw %[Temp2], 60(%[FData]) \n\t"
          "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"

          "ulw %[Temp1], 0(%[src_ptr]) \n\t"
          "ulw %[Temp2], 4(%[src_ptr]) \n\t"
          "sw %[Temp1], 64(%[FData]) \n\t"
          "sw %[Temp2], 68(%[FData]) \n\t"

          /* src_ptr is advanced by the addu instructions above, so it must
             be a read-write operand */
          : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
            [src_ptr] "+r"(src_ptr)
          : [FData] "r"(FData),
            [src_pixels_per_line] "r"(src_pixels_per_line));
    }

    /* filter vertically... */
    vp8_filter_block2d_second_pass_8(FData + 16, dst_ptr, dst_pitch, 4, 8,
                                     yoffset);
  }

  /* if (yoffset == 0) vp8_filter_block2d_first_pass saves data to dst_ptr */
  else {
    if (xoffset)
      vp8_filter_block2d_first_pass_8_all(src_ptr, dst_ptr, src_pixels_per_line,
                                          4, xoffset, dst_pitch);

    else {
      /* copy from src buffer to dst buffer */
      __asm__ __volatile__(
          "ulw %[Temp1], 0(%[src_ptr]) \n\t"
          "ulw %[Temp2], 4(%[src_ptr]) \n\t"
          "sw %[Temp1], 0(%[dst_ptr]) \n\t"
          "sw %[Temp2], 4(%[dst_ptr]) \n\t"
          "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"

          "ulw %[Temp1], 0(%[src_ptr]) \n\t"
          "ulw %[Temp2], 4(%[src_ptr]) \n\t"
          "sw %[Temp1], 8(%[dst_ptr]) \n\t"
          "sw %[Temp2], 12(%[dst_ptr]) \n\t"
          "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"

          "ulw %[Temp1], 0(%[src_ptr]) \n\t"
          "ulw %[Temp2], 4(%[src_ptr]) \n\t"
          "sw %[Temp1], 16(%[dst_ptr]) \n\t"
          "sw %[Temp2], 20(%[dst_ptr]) \n\t"
          "addu %[src_ptr], %[src_ptr], %[src_pixels_per_line] \n\t"

          "ulw %[Temp1], 0(%[src_ptr]) \n\t"
          "ulw %[Temp2], 4(%[src_ptr]) \n\t"
          "sw %[Temp1], 24(%[dst_ptr]) \n\t"
          "sw %[Temp2], 28(%[dst_ptr]) \n\t"

          /* src_ptr is advanced by the addu instructions above */
          : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
            [src_ptr] "+r"(src_ptr)
          : [dst_ptr] "r"(dst_ptr),
            [src_pixels_per_line] "r"(src_pixels_per_line));
    }
  }
}
2694
/* 16x16 sub-pel prediction. Even non-zero xoffsets use the 6-tap
 * horizontal filter, odd xoffsets the 4-tap filter, xoffset == 0 a plain
 * copy; a non-zero yoffset adds a vertical second pass over the
 * 21-row intermediate buffer. */
void vp8_sixtap_predict16x16_dspr2(unsigned char *RESTRICT src_ptr,
                                   int src_pixels_per_line, int xoffset,
                                   int yoffset, unsigned char *RESTRICT dst_ptr,
                                   int dst_pitch) {
  const unsigned short *vfilter = sub_pel_filterss[yoffset];
  unsigned char temp[21 * 16]; /* intermediate rows for two-pass filtering */
  unsigned int pos = 16;

  /* program the accumulator extract position used by extp */
  __asm__ __volatile__("wrdsp %[pos], 1 \n\t"
                       :
                       : [pos] "r"(pos));

  if (yoffset == 0) {
    /* horizontal-only case: the first pass writes straight to dst_ptr */
    if (xoffset == 2 || xoffset == 4 || xoffset == 6) {
      /* 6 tap filter */
      vp8_filter_block2d_first_pass16_6tap(
          src_ptr, dst_ptr, src_pixels_per_line, 16, xoffset, dst_pitch);
    } else if (xoffset == 1 || xoffset == 3 || xoffset == 5 || xoffset == 7) {
      /* 4 tap filter */
      vp8_filter_block2d_first_pass16_4tap(
          src_ptr, dst_ptr, src_pixels_per_line, 16, 21, xoffset, yoffset,
          dst_ptr, dst_pitch);
    }
    /* xoffset == 0 with yoffset == 0 is not produced by callers */
    return;
  }

  /* start two rows above the block for vertical filter context */
  src_ptr = src_ptr - (2 * src_pixels_per_line);

  if (xoffset == 2 || xoffset == 4 || xoffset == 6) {
    /* 6 tap horizontal filter into temp */
    vp8_filter_block2d_first_pass16_6tap(src_ptr, temp, src_pixels_per_line,
                                         21, xoffset, 16);
  } else if (xoffset == 1 || xoffset == 3 || xoffset == 5 || xoffset == 7) {
    /* 4 tap horizontal filter into temp */
    vp8_filter_block2d_first_pass16_4tap(src_ptr, temp, src_pixels_per_line, 16,
                                         21, xoffset, yoffset, dst_ptr,
                                         dst_pitch);
  } else if (xoffset == 0) {
    /* no horizontal filtering: just copy the 21 rows */
    vp8_filter_block2d_first_pass16_0(src_ptr, temp, src_pixels_per_line);
  }

  /* vertical second pass, skipping the two context rows */
  vp8_filter_block2d_second_pass161(temp + 32, dst_ptr, dst_pitch, vfilter);
}
2766
2767 #endif
2768