1 /*
2  *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 
12 #include <stdlib.h>
13 #include "vp8_rtcd.h"
14 #include "vpx_ports/mem.h"
15 
16 #if HAVE_DSPR2
/* Number of guard entries placed on each side of the 256-entry identity
 * range of ff_cropTbl. */
#define CROP_WIDTH 256

/*
 * Clamping lookup table, filled in by dsputil_static_init():
 *   [0, CROP_WIDTH)                      -> 0
 *   [CROP_WIDTH, CROP_WIDTH + 256)       -> 0..255 (identity)
 *   [CROP_WIDTH + 256, 2*CROP_WIDTH+256) -> 255
 * The filter code indexes it through (ff_cropTbl + CROP_WIDTH), so an index
 * in [-CROP_WIDTH, 255 + CROP_WIDTH] is saturated to 0..255.
 */
unsigned char ff_cropTbl[256 + 2 * CROP_WIDTH];
19 
/*
 * Filter constants indexed by sub-pixel offset (row 0..7); row 0 is all zero.
 * Each 16-bit entry packs two 8-bit values (high byte, low byte).  The bytes
 * are the absolute values of the six halfwords of the matching
 * sub_pel_filters_int row, paired outside-in: entry 0 = (|t0|, |t5|),
 * entry 1 = (|t1|, |t4|), entry 2 = (t2, t3).
 * NOTE(review): no consumer of this table is visible in this part of the
 * file -- confirm how the signs are re-applied where it is used.
 */
static const unsigned short sub_pel_filterss[8][3] =
{
    { 0x0000, 0x0000, 0x0000 },
    { 0x0000, 0x0601, 0x7b0c },
    { 0x0201, 0x0b08, 0x6c24 },
    { 0x0000, 0x0906, 0x5d32 },
    { 0x0303, 0x1010, 0x4d4d },
    { 0x0000, 0x0609, 0x325d },
    { 0x0102, 0x080b, 0x246c },
    { 0x0000, 0x0106, 0x0c7b },
};
31 
32 
/*
 * Filter constants indexed by sub-pixel offset (row 0..7); row 0 is all zero.
 * Each 32-bit word packs two 16-bit two's-complement values (high halfword,
 * low halfword), giving six values per row; the six values of every non-zero
 * row sum to 128.  sub_pel_filters_inv holds the same rows with the two
 * halfwords of every word swapped.
 * NOTE(review): no consumer of this table is visible in this part of the
 * file.
 */
static const int sub_pel_filters_int[8][3] =
{
    { 0x00000000, 0x00000000, 0x00000000 },
    { 0x0000fffa, 0x007b000c, 0xffff0000 },
    { 0x0002fff5, 0x006c0024, 0xfff80001 },
    { 0x0000fff7, 0x005d0032, 0xfffa0000 },
    { 0x0003fff0, 0x004d004d, 0xfff00003 },
    { 0x0000fffa, 0x0032005d, 0xfff70000 },
    { 0x0001fff8, 0x0024006c, 0xfff50002 },
    { 0x0000ffff, 0x000c007b, 0xfffa0000 },
};
44 
45 
/*
 * Filter constants indexed by sub-pixel offset (row 0..7); row 0 is all zero.
 * Each 32-bit word packs two 16-bit two's-complement values; the rows equal
 * those of sub_pel_filters_int with the two halfwords of every word swapped.
 *
 * The first-pass functions in this file pass these words to dpa.w.ph and use
 * the third word of a row to choose a code path: 0 means plain copy, a value
 * above 65536 (non-zero high halfword: rows 2, 4 and 6) selects the 6-tap
 * path, anything else (rows 1, 3, 5 and 7) selects the 4-tap path.
 */
static const int sub_pel_filters_inv[8][3] =
{
    { 0x00000000, 0x00000000, 0x00000000 },
    { 0xfffa0000, 0x000c007b, 0x0000ffff },
    { 0xfff50002, 0x0024006c, 0x0001fff8 },
    { 0xfff70000, 0x0032005d, 0x0000fffa },
    { 0xfff00003, 0x004d004d, 0x0003fff0 },
    { 0xfffa0000, 0x005d0032, 0x0000fff7 },
    { 0xfff80001, 0x006c0024, 0x0002fff5 },
    { 0xffff0000, 0x007b000c, 0x0000fffa },
};
57 
58 
/*
 * Four-value variant of sub_pel_filters_int, indexed by sub-pixel offset.
 * Only the odd rows (1, 3, 5, 7) are populated; they hold the four middle
 * values of the corresponding sub_pel_filters_int row (whose outer two values
 * are zero for those offsets), packed two 16-bit values per word.  Even rows
 * are zero.
 * NOTE(review): no consumer of this table is visible in this part of the
 * file.
 */
static const int sub_pel_filters_int_tap_4[8][2] =
{
    { 0x00000000, 0x00000000 },
    { 0xfffa007b, 0x000cffff },
    { 0x00000000, 0x00000000 },
    { 0xfff7005d, 0x0032fffa },
    { 0x00000000, 0x00000000 },
    { 0xfffa0032, 0x005dfff7 },
    { 0x00000000, 0x00000000 },
    { 0xffff000c, 0x007bfffa },
};
70 
71 
/*
 * Four-value filter constants used by the 4-tap paths of the first-pass
 * functions in this file (as dpa.w.ph operands), indexed by sub-pixel offset.
 * Only the odd rows (1, 3, 5, 7) are populated; each word packs two 16-bit
 * two's-complement values and equals the matching sub_pel_filters_int_tap_4
 * word with its halfwords swapped.  Even rows are zero.
 */
static const int sub_pel_filters_inv_tap_4[8][2] =
{
    { 0x00000000, 0x00000000 },
    { 0x007bfffa, 0xffff000c },
    { 0x00000000, 0x00000000 },
    { 0x005dfff7, 0xfffa0032 },
    { 0x00000000, 0x00000000 },
    { 0x0032fffa, 0xfff7005d },
    { 0x00000000, 0x00000000 },
    { 0x000cffff, 0xfffa007b },
};
83 
/*
 * Issue a MIPS "pref" instruction with hint 0 on the address in src.
 * The function has no return value and does not dereference src in C.
 *
 * The separate non-inline prototype is deliberate: under C99/C11 rules a
 * function whose every file-scope declaration says plain `inline` is only an
 * inline definition, so a call the compiler chooses not to inline (e.g. at
 * -O0) would be left unresolved at link time.  Declaring it once without
 * `inline` makes this translation unit emit the external definition while
 * keeping the inline hint and the external linkage callers may rely on.
 */
void prefetch_load(unsigned char *src);

inline void prefetch_load(unsigned char *src)
{
    __asm__ __volatile__ (
        "pref   0,  0(%[src])   \n\t"
        :
        : [src] "r" (src)
    );
}
92 
93 
/*
 * Issue a MIPS "pref" instruction with hint 1 on the address in dst.
 * The function has no return value and does not write through dst in C.
 *
 * The separate non-inline prototype is deliberate: under C99/C11 rules a
 * function whose every file-scope declaration says plain `inline` is only an
 * inline definition, so a call the compiler chooses not to inline (e.g. at
 * -O0) would be left unresolved at link time.  Declaring it once without
 * `inline` makes this translation unit emit the external definition while
 * keeping the inline hint and the external linkage callers may rely on.
 */
void prefetch_store(unsigned char *dst);

inline void prefetch_store(unsigned char *dst)
{
    __asm__ __volatile__ (
        "pref   1,  0(%[dst])   \n\t"
        :
        : [dst] "r" (dst)
    );
}
102 
dsputil_static_init(void)103 void dsputil_static_init(void)
104 {
105     int i;
106 
107     for (i = 0; i < 256; i++) ff_cropTbl[i + CROP_WIDTH] = i;
108 
109     for (i = 0; i < CROP_WIDTH; i++)
110     {
111         ff_cropTbl[i] = 0;
112         ff_cropTbl[i + CROP_WIDTH + 256] = 255;
113     }
114 }
115 
/*
 * First-pass filter that produces 4 output bytes per row for output_height
 * rows.
 *
 *   src_ptr              first source byte of the first row.  The filter
 *                        paths also load around it: unaligned word loads at
 *                        offsets -2, 2 and 3 (6-tap path) or -1 and 3
 *                        (4-tap path).
 *   dst_ptr              destination of the filtered (or copied) bytes.
 *   src_pixels_per_line  byte distance between consecutive source rows.
 *   output_height        number of rows to produce.
 *   xoffset              row index into sub_pel_filters_inv and
 *                        sub_pel_filters_inv_tap_4 (8 rows each); it is not
 *                        range-checked here.
 *   pitch                byte distance between destination rows on the two
 *                        filter paths.
 *
 * The third word of the selected sub_pel_filters_inv row picks the path:
 * zero -> plain 4-byte copy, above 65536 -> 6-tap path, otherwise -> 4-tap
 * path.  On the filter paths every accumulator is seeded with 64 (vector4a),
 * the products are added with dpa.w.ph, the result is read back with
 * "extp ..., 9" and then used as an index into cm to clamp it to a byte.
 *
 * NOTE(review): the copy path advances dst_ptr by a fixed 4 while the filter
 * paths advance it by pitch, so the paths only agree when pitch == 4 --
 * confirm against the callers.
 * NOTE(review): the asm statements store through dst_ptr with "sb" and use
 * $ac2/$ac3, but list no clobbers (no "memory", no accumulator registers).
 * NOTE(review): "extp" depends on DSPControl state that is not set anywhere
 * in this function -- presumably configured by the caller; confirm.
 */
vp8_filter_block2d_first_pass_4(unsigned char * RESTRICT src_ptr,unsigned char * RESTRICT dst_ptr,unsigned int src_pixels_per_line,unsigned int output_height,int xoffset,int pitch)116 void vp8_filter_block2d_first_pass_4
117 (
118     unsigned char *RESTRICT src_ptr,
119     unsigned char *RESTRICT dst_ptr,
120     unsigned int src_pixels_per_line,
121     unsigned int output_height,
122     int xoffset,
123     int pitch
124 )
125 {
126     unsigned int i;
127     int Temp1, Temp2, Temp3, Temp4;
128 
    /* value loaded into each accumulator before the products are added */
129     unsigned int vector4a = 64;
130     int vector1b, vector2b, vector3b;
131     unsigned int tp1, tp2, tn1, tn2;
132     unsigned int p1, p2, p3;
133     unsigned int n1, n2, n3;
    /* cm[v] is ff_cropTbl[v + CROP_WIDTH]; used below to clamp the sums */
134     unsigned char *cm = ff_cropTbl + CROP_WIDTH;
135 
136     vector3b = sub_pel_filters_inv[xoffset][2];
137 
138     /* if (xoffset == 0) we don't need any filtering */
    /* (row 0 is the only row of sub_pel_filters_inv whose third word is 0) */
139     if (vector3b == 0)
140     {
141         for (i = 0; i < output_height; i++)
142         {
143             /* prefetch src_ptr data to cache memory */
144             prefetch_load(src_ptr + src_pixels_per_line);
145             dst_ptr[0] = src_ptr[0];
146             dst_ptr[1] = src_ptr[1];
147             dst_ptr[2] = src_ptr[2];
148             dst_ptr[3] = src_ptr[3];
149 
150             /* next row... */
            /* NOTE(review): fixed stride of 4 here, pitch on the filter paths */
151             src_ptr += src_pixels_per_line;
152             dst_ptr += 4;
153         }
154     }
155     else
156     {
        /* third word above 65536 means its high halfword is non-zero */
157         if (vector3b > 65536)
158         {
159             /* 6 tap filter */
160 
161             vector1b = sub_pel_filters_inv[xoffset][0];
162             vector2b = sub_pel_filters_inv[xoffset][1];
163 
164             /* prefetch src_ptr data to cache memory */
            /* (issued once, before the loop, on this path) */
165             prefetch_load(src_ptr + src_pixels_per_line);
166 
167             for (i = output_height; i--;)
168             {
169                 /* apply filter with vectors pairs */
                /* Temp1/Temp2/Temp3/Temp4 end up in dst_ptr[0]/[1]/[2]/[3] */
170                 __asm__ __volatile__ (
171                     "ulw              %[tp1],      -2(%[src_ptr])                 \n\t"
172                     "ulw              %[tp2],      2(%[src_ptr])                  \n\t"
173 
174                     /* even 1. pixel */
175                     "mtlo             %[vector4a], $ac3                           \n\t"
176                     "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
177                     "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
178                     "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
179                     "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
180                     "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
181                     "dpa.w.ph         $ac3,        %[p3],          %[vector3b]    \n\t"
182 
183                     /* even 2. pixel */
184                     "mtlo             %[vector4a], $ac2                           \n\t"
185                     "preceu.ph.qbl    %[p1],       %[tp2]                         \n\t"
186                     "balign           %[tp2],      %[tp1],         3              \n\t"
187                     "extp             %[Temp1],    $ac3,           9              \n\t"
188                     "dpa.w.ph         $ac2,        %[p2],          %[vector1b]    \n\t"
189                     "dpa.w.ph         $ac2,        %[p3],          %[vector2b]    \n\t"
190                     "dpa.w.ph         $ac2,        %[p1],          %[vector3b]    \n\t"
191 
192                     /* odd 1. pixel */
193                     "ulw              %[tn2],      3(%[src_ptr])                  \n\t"
194                     "mtlo             %[vector4a], $ac3                           \n\t"
195                     "preceu.ph.qbr    %[n1],       %[tp2]                         \n\t"
196                     "preceu.ph.qbl    %[n2],       %[tp2]                         \n\t"
197                     "preceu.ph.qbr    %[n3],       %[tn2]                         \n\t"
198                     "extp             %[Temp3],    $ac2,           9              \n\t"
199                     "dpa.w.ph         $ac3,        %[n1],          %[vector1b]    \n\t"
200                     "dpa.w.ph         $ac3,        %[n2],          %[vector2b]    \n\t"
201                     "dpa.w.ph         $ac3,        %[n3],          %[vector3b]    \n\t"
202 
203                     /* odd 2. pixel */
204                     "mtlo             %[vector4a], $ac2                           \n\t"
205                     "preceu.ph.qbl    %[n1],       %[tn2]                         \n\t"
206                     "extp             %[Temp2],    $ac3,           9              \n\t"
207                     "dpa.w.ph         $ac2,        %[n2],          %[vector1b]    \n\t"
208                     "dpa.w.ph         $ac2,        %[n3],          %[vector2b]    \n\t"
209                     "dpa.w.ph         $ac2,        %[n1],          %[vector3b]    \n\t"
210                     "extp             %[Temp4],    $ac2,           9              \n\t"
211 
212                     /* clamp */
213                     "lbux             %[tp1],      %[Temp1](%[cm])                \n\t"
214                     "lbux             %[tn1],      %[Temp2](%[cm])                \n\t"
215                     "lbux             %[tp2],      %[Temp3](%[cm])                \n\t"
216                     "lbux             %[n2],       %[Temp4](%[cm])                \n\t"
217 
218                     /* store bytes */
219                     "sb               %[tp1],      0(%[dst_ptr])                  \n\t"
220                     "sb               %[tn1],      1(%[dst_ptr])                  \n\t"
221                     "sb               %[tp2],      2(%[dst_ptr])                  \n\t"
222                     "sb               %[n2],       3(%[dst_ptr])                  \n\t"
223 
224                     : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn1] "=&r" (tn1),
225                       [tn2] "=&r" (tn2), [p1] "=&r" (p1), [p2] "=&r" (p2),
226                       [p3] "=&r" (p3), [n1] "=&r" (n1), [n2] "=&r" (n2),
227                       [n3] "=&r" (n3), [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
228                       [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
229                     : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
230                       [vector4a] "r" (vector4a), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr),
231                       [vector3b] "r" (vector3b), [src_ptr] "r" (src_ptr)
232                 );
233 
234                 /* Next row... */
235                 src_ptr += src_pixels_per_line;
236                 dst_ptr += pitch;
237             }
238         }
239         else
240         {
241             /* 4 tap filter */
242 
243             vector1b = sub_pel_filters_inv_tap_4[xoffset][0];
244             vector2b = sub_pel_filters_inv_tap_4[xoffset][1];
245 
            /* no prefetch is issued on this path */
246             for (i = output_height; i--;)
247             {
248                 /* apply filter with vectors pairs */
                /* Temp1/Temp2/Temp3/Temp4 end up in dst_ptr[0]/[1]/[2]/[3] */
249                 __asm__ __volatile__ (
250                     "ulw              %[tp1],      -1(%[src_ptr])                 \n\t"
251                     "ulw              %[tp2],      3(%[src_ptr])                  \n\t"
252 
253                     /* even 1. pixel */
254                     "mtlo             %[vector4a], $ac3                           \n\t"
255                     "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
256                     "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
257                     "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
258                     "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
259                     "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
260 
261                     /* even 2. pixel */
262                     "mtlo             %[vector4a], $ac2                           \n\t"
263                     "dpa.w.ph         $ac2,        %[p2],          %[vector1b]    \n\t"
264                     "dpa.w.ph         $ac2,        %[p3],          %[vector2b]    \n\t"
265                     "extp             %[Temp1],    $ac3,           9              \n\t"
266 
267                     /* odd 1. pixel */
268                     "srl              %[tn1],      %[tp2],         8              \n\t"
269                     "balign           %[tp2],      %[tp1],         3              \n\t"
270                     "mtlo             %[vector4a], $ac3                           \n\t"
271                     "preceu.ph.qbr    %[n1],       %[tp2]                         \n\t"
272                     "preceu.ph.qbl    %[n2],       %[tp2]                         \n\t"
273                     "preceu.ph.qbr    %[n3],       %[tn1]                         \n\t"
274                     "extp             %[Temp3],    $ac2,           9              \n\t"
275                     "dpa.w.ph         $ac3,        %[n1],          %[vector1b]    \n\t"
276                     "dpa.w.ph         $ac3,        %[n2],          %[vector2b]    \n\t"
277 
278                     /* odd 2. pixel */
279                     "mtlo             %[vector4a], $ac2                           \n\t"
280                     "extp             %[Temp2],    $ac3,           9              \n\t"
281                     "dpa.w.ph         $ac2,        %[n2],          %[vector1b]    \n\t"
282                     "dpa.w.ph         $ac2,        %[n3],          %[vector2b]    \n\t"
283                     "extp             %[Temp4],    $ac2,           9              \n\t"
284 
285                     /* clamp and store results */
286                     "lbux             %[tp1],      %[Temp1](%[cm])                \n\t"
287                     "lbux             %[tn1],      %[Temp2](%[cm])                \n\t"
288                     "lbux             %[tp2],      %[Temp3](%[cm])                \n\t"
289                     "sb               %[tp1],      0(%[dst_ptr])                  \n\t"
290                     "sb               %[tn1],      1(%[dst_ptr])                  \n\t"
291                     "lbux             %[n2],       %[Temp4](%[cm])                \n\t"
292                     "sb               %[tp2],      2(%[dst_ptr])                  \n\t"
293                     "sb               %[n2],       3(%[dst_ptr])                  \n\t"
294 
295                     : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn1] "=&r" (tn1),
296                       [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3),
297                       [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3),
298                       [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
299                       [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
300                     : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
301                       [vector4a] "r" (vector4a), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr),
302                       [src_ptr] "r" (src_ptr)
303                 );
304                 /*  Next row... */
305                 src_ptr += src_pixels_per_line;
306                 dst_ptr += pitch;
307             }
308         }
309     }
310 }
311 
/*
 * First-pass filter that produces 8 output bytes per row for output_height
 * rows.
 *
 *   src_ptr              first source byte of the first row.  The filter
 *                        paths also load around it: unaligned word loads at
 *                        offsets -2, 2, 3, 6 and 7 (6-tap path) or -1, 3, 4,
 *                        7 and 8 (4-tap path).
 *   dst_ptr              destination of the filtered (or copied) bytes.
 *   src_pixels_per_line  byte distance between consecutive source rows.
 *   output_height        number of rows to produce.
 *   xoffset              0 selects a plain 8-byte copy; otherwise it is the
 *                        row index into sub_pel_filters_inv and
 *                        sub_pel_filters_inv_tap_4 (8 rows each).  It is not
 *                        range-checked here.
 *   pitch                byte distance between destination rows on the two
 *                        filter paths.
 *
 * For a non-zero xoffset the third word of the sub_pel_filters_inv row picks
 * the path: above 65536 -> 6-tap path, otherwise -> 4-tap path.  Each row is
 * computed by two asm statements: the first yields the values stored to
 * dst_ptr[0..3], the second those stored to dst_ptr[4..7].  Results are
 * clamped in C through cm.
 *
 * NOTE(review): the copy path advances dst_ptr by a fixed 8 while the filter
 * paths advance it by pitch, so the paths only agree when pitch == 8 --
 * confirm against the callers.
 * NOTE(review): the first asm statement of each pair ends by seeding $ac3
 * ("mtlo %[vector4a], $ac3") and the second one starts accumulating into
 * $ac3 without re-seeding it, so the code relies on the compiler leaving
 * $ac3 untouched across the C statements in between; no accumulator
 * clobbers are declared.
 * NOTE(review): "extp" depends on DSPControl state that is not set anywhere
 * in this function -- presumably configured by the caller; confirm.
 */
vp8_filter_block2d_first_pass_8_all(unsigned char * RESTRICT src_ptr,unsigned char * RESTRICT dst_ptr,unsigned int src_pixels_per_line,unsigned int output_height,int xoffset,int pitch)312 void vp8_filter_block2d_first_pass_8_all
313 (
314     unsigned char *RESTRICT src_ptr,
315     unsigned char *RESTRICT dst_ptr,
316     unsigned int src_pixels_per_line,
317     unsigned int output_height,
318     int xoffset,
319     int pitch
320 )
321 {
322     unsigned int i;
323     int Temp1, Temp2, Temp3, Temp4;
324 
    /* value loaded into each accumulator before the products are added */
325     unsigned int vector4a = 64;
326     unsigned int vector1b, vector2b, vector3b;
327     unsigned int tp1, tp2, tn1, tn2;
328     unsigned int p1, p2, p3, p4;
329     unsigned int n1, n2, n3, n4;
330 
    /* cm[v] is ff_cropTbl[v + CROP_WIDTH]; used below to clamp the sums */
331     unsigned char *cm = ff_cropTbl + CROP_WIDTH;
332 
333     /* if (xoffset == 0) we don't need any filtering */
334     if (xoffset == 0)
335     {
336         for (i = 0; i < output_height; i++)
337         {
338             /* prefetch src_ptr data to cache memory */
339             prefetch_load(src_ptr + src_pixels_per_line);
340 
341             dst_ptr[0] = src_ptr[0];
342             dst_ptr[1] = src_ptr[1];
343             dst_ptr[2] = src_ptr[2];
344             dst_ptr[3] = src_ptr[3];
345             dst_ptr[4] = src_ptr[4];
346             dst_ptr[5] = src_ptr[5];
347             dst_ptr[6] = src_ptr[6];
348             dst_ptr[7] = src_ptr[7];
349 
350             /* next row... */
            /* NOTE(review): fixed stride of 8 here, pitch on the filter paths */
351             src_ptr += src_pixels_per_line;
352             dst_ptr += 8;
353         }
354     }
355     else
356     {
357         vector3b = sub_pel_filters_inv[xoffset][2];
358 
        /* third word above 65536 means its high halfword is non-zero */
359         if (vector3b > 65536)
360         {
361             /* 6 tap filter */
362 
363             vector1b = sub_pel_filters_inv[xoffset][0];
364             vector2b = sub_pel_filters_inv[xoffset][1];
365 
366             for (i = output_height; i--;)
367             {
368                 /* prefetch src_ptr data to cache memory */
369                 prefetch_load(src_ptr + src_pixels_per_line);
370 
371                 /* apply filter with vectors pairs */
372                 __asm__ __volatile__ (
373                     "ulw              %[tp1],      -2(%[src_ptr])                 \n\t"
374                     "ulw              %[tp2],      2(%[src_ptr])                  \n\t"
375 
376                     /* even 1. pixel */
377                     "mtlo             %[vector4a], $ac3                           \n\t"
378                     "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
379                     "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
380                     "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
381                     "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
382                     "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
383                     "dpa.w.ph         $ac3,        %[p3],          %[vector3b]    \n\t"
384 
385                     /* even 2. pixel */
386                     "mtlo             %[vector4a], $ac2                           \n\t"
387                     "preceu.ph.qbl    %[p1],       %[tp2]                         \n\t"
388                     "dpa.w.ph         $ac2,        %[p2],          %[vector1b]    \n\t"
389                     "dpa.w.ph         $ac2,        %[p3],          %[vector2b]    \n\t"
390                     "dpa.w.ph         $ac2,        %[p1],          %[vector3b]    \n\t"
391 
392                     "balign           %[tp2],      %[tp1],         3              \n\t"
393                     "extp             %[Temp1],    $ac3,           9              \n\t"
394                     "ulw              %[tn2],      3(%[src_ptr])                  \n\t"
395 
396                     /* odd 1. pixel */
397                     "mtlo             %[vector4a], $ac3                           \n\t"
398                     "preceu.ph.qbr    %[n1],       %[tp2]                         \n\t"
399                     "preceu.ph.qbl    %[n2],       %[tp2]                         \n\t"
400                     "preceu.ph.qbr    %[n3],       %[tn2]                         \n\t"
401                     "extp             %[Temp3],    $ac2,           9              \n\t"
402                     "dpa.w.ph         $ac3,        %[n1],          %[vector1b]    \n\t"
403                     "dpa.w.ph         $ac3,        %[n2],          %[vector2b]    \n\t"
404                     "dpa.w.ph         $ac3,        %[n3],          %[vector3b]    \n\t"
405 
406                     /* odd 2. pixel */
407                     "mtlo             %[vector4a], $ac2                           \n\t"
408                     "preceu.ph.qbl    %[n1],       %[tn2]                         \n\t"
409                     "dpa.w.ph         $ac2,        %[n2],          %[vector1b]    \n\t"
410                     "dpa.w.ph         $ac2,        %[n3],          %[vector2b]    \n\t"
411                     "dpa.w.ph         $ac2,        %[n1],          %[vector3b]    \n\t"
                    /* set-up for the next asm statement: load tp1, seed $ac3, derive p2 */
412                     "ulw              %[tp1],      6(%[src_ptr])                  \n\t"
413                     "extp             %[Temp2],    $ac3,           9              \n\t"
414                     "mtlo             %[vector4a], $ac3                           \n\t"
415                     "preceu.ph.qbr    %[p2],       %[tp1]                         \n\t"
416                     "extp             %[Temp4],    $ac2,           9              \n\t"
417 
418                     : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn2] "=&r" (tn2),
419                       [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3),
420                       [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3),
421                       [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
422                       [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
423                     : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
424                       [vector4a] "r" (vector4a), [vector3b] "r" (vector3b),
425                       [src_ptr] "r" (src_ptr)
426                 );
427 
428                 /* clamp and store results */
                /* Temp1/Temp3 come from the "even" sums, Temp2/Temp4 from the "odd" ones */
429                 dst_ptr[0] = cm[Temp1];
430                 dst_ptr[1] = cm[Temp2];
431                 dst_ptr[2] = cm[Temp3];
432                 dst_ptr[3] = cm[Temp4];
433 
434                 /* next 4 pixels */
                /* continues from the previous asm: $ac3 is already seeded, and
                 * tp1, p1, p2, p3, n1, n3 are passed in as inputs */
435                 __asm__ __volatile__ (
436                     /* even 3. pixel */
437                     "dpa.w.ph         $ac3,        %[p3],          %[vector1b]    \n\t"
438                     "dpa.w.ph         $ac3,        %[p1],          %[vector2b]    \n\t"
439                     "dpa.w.ph         $ac3,        %[p2],          %[vector3b]    \n\t"
440 
441                     /* even 4. pixel */
442                     "mtlo             %[vector4a], $ac2                           \n\t"
443                     "preceu.ph.qbl    %[p4],       %[tp1]                         \n\t"
444                     "dpa.w.ph         $ac2,        %[p1],          %[vector1b]    \n\t"
445                     "dpa.w.ph         $ac2,        %[p2],          %[vector2b]    \n\t"
446                     "dpa.w.ph         $ac2,        %[p4],          %[vector3b]    \n\t"
447 
448                     "ulw              %[tn1],      7(%[src_ptr])                  \n\t"
449                     "extp             %[Temp1],    $ac3,           9              \n\t"
450 
451                     /* odd 3. pixel */
452                     "mtlo             %[vector4a], $ac3                           \n\t"
453                     "preceu.ph.qbr    %[n2],       %[tn1]                         \n\t"
454                     "dpa.w.ph         $ac3,        %[n3],          %[vector1b]    \n\t"
455                     "dpa.w.ph         $ac3,        %[n1],          %[vector2b]    \n\t"
456                     "dpa.w.ph         $ac3,        %[n2],          %[vector3b]    \n\t"
457                     "extp             %[Temp3],    $ac2,           9              \n\t"
458 
459                     /* odd 4. pixel */
460                     "mtlo             %[vector4a], $ac2                           \n\t"
461                     "preceu.ph.qbl    %[n4],       %[tn1]                         \n\t"
462                     "dpa.w.ph         $ac2,        %[n1],          %[vector1b]    \n\t"
463                     "dpa.w.ph         $ac2,        %[n2],          %[vector2b]    \n\t"
464                     "dpa.w.ph         $ac2,        %[n4],          %[vector3b]    \n\t"
465                     "extp             %[Temp2],    $ac3,           9              \n\t"
466                     "extp             %[Temp4],    $ac2,           9              \n\t"
467 
468                     : [tn1] "=&r" (tn1), [n2] "=&r" (n2),
469                       [p4] "=&r" (p4), [n4] "=&r" (n4),
470                       [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
471                       [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
472                     : [tp1] "r" (tp1), [vector1b] "r" (vector1b), [p2] "r" (p2),
473                       [vector2b] "r" (vector2b), [n1] "r" (n1), [p1] "r" (p1),
474                       [vector4a] "r" (vector4a), [vector3b] "r" (vector3b),
475                       [p3] "r" (p3), [n3] "r" (n3), [src_ptr] "r" (src_ptr)
476                 );
477 
478                 /* clamp and store results */
479                 dst_ptr[4] = cm[Temp1];
480                 dst_ptr[5] = cm[Temp2];
481                 dst_ptr[6] = cm[Temp3];
482                 dst_ptr[7] = cm[Temp4];
483 
                /* next row... */
484                 src_ptr += src_pixels_per_line;
485                 dst_ptr += pitch;
486             }
487         }
488         else
489         {
490             /* 4 tap filter */
491 
492             vector1b = sub_pel_filters_inv_tap_4[xoffset][0];
493             vector2b = sub_pel_filters_inv_tap_4[xoffset][1];
494 
495             for (i = output_height; i--;)
496             {
497                 /* prefetch src_ptr data to cache memory */
498                 prefetch_load(src_ptr + src_pixels_per_line);
499 
500                 /* apply filter with vectors pairs */
501                 __asm__ __volatile__ (
502                     "ulw              %[tp1],      -1(%[src_ptr])                 \n\t"
503 
504                     /* even 1. pixel */
505                     "mtlo             %[vector4a], $ac3                           \n\t"
506                     "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
507                     "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
508                     "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
509                     "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
510 
511                     "ulw              %[tp2],      3(%[src_ptr])                  \n\t"
512 
513                     /* even 2. pixel  */
514                     "mtlo             %[vector4a], $ac2                           \n\t"
515                     "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
516                     "preceu.ph.qbl    %[p4],       %[tp2]                         \n\t"
517                     "dpa.w.ph         $ac2,        %[p2],          %[vector1b]    \n\t"
518                     "dpa.w.ph         $ac2,        %[p3],          %[vector2b]    \n\t"
519                     "extp             %[Temp1],    $ac3,           9              \n\t"
520 
521                     "balign           %[tp2],      %[tp1],         3              \n\t"
522 
523                     /* odd 1. pixel */
524                     "mtlo             %[vector4a], $ac3                           \n\t"
525                     "preceu.ph.qbr    %[n1],       %[tp2]                         \n\t"
526                     "preceu.ph.qbl    %[n2],       %[tp2]                         \n\t"
527                     "dpa.w.ph         $ac3,        %[n1],          %[vector1b]    \n\t"
528                     "dpa.w.ph         $ac3,        %[n2],          %[vector2b]    \n\t"
529                     "extp             %[Temp3],    $ac2,           9              \n\t"
530 
531                     "ulw              %[tn2],      4(%[src_ptr])                  \n\t"
532 
533                     /* odd 2. pixel */
534                     "mtlo             %[vector4a], $ac2                           \n\t"
535                     "preceu.ph.qbr    %[n3],       %[tn2]                         \n\t"
536                     "preceu.ph.qbl    %[n4],       %[tn2]                         \n\t"
537                     "dpa.w.ph         $ac2,        %[n2],          %[vector1b]    \n\t"
538                     "dpa.w.ph         $ac2,        %[n3],          %[vector2b]    \n\t"
                    /* set-up for the next asm statement: load tp1 and seed $ac3 */
539                     "ulw              %[tp1],      7(%[src_ptr])                  \n\t"
540                     "extp             %[Temp2],    $ac3,           9              \n\t"
541                     "mtlo             %[vector4a], $ac3                           \n\t"
542                     "extp             %[Temp4],    $ac2,           9              \n\t"
543 
544                     : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
545                       [tn2] "=&r" (tn2), [p1] "=&r" (p1), [p2] "=&r" (p2),
546                       [p3] "=&r" (p3), [p4] "=&r" (p4), [n1] "=&r" (n1),
547                       [n2] "=&r" (n2), [n3] "=&r" (n3), [n4] "=&r" (n4),
548                       [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
549                       [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
550                     : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
551                       [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr)
552                 );
553 
554                 /* clamp and store results */
                /* Temp1/Temp3 come from the "even" sums, Temp2/Temp4 from the "odd" ones */
555                 dst_ptr[0] = cm[Temp1];
556                 dst_ptr[1] = cm[Temp2];
557                 dst_ptr[2] = cm[Temp3];
558                 dst_ptr[3] = cm[Temp4];
559 
560                 /* next 4 pixels */
                /* continues from the previous asm: $ac3 is already seeded, and
                 * tp1, p3, p4, n3, n4 are passed in as inputs */
561                 __asm__ __volatile__ (
562                     /* even 3. pixel */
563                     "dpa.w.ph         $ac3,        %[p3],          %[vector1b]    \n\t"
564                     "dpa.w.ph         $ac3,        %[p4],          %[vector2b]    \n\t"
565 
566                     /* even 4. pixel */
567                     "mtlo             %[vector4a], $ac2                           \n\t"
568                     "preceu.ph.qbr    %[p2],       %[tp1]                         \n\t"
569                     "dpa.w.ph         $ac2,        %[p4],          %[vector1b]    \n\t"
570                     "dpa.w.ph         $ac2,        %[p2],          %[vector2b]    \n\t"
571                     "extp             %[Temp1],    $ac3,           9              \n\t"
572 
573                     /* odd 3. pixel */
574                     "mtlo             %[vector4a], $ac3                           \n\t"
575                     "dpa.w.ph         $ac3,        %[n3],          %[vector1b]    \n\t"
576                     "dpa.w.ph         $ac3,        %[n4],          %[vector2b]    \n\t"
577                     "ulw              %[tn1],      8(%[src_ptr])                  \n\t"
578                     "extp             %[Temp3],    $ac2,           9              \n\t"
579 
580                     /* odd 4. pixel */
581                     "mtlo             %[vector4a], $ac2                           \n\t"
582                     "preceu.ph.qbr    %[n2],       %[tn1]                         \n\t"
583                     "dpa.w.ph         $ac2,        %[n4],          %[vector1b]    \n\t"
584                     "dpa.w.ph         $ac2,        %[n2],          %[vector2b]    \n\t"
585                     "extp             %[Temp2],    $ac3,           9              \n\t"
586                     "extp             %[Temp4],    $ac2,           9              \n\t"
587 
588                     : [tn1] "=&r" (tn1), [p2] "=&r" (p2), [n2] "=&r" (n2),
589                       [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
590                       [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
591                     : [tp1] "r" (tp1), [p3] "r" (p3), [p4] "r" (p4),
592                       [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
593                       [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr),
594                       [n3] "r" (n3), [n4] "r" (n4)
595                 );
596 
597                 /* clamp and store results */
598                 dst_ptr[4] = cm[Temp1];
599                 dst_ptr[5] = cm[Temp2];
600                 dst_ptr[6] = cm[Temp3];
601                 dst_ptr[7] = cm[Temp4];
602 
603                 /* next row... */
604                 src_ptr += src_pixels_per_line;
605                 dst_ptr += pitch;
606             }
607         }
608     }
609 }
610 
611 
/*
 * First (horizontal) pass of the 6-tap sub-pixel filter for a block that is
 * 16 pixels wide, implemented with MIPS DSP ASE inline assembly.
 *
 * For each of output_height rows, the bytes src_ptr[-2] .. src_ptr[18] are
 * read and 16 filtered bytes are written to dst_ptr[0] .. dst_ptr[15].
 *
 *   src_ptr             - first pixel of the first source row
 *   dst_ptr             - first byte of the first destination row
 *   src_pixels_per_line - source row stride in bytes
 *   output_height       - number of rows to produce
 *   xoffset             - row index into sub_pel_filters_inv[] (0..7)
 *   pitch               - destination row stride in bytes
 *
 * Every output accumulator is seeded with 64 (vector4a) and the result is
 * pulled out with "extp ..., 9".  extp takes its bit position from the
 * DSPControl register, which this function does not set -- presumably the
 * caller programs it with wrdsp beforehand; confirm against the caller.
 *
 * NOTE: accumulator $ac3 is seeded at the end of one asm statement and
 * consumed at the start of the next one, so the code between the statements
 * must not touch the DSP accumulators.
 */
void vp8_filter_block2d_first_pass16_6tap
(
    unsigned char *RESTRICT src_ptr,
    unsigned char *RESTRICT dst_ptr,
    unsigned int src_pixels_per_line,
    unsigned int output_height,
    int xoffset,
    int pitch
)
{
    unsigned int i;
    int Temp1, Temp2, Temp3, Temp4;

    unsigned int vector4a;
    unsigned int vector1b, vector2b, vector3b;
    unsigned int tp1, tp2, tn1, tn2;
    unsigned int p1, p2, p3, p4;
    unsigned int n1, n2, n3, n4;
    /* lookup table indexed by the extracted filter result before storing */
    unsigned char *cm = ff_cropTbl + CROP_WIDTH;

    /* three packed coefficient pairs for the selected sub-pixel offset */
    vector1b = sub_pel_filters_inv[xoffset][0];
    vector2b = sub_pel_filters_inv[xoffset][1];
    vector3b = sub_pel_filters_inv[xoffset][2];
    /* initial accumulator value for every output pixel */
    vector4a = 64;

    for (i = output_height; i--;)
    {
        /* prefetch src_ptr data to cache memory */
        prefetch_load(src_ptr + src_pixels_per_line);

        /* apply filter with vectors pairs: output pixels 0..3 */
        __asm__ __volatile__ (
            "ulw                %[tp1],      -2(%[src_ptr])                 \n\t"
            "ulw                %[tp2],      2(%[src_ptr])                  \n\t"

            /* even 1. pixel */
            "mtlo               %[vector4a], $ac3                           \n\t"
            "preceu.ph.qbr      %[p1],       %[tp1]                         \n\t"
            "preceu.ph.qbl      %[p2],       %[tp1]                         \n\t"
            "preceu.ph.qbr      %[p3],       %[tp2]                         \n\t"
            "dpa.w.ph           $ac3,        %[p1],           %[vector1b]   \n\t"
            "dpa.w.ph           $ac3,        %[p2],           %[vector2b]   \n\t"
            "dpa.w.ph           $ac3,        %[p3],           %[vector3b]   \n\t"

            /* even 2. pixel */
            "mtlo               %[vector4a], $ac2                           \n\t"
            "preceu.ph.qbl      %[p1],       %[tp2]                         \n\t"
            "dpa.w.ph           $ac2,        %[p2],           %[vector1b]   \n\t"
            "dpa.w.ph           $ac2,        %[p3],           %[vector2b]   \n\t"
            "dpa.w.ph           $ac2,        %[p1],           %[vector3b]   \n\t"

            "balign             %[tp2],      %[tp1],          3             \n\t"
            "ulw                %[tn2],      3(%[src_ptr])                  \n\t"
            "extp               %[Temp1],    $ac3,            9             \n\t"

            /* odd 1. pixel */
            "mtlo               %[vector4a], $ac3                           \n\t"
            "preceu.ph.qbr      %[n1],       %[tp2]                         \n\t"
            "preceu.ph.qbl      %[n2],       %[tp2]                         \n\t"
            "preceu.ph.qbr      %[n3],       %[tn2]                         \n\t"
            "extp               %[Temp3],    $ac2,            9             \n\t"
            "dpa.w.ph           $ac3,        %[n1],           %[vector1b]   \n\t"
            "dpa.w.ph           $ac3,        %[n2],           %[vector2b]   \n\t"
            "dpa.w.ph           $ac3,        %[n3],           %[vector3b]   \n\t"

            /* odd 2. pixel */
            "mtlo               %[vector4a], $ac2                           \n\t"
            "preceu.ph.qbl      %[n1],       %[tn2]                         \n\t"
            "dpa.w.ph           $ac2,        %[n2],           %[vector1b]   \n\t"
            "dpa.w.ph           $ac2,        %[n3],           %[vector2b]   \n\t"
            "dpa.w.ph           $ac2,        %[n1],           %[vector3b]   \n\t"
            "ulw                %[tp1],      6(%[src_ptr])                  \n\t"
            "extp               %[Temp2],    $ac3,            9             \n\t"
            /* seed $ac3 and unpack p2 for "even 3. pixel" in the next statement */
            "mtlo               %[vector4a], $ac3                           \n\t"
            "preceu.ph.qbr      %[p2],       %[tp1]                         \n\t"
            "extp               %[Temp4],    $ac2,            9             \n\t"

            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn2] "=&r" (tn2),
              [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3),
              [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3),
              [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
              [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
            : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
              [vector4a] "r" (vector4a), [vector3b] "r" (vector3b),
              [src_ptr] "r" (src_ptr)
            : "memory"  /* the asm reads the source row through src_ptr */
        );

        /* clamp and store results */
        dst_ptr[0] = cm[Temp1];
        dst_ptr[1] = cm[Temp2];
        dst_ptr[2] = cm[Temp3];
        dst_ptr[3] = cm[Temp4];

        /* next 4 pixels: output pixels 4..7 */
        __asm__ __volatile__ (
            /* even 3. pixel */
            "dpa.w.ph           $ac3,        %[p3],           %[vector1b]   \n\t"
            "dpa.w.ph           $ac3,        %[p1],           %[vector2b]   \n\t"
            "dpa.w.ph           $ac3,        %[p2],           %[vector3b]   \n\t"

            /* even 4. pixel */
            "mtlo               %[vector4a], $ac2                           \n\t"
            "preceu.ph.qbl      %[p4],       %[tp1]                         \n\t"
            "dpa.w.ph           $ac2,        %[p1],           %[vector1b]   \n\t"
            "dpa.w.ph           $ac2,        %[p2],           %[vector2b]   \n\t"
            "dpa.w.ph           $ac2,        %[p4],           %[vector3b]   \n\t"
            "ulw                %[tn1],      7(%[src_ptr])                  \n\t"
            "extp               %[Temp1],    $ac3,            9             \n\t"

            /* odd 3. pixel */
            "mtlo               %[vector4a], $ac3                           \n\t"
            "preceu.ph.qbr      %[n2],       %[tn1]                         \n\t"
            "dpa.w.ph           $ac3,        %[n3],           %[vector1b]   \n\t"
            "dpa.w.ph           $ac3,        %[n1],           %[vector2b]   \n\t"
            "dpa.w.ph           $ac3,        %[n2],           %[vector3b]   \n\t"
            "extp               %[Temp3],    $ac2,            9             \n\t"

            /* odd 4. pixel */
            "mtlo               %[vector4a], $ac2                           \n\t"
            "preceu.ph.qbl      %[n4],       %[tn1]                         \n\t"
            "dpa.w.ph           $ac2,        %[n1],           %[vector1b]   \n\t"
            "dpa.w.ph           $ac2,        %[n2],           %[vector2b]   \n\t"
            "dpa.w.ph           $ac2,        %[n4],           %[vector3b]   \n\t"
            "ulw                %[tp2],      10(%[src_ptr])                 \n\t"
            "extp               %[Temp2],    $ac3,            9             \n\t"
            /* seed $ac3 and unpack p1 for "even 5. pixel" in the next statement */
            "mtlo               %[vector4a], $ac3                           \n\t"
            "preceu.ph.qbr      %[p1],       %[tp2]                         \n\t"
            "extp               %[Temp4],    $ac2,            9             \n\t"

            /* p1 is read first and then overwritten for the next statement,
             * so it has to be a read-write operand */
            : [tn1] "=&r" (tn1), [tp2] "=&r" (tp2), [n2] "=&r" (n2),
              [p4] "=&r" (p4), [n4] "=&r" (n4), [p1] "+&r" (p1),
              [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
              [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
            : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
              [tp1] "r" (tp1), [n1] "r" (n1),
              [vector4a] "r" (vector4a), [p2] "r" (p2), [vector3b] "r" (vector3b),
              [p3] "r" (p3), [n3] "r" (n3), [src_ptr] "r" (src_ptr)
            : "memory"
        );

        /* clamp and store results */
        dst_ptr[4] = cm[Temp1];
        dst_ptr[5] = cm[Temp2];
        dst_ptr[6] = cm[Temp3];
        dst_ptr[7] = cm[Temp4];

        /* next 4 pixels: output pixels 8..11 */
        __asm__ __volatile__ (
            /* even 5. pixel */
            "dpa.w.ph           $ac3,        %[p2],           %[vector1b]   \n\t"
            "dpa.w.ph           $ac3,        %[p4],           %[vector2b]   \n\t"
            "dpa.w.ph           $ac3,        %[p1],           %[vector3b]   \n\t"

            /* even 6. pixel */
            "mtlo               %[vector4a], $ac2                           \n\t"
            "preceu.ph.qbl      %[p3],       %[tp2]                         \n\t"
            "dpa.w.ph           $ac2,        %[p4],           %[vector1b]   \n\t"
            "dpa.w.ph           $ac2,        %[p1],           %[vector2b]   \n\t"
            "dpa.w.ph           $ac2,        %[p3],           %[vector3b]   \n\t"

            "ulw                %[tn1],      11(%[src_ptr])                 \n\t"
            "extp               %[Temp1],    $ac3,            9             \n\t"

            /* odd 5. pixel */
            "mtlo               %[vector4a], $ac3                           \n\t"
            "preceu.ph.qbr      %[n1],       %[tn1]                         \n\t"
            "dpa.w.ph           $ac3,        %[n2],           %[vector1b]   \n\t"
            "dpa.w.ph           $ac3,        %[n4],           %[vector2b]   \n\t"
            "dpa.w.ph           $ac3,        %[n1],           %[vector3b]   \n\t"
            "extp               %[Temp3],    $ac2,            9             \n\t"

            /* odd 6. pixel */
            "mtlo               %[vector4a], $ac2                           \n\t"
            "preceu.ph.qbl      %[n3],       %[tn1]                         \n\t"
            "dpa.w.ph           $ac2,        %[n4],           %[vector1b]   \n\t"
            "dpa.w.ph           $ac2,        %[n1],           %[vector2b]   \n\t"
            "dpa.w.ph           $ac2,        %[n3],           %[vector3b]   \n\t"
            "ulw                %[tp1],      14(%[src_ptr])                 \n\t"
            "extp               %[Temp2],    $ac3,            9             \n\t"
            /* seed $ac3 and unpack p4 for "even 7. pixel" in the next statement */
            "mtlo               %[vector4a], $ac3                           \n\t"
            "preceu.ph.qbr      %[p4],       %[tp1]                         \n\t"
            "extp               %[Temp4],    $ac2,            9             \n\t"

            /* p4 is read first and then overwritten for the next statement,
             * so it has to be a read-write operand */
            : [tn1] "=&r" (tn1), [tp1] "=&r" (tp1),
              [n1] "=&r" (n1), [p3] "=&r" (p3), [n3] "=&r" (n3),
              [p4] "+&r" (p4),
              [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
              [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4)
            : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
              [tp2] "r" (tp2), [p2] "r" (p2), [n2] "r" (n2),
              [n4] "r" (n4), [p1] "r" (p1), [src_ptr] "r" (src_ptr),
              [vector4a] "r" (vector4a), [vector3b] "r" (vector3b)
            : "memory"
        );

        /* clamp and store results */
        dst_ptr[8] = cm[Temp1];
        dst_ptr[9] = cm[Temp2];
        dst_ptr[10] = cm[Temp3];
        dst_ptr[11] = cm[Temp4];

        /* next 4 pixels: output pixels 12..15, stored from inside the asm */
        __asm__ __volatile__ (
            /* even 7. pixel */
            "dpa.w.ph           $ac3,        %[p1],           %[vector1b]   \n\t"
            "dpa.w.ph           $ac3,        %[p3],           %[vector2b]   \n\t"
            "dpa.w.ph           $ac3,        %[p4],           %[vector3b]   \n\t"

            /* even 8. pixel */
            "mtlo               %[vector4a], $ac2                           \n\t"
            "preceu.ph.qbl      %[p2],       %[tp1]                         \n\t"
            "dpa.w.ph           $ac2,        %[p3],           %[vector1b]   \n\t"
            "dpa.w.ph           $ac2,        %[p4],           %[vector2b]   \n\t"
            "dpa.w.ph           $ac2,        %[p2],           %[vector3b]   \n\t"
            "ulw                %[tn1],      15(%[src_ptr])                 \n\t"
            "extp               %[Temp1],    $ac3,            9             \n\t"

            /* odd 7. pixel */
            "mtlo               %[vector4a], $ac3                           \n\t"
            "preceu.ph.qbr      %[n4],       %[tn1]                         \n\t"
            "dpa.w.ph           $ac3,        %[n1],           %[vector1b]   \n\t"
            "dpa.w.ph           $ac3,        %[n3],           %[vector2b]   \n\t"
            "dpa.w.ph           $ac3,        %[n4],           %[vector3b]   \n\t"
            "extp               %[Temp3],    $ac2,            9             \n\t"

            /* odd 8. pixel */
            "mtlo               %[vector4a], $ac2                           \n\t"
            "preceu.ph.qbl      %[n2],       %[tn1]                         \n\t"
            "dpa.w.ph           $ac2,        %[n3],           %[vector1b]   \n\t"
            "dpa.w.ph           $ac2,        %[n4],           %[vector2b]   \n\t"
            "dpa.w.ph           $ac2,        %[n2],           %[vector3b]   \n\t"
            "extp               %[Temp2],    $ac3,            9             \n\t"
            "extp               %[Temp4],    $ac2,            9             \n\t"

            /* clamp and store results (tp1, tn1, p2, n2 are reused as scratch) */
            "lbux               %[tp1],      %[Temp1](%[cm])                \n\t"
            "lbux               %[tn1],      %[Temp2](%[cm])                \n\t"
            "lbux               %[p2],       %[Temp3](%[cm])                \n\t"
            "sb                 %[tp1],      12(%[dst_ptr])                 \n\t"
            "sb                 %[tn1],      13(%[dst_ptr])                 \n\t"
            "lbux               %[n2],       %[Temp4](%[cm])                \n\t"
            "sb                 %[p2],       14(%[dst_ptr])                 \n\t"
            "sb                 %[n2],       15(%[dst_ptr])                 \n\t"

            /* tp1 is read and later overwritten -> read-write operand.
             * Temp4 is written before cm/dst_ptr are last read, so it must
             * be earlyclobber to keep it out of their registers. */
            : [tn1] "=&r" (tn1), [p2] "=&r" (p2), [n2] "=&r" (n2), [n4] "=&r" (n4),
              [tp1] "+&r" (tp1),
              [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
              [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
            : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
              [p4] "r" (p4), [n1] "r" (n1), [p1] "r" (p1),
              [vector4a] "r" (vector4a), [vector3b] "r" (vector3b), [p3] "r" (p3),
              [n3] "r" (n3), [src_ptr] "r" (src_ptr),
              [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
            : "memory"  /* the asm stores through dst_ptr */
        );

        /* next row... */
        src_ptr += src_pixels_per_line;
        dst_ptr += pitch;
    }
}
867 
868 
/*
 * Copy-only variant of the 16-wide first pass: no filtering is applied.
 * Presumably used when no horizontal filtering is needed -- confirm against
 * the caller.
 *
 * Copies 21 rows (7 iterations x 3 rows) of 16 bytes each.  Source rows are
 * src_pixels_per_line bytes apart; destination rows are packed back to back,
 * 16 bytes per row (336 bytes in total).
 *
 *   src_ptr             - first byte of the first source row; loaded with
 *                         ulw, so it does not have to be word aligned
 *   output_ptr          - destination buffer; written with sw, so it has to
 *                         be 4-byte aligned
 *   src_pixels_per_line - source row stride in bytes
 */
void vp8_filter_block2d_first_pass16_0
(
    unsigned char *RESTRICT src_ptr,
    unsigned char *RESTRICT output_ptr,
    unsigned int src_pixels_per_line
)
{
    int Temp1, Temp2, Temp3, Temp4;
    int i;

    /* prefetch hint for the destination buffer that is about to be written */
    prefetch_store(output_ptr + 32);

    /* copy memory from src buffer to dst buffer, three rows per iteration */
    for (i = 0; i < 7; i++)
    {
        /* row 0 of this iteration -> output bytes 0..15 */
        __asm__ __volatile__ (
            "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
            "ulw    %[Temp2],   4(%[src_ptr])                               \n\t"
            "ulw    %[Temp3],   8(%[src_ptr])                               \n\t"
            "ulw    %[Temp4],   12(%[src_ptr])                              \n\t"
            "sw     %[Temp1],   0(%[output_ptr])                            \n\t"
            "sw     %[Temp2],   4(%[output_ptr])                            \n\t"
            "sw     %[Temp3],   8(%[output_ptr])                            \n\t"
            "sw     %[Temp4],   12(%[output_ptr])                           \n\t"
            /* step to the next source row */
            "addu   %[src_ptr], %[src_ptr],        %[src_pixels_per_line]   \n\t"

            : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
              [Temp4] "=&r" (Temp4), [src_ptr] "+r" (src_ptr)
            : [src_pixels_per_line] "r" (src_pixels_per_line),
              [output_ptr] "r" (output_ptr)
            : "memory"  /* the asm reads *src_ptr and writes *output_ptr */
        );

        /* row 1 of this iteration -> output bytes 16..31 */
        __asm__ __volatile__ (
            "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
            "ulw    %[Temp2],   4(%[src_ptr])                               \n\t"
            "ulw    %[Temp3],   8(%[src_ptr])                               \n\t"
            "ulw    %[Temp4],   12(%[src_ptr])                              \n\t"
            "sw     %[Temp1],   16(%[output_ptr])                           \n\t"
            "sw     %[Temp2],   20(%[output_ptr])                           \n\t"
            "sw     %[Temp3],   24(%[output_ptr])                           \n\t"
            "sw     %[Temp4],   28(%[output_ptr])                           \n\t"
            "addu   %[src_ptr], %[src_ptr],        %[src_pixels_per_line]   \n\t"

            : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
              [Temp4] "=&r" (Temp4), [src_ptr] "+r" (src_ptr)
            : [src_pixels_per_line] "r" (src_pixels_per_line),
              [output_ptr] "r" (output_ptr)
            : "memory"
        );

        /* row 2 of this iteration -> output bytes 32..47 */
        __asm__ __volatile__ (
            "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
            "ulw    %[Temp2],   4(%[src_ptr])                               \n\t"
            "ulw    %[Temp3],   8(%[src_ptr])                               \n\t"
            "ulw    %[Temp4],   12(%[src_ptr])                              \n\t"
            "sw     %[Temp1],   32(%[output_ptr])                           \n\t"
            "sw     %[Temp2],   36(%[output_ptr])                           \n\t"
            "sw     %[Temp3],   40(%[output_ptr])                           \n\t"
            "sw     %[Temp4],   44(%[output_ptr])                           \n\t"
            "addu   %[src_ptr], %[src_ptr],        %[src_pixels_per_line]   \n\t"

            : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
              [Temp4] "=&r" (Temp4), [src_ptr] "+r" (src_ptr)
            : [src_pixels_per_line] "r" (src_pixels_per_line),
              [output_ptr] "r" (output_ptr)
            : "memory"
        );

        /* three packed 16-byte rows were written */
        output_ptr += 48;
    }
}
939 
940 
/*
 * First (horizontal) pass of the 4-tap sub-pixel filter for a block that is
 * 16 pixels wide, implemented with MIPS DSP ASE inline assembly.
 *
 * Each row is processed as four groups of 4 adjacent pixels.  Every output
 * accumulator is seeded with 64 (vector4a), two packed coefficient pairs from
 * sub_pel_filters_inv_tap_4[xoffset] are applied, and the sum is shifted
 * right by 7 (extr.w) before it is used as an index into the cm table.
 * Per row the bytes src_ptr[-1] .. src_ptr[19] are read.
 *
 *   src_ptr             - first pixel of the first source row
 *   output_ptr          - intermediate buffer, used when yoffset != 0
 *   src_pixels_per_line - source row stride in bytes
 *   output_width        - row stride of output_ptr in bytes
 *   output_height       - number of source rows to filter
 *   xoffset             - row index into sub_pel_filters_inv_tap_4[]
 *   yoffset             - vertical sub-pixel offset; only tested against 0
 *   dst_ptr             - final destination, used when yoffset == 0
 *   pitch               - row stride of dst_ptr in bytes
 *
 * yoffset == 0: the first two source rows are skipped, output_height - 5 rows
 * are filtered and the result goes straight to dst_ptr.
 * yoffset != 0: output_height rows are filtered into output_ptr.
 */
void vp8_filter_block2d_first_pass16_4tap
(
    unsigned char *RESTRICT src_ptr,
    unsigned char *RESTRICT output_ptr,
    unsigned int src_pixels_per_line,
    unsigned int output_width,
    unsigned int output_height,
    int xoffset,
    int yoffset,
    unsigned char *RESTRICT dst_ptr,
    int pitch
)
{
    unsigned int i, j;
    int Temp1, Temp2, Temp3, Temp4;

    unsigned int vector4a;
    int vector1b, vector2b;
    unsigned int tp1, tp2, tp3, tn1;
    unsigned int p1, p2, p3;
    unsigned int n1, n2, n3;
    /* lookup table indexed by the shifted filter result before storing */
    unsigned char *cm = ff_cropTbl + CROP_WIDTH;

    /* initial accumulator value; half of the 1 << 7 divisor applied by extr.w */
    vector4a = 64;

    vector1b = sub_pel_filters_inv_tap_4[xoffset][0];
    vector2b = sub_pel_filters_inv_tap_4[xoffset][1];

    /* if (yoffset == 0) don't need temp buffer, data will be stored in dst_ptr */
    if (yoffset == 0)
    {
        output_height -= 5;
        src_ptr += (src_pixels_per_line + src_pixels_per_line);

        for (i = output_height; i--;)
        {
            /* column cursors for the current row */
            unsigned char *src = src_ptr;
            unsigned char *dst = dst_ptr;

            /* preload the word starting one pixel left of the row; the inner
             * loop carries it from group to group in tp3 */
            __asm__ __volatile__ (
                "ulw     %[tp3],   -1(%[src_ptr])               \n\t"
                : [tp3] "=&r" (tp3)
                : [src_ptr] "r" (src)
                : "memory"
            );

            /* processing 4 adjacent pixels */
            for (j = 0; j < 16; j += 4)
            {
                /* apply filter with vectors pairs */
                __asm__ __volatile__ (
                    "ulw              %[tp2],      3(%[src_ptr])                    \n\t"
                    "move             %[tp1],      %[tp3]                           \n\t"

                    /* even 1. pixel */
                    "mtlo             %[vector4a], $ac3                             \n\t"
                    "mthi             $0,          $ac3                             \n\t"
                    /* keep the freshly loaded word for the next group */
                    "move             %[tp3],      %[tp2]                           \n\t"
                    "preceu.ph.qbr    %[p1],       %[tp1]                           \n\t"
                    "preceu.ph.qbl    %[p2],       %[tp1]                           \n\t"
                    "preceu.ph.qbr    %[p3],       %[tp2]                           \n\t"
                    "dpa.w.ph         $ac3,        %[p1],           %[vector1b]     \n\t"
                    "dpa.w.ph         $ac3,        %[p2],           %[vector2b]     \n\t"

                    /* even 2. pixel */
                    "mtlo             %[vector4a], $ac2                             \n\t"
                    "mthi             $0,          $ac2                             \n\t"
                    "dpa.w.ph         $ac2,        %[p2],           %[vector1b]     \n\t"
                    "dpa.w.ph         $ac2,        %[p3],           %[vector2b]     \n\t"
                    "extr.w           %[Temp1],    $ac3,            7               \n\t"

                    /* odd 1. pixel */
                    "ulw              %[tn1],      4(%[src_ptr])                    \n\t"
                    "balign           %[tp2],      %[tp1],          3               \n\t"
                    "mtlo             %[vector4a], $ac3                             \n\t"
                    "mthi             $0,          $ac3                             \n\t"
                    "preceu.ph.qbr    %[n1],       %[tp2]                           \n\t"
                    "preceu.ph.qbl    %[n2],       %[tp2]                           \n\t"
                    "preceu.ph.qbr    %[n3],       %[tn1]                           \n\t"
                    "extr.w           %[Temp3],    $ac2,            7               \n\t"
                    "dpa.w.ph         $ac3,        %[n1],           %[vector1b]     \n\t"
                    "dpa.w.ph         $ac3,        %[n2],           %[vector2b]     \n\t"

                    /* odd 2. pixel */
                    "mtlo             %[vector4a], $ac2                             \n\t"
                    "mthi             $0,          $ac2                             \n\t"
                    "extr.w           %[Temp2],    $ac3,            7               \n\t"
                    "dpa.w.ph         $ac2,        %[n2],           %[vector1b]     \n\t"
                    "dpa.w.ph         $ac2,        %[n3],           %[vector2b]     \n\t"
                    "extr.w           %[Temp4],    $ac2,            7               \n\t"

                    /* clamp and store results */
                    "lbux             %[tp1],      %[Temp1](%[cm])                  \n\t"
                    "lbux             %[tn1],      %[Temp2](%[cm])                  \n\t"
                    "lbux             %[tp2],      %[Temp3](%[cm])                  \n\t"
                    "sb               %[tp1],      0(%[dst_ptr])                    \n\t"
                    "sb               %[tn1],      1(%[dst_ptr])                    \n\t"
                    "lbux             %[n2],       %[Temp4](%[cm])                  \n\t"
                    "sb               %[tp2],      2(%[dst_ptr])                    \n\t"
                    "sb               %[n2],       3(%[dst_ptr])                    \n\t"

                    /* tp3 is consumed before it is rewritten, so it is a
                     * read-write operand */
                    : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "+&r" (tp3),
                      [tn1] "=&r" (tn1), [p1] "=&r" (p1), [p2] "=&r" (p2),
                      [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3),
                      [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [p3] "=&r" (p3),
                      [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
                    : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
                      [vector4a] "r" (vector4a), [cm] "r" (cm), [dst_ptr] "r" (dst),
                      [src_ptr] "r" (src)
                    : "memory"  /* the asm reads *src and writes *dst */
                );

                /* advance both cursors to the next group of 4 pixels */
                src += 4;
                dst += 4;
            }

            /* Next row... */
            src_ptr += src_pixels_per_line;
            dst_ptr += pitch;
        }
    }
    else
    {
        for (i = output_height; i--;)
        {
            /* column cursors for the current row */
            unsigned char *src = src_ptr;
            unsigned char *out = output_ptr;

            /* processing 4 adjacent pixels */
            for (j = 0; j < 16; j += 4)
            {
                /* apply filter with vectors pairs */
                __asm__ __volatile__ (
                    "ulw              %[tp1],      -1(%[src_ptr])                   \n\t"
                    "ulw              %[tp2],      3(%[src_ptr])                    \n\t"

                    /* even 1. pixel */
                    "mtlo             %[vector4a], $ac3                             \n\t"
                    "mthi             $0,          $ac3                             \n\t"
                    "preceu.ph.qbr    %[p1],       %[tp1]                           \n\t"
                    "preceu.ph.qbl    %[p2],       %[tp1]                           \n\t"
                    "preceu.ph.qbr    %[p3],       %[tp2]                           \n\t"
                    "dpa.w.ph         $ac3,        %[p1],           %[vector1b]     \n\t"
                    "dpa.w.ph         $ac3,        %[p2],           %[vector2b]     \n\t"

                    /* even 2. pixel */
                    "mtlo             %[vector4a], $ac2                             \n\t"
                    "mthi             $0,          $ac2                             \n\t"
                    "dpa.w.ph         $ac2,        %[p2],           %[vector1b]     \n\t"
                    "dpa.w.ph         $ac2,        %[p3],           %[vector2b]     \n\t"
                    "extr.w           %[Temp1],    $ac3,            7               \n\t"

                    /* odd 1. pixel */
                    "ulw              %[tn1],      4(%[src_ptr])                    \n\t"
                    "balign           %[tp2],      %[tp1],          3               \n\t"
                    "mtlo             %[vector4a], $ac3                             \n\t"
                    "mthi             $0,          $ac3                             \n\t"
                    "preceu.ph.qbr    %[n1],       %[tp2]                           \n\t"
                    "preceu.ph.qbl    %[n2],       %[tp2]                           \n\t"
                    "preceu.ph.qbr    %[n3],       %[tn1]                           \n\t"
                    "extr.w           %[Temp3],    $ac2,            7               \n\t"
                    "dpa.w.ph         $ac3,        %[n1],           %[vector1b]     \n\t"
                    "dpa.w.ph         $ac3,        %[n2],           %[vector2b]     \n\t"

                    /* odd 2. pixel */
                    "mtlo             %[vector4a], $ac2                             \n\t"
                    "mthi             $0,          $ac2                             \n\t"
                    "extr.w           %[Temp2],    $ac3,            7               \n\t"
                    "dpa.w.ph         $ac2,        %[n2],           %[vector1b]     \n\t"
                    "dpa.w.ph         $ac2,        %[n3],           %[vector2b]     \n\t"
                    "extr.w           %[Temp4],    $ac2,            7               \n\t"

                    /* clamp and store results */
                    "lbux             %[tp1],      %[Temp1](%[cm])                  \n\t"
                    "lbux             %[tn1],      %[Temp2](%[cm])                  \n\t"
                    "lbux             %[tp2],      %[Temp3](%[cm])                  \n\t"
                    "sb               %[tp1],      0(%[output_ptr])                 \n\t"
                    "sb               %[tn1],      1(%[output_ptr])                 \n\t"
                    "lbux             %[n2],       %[Temp4](%[cm])                  \n\t"
                    "sb               %[tp2],      2(%[output_ptr])                 \n\t"
                    "sb               %[n2],       3(%[output_ptr])                 \n\t"

                    : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn1] "=&r" (tn1),
                      [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3),
                      [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3),
                      [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
                      [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
                    : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
                      [vector4a] "r" (vector4a), [cm] "r" (cm),
                      [output_ptr] "r" (out), [src_ptr] "r" (src)
                    : "memory"  /* the asm reads *src and writes *out */
                );

                /* advance both cursors to the next group of 4 pixels */
                src += 4;
                out += 4;
            }

            /* next row... */
            src_ptr += src_pixels_per_line;
            output_ptr += output_width;
        }
    }
}
1133 
1134 
/*
 * Second filter pass producing a 4x4 block of output pixels (MIPS DSPr2).
 *
 * The input is read as bytes from src_ptr; the taps that contribute to one
 * output pixel lie 4 bytes apart in that buffer.  Four output rows of four
 * bytes each are written to output_ptr, output_pitch bytes apart.
 *
 * src_ptr       input buffer.  The 6-tap path reads src_ptr[-8]..src_ptr[27],
 *               the 4-tap path reads src_ptr[-4]..src_ptr[23].
 * output_ptr    destination of the first output row.
 * output_pitch  distance in bytes between consecutive output rows.
 * yoffset       row of sub_pel_filterss to use.  It is used as an index
 *               without a range check, so it must be 0..7.
 *
 * The filter length is chosen from the table itself: a non-zero first entry
 * selects the 6-tap path, otherwise only the remaining two coefficient pairs
 * are applied (4-tap path).
 *
 * NOTE(review): "extp ..., 9" extracts a 10-bit field whose position comes
 * from the DSPControl "pos" field.  This function never writes DSPControl,
 * so it presumably relies on the caller having set it up - confirm.
 */
void vp8_filter_block2d_second_pass4
(
    unsigned char *RESTRICT src_ptr,
    unsigned char *RESTRICT output_ptr,
    int output_pitch,
    int yoffset
)
{
    unsigned int i;

    /* raw filter results, one per output pixel of the current row */
    int Temp1, Temp2, Temp3, Temp4;
    unsigned int vector1b, vector2b, vector3b, vector4a;

    /* Scratch registers for the asm blocks.  After "append" each one holds
     * two packed bytes, so they are full-width integers, not bytes. */
    unsigned int src_ptr_l2;
    unsigned int src_ptr_l1;
    unsigned int src_ptr_0;
    unsigned int src_ptr_r1;
    unsigned int src_ptr_r2;
    unsigned int src_ptr_r3;

    /* lookup table used to turn a raw filter result into an output byte */
    unsigned char *cm = ff_cropTbl + CROP_WIDTH;

    /* Value preloaded into the low word of every accumulator; 64 is half of
     * the 128 that the taps of each sub_pel_filterss row sum to. */
    vector4a = 64;

    /* load filter coefficients (each entry packs two 8-bit magnitudes) */
    vector1b = sub_pel_filterss[yoffset][0];    /* outermost pair, added      */
    vector2b = sub_pel_filterss[yoffset][2];    /* centre pair, added         */
    vector3b = sub_pel_filterss[yoffset][1];    /* inner pair, subtracted     */

    if (vector1b)
    {
        /* 6 tap filter */

        /* each iteration produces two output rows */
        for (i = 2; i--;)
        {
            /* prefetch src_ptr data to cache memory */
            prefetch_load(src_ptr);

            /*
             * First row of the pair.
             *
             * For each of the four pixels: load the six taps, pack them into
             * three byte pairs with "append", accumulate with dpau/dpsu and
             * extract the result of the previous pixel while the next one is
             * being loaded.
             *
             * ".set noreorder" keeps the assembler from rearranging the
             * sequence; push/pop restores the previous assembler mode so it
             * does not leak into compiler-generated code.
             */
            __asm__ __volatile__ (
                ".set           push                                            \n\t"
                ".set           noreorder                                       \n\t"

                "lbu            %[src_ptr_l2],  -8(%[src_ptr])                  \n\t"
                "lbu            %[src_ptr_l1],  -4(%[src_ptr])                  \n\t"
                "lbu            %[src_ptr_0],   0(%[src_ptr])                   \n\t"
                "lbu            %[src_ptr_r1],  4(%[src_ptr])                   \n\t"
                "lbu            %[src_ptr_r2],  8(%[src_ptr])                   \n\t"
                "lbu            %[src_ptr_r3],  12(%[src_ptr])                  \n\t"
                "mtlo           %[vector4a],    $ac2                            \n\t"

                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
                "dpau.h.qbr     $ac2,           %[src_ptr_l2],  %[vector1b]     \n\t"
                "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
                "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"

                "lbu            %[src_ptr_l2],  -7(%[src_ptr])                  \n\t"
                "lbu            %[src_ptr_l1],  -3(%[src_ptr])                  \n\t"
                "lbu            %[src_ptr_0],   1(%[src_ptr])                   \n\t"
                "lbu            %[src_ptr_r1],  5(%[src_ptr])                   \n\t"
                "lbu            %[src_ptr_r2],  9(%[src_ptr])                   \n\t"
                "lbu            %[src_ptr_r3],  13(%[src_ptr])                  \n\t"
                "mtlo           %[vector4a],    $ac3                            \n\t"
                "extp           %[Temp1],       $ac2,           9               \n\t"

                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
                "dpau.h.qbr     $ac3,           %[src_ptr_l2],  %[vector1b]     \n\t"
                "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
                "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"

                "lbu            %[src_ptr_l2],  -6(%[src_ptr])                  \n\t"
                "lbu            %[src_ptr_l1],  -2(%[src_ptr])                  \n\t"
                "lbu            %[src_ptr_0],   2(%[src_ptr])                   \n\t"
                "lbu            %[src_ptr_r1],  6(%[src_ptr])                   \n\t"
                "lbu            %[src_ptr_r2],  10(%[src_ptr])                  \n\t"
                "lbu            %[src_ptr_r3],  14(%[src_ptr])                  \n\t"
                "mtlo           %[vector4a],    $ac0                            \n\t"
                "extp           %[Temp2],       $ac3,           9               \n\t"

                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
                "dpau.h.qbr     $ac0,           %[src_ptr_l2],  %[vector1b]     \n\t"
                "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
                "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"

                "lbu            %[src_ptr_l2],  -5(%[src_ptr])                  \n\t"
                "lbu            %[src_ptr_l1],  -1(%[src_ptr])                  \n\t"
                "lbu            %[src_ptr_0],   3(%[src_ptr])                   \n\t"
                "lbu            %[src_ptr_r1],  7(%[src_ptr])                   \n\t"
                "lbu            %[src_ptr_r2],  11(%[src_ptr])                  \n\t"
                "lbu            %[src_ptr_r3],  15(%[src_ptr])                  \n\t"
                "mtlo           %[vector4a],    $ac1                            \n\t"
                "extp           %[Temp3],       $ac0,           9               \n\t"

                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
                "dpau.h.qbr     $ac1,           %[src_ptr_l2],  %[vector1b]     \n\t"
                "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
                "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
                "extp           %[Temp4],       $ac1,           9               \n\t"

                ".set           pop                                             \n\t"

                /* Temp4 is written by the very last instruction, so it is the
                 * only output that may share a register with an input. */
                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
                  [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4),
                  [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
                  [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2),
                  [src_ptr_l2] "=&r" (src_ptr_l2), [src_ptr_r3] "=&r" (src_ptr_r3)
                : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
                  [vector3b] "r" (vector3b), [vector4a] "r" (vector4a),
                  [src_ptr] "r" (src_ptr)
                /* all four accumulators are written; memory is read through
                 * src_ptr without a memory operand */
                : "hi", "lo", "$ac1hi", "$ac1lo", "$ac2hi", "$ac2lo",
                  "$ac3hi", "$ac3lo", "memory"
            );

            /* clamp and store results */
            output_ptr[0] = cm[Temp1];
            output_ptr[1] = cm[Temp2];
            output_ptr[2] = cm[Temp3];
            output_ptr[3] = cm[Temp4];

            output_ptr += output_pitch;

            /* Second row of the pair: same sequence, every load 4 bytes
             * further into the source buffer. */
            __asm__ __volatile__ (
                ".set           push                                            \n\t"
                ".set           noreorder                                       \n\t"

                "lbu            %[src_ptr_l2],  -4(%[src_ptr])                  \n\t"
                "lbu            %[src_ptr_l1],  0(%[src_ptr])                   \n\t"
                "lbu            %[src_ptr_0],   4(%[src_ptr])                   \n\t"
                "lbu            %[src_ptr_r1],  8(%[src_ptr])                   \n\t"
                "lbu            %[src_ptr_r2],  12(%[src_ptr])                  \n\t"
                "lbu            %[src_ptr_r3],  16(%[src_ptr])                  \n\t"
                "mtlo           %[vector4a],    $ac2                            \n\t"
                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
                "dpau.h.qbr     $ac2,           %[src_ptr_l2],  %[vector1b]     \n\t"
                "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
                "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"

                "lbu            %[src_ptr_l2],  -3(%[src_ptr])                  \n\t"
                "lbu            %[src_ptr_l1],  1(%[src_ptr])                   \n\t"
                "lbu            %[src_ptr_0],   5(%[src_ptr])                   \n\t"
                "lbu            %[src_ptr_r1],  9(%[src_ptr])                   \n\t"
                "lbu            %[src_ptr_r2],  13(%[src_ptr])                  \n\t"
                "lbu            %[src_ptr_r3],  17(%[src_ptr])                  \n\t"
                "mtlo           %[vector4a],    $ac3                            \n\t"
                "extp           %[Temp1],       $ac2,           9               \n\t"

                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
                "dpau.h.qbr     $ac3,           %[src_ptr_l2],  %[vector1b]     \n\t"
                "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
                "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"

                "lbu            %[src_ptr_l2],  -2(%[src_ptr])                  \n\t"
                "lbu            %[src_ptr_l1],  2(%[src_ptr])                   \n\t"
                "lbu            %[src_ptr_0],   6(%[src_ptr])                   \n\t"
                "lbu            %[src_ptr_r1],  10(%[src_ptr])                  \n\t"
                "lbu            %[src_ptr_r2],  14(%[src_ptr])                  \n\t"
                "lbu            %[src_ptr_r3],  18(%[src_ptr])                  \n\t"
                "mtlo           %[vector4a],    $ac0                            \n\t"
                "extp           %[Temp2],       $ac3,           9               \n\t"

                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
                "dpau.h.qbr     $ac0,           %[src_ptr_l2],  %[vector1b]     \n\t"
                "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
                "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"

                "lbu            %[src_ptr_l2],  -1(%[src_ptr])                  \n\t"
                "lbu            %[src_ptr_l1],  3(%[src_ptr])                   \n\t"
                "lbu            %[src_ptr_0],   7(%[src_ptr])                   \n\t"
                "lbu            %[src_ptr_r1],  11(%[src_ptr])                  \n\t"
                "lbu            %[src_ptr_r2],  15(%[src_ptr])                  \n\t"
                "lbu            %[src_ptr_r3],  19(%[src_ptr])                  \n\t"
                "mtlo           %[vector4a],    $ac1                            \n\t"
                "extp           %[Temp3],       $ac0,           9               \n\t"

                "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
                "dpau.h.qbr     $ac1,           %[src_ptr_l2],  %[vector1b]     \n\t"
                "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
                "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
                "extp           %[Temp4],       $ac1,           9               \n\t"

                ".set           pop                                             \n\t"

                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
                  [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4),
                  [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
                  [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2),
                  [src_ptr_l2] "=&r" (src_ptr_l2), [src_ptr_r3] "=&r" (src_ptr_r3)
                : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
                  [vector3b] "r" (vector3b), [vector4a] "r" (vector4a),
                  [src_ptr] "r" (src_ptr)
                : "hi", "lo", "$ac1hi", "$ac1lo", "$ac2hi", "$ac2lo",
                  "$ac3hi", "$ac3lo", "memory"
            );

            /* clamp and store results */
            output_ptr[0] = cm[Temp1];
            output_ptr[1] = cm[Temp2];
            output_ptr[2] = cm[Temp3];
            output_ptr[3] = cm[Temp4];

            /* two rows consumed: 2 * 4 source bytes */
            src_ptr += 8;
            output_ptr += output_pitch;
        }
    }
    else
    {
        /* 4 tap filter */

        /* prefetch src_ptr data to cache memory */
        prefetch_load(src_ptr);

        /* each iteration produces two output rows */
        for (i = 2; i--;)
        {
            /* First row of the pair: as in the 6-tap path, but only the
             * centre pair (added) and the inner pair (subtracted) are used. */
            __asm__ __volatile__ (
                ".set           push                                            \n\t"
                ".set           noreorder                                       \n\t"

                "lbu            %[src_ptr_l1],  -4(%[src_ptr])                  \n\t"
                "lbu            %[src_ptr_0],   0(%[src_ptr])                   \n\t"
                "lbu            %[src_ptr_r1],  4(%[src_ptr])                   \n\t"
                "lbu            %[src_ptr_r2],  8(%[src_ptr])                   \n\t"
                "mtlo           %[vector4a],    $ac2                            \n\t"
                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
                "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
                "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"

                "lbu            %[src_ptr_l1],  -3(%[src_ptr])                  \n\t"
                "lbu            %[src_ptr_0],   1(%[src_ptr])                   \n\t"
                "lbu            %[src_ptr_r1],  5(%[src_ptr])                   \n\t"
                "lbu            %[src_ptr_r2],  9(%[src_ptr])                   \n\t"
                "mtlo           %[vector4a],    $ac3                            \n\t"
                "extp           %[Temp1],       $ac2,           9               \n\t"

                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
                "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
                "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"

                "lbu            %[src_ptr_l1],  -2(%[src_ptr])                  \n\t"
                "lbu            %[src_ptr_0],   2(%[src_ptr])                   \n\t"
                "lbu            %[src_ptr_r1],  6(%[src_ptr])                   \n\t"
                "lbu            %[src_ptr_r2],  10(%[src_ptr])                  \n\t"
                "mtlo           %[vector4a],    $ac0                            \n\t"
                "extp           %[Temp2],       $ac3,           9               \n\t"

                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
                "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
                "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"

                "lbu            %[src_ptr_l1],  -1(%[src_ptr])                  \n\t"
                "lbu            %[src_ptr_0],   3(%[src_ptr])                   \n\t"
                "lbu            %[src_ptr_r1],  7(%[src_ptr])                   \n\t"
                "lbu            %[src_ptr_r2],  11(%[src_ptr])                  \n\t"
                "mtlo           %[vector4a],    $ac1                            \n\t"
                "extp           %[Temp3],       $ac0,           9               \n\t"
                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
                "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
                "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
                "extp           %[Temp4],       $ac1,           9               \n\t"

                ".set           pop                                             \n\t"

                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
                  [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4),
                  [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
                  [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2)
                : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
                  [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr)
                : "hi", "lo", "$ac1hi", "$ac1lo", "$ac2hi", "$ac2lo",
                  "$ac3hi", "$ac3lo", "memory"
            );

            /* clamp and store results */
            output_ptr[0] = cm[Temp1];
            output_ptr[1] = cm[Temp2];
            output_ptr[2] = cm[Temp3];
            output_ptr[3] = cm[Temp4];

            output_ptr += output_pitch;

            /* Second row of the pair: every load 4 bytes further on. */
            __asm__ __volatile__ (
                ".set           push                                            \n\t"
                ".set           noreorder                                       \n\t"

                "lbu            %[src_ptr_l1],  0(%[src_ptr])                   \n\t"
                "lbu            %[src_ptr_0],   4(%[src_ptr])                   \n\t"
                "lbu            %[src_ptr_r1],  8(%[src_ptr])                   \n\t"
                "lbu            %[src_ptr_r2],  12(%[src_ptr])                  \n\t"
                "mtlo           %[vector4a],    $ac2                            \n\t"
                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
                "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
                "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"

                "lbu            %[src_ptr_l1],  1(%[src_ptr])                   \n\t"
                "lbu            %[src_ptr_0],   5(%[src_ptr])                   \n\t"
                "lbu            %[src_ptr_r1],  9(%[src_ptr])                   \n\t"
                "lbu            %[src_ptr_r2],  13(%[src_ptr])                  \n\t"
                "mtlo           %[vector4a],    $ac3                            \n\t"
                "extp           %[Temp1],       $ac2,           9               \n\t"

                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
                "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
                "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"

                "lbu            %[src_ptr_l1],  2(%[src_ptr])                   \n\t"
                "lbu            %[src_ptr_0],   6(%[src_ptr])                   \n\t"
                "lbu            %[src_ptr_r1],  10(%[src_ptr])                  \n\t"
                "lbu            %[src_ptr_r2],  14(%[src_ptr])                  \n\t"
                "mtlo           %[vector4a],    $ac0                            \n\t"
                "extp           %[Temp2],       $ac3,           9               \n\t"

                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
                "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
                "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"

                "lbu            %[src_ptr_l1],  3(%[src_ptr])                   \n\t"
                "lbu            %[src_ptr_0],   7(%[src_ptr])                   \n\t"
                "lbu            %[src_ptr_r1],  11(%[src_ptr])                  \n\t"
                "lbu            %[src_ptr_r2],  15(%[src_ptr])                  \n\t"
                "mtlo           %[vector4a],    $ac1                            \n\t"
                "extp           %[Temp3],       $ac0,           9               \n\t"
                "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
                "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
                "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
                "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
                "extp           %[Temp4],       $ac1,           9               \n\t"

                ".set           pop                                             \n\t"

                : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
                  [Temp3] "=&r" (Temp3), [Temp4] "=r" (Temp4),
                  [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
                  [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2)
                : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
                  [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr)
                : "hi", "lo", "$ac1hi", "$ac1lo", "$ac2hi", "$ac2lo",
                  "$ac3hi", "$ac3lo", "memory"
            );

            /* clamp and store results */
            output_ptr[0] = cm[Temp1];
            output_ptr[1] = cm[Temp2];
            output_ptr[2] = cm[Temp3];
            output_ptr[3] = cm[Temp4];

            /* two rows consumed: 2 * 4 source bytes */
            src_ptr += 8;
            output_ptr += output_pitch;
        }
    }
}
1496 
1497 
vp8_filter_block2d_second_pass_8(unsigned char * RESTRICT src_ptr,unsigned char * RESTRICT output_ptr,int output_pitch,unsigned int output_height,unsigned int output_width,unsigned int yoffset)1498 void vp8_filter_block2d_second_pass_8
1499 (
1500     unsigned char *RESTRICT src_ptr,
1501     unsigned char *RESTRICT output_ptr,
1502     int output_pitch,
1503     unsigned int output_height,
1504     unsigned int output_width,
1505     unsigned int yoffset
1506 )
1507 {
1508     unsigned int i;
1509 
1510     int Temp1, Temp2, Temp3, Temp4, Temp5, Temp6, Temp7, Temp8;
1511     unsigned int vector1b, vector2b, vector3b, vector4a;
1512 
1513     unsigned char src_ptr_l2;
1514     unsigned char src_ptr_l1;
1515     unsigned char src_ptr_0;
1516     unsigned char src_ptr_r1;
1517     unsigned char src_ptr_r2;
1518     unsigned char src_ptr_r3;
1519     unsigned char *cm = ff_cropTbl + CROP_WIDTH;
1520 
1521     vector4a = 64;
1522 
1523     vector1b = sub_pel_filterss[yoffset][0];
1524     vector2b = sub_pel_filterss[yoffset][2];
1525     vector3b = sub_pel_filterss[yoffset][1];
1526 
1527     if (vector1b)
1528     {
1529         /* 6 tap filter */
1530 
1531         /* prefetch src_ptr data to cache memory */
1532         prefetch_load(src_ptr);
1533 
1534         for (i = output_height; i--;)
1535         {
1536             /* apply filter with vectors pairs */
1537             __asm__ __volatile__ (
1538                 "lbu            %[src_ptr_l2],  -16(%[src_ptr])                 \n\t"
1539                 "lbu            %[src_ptr_l1],  -8(%[src_ptr])                  \n\t"
1540                 "lbu            %[src_ptr_0],   0(%[src_ptr])                   \n\t"
1541                 "lbu            %[src_ptr_r1],  8(%[src_ptr])                   \n\t"
1542                 "lbu            %[src_ptr_r2],  16(%[src_ptr])                  \n\t"
1543                 "lbu            %[src_ptr_r3],  24(%[src_ptr])                  \n\t"
1544                 "mtlo           %[vector4a],    $ac2                            \n\t"
1545 
1546                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1547                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1548                 "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
1549                 "dpau.h.qbr     $ac2,           %[src_ptr_l2],  %[vector1b]     \n\t"
1550                 "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
1551                 "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
1552 
1553                 "lbu            %[src_ptr_l2],  -15(%[src_ptr])                 \n\t"
1554                 "lbu            %[src_ptr_l1],  -7(%[src_ptr])                  \n\t"
1555                 "lbu            %[src_ptr_0],   1(%[src_ptr])                   \n\t"
1556                 "lbu            %[src_ptr_r1],  9(%[src_ptr])                   \n\t"
1557                 "lbu            %[src_ptr_r2],  17(%[src_ptr])                  \n\t"
1558                 "lbu            %[src_ptr_r3],  25(%[src_ptr])                  \n\t"
1559                 "mtlo           %[vector4a],    $ac3                            \n\t"
1560                 "extp           %[Temp1],       $ac2,           9               \n\t"
1561 
1562                 "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
1563                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1564                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1565                 "dpau.h.qbr     $ac3,           %[src_ptr_l2],  %[vector1b]     \n\t"
1566                 "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
1567                 "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
1568 
1569                 "lbu            %[src_ptr_l2],  -14(%[src_ptr])                 \n\t"
1570                 "lbu            %[src_ptr_l1],  -6(%[src_ptr])                  \n\t"
1571                 "lbu            %[src_ptr_0],   2(%[src_ptr])                   \n\t"
1572                 "lbu            %[src_ptr_r1],  10(%[src_ptr])                  \n\t"
1573                 "lbu            %[src_ptr_r2],  18(%[src_ptr])                  \n\t"
1574                 "lbu            %[src_ptr_r3],  26(%[src_ptr])                  \n\t"
1575                 "mtlo           %[vector4a],    $ac0                            \n\t"
1576                 "extp           %[Temp2],       $ac3,           9               \n\t"
1577 
1578                 "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
1579                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1580                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1581                 "dpau.h.qbr     $ac0,           %[src_ptr_l2],  %[vector1b]     \n\t"
1582                 "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
1583                 "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
1584 
1585                 "lbu            %[src_ptr_l2],  -13(%[src_ptr])                 \n\t"
1586                 "lbu            %[src_ptr_l1],  -5(%[src_ptr])                  \n\t"
1587                 "lbu            %[src_ptr_0],   3(%[src_ptr])                   \n\t"
1588                 "lbu            %[src_ptr_r1],  11(%[src_ptr])                  \n\t"
1589                 "lbu            %[src_ptr_r2],  19(%[src_ptr])                  \n\t"
1590                 "lbu            %[src_ptr_r3],  27(%[src_ptr])                  \n\t"
1591                 "mtlo           %[vector4a],    $ac1                            \n\t"
1592                 "extp           %[Temp3],       $ac0,           9               \n\t"
1593 
1594                 "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
1595                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1596                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1597                 "dpau.h.qbr     $ac1,           %[src_ptr_l2],  %[vector1b]     \n\t"
1598                 "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
1599                 "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
1600 
1601                 : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
1602                   [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
1603                   [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2),
1604                   [src_ptr_l2] "=&r" (src_ptr_l2), [src_ptr_r3] "=&r" (src_ptr_r3)
1605                 : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
1606                   [vector3b] "r" (vector3b), [vector4a] "r" (vector4a),
1607                   [src_ptr] "r" (src_ptr)
1608             );
1609 
1610             /* apply filter with vectors pairs */
1611             __asm__ __volatile__ (
1612                 "lbu            %[src_ptr_l2],  -12(%[src_ptr])                 \n\t"
1613                 "lbu            %[src_ptr_l1],  -4(%[src_ptr])                  \n\t"
1614                 "lbu            %[src_ptr_0],   4(%[src_ptr])                   \n\t"
1615                 "lbu            %[src_ptr_r1],  12(%[src_ptr])                  \n\t"
1616                 "lbu            %[src_ptr_r2],  20(%[src_ptr])                  \n\t"
1617                 "lbu            %[src_ptr_r3],  28(%[src_ptr])                  \n\t"
1618                 "mtlo           %[vector4a],    $ac2                            \n\t"
1619 
1620                 "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
1621                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1622                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1623                 "dpau.h.qbr     $ac2,           %[src_ptr_l2],  %[vector1b]     \n\t"
1624                 "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
1625                 "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
1626                 "extp           %[Temp4],       $ac1,           9               \n\t"
1627 
1628                 "lbu            %[src_ptr_l2],  -11(%[src_ptr])                 \n\t"
1629                 "lbu            %[src_ptr_l1],  -3(%[src_ptr])                  \n\t"
1630                 "lbu            %[src_ptr_0],   5(%[src_ptr])                   \n\t"
1631                 "lbu            %[src_ptr_r1],  13(%[src_ptr])                  \n\t"
1632                 "lbu            %[src_ptr_r2],  21(%[src_ptr])                  \n\t"
1633                 "lbu            %[src_ptr_r3],  29(%[src_ptr])                  \n\t"
1634                 "mtlo           %[vector4a],    $ac3                            \n\t"
1635                 "extp           %[Temp5],       $ac2,           9               \n\t"
1636 
1637                 "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
1638                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1639                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1640                 "dpau.h.qbr     $ac3,           %[src_ptr_l2],  %[vector1b]     \n\t"
1641                 "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
1642                 "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
1643 
1644                 "lbu            %[src_ptr_l2],  -10(%[src_ptr])                 \n\t"
1645                 "lbu            %[src_ptr_l1],  -2(%[src_ptr])                  \n\t"
1646                 "lbu            %[src_ptr_0],   6(%[src_ptr])                   \n\t"
1647                 "lbu            %[src_ptr_r1],  14(%[src_ptr])                  \n\t"
1648                 "lbu            %[src_ptr_r2],  22(%[src_ptr])                  \n\t"
1649                 "lbu            %[src_ptr_r3],  30(%[src_ptr])                  \n\t"
1650                 "mtlo           %[vector4a],    $ac0                            \n\t"
1651                 "extp           %[Temp6],       $ac3,           9               \n\t"
1652 
1653                 "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
1654                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1655                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1656                 "dpau.h.qbr     $ac0,           %[src_ptr_l2],  %[vector1b]     \n\t"
1657                 "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
1658                 "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
1659 
1660                 "lbu            %[src_ptr_l2],  -9(%[src_ptr])                  \n\t"
1661                 "lbu            %[src_ptr_l1],  -1(%[src_ptr])                  \n\t"
1662                 "lbu            %[src_ptr_0],   7(%[src_ptr])                   \n\t"
1663                 "lbu            %[src_ptr_r1],  15(%[src_ptr])                  \n\t"
1664                 "lbu            %[src_ptr_r2],  23(%[src_ptr])                  \n\t"
1665                 "lbu            %[src_ptr_r3],  31(%[src_ptr])                  \n\t"
1666                 "mtlo           %[vector4a],    $ac1                            \n\t"
1667                 "extp           %[Temp7],       $ac0,           9               \n\t"
1668 
1669                 "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
1670                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1671                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1672                 "dpau.h.qbr     $ac1,           %[src_ptr_l2],  %[vector1b]     \n\t"
1673                 "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
1674                 "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
1675                 "extp           %[Temp8],       $ac1,           9               \n\t"
1676 
1677                 : [Temp4] "=&r" (Temp4), [Temp5] "=&r" (Temp5),
1678                   [Temp6] "=&r" (Temp6), [Temp7] "=&r" (Temp7), [Temp8] "=r" (Temp8),
1679                   [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
1680                   [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2),
1681                   [src_ptr_l2] "=&r" (src_ptr_l2),[src_ptr_r3] "=&r" (src_ptr_r3)
1682                 : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
1683                   [vector3b] "r" (vector3b), [vector4a] "r" (vector4a),
1684                   [src_ptr] "r" (src_ptr)
1685             );
1686 
1687             /* clamp and store results */
1688             output_ptr[0] = cm[Temp1];
1689             output_ptr[1] = cm[Temp2];
1690             output_ptr[2] = cm[Temp3];
1691             output_ptr[3] = cm[Temp4];
1692             output_ptr[4] = cm[Temp5];
1693             output_ptr[5] = cm[Temp6];
1694             output_ptr[6] = cm[Temp7];
1695             output_ptr[7] = cm[Temp8];
1696 
1697             src_ptr += 8;
1698             output_ptr += output_pitch;
1699         }
1700     }
1701     else
1702     {
1703         /* 4 tap filter */
1704 
1705         /* prefetch src_ptr data to cache memory */
1706         prefetch_load(src_ptr);
1707 
1708         for (i = output_height; i--;)
1709         {
1710             __asm__ __volatile__ (
1711                 "lbu            %[src_ptr_l1],  -8(%[src_ptr])                  \n\t"
1712                 "lbu            %[src_ptr_0],   0(%[src_ptr])                   \n\t"
1713                 "lbu            %[src_ptr_r1],  8(%[src_ptr])                   \n\t"
1714                 "lbu            %[src_ptr_r2],  16(%[src_ptr])                  \n\t"
1715                 "mtlo           %[vector4a],    $ac2                            \n\t"
1716                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1717                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1718                 "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
1719                 "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
1720 
1721                 : [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
1722                   [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2)
1723                 : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
1724                   [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr)
1725             );
1726 
1727             __asm__ __volatile__ (
1728                 "lbu            %[src_ptr_l1],  -7(%[src_ptr])                  \n\t"
1729                 "lbu            %[src_ptr_0],   1(%[src_ptr])                   \n\t"
1730                 "lbu            %[src_ptr_r1],  9(%[src_ptr])                   \n\t"
1731                 "lbu            %[src_ptr_r2],  17(%[src_ptr])                  \n\t"
1732                 "mtlo           %[vector4a],    $ac3                            \n\t"
1733                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1734                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1735                 "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
1736                 "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
1737                 "extp           %[Temp1],       $ac2,           9               \n\t"
1738 
1739                 : [Temp1] "=r" (Temp1),
1740                   [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
1741                   [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2)
1742                 : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
1743                   [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr)
1744             );
1745 
1746             src_ptr_l1 = src_ptr[-6];
1747             src_ptr_0  = src_ptr[2];
1748             src_ptr_r1 = src_ptr[10];
1749             src_ptr_r2 = src_ptr[18];
1750 
1751             __asm__ __volatile__ (
1752                 "mtlo           %[vector4a],    $ac0                            \n\t"
1753                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1754                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1755                 "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
1756                 "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
1757                 "extp           %[Temp2],       $ac3,           9               \n\t"
1758 
1759                 : [Temp2] "=r" (Temp2)
1760                 : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
1761                   [src_ptr_l1] "r" (src_ptr_l1), [src_ptr_0] "r" (src_ptr_0),
1762                   [src_ptr_r1] "r" (src_ptr_r1), [src_ptr_r2] "r" (src_ptr_r2),
1763                   [vector4a] "r" (vector4a)
1764             );
1765 
1766             src_ptr_l1 = src_ptr[-5];
1767             src_ptr_0  = src_ptr[3];
1768             src_ptr_r1 = src_ptr[11];
1769             src_ptr_r2 = src_ptr[19];
1770 
1771             __asm__ __volatile__ (
1772                 "mtlo           %[vector4a],    $ac1                            \n\t"
1773                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1774                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1775                 "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
1776                 "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
1777                 "extp           %[Temp3],       $ac0,           9               \n\t"
1778 
1779                 : [Temp3] "=r" (Temp3)
1780                 : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
1781                   [src_ptr_l1] "r" (src_ptr_l1), [src_ptr_0] "r" (src_ptr_0),
1782                   [src_ptr_r1] "r" (src_ptr_r1), [src_ptr_r2] "r" (src_ptr_r2),
1783                   [vector4a] "r" (vector4a)
1784             );
1785 
1786             src_ptr_l1 = src_ptr[-4];
1787             src_ptr_0  = src_ptr[4];
1788             src_ptr_r1 = src_ptr[12];
1789             src_ptr_r2 = src_ptr[20];
1790 
1791             __asm__ __volatile__ (
1792                 "mtlo           %[vector4a],    $ac2                            \n\t"
1793                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1794                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1795                 "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
1796                 "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
1797                 "extp           %[Temp4],       $ac1,           9               \n\t"
1798 
1799                 : [Temp4] "=r" (Temp4)
1800                 : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
1801                   [src_ptr_l1] "r" (src_ptr_l1), [src_ptr_0] "r" (src_ptr_0),
1802                   [src_ptr_r1] "r" (src_ptr_r1), [src_ptr_r2] "r" (src_ptr_r2),
1803                   [vector4a] "r" (vector4a)
1804             );
1805 
1806             src_ptr_l1 = src_ptr[-3];
1807             src_ptr_0  = src_ptr[5];
1808             src_ptr_r1 = src_ptr[13];
1809             src_ptr_r2 = src_ptr[21];
1810 
1811             __asm__ __volatile__ (
1812                 "mtlo           %[vector4a],    $ac3                            \n\t"
1813                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1814                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1815                 "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
1816                 "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
1817                 "extp           %[Temp5],       $ac2,           9               \n\t"
1818 
1819                 : [Temp5] "=&r" (Temp5)
1820                 : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
1821                   [src_ptr_l1] "r" (src_ptr_l1), [src_ptr_0] "r" (src_ptr_0),
1822                   [src_ptr_r1] "r" (src_ptr_r1), [src_ptr_r2] "r" (src_ptr_r2),
1823                   [vector4a] "r" (vector4a)
1824             );
1825 
1826             src_ptr_l1 = src_ptr[-2];
1827             src_ptr_0  = src_ptr[6];
1828             src_ptr_r1 = src_ptr[14];
1829             src_ptr_r2 = src_ptr[22];
1830 
1831             __asm__ __volatile__ (
1832                 "mtlo           %[vector4a],    $ac0                            \n\t"
1833                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1834                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1835                 "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
1836                 "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
1837                 "extp           %[Temp6],       $ac3,           9               \n\t"
1838 
1839                 : [Temp6] "=r" (Temp6)
1840                 : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
1841                   [src_ptr_l1] "r" (src_ptr_l1), [src_ptr_0] "r" (src_ptr_0),
1842                   [src_ptr_r1] "r" (src_ptr_r1), [src_ptr_r2] "r" (src_ptr_r2),
1843                   [vector4a] "r" (vector4a)
1844             );
1845 
1846             src_ptr_l1 = src_ptr[-1];
1847             src_ptr_0  = src_ptr[7];
1848             src_ptr_r1 = src_ptr[15];
1849             src_ptr_r2 = src_ptr[23];
1850 
1851             __asm__ __volatile__ (
1852                 "mtlo           %[vector4a],    $ac1                            \n\t"
1853                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1854                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1855                 "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
1856                 "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
1857                 "extp           %[Temp7],       $ac0,           9               \n\t"
1858                 "extp           %[Temp8],       $ac1,           9               \n\t"
1859 
1860                 : [Temp7] "=&r" (Temp7), [Temp8] "=r" (Temp8)
1861                 : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
1862                   [src_ptr_l1] "r" (src_ptr_l1), [src_ptr_0] "r" (src_ptr_0),
1863                   [src_ptr_r1] "r" (src_ptr_r1), [src_ptr_r2] "r" (src_ptr_r2),
1864                   [vector4a] "r" (vector4a)
1865             );
1866 
1867             /* clamp and store results */
1868             output_ptr[0] = cm[Temp1];
1869             output_ptr[1] = cm[Temp2];
1870             output_ptr[2] = cm[Temp3];
1871             output_ptr[3] = cm[Temp4];
1872             output_ptr[4] = cm[Temp5];
1873             output_ptr[5] = cm[Temp6];
1874             output_ptr[6] = cm[Temp7];
1875             output_ptr[7] = cm[Temp8];
1876 
1877             src_ptr += 8;
1878             output_ptr += output_pitch;
1879         }
1880     }
1881 }
1882 
1883 
vp8_filter_block2d_second_pass161(unsigned char * RESTRICT src_ptr,unsigned char * RESTRICT output_ptr,int output_pitch,const unsigned short * vp8_filter)1884 void vp8_filter_block2d_second_pass161
1885 (
1886     unsigned char *RESTRICT src_ptr,
1887     unsigned char *RESTRICT output_ptr,
1888     int output_pitch,
1889     const unsigned short *vp8_filter
1890 )
1891 {
1892     unsigned int i, j;
1893 
1894     int Temp1, Temp2, Temp3, Temp4, Temp5, Temp6, Temp7, Temp8;
1895     unsigned int vector4a;
1896     unsigned int vector1b, vector2b, vector3b;
1897 
1898     unsigned char src_ptr_l2;
1899     unsigned char src_ptr_l1;
1900     unsigned char src_ptr_0;
1901     unsigned char src_ptr_r1;
1902     unsigned char src_ptr_r2;
1903     unsigned char src_ptr_r3;
1904     unsigned char *cm = ff_cropTbl + CROP_WIDTH;
1905 
1906     vector4a = 64;
1907 
1908     vector1b = vp8_filter[0];
1909     vector2b = vp8_filter[2];
1910     vector3b = vp8_filter[1];
1911 
1912     if (vector1b == 0)
1913     {
1914         /* 4 tap filter */
1915 
1916         /* prefetch src_ptr data to cache memory */
1917         prefetch_load(src_ptr + 16);
1918 
1919         for (i = 16; i--;)
1920         {
1921             /* unrolling for loop */
1922             for (j = 0; j < 16; j += 8)
1923             {
1924                 /* apply filter with vectors pairs */
1925                 __asm__ __volatile__ (
1926                     "lbu            %[src_ptr_l1],  -16(%[src_ptr])                 \n\t"
1927                     "lbu            %[src_ptr_0],   0(%[src_ptr])                   \n\t"
1928                     "lbu            %[src_ptr_r1],  16(%[src_ptr])                  \n\t"
1929                     "lbu            %[src_ptr_r2],  32(%[src_ptr])                  \n\t"
1930                     "mtlo           %[vector4a],    $ac2                            \n\t"
1931                     "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1932                     "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1933                     "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
1934                     "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
1935 
1936                     "lbu            %[src_ptr_l1],  -15(%[src_ptr])                 \n\t"
1937                     "lbu            %[src_ptr_0],   1(%[src_ptr])                   \n\t"
1938                     "lbu            %[src_ptr_r1],  17(%[src_ptr])                  \n\t"
1939                     "lbu            %[src_ptr_r2],  33(%[src_ptr])                  \n\t"
1940                     "mtlo           %[vector4a],    $ac3                            \n\t"
1941                     "extp           %[Temp1],       $ac2,           9               \n\t"
1942 
1943                     "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1944                     "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1945                     "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
1946                     "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
1947 
1948                     "lbu            %[src_ptr_l1],  -14(%[src_ptr])                 \n\t"
1949                     "lbu            %[src_ptr_0],   2(%[src_ptr])                   \n\t"
1950                     "lbu            %[src_ptr_r1],  18(%[src_ptr])                  \n\t"
1951                     "lbu            %[src_ptr_r2],  34(%[src_ptr])                  \n\t"
1952                     "mtlo           %[vector4a],    $ac1                            \n\t"
1953                     "extp           %[Temp2],       $ac3,           9               \n\t"
1954 
1955                     "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1956                     "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1957                     "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
1958                     "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
1959 
1960                     "lbu            %[src_ptr_l1],  -13(%[src_ptr])                 \n\t"
1961                     "lbu            %[src_ptr_0],   3(%[src_ptr])                   \n\t"
1962                     "lbu            %[src_ptr_r1],  19(%[src_ptr])                  \n\t"
1963                     "lbu            %[src_ptr_r2],  35(%[src_ptr])                  \n\t"
1964                     "mtlo           %[vector4a],    $ac3                            \n\t"
1965                     "extp           %[Temp3],       $ac1,           9               \n\t"
1966 
1967                     "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1968                     "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1969                     "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
1970                     "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
1971 
1972                     "lbu            %[src_ptr_l1],  -12(%[src_ptr])                 \n\t"
1973                     "lbu            %[src_ptr_0],   4(%[src_ptr])                   \n\t"
1974                     "lbu            %[src_ptr_r1],  20(%[src_ptr])                  \n\t"
1975                     "lbu            %[src_ptr_r2],  36(%[src_ptr])                  \n\t"
1976                     "mtlo           %[vector4a],    $ac2                            \n\t"
1977                     "extp           %[Temp4],       $ac3,           9               \n\t"
1978 
1979                     "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1980                     "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1981                     "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
1982                     "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
1983 
1984                     "lbu            %[src_ptr_l1],  -11(%[src_ptr])                 \n\t"
1985                     "lbu            %[src_ptr_0],   5(%[src_ptr])                   \n\t"
1986                     "lbu            %[src_ptr_r1],  21(%[src_ptr])                  \n\t"
1987                     "lbu            %[src_ptr_r2],  37(%[src_ptr])                  \n\t"
1988                     "mtlo           %[vector4a],    $ac3                            \n\t"
1989                     "extp           %[Temp5],       $ac2,           9               \n\t"
1990 
1991                     "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1992                     "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1993                     "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
1994                     "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
1995 
1996                     "lbu            %[src_ptr_l1],  -10(%[src_ptr])                 \n\t"
1997                     "lbu            %[src_ptr_0],   6(%[src_ptr])                   \n\t"
1998                     "lbu            %[src_ptr_r1],  22(%[src_ptr])                  \n\t"
1999                     "lbu            %[src_ptr_r2],  38(%[src_ptr])                  \n\t"
2000                     "mtlo           %[vector4a],    $ac1                            \n\t"
2001                     "extp           %[Temp6],       $ac3,           9               \n\t"
2002 
2003                     "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
2004                     "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
2005                     "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
2006                     "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
2007 
2008                     "lbu            %[src_ptr_l1],  -9(%[src_ptr])                  \n\t"
2009                     "lbu            %[src_ptr_0],   7(%[src_ptr])                   \n\t"
2010                     "lbu            %[src_ptr_r1],  23(%[src_ptr])                  \n\t"
2011                     "lbu            %[src_ptr_r2],  39(%[src_ptr])                  \n\t"
2012                     "mtlo           %[vector4a],    $ac3                            \n\t"
2013                     "extp           %[Temp7],       $ac1,           9               \n\t"
2014 
2015                     "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
2016                     "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
2017                     "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
2018                     "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
2019                     "extp           %[Temp8],       $ac3,           9               \n\t"
2020 
2021                     : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
2022                       [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4),
2023                       [Temp5] "=&r" (Temp5), [Temp6] "=&r" (Temp6),
2024                       [Temp7] "=&r" (Temp7), [Temp8] "=r" (Temp8),
2025                       [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
2026                       [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2)
2027                     : [vector2b] "r" (vector2b), [vector3b] "r" (vector3b),
2028                       [vector4a] "r" (vector4a), [src_ptr] "r" (src_ptr)
2029                 );
2030 
2031                 /* clamp and store results */
2032                 output_ptr[j] = cm[Temp1];
2033                 output_ptr[j + 1] = cm[Temp2];
2034                 output_ptr[j + 2] = cm[Temp3];
2035                 output_ptr[j + 3] = cm[Temp4];
2036                 output_ptr[j + 4] = cm[Temp5];
2037                 output_ptr[j + 5] = cm[Temp6];
2038                 output_ptr[j + 6] = cm[Temp7];
2039                 output_ptr[j + 7] = cm[Temp8];
2040 
2041                 src_ptr += 8;
2042             }
2043 
2044             output_ptr += output_pitch;
2045         }
2046     }
2047     else
2048     {
2049         /* 6 tap filter */
2050 
2051         /* prefetch src_ptr data to cache memory */
2052         prefetch_load(src_ptr + 16);
2053 
2054         /* unroll for loop */
2055         for (i = 16; i--;)
2056         {
2057             /* apply filter with vectors pairs */
2058             __asm__ __volatile__ (
2059                 "lbu            %[src_ptr_l2],  -32(%[src_ptr])                 \n\t"
2060                 "lbu            %[src_ptr_l1],  -16(%[src_ptr])                 \n\t"
2061                 "lbu            %[src_ptr_0],   0(%[src_ptr])                   \n\t"
2062                 "lbu            %[src_ptr_r1],  16(%[src_ptr])                  \n\t"
2063                 "lbu            %[src_ptr_r2],  32(%[src_ptr])                  \n\t"
2064                 "lbu            %[src_ptr_r3],  48(%[src_ptr])                  \n\t"
2065                 "mtlo           %[vector4a],    $ac2                            \n\t"
2066 
2067                 "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
2068                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
2069                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
2070                 "dpau.h.qbr     $ac2,           %[src_ptr_l2],  %[vector1b]     \n\t"
2071                 "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
2072                 "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
2073 
2074                 "lbu            %[src_ptr_l2],  -31(%[src_ptr])                 \n\t"
2075                 "lbu            %[src_ptr_l1],  -15(%[src_ptr])                 \n\t"
2076                 "lbu            %[src_ptr_0],   1(%[src_ptr])                   \n\t"
2077                 "lbu            %[src_ptr_r1],  17(%[src_ptr])                  \n\t"
2078                 "lbu            %[src_ptr_r2],  33(%[src_ptr])                  \n\t"
2079                 "lbu            %[src_ptr_r3],  49(%[src_ptr])                  \n\t"
2080                 "mtlo           %[vector4a],    $ac0                            \n\t"
2081                 "extp           %[Temp1],       $ac2,           9               \n\t"
2082 
2083                 "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
2084                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
2085                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
2086                 "dpau.h.qbr     $ac0,           %[src_ptr_l2],  %[vector1b]     \n\t"
2087                 "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
2088                 "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
2089 
2090                 "lbu            %[src_ptr_l2],  -30(%[src_ptr])                 \n\t"
2091                 "lbu            %[src_ptr_l1],  -14(%[src_ptr])                 \n\t"
2092                 "lbu            %[src_ptr_0],   2(%[src_ptr])                   \n\t"
2093                 "lbu            %[src_ptr_r1],  18(%[src_ptr])                  \n\t"
2094                 "lbu            %[src_ptr_r2],  34(%[src_ptr])                  \n\t"
2095                 "lbu            %[src_ptr_r3],  50(%[src_ptr])                  \n\t"
2096                 "mtlo           %[vector4a],    $ac1                            \n\t"
2097                 "extp           %[Temp2],       $ac0,           9               \n\t"
2098 
2099                 "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
2100                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
2101                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
2102                 "dpau.h.qbr     $ac1,           %[src_ptr_l2],  %[vector1b]     \n\t"
2103                 "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
2104                 "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
2105 
2106                 "lbu            %[src_ptr_l2],  -29(%[src_ptr])                 \n\t"
2107                 "lbu            %[src_ptr_l1],  -13(%[src_ptr])                 \n\t"
2108                 "lbu            %[src_ptr_0],   3(%[src_ptr])                   \n\t"
2109                 "lbu            %[src_ptr_r1],  19(%[src_ptr])                  \n\t"
2110                 "lbu            %[src_ptr_r2],  35(%[src_ptr])                  \n\t"
2111                 "lbu            %[src_ptr_r3],  51(%[src_ptr])                  \n\t"
2112                 "mtlo           %[vector4a],    $ac3                            \n\t"
2113                 "extp           %[Temp3],       $ac1,           9               \n\t"
2114 
2115                 "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
2116                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
2117                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
2118                 "dpau.h.qbr     $ac3,           %[src_ptr_l2],  %[vector1b]     \n\t"
2119                 "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
2120                 "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
2121 
2122                 "lbu            %[src_ptr_l2],  -28(%[src_ptr])                 \n\t"
2123                 "lbu            %[src_ptr_l1],  -12(%[src_ptr])                 \n\t"
2124                 "lbu            %[src_ptr_0],   4(%[src_ptr])                   \n\t"
2125                 "lbu            %[src_ptr_r1],  20(%[src_ptr])                  \n\t"
2126                 "lbu            %[src_ptr_r2],  36(%[src_ptr])                  \n\t"
2127                 "lbu            %[src_ptr_r3],  52(%[src_ptr])                  \n\t"
2128                 "mtlo           %[vector4a],    $ac2                            \n\t"
2129                 "extp           %[Temp4],       $ac3,           9               \n\t"
2130 
2131                 "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
2132                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
2133                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
2134                 "dpau.h.qbr     $ac2,           %[src_ptr_l2],  %[vector1b]     \n\t"
2135                 "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
2136                 "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
2137 
2138                 "lbu            %[src_ptr_l2],  -27(%[src_ptr])                 \n\t"
2139                 "lbu            %[src_ptr_l1],  -11(%[src_ptr])                 \n\t"
2140                 "lbu            %[src_ptr_0],   5(%[src_ptr])                   \n\t"
2141                 "lbu            %[src_ptr_r1],  21(%[src_ptr])                  \n\t"
2142                 "lbu            %[src_ptr_r2],  37(%[src_ptr])                  \n\t"
2143                 "lbu            %[src_ptr_r3],  53(%[src_ptr])                  \n\t"
2144                 "mtlo           %[vector4a],    $ac0                            \n\t"
2145                 "extp           %[Temp5],       $ac2,           9               \n\t"
2146 
2147                 "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
2148                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
2149                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
2150                 "dpau.h.qbr     $ac0,           %[src_ptr_l2],  %[vector1b]     \n\t"
2151                 "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
2152                 "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
2153 
2154                 "lbu            %[src_ptr_l2],  -26(%[src_ptr])                 \n\t"
2155                 "lbu            %[src_ptr_l1],  -10(%[src_ptr])                 \n\t"
2156                 "lbu            %[src_ptr_0],   6(%[src_ptr])                   \n\t"
2157                 "lbu            %[src_ptr_r1],  22(%[src_ptr])                  \n\t"
2158                 "lbu            %[src_ptr_r2],  38(%[src_ptr])                  \n\t"
2159                 "lbu            %[src_ptr_r3],  54(%[src_ptr])                  \n\t"
2160                 "mtlo           %[vector4a],    $ac1                            \n\t"
2161                 "extp           %[Temp6],       $ac0,           9               \n\t"
2162 
2163                 "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
2164                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
2165                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
2166                 "dpau.h.qbr     $ac1,           %[src_ptr_l2],  %[vector1b]     \n\t"
2167                 "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
2168                 "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
2169 
2170                 "lbu            %[src_ptr_l2],  -25(%[src_ptr])                 \n\t"
2171                 "lbu            %[src_ptr_l1],  -9(%[src_ptr])                  \n\t"
2172                 "lbu            %[src_ptr_0],   7(%[src_ptr])                   \n\t"
2173                 "lbu            %[src_ptr_r1],  23(%[src_ptr])                  \n\t"
2174                 "lbu            %[src_ptr_r2],  39(%[src_ptr])                  \n\t"
2175                 "lbu            %[src_ptr_r3],  55(%[src_ptr])                  \n\t"
2176                 "mtlo           %[vector4a],    $ac3                            \n\t"
2177                 "extp           %[Temp7],       $ac1,           9               \n\t"
2178 
2179                 "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
2180                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
2181                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
2182                 "dpau.h.qbr     $ac3,           %[src_ptr_l2],  %[vector1b]     \n\t"
2183                 "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
2184                 "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
2185                 "extp           %[Temp8],       $ac3,           9               \n\t"
2186 
2187                 : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
2188                   [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4),
2189                   [Temp5] "=&r" (Temp5), [Temp6] "=&r" (Temp6),
2190                   [Temp7] "=&r" (Temp7), [Temp8] "=r" (Temp8),
2191                   [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
2192                   [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2),
2193                   [src_ptr_l2] "=&r" (src_ptr_l2),[src_ptr_r3] "=&r" (src_ptr_r3)
2194                 : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
2195                   [vector3b] "r" (vector3b), [vector4a] "r" (vector4a),
2196                   [src_ptr] "r" (src_ptr)
2197             );
2198 
2199             /* clamp and store results */
2200             output_ptr[0] = cm[Temp1];
2201             output_ptr[1] = cm[Temp2];
2202             output_ptr[2] = cm[Temp3];
2203             output_ptr[3] = cm[Temp4];
2204             output_ptr[4] = cm[Temp5];
2205             output_ptr[5] = cm[Temp6];
2206             output_ptr[6] = cm[Temp7];
2207             output_ptr[7] = cm[Temp8];
2208 
2209             /* apply filter with vectors pairs */
2210             __asm__ __volatile__ (
2211                 "lbu            %[src_ptr_l2],  -24(%[src_ptr])                 \n\t"
2212                 "lbu            %[src_ptr_l1],  -8(%[src_ptr])                  \n\t"
2213                 "lbu            %[src_ptr_0],   8(%[src_ptr])                   \n\t"
2214                 "lbu            %[src_ptr_r1],  24(%[src_ptr])                  \n\t"
2215                 "lbu            %[src_ptr_r2],  40(%[src_ptr])                  \n\t"
2216                 "lbu            %[src_ptr_r3],  56(%[src_ptr])                  \n\t"
2217                 "mtlo           %[vector4a],    $ac2                            \n\t"
2218 
2219                 "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
2220                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
2221                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
2222                 "dpau.h.qbr     $ac2,           %[src_ptr_l2],  %[vector1b]     \n\t"
2223                 "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
2224                 "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
2225 
2226                 "lbu            %[src_ptr_l2],  -23(%[src_ptr])                 \n\t"
2227                 "lbu            %[src_ptr_l1],  -7(%[src_ptr])                  \n\t"
2228                 "lbu            %[src_ptr_0],   9(%[src_ptr])                   \n\t"
2229                 "lbu            %[src_ptr_r1],  25(%[src_ptr])                  \n\t"
2230                 "lbu            %[src_ptr_r2],  41(%[src_ptr])                  \n\t"
2231                 "lbu            %[src_ptr_r3],  57(%[src_ptr])                  \n\t"
2232                 "mtlo           %[vector4a],    $ac0                            \n\t"
2233                 "extp           %[Temp1],       $ac2,           9               \n\t"
2234 
2235                 "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
2236                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
2237                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
2238                 "dpau.h.qbr     $ac0,           %[src_ptr_l2],  %[vector1b]     \n\t"
2239                 "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
2240                 "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
2241 
2242                 "lbu            %[src_ptr_l2],  -22(%[src_ptr])                 \n\t"
2243                 "lbu            %[src_ptr_l1],  -6(%[src_ptr])                  \n\t"
2244                 "lbu            %[src_ptr_0],   10(%[src_ptr])                  \n\t"
2245                 "lbu            %[src_ptr_r1],  26(%[src_ptr])                  \n\t"
2246                 "lbu            %[src_ptr_r2],  42(%[src_ptr])                  \n\t"
2247                 "lbu            %[src_ptr_r3],  58(%[src_ptr])                  \n\t"
2248                 "mtlo           %[vector4a],    $ac1                            \n\t"
2249                 "extp           %[Temp2],       $ac0,           9               \n\t"
2250 
2251                 "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
2252                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
2253                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
2254                 "dpau.h.qbr     $ac1,           %[src_ptr_l2],  %[vector1b]     \n\t"
2255                 "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
2256                 "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
2257 
2258                 "lbu            %[src_ptr_l2],  -21(%[src_ptr])                 \n\t"
2259                 "lbu            %[src_ptr_l1],  -5(%[src_ptr])                  \n\t"
2260                 "lbu            %[src_ptr_0],   11(%[src_ptr])                  \n\t"
2261                 "lbu            %[src_ptr_r1],  27(%[src_ptr])                  \n\t"
2262                 "lbu            %[src_ptr_r2],  43(%[src_ptr])                  \n\t"
2263                 "lbu            %[src_ptr_r3],  59(%[src_ptr])                  \n\t"
2264                 "mtlo           %[vector4a],    $ac3                            \n\t"
2265                 "extp           %[Temp3],       $ac1,           9               \n\t"
2266 
2267                 "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
2268                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
2269                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
2270                 "dpau.h.qbr     $ac3,           %[src_ptr_l2],  %[vector1b]     \n\t"
2271                 "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
2272                 "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
2273 
2274                 "lbu            %[src_ptr_l2],  -20(%[src_ptr])                 \n\t"
2275                 "lbu            %[src_ptr_l1],  -4(%[src_ptr])                  \n\t"
2276                 "lbu            %[src_ptr_0],   12(%[src_ptr])                  \n\t"
2277                 "lbu            %[src_ptr_r1],  28(%[src_ptr])                  \n\t"
2278                 "lbu            %[src_ptr_r2],  44(%[src_ptr])                  \n\t"
2279                 "lbu            %[src_ptr_r3],  60(%[src_ptr])                  \n\t"
2280                 "mtlo           %[vector4a],    $ac2                            \n\t"
2281                 "extp           %[Temp4],       $ac3,           9               \n\t"
2282 
2283                 "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
2284                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
2285                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
2286                 "dpau.h.qbr     $ac2,           %[src_ptr_l2],  %[vector1b]     \n\t"
2287                 "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
2288                 "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
2289 
2290                 "lbu            %[src_ptr_l2],  -19(%[src_ptr])                 \n\t"
2291                 "lbu            %[src_ptr_l1],  -3(%[src_ptr])                  \n\t"
2292                 "lbu            %[src_ptr_0],   13(%[src_ptr])                  \n\t"
2293                 "lbu            %[src_ptr_r1],  29(%[src_ptr])                  \n\t"
2294                 "lbu            %[src_ptr_r2],  45(%[src_ptr])                  \n\t"
2295                 "lbu            %[src_ptr_r3],  61(%[src_ptr])                  \n\t"
2296                 "mtlo           %[vector4a],    $ac0                            \n\t"
2297                 "extp           %[Temp5],       $ac2,           9               \n\t"
2298 
2299                 "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
2300                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
2301                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
2302                 "dpau.h.qbr     $ac0,           %[src_ptr_l2],  %[vector1b]     \n\t"
2303                 "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
2304                 "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
2305 
2306                 "lbu            %[src_ptr_l2],  -18(%[src_ptr])                 \n\t"
2307                 "lbu            %[src_ptr_l1],  -2(%[src_ptr])                  \n\t"
2308                 "lbu            %[src_ptr_0],   14(%[src_ptr])                  \n\t"
2309                 "lbu            %[src_ptr_r1],  30(%[src_ptr])                  \n\t"
2310                 "lbu            %[src_ptr_r2],  46(%[src_ptr])                  \n\t"
2311                 "lbu            %[src_ptr_r3],  62(%[src_ptr])                  \n\t"
2312                 "mtlo           %[vector4a],    $ac1                            \n\t"
2313                 "extp           %[Temp6],       $ac0,           9               \n\t"
2314 
2315                 "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
2316                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
2317                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
2318                 "dpau.h.qbr     $ac1,           %[src_ptr_l2],  %[vector1b]     \n\t"
2319                 "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
2320                 "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
2321 
2322                 "lbu            %[src_ptr_l2],  -17(%[src_ptr])                 \n\t"
2323                 "lbu            %[src_ptr_l1],  -1(%[src_ptr])                  \n\t"
2324                 "lbu            %[src_ptr_0],   15(%[src_ptr])                  \n\t"
2325                 "lbu            %[src_ptr_r1],  31(%[src_ptr])                  \n\t"
2326                 "lbu            %[src_ptr_r2],  47(%[src_ptr])                  \n\t"
2327                 "lbu            %[src_ptr_r3],  63(%[src_ptr])                  \n\t"
2328                 "mtlo           %[vector4a],    $ac3                            \n\t"
2329                 "extp           %[Temp7],       $ac1,           9               \n\t"
2330 
2331                 "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
2332                 "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
2333                 "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
2334                 "dpau.h.qbr     $ac3,           %[src_ptr_l2],  %[vector1b]     \n\t"
2335                 "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
2336                 "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
2337                 "extp           %[Temp8],       $ac3,           9               \n\t"
2338 
2339                 : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
2340                   [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4),
2341                   [Temp5] "=&r" (Temp5), [Temp6] "=&r" (Temp6),
2342                   [Temp7] "=&r" (Temp7), [Temp8] "=r" (Temp8),
2343                   [src_ptr_l1] "=&r" (src_ptr_l1), [src_ptr_0] "=&r" (src_ptr_0),
2344                   [src_ptr_r1] "=&r" (src_ptr_r1), [src_ptr_r2] "=&r" (src_ptr_r2),
2345                   [src_ptr_l2] "=&r" (src_ptr_l2), [src_ptr_r3] "=&r" (src_ptr_r3)
2346                 : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
2347                   [vector3b] "r" (vector3b), [vector4a] "r" (vector4a),
2348                   [src_ptr] "r" (src_ptr)
2349             );
2350 
2351             src_ptr += 16;
2352             output_ptr[8] = cm[Temp1];
2353             output_ptr[9] = cm[Temp2];
2354             output_ptr[10] = cm[Temp3];
2355             output_ptr[11] = cm[Temp4];
2356             output_ptr[12] = cm[Temp5];
2357             output_ptr[13] = cm[Temp6];
2358             output_ptr[14] = cm[Temp7];
2359             output_ptr[15] = cm[Temp8];
2360 
2361             output_ptr += output_pitch;
2362         }
2363     }
2364 }
2365 
2366 
/*
 * Six-tap sub-pixel prediction of a 4x4 block.
 *
 * src_ptr              top-left source pixel of the block
 * src_pixels_per_line  source stride in bytes
 * xoffset              horizontal filter selector, forwarded to the first pass
 * yoffset              vertical filter selector; 0 skips the second pass
 * dst_ptr              destination block
 * dst_pitch            destination stride in bytes
 *
 * When yoffset is zero only the first (horizontal) pass runs and it writes
 * straight into dst_ptr. Otherwise the first pass produces 9 rows of 4 bytes
 * in a local buffer, starting two source rows above the block, and the
 * second (vertical) pass turns them into the 4x4 result.
 */
void vp8_sixtap_predict4x4_dspr2
(
    unsigned char *RESTRICT src_ptr,
    int   src_pixels_per_line,
    int  xoffset,
    int  yoffset,
    unsigned char *RESTRICT dst_ptr,
    int dst_pitch
)
{
    unsigned char intermediate[9 * 4]; /* first-pass output: 9 rows x 4 bytes */
    unsigned int extract_pos = 16;

    /* bit position for extract from acc */
    __asm__ __volatile__ (
        "wrdsp      %[pos],     1           \n\t"
        :
        : [pos] "r" (extract_pos)
    );

    if (yoffset == 0)
    {
        /* no vertical filtering: the first pass stores directly to dst_ptr */
        vp8_filter_block2d_first_pass_4(src_ptr, dst_ptr, src_pixels_per_line,
                                        4, xoffset, dst_pitch);
        return;
    }

    /* horizontal pass over 9 rows, beginning two rows above the block */
    vp8_filter_block2d_first_pass_4(src_ptr - (2 * src_pixels_per_line),
                                    intermediate, src_pixels_per_line,
                                    9, xoffset, 4);

    /* vertical pass, starting at the third intermediate row (2 rows * 4 bytes) */
    vp8_filter_block2d_second_pass4(intermediate + 8, dst_ptr, dst_pitch,
                                    yoffset);
}
2400 
2401 
/*
 * Copy `rows` rows of 8 bytes from src (stride src_stride) to dst
 * (stride dst_stride).
 *
 * Each row is read with two unaligned word loads (ulw) and written with two
 * aligned word stores (sw), exactly as the previous unrolled code did.
 * The asm only reads its pointer operands; all pointer arithmetic is done in
 * C so that no input-only operand is modified, and the "memory" clobber tells
 * the compiler that the destination buffer has been written.
 *
 * NOTE(review): sw requires every destination row to be 4-byte aligned, so
 * dst and dst_stride are assumed to be multiples of 4 - confirm for callers.
 */
static void sixtap_copy_rows8_dspr2
(
    const unsigned char *src,
    int  src_stride,
    unsigned char *dst,
    int  dst_stride,
    int  rows
)
{
    unsigned int Temp1, Temp2;
    int i;

    for (i = 0; i < rows; i++)
    {
        const unsigned char *src_row = src + i * src_stride;
        unsigned char *dst_row = dst + i * dst_stride;

        __asm__ __volatile__ (
            "ulw    %[Temp1],   0(%[src_row])                           \n\t"
            "ulw    %[Temp2],   4(%[src_row])                           \n\t"
            "sw     %[Temp1],   0(%[dst_row])                           \n\t"
            "sw     %[Temp2],   4(%[dst_row])                           \n\t"

            : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2)
            : [src_row] "r" (src_row), [dst_row] "r" (dst_row)
            : "memory"
        );
    }
}


/*
 * Six-tap sub-pixel prediction of an 8x8 block.
 *
 * src_ptr              top-left source pixel of the block
 * src_pixels_per_line  source stride in bytes
 * xoffset              horizontal filter selector; 0 skips the horizontal pass
 * yoffset              vertical filter selector; 0 skips the vertical pass
 * dst_ptr              destination block
 * dst_pitch            destination stride in bytes
 *
 * With a non-zero yoffset, 13 rows of 8 bytes (starting two source rows above
 * the block) are gathered into a local buffer, either filtered horizontally
 * or copied verbatim when xoffset is zero, and then filtered vertically into
 * dst_ptr. With a zero yoffset the block is filtered horizontally, or simply
 * copied, directly into dst_ptr using dst_pitch.
 */
void vp8_sixtap_predict8x8_dspr2
(
    unsigned char   *RESTRICT src_ptr,
    int  src_pixels_per_line,
    int  xoffset,
    int  yoffset,
    unsigned char *RESTRICT dst_ptr,
    int  dst_pitch
)
{
    /* NOTE(review): written with word stores when xoffset == 0; relies on the
     * compiler placing this array on a 4-byte boundary - confirm. */
    unsigned char FData[13 * 8]; /* Temp data buffer used in filtering */
    unsigned int pos = 16;

    /* bit position for extract from acc */
    __asm__ __volatile__ (
        "wrdsp      %[pos],     1               \n\t"
        :
        : [pos] "r" (pos)
    );

    if (yoffset)
    {
        /* the vertical pass needs rows starting two lines above the block */
        src_ptr = src_ptr - (2 * src_pixels_per_line);

        if (xoffset)
        {
            /* filter 1-D horizontally... */
            vp8_filter_block2d_first_pass_8_all(src_ptr, FData, src_pixels_per_line,
                                                13, xoffset, 8);
        }
        else
        {
            /* prefetch src_ptr data to cache memory */
            prefetch_load(src_ptr + 2 * src_pixels_per_line);

            /* no horizontal filtering: gather the 13 source rows unchanged */
            sixtap_copy_rows8_dspr2(src_ptr, src_pixels_per_line, FData, 8, 13);
        }

        /* filter vertically, starting at the third buffered row (2 * 8 bytes) */
        vp8_filter_block2d_second_pass_8(FData + 16, dst_ptr, dst_pitch, 8, 8, yoffset);
    }
    else
    {
        /* yoffset == 0: the result goes straight to dst_ptr */
        if (xoffset)
        {
            vp8_filter_block2d_first_pass_8_all(src_ptr, dst_ptr, src_pixels_per_line,
                                                8, xoffset, dst_pitch);
        }
        else
        {
            /* plain copy from src buffer to dst buffer, honoring dst_pitch */
            sixtap_copy_rows8_dspr2(src_ptr, src_pixels_per_line,
                                    dst_ptr, dst_pitch, 8);
        }
    }
}
2593 
2594 
vp8_sixtap_predict8x4_dspr2(unsigned char * RESTRICT src_ptr,int src_pixels_per_line,int xoffset,int yoffset,unsigned char * RESTRICT dst_ptr,int dst_pitch)2595 void vp8_sixtap_predict8x4_dspr2
2596 (
2597     unsigned char   *RESTRICT src_ptr,
2598     int  src_pixels_per_line,
2599     int  xoffset,
2600     int  yoffset,
2601     unsigned char *RESTRICT dst_ptr,
2602     int  dst_pitch
2603 )
2604 {
2605     unsigned char FData[9 * 8]; /* Temp data bufffer used in filtering */
2606     unsigned int pos, Temp1, Temp2;
2607 
2608     pos = 16;
2609 
2610     /* bit positon for extract from acc */
2611     __asm__ __volatile__ (
2612         "wrdsp      %[pos],     1           \n\t"
2613         :
2614         : [pos] "r" (pos)
2615     );
2616 
2617     if (yoffset)
2618     {
2619 
2620         src_ptr = src_ptr - (2 * src_pixels_per_line);
2621 
2622         if (xoffset)
2623             /* filter 1-D horizontally... */
2624             vp8_filter_block2d_first_pass_8_all(src_ptr, FData, src_pixels_per_line,
2625                                                 9, xoffset, 8);
2626 
2627         else
2628         {
2629             /* prefetch src_ptr data to cache memory */
2630             prefetch_load(src_ptr + 2 * src_pixels_per_line);
2631 
2632             __asm__ __volatile__ (
2633                 "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
2634                 "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
2635                 "sw     %[Temp1],   0(%[FData])                             \n\t"
2636                 "sw     %[Temp2],   4(%[FData])                             \n\t"
2637                 "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
2638 
2639                 "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
2640                 "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
2641                 "sw     %[Temp1],   8(%[FData])                             \n\t"
2642                 "sw     %[Temp2],   12(%[FData])                            \n\t"
2643                 "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
2644 
2645                 "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
2646                 "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
2647                 "sw     %[Temp1],   16(%[FData])                            \n\t"
2648                 "sw     %[Temp2],   20(%[FData])                            \n\t"
2649                 "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
2650 
2651                 "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
2652                 "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
2653                 "sw     %[Temp1],   24(%[FData])                            \n\t"
2654                 "sw     %[Temp2],   28(%[FData])                            \n\t"
2655                 "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
2656 
2657                 "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
2658                 "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
2659                 "sw     %[Temp1],   32(%[FData])                            \n\t"
2660                 "sw     %[Temp2],   36(%[FData])                            \n\t"
2661                 "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
2662 
2663                 "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
2664                 "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
2665                 "sw     %[Temp1],   40(%[FData])                            \n\t"
2666                 "sw     %[Temp2],   44(%[FData])                            \n\t"
2667                 "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
2668 
2669                 "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
2670                 "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
2671                 "sw     %[Temp1],   48(%[FData])                            \n\t"
2672                 "sw     %[Temp2],   52(%[FData])                            \n\t"
2673                 "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
2674 
2675                 "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
2676                 "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
2677                 "sw     %[Temp1],   56(%[FData])                            \n\t"
2678                 "sw     %[Temp2],   60(%[FData])                            \n\t"
2679                 "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
2680 
2681                 "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
2682                 "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
2683                 "sw     %[Temp1],   64(%[FData])                            \n\t"
2684                 "sw     %[Temp2],   68(%[FData])                            \n\t"
2685 
2686                 : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2)
2687                 : [FData] "r" (FData), [src_ptr] "r" (src_ptr),
2688                   [src_pixels_per_line] "r" (src_pixels_per_line)
2689             );
2690         }
2691 
2692         /* filter verticaly... */
2693         vp8_filter_block2d_second_pass_8(FData + 16, dst_ptr, dst_pitch, 4, 8, yoffset);
2694     }
2695 
2696     /* if (yoffsset == 0) vp8_filter_block2d_first_pass save data to dst_ptr */
2697     else
2698     {
2699         if (xoffset)
2700             vp8_filter_block2d_first_pass_8_all(src_ptr, dst_ptr, src_pixels_per_line,
2701                                                 4, xoffset, dst_pitch);
2702 
2703         else
2704         {
2705             /* copy from src buffer to dst buffer */
2706             __asm__ __volatile__ (
2707                 "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
2708                 "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
2709                 "sw     %[Temp1],   0(%[dst_ptr])                           \n\t"
2710                 "sw     %[Temp2],   4(%[dst_ptr])                           \n\t"
2711                 "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
2712 
2713                 "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
2714                 "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
2715                 "sw     %[Temp1],   8(%[dst_ptr])                           \n\t"
2716                 "sw     %[Temp2],   12(%[dst_ptr])                          \n\t"
2717                 "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
2718 
2719                 "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
2720                 "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
2721                 "sw     %[Temp1],   16(%[dst_ptr])                          \n\t"
2722                 "sw     %[Temp2],   20(%[dst_ptr])                          \n\t"
2723                 "addu   %[src_ptr], %[src_ptr],    %[src_pixels_per_line]   \n\t"
2724 
2725                 "ulw    %[Temp1],   0(%[src_ptr])                           \n\t"
2726                 "ulw    %[Temp2],   4(%[src_ptr])                           \n\t"
2727                 "sw     %[Temp1],   24(%[dst_ptr])                          \n\t"
2728                 "sw     %[Temp2],   28(%[dst_ptr])                          \n\t"
2729 
2730                 : [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2)
2731                 : [dst_ptr] "r" (dst_ptr), [src_ptr] "r" (src_ptr),
2732                   [src_pixels_per_line] "r" (src_pixels_per_line)
2733             );
2734         }
2735     }
2736 }
2737 
2738 
/*
 * 16x16 sub-pixel prediction for the DSPr2 build.
 *
 *   src_ptr             - first source pixel of the block
 *   src_pixels_per_line - source stride, in bytes
 *   xoffset             - horizontal sub-pixel position (cases 0..7 handled)
 *   yoffset             - vertical sub-pixel position; also indexes
 *                         sub_pel_filterss[] to select the vertical taps
 *   dst_ptr             - destination block
 *   dst_pitch           - destination stride, in bytes
 *
 * When yoffset is non-zero the horizontal pass writes 21 rows of 16 bytes
 * into a temporary buffer (starting two source rows above the block) and the
 * vertical pass then produces the 16 output rows from it.  When yoffset is
 * zero the horizontal pass writes straight to dst_ptr; if xoffset is zero as
 * well the block is simply copied, matching what the smaller block-size
 * predictors in this file do for that case.
 */
void vp8_sixtap_predict16x16_dspr2
(
    unsigned char   *RESTRICT src_ptr,
    int  src_pixels_per_line,
    int  xoffset,
    int  yoffset,
    unsigned char *RESTRICT dst_ptr,
    int  dst_pitch
)
{
    const unsigned short *VFilter;
    unsigned char FData[21 * 16]; /* Temp data buffer used in filtering */
    unsigned int pos;
    int row;
    int col;

    VFilter = sub_pel_filterss[yoffset];

    pos = 16;

    /* bit position for extract from acc; must be set before any of the
     * filter helpers below run */
    __asm__ __volatile__ (
        "wrdsp      %[pos],     1           \n\t"
        :
        : [pos] "r" (pos)
    );

    if (yoffset)
    {
        /* start two rows above the block so the vertical pass has the
         * extra rows it needs (21 rows are produced in total) */
        src_ptr = src_ptr - (2 * src_pixels_per_line);

        switch (xoffset)
        {
            /* filter 1-D horizontally... */
        case 2:
        case 4:
        case 6:
            /* 6 tap filter */
            vp8_filter_block2d_first_pass16_6tap(src_ptr, FData, src_pixels_per_line,
                                                 21, xoffset, 16);
            break;

        case 0:
            /* only copy buffer */
            vp8_filter_block2d_first_pass16_0(src_ptr, FData, src_pixels_per_line);
            break;

        case 1:
        case 3:
        case 5:
        case 7:
            /* 4 tap filter */
            vp8_filter_block2d_first_pass16_4tap(src_ptr, FData, src_pixels_per_line, 16,
                                                 21, xoffset, yoffset, dst_ptr, dst_pitch);
            break;
        }

        /* filter vertically...  FData + 32 skips the two extra leading rows
         * (2 * 16 bytes) */
        vp8_filter_block2d_second_pass161(FData + 32, dst_ptr, dst_pitch, VFilter);
    }
    else
    {
        /* if (yoffset == 0) the first pass saves its data directly to dst_ptr */
        switch (xoffset)
        {
        case 2:
        case 4:
        case 6:
            /* 6 tap filter */
            vp8_filter_block2d_first_pass16_6tap(src_ptr, dst_ptr, src_pixels_per_line,
                                                 16, xoffset, dst_pitch);
            break;

        case 0:
            /* no filtering in either direction: copy the 16x16 block so that
             * dst_ptr is never left unwritten */
            for (row = 0; row < 16; ++row)
            {
                for (col = 0; col < 16; ++col)
                    dst_ptr[col] = src_ptr[col];

                src_ptr += src_pixels_per_line;
                dst_ptr += dst_pitch;
            }
            break;

        case 1:
        case 3:
        case 5:
        case 7:
            /* 4 tap filter */
            /* NOTE(review): 21 is passed as the row count here even though
             * only 16 rows are wanted; presumably the helper uses yoffset,
             * dst_ptr and dst_pitch to write exactly 16 rows when
             * yoffset == 0 - confirm against the helper. */
            vp8_filter_block2d_first_pass16_4tap(src_ptr, dst_ptr, src_pixels_per_line, 16,
                                                 21, xoffset, yoffset, dst_ptr, dst_pitch);
            break;
        }
    }
}
2822 
2823 #endif
2824