1 /*
2  *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <stdlib.h>
12 #include "vp8_rtcd.h"
13 #include "vpx_ports/mem.h"
14 
15 #if HAVE_DSPR2
/* Number of guard entries placed on each side of the 256-entry identity
 * section of ff_cropTbl. */
#define CROP_WIDTH 256
/*
 * Clamping lookup table, filled by dsputil_static_init(): CROP_WIDTH zeros,
 * then the values 0..255, then CROP_WIDTH entries of 255.  The filter
 * routines in this file access it through a pointer biased by CROP_WIDTH
 * (cm = ff_cropTbl + CROP_WIDTH), so an index in the range
 * -CROP_WIDTH .. 255 + CROP_WIDTH yields the index clamped to 0..255.
 */
unsigned char ff_cropTbl[256 + 2 * CROP_WIDTH];
18 
/*
 * Eight rows (one per sub-pixel offset 0..7) of three 16-bit words.
 * Not referenced in the visible part of this file.
 * NOTE(review): each word looks like two packed 8-bit tap magnitudes
 * (e.g. 0x7b0c = 123, 12) -- confirm against the code that consumes it.
 */
static const unsigned short sub_pel_filterss[8][3] = {
  { 0, 0, 0 },
  { 0, 0x0601, 0x7b0c },
  { 0x0201, 0x0b08, 0x6c24 },
  { 0, 0x0906, 0x5d32 },
  { 0x0303, 0x1010, 0x4d4d },
  { 0, 0x0609, 0x325d },
  { 0x0102, 0x080b, 0x246c },
  { 0, 0x0106, 0x0c7b },
};
29 
/*
 * Eight rows (one per sub-pixel offset 0..7) of three 32-bit words.
 * Every word is the corresponding word of sub_pel_filters_inv with its upper
 * and lower 16-bit halves exchanged (e.g. 0x0000fffa here vs. 0xfffa0000
 * there).  Not referenced in the visible part of this file.
 */
static const int sub_pel_filters_int[8][3] = {
  { 0, 0, 0 },
  { 0x0000fffa, 0x007b000c, 0xffff0000 },
  { 0x0002fff5, 0x006c0024, 0xfff80001 },
  { 0x0000fff7, 0x005d0032, 0xfffa0000 },
  { 0x0003fff0, 0x004d004d, 0xfff00003 },
  { 0x0000fffa, 0x0032005d, 0xfff70000 },
  { 0x0001fff8, 0x0024006c, 0xfff50002 },
  { 0x0000ffff, 0x000c007b, 0xfffa0000 },
};
40 
/*
 * Coefficient table used by the first-pass filters in this file, indexed by
 * xoffset (0..7).  The three words of a row are loaded into
 * vector1b/vector2b/vector3b and used as the coefficient operand of
 * dpa.w.ph; each 32-bit word presumably carries two signed 16-bit taps
 * (e.g. 0xfffa0000 = {-6, 0}) -- TODO confirm against the MIPS DSP ASE
 * description of dpa.w.ph.
 *
 * Row 0 is all zero; vp8_filter_block2d_first_pass_4() treats a zero third
 * word as "no filtering".  The first-pass routines compare the third word
 * against 65536: rows 2, 4 and 6 have a non-zero upper halfword there and
 * take the 6-tap path, while rows 1, 3, 5 and 7 do not and are filtered with
 * the two-word rows of sub_pel_filters_inv_tap_4 instead.
 */
static const int sub_pel_filters_inv[8][3] = {
  { 0, 0, 0 },
  { 0xfffa0000, 0x000c007b, 0x0000ffff },
  { 0xfff50002, 0x0024006c, 0x0001fff8 },
  { 0xfff70000, 0x0032005d, 0x0000fffa },
  { 0xfff00003, 0x004d004d, 0x0003fff0 },
  { 0xfffa0000, 0x005d0032, 0x0000fff7 },
  { 0xfff80001, 0x006c0024, 0x0002fff5 },
  { 0xffff0000, 0x007b000c, 0x0000fffa },
};
51 
/* clang-format off */
/*
 * Two-word coefficient rows, populated only for the odd sub-pixel offsets
 * (1, 3, 5, 7); even rows are zero.  Every word is the corresponding word of
 * sub_pel_filters_inv_tap_4 with its upper and lower 16-bit halves
 * exchanged.  Not referenced in the visible part of this file.
 */
static const int sub_pel_filters_int_tap_4[8][2] = {
  {          0,          0},
  { 0xfffa007b, 0x000cffff},
  {          0,          0},
  { 0xfff7005d, 0x0032fffa},
  {          0,          0},
  { 0xfffa0032, 0x005dfff7},
  {          0,          0},
  { 0xffff000c, 0x007bfffa},
};
63 
64 
/*
 * Two-word coefficient rows used by the 4-tap paths of the first-pass
 * filters, indexed by xoffset.  Only the odd rows (1, 3, 5, 7) are populated;
 * those are exactly the offsets for which the 6-tap test on
 * sub_pel_filters_inv fails.  Each populated row holds the four non-zero
 * 16-bit halves of the matching sub_pel_filters_inv row packed into two
 * words (row 1: -6, 123, 12, -1).
 */
static const int sub_pel_filters_inv_tap_4[8][2] = {
  {          0,          0},
  { 0x007bfffa, 0xffff000c},
  {          0,          0},
  { 0x005dfff7, 0xfffa0032},
  {          0,          0},
  { 0x0032fffa, 0xfff7005d},
  {          0,          0},
  { 0x000cffff, 0xfffa007b},
};
/* clang-format on */
76 
/*
 * Issue a MIPS `pref` instruction with hint value 0 for the address in src
 * (presumably the "load" prefetch hint, as the function name suggests --
 * confirm against the MIPS32 ISA manual).  Nothing is read or written by the
 * C code itself.
 *
 * NOTE(review): declared plain `inline` (neither static nor extern); whether
 * an out-of-line definition is emitted depends on the inline semantics the
 * build selects (gnu89 vs. C99) -- confirm before changing the linkage.
 */
inline void prefetch_load(unsigned char *src) {
  __asm__ __volatile__("pref   0,  0(%[src])   \n\t" : : [src] "r"(src));
}
80 
/*
 * Issue a MIPS `pref` instruction with hint value 1 for the address in dst
 * (presumably the "store" prefetch hint, as the function name suggests --
 * confirm against the MIPS32 ISA manual).  Not called in the visible part of
 * this file.
 *
 * NOTE(review): declared plain `inline` (neither static nor extern); whether
 * an out-of-line definition is emitted depends on the inline semantics the
 * build selects (gnu89 vs. C99) -- confirm before changing the linkage.
 */
inline void prefetch_store(unsigned char *dst) {
  __asm__ __volatile__("pref   1,  0(%[dst])   \n\t" : : [dst] "r"(dst));
}
84 
dsputil_static_init(void)85 void dsputil_static_init(void) {
86   int i;
87 
88   for (i = 0; i < 256; ++i) ff_cropTbl[i + CROP_WIDTH] = i;
89 
90   for (i = 0; i < CROP_WIDTH; ++i) {
91     ff_cropTbl[i] = 0;
92     ff_cropTbl[i + CROP_WIDTH + 256] = 255;
93   }
94 }
95 
/*
 * First-pass (horizontal) sub-pixel filter for a block 4 pixels wide.
 *
 * src_ptr              first source pixel of the first row.  The 6-tap path
 *                      loads bytes src_ptr[-2] .. src_ptr[6] of every row,
 *                      the 4-tap path src_ptr[-1] .. src_ptr[6].
 * dst_ptr              destination; 4 bytes are written per row.
 * src_pixels_per_line  distance in bytes between source rows.
 * output_height        number of rows to produce.
 * xoffset              row index into sub_pel_filters_inv /
 *                      sub_pel_filters_inv_tap_4 (the tables have 8 rows);
 *                      0 selects a plain copy.
 * pitch                distance in bytes between destination rows.
 *
 * Each accumulator is seeded with 64 before the taps are accumulated (the
 * taps of every table row sum to 128), the result is taken with `extp` and
 * clamped to 0..255 through ff_cropTbl.
 * NOTE(review): `extp` depends on the DSPControl pos field, which is not set
 * here -- presumably the caller configures it; confirm.
 */
void vp8_filter_block2d_first_pass_4(unsigned char *RESTRICT src_ptr,
                                     unsigned char *RESTRICT dst_ptr,
                                     unsigned int src_pixels_per_line,
                                     unsigned int output_height, int xoffset,
                                     int pitch) {
  unsigned int i;
  int Temp1, Temp2, Temp3, Temp4;

  unsigned int vector4a = 64;
  int vector1b, vector2b, vector3b;
  unsigned int tp1, tp2, tn1, tn2;
  unsigned int p1, p2, p3;
  unsigned int n1, n2, n3;
  unsigned char *cm = ff_cropTbl + CROP_WIDTH;

  vector3b = sub_pel_filters_inv[xoffset][2];

  /* if (xoffset == 0) we don't need any filtering */
  if (vector3b == 0) {
    for (i = 0; i < output_height; ++i) {
      /* prefetch src_ptr data to cache memory */
      prefetch_load(src_ptr + src_pixels_per_line);
      dst_ptr[0] = src_ptr[0];
      dst_ptr[1] = src_ptr[1];
      dst_ptr[2] = src_ptr[2];
      dst_ptr[3] = src_ptr[3];

      /* next row...  Advance by pitch, exactly like the filtering paths
       * below, so the destination layout does not depend on xoffset. */
      src_ptr += src_pixels_per_line;
      dst_ptr += pitch;
    }
  } else {
    if (vector3b > 65536) {
      /* 6 tap filter */

      vector1b = sub_pel_filters_inv[xoffset][0];
      vector2b = sub_pel_filters_inv[xoffset][1];

      /* prefetch src_ptr data to cache memory (issued once, for the second
       * row only) */
      prefetch_load(src_ptr + src_pixels_per_line);

      for (i = output_height; i--;) {
        /* apply filter with vectors pairs */
        __asm__ __volatile__(
            "ulw              %[tp1],      -2(%[src_ptr])                 \n\t"
            "ulw              %[tp2],      2(%[src_ptr])                  \n\t"

            /* even 1. pixel */
            "mtlo             %[vector4a], $ac3                           \n\t"
            "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
            "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
            "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
            "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
            "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
            "dpa.w.ph         $ac3,        %[p3],          %[vector3b]    \n\t"

            /* even 2. pixel */
            "mtlo             %[vector4a], $ac2                           \n\t"
            "preceu.ph.qbl    %[p1],       %[tp2]                         \n\t"
            "balign           %[tp2],      %[tp1],         3              \n\t"
            "extp             %[Temp1],    $ac3,           9              \n\t"
            "dpa.w.ph         $ac2,        %[p2],          %[vector1b]    \n\t"
            "dpa.w.ph         $ac2,        %[p3],          %[vector2b]    \n\t"
            "dpa.w.ph         $ac2,        %[p1],          %[vector3b]    \n\t"

            /* odd 1. pixel */
            "ulw              %[tn2],      3(%[src_ptr])                  \n\t"
            "mtlo             %[vector4a], $ac3                           \n\t"
            "preceu.ph.qbr    %[n1],       %[tp2]                         \n\t"
            "preceu.ph.qbl    %[n2],       %[tp2]                         \n\t"
            "preceu.ph.qbr    %[n3],       %[tn2]                         \n\t"
            "extp             %[Temp3],    $ac2,           9              \n\t"
            "dpa.w.ph         $ac3,        %[n1],          %[vector1b]    \n\t"
            "dpa.w.ph         $ac3,        %[n2],          %[vector2b]    \n\t"
            "dpa.w.ph         $ac3,        %[n3],          %[vector3b]    \n\t"

            /* odd 2. pixel */
            "mtlo             %[vector4a], $ac2                           \n\t"
            "preceu.ph.qbl    %[n1],       %[tn2]                         \n\t"
            "extp             %[Temp2],    $ac3,           9              \n\t"
            "dpa.w.ph         $ac2,        %[n2],          %[vector1b]    \n\t"
            "dpa.w.ph         $ac2,        %[n3],          %[vector2b]    \n\t"
            "dpa.w.ph         $ac2,        %[n1],          %[vector3b]    \n\t"
            "extp             %[Temp4],    $ac2,           9              \n\t"

            /* clamp */
            "lbux             %[tp1],      %[Temp1](%[cm])                \n\t"
            "lbux             %[tn1],      %[Temp2](%[cm])                \n\t"
            "lbux             %[tp2],      %[Temp3](%[cm])                \n\t"
            "lbux             %[n2],       %[Temp4](%[cm])                \n\t"

            /* store bytes */
            "sb               %[tp1],      0(%[dst_ptr])                  \n\t"
            "sb               %[tn1],      1(%[dst_ptr])                  \n\t"
            "sb               %[tp2],      2(%[dst_ptr])                  \n\t"
            "sb               %[n2],       3(%[dst_ptr])                  \n\t"

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
              [tn2] "=&r"(tn2), [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3),
              [n1] "=&r"(n1), [n2] "=&r"(n2), [n3] "=&r"(n3),
              [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
              [Temp4] "=&r"(Temp4)
            : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
              [vector4a] "r"(vector4a), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr),
              [vector3b] "r"(vector3b), [src_ptr] "r"(src_ptr)
            /* the sb instructions write dst_ptr[0..3] behind the compiler's
             * back, so the memory side effect has to be declared */
            : "memory");

        /* Next row... */
        src_ptr += src_pixels_per_line;
        dst_ptr += pitch;
      }
    } else {
      /* 4 tap filter */

      vector1b = sub_pel_filters_inv_tap_4[xoffset][0];
      vector2b = sub_pel_filters_inv_tap_4[xoffset][1];

      for (i = output_height; i--;) {
        /* apply filter with vectors pairs */
        __asm__ __volatile__(
            "ulw              %[tp1],      -1(%[src_ptr])                 \n\t"
            "ulw              %[tp2],      3(%[src_ptr])                  \n\t"

            /* even 1. pixel */
            "mtlo             %[vector4a], $ac3                           \n\t"
            "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
            "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
            "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
            "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
            "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"

            /* even 2. pixel */
            "mtlo             %[vector4a], $ac2                           \n\t"
            "dpa.w.ph         $ac2,        %[p2],          %[vector1b]    \n\t"
            "dpa.w.ph         $ac2,        %[p3],          %[vector2b]    \n\t"
            "extp             %[Temp1],    $ac3,           9              \n\t"

            /* odd 1. pixel */
            "srl              %[tn1],      %[tp2],         8              \n\t"
            "balign           %[tp2],      %[tp1],         3              \n\t"
            "mtlo             %[vector4a], $ac3                           \n\t"
            "preceu.ph.qbr    %[n1],       %[tp2]                         \n\t"
            "preceu.ph.qbl    %[n2],       %[tp2]                         \n\t"
            "preceu.ph.qbr    %[n3],       %[tn1]                         \n\t"
            "extp             %[Temp3],    $ac2,           9              \n\t"
            "dpa.w.ph         $ac3,        %[n1],          %[vector1b]    \n\t"
            "dpa.w.ph         $ac3,        %[n2],          %[vector2b]    \n\t"

            /* odd 2. pixel */
            "mtlo             %[vector4a], $ac2                           \n\t"
            "extp             %[Temp2],    $ac3,           9              \n\t"
            "dpa.w.ph         $ac2,        %[n2],          %[vector1b]    \n\t"
            "dpa.w.ph         $ac2,        %[n3],          %[vector2b]    \n\t"
            "extp             %[Temp4],    $ac2,           9              \n\t"

            /* clamp and store results */
            "lbux             %[tp1],      %[Temp1](%[cm])                \n\t"
            "lbux             %[tn1],      %[Temp2](%[cm])                \n\t"
            "lbux             %[tp2],      %[Temp3](%[cm])                \n\t"
            "sb               %[tp1],      0(%[dst_ptr])                  \n\t"
            "sb               %[tn1],      1(%[dst_ptr])                  \n\t"
            "lbux             %[n2],       %[Temp4](%[cm])                \n\t"
            "sb               %[tp2],      2(%[dst_ptr])                  \n\t"
            "sb               %[n2],       3(%[dst_ptr])                  \n\t"

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
              [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [n1] "=&r"(n1),
              [n2] "=&r"(n2), [n3] "=&r"(n3), [Temp1] "=&r"(Temp1),
              [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4)
            : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
              [vector4a] "r"(vector4a), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr),
              [src_ptr] "r"(src_ptr)
            /* the sb instructions write dst_ptr[0..3] behind the compiler's
             * back, so the memory side effect has to be declared */
            : "memory");
        /*  Next row... */
        src_ptr += src_pixels_per_line;
        dst_ptr += pitch;
      }
    }
  }
}
274 
/*
 * First-pass (horizontal) sub-pixel filter for a block 8 pixels wide.
 *
 * src_ptr              first source pixel of the first row.  The 6-tap path
 *                      loads bytes src_ptr[-2] .. src_ptr[10] of every row,
 *                      the 4-tap path src_ptr[-1] .. src_ptr[11].
 * dst_ptr              destination; 8 bytes are written per row.
 * src_pixels_per_line  distance in bytes between source rows.
 * output_height        number of rows to produce.
 * xoffset              row index into sub_pel_filters_inv /
 *                      sub_pel_filters_inv_tap_4 (the tables have 8 rows);
 *                      0 selects a plain copy.
 * pitch                distance in bytes between destination rows.
 *
 * Every row is produced by two asm statements of four pixels each; results
 * are clamped to 0..255 through ff_cropTbl in C between and after them.
 * The first statement of each pair ends by seeding $ac3 with the rounding
 * constant, and the second statement starts accumulating into $ac3 without
 * re-seeding it, so the pair must stay together and in this order.
 * NOTE(review): `extp` depends on the DSPControl pos field, which is not set
 * here -- presumably the caller configures it; confirm.
 */
void vp8_filter_block2d_first_pass_8_all(unsigned char *RESTRICT src_ptr,
                                         unsigned char *RESTRICT dst_ptr,
                                         unsigned int src_pixels_per_line,
                                         unsigned int output_height,
                                         int xoffset, int pitch) {
  unsigned int i;
  int Temp1, Temp2, Temp3, Temp4;

  unsigned int vector4a = 64;
  unsigned int vector1b, vector2b, vector3b;
  unsigned int tp1, tp2, tn1, tn2;
  unsigned int p1, p2, p3, p4;
  unsigned int n1, n2, n3, n4;

  unsigned char *cm = ff_cropTbl + CROP_WIDTH;

  /* if (xoffset == 0) we don't need any filtering */
  if (xoffset == 0) {
    for (i = 0; i < output_height; ++i) {
      /* prefetch src_ptr data to cache memory */
      prefetch_load(src_ptr + src_pixels_per_line);

      dst_ptr[0] = src_ptr[0];
      dst_ptr[1] = src_ptr[1];
      dst_ptr[2] = src_ptr[2];
      dst_ptr[3] = src_ptr[3];
      dst_ptr[4] = src_ptr[4];
      dst_ptr[5] = src_ptr[5];
      dst_ptr[6] = src_ptr[6];
      dst_ptr[7] = src_ptr[7];

      /* next row...  Advance by pitch, exactly like the filtering paths
       * below, so the destination layout does not depend on xoffset. */
      src_ptr += src_pixels_per_line;
      dst_ptr += pitch;
    }
  } else {
    vector3b = sub_pel_filters_inv[xoffset][2];

    if (vector3b > 65536) {
      /* 6 tap filter */

      vector1b = sub_pel_filters_inv[xoffset][0];
      vector2b = sub_pel_filters_inv[xoffset][1];

      for (i = output_height; i--;) {
        /* prefetch src_ptr data to cache memory */
        prefetch_load(src_ptr + src_pixels_per_line);

        /* apply filter with vectors pairs */
        __asm__ __volatile__(
            "ulw              %[tp1],      -2(%[src_ptr])                 \n\t"
            "ulw              %[tp2],      2(%[src_ptr])                  \n\t"

            /* even 1. pixel */
            "mtlo             %[vector4a], $ac3                           \n\t"
            "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
            "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
            "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
            "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
            "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
            "dpa.w.ph         $ac3,        %[p3],          %[vector3b]    \n\t"

            /* even 2. pixel */
            "mtlo             %[vector4a], $ac2                           \n\t"
            "preceu.ph.qbl    %[p1],       %[tp2]                         \n\t"
            "dpa.w.ph         $ac2,        %[p2],          %[vector1b]    \n\t"
            "dpa.w.ph         $ac2,        %[p3],          %[vector2b]    \n\t"
            "dpa.w.ph         $ac2,        %[p1],          %[vector3b]    \n\t"

            "balign           %[tp2],      %[tp1],         3              \n\t"
            "extp             %[Temp1],    $ac3,           9              \n\t"
            "ulw              %[tn2],      3(%[src_ptr])                  \n\t"

            /* odd 1. pixel */
            "mtlo             %[vector4a], $ac3                           \n\t"
            "preceu.ph.qbr    %[n1],       %[tp2]                         \n\t"
            "preceu.ph.qbl    %[n2],       %[tp2]                         \n\t"
            "preceu.ph.qbr    %[n3],       %[tn2]                         \n\t"
            "extp             %[Temp3],    $ac2,           9              \n\t"
            "dpa.w.ph         $ac3,        %[n1],          %[vector1b]    \n\t"
            "dpa.w.ph         $ac3,        %[n2],          %[vector2b]    \n\t"
            "dpa.w.ph         $ac3,        %[n3],          %[vector3b]    \n\t"

            /* odd 2. pixel */
            "mtlo             %[vector4a], $ac2                           \n\t"
            "preceu.ph.qbl    %[n1],       %[tn2]                         \n\t"
            "dpa.w.ph         $ac2,        %[n2],          %[vector1b]    \n\t"
            "dpa.w.ph         $ac2,        %[n3],          %[vector2b]    \n\t"
            "dpa.w.ph         $ac2,        %[n1],          %[vector3b]    \n\t"
            "ulw              %[tp1],      6(%[src_ptr])                  \n\t"
            "extp             %[Temp2],    $ac3,           9              \n\t"
            /* seed $ac3 for "even 3. pixel" in the next asm statement */
            "mtlo             %[vector4a], $ac3                           \n\t"
            "preceu.ph.qbr    %[p2],       %[tp1]                         \n\t"
            "extp             %[Temp4],    $ac2,           9              \n\t"

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn2] "=&r"(tn2),
              [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [n1] "=&r"(n1),
              [n2] "=&r"(n2), [n3] "=&r"(n3), [Temp1] "=&r"(Temp1),
              [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), [Temp4] "=r"(Temp4)
            : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
              [vector4a] "r"(vector4a), [vector3b] "r"(vector3b),
              [src_ptr] "r"(src_ptr));

        /* clamp and store results */
        dst_ptr[0] = cm[Temp1];
        dst_ptr[1] = cm[Temp2];
        dst_ptr[2] = cm[Temp3];
        dst_ptr[3] = cm[Temp4];

        /* next 4 pixels ($ac3 was seeded at the end of the previous asm) */
        __asm__ __volatile__(
            /* even 3. pixel */
            "dpa.w.ph         $ac3,        %[p3],          %[vector1b]    \n\t"
            "dpa.w.ph         $ac3,        %[p1],          %[vector2b]    \n\t"
            "dpa.w.ph         $ac3,        %[p2],          %[vector3b]    \n\t"

            /* even 4. pixel */
            "mtlo             %[vector4a], $ac2                           \n\t"
            "preceu.ph.qbl    %[p4],       %[tp1]                         \n\t"
            "dpa.w.ph         $ac2,        %[p1],          %[vector1b]    \n\t"
            "dpa.w.ph         $ac2,        %[p2],          %[vector2b]    \n\t"
            "dpa.w.ph         $ac2,        %[p4],          %[vector3b]    \n\t"

            "ulw              %[tn1],      7(%[src_ptr])                  \n\t"
            "extp             %[Temp1],    $ac3,           9              \n\t"

            /* odd 3. pixel */
            "mtlo             %[vector4a], $ac3                           \n\t"
            "preceu.ph.qbr    %[n2],       %[tn1]                         \n\t"
            "dpa.w.ph         $ac3,        %[n3],          %[vector1b]    \n\t"
            "dpa.w.ph         $ac3,        %[n1],          %[vector2b]    \n\t"
            "dpa.w.ph         $ac3,        %[n2],          %[vector3b]    \n\t"
            "extp             %[Temp3],    $ac2,           9              \n\t"

            /* odd 4. pixel */
            "mtlo             %[vector4a], $ac2                           \n\t"
            "preceu.ph.qbl    %[n4],       %[tn1]                         \n\t"
            "dpa.w.ph         $ac2,        %[n1],          %[vector1b]    \n\t"
            "dpa.w.ph         $ac2,        %[n2],          %[vector2b]    \n\t"
            "dpa.w.ph         $ac2,        %[n4],          %[vector3b]    \n\t"
            "extp             %[Temp2],    $ac3,           9              \n\t"
            "extp             %[Temp4],    $ac2,           9              \n\t"

            : [tn1] "=&r"(tn1), [n2] "=&r"(n2), [p4] "=&r"(p4), [n4] "=&r"(n4),
              [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
              [Temp4] "=r"(Temp4)
            : [tp1] "r"(tp1), [vector1b] "r"(vector1b), [p2] "r"(p2),
              [vector2b] "r"(vector2b), [n1] "r"(n1), [p1] "r"(p1),
              [vector4a] "r"(vector4a), [vector3b] "r"(vector3b), [p3] "r"(p3),
              [n3] "r"(n3), [src_ptr] "r"(src_ptr));

        /* clamp and store results */
        dst_ptr[4] = cm[Temp1];
        dst_ptr[5] = cm[Temp2];
        dst_ptr[6] = cm[Temp3];
        dst_ptr[7] = cm[Temp4];

        src_ptr += src_pixels_per_line;
        dst_ptr += pitch;
      }
    } else {
      /* 4 tap filter */

      vector1b = sub_pel_filters_inv_tap_4[xoffset][0];
      vector2b = sub_pel_filters_inv_tap_4[xoffset][1];

      for (i = output_height; i--;) {
        /* prefetch src_ptr data to cache memory */
        prefetch_load(src_ptr + src_pixels_per_line);

        /* apply filter with vectors pairs */
        __asm__ __volatile__(
            "ulw              %[tp1],      -1(%[src_ptr])                 \n\t"

            /* even 1. pixel */
            "mtlo             %[vector4a], $ac3                           \n\t"
            "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
            "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
            "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
            "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"

            "ulw              %[tp2],      3(%[src_ptr])                  \n\t"

            /* even 2. pixel  */
            "mtlo             %[vector4a], $ac2                           \n\t"
            "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
            "preceu.ph.qbl    %[p4],       %[tp2]                         \n\t"
            "dpa.w.ph         $ac2,        %[p2],          %[vector1b]    \n\t"
            "dpa.w.ph         $ac2,        %[p3],          %[vector2b]    \n\t"
            "extp             %[Temp1],    $ac3,           9              \n\t"

            "balign           %[tp2],      %[tp1],         3              \n\t"

            /* odd 1. pixel */
            "mtlo             %[vector4a], $ac3                           \n\t"
            "preceu.ph.qbr    %[n1],       %[tp2]                         \n\t"
            "preceu.ph.qbl    %[n2],       %[tp2]                         \n\t"
            "dpa.w.ph         $ac3,        %[n1],          %[vector1b]    \n\t"
            "dpa.w.ph         $ac3,        %[n2],          %[vector2b]    \n\t"
            "extp             %[Temp3],    $ac2,           9              \n\t"

            "ulw              %[tn2],      4(%[src_ptr])                  \n\t"

            /* odd 2. pixel */
            "mtlo             %[vector4a], $ac2                           \n\t"
            "preceu.ph.qbr    %[n3],       %[tn2]                         \n\t"
            "preceu.ph.qbl    %[n4],       %[tn2]                         \n\t"
            "dpa.w.ph         $ac2,        %[n2],          %[vector1b]    \n\t"
            "dpa.w.ph         $ac2,        %[n3],          %[vector2b]    \n\t"
            "ulw              %[tp1],      7(%[src_ptr])                  \n\t"
            "extp             %[Temp2],    $ac3,           9              \n\t"
            /* seed $ac3 for "even 3. pixel" in the next asm statement */
            "mtlo             %[vector4a], $ac3                           \n\t"
            "extp             %[Temp4],    $ac2,           9              \n\t"

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn2] "=&r"(tn2),
              [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [p4] "=&r"(p4),
              [n1] "=&r"(n1), [n2] "=&r"(n2), [n3] "=&r"(n3), [n4] "=&r"(n4),
              [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
              [Temp4] "=r"(Temp4)
            : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
              [vector4a] "r"(vector4a), [src_ptr] "r"(src_ptr));

        /* clamp and store results */
        dst_ptr[0] = cm[Temp1];
        dst_ptr[1] = cm[Temp2];
        dst_ptr[2] = cm[Temp3];
        dst_ptr[3] = cm[Temp4];

        /* next 4 pixels ($ac3 was seeded at the end of the previous asm) */
        __asm__ __volatile__(
            /* even 3. pixel */
            "dpa.w.ph         $ac3,        %[p3],          %[vector1b]    \n\t"
            "dpa.w.ph         $ac3,        %[p4],          %[vector2b]    \n\t"

            /* even 4. pixel */
            "mtlo             %[vector4a], $ac2                           \n\t"
            "preceu.ph.qbr    %[p2],       %[tp1]                         \n\t"
            "dpa.w.ph         $ac2,        %[p4],          %[vector1b]    \n\t"
            "dpa.w.ph         $ac2,        %[p2],          %[vector2b]    \n\t"
            "extp             %[Temp1],    $ac3,           9              \n\t"

            /* odd 3. pixel */
            "mtlo             %[vector4a], $ac3                           \n\t"
            "dpa.w.ph         $ac3,        %[n3],          %[vector1b]    \n\t"
            "dpa.w.ph         $ac3,        %[n4],          %[vector2b]    \n\t"
            "ulw              %[tn1],      8(%[src_ptr])                  \n\t"
            "extp             %[Temp3],    $ac2,           9              \n\t"

            /* odd 4. pixel */
            "mtlo             %[vector4a], $ac2                           \n\t"
            "preceu.ph.qbr    %[n2],       %[tn1]                         \n\t"
            "dpa.w.ph         $ac2,        %[n4],          %[vector1b]    \n\t"
            "dpa.w.ph         $ac2,        %[n2],          %[vector2b]    \n\t"
            "extp             %[Temp2],    $ac3,           9              \n\t"
            "extp             %[Temp4],    $ac2,           9              \n\t"

            : [tn1] "=&r"(tn1), [p2] "=&r"(p2), [n2] "=&r"(n2),
              [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
              [Temp4] "=r"(Temp4)
            : [tp1] "r"(tp1), [p3] "r"(p3), [p4] "r"(p4),
              [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
              [vector4a] "r"(vector4a), [src_ptr] "r"(src_ptr), [n3] "r"(n3),
              [n4] "r"(n4));

        /* clamp and store results */
        dst_ptr[4] = cm[Temp1];
        dst_ptr[5] = cm[Temp2];
        dst_ptr[6] = cm[Temp3];
        dst_ptr[7] = cm[Temp4];

        /* next row... */
        src_ptr += src_pixels_per_line;
        dst_ptr += pitch;
      }
    }
  }
}
552 
vp8_filter_block2d_first_pass16_6tap(unsigned char * RESTRICT src_ptr,unsigned char * RESTRICT dst_ptr,unsigned int src_pixels_per_line,unsigned int output_height,int xoffset,int pitch)553 void vp8_filter_block2d_first_pass16_6tap(unsigned char *RESTRICT src_ptr,
554                                           unsigned char *RESTRICT dst_ptr,
555                                           unsigned int src_pixels_per_line,
556                                           unsigned int output_height,
557                                           int xoffset, int pitch) {
558   unsigned int i;
559   int Temp1, Temp2, Temp3, Temp4;
560 
561   unsigned int vector4a;
562   unsigned int vector1b, vector2b, vector3b;
563   unsigned int tp1, tp2, tn1, tn2;
564   unsigned int p1, p2, p3, p4;
565   unsigned int n1, n2, n3, n4;
566   unsigned char *cm = ff_cropTbl + CROP_WIDTH;
567 
568   vector1b = sub_pel_filters_inv[xoffset][0];
569   vector2b = sub_pel_filters_inv[xoffset][1];
570   vector3b = sub_pel_filters_inv[xoffset][2];
571   vector4a = 64;
572 
573   for (i = output_height; i--;) {
574     /* prefetch src_ptr data to cache memory */
575     prefetch_load(src_ptr + src_pixels_per_line);
576 
577     /* apply filter with vectors pairs */
578     __asm__ __volatile__(
579         "ulw                %[tp1],      -2(%[src_ptr])                 \n\t"
580         "ulw                %[tp2],      2(%[src_ptr])                  \n\t"
581 
582         /* even 1. pixel */
583         "mtlo               %[vector4a], $ac3                           \n\t"
584         "preceu.ph.qbr      %[p1],       %[tp1]                         \n\t"
585         "preceu.ph.qbl      %[p2],       %[tp1]                         \n\t"
586         "preceu.ph.qbr      %[p3],       %[tp2]                         \n\t"
587         "dpa.w.ph           $ac3,        %[p1],           %[vector1b]   \n\t"
588         "dpa.w.ph           $ac3,        %[p2],           %[vector2b]   \n\t"
589         "dpa.w.ph           $ac3,        %[p3],           %[vector3b]   \n\t"
590 
591         /* even 2. pixel */
592         "mtlo               %[vector4a], $ac2                           \n\t"
593         "preceu.ph.qbl      %[p1],       %[tp2]                         \n\t"
594         "dpa.w.ph           $ac2,        %[p2],           %[vector1b]   \n\t"
595         "dpa.w.ph           $ac2,        %[p3],           %[vector2b]   \n\t"
596         "dpa.w.ph           $ac2,        %[p1],           %[vector3b]   \n\t"
597 
598         "balign             %[tp2],      %[tp1],          3             \n\t"
599         "ulw                %[tn2],      3(%[src_ptr])                  \n\t"
600         "extp               %[Temp1],    $ac3,            9             \n\t"
601 
602         /* odd 1. pixel */
603         "mtlo               %[vector4a], $ac3                           \n\t"
604         "preceu.ph.qbr      %[n1],       %[tp2]                         \n\t"
605         "preceu.ph.qbl      %[n2],       %[tp2]                         \n\t"
606         "preceu.ph.qbr      %[n3],       %[tn2]                         \n\t"
607         "extp               %[Temp3],    $ac2,            9             \n\t"
608         "dpa.w.ph           $ac3,        %[n1],           %[vector1b]   \n\t"
609         "dpa.w.ph           $ac3,        %[n2],           %[vector2b]   \n\t"
610         "dpa.w.ph           $ac3,        %[n3],           %[vector3b]   \n\t"
611 
612         /* odd 2. pixel */
613         "mtlo               %[vector4a], $ac2                           \n\t"
614         "preceu.ph.qbl      %[n1],       %[tn2]                         \n\t"
615         "dpa.w.ph           $ac2,        %[n2],           %[vector1b]   \n\t"
616         "dpa.w.ph           $ac2,        %[n3],           %[vector2b]   \n\t"
617         "dpa.w.ph           $ac2,        %[n1],           %[vector3b]   \n\t"
618         "ulw                %[tp1],      6(%[src_ptr])                  \n\t"
619         "extp               %[Temp2],    $ac3,            9             \n\t"
620         "mtlo               %[vector4a], $ac3                           \n\t"
621         "preceu.ph.qbr      %[p2],       %[tp1]                         \n\t"
622         "extp               %[Temp4],    $ac2,            9             \n\t"
623 
624         : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn2] "=&r"(tn2), [p1] "=&r"(p1),
625           [p2] "=&r"(p2), [p3] "=&r"(p3), [n1] "=&r"(n1), [n2] "=&r"(n2),
626           [n3] "=&r"(n3), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
627           [Temp3] "=&r"(Temp3), [Temp4] "=r"(Temp4)
628         : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
629           [vector4a] "r"(vector4a), [vector3b] "r"(vector3b),
630           [src_ptr] "r"(src_ptr));
631 
632     /* clamp and store results */
633     dst_ptr[0] = cm[Temp1];
634     dst_ptr[1] = cm[Temp2];
635     dst_ptr[2] = cm[Temp3];
636     dst_ptr[3] = cm[Temp4];
637 
638     /* next 4 pixels */
639     __asm__ __volatile__(
640         /* even 3. pixel */
641         "dpa.w.ph           $ac3,        %[p3],           %[vector1b]   \n\t"
642         "dpa.w.ph           $ac3,        %[p1],           %[vector2b]   \n\t"
643         "dpa.w.ph           $ac3,        %[p2],           %[vector3b]   \n\t"
644 
645         /* even 4. pixel */
646         "mtlo               %[vector4a], $ac2                           \n\t"
647         "preceu.ph.qbl      %[p4],       %[tp1]                         \n\t"
648         "dpa.w.ph           $ac2,        %[p1],           %[vector1b]   \n\t"
649         "dpa.w.ph           $ac2,        %[p2],           %[vector2b]   \n\t"
650         "dpa.w.ph           $ac2,        %[p4],           %[vector3b]   \n\t"
651         "ulw                %[tn1],      7(%[src_ptr])                  \n\t"
652         "extp               %[Temp1],    $ac3,            9             \n\t"
653 
654         /* odd 3. pixel */
655         "mtlo               %[vector4a], $ac3                           \n\t"
656         "preceu.ph.qbr      %[n2],       %[tn1]                         \n\t"
657         "dpa.w.ph           $ac3,        %[n3],           %[vector1b]   \n\t"
658         "dpa.w.ph           $ac3,        %[n1],           %[vector2b]   \n\t"
659         "dpa.w.ph           $ac3,        %[n2],           %[vector3b]   \n\t"
660         "extp               %[Temp3],    $ac2,            9             \n\t"
661 
662         /* odd 4. pixel */
663         "mtlo               %[vector4a], $ac2                           \n\t"
664         "preceu.ph.qbl      %[n4],       %[tn1]                         \n\t"
665         "dpa.w.ph           $ac2,        %[n1],           %[vector1b]   \n\t"
666         "dpa.w.ph           $ac2,        %[n2],           %[vector2b]   \n\t"
667         "dpa.w.ph           $ac2,        %[n4],           %[vector3b]   \n\t"
668         "ulw                %[tp2],      10(%[src_ptr])                 \n\t"
669         "extp               %[Temp2],    $ac3,            9             \n\t"
670         "mtlo               %[vector4a], $ac3                           \n\t"
671         "preceu.ph.qbr      %[p1],       %[tp2]                         \n\t"
672         "extp               %[Temp4],    $ac2,            9             \n\t"
673 
674         : [tn1] "=&r"(tn1), [tp2] "=&r"(tp2), [n2] "=&r"(n2), [p4] "=&r"(p4),
675           [n4] "=&r"(n4), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
676           [Temp3] "=&r"(Temp3), [Temp4] "=r"(Temp4), [p1] "+r"(p1)
677         : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), [tp1] "r"(tp1),
678           [n1] "r"(n1), [vector4a] "r"(vector4a), [p2] "r"(p2),
679           [vector3b] "r"(vector3b), [p3] "r"(p3), [n3] "r"(n3),
680           [src_ptr] "r"(src_ptr));
681 
682     /* clamp and store results */
683     dst_ptr[4] = cm[Temp1];
684     dst_ptr[5] = cm[Temp2];
685     dst_ptr[6] = cm[Temp3];
686     dst_ptr[7] = cm[Temp4];
687 
688     /* next 4 pixels */
689     __asm__ __volatile__(
690         /* even 5. pixel */
691         "dpa.w.ph           $ac3,        %[p2],           %[vector1b]   \n\t"
692         "dpa.w.ph           $ac3,        %[p4],           %[vector2b]   \n\t"
693         "dpa.w.ph           $ac3,        %[p1],           %[vector3b]   \n\t"
694 
695         /* even 6. pixel */
696         "mtlo               %[vector4a], $ac2                           \n\t"
697         "preceu.ph.qbl      %[p3],       %[tp2]                         \n\t"
698         "dpa.w.ph           $ac2,        %[p4],           %[vector1b]   \n\t"
699         "dpa.w.ph           $ac2,        %[p1],           %[vector2b]   \n\t"
700         "dpa.w.ph           $ac2,        %[p3],           %[vector3b]   \n\t"
701 
702         "ulw                %[tn1],      11(%[src_ptr])                 \n\t"
703         "extp               %[Temp1],    $ac3,            9             \n\t"
704 
705         /* odd 5. pixel */
706         "mtlo               %[vector4a], $ac3                           \n\t"
707         "preceu.ph.qbr      %[n1],       %[tn1]                         \n\t"
708         "dpa.w.ph           $ac3,        %[n2],           %[vector1b]   \n\t"
709         "dpa.w.ph           $ac3,        %[n4],           %[vector2b]   \n\t"
710         "dpa.w.ph           $ac3,        %[n1],           %[vector3b]   \n\t"
711         "extp               %[Temp3],    $ac2,            9             \n\t"
712 
713         /* odd 6. pixel */
714         "mtlo               %[vector4a], $ac2                           \n\t"
715         "preceu.ph.qbl      %[n3],       %[tn1]                         \n\t"
716         "dpa.w.ph           $ac2,        %[n4],           %[vector1b]   \n\t"
717         "dpa.w.ph           $ac2,        %[n1],           %[vector2b]   \n\t"
718         "dpa.w.ph           $ac2,        %[n3],           %[vector3b]   \n\t"
719         "ulw                %[tp1],      14(%[src_ptr])                 \n\t"
720         "extp               %[Temp2],    $ac3,            9             \n\t"
721         "mtlo               %[vector4a], $ac3                           \n\t"
722         "preceu.ph.qbr      %[p4],       %[tp1]                         \n\t"
723         "extp               %[Temp4],    $ac2,            9             \n\t"
724 
725         : [tn1] "=&r"(tn1), [tp1] "=&r"(tp1), [n1] "=&r"(n1), [p3] "=&r"(p3),
726           [n3] "=&r"(n3), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2),
727           [Temp3] "=&r"(Temp3), [Temp4] "=r"(Temp4), [p4] "+r"(p4)
728         : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), [tp2] "r"(tp2),
729           [p2] "r"(p2), [n2] "r"(n2), [n4] "r"(n4), [p1] "r"(p1),
730           [src_ptr] "r"(src_ptr), [vector4a] "r"(vector4a),
731           [vector3b] "r"(vector3b));
732 
733     /* clamp and store results */
734     dst_ptr[8] = cm[Temp1];
735     dst_ptr[9] = cm[Temp2];
736     dst_ptr[10] = cm[Temp3];
737     dst_ptr[11] = cm[Temp4];
738 
739     /* next 4 pixels */
740     __asm__ __volatile__(
741         /* even 7. pixel */
742         "dpa.w.ph           $ac3,        %[p1],           %[vector1b]   \n\t"
743         "dpa.w.ph           $ac3,        %[p3],           %[vector2b]   \n\t"
744         "dpa.w.ph           $ac3,        %[p4],           %[vector3b]   \n\t"
745 
746         /* even 8. pixel */
747         "mtlo               %[vector4a], $ac2                           \n\t"
748         "preceu.ph.qbl      %[p2],       %[tp1]                         \n\t"
749         "dpa.w.ph           $ac2,        %[p3],           %[vector1b]   \n\t"
750         "dpa.w.ph           $ac2,        %[p4],           %[vector2b]   \n\t"
751         "dpa.w.ph           $ac2,        %[p2],           %[vector3b]   \n\t"
752         "ulw                %[tn1],      15(%[src_ptr])                 \n\t"
753         "extp               %[Temp1],    $ac3,            9             \n\t"
754 
755         /* odd 7. pixel */
756         "mtlo               %[vector4a], $ac3                           \n\t"
757         "preceu.ph.qbr      %[n4],       %[tn1]                         \n\t"
758         "dpa.w.ph           $ac3,        %[n1],           %[vector1b]   \n\t"
759         "dpa.w.ph           $ac3,        %[n3],           %[vector2b]   \n\t"
760         "dpa.w.ph           $ac3,        %[n4],           %[vector3b]   \n\t"
761         "extp               %[Temp3],    $ac2,            9             \n\t"
762 
763         /* odd 8. pixel */
764         "mtlo               %[vector4a], $ac2                           \n\t"
765         "preceu.ph.qbl      %[n2],       %[tn1]                         \n\t"
766         "dpa.w.ph           $ac2,        %[n3],           %[vector1b]   \n\t"
767         "dpa.w.ph           $ac2,        %[n4],           %[vector2b]   \n\t"
768         "dpa.w.ph           $ac2,        %[n2],           %[vector3b]   \n\t"
769         "extp               %[Temp2],    $ac3,            9             \n\t"
770         "extp               %[Temp4],    $ac2,            9             \n\t"
771 
772         /* clamp and store results */
773         "lbux               %[tp1],      %[Temp1](%[cm])                \n\t"
774         "lbux               %[tn1],      %[Temp2](%[cm])                \n\t"
775         "lbux               %[p2],       %[Temp3](%[cm])                \n\t"
776         "sb                 %[tp1],      12(%[dst_ptr])                 \n\t"
777         "sb                 %[tn1],      13(%[dst_ptr])                 \n\t"
778         "lbux               %[n2],       %[Temp4](%[cm])                \n\t"
779         "sb                 %[p2],       14(%[dst_ptr])                 \n\t"
780         "sb                 %[n2],       15(%[dst_ptr])                 \n\t"
781 
782         : [tn1] "=&r"(tn1), [p2] "=&r"(p2), [n2] "=&r"(n2), [n4] "=&r"(n4),
783           [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
784           [Temp4] "=r"(Temp4), [tp1] "+r"(tp1)
785         : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b), [p4] "r"(p4),
786           [n1] "r"(n1), [p1] "r"(p1), [vector4a] "r"(vector4a),
787           [vector3b] "r"(vector3b), [p3] "r"(p3), [n3] "r"(n3),
788           [src_ptr] "r"(src_ptr), [cm] "r"(cm), [dst_ptr] "r"(dst_ptr));
789 
790     src_ptr += src_pixels_per_line;
791     dst_ptr += pitch;
792   }
793 }
794 
/*
 * Copy 21 rows of 16 bytes each from src_ptr into output_ptr without any
 * filtering (7 loop iterations, 3 rows per iteration).
 *
 * src_ptr             - first source row; advanced by src_pixels_per_line
 *                       bytes after every row. Loaded with ulw, so it may be
 *                       unaligned.
 * output_ptr          - destination buffer, written as tightly packed rows of
 *                       16 bytes (21 * 16 bytes in total). Written with sw, so
 *                       it is expected to be word aligned.
 * src_pixels_per_line - distance in bytes between consecutive source rows.
 *
 * NOTE(review): presumably the "no horizontal filtering" first pass of the
 * 16x16 predictor - confirm against the caller.
 */
void vp8_filter_block2d_first_pass16_0(unsigned char *RESTRICT src_ptr,
                                       unsigned char *RESTRICT output_ptr,
                                       unsigned int src_pixels_per_line) {
  int Temp1, Temp2, Temp3, Temp4;
  int i;

  /* issue a store prefetch hint for the output buffer */
  prefetch_store(output_ptr + 32);

  /* copy memory from src buffer to dst buffer, three rows per iteration.
   * The asm blocks read and write memory that is not listed as an operand,
   * so each one carries a "memory" clobber. */
  for (i = 0; i < 7; ++i) {
    /* row 3 * i + 0 -> output bytes 0..15 */
    __asm__ __volatile__(
        "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
        "ulw    %[Temp2],   4(%[src_ptr])                               \n\t"
        "ulw    %[Temp3],   8(%[src_ptr])                               \n\t"
        "ulw    %[Temp4],   12(%[src_ptr])                              \n\t"
        "sw     %[Temp1],   0(%[output_ptr])                            \n\t"
        "sw     %[Temp2],   4(%[output_ptr])                            \n\t"
        "sw     %[Temp3],   8(%[output_ptr])                            \n\t"
        "sw     %[Temp4],   12(%[output_ptr])                           \n\t"
        "addu   %[src_ptr], %[src_ptr],        %[src_pixels_per_line]   \n\t"

        : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
          [Temp4] "=&r"(Temp4), [src_ptr] "+r"(src_ptr)
        : [src_pixels_per_line] "r"(src_pixels_per_line),
          [output_ptr] "r"(output_ptr)
        : "memory");

    /* row 3 * i + 1 -> output bytes 16..31 */
    __asm__ __volatile__(
        "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
        "ulw    %[Temp2],   4(%[src_ptr])                               \n\t"
        "ulw    %[Temp3],   8(%[src_ptr])                               \n\t"
        "ulw    %[Temp4],   12(%[src_ptr])                              \n\t"
        "sw     %[Temp1],   16(%[output_ptr])                           \n\t"
        "sw     %[Temp2],   20(%[output_ptr])                           \n\t"
        "sw     %[Temp3],   24(%[output_ptr])                           \n\t"
        "sw     %[Temp4],   28(%[output_ptr])                           \n\t"
        "addu   %[src_ptr], %[src_ptr],        %[src_pixels_per_line]   \n\t"

        : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
          [Temp4] "=&r"(Temp4), [src_ptr] "+r"(src_ptr)
        : [src_pixels_per_line] "r"(src_pixels_per_line),
          [output_ptr] "r"(output_ptr)
        : "memory");

    /* row 3 * i + 2 -> output bytes 32..47 */
    __asm__ __volatile__(
        "ulw    %[Temp1],   0(%[src_ptr])                               \n\t"
        "ulw    %[Temp2],   4(%[src_ptr])                               \n\t"
        "ulw    %[Temp3],   8(%[src_ptr])                               \n\t"
        "ulw    %[Temp4],   12(%[src_ptr])                              \n\t"
        "sw     %[Temp1],   32(%[output_ptr])                           \n\t"
        "sw     %[Temp2],   36(%[output_ptr])                           \n\t"
        "sw     %[Temp3],   40(%[output_ptr])                           \n\t"
        "sw     %[Temp4],   44(%[output_ptr])                           \n\t"
        "addu   %[src_ptr], %[src_ptr],        %[src_pixels_per_line]   \n\t"

        : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
          [Temp4] "=&r"(Temp4), [src_ptr] "+r"(src_ptr)
        : [src_pixels_per_line] "r"(src_pixels_per_line),
          [output_ptr] "r"(output_ptr)
        : "memory");

    /* three packed 16-byte rows were written */
    output_ptr += 48;
  }
}
857 
/*
 * Horizontal 4-tap first pass over rows of 16 pixels (MIPS DSPr2).
 *
 * Every output pixel is computed as
 *   cm[(64 + sum of four taps) >> 7]
 * where the taps come from sub_pel_filters_inv_tap_4[xoffset] and cm is the
 * clamping table ff_cropTbl + CROP_WIDTH. Each row is processed as four
 * groups of 4 adjacent pixels; for a row starting at src_ptr the code reads
 * bytes -1 .. 19 relative to the row start.
 *
 * src_ptr             - first source row (may be unaligned, loaded with ulw).
 * output_ptr          - intermediate buffer, used only when yoffset != 0;
 *                       consecutive rows are output_width bytes apart.
 * src_pixels_per_line - distance in bytes between source rows.
 * output_width        - row stride of output_ptr.
 * output_height       - number of rows to filter when yoffset != 0. When
 *                       yoffset == 0 the first two source rows are skipped
 *                       and output_height - 5 rows are produced.
 * xoffset             - row index into sub_pel_filters_inv_tap_4.
 * yoffset             - when 0, results go straight to dst_ptr instead of
 *                       the intermediate buffer.
 * dst_ptr, pitch      - final destination and its row stride, used only when
 *                       yoffset == 0.
 *
 * NOTE(review): the asm blocks use the DSP accumulators $ac2/$ac3 without
 * listing them as clobbers - confirm this matches the build's expectations.
 */
void vp8_filter_block2d_first_pass16_4tap(
    unsigned char *RESTRICT src_ptr, unsigned char *RESTRICT output_ptr,
    unsigned int src_pixels_per_line, unsigned int output_width,
    unsigned int output_height, int xoffset, int yoffset,
    unsigned char *RESTRICT dst_ptr, int pitch) {
  unsigned int i, j;
  int Temp1, Temp2, Temp3, Temp4;

  unsigned int vector4a;
  int vector1b, vector2b;
  unsigned int tp1, tp2, tp3, tn1;
  unsigned int p1, p2, p3;
  unsigned int n1, n2, n3;
  unsigned char *cm = ff_cropTbl + CROP_WIDTH;

  /* rounding term added to every accumulator before the >> 7 */
  vector4a = 64;

  /* packed pairs of filter taps */
  vector1b = sub_pel_filters_inv_tap_4[xoffset][0];
  vector2b = sub_pel_filters_inv_tap_4[xoffset][1];

  /* if (yoffset == 0) don't need temp buffer, data will be stored in dst_ptr */
  if (yoffset == 0) {
    output_height -= 5;
    src_ptr += (src_pixels_per_line + src_pixels_per_line);

    for (i = output_height; i--;) {
      /* preload the word starting one pixel left of the row; inside the
       * loop tp3 always carries the word at src_ptr - 1 */
      __asm__ __volatile__("ulw     %[tp3],   -1(%[src_ptr])               \n\t"
                           : [tp3] "=&r"(tp3)
                           : [src_ptr] "r"(src_ptr)
                           : "memory");

      /* processing 4 adjacent pixels */
      for (j = 0; j < 16; j += 4) {
        /* apply filter with vectors pairs */
        __asm__ __volatile__(
            "ulw            %[tp2],      3(%[src_ptr])              \n\t"
            "move           %[tp1],      %[tp3]                     \n\t"

            /* even 1. pixel */
            "mtlo           %[vector4a], $ac3                       \n\t"
            "mthi           $0,          $ac3                       \n\t"
            /* keep the word at src_ptr + 3 for the next group */
            "move           %[tp3],      %[tp2]                     \n\t"
            "preceu.ph.qbr  %[p1],       %[tp1]                     \n\t"
            "preceu.ph.qbl  %[p2],       %[tp1]                     \n\t"
            "preceu.ph.qbr  %[p3],       %[tp2]                     \n\t"
            "dpa.w.ph       $ac3,        %[p1],       %[vector1b]   \n\t"
            "dpa.w.ph       $ac3,        %[p2],       %[vector2b]   \n\t"

            /* even 2. pixel */
            "mtlo           %[vector4a], $ac2                       \n\t"
            "mthi           $0,          $ac2                       \n\t"
            "dpa.w.ph       $ac2,        %[p2],       %[vector1b]   \n\t"
            "dpa.w.ph       $ac2,        %[p3],       %[vector2b]   \n\t"
            "extr.w         %[Temp1],    $ac3,        7             \n\t"

            /* odd 1. pixel */
            "ulw            %[tn1],      4(%[src_ptr])              \n\t"
            "balign         %[tp2],      %[tp1],      3             \n\t"
            "mtlo           %[vector4a], $ac3                       \n\t"
            "mthi           $0,          $ac3                       \n\t"
            "preceu.ph.qbr  %[n1],       %[tp2]                     \n\t"
            "preceu.ph.qbl  %[n2],       %[tp2]                     \n\t"
            "preceu.ph.qbr  %[n3],       %[tn1]                     \n\t"
            "extr.w         %[Temp3],    $ac2,        7             \n\t"
            "dpa.w.ph       $ac3,        %[n1],       %[vector1b]   \n\t"
            "dpa.w.ph       $ac3,        %[n2],       %[vector2b]   \n\t"

            /* odd 2. pixel */
            "mtlo           %[vector4a], $ac2                       \n\t"
            "mthi           $0,          $ac2                       \n\t"
            "extr.w         %[Temp2],    $ac3,        7             \n\t"
            "dpa.w.ph       $ac2,        %[n2],       %[vector1b]   \n\t"
            "dpa.w.ph       $ac2,        %[n3],       %[vector2b]   \n\t"
            "extr.w         %[Temp4],    $ac2,        7             \n\t"

            /* clamp and store results */
            "lbux           %[tp1],      %[Temp1](%[cm])            \n\t"
            "lbux           %[tn1],      %[Temp2](%[cm])            \n\t"
            "lbux           %[tp2],      %[Temp3](%[cm])            \n\t"
            "sb             %[tp1],      0(%[dst_ptr])              \n\t"
            "sb             %[tn1],      1(%[dst_ptr])              \n\t"
            "lbux           %[n2],       %[Temp4](%[cm])            \n\t"
            "sb             %[tp2],      2(%[dst_ptr])              \n\t"
            "sb             %[n2],       3(%[dst_ptr])              \n\t"

            /* tp3 is read before it is rewritten, so it must be a
             * read-write operand rather than a write-only output */
            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tp3] "+&r"(tp3),
              [tn1] "=&r"(tn1), [p1] "=&r"(p1), [p2] "=&r"(p2), [n1] "=&r"(n1),
              [n2] "=&r"(n2), [n3] "=&r"(n3), [Temp1] "=&r"(Temp1),
              [Temp2] "=&r"(Temp2), [p3] "=&r"(p3), [Temp3] "=&r"(Temp3),
              [Temp4] "=&r"(Temp4)
            /* store each group at its own column (dst_ptr + j) so the four
             * groups of a row do not overwrite each other */
            : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
              [vector4a] "r"(vector4a), [cm] "r"(cm),
              [dst_ptr] "r"(dst_ptr + j), [src_ptr] "r"(src_ptr)
            : "memory");

        src_ptr += 4;
      }

      /* Next row... (src_ptr already moved 16 bytes along the row) */
      src_ptr += src_pixels_per_line - 16;
      dst_ptr += pitch;
    }
  } else {
    for (i = output_height; i--;) {
      /* processing 4 adjacent pixels */
      for (j = 0; j < 16; j += 4) {
        /* apply filter with vectors pairs */
        __asm__ __volatile__(
            "ulw            %[tp1],      -1(%[src_ptr])             \n\t"
            "ulw            %[tp2],      3(%[src_ptr])              \n\t"

            /* even 1. pixel */
            "mtlo           %[vector4a], $ac3                       \n\t"
            "mthi           $0,          $ac3                       \n\t"
            "preceu.ph.qbr  %[p1],       %[tp1]                     \n\t"
            "preceu.ph.qbl  %[p2],       %[tp1]                     \n\t"
            "preceu.ph.qbr  %[p3],       %[tp2]                     \n\t"
            "dpa.w.ph       $ac3,        %[p1],       %[vector1b]   \n\t"
            "dpa.w.ph       $ac3,        %[p2],       %[vector2b]   \n\t"

            /* even 2. pixel */
            "mtlo           %[vector4a], $ac2                       \n\t"
            "mthi           $0,          $ac2                       \n\t"
            "dpa.w.ph       $ac2,        %[p2],       %[vector1b]   \n\t"
            "dpa.w.ph       $ac2,        %[p3],       %[vector2b]   \n\t"
            "extr.w         %[Temp1],    $ac3,        7             \n\t"

            /* odd 1. pixel */
            "ulw            %[tn1],      4(%[src_ptr])              \n\t"
            "balign         %[tp2],      %[tp1],      3             \n\t"
            "mtlo           %[vector4a], $ac3                       \n\t"
            "mthi           $0,          $ac3                       \n\t"
            "preceu.ph.qbr  %[n1],       %[tp2]                     \n\t"
            "preceu.ph.qbl  %[n2],       %[tp2]                     \n\t"
            "preceu.ph.qbr  %[n3],       %[tn1]                     \n\t"
            "extr.w         %[Temp3],    $ac2,        7             \n\t"
            "dpa.w.ph       $ac3,        %[n1],       %[vector1b]   \n\t"
            "dpa.w.ph       $ac3,        %[n2],       %[vector2b]   \n\t"

            /* odd 2. pixel */
            "mtlo           %[vector4a], $ac2                       \n\t"
            "mthi           $0,          $ac2                       \n\t"
            "extr.w         %[Temp2],    $ac3,        7             \n\t"
            "dpa.w.ph       $ac2,        %[n2],       %[vector1b]   \n\t"
            "dpa.w.ph       $ac2,        %[n3],       %[vector2b]   \n\t"
            "extr.w         %[Temp4],    $ac2,        7             \n\t"

            /* clamp and store results */
            "lbux           %[tp1],      %[Temp1](%[cm])            \n\t"
            "lbux           %[tn1],      %[Temp2](%[cm])            \n\t"
            "lbux           %[tp2],      %[Temp3](%[cm])            \n\t"
            "sb             %[tp1],      0(%[output_ptr])           \n\t"
            "sb             %[tn1],      1(%[output_ptr])           \n\t"
            "lbux           %[n2],       %[Temp4](%[cm])            \n\t"
            "sb             %[tp2],      2(%[output_ptr])           \n\t"
            "sb             %[n2],       3(%[output_ptr])           \n\t"

            : [tp1] "=&r"(tp1), [tp2] "=&r"(tp2), [tn1] "=&r"(tn1),
              [p1] "=&r"(p1), [p2] "=&r"(p2), [p3] "=&r"(p3), [n1] "=&r"(n1),
              [n2] "=&r"(n2), [n3] "=&r"(n3), [Temp1] "=&r"(Temp1),
              [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), [Temp4] "=&r"(Temp4)
            /* store each group at its own column (output_ptr + j) */
            : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
              [vector4a] "r"(vector4a), [cm] "r"(cm),
              [output_ptr] "r"(output_ptr + j), [src_ptr] "r"(src_ptr)
            : "memory");

        src_ptr += 4;
      }

      /* next row... (src_ptr already moved 16 bytes along the row) */
      src_ptr += src_pixels_per_line - 16;
      output_ptr += output_width;
    }
  }
}
1107 
vp8_filter_block2d_second_pass4(unsigned char * RESTRICT src_ptr,unsigned char * RESTRICT output_ptr,int output_pitch,int yoffset)1108 void vp8_filter_block2d_second_pass4(unsigned char *RESTRICT src_ptr,
1109                                      unsigned char *RESTRICT output_ptr,
1110                                      int output_pitch, int yoffset) {
1111   unsigned int i;
1112 
1113   int Temp1, Temp2, Temp3, Temp4;
1114   unsigned int vector1b, vector2b, vector3b, vector4a;
1115 
1116   unsigned char src_ptr_l2;
1117   unsigned char src_ptr_l1;
1118   unsigned char src_ptr_0;
1119   unsigned char src_ptr_r1;
1120   unsigned char src_ptr_r2;
1121   unsigned char src_ptr_r3;
1122 
1123   unsigned char *cm = ff_cropTbl + CROP_WIDTH;
1124 
1125   vector4a = 64;
1126 
1127   /* load filter coefficients */
1128   vector1b = sub_pel_filterss[yoffset][0];
1129   vector2b = sub_pel_filterss[yoffset][2];
1130   vector3b = sub_pel_filterss[yoffset][1];
1131 
1132   if (vector1b) {
1133     /* 6 tap filter */
1134 
1135     for (i = 2; i--;) {
1136       /* prefetch src_ptr data to cache memory */
1137       prefetch_load(src_ptr);
1138 
1139       /* do not allow compiler to reorder instructions */
1140       __asm__ __volatile__(
1141           ".set noreorder                                                 \n\t"
1142           :
1143           :);
1144 
1145       /* apply filter with vectors pairs */
1146       __asm__ __volatile__(
1147           "lbu            %[src_ptr_l2],  -8(%[src_ptr])                  \n\t"
1148           "lbu            %[src_ptr_l1],  -4(%[src_ptr])                  \n\t"
1149           "lbu            %[src_ptr_0],   0(%[src_ptr])                   \n\t"
1150           "lbu            %[src_ptr_r1],  4(%[src_ptr])                   \n\t"
1151           "lbu            %[src_ptr_r2],  8(%[src_ptr])                   \n\t"
1152           "lbu            %[src_ptr_r3],  12(%[src_ptr])                  \n\t"
1153           "mtlo           %[vector4a],    $ac2                            \n\t"
1154 
1155           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
1156           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1157           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1158           "dpau.h.qbr     $ac2,           %[src_ptr_l2],  %[vector1b]     \n\t"
1159           "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
1160           "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
1161 
1162           "lbu            %[src_ptr_l2],  -7(%[src_ptr])                  \n\t"
1163           "lbu            %[src_ptr_l1],  -3(%[src_ptr])                  \n\t"
1164           "lbu            %[src_ptr_0],   1(%[src_ptr])                   \n\t"
1165           "lbu            %[src_ptr_r1],  5(%[src_ptr])                   \n\t"
1166           "lbu            %[src_ptr_r2],  9(%[src_ptr])                   \n\t"
1167           "lbu            %[src_ptr_r3],  13(%[src_ptr])                  \n\t"
1168           "mtlo           %[vector4a],    $ac3                            \n\t"
1169           "extp           %[Temp1],       $ac2,           9               \n\t"
1170 
1171           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
1172           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1173           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1174           "dpau.h.qbr     $ac3,           %[src_ptr_l2],  %[vector1b]     \n\t"
1175           "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
1176           "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
1177 
1178           "lbu            %[src_ptr_l2],  -6(%[src_ptr])                  \n\t"
1179           "lbu            %[src_ptr_l1],  -2(%[src_ptr])                  \n\t"
1180           "lbu            %[src_ptr_0],   2(%[src_ptr])                   \n\t"
1181           "lbu            %[src_ptr_r1],  6(%[src_ptr])                   \n\t"
1182           "lbu            %[src_ptr_r2],  10(%[src_ptr])                  \n\t"
1183           "lbu            %[src_ptr_r3],  14(%[src_ptr])                  \n\t"
1184           "mtlo           %[vector4a],    $ac0                            \n\t"
1185           "extp           %[Temp2],       $ac3,           9               \n\t"
1186 
1187           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
1188           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1189           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1190           "dpau.h.qbr     $ac0,           %[src_ptr_l2],  %[vector1b]     \n\t"
1191           "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
1192           "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
1193 
1194           "lbu            %[src_ptr_l2],  -5(%[src_ptr])                  \n\t"
1195           "lbu            %[src_ptr_l1],  -1(%[src_ptr])                  \n\t"
1196           "lbu            %[src_ptr_0],   3(%[src_ptr])                   \n\t"
1197           "lbu            %[src_ptr_r1],  7(%[src_ptr])                   \n\t"
1198           "lbu            %[src_ptr_r2],  11(%[src_ptr])                  \n\t"
1199           "lbu            %[src_ptr_r3],  15(%[src_ptr])                  \n\t"
1200           "mtlo           %[vector4a],    $ac1                            \n\t"
1201           "extp           %[Temp3],       $ac0,           9               \n\t"
1202 
1203           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
1204           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1205           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1206           "dpau.h.qbr     $ac1,           %[src_ptr_l2],  %[vector1b]     \n\t"
1207           "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
1208           "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
1209           "extp           %[Temp4],       $ac1,           9               \n\t"
1210 
1211           : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
1212             [Temp4] "=r"(Temp4), [src_ptr_l1] "=&r"(src_ptr_l1),
1213             [src_ptr_0] "=&r"(src_ptr_0), [src_ptr_r1] "=&r"(src_ptr_r1),
1214             [src_ptr_r2] "=&r"(src_ptr_r2), [src_ptr_l2] "=&r"(src_ptr_l2),
1215             [src_ptr_r3] "=&r"(src_ptr_r3)
1216           : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
1217             [vector3b] "r"(vector3b), [vector4a] "r"(vector4a),
1218             [src_ptr] "r"(src_ptr));
1219 
1220       /* clamp and store results */
1221       output_ptr[0] = cm[Temp1];
1222       output_ptr[1] = cm[Temp2];
1223       output_ptr[2] = cm[Temp3];
1224       output_ptr[3] = cm[Temp4];
1225 
1226       output_ptr += output_pitch;
1227 
1228       /* apply filter with vectors pairs */
1229       __asm__ __volatile__(
1230           "lbu            %[src_ptr_l2],  -4(%[src_ptr])                  \n\t"
1231           "lbu            %[src_ptr_l1],  0(%[src_ptr])                   \n\t"
1232           "lbu            %[src_ptr_0],   4(%[src_ptr])                   \n\t"
1233           "lbu            %[src_ptr_r1],  8(%[src_ptr])                   \n\t"
1234           "lbu            %[src_ptr_r2],  12(%[src_ptr])                  \n\t"
1235           "lbu            %[src_ptr_r3],  16(%[src_ptr])                  \n\t"
1236           "mtlo           %[vector4a],    $ac2                            \n\t"
1237           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
1238           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1239           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1240           "dpau.h.qbr     $ac2,           %[src_ptr_l2],  %[vector1b]     \n\t"
1241           "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
1242           "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
1243 
1244           "lbu            %[src_ptr_l2],  -3(%[src_ptr])                  \n\t"
1245           "lbu            %[src_ptr_l1],  1(%[src_ptr])                   \n\t"
1246           "lbu            %[src_ptr_0],   5(%[src_ptr])                   \n\t"
1247           "lbu            %[src_ptr_r1],  9(%[src_ptr])                   \n\t"
1248           "lbu            %[src_ptr_r2],  13(%[src_ptr])                  \n\t"
1249           "lbu            %[src_ptr_r3],  17(%[src_ptr])                  \n\t"
1250           "mtlo           %[vector4a],    $ac3                            \n\t"
1251           "extp           %[Temp1],       $ac2,           9               \n\t"
1252 
1253           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
1254           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1255           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1256           "dpau.h.qbr     $ac3,           %[src_ptr_l2],  %[vector1b]     \n\t"
1257           "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
1258           "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
1259 
1260           "lbu            %[src_ptr_l2],  -2(%[src_ptr])                  \n\t"
1261           "lbu            %[src_ptr_l1],  2(%[src_ptr])                   \n\t"
1262           "lbu            %[src_ptr_0],   6(%[src_ptr])                   \n\t"
1263           "lbu            %[src_ptr_r1],  10(%[src_ptr])                  \n\t"
1264           "lbu            %[src_ptr_r2],  14(%[src_ptr])                  \n\t"
1265           "lbu            %[src_ptr_r3],  18(%[src_ptr])                  \n\t"
1266           "mtlo           %[vector4a],    $ac0                            \n\t"
1267           "extp           %[Temp2],       $ac3,           9               \n\t"
1268 
1269           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
1270           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1271           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1272           "dpau.h.qbr     $ac0,           %[src_ptr_l2],  %[vector1b]     \n\t"
1273           "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
1274           "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
1275 
1276           "lbu            %[src_ptr_l2],  -1(%[src_ptr])                  \n\t"
1277           "lbu            %[src_ptr_l1],  3(%[src_ptr])                   \n\t"
1278           "lbu            %[src_ptr_0],   7(%[src_ptr])                   \n\t"
1279           "lbu            %[src_ptr_r1],  11(%[src_ptr])                  \n\t"
1280           "lbu            %[src_ptr_r2],  15(%[src_ptr])                  \n\t"
1281           "lbu            %[src_ptr_r3],  19(%[src_ptr])                  \n\t"
1282           "mtlo           %[vector4a],    $ac1                            \n\t"
1283           "extp           %[Temp3],       $ac0,           9               \n\t"
1284 
1285           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
1286           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1287           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1288           "dpau.h.qbr     $ac1,           %[src_ptr_l2],  %[vector1b]     \n\t"
1289           "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
1290           "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
1291           "extp           %[Temp4],       $ac1,           9               \n\t"
1292 
1293           : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
1294             [Temp4] "=r"(Temp4), [src_ptr_l1] "=&r"(src_ptr_l1),
1295             [src_ptr_0] "=&r"(src_ptr_0), [src_ptr_r1] "=&r"(src_ptr_r1),
1296             [src_ptr_r2] "=&r"(src_ptr_r2), [src_ptr_l2] "=&r"(src_ptr_l2),
1297             [src_ptr_r3] "=&r"(src_ptr_r3)
1298           : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
1299             [vector3b] "r"(vector3b), [vector4a] "r"(vector4a),
1300             [src_ptr] "r"(src_ptr));
1301 
1302       /* clamp and store results */
1303       output_ptr[0] = cm[Temp1];
1304       output_ptr[1] = cm[Temp2];
1305       output_ptr[2] = cm[Temp3];
1306       output_ptr[3] = cm[Temp4];
1307 
1308       src_ptr += 8;
1309       output_ptr += output_pitch;
1310     }
1311   } else {
1312     /* 4 tap filter */
1313 
1314     /* prefetch src_ptr data to cache memory */
1315     prefetch_load(src_ptr);
1316 
1317     for (i = 2; i--;) {
1318       /* do not allow compiler to reorder instructions */
1319       __asm__ __volatile__(
1320           ".set noreorder                                                 \n\t"
1321           :
1322           :);
1323 
1324       /* apply filter with vectors pairs */
1325       __asm__ __volatile__(
1326           "lbu            %[src_ptr_l1],  -4(%[src_ptr])                  \n\t"
1327           "lbu            %[src_ptr_0],   0(%[src_ptr])                   \n\t"
1328           "lbu            %[src_ptr_r1],  4(%[src_ptr])                   \n\t"
1329           "lbu            %[src_ptr_r2],  8(%[src_ptr])                   \n\t"
1330           "mtlo           %[vector4a],    $ac2                            \n\t"
1331           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1332           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1333           "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
1334           "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
1335 
1336           "lbu            %[src_ptr_l1],  -3(%[src_ptr])                  \n\t"
1337           "lbu            %[src_ptr_0],   1(%[src_ptr])                   \n\t"
1338           "lbu            %[src_ptr_r1],  5(%[src_ptr])                   \n\t"
1339           "lbu            %[src_ptr_r2],  9(%[src_ptr])                   \n\t"
1340           "mtlo           %[vector4a],    $ac3                            \n\t"
1341           "extp           %[Temp1],       $ac2,           9               \n\t"
1342 
1343           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1344           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1345           "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
1346           "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
1347 
1348           "lbu            %[src_ptr_l1],  -2(%[src_ptr])                  \n\t"
1349           "lbu            %[src_ptr_0],   2(%[src_ptr])                   \n\t"
1350           "lbu            %[src_ptr_r1],  6(%[src_ptr])                   \n\t"
1351           "lbu            %[src_ptr_r2],  10(%[src_ptr])                  \n\t"
1352           "mtlo           %[vector4a],    $ac0                            \n\t"
1353           "extp           %[Temp2],       $ac3,           9               \n\t"
1354 
1355           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1356           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1357           "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
1358           "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
1359 
1360           "lbu            %[src_ptr_l1],  -1(%[src_ptr])                  \n\t"
1361           "lbu            %[src_ptr_0],   3(%[src_ptr])                   \n\t"
1362           "lbu            %[src_ptr_r1],  7(%[src_ptr])                   \n\t"
1363           "lbu            %[src_ptr_r2],  11(%[src_ptr])                  \n\t"
1364           "mtlo           %[vector4a],    $ac1                            \n\t"
1365           "extp           %[Temp3],       $ac0,           9               \n\t"
1366           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1367           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1368           "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
1369           "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
1370           "extp           %[Temp4],       $ac1,           9               \n\t"
1371 
1372           : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
1373             [Temp4] "=r"(Temp4), [src_ptr_l1] "=&r"(src_ptr_l1),
1374             [src_ptr_0] "=&r"(src_ptr_0), [src_ptr_r1] "=&r"(src_ptr_r1),
1375             [src_ptr_r2] "=&r"(src_ptr_r2)
1376           : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b),
1377             [vector4a] "r"(vector4a), [src_ptr] "r"(src_ptr));
1378 
1379       /* clamp and store results */
1380       output_ptr[0] = cm[Temp1];
1381       output_ptr[1] = cm[Temp2];
1382       output_ptr[2] = cm[Temp3];
1383       output_ptr[3] = cm[Temp4];
1384 
1385       output_ptr += output_pitch;
1386 
1387       /* apply filter with vectors pairs */
1388       __asm__ __volatile__(
1389           "lbu            %[src_ptr_l1],  0(%[src_ptr])                   \n\t"
1390           "lbu            %[src_ptr_0],   4(%[src_ptr])                   \n\t"
1391           "lbu            %[src_ptr_r1],  8(%[src_ptr])                   \n\t"
1392           "lbu            %[src_ptr_r2],  12(%[src_ptr])                  \n\t"
1393           "mtlo           %[vector4a],    $ac2                            \n\t"
1394           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1395           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1396           "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
1397           "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
1398 
1399           "lbu            %[src_ptr_l1],  1(%[src_ptr])                   \n\t"
1400           "lbu            %[src_ptr_0],   5(%[src_ptr])                   \n\t"
1401           "lbu            %[src_ptr_r1],  9(%[src_ptr])                   \n\t"
1402           "lbu            %[src_ptr_r2],  13(%[src_ptr])                  \n\t"
1403           "mtlo           %[vector4a],    $ac3                            \n\t"
1404           "extp           %[Temp1],       $ac2,           9               \n\t"
1405 
1406           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1407           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1408           "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
1409           "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
1410 
1411           "lbu            %[src_ptr_l1],  2(%[src_ptr])                   \n\t"
1412           "lbu            %[src_ptr_0],   6(%[src_ptr])                   \n\t"
1413           "lbu            %[src_ptr_r1],  10(%[src_ptr])                  \n\t"
1414           "lbu            %[src_ptr_r2],  14(%[src_ptr])                  \n\t"
1415           "mtlo           %[vector4a],    $ac0                            \n\t"
1416           "extp           %[Temp2],       $ac3,           9               \n\t"
1417 
1418           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1419           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1420           "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
1421           "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
1422 
1423           "lbu            %[src_ptr_l1],  3(%[src_ptr])                   \n\t"
1424           "lbu            %[src_ptr_0],   7(%[src_ptr])                   \n\t"
1425           "lbu            %[src_ptr_r1],  11(%[src_ptr])                  \n\t"
1426           "lbu            %[src_ptr_r2],  15(%[src_ptr])                  \n\t"
1427           "mtlo           %[vector4a],    $ac1                            \n\t"
1428           "extp           %[Temp3],       $ac0,           9               \n\t"
1429           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1430           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1431           "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
1432           "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
1433           "extp           %[Temp4],       $ac1,           9               \n\t"
1434 
1435           : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
1436             [Temp4] "=r"(Temp4), [src_ptr_l1] "=&r"(src_ptr_l1),
1437             [src_ptr_0] "=&r"(src_ptr_0), [src_ptr_r1] "=&r"(src_ptr_r1),
1438             [src_ptr_r2] "=&r"(src_ptr_r2)
1439           : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b),
1440             [vector4a] "r"(vector4a), [src_ptr] "r"(src_ptr));
1441 
1442       /* clamp and store results */
1443       output_ptr[0] = cm[Temp1];
1444       output_ptr[1] = cm[Temp2];
1445       output_ptr[2] = cm[Temp3];
1446       output_ptr[3] = cm[Temp4];
1447 
1448       src_ptr += 8;
1449       output_ptr += output_pitch;
1450     }
1451   }
1452 }
1453 
1454 void vp8_filter_block2d_second_pass_8(unsigned char *RESTRICT src_ptr,
1455                                       unsigned char *RESTRICT output_ptr,
1456                                       int output_pitch,
1457                                       unsigned int output_height,
1458                                       unsigned int output_width,
1459                                       unsigned int yoffset) {
1460   unsigned int i;
1461 
1462   int Temp1, Temp2, Temp3, Temp4, Temp5, Temp6, Temp7, Temp8;
1463   unsigned int vector1b, vector2b, vector3b, vector4a;
1464 
1465   unsigned char src_ptr_l2;
1466   unsigned char src_ptr_l1;
1467   unsigned char src_ptr_0;
1468   unsigned char src_ptr_r1;
1469   unsigned char src_ptr_r2;
1470   unsigned char src_ptr_r3;
1471   unsigned char *cm = ff_cropTbl + CROP_WIDTH;
1472   (void)output_width;
1473 
1474   vector4a = 64;
1475 
1476   vector1b = sub_pel_filterss[yoffset][0];
1477   vector2b = sub_pel_filterss[yoffset][2];
1478   vector3b = sub_pel_filterss[yoffset][1];
1479 
1480   if (vector1b) {
1481     /* 6 tap filter */
1482 
1483     /* prefetch src_ptr data to cache memory */
1484     prefetch_load(src_ptr);
1485 
1486     for (i = output_height; i--;) {
1487       /* apply filter with vectors pairs */
1488       __asm__ __volatile__(
1489           "lbu            %[src_ptr_l2],  -16(%[src_ptr])                 \n\t"
1490           "lbu            %[src_ptr_l1],  -8(%[src_ptr])                  \n\t"
1491           "lbu            %[src_ptr_0],   0(%[src_ptr])                   \n\t"
1492           "lbu            %[src_ptr_r1],  8(%[src_ptr])                   \n\t"
1493           "lbu            %[src_ptr_r2],  16(%[src_ptr])                  \n\t"
1494           "lbu            %[src_ptr_r3],  24(%[src_ptr])                  \n\t"
1495           "mtlo           %[vector4a],    $ac2                            \n\t"
1496 
1497           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1498           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1499           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
1500           "dpau.h.qbr     $ac2,           %[src_ptr_l2],  %[vector1b]     \n\t"
1501           "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
1502           "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
1503 
1504           "lbu            %[src_ptr_l2],  -15(%[src_ptr])                 \n\t"
1505           "lbu            %[src_ptr_l1],  -7(%[src_ptr])                  \n\t"
1506           "lbu            %[src_ptr_0],   1(%[src_ptr])                   \n\t"
1507           "lbu            %[src_ptr_r1],  9(%[src_ptr])                   \n\t"
1508           "lbu            %[src_ptr_r2],  17(%[src_ptr])                  \n\t"
1509           "lbu            %[src_ptr_r3],  25(%[src_ptr])                  \n\t"
1510           "mtlo           %[vector4a],    $ac3                            \n\t"
1511           "extp           %[Temp1],       $ac2,           9               \n\t"
1512 
1513           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
1514           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1515           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1516           "dpau.h.qbr     $ac3,           %[src_ptr_l2],  %[vector1b]     \n\t"
1517           "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
1518           "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
1519 
1520           "lbu            %[src_ptr_l2],  -14(%[src_ptr])                 \n\t"
1521           "lbu            %[src_ptr_l1],  -6(%[src_ptr])                  \n\t"
1522           "lbu            %[src_ptr_0],   2(%[src_ptr])                   \n\t"
1523           "lbu            %[src_ptr_r1],  10(%[src_ptr])                  \n\t"
1524           "lbu            %[src_ptr_r2],  18(%[src_ptr])                  \n\t"
1525           "lbu            %[src_ptr_r3],  26(%[src_ptr])                  \n\t"
1526           "mtlo           %[vector4a],    $ac0                            \n\t"
1527           "extp           %[Temp2],       $ac3,           9               \n\t"
1528 
1529           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
1530           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1531           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1532           "dpau.h.qbr     $ac0,           %[src_ptr_l2],  %[vector1b]     \n\t"
1533           "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
1534           "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
1535 
1536           "lbu            %[src_ptr_l2],  -13(%[src_ptr])                 \n\t"
1537           "lbu            %[src_ptr_l1],  -5(%[src_ptr])                  \n\t"
1538           "lbu            %[src_ptr_0],   3(%[src_ptr])                   \n\t"
1539           "lbu            %[src_ptr_r1],  11(%[src_ptr])                  \n\t"
1540           "lbu            %[src_ptr_r2],  19(%[src_ptr])                  \n\t"
1541           "lbu            %[src_ptr_r3],  27(%[src_ptr])                  \n\t"
1542           "mtlo           %[vector4a],    $ac1                            \n\t"
1543           "extp           %[Temp3],       $ac0,           9               \n\t"
1544 
1545           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
1546           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1547           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1548           "dpau.h.qbr     $ac1,           %[src_ptr_l2],  %[vector1b]     \n\t"
1549           "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
1550           "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
1551 
1552           : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
1553             [src_ptr_l1] "=&r"(src_ptr_l1), [src_ptr_0] "=&r"(src_ptr_0),
1554             [src_ptr_r1] "=&r"(src_ptr_r1), [src_ptr_r2] "=&r"(src_ptr_r2),
1555             [src_ptr_l2] "=&r"(src_ptr_l2), [src_ptr_r3] "=&r"(src_ptr_r3)
1556           : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
1557             [vector3b] "r"(vector3b), [vector4a] "r"(vector4a),
1558             [src_ptr] "r"(src_ptr));
1559 
1560       /* apply filter with vectors pairs */
1561       __asm__ __volatile__(
1562           "lbu            %[src_ptr_l2],  -12(%[src_ptr])                 \n\t"
1563           "lbu            %[src_ptr_l1],  -4(%[src_ptr])                  \n\t"
1564           "lbu            %[src_ptr_0],   4(%[src_ptr])                   \n\t"
1565           "lbu            %[src_ptr_r1],  12(%[src_ptr])                  \n\t"
1566           "lbu            %[src_ptr_r2],  20(%[src_ptr])                  \n\t"
1567           "lbu            %[src_ptr_r3],  28(%[src_ptr])                  \n\t"
1568           "mtlo           %[vector4a],    $ac2                            \n\t"
1569 
1570           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
1571           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1572           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1573           "dpau.h.qbr     $ac2,           %[src_ptr_l2],  %[vector1b]     \n\t"
1574           "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
1575           "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
1576           "extp           %[Temp4],       $ac1,           9               \n\t"
1577 
1578           "lbu            %[src_ptr_l2],  -11(%[src_ptr])                 \n\t"
1579           "lbu            %[src_ptr_l1],  -3(%[src_ptr])                  \n\t"
1580           "lbu            %[src_ptr_0],   5(%[src_ptr])                   \n\t"
1581           "lbu            %[src_ptr_r1],  13(%[src_ptr])                  \n\t"
1582           "lbu            %[src_ptr_r2],  21(%[src_ptr])                  \n\t"
1583           "lbu            %[src_ptr_r3],  29(%[src_ptr])                  \n\t"
1584           "mtlo           %[vector4a],    $ac3                            \n\t"
1585           "extp           %[Temp5],       $ac2,           9               \n\t"
1586 
1587           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
1588           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1589           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1590           "dpau.h.qbr     $ac3,           %[src_ptr_l2],  %[vector1b]     \n\t"
1591           "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
1592           "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
1593 
1594           "lbu            %[src_ptr_l2],  -10(%[src_ptr])                 \n\t"
1595           "lbu            %[src_ptr_l1],  -2(%[src_ptr])                  \n\t"
1596           "lbu            %[src_ptr_0],   6(%[src_ptr])                   \n\t"
1597           "lbu            %[src_ptr_r1],  14(%[src_ptr])                  \n\t"
1598           "lbu            %[src_ptr_r2],  22(%[src_ptr])                  \n\t"
1599           "lbu            %[src_ptr_r3],  30(%[src_ptr])                  \n\t"
1600           "mtlo           %[vector4a],    $ac0                            \n\t"
1601           "extp           %[Temp6],       $ac3,           9               \n\t"
1602 
1603           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
1604           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1605           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1606           "dpau.h.qbr     $ac0,           %[src_ptr_l2],  %[vector1b]     \n\t"
1607           "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
1608           "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
1609 
1610           "lbu            %[src_ptr_l2],  -9(%[src_ptr])                  \n\t"
1611           "lbu            %[src_ptr_l1],  -1(%[src_ptr])                  \n\t"
1612           "lbu            %[src_ptr_0],   7(%[src_ptr])                   \n\t"
1613           "lbu            %[src_ptr_r1],  15(%[src_ptr])                  \n\t"
1614           "lbu            %[src_ptr_r2],  23(%[src_ptr])                  \n\t"
1615           "lbu            %[src_ptr_r3],  31(%[src_ptr])                  \n\t"
1616           "mtlo           %[vector4a],    $ac1                            \n\t"
1617           "extp           %[Temp7],       $ac0,           9               \n\t"
1618 
1619           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
1620           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1621           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1622           "dpau.h.qbr     $ac1,           %[src_ptr_l2],  %[vector1b]     \n\t"
1623           "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
1624           "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
1625           "extp           %[Temp8],       $ac1,           9               \n\t"
1626 
1627           : [Temp4] "=&r"(Temp4), [Temp5] "=&r"(Temp5), [Temp6] "=&r"(Temp6),
1628             [Temp7] "=&r"(Temp7), [Temp8] "=r"(Temp8),
1629             [src_ptr_l1] "=&r"(src_ptr_l1), [src_ptr_0] "=&r"(src_ptr_0),
1630             [src_ptr_r1] "=&r"(src_ptr_r1), [src_ptr_r2] "=&r"(src_ptr_r2),
1631             [src_ptr_l2] "=&r"(src_ptr_l2), [src_ptr_r3] "=&r"(src_ptr_r3)
1632           : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
1633             [vector3b] "r"(vector3b), [vector4a] "r"(vector4a),
1634             [src_ptr] "r"(src_ptr));
1635 
1636       /* clamp and store results */
1637       output_ptr[0] = cm[Temp1];
1638       output_ptr[1] = cm[Temp2];
1639       output_ptr[2] = cm[Temp3];
1640       output_ptr[3] = cm[Temp4];
1641       output_ptr[4] = cm[Temp5];
1642       output_ptr[5] = cm[Temp6];
1643       output_ptr[6] = cm[Temp7];
1644       output_ptr[7] = cm[Temp8];
1645 
1646       src_ptr += 8;
1647       output_ptr += output_pitch;
1648     }
1649   } else {
1650     /* 4 tap filter */
1651 
1652     /* prefetch src_ptr data to cache memory */
1653     prefetch_load(src_ptr);
1654 
1655     for (i = output_height; i--;) {
1656       __asm__ __volatile__(
1657           "lbu            %[src_ptr_l1],  -8(%[src_ptr])                  \n\t"
1658           "lbu            %[src_ptr_0],   0(%[src_ptr])                   \n\t"
1659           "lbu            %[src_ptr_r1],  8(%[src_ptr])                   \n\t"
1660           "lbu            %[src_ptr_r2],  16(%[src_ptr])                  \n\t"
1661           "mtlo           %[vector4a],    $ac2                            \n\t"
1662           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1663           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1664           "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
1665           "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
1666 
1667           : [src_ptr_l1] "=&r"(src_ptr_l1), [src_ptr_0] "=&r"(src_ptr_0),
1668             [src_ptr_r1] "=&r"(src_ptr_r1), [src_ptr_r2] "=&r"(src_ptr_r2)
1669           : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b),
1670             [vector4a] "r"(vector4a), [src_ptr] "r"(src_ptr));
1671 
1672       __asm__ __volatile__(
1673           "lbu            %[src_ptr_l1],  -7(%[src_ptr])                  \n\t"
1674           "lbu            %[src_ptr_0],   1(%[src_ptr])                   \n\t"
1675           "lbu            %[src_ptr_r1],  9(%[src_ptr])                   \n\t"
1676           "lbu            %[src_ptr_r2],  17(%[src_ptr])                  \n\t"
1677           "mtlo           %[vector4a],    $ac3                            \n\t"
1678           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1679           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1680           "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
1681           "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
1682           "extp           %[Temp1],       $ac2,           9               \n\t"
1683 
1684           : [Temp1] "=r"(Temp1), [src_ptr_l1] "=&r"(src_ptr_l1),
1685             [src_ptr_0] "=&r"(src_ptr_0), [src_ptr_r1] "=&r"(src_ptr_r1),
1686             [src_ptr_r2] "=&r"(src_ptr_r2)
1687           : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b),
1688             [vector4a] "r"(vector4a), [src_ptr] "r"(src_ptr));
1689 
1690       src_ptr_l1 = src_ptr[-6];
1691       src_ptr_0 = src_ptr[2];
1692       src_ptr_r1 = src_ptr[10];
1693       src_ptr_r2 = src_ptr[18];
1694 
1695       __asm__ __volatile__(
1696           "mtlo           %[vector4a],    $ac0                            \n\t"
1697           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1698           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1699           "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
1700           "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
1701           "extp           %[Temp2],       $ac3,           9               \n\t"
1702 
1703           : [Temp2] "=r"(Temp2)
1704           : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b),
1705             [src_ptr_l1] "r"(src_ptr_l1), [src_ptr_0] "r"(src_ptr_0),
1706             [src_ptr_r1] "r"(src_ptr_r1), [src_ptr_r2] "r"(src_ptr_r2),
1707             [vector4a] "r"(vector4a));
1708 
1709       src_ptr_l1 = src_ptr[-5];
1710       src_ptr_0 = src_ptr[3];
1711       src_ptr_r1 = src_ptr[11];
1712       src_ptr_r2 = src_ptr[19];
1713 
1714       __asm__ __volatile__(
1715           "mtlo           %[vector4a],    $ac1                            \n\t"
1716           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1717           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1718           "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
1719           "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
1720           "extp           %[Temp3],       $ac0,           9               \n\t"
1721 
1722           : [Temp3] "=r"(Temp3)
1723           : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b),
1724             [src_ptr_l1] "r"(src_ptr_l1), [src_ptr_0] "r"(src_ptr_0),
1725             [src_ptr_r1] "r"(src_ptr_r1), [src_ptr_r2] "r"(src_ptr_r2),
1726             [vector4a] "r"(vector4a));
1727 
1728       src_ptr_l1 = src_ptr[-4];
1729       src_ptr_0 = src_ptr[4];
1730       src_ptr_r1 = src_ptr[12];
1731       src_ptr_r2 = src_ptr[20];
1732 
1733       __asm__ __volatile__(
1734           "mtlo           %[vector4a],    $ac2                            \n\t"
1735           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1736           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1737           "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
1738           "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
1739           "extp           %[Temp4],       $ac1,           9               \n\t"
1740 
1741           : [Temp4] "=r"(Temp4)
1742           : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b),
1743             [src_ptr_l1] "r"(src_ptr_l1), [src_ptr_0] "r"(src_ptr_0),
1744             [src_ptr_r1] "r"(src_ptr_r1), [src_ptr_r2] "r"(src_ptr_r2),
1745             [vector4a] "r"(vector4a));
1746 
1747       src_ptr_l1 = src_ptr[-3];
1748       src_ptr_0 = src_ptr[5];
1749       src_ptr_r1 = src_ptr[13];
1750       src_ptr_r2 = src_ptr[21];
1751 
1752       __asm__ __volatile__(
1753           "mtlo           %[vector4a],    $ac3                            \n\t"
1754           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1755           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1756           "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
1757           "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
1758           "extp           %[Temp5],       $ac2,           9               \n\t"
1759 
1760           : [Temp5] "=&r"(Temp5)
1761           : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b),
1762             [src_ptr_l1] "r"(src_ptr_l1), [src_ptr_0] "r"(src_ptr_0),
1763             [src_ptr_r1] "r"(src_ptr_r1), [src_ptr_r2] "r"(src_ptr_r2),
1764             [vector4a] "r"(vector4a));
1765 
1766       src_ptr_l1 = src_ptr[-2];
1767       src_ptr_0 = src_ptr[6];
1768       src_ptr_r1 = src_ptr[14];
1769       src_ptr_r2 = src_ptr[22];
1770 
1771       __asm__ __volatile__(
1772           "mtlo           %[vector4a],    $ac0                            \n\t"
1773           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1774           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1775           "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
1776           "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
1777           "extp           %[Temp6],       $ac3,           9               \n\t"
1778 
1779           : [Temp6] "=r"(Temp6)
1780           : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b),
1781             [src_ptr_l1] "r"(src_ptr_l1), [src_ptr_0] "r"(src_ptr_0),
1782             [src_ptr_r1] "r"(src_ptr_r1), [src_ptr_r2] "r"(src_ptr_r2),
1783             [vector4a] "r"(vector4a));
1784 
1785       src_ptr_l1 = src_ptr[-1];
1786       src_ptr_0 = src_ptr[7];
1787       src_ptr_r1 = src_ptr[15];
1788       src_ptr_r2 = src_ptr[23];
1789 
1790       __asm__ __volatile__(
1791           "mtlo           %[vector4a],    $ac1                            \n\t"
1792           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
1793           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
1794           "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
1795           "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
1796           "extp           %[Temp7],       $ac0,           9               \n\t"
1797           "extp           %[Temp8],       $ac1,           9               \n\t"
1798 
1799           : [Temp7] "=&r"(Temp7), [Temp8] "=r"(Temp8)
1800           : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b),
1801             [src_ptr_l1] "r"(src_ptr_l1), [src_ptr_0] "r"(src_ptr_0),
1802             [src_ptr_r1] "r"(src_ptr_r1), [src_ptr_r2] "r"(src_ptr_r2),
1803             [vector4a] "r"(vector4a));
1804 
1805       /* clamp and store results */
1806       output_ptr[0] = cm[Temp1];
1807       output_ptr[1] = cm[Temp2];
1808       output_ptr[2] = cm[Temp3];
1809       output_ptr[3] = cm[Temp4];
1810       output_ptr[4] = cm[Temp5];
1811       output_ptr[5] = cm[Temp6];
1812       output_ptr[6] = cm[Temp7];
1813       output_ptr[7] = cm[Temp8];
1814 
1815       src_ptr += 8;
1816       output_ptr += output_pitch;
1817     }
1818   }
1819 }
1820 
vp8_filter_block2d_second_pass161(unsigned char * RESTRICT src_ptr,unsigned char * RESTRICT output_ptr,int output_pitch,const unsigned short * vp8_filter)1821 void vp8_filter_block2d_second_pass161(unsigned char *RESTRICT src_ptr,
1822                                        unsigned char *RESTRICT output_ptr,
1823                                        int output_pitch,
1824                                        const unsigned short *vp8_filter) {
1825   unsigned int i, j;
1826 
1827   int Temp1, Temp2, Temp3, Temp4, Temp5, Temp6, Temp7, Temp8;
1828   unsigned int vector4a;
1829   unsigned int vector1b, vector2b, vector3b;
1830 
1831   unsigned char src_ptr_l2;
1832   unsigned char src_ptr_l1;
1833   unsigned char src_ptr_0;
1834   unsigned char src_ptr_r1;
1835   unsigned char src_ptr_r2;
1836   unsigned char src_ptr_r3;
1837   unsigned char *cm = ff_cropTbl + CROP_WIDTH;
1838 
1839   vector4a = 64;
1840 
1841   vector1b = vp8_filter[0];
1842   vector2b = vp8_filter[2];
1843   vector3b = vp8_filter[1];
1844 
1845   if (vector1b == 0) {
1846     /* 4 tap filter */
1847 
1848     /* prefetch src_ptr data to cache memory */
1849     prefetch_load(src_ptr + 16);
1850 
1851     for (i = 16; i--;) {
1852       /* unrolling for loop */
1853       for (j = 0; j < 16; j += 8) {
1854         /* apply filter with vectors pairs */
1855         __asm__ __volatile__(
1856             "lbu            %[src_ptr_l1],  -16(%[src_ptr])                 "
1857             "\n\t"
1858             "lbu            %[src_ptr_0],   0(%[src_ptr])                   "
1859             "\n\t"
1860             "lbu            %[src_ptr_r1],  16(%[src_ptr])                  "
1861             "\n\t"
1862             "lbu            %[src_ptr_r2],  32(%[src_ptr])                  "
1863             "\n\t"
1864             "mtlo           %[vector4a],    $ac2                            "
1865             "\n\t"
1866             "append         %[src_ptr_0],   %[src_ptr_r1],  8               "
1867             "\n\t"
1868             "append         %[src_ptr_l1],  %[src_ptr_r2],  8               "
1869             "\n\t"
1870             "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     "
1871             "\n\t"
1872             "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     "
1873             "\n\t"
1874 
1875             "lbu            %[src_ptr_l1],  -15(%[src_ptr])                 "
1876             "\n\t"
1877             "lbu            %[src_ptr_0],   1(%[src_ptr])                   "
1878             "\n\t"
1879             "lbu            %[src_ptr_r1],  17(%[src_ptr])                  "
1880             "\n\t"
1881             "lbu            %[src_ptr_r2],  33(%[src_ptr])                  "
1882             "\n\t"
1883             "mtlo           %[vector4a],    $ac3                            "
1884             "\n\t"
1885             "extp           %[Temp1],       $ac2,           9               "
1886             "\n\t"
1887 
1888             "append         %[src_ptr_0],   %[src_ptr_r1],  8               "
1889             "\n\t"
1890             "append         %[src_ptr_l1],  %[src_ptr_r2],  8               "
1891             "\n\t"
1892             "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     "
1893             "\n\t"
1894             "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     "
1895             "\n\t"
1896 
1897             "lbu            %[src_ptr_l1],  -14(%[src_ptr])                 "
1898             "\n\t"
1899             "lbu            %[src_ptr_0],   2(%[src_ptr])                   "
1900             "\n\t"
1901             "lbu            %[src_ptr_r1],  18(%[src_ptr])                  "
1902             "\n\t"
1903             "lbu            %[src_ptr_r2],  34(%[src_ptr])                  "
1904             "\n\t"
1905             "mtlo           %[vector4a],    $ac1                            "
1906             "\n\t"
1907             "extp           %[Temp2],       $ac3,           9               "
1908             "\n\t"
1909 
1910             "append         %[src_ptr_0],   %[src_ptr_r1],  8               "
1911             "\n\t"
1912             "append         %[src_ptr_l1],  %[src_ptr_r2],  8               "
1913             "\n\t"
1914             "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     "
1915             "\n\t"
1916             "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     "
1917             "\n\t"
1918 
1919             "lbu            %[src_ptr_l1],  -13(%[src_ptr])                 "
1920             "\n\t"
1921             "lbu            %[src_ptr_0],   3(%[src_ptr])                   "
1922             "\n\t"
1923             "lbu            %[src_ptr_r1],  19(%[src_ptr])                  "
1924             "\n\t"
1925             "lbu            %[src_ptr_r2],  35(%[src_ptr])                  "
1926             "\n\t"
1927             "mtlo           %[vector4a],    $ac3                            "
1928             "\n\t"
1929             "extp           %[Temp3],       $ac1,           9               "
1930             "\n\t"
1931 
1932             "append         %[src_ptr_0],   %[src_ptr_r1],  8               "
1933             "\n\t"
1934             "append         %[src_ptr_l1],  %[src_ptr_r2],  8               "
1935             "\n\t"
1936             "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     "
1937             "\n\t"
1938             "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     "
1939             "\n\t"
1940 
1941             "lbu            %[src_ptr_l1],  -12(%[src_ptr])                 "
1942             "\n\t"
1943             "lbu            %[src_ptr_0],   4(%[src_ptr])                   "
1944             "\n\t"
1945             "lbu            %[src_ptr_r1],  20(%[src_ptr])                  "
1946             "\n\t"
1947             "lbu            %[src_ptr_r2],  36(%[src_ptr])                  "
1948             "\n\t"
1949             "mtlo           %[vector4a],    $ac2                            "
1950             "\n\t"
1951             "extp           %[Temp4],       $ac3,           9               "
1952             "\n\t"
1953 
1954             "append         %[src_ptr_0],   %[src_ptr_r1],  8               "
1955             "\n\t"
1956             "append         %[src_ptr_l1],  %[src_ptr_r2],  8               "
1957             "\n\t"
1958             "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     "
1959             "\n\t"
1960             "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     "
1961             "\n\t"
1962 
1963             "lbu            %[src_ptr_l1],  -11(%[src_ptr])                 "
1964             "\n\t"
1965             "lbu            %[src_ptr_0],   5(%[src_ptr])                   "
1966             "\n\t"
1967             "lbu            %[src_ptr_r1],  21(%[src_ptr])                  "
1968             "\n\t"
1969             "lbu            %[src_ptr_r2],  37(%[src_ptr])                  "
1970             "\n\t"
1971             "mtlo           %[vector4a],    $ac3                            "
1972             "\n\t"
1973             "extp           %[Temp5],       $ac2,           9               "
1974             "\n\t"
1975 
1976             "append         %[src_ptr_0],   %[src_ptr_r1],  8               "
1977             "\n\t"
1978             "append         %[src_ptr_l1],  %[src_ptr_r2],  8               "
1979             "\n\t"
1980             "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     "
1981             "\n\t"
1982             "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     "
1983             "\n\t"
1984 
1985             "lbu            %[src_ptr_l1],  -10(%[src_ptr])                 "
1986             "\n\t"
1987             "lbu            %[src_ptr_0],   6(%[src_ptr])                   "
1988             "\n\t"
1989             "lbu            %[src_ptr_r1],  22(%[src_ptr])                  "
1990             "\n\t"
1991             "lbu            %[src_ptr_r2],  38(%[src_ptr])                  "
1992             "\n\t"
1993             "mtlo           %[vector4a],    $ac1                            "
1994             "\n\t"
1995             "extp           %[Temp6],       $ac3,           9               "
1996             "\n\t"
1997 
1998             "append         %[src_ptr_0],   %[src_ptr_r1],  8               "
1999             "\n\t"
2000             "append         %[src_ptr_l1],  %[src_ptr_r2],  8               "
2001             "\n\t"
2002             "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     "
2003             "\n\t"
2004             "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     "
2005             "\n\t"
2006 
2007             "lbu            %[src_ptr_l1],  -9(%[src_ptr])                  "
2008             "\n\t"
2009             "lbu            %[src_ptr_0],   7(%[src_ptr])                   "
2010             "\n\t"
2011             "lbu            %[src_ptr_r1],  23(%[src_ptr])                  "
2012             "\n\t"
2013             "lbu            %[src_ptr_r2],  39(%[src_ptr])                  "
2014             "\n\t"
2015             "mtlo           %[vector4a],    $ac3                            "
2016             "\n\t"
2017             "extp           %[Temp7],       $ac1,           9               "
2018             "\n\t"
2019 
2020             "append         %[src_ptr_0],   %[src_ptr_r1],  8               "
2021             "\n\t"
2022             "append         %[src_ptr_l1],  %[src_ptr_r2],  8               "
2023             "\n\t"
2024             "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     "
2025             "\n\t"
2026             "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     "
2027             "\n\t"
2028             "extp           %[Temp8],       $ac3,           9               "
2029             "\n\t"
2030 
2031             : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
2032               [Temp4] "=&r"(Temp4), [Temp5] "=&r"(Temp5), [Temp6] "=&r"(Temp6),
2033               [Temp7] "=&r"(Temp7), [Temp8] "=r"(Temp8),
2034               [src_ptr_l1] "=&r"(src_ptr_l1), [src_ptr_0] "=&r"(src_ptr_0),
2035               [src_ptr_r1] "=&r"(src_ptr_r1), [src_ptr_r2] "=&r"(src_ptr_r2)
2036             : [vector2b] "r"(vector2b), [vector3b] "r"(vector3b),
2037               [vector4a] "r"(vector4a), [src_ptr] "r"(src_ptr));
2038 
2039         /* clamp and store results */
2040         output_ptr[j] = cm[Temp1];
2041         output_ptr[j + 1] = cm[Temp2];
2042         output_ptr[j + 2] = cm[Temp3];
2043         output_ptr[j + 3] = cm[Temp4];
2044         output_ptr[j + 4] = cm[Temp5];
2045         output_ptr[j + 5] = cm[Temp6];
2046         output_ptr[j + 6] = cm[Temp7];
2047         output_ptr[j + 7] = cm[Temp8];
2048 
2049         src_ptr += 8;
2050       }
2051 
2052       output_ptr += output_pitch;
2053     }
2054   } else {
2055     /* 6 tap filter */
2056 
2057     /* prefetch src_ptr data to cache memory */
2058     prefetch_load(src_ptr + 16);
2059 
2060     /* unroll for loop */
2061     for (i = 16; i--;) {
2062       /* apply filter with vectors pairs */
2063       __asm__ __volatile__(
2064           "lbu            %[src_ptr_l2],  -32(%[src_ptr])                 \n\t"
2065           "lbu            %[src_ptr_l1],  -16(%[src_ptr])                 \n\t"
2066           "lbu            %[src_ptr_0],   0(%[src_ptr])                   \n\t"
2067           "lbu            %[src_ptr_r1],  16(%[src_ptr])                  \n\t"
2068           "lbu            %[src_ptr_r2],  32(%[src_ptr])                  \n\t"
2069           "lbu            %[src_ptr_r3],  48(%[src_ptr])                  \n\t"
2070           "mtlo           %[vector4a],    $ac2                            \n\t"
2071 
2072           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
2073           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
2074           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
2075           "dpau.h.qbr     $ac2,           %[src_ptr_l2],  %[vector1b]     \n\t"
2076           "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
2077           "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
2078 
2079           "lbu            %[src_ptr_l2],  -31(%[src_ptr])                 \n\t"
2080           "lbu            %[src_ptr_l1],  -15(%[src_ptr])                 \n\t"
2081           "lbu            %[src_ptr_0],   1(%[src_ptr])                   \n\t"
2082           "lbu            %[src_ptr_r1],  17(%[src_ptr])                  \n\t"
2083           "lbu            %[src_ptr_r2],  33(%[src_ptr])                  \n\t"
2084           "lbu            %[src_ptr_r3],  49(%[src_ptr])                  \n\t"
2085           "mtlo           %[vector4a],    $ac0                            \n\t"
2086           "extp           %[Temp1],       $ac2,           9               \n\t"
2087 
2088           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
2089           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
2090           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
2091           "dpau.h.qbr     $ac0,           %[src_ptr_l2],  %[vector1b]     \n\t"
2092           "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
2093           "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
2094 
2095           "lbu            %[src_ptr_l2],  -30(%[src_ptr])                 \n\t"
2096           "lbu            %[src_ptr_l1],  -14(%[src_ptr])                 \n\t"
2097           "lbu            %[src_ptr_0],   2(%[src_ptr])                   \n\t"
2098           "lbu            %[src_ptr_r1],  18(%[src_ptr])                  \n\t"
2099           "lbu            %[src_ptr_r2],  34(%[src_ptr])                  \n\t"
2100           "lbu            %[src_ptr_r3],  50(%[src_ptr])                  \n\t"
2101           "mtlo           %[vector4a],    $ac1                            \n\t"
2102           "extp           %[Temp2],       $ac0,           9               \n\t"
2103 
2104           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
2105           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
2106           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
2107           "dpau.h.qbr     $ac1,           %[src_ptr_l2],  %[vector1b]     \n\t"
2108           "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
2109           "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
2110 
2111           "lbu            %[src_ptr_l2],  -29(%[src_ptr])                 \n\t"
2112           "lbu            %[src_ptr_l1],  -13(%[src_ptr])                 \n\t"
2113           "lbu            %[src_ptr_0],   3(%[src_ptr])                   \n\t"
2114           "lbu            %[src_ptr_r1],  19(%[src_ptr])                  \n\t"
2115           "lbu            %[src_ptr_r2],  35(%[src_ptr])                  \n\t"
2116           "lbu            %[src_ptr_r3],  51(%[src_ptr])                  \n\t"
2117           "mtlo           %[vector4a],    $ac3                            \n\t"
2118           "extp           %[Temp3],       $ac1,           9               \n\t"
2119 
2120           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
2121           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
2122           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
2123           "dpau.h.qbr     $ac3,           %[src_ptr_l2],  %[vector1b]     \n\t"
2124           "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
2125           "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
2126 
2127           "lbu            %[src_ptr_l2],  -28(%[src_ptr])                 \n\t"
2128           "lbu            %[src_ptr_l1],  -12(%[src_ptr])                 \n\t"
2129           "lbu            %[src_ptr_0],   4(%[src_ptr])                   \n\t"
2130           "lbu            %[src_ptr_r1],  20(%[src_ptr])                  \n\t"
2131           "lbu            %[src_ptr_r2],  36(%[src_ptr])                  \n\t"
2132           "lbu            %[src_ptr_r3],  52(%[src_ptr])                  \n\t"
2133           "mtlo           %[vector4a],    $ac2                            \n\t"
2134           "extp           %[Temp4],       $ac3,           9               \n\t"
2135 
2136           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
2137           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
2138           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
2139           "dpau.h.qbr     $ac2,           %[src_ptr_l2],  %[vector1b]     \n\t"
2140           "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
2141           "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
2142 
2143           "lbu            %[src_ptr_l2],  -27(%[src_ptr])                 \n\t"
2144           "lbu            %[src_ptr_l1],  -11(%[src_ptr])                 \n\t"
2145           "lbu            %[src_ptr_0],   5(%[src_ptr])                   \n\t"
2146           "lbu            %[src_ptr_r1],  21(%[src_ptr])                  \n\t"
2147           "lbu            %[src_ptr_r2],  37(%[src_ptr])                  \n\t"
2148           "lbu            %[src_ptr_r3],  53(%[src_ptr])                  \n\t"
2149           "mtlo           %[vector4a],    $ac0                            \n\t"
2150           "extp           %[Temp5],       $ac2,           9               \n\t"
2151 
2152           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
2153           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
2154           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
2155           "dpau.h.qbr     $ac0,           %[src_ptr_l2],  %[vector1b]     \n\t"
2156           "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
2157           "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
2158 
2159           "lbu            %[src_ptr_l2],  -26(%[src_ptr])                 \n\t"
2160           "lbu            %[src_ptr_l1],  -10(%[src_ptr])                 \n\t"
2161           "lbu            %[src_ptr_0],   6(%[src_ptr])                   \n\t"
2162           "lbu            %[src_ptr_r1],  22(%[src_ptr])                  \n\t"
2163           "lbu            %[src_ptr_r2],  38(%[src_ptr])                  \n\t"
2164           "lbu            %[src_ptr_r3],  54(%[src_ptr])                  \n\t"
2165           "mtlo           %[vector4a],    $ac1                            \n\t"
2166           "extp           %[Temp6],       $ac0,           9               \n\t"
2167 
2168           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
2169           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
2170           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
2171           "dpau.h.qbr     $ac1,           %[src_ptr_l2],  %[vector1b]     \n\t"
2172           "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
2173           "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
2174 
2175           "lbu            %[src_ptr_l2],  -25(%[src_ptr])                 \n\t"
2176           "lbu            %[src_ptr_l1],  -9(%[src_ptr])                  \n\t"
2177           "lbu            %[src_ptr_0],   7(%[src_ptr])                   \n\t"
2178           "lbu            %[src_ptr_r1],  23(%[src_ptr])                  \n\t"
2179           "lbu            %[src_ptr_r2],  39(%[src_ptr])                  \n\t"
2180           "lbu            %[src_ptr_r3],  55(%[src_ptr])                  \n\t"
2181           "mtlo           %[vector4a],    $ac3                            \n\t"
2182           "extp           %[Temp7],       $ac1,           9               \n\t"
2183 
2184           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
2185           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
2186           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
2187           "dpau.h.qbr     $ac3,           %[src_ptr_l2],  %[vector1b]     \n\t"
2188           "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
2189           "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
2190           "extp           %[Temp8],       $ac3,           9               \n\t"
2191 
2192           : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
2193             [Temp4] "=&r"(Temp4), [Temp5] "=&r"(Temp5), [Temp6] "=&r"(Temp6),
2194             [Temp7] "=&r"(Temp7), [Temp8] "=r"(Temp8),
2195             [src_ptr_l1] "=&r"(src_ptr_l1), [src_ptr_0] "=&r"(src_ptr_0),
2196             [src_ptr_r1] "=&r"(src_ptr_r1), [src_ptr_r2] "=&r"(src_ptr_r2),
2197             [src_ptr_l2] "=&r"(src_ptr_l2), [src_ptr_r3] "=&r"(src_ptr_r3)
2198           : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
2199             [vector3b] "r"(vector3b), [vector4a] "r"(vector4a),
2200             [src_ptr] "r"(src_ptr));
2201 
2202       /* clamp and store results */
2203       output_ptr[0] = cm[Temp1];
2204       output_ptr[1] = cm[Temp2];
2205       output_ptr[2] = cm[Temp3];
2206       output_ptr[3] = cm[Temp4];
2207       output_ptr[4] = cm[Temp5];
2208       output_ptr[5] = cm[Temp6];
2209       output_ptr[6] = cm[Temp7];
2210       output_ptr[7] = cm[Temp8];
2211 
2212       /* apply filter with vectors pairs */
2213       __asm__ __volatile__(
2214           "lbu            %[src_ptr_l2],  -24(%[src_ptr])                 \n\t"
2215           "lbu            %[src_ptr_l1],  -8(%[src_ptr])                  \n\t"
2216           "lbu            %[src_ptr_0],   8(%[src_ptr])                   \n\t"
2217           "lbu            %[src_ptr_r1],  24(%[src_ptr])                  \n\t"
2218           "lbu            %[src_ptr_r2],  40(%[src_ptr])                  \n\t"
2219           "lbu            %[src_ptr_r3],  56(%[src_ptr])                  \n\t"
2220           "mtlo           %[vector4a],    $ac2                            \n\t"
2221 
2222           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
2223           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
2224           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
2225           "dpau.h.qbr     $ac2,           %[src_ptr_l2],  %[vector1b]     \n\t"
2226           "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
2227           "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
2228 
2229           "lbu            %[src_ptr_l2],  -23(%[src_ptr])                 \n\t"
2230           "lbu            %[src_ptr_l1],  -7(%[src_ptr])                  \n\t"
2231           "lbu            %[src_ptr_0],   9(%[src_ptr])                   \n\t"
2232           "lbu            %[src_ptr_r1],  25(%[src_ptr])                  \n\t"
2233           "lbu            %[src_ptr_r2],  41(%[src_ptr])                  \n\t"
2234           "lbu            %[src_ptr_r3],  57(%[src_ptr])                  \n\t"
2235           "mtlo           %[vector4a],    $ac0                            \n\t"
2236           "extp           %[Temp1],       $ac2,           9               \n\t"
2237 
2238           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
2239           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
2240           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
2241           "dpau.h.qbr     $ac0,           %[src_ptr_l2],  %[vector1b]     \n\t"
2242           "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
2243           "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
2244 
2245           "lbu            %[src_ptr_l2],  -22(%[src_ptr])                 \n\t"
2246           "lbu            %[src_ptr_l1],  -6(%[src_ptr])                  \n\t"
2247           "lbu            %[src_ptr_0],   10(%[src_ptr])                  \n\t"
2248           "lbu            %[src_ptr_r1],  26(%[src_ptr])                  \n\t"
2249           "lbu            %[src_ptr_r2],  42(%[src_ptr])                  \n\t"
2250           "lbu            %[src_ptr_r3],  58(%[src_ptr])                  \n\t"
2251           "mtlo           %[vector4a],    $ac1                            \n\t"
2252           "extp           %[Temp2],       $ac0,           9               \n\t"
2253 
2254           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
2255           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
2256           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
2257           "dpau.h.qbr     $ac1,           %[src_ptr_l2],  %[vector1b]     \n\t"
2258           "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
2259           "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
2260 
2261           "lbu            %[src_ptr_l2],  -21(%[src_ptr])                 \n\t"
2262           "lbu            %[src_ptr_l1],  -5(%[src_ptr])                  \n\t"
2263           "lbu            %[src_ptr_0],   11(%[src_ptr])                  \n\t"
2264           "lbu            %[src_ptr_r1],  27(%[src_ptr])                  \n\t"
2265           "lbu            %[src_ptr_r2],  43(%[src_ptr])                  \n\t"
2266           "lbu            %[src_ptr_r3],  59(%[src_ptr])                  \n\t"
2267           "mtlo           %[vector4a],    $ac3                            \n\t"
2268           "extp           %[Temp3],       $ac1,           9               \n\t"
2269 
2270           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
2271           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
2272           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
2273           "dpau.h.qbr     $ac3,           %[src_ptr_l2],  %[vector1b]     \n\t"
2274           "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
2275           "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
2276 
2277           "lbu            %[src_ptr_l2],  -20(%[src_ptr])                 \n\t"
2278           "lbu            %[src_ptr_l1],  -4(%[src_ptr])                  \n\t"
2279           "lbu            %[src_ptr_0],   12(%[src_ptr])                  \n\t"
2280           "lbu            %[src_ptr_r1],  28(%[src_ptr])                  \n\t"
2281           "lbu            %[src_ptr_r2],  44(%[src_ptr])                  \n\t"
2282           "lbu            %[src_ptr_r3],  60(%[src_ptr])                  \n\t"
2283           "mtlo           %[vector4a],    $ac2                            \n\t"
2284           "extp           %[Temp4],       $ac3,           9               \n\t"
2285 
2286           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
2287           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
2288           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
2289           "dpau.h.qbr     $ac2,           %[src_ptr_l2],  %[vector1b]     \n\t"
2290           "dpau.h.qbr     $ac2,           %[src_ptr_0],   %[vector2b]     \n\t"
2291           "dpsu.h.qbr     $ac2,           %[src_ptr_l1],  %[vector3b]     \n\t"
2292 
2293           "lbu            %[src_ptr_l2],  -19(%[src_ptr])                 \n\t"
2294           "lbu            %[src_ptr_l1],  -3(%[src_ptr])                  \n\t"
2295           "lbu            %[src_ptr_0],   13(%[src_ptr])                  \n\t"
2296           "lbu            %[src_ptr_r1],  29(%[src_ptr])                  \n\t"
2297           "lbu            %[src_ptr_r2],  45(%[src_ptr])                  \n\t"
2298           "lbu            %[src_ptr_r3],  61(%[src_ptr])                  \n\t"
2299           "mtlo           %[vector4a],    $ac0                            \n\t"
2300           "extp           %[Temp5],       $ac2,           9               \n\t"
2301 
2302           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
2303           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
2304           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
2305           "dpau.h.qbr     $ac0,           %[src_ptr_l2],  %[vector1b]     \n\t"
2306           "dpau.h.qbr     $ac0,           %[src_ptr_0],   %[vector2b]     \n\t"
2307           "dpsu.h.qbr     $ac0,           %[src_ptr_l1],  %[vector3b]     \n\t"
2308 
2309           "lbu            %[src_ptr_l2],  -18(%[src_ptr])                 \n\t"
2310           "lbu            %[src_ptr_l1],  -2(%[src_ptr])                  \n\t"
2311           "lbu            %[src_ptr_0],   14(%[src_ptr])                  \n\t"
2312           "lbu            %[src_ptr_r1],  30(%[src_ptr])                  \n\t"
2313           "lbu            %[src_ptr_r2],  46(%[src_ptr])                  \n\t"
2314           "lbu            %[src_ptr_r3],  62(%[src_ptr])                  \n\t"
2315           "mtlo           %[vector4a],    $ac1                            \n\t"
2316           "extp           %[Temp6],       $ac0,           9               \n\t"
2317 
2318           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
2319           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
2320           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
2321           "dpau.h.qbr     $ac1,           %[src_ptr_l2],  %[vector1b]     \n\t"
2322           "dpau.h.qbr     $ac1,           %[src_ptr_0],   %[vector2b]     \n\t"
2323           "dpsu.h.qbr     $ac1,           %[src_ptr_l1],  %[vector3b]     \n\t"
2324 
2325           "lbu            %[src_ptr_l2],  -17(%[src_ptr])                 \n\t"
2326           "lbu            %[src_ptr_l1],  -1(%[src_ptr])                  \n\t"
2327           "lbu            %[src_ptr_0],   15(%[src_ptr])                  \n\t"
2328           "lbu            %[src_ptr_r1],  31(%[src_ptr])                  \n\t"
2329           "lbu            %[src_ptr_r2],  47(%[src_ptr])                  \n\t"
2330           "lbu            %[src_ptr_r3],  63(%[src_ptr])                  \n\t"
2331           "mtlo           %[vector4a],    $ac3                            \n\t"
2332           "extp           %[Temp7],       $ac1,           9               \n\t"
2333 
2334           "append         %[src_ptr_l2],  %[src_ptr_r3],  8               \n\t"
2335           "append         %[src_ptr_0],   %[src_ptr_r1],  8               \n\t"
2336           "append         %[src_ptr_l1],  %[src_ptr_r2],  8               \n\t"
2337           "dpau.h.qbr     $ac3,           %[src_ptr_l2],  %[vector1b]     \n\t"
2338           "dpau.h.qbr     $ac3,           %[src_ptr_0],   %[vector2b]     \n\t"
2339           "dpsu.h.qbr     $ac3,           %[src_ptr_l1],  %[vector3b]     \n\t"
2340           "extp           %[Temp8],       $ac3,           9               \n\t"
2341 
2342           : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3),
2343             [Temp4] "=&r"(Temp4), [Temp5] "=&r"(Temp5), [Temp6] "=&r"(Temp6),
2344             [Temp7] "=&r"(Temp7), [Temp8] "=r"(Temp8),
2345             [src_ptr_l1] "=&r"(src_ptr_l1), [src_ptr_0] "=&r"(src_ptr_0),
2346             [src_ptr_r1] "=&r"(src_ptr_r1), [src_ptr_r2] "=&r"(src_ptr_r2),
2347             [src_ptr_l2] "=&r"(src_ptr_l2), [src_ptr_r3] "=&r"(src_ptr_r3)
2348           : [vector1b] "r"(vector1b), [vector2b] "r"(vector2b),
2349             [vector3b] "r"(vector3b), [vector4a] "r"(vector4a),
2350             [src_ptr] "r"(src_ptr));
2351 
2352       src_ptr += 16;
2353       output_ptr[8] = cm[Temp1];
2354       output_ptr[9] = cm[Temp2];
2355       output_ptr[10] = cm[Temp3];
2356       output_ptr[11] = cm[Temp4];
2357       output_ptr[12] = cm[Temp5];
2358       output_ptr[13] = cm[Temp6];
2359       output_ptr[14] = cm[Temp7];
2360       output_ptr[15] = cm[Temp8];
2361 
2362       output_ptr += output_pitch;
2363     }
2364   }
2365 }
2366 
/*
 * Six-tap sub-pixel prediction of a 4x4 block (DSPr2 version).
 *
 * yoffset != 0: the horizontal pass writes into a local scratch buffer
 *               (sized 9 * 4), starting two source rows above src_ptr, and
 *               the vertical pass then produces dst_ptr from that buffer.
 * yoffset == 0: only the horizontal pass runs and it writes directly to
 *               dst_ptr, with dst_pitch passed as its last argument.
 */
void vp8_sixtap_predict4x4_dspr2(unsigned char *RESTRICT src_ptr,
                                 int src_pixels_per_line, int xoffset,
                                 int yoffset, unsigned char *RESTRICT dst_ptr,
                                 int dst_pitch) {
  unsigned char scratch[9 * 4]; /* intermediate data between the two passes */
  const unsigned int extract_pos = 16;

  /* set the bit position used when extracting from the accumulators */
  __asm__ __volatile__("wrdsp      %[extract_pos],     1           \n\t"
                       :
                       : [extract_pos] "r"(extract_pos));

  if (!yoffset) {
    /* no vertical filtering: the first pass output is the final result */
    vp8_filter_block2d_first_pass_4(src_ptr, dst_ptr, src_pixels_per_line, 4,
                                    xoffset, dst_pitch);
    return;
  }

  /* horizontal pass into the scratch buffer ... */
  vp8_filter_block2d_first_pass_4(src_ptr - (2 * src_pixels_per_line), scratch,
                                  src_pixels_per_line, 9, xoffset, 4);

  /* ... followed by the vertical pass into the destination */
  vp8_filter_block2d_second_pass4(scratch + 8, dst_ptr, dst_pitch, yoffset);
}
2390 
/*
 * Six-tap sub-pixel prediction of an 8x8 block (DSPr2 version).
 *
 * yoffset != 0: 13 rows of 8 bytes, starting two source rows above src_ptr,
 *               are placed in FData - horizontally filtered when xoffset is
 *               non-zero, plainly copied otherwise - and the vertical pass
 *               then writes the result to dst_ptr.
 * yoffset == 0: the horizontal pass (xoffset != 0) or a plain 8x8 copy
 *               (xoffset == 0) writes straight to dst_ptr, one row every
 *               dst_pitch bytes.
 *
 * The row copies use one small asm statement per row.  The pointers are
 * advanced in C so that the asm never modifies an input-only operand, and the
 * "memory" clobber tells the compiler that the asm stores through a pointer.
 */
void vp8_sixtap_predict8x8_dspr2(unsigned char *RESTRICT src_ptr,
                                 int src_pixels_per_line, int xoffset,
                                 int yoffset, unsigned char *RESTRICT dst_ptr,
                                 int dst_pitch) {
  unsigned char FData[13 * 8]; /* Temp data buffer used in filtering */
  unsigned int pos, Temp1, Temp2;
  int row;

  pos = 16;

  /* bit position for extract from acc */
  __asm__ __volatile__("wrdsp      %[pos],     1               \n\t"
                       :
                       : [pos] "r"(pos));

  if (yoffset) {
    src_ptr = src_ptr - (2 * src_pixels_per_line);

    if (xoffset) {
      /* filter 1-D horizontally... */
      vp8_filter_block2d_first_pass_8_all(src_ptr, FData, src_pixels_per_line,
                                          13, xoffset, 8);
    } else {
      const unsigned char *src_row = src_ptr;
      unsigned char *fdata_row = FData;

      /* prefetch src_ptr data to cache memory */
      prefetch_load(src_ptr + 2 * src_pixels_per_line);

      /* no horizontal filtering: copy 13 source rows into FData.
       * NOTE(review): "sw" needs a word-aligned address; this relies on the
       * compiler word-aligning FData, as the original code did. */
      for (row = 0; row < 13; ++row) {
        __asm__ __volatile__(
            "ulw    %[Temp1],   0(%[src_row])                           \n\t"
            "ulw    %[Temp2],   4(%[src_row])                           \n\t"
            "sw     %[Temp1],   0(%[fdata_row])                         \n\t"
            "sw     %[Temp2],   4(%[fdata_row])                         \n\t"

            : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2)
            : [fdata_row] "r"(fdata_row), [src_row] "r"(src_row)
            : "memory");

        src_row += src_pixels_per_line;
        fdata_row += 8;
      }
    }

    /* filter vertically... */
    vp8_filter_block2d_second_pass_8(FData + 16, dst_ptr, dst_pitch, 8, 8,
                                     yoffset);
  } else {
    /* if (yoffset == 0) the first pass saves its data directly to dst_ptr */
    if (xoffset) {
      vp8_filter_block2d_first_pass_8_all(src_ptr, dst_ptr, src_pixels_per_line,
                                          8, xoffset, dst_pitch);
    } else {
      const unsigned char *src_row = src_ptr;
      unsigned char *dst_row = dst_ptr;

      /* copy from src buffer to dst buffer, honouring dst_pitch.
       * NOTE(review): "sw" assumes every destination row is word-aligned,
       * i.e. dst_ptr aligned and dst_pitch a multiple of 4 - confirm against
       * callers. */
      for (row = 0; row < 8; ++row) {
        __asm__ __volatile__(
            "ulw    %[Temp1],   0(%[src_row])                           \n\t"
            "ulw    %[Temp2],   4(%[src_row])                           \n\t"
            "sw     %[Temp1],   0(%[dst_row])                           \n\t"
            "sw     %[Temp2],   4(%[dst_row])                           \n\t"

            : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2)
            : [dst_row] "r"(dst_row), [src_row] "r"(src_row)
            : "memory");

        src_row += src_pixels_per_line;
        dst_row += dst_pitch;
      }
    }
  }
}
2566 
/*
 * Six-tap sub-pixel prediction of an 8-wide, 4-high block (DSPr2 version).
 *
 * yoffset != 0: 9 rows of 8 bytes, starting two source rows above src_ptr,
 *               are placed in FData - horizontally filtered when xoffset is
 *               non-zero, plainly copied otherwise - and the vertical pass
 *               then writes the result to dst_ptr.
 * yoffset == 0: the horizontal pass (xoffset != 0) or a plain 8x4 copy
 *               (xoffset == 0) writes straight to dst_ptr, one row every
 *               dst_pitch bytes.
 *
 * The row copies use one small asm statement per row.  The pointers are
 * advanced in C so that the asm never modifies an input-only operand, and the
 * "memory" clobber tells the compiler that the asm stores through a pointer.
 */
void vp8_sixtap_predict8x4_dspr2(unsigned char *RESTRICT src_ptr,
                                 int src_pixels_per_line, int xoffset,
                                 int yoffset, unsigned char *RESTRICT dst_ptr,
                                 int dst_pitch) {
  unsigned char FData[9 * 8]; /* Temp data buffer used in filtering */
  unsigned int pos, Temp1, Temp2;
  int row;

  pos = 16;

  /* bit position for extract from acc */
  __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
                       :
                       : [pos] "r"(pos));

  if (yoffset) {
    src_ptr = src_ptr - (2 * src_pixels_per_line);

    if (xoffset) {
      /* filter 1-D horizontally... */
      vp8_filter_block2d_first_pass_8_all(src_ptr, FData, src_pixels_per_line,
                                          9, xoffset, 8);
    } else {
      const unsigned char *src_row = src_ptr;
      unsigned char *fdata_row = FData;

      /* prefetch src_ptr data to cache memory */
      prefetch_load(src_ptr + 2 * src_pixels_per_line);

      /* no horizontal filtering: copy 9 source rows into FData.
       * NOTE(review): "sw" needs a word-aligned address; this relies on the
       * compiler word-aligning FData, as the original code did. */
      for (row = 0; row < 9; ++row) {
        __asm__ __volatile__(
            "ulw    %[Temp1],   0(%[src_row])                           \n\t"
            "ulw    %[Temp2],   4(%[src_row])                           \n\t"
            "sw     %[Temp1],   0(%[fdata_row])                         \n\t"
            "sw     %[Temp2],   4(%[fdata_row])                         \n\t"

            : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2)
            : [fdata_row] "r"(fdata_row), [src_row] "r"(src_row)
            : "memory");

        src_row += src_pixels_per_line;
        fdata_row += 8;
      }
    }

    /* filter vertically... */
    vp8_filter_block2d_second_pass_8(FData + 16, dst_ptr, dst_pitch, 4, 8,
                                     yoffset);
  } else {
    /* if (yoffset == 0) the first pass saves its data directly to dst_ptr */
    if (xoffset) {
      vp8_filter_block2d_first_pass_8_all(src_ptr, dst_ptr, src_pixels_per_line,
                                          4, xoffset, dst_pitch);
    } else {
      const unsigned char *src_row = src_ptr;
      unsigned char *dst_row = dst_ptr;

      /* copy from src buffer to dst buffer, honouring dst_pitch.
       * NOTE(review): "sw" assumes every destination row is word-aligned,
       * i.e. dst_ptr aligned and dst_pitch a multiple of 4 - confirm against
       * callers. */
      for (row = 0; row < 4; ++row) {
        __asm__ __volatile__(
            "ulw    %[Temp1],   0(%[src_row])                           \n\t"
            "ulw    %[Temp2],   4(%[src_row])                           \n\t"
            "sw     %[Temp1],   0(%[dst_row])                           \n\t"
            "sw     %[Temp2],   4(%[dst_row])                           \n\t"

            : [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2)
            : [dst_row] "r"(dst_row), [src_row] "r"(src_row)
            : "memory");

        src_row += src_pixels_per_line;
        dst_row += dst_pitch;
      }
    }
  }
}
2694 
/*
 * Six-tap sub-pixel prediction of a 16x16 block (DSPr2 version).
 *
 * The horizontal stage is chosen by xoffset: even non-zero offsets use the
 * 6-tap routine, odd offsets use the 4-tap routine and offset 0 is a copy.
 *
 * yoffset != 0: the horizontal stage fills FData (sized 21 * 16), starting
 *               two source rows above src_ptr, and the vertical pass then
 *               writes the result to dst_ptr.
 * yoffset == 0: the horizontal stage writes straight to dst_ptr.  With
 *               xoffset == 0 as well, the block is copied unchanged so that
 *               dst_ptr is always written.
 */
void vp8_sixtap_predict16x16_dspr2(unsigned char *RESTRICT src_ptr,
                                   int src_pixels_per_line, int xoffset,
                                   int yoffset, unsigned char *RESTRICT dst_ptr,
                                   int dst_pitch) {
  unsigned char FData[21 * 16]; /* Temp data buffer used in filtering */
  unsigned int pos;

  pos = 16;

  /* bit position for extract from acc */
  __asm__ __volatile__("wrdsp      %[pos],     1           \n\t"
                       :
                       : [pos] "r"(pos));

  if (yoffset) {
    /* vertical filter taps for this yoffset, used by the second pass */
    const unsigned short *VFilter = sub_pel_filterss[yoffset];

    src_ptr = src_ptr - (2 * src_pixels_per_line);

    switch (xoffset) {
      /* filter 1-D horizontally... */
      case 2:
      case 4:
      case 6:
        /* 6 tap filter */
        vp8_filter_block2d_first_pass16_6tap(
            src_ptr, FData, src_pixels_per_line, 21, xoffset, 16);
        break;

      case 0:
        /* only copy buffer */
        vp8_filter_block2d_first_pass16_0(src_ptr, FData, src_pixels_per_line);
        break;

      case 1:
      case 3:
      case 5:
      case 7:
        /* 4 tap filter */
        vp8_filter_block2d_first_pass16_4tap(
            src_ptr, FData, src_pixels_per_line, 16, 21, xoffset, yoffset,
            dst_ptr, dst_pitch);
        break;

      default:
        /* no horizontal stage is defined for other xoffset values */
        break;
    }

    /* filter vertically... */
    vp8_filter_block2d_second_pass161(FData + 32, dst_ptr, dst_pitch, VFilter);
  } else {
    /* if (yoffset == 0) the first pass saves its data directly to dst_ptr */
    switch (xoffset) {
      case 2:
      case 4:
      case 6:
        /* 6 tap filter */
        vp8_filter_block2d_first_pass16_6tap(
            src_ptr, dst_ptr, src_pixels_per_line, 16, xoffset, dst_pitch);
        break;

      case 1:
      case 3:
      case 5:
      case 7:
        /* 4 tap filter */
        vp8_filter_block2d_first_pass16_4tap(
            src_ptr, dst_ptr, src_pixels_per_line, 16, 21, xoffset, yoffset,
            dst_ptr, dst_pitch);
        break;

      case 0: {
        /* no filtering in either direction: plain 16x16 copy */
        const unsigned char *src_row = src_ptr;
        unsigned char *dst_row = dst_ptr;
        int row, col;

        for (row = 0; row < 16; ++row) {
          for (col = 0; col < 16; ++col) {
            dst_row[col] = src_row[col];
          }
          src_row += src_pixels_per_line;
          dst_row += dst_pitch;
        }
        break;
      }

      default:
        /* no horizontal stage is defined for other xoffset values */
        break;
    }
  }
}
2766 
2767 #endif
2768