1 /*
2  *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <assert.h>
12 #include <stdio.h>
13 
14 #include "./vpx_config.h"
15 #include "./vp9_rtcd.h"
16 #include "vp9/common/vp9_common.h"
17 #include "vpx/vpx_integer.h"
18 #include "vpx_ports/mem.h"
19 #include "vp9/common/vp9_convolve.h"
20 #include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
21 
22 #if HAVE_DSPR2
convolve_avg_horiz_4_dspr2(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int16_t * filter_x0,int32_t h)23 static void convolve_avg_horiz_4_dspr2(const uint8_t *src,
24                                        int32_t src_stride,
25                                        uint8_t *dst,
26                                        int32_t dst_stride,
27                                        const int16_t *filter_x0,
28                                        int32_t h) {
29   int32_t y;
30   uint8_t *cm = vp9_ff_cropTbl;
31   int32_t  vector1b, vector2b, vector3b, vector4b;
32   int32_t  Temp1, Temp2, Temp3, Temp4;
33   uint32_t vector4a = 64;
34   uint32_t tp1, tp2;
35   uint32_t p1, p2, p3, p4;
36   uint32_t n1, n2, n3, n4;
37   uint32_t tn1, tn2;
38 
39   vector1b = ((const int32_t *)filter_x0)[0];
40   vector2b = ((const int32_t *)filter_x0)[1];
41   vector3b = ((const int32_t *)filter_x0)[2];
42   vector4b = ((const int32_t *)filter_x0)[3];
43 
44   for (y = h; y--;) {
45     /* prefetch data to cache memory */
46     vp9_prefetch_load(src + src_stride);
47     vp9_prefetch_load(src + src_stride + 32);
48     vp9_prefetch_store(dst + dst_stride);
49 
50     __asm__ __volatile__ (
51         "ulw              %[tp1],         0(%[src])                      \n\t"
52         "ulw              %[tp2],         4(%[src])                      \n\t"
53 
54         /* even 1. pixel */
55         "mtlo             %[vector4a],    $ac3                           \n\t"
56         "mthi             $zero,          $ac3                           \n\t"
57         "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
58         "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
59         "preceu.ph.qbr    %[p3],          %[tp2]                         \n\t"
60         "preceu.ph.qbl    %[p4],          %[tp2]                         \n\t"
61         "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
62         "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
63         "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
64         "ulw              %[tn2],         8(%[src])                      \n\t"
65         "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
66         "extp             %[Temp1],       $ac3,           31             \n\t"
67 
68         /* even 2. pixel */
69         "mtlo             %[vector4a],    $ac2                           \n\t"
70         "mthi             $zero,          $ac2                           \n\t"
71         "preceu.ph.qbr    %[p1],          %[tn2]                         \n\t"
72         "balign           %[tn1],         %[tn2],         3              \n\t"
73         "balign           %[tn2],         %[tp2],         3              \n\t"
74         "balign           %[tp2],         %[tp1],         3              \n\t"
75         "dpa.w.ph         $ac2,           %[p2],          %[vector1b]    \n\t"
76         "dpa.w.ph         $ac2,           %[p3],          %[vector2b]    \n\t"
77         "dpa.w.ph         $ac2,           %[p4],          %[vector3b]    \n\t"
78         "dpa.w.ph         $ac2,           %[p1],          %[vector4b]    \n\t"
79         "extp             %[Temp3],       $ac2,           31             \n\t"
80 
81         "lbu              %[p2],          3(%[dst])                      \n\t"  /* load odd 2 */
82 
83         /* odd 1. pixel */
84         "lbux             %[tp1],         %[Temp1](%[cm])                \n\t"  /* even 1 */
85         "mtlo             %[vector4a],    $ac3                           \n\t"
86         "mthi             $zero,          $ac3                           \n\t"
87         "lbu              %[Temp1],       1(%[dst])                      \n\t"  /* load odd 1 */
88         "preceu.ph.qbr    %[n1],          %[tp2]                         \n\t"
89         "preceu.ph.qbl    %[n2],          %[tp2]                         \n\t"
90         "preceu.ph.qbr    %[n3],          %[tn2]                         \n\t"
91         "preceu.ph.qbl    %[n4],          %[tn2]                         \n\t"
92         "dpa.w.ph         $ac3,           %[n1],          %[vector1b]    \n\t"
93         "dpa.w.ph         $ac3,           %[n2],          %[vector2b]    \n\t"
94         "dpa.w.ph         $ac3,           %[n3],          %[vector3b]    \n\t"
95         "dpa.w.ph         $ac3,           %[n4],          %[vector4b]    \n\t"
96         "extp             %[Temp2],       $ac3,           31             \n\t"
97 
98         "lbu              %[tn2],         0(%[dst])                      \n\t"  /* load even 1 */
99 
100         /* odd 2. pixel */
101         "lbux             %[tp2],         %[Temp3](%[cm])                \n\t"  /* even 2 */
102         "mtlo             %[vector4a],    $ac2                           \n\t"
103         "mthi             $zero,          $ac2                           \n\t"
104         "preceu.ph.qbr    %[n1],          %[tn1]                         \n\t"
105         "lbux             %[tn1],         %[Temp2](%[cm])                \n\t"  /* odd 1 */
106         "addqh_r.w        %[tn2],         %[tn2],         %[tp1]         \n\t"  /* average even 1 */
107         "dpa.w.ph         $ac2,           %[n2],          %[vector1b]    \n\t"
108         "dpa.w.ph         $ac2,           %[n3],          %[vector2b]    \n\t"
109         "dpa.w.ph         $ac2,           %[n4],          %[vector3b]    \n\t"
110         "dpa.w.ph         $ac2,           %[n1],          %[vector4b]    \n\t"
111         "extp             %[Temp4],       $ac2,           31             \n\t"
112 
113         "lbu              %[tp1],         2(%[dst])                      \n\t"  /* load even 2 */
114         "sb               %[tn2],         0(%[dst])                      \n\t"  /* store even 1 */
115 
116         /* clamp */
117         "addqh_r.w        %[Temp1],       %[Temp1],       %[tn1]         \n\t"  /* average odd 1 */
118         "lbux             %[n2],          %[Temp4](%[cm])                \n\t"  /* odd 2 */
119         "sb               %[Temp1],       1(%[dst])                      \n\t"  /* store odd 1 */
120 
121         "addqh_r.w        %[tp1],         %[tp1],         %[tp2]         \n\t"  /* average even 2 */
122         "sb               %[tp1],         2(%[dst])                      \n\t"  /* store even 2 */
123 
124         "addqh_r.w        %[p2],          %[p2],          %[n2]          \n\t"  /* average odd 2 */
125         "sb               %[p2],          3(%[dst])                      \n\t"  /* store odd 2 */
126 
127         : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
128           [tn1] "=&r" (tn1), [tn2] "=&r" (tn2),
129           [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
130           [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3), [n4] "=&r" (n4),
131           [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
132           [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
133         : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
134           [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
135           [vector4a] "r" (vector4a),
136           [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
137     );
138 
139     /* Next row... */
140     src += src_stride;
141     dst += dst_stride;
142   }
143 }
144 
convolve_avg_horiz_8_dspr2(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int16_t * filter_x0,int32_t h)145 static void convolve_avg_horiz_8_dspr2(const uint8_t *src,
146                                        int32_t src_stride,
147                                        uint8_t *dst,
148                                        int32_t dst_stride,
149                                        const int16_t *filter_x0,
150                                        int32_t h) {
151   int32_t y;
152   uint8_t *cm = vp9_ff_cropTbl;
153   uint32_t vector4a = 64;
154   int32_t vector1b, vector2b, vector3b, vector4b;
155   int32_t Temp1, Temp2, Temp3;
156   uint32_t tp1, tp2;
157   uint32_t p1, p2, p3, p4, n1;
158   uint32_t tn1, tn2, tn3;
159   uint32_t st0, st1;
160 
161   vector1b = ((const int32_t *)filter_x0)[0];
162   vector2b = ((const int32_t *)filter_x0)[1];
163   vector3b = ((const int32_t *)filter_x0)[2];
164   vector4b = ((const int32_t *)filter_x0)[3];
165 
166   for (y = h; y--;) {
167     /* prefetch data to cache memory */
168     vp9_prefetch_load(src + src_stride);
169     vp9_prefetch_load(src + src_stride + 32);
170     vp9_prefetch_store(dst + dst_stride);
171 
172     __asm__ __volatile__ (
173         "ulw              %[tp1],         0(%[src])                      \n\t"
174         "ulw              %[tp2],         4(%[src])                      \n\t"
175 
176         /* even 1. pixel */
177         "mtlo             %[vector4a],    $ac3                           \n\t"
178         "mthi             $zero,          $ac3                           \n\t"
179         "mtlo             %[vector4a],    $ac2                           \n\t"
180         "mthi             $zero,          $ac2                           \n\t"
181         "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
182         "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
183         "preceu.ph.qbr    %[p3],          %[tp2]                         \n\t"
184         "preceu.ph.qbl    %[p4],          %[tp2]                         \n\t"
185         "ulw              %[tn2],         8(%[src])                      \n\t"
186         "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
187         "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
188         "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
189         "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
190         "extp             %[Temp1],       $ac3,           31             \n\t"
191         "lbu              %[Temp2],       0(%[dst])                      \n\t"
192         "lbu              %[tn3],         2(%[dst])                      \n\t"
193 
194         /* even 2. pixel */
195         "preceu.ph.qbr    %[p1],          %[tn2]                         \n\t"
196         "preceu.ph.qbl    %[n1],          %[tn2]                         \n\t"
197         "ulw              %[tn1],         12(%[src])                     \n\t"
198         "dpa.w.ph         $ac2,           %[p2],          %[vector1b]    \n\t"
199         "dpa.w.ph         $ac2,           %[p3],          %[vector2b]    \n\t"
200         "dpa.w.ph         $ac2,           %[p4],          %[vector3b]    \n\t"
201         "dpa.w.ph         $ac2,           %[p1],          %[vector4b]    \n\t"
202         "extp             %[Temp3],       $ac2,           31             \n\t"
203 
204         /* even 3. pixel */
205         "lbux             %[st0],         %[Temp1](%[cm])                \n\t"
206         "mtlo             %[vector4a],    $ac1                           \n\t"
207         "mthi             $zero,          $ac1                           \n\t"
208         "preceu.ph.qbr    %[p2],          %[tn1]                         \n\t"
209         "lbux             %[st1],         %[Temp3](%[cm])                \n\t"
210         "dpa.w.ph         $ac1,           %[p3],          %[vector1b]    \n\t"
211         "dpa.w.ph         $ac1,           %[p4],          %[vector2b]    \n\t"
212         "dpa.w.ph         $ac1,           %[p1],          %[vector3b]    \n\t"
213         "dpa.w.ph         $ac1,           %[n1],          %[vector4b]    \n\t"
214         "extp             %[Temp1],       $ac1,           31             \n\t"
215 
216         "addqh_r.w        %[Temp2],       %[Temp2],       %[st0]         \n\t"
217         "addqh_r.w        %[tn3],         %[tn3],         %[st1]         \n\t"
218         "sb               %[Temp2],       0(%[dst])                      \n\t"
219         "sb               %[tn3],         2(%[dst])                      \n\t"
220 
221         /* even 4. pixel */
222         "mtlo             %[vector4a],    $ac2                           \n\t"
223         "mthi             $zero,          $ac2                           \n\t"
224         "mtlo             %[vector4a],    $ac3                           \n\t"
225         "mthi             $zero,          $ac3                           \n\t"
226 
227         "balign           %[tn3],         %[tn1],         3              \n\t"
228         "balign           %[tn1],         %[tn2],         3              \n\t"
229         "balign           %[tn2],         %[tp2],         3              \n\t"
230         "balign           %[tp2],         %[tp1],         3              \n\t"
231 
232         "lbux             %[st0],         %[Temp1](%[cm])                \n\t"
233         "lbu              %[Temp2],       4(%[dst])                      \n\t"
234         "addqh_r.w        %[Temp2],       %[Temp2],       %[st0]         \n\t"
235 
236         "dpa.w.ph         $ac2,           %[p4],          %[vector1b]    \n\t"
237         "dpa.w.ph         $ac2,           %[p1],          %[vector2b]    \n\t"
238         "dpa.w.ph         $ac2,           %[n1],          %[vector3b]    \n\t"
239         "dpa.w.ph         $ac2,           %[p2],          %[vector4b]    \n\t"
240         "extp             %[Temp3],       $ac2,           31             \n\t"
241 
242         /* odd 1. pixel */
243         "mtlo             %[vector4a],    $ac1                           \n\t"
244         "mthi             $zero,          $ac1                           \n\t"
245         "sb               %[Temp2],       4(%[dst])                      \n\t"
246         "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"
247         "preceu.ph.qbl    %[p2],          %[tp2]                         \n\t"
248         "preceu.ph.qbr    %[p3],          %[tn2]                         \n\t"
249         "preceu.ph.qbl    %[p4],          %[tn2]                         \n\t"
250         "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
251         "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
252         "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
253         "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
254         "extp             %[Temp2],       $ac3,           31             \n\t"
255 
256         "lbu              %[tp1],         6(%[dst])                      \n\t"
257 
258         /* odd 2. pixel */
259         "mtlo             %[vector4a],    $ac3                           \n\t"
260         "mthi             $zero,          $ac3                           \n\t"
261         "mtlo             %[vector4a],    $ac2                           \n\t"
262         "mthi             $zero,          $ac2                           \n\t"
263         "preceu.ph.qbr    %[p1],          %[tn1]                         \n\t"
264         "preceu.ph.qbl    %[n1],          %[tn1]                         \n\t"
265         "lbux             %[st0],         %[Temp3](%[cm])                \n\t"
266         "dpa.w.ph         $ac1,           %[p2],          %[vector1b]    \n\t"
267         "dpa.w.ph         $ac1,           %[p3],          %[vector2b]    \n\t"
268         "dpa.w.ph         $ac1,           %[p4],          %[vector3b]    \n\t"
269         "dpa.w.ph         $ac1,           %[p1],          %[vector4b]    \n\t"
270         "extp             %[Temp3],       $ac1,           31             \n\t"
271 
272         "lbu              %[tp2],         1(%[dst])                      \n\t"
273         "lbu              %[tn2],         3(%[dst])                      \n\t"
274         "addqh_r.w        %[tp1],         %[tp1],         %[st0]         \n\t"
275 
276         /* odd 3. pixel */
277         "lbux             %[st1],         %[Temp2](%[cm])                \n\t"
278         "preceu.ph.qbr    %[p2],          %[tn3]                         \n\t"
279         "dpa.w.ph         $ac3,           %[p3],          %[vector1b]    \n\t"
280         "dpa.w.ph         $ac3,           %[p4],          %[vector2b]    \n\t"
281         "dpa.w.ph         $ac3,           %[p1],          %[vector3b]    \n\t"
282         "dpa.w.ph         $ac3,           %[n1],          %[vector4b]    \n\t"
283         "addqh_r.w        %[tp2],         %[tp2],         %[st1]         \n\t"
284         "extp             %[Temp2],       $ac3,           31             \n\t"
285 
286         "lbu              %[tn3],         5(%[dst])                      \n\t"
287 
288         /* odd 4. pixel */
289         "sb               %[tp2],         1(%[dst])                      \n\t"
290         "sb               %[tp1],         6(%[dst])                      \n\t"
291         "dpa.w.ph         $ac2,           %[p4],          %[vector1b]    \n\t"
292         "dpa.w.ph         $ac2,           %[p1],          %[vector2b]    \n\t"
293         "dpa.w.ph         $ac2,           %[n1],          %[vector3b]    \n\t"
294         "dpa.w.ph         $ac2,           %[p2],          %[vector4b]    \n\t"
295         "extp             %[Temp1],       $ac2,           31             \n\t"
296 
297         "lbu              %[tn1],         7(%[dst])                      \n\t"
298 
299         /* clamp */
300         "lbux             %[p4],          %[Temp3](%[cm])                \n\t"
301         "addqh_r.w        %[tn2],         %[tn2],         %[p4]          \n\t"
302 
303         "lbux             %[p2],          %[Temp2](%[cm])                \n\t"
304         "addqh_r.w        %[tn3],         %[tn3],         %[p2]          \n\t"
305 
306         "lbux             %[n1],          %[Temp1](%[cm])                \n\t"
307         "addqh_r.w        %[tn1],         %[tn1],         %[n1]          \n\t"
308 
309         /* store bytes */
310         "sb               %[tn2],         3(%[dst])                      \n\t"
311         "sb               %[tn3],         5(%[dst])                      \n\t"
312         "sb               %[tn1],         7(%[dst])                      \n\t"
313 
314         : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
315           [tn1] "=&r" (tn1), [tn2] "=&r" (tn2), [tn3] "=&r" (tn3),
316           [st0] "=&r" (st0), [st1] "=&r" (st1),
317           [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
318           [n1] "=&r" (n1),
319           [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
320         : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
321           [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
322           [vector4a] "r" (vector4a),
323           [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
324     );
325 
326     /* Next row... */
327     src += src_stride;
328     dst += dst_stride;
329   }
330 }
331 
convolve_avg_horiz_16_dspr2(const uint8_t * src_ptr,int32_t src_stride,uint8_t * dst_ptr,int32_t dst_stride,const int16_t * filter_x0,int32_t h,int32_t count)332 static void convolve_avg_horiz_16_dspr2(const uint8_t *src_ptr,
333                                         int32_t src_stride,
334                                         uint8_t *dst_ptr,
335                                         int32_t dst_stride,
336                                         const int16_t *filter_x0,
337                                         int32_t h,
338                                         int32_t count) {
339   int32_t y, c;
340   const uint8_t *src;
341   uint8_t *dst;
342   uint8_t *cm = vp9_ff_cropTbl;
343   uint32_t vector_64 = 64;
344   int32_t filter12, filter34, filter56, filter78;
345   int32_t Temp1, Temp2, Temp3;
346   uint32_t qload1, qload2, qload3;
347   uint32_t p1, p2, p3, p4, p5;
348   uint32_t st1, st2, st3;
349 
350   filter12 = ((const int32_t *)filter_x0)[0];
351   filter34 = ((const int32_t *)filter_x0)[1];
352   filter56 = ((const int32_t *)filter_x0)[2];
353   filter78 = ((const int32_t *)filter_x0)[3];
354 
355   for (y = h; y--;) {
356     src = src_ptr;
357     dst = dst_ptr;
358 
359     /* prefetch data to cache memory */
360     vp9_prefetch_load(src_ptr + src_stride);
361     vp9_prefetch_load(src_ptr + src_stride + 32);
362     vp9_prefetch_store(dst_ptr + dst_stride);
363 
364     for (c = 0; c < count; c++) {
365       __asm__ __volatile__ (
366           "ulw              %[qload1],    0(%[src])                    \n\t"
367           "ulw              %[qload2],    4(%[src])                    \n\t"
368 
369           /* even 1. pixel */
370           "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
371           "mthi             $zero,        $ac1                         \n\t"
372           "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
373           "mthi             $zero,        $ac2                         \n\t"
374           "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
375           "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
376           "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
377           "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
378           "ulw              %[qload3],    8(%[src])                    \n\t"
379           "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* even 1 */
380           "dpa.w.ph         $ac1,         %[p2],          %[filter34]  \n\t" /* even 1 */
381           "dpa.w.ph         $ac1,         %[p3],          %[filter56]  \n\t" /* even 1 */
382           "dpa.w.ph         $ac1,         %[p4],          %[filter78]  \n\t" /* even 1 */
383           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
384           "lbu              %[st2],       0(%[dst])                    \n\t" /* load even 1 from dst */
385 
386           /* even 2. pixel */
387           "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
388           "mthi             $zero,        $ac3                         \n\t"
389           "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
390           "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
391           "ulw              %[qload1],    12(%[src])                   \n\t"
392           "dpa.w.ph         $ac2,         %[p2],          %[filter12]  \n\t" /* even 1 */
393           "dpa.w.ph         $ac2,         %[p3],          %[filter34]  \n\t" /* even 1 */
394           "dpa.w.ph         $ac2,         %[p4],          %[filter56]  \n\t" /* even 1 */
395           "dpa.w.ph         $ac2,         %[p1],          %[filter78]  \n\t" /* even 1 */
396           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
397           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
398 
399           "lbu              %[qload3],    2(%[dst])                    \n\t" /* load even 2 from dst */
400 
401           /* even 3. pixel */
402           "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
403           "mthi             $zero,        $ac1                         \n\t"
404           "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 1 */
405           "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
406           "sb               %[st2],       0(%[dst])                    \n\t" /* store even 1 to dst */
407           "dpa.w.ph         $ac3,         %[p3],          %[filter12]  \n\t" /* even 3 */
408           "dpa.w.ph         $ac3,         %[p4],          %[filter34]  \n\t" /* even 3 */
409           "dpa.w.ph         $ac3,         %[p1],          %[filter56]  \n\t" /* even 3 */
410           "dpa.w.ph         $ac3,         %[p5],          %[filter78]  \n\t" /* even 3 */
411           "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
412           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
413 
414           /* even 4. pixel */
415           "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
416           "mthi             $zero,        $ac2                         \n\t"
417           "addqh_r.w        %[qload3],    %[qload3],      %[st2]       \n\t" /* average even 2 */
418           "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
419           "sb               %[qload3],    2(%[dst])                    \n\t" /* store even 2 to dst */
420           "ulw              %[qload2],    16(%[src])                   \n\t"
421           "lbu              %[qload3],    4(%[dst])                    \n\t" /* load even 3 from dst */
422           "lbu              %[qload1],    6(%[dst])                    \n\t" /* load even 4 from dst */
423           "dpa.w.ph         $ac1,         %[p4],          %[filter12]  \n\t" /* even 4 */
424           "dpa.w.ph         $ac1,         %[p1],          %[filter34]  \n\t" /* even 4 */
425           "dpa.w.ph         $ac1,         %[p5],          %[filter56]  \n\t" /* even 4 */
426           "dpa.w.ph         $ac1,         %[p2],          %[filter78]  \n\t" /* even 4 */
427           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
428           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
429 
430           /* even 5. pixel */
431           "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
432           "mthi             $zero,        $ac3                         \n\t"
433           "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 3 */
434           "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
435           "sb               %[qload3],    4(%[dst])                    \n\t" /* store even 3 to dst */
436           "dpa.w.ph         $ac2,         %[p1],          %[filter12]  \n\t" /* even 5 */
437           "dpa.w.ph         $ac2,         %[p5],          %[filter34]  \n\t" /* even 5 */
438           "dpa.w.ph         $ac2,         %[p2],          %[filter56]  \n\t" /* even 5 */
439           "dpa.w.ph         $ac2,         %[p3],          %[filter78]  \n\t" /* even 5 */
440           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
441           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
442 
443           /* even 6. pixel */
444           "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
445           "mthi             $zero,        $ac1                         \n\t"
446           "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average even 4 */
447           "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
448           "sb               %[qload1],    6(%[dst])                    \n\t" /* store even 4 to dst */
449           "ulw              %[qload3],    20(%[src])                   \n\t"
450           "dpa.w.ph         $ac3,         %[p5],          %[filter12]  \n\t" /* even 6 */
451           "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* even 6 */
452           "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* even 6 */
453           "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* even 6 */
454           "lbu              %[qload2],    8(%[dst])                    \n\t" /* load even 5 from dst */
455           "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
456           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
457 
458           /* even 7. pixel */
459           "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
460           "mthi             $zero,        $ac2                         \n\t"
461           "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 5 */
462           "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
463           "sb               %[qload2],    8(%[dst])                    \n\t" /* store even 5 to dst */
464           "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* even 7 */
465           "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* even 7 */
466           "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* even 7 */
467           "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* even 7 */
468           "lbu              %[qload3],    10(%[dst])                   \n\t" /* load even 6 from dst */
469           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
470           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
471 
472           "lbu              %[st2],       12(%[dst])                   \n\t" /* load even 7 from dst */
473 
474           /* even 8. pixel */
475           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
476           "mthi             $zero,        $ac3                         \n\t"
477           "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 6 */
478           "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* even 8 */
479           "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* even 8 */
480           "sb               %[qload3],    10(%[dst])                   \n\t" /* store even 6 to dst */
481           "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* even 8 */
482           "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* even 8 */
483           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
484           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
485 
486           /* ODD pixels */
487           "ulw              %[qload1],    1(%[src])                   \n\t"
488           "ulw              %[qload2],    5(%[src])                    \n\t"
489 
490           "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 7 */
491 
492           /* odd 1. pixel */
493           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
494           "mthi             $zero,        $ac1                         \n\t"
495           "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
496           "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
497           "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
498           "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
499           "sb               %[st2],       12(%[dst])                   \n\t" /* store even 7 to dst */
500           "ulw              %[qload3],    9(%[src])                    \n\t"
501           "dpa.w.ph         $ac3,         %[p1],          %[filter12]  \n\t" /* odd 1 */
502           "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* odd 1 */
503           "lbu              %[qload2],    14(%[dst])                   \n\t" /* load even 8 from dst */
504           "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* odd 1 */
505           "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* odd 1 */
506           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
507           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
508 
509           "lbu              %[st1],       1(%[dst])                    \n\t" /* load odd 1 from dst */
510 
511           /* odd 2. pixel */
512           "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
513           "mthi             $zero,        $ac2                         \n\t"
514           "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 8 */
515           "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
516           "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
517           "sb               %[qload2],    14(%[dst])                   \n\t" /* store even 8 to dst */
518           "ulw              %[qload1],    13(%[src])                   \n\t"
519           "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* odd 2 */
520           "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* odd 2 */
521           "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* odd 2 */
522           "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* odd 2 */
523           "lbu              %[qload3],    3(%[dst])                    \n\t" /* load odd 2 from dst */
524           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
525           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
526 
527           /* odd 3. pixel */
528           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
529           "mthi             $zero,        $ac3                         \n\t"
530           "addqh_r.w        %[st3],       %[st3],         %[st1]       \n\t" /* average odd 1 */
531           "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
532           "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* odd 3 */
533           "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* odd 3 */
534           "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* odd 3 */
535           "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* odd 3 */
536           "sb               %[st3],       1(%[dst])                    \n\t" /* store odd 1 to dst */
537           "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
538           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
539 
540           /* odd 4. pixel */
541           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
542           "mthi             $zero,        $ac1                         \n\t"
543           "addqh_r.w        %[qload3],    %[qload3],      %[st1]       \n\t" /* average odd 2 */
544           "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
545           "sb               %[qload3],    3(%[dst])                    \n\t" /* store odd 2 to dst */
546           "lbu              %[qload1],    5(%[dst])                    \n\t" /* load odd 3 from dst */
547           "ulw              %[qload2],    17(%[src])                   \n\t"
548           "dpa.w.ph         $ac3,         %[p4],          %[filter12]  \n\t" /* odd 4 */
549           "dpa.w.ph         $ac3,         %[p1],          %[filter34]  \n\t" /* odd 4 */
550           "dpa.w.ph         $ac3,         %[p5],          %[filter56]  \n\t" /* odd 4 */
551           "dpa.w.ph         $ac3,         %[p2],          %[filter78]  \n\t" /* odd 4 */
552           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
553           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
554 
555           "lbu              %[st1],       7(%[dst])                    \n\t" /* load odd 4 from dst */
556 
557           /* odd 5. pixel */
558           "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
559           "mthi             $zero,        $ac2                         \n\t"
560           "addqh_r.w        %[qload1],    %[qload1],      %[st2]       \n\t" /* average odd 3 */
561           "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
562           "sb               %[qload1],    5(%[dst])                    \n\t" /* store odd 3 to dst */
563           "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* odd 5 */
564           "dpa.w.ph         $ac1,         %[p5],          %[filter34]  \n\t" /* odd 5 */
565           "dpa.w.ph         $ac1,         %[p2],          %[filter56]  \n\t" /* odd 5 */
566           "dpa.w.ph         $ac1,         %[p3],          %[filter78]  \n\t" /* odd 5 */
567           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
568           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
569 
570           "lbu              %[qload1],    9(%[dst])                    \n\t" /* load odd 5 from dst */
571 
572           /* odd 6. pixel */
573           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
574           "mthi             $zero,        $ac3                         \n\t"
575           "addqh_r.w        %[st1],       %[st1],         %[st3]       \n\t" /* average odd 4 */
576           "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
577           "sb               %[st1],       7(%[dst])                    \n\t" /* store odd 4 to dst */
578           "ulw              %[qload3],    21(%[src])                   \n\t"
579           "dpa.w.ph         $ac2,         %[p5],          %[filter12]  \n\t" /* odd 6 */
580           "dpa.w.ph         $ac2,         %[p2],          %[filter34]  \n\t" /* odd 6 */
581           "dpa.w.ph         $ac2,         %[p3],          %[filter56]  \n\t" /* odd 6 */
582           "dpa.w.ph         $ac2,         %[p4],          %[filter78]  \n\t" /* odd 6 */
583           "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
584           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
585 
586           /* odd 7. pixel */
587           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
588           "mthi             $zero,        $ac1                         \n\t"
589           "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 5 */
590           "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
591           "sb               %[qload1],    9(%[dst])                    \n\t" /* store odd 5 to dst */
592           "lbu              %[qload2],    11(%[dst])                   \n\t" /* load odd 6 from dst */
593           "dpa.w.ph         $ac3,         %[p2],          %[filter12]  \n\t" /* odd 7 */
594           "dpa.w.ph         $ac3,         %[p3],          %[filter34]  \n\t" /* odd 7 */
595           "dpa.w.ph         $ac3,         %[p4],          %[filter56]  \n\t" /* odd 7 */
596           "dpa.w.ph         $ac3,         %[p1],          %[filter78]  \n\t" /* odd 7 */
597           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
598 
599           "lbu              %[qload3],    13(%[dst])                   \n\t" /* load odd 7 from dst */
600 
601           /* odd 8. pixel */
602           "dpa.w.ph         $ac1,         %[p3],          %[filter12]  \n\t" /* odd 8 */
603           "dpa.w.ph         $ac1,         %[p4],          %[filter34]  \n\t" /* odd 8 */
604           "dpa.w.ph         $ac1,         %[p1],          %[filter56]  \n\t" /* odd 8 */
605           "dpa.w.ph         $ac1,         %[p5],          %[filter78]  \n\t" /* odd 8 */
606           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
607 
608           "lbu              %[qload1],    15(%[dst])                   \n\t" /* load odd 8 from dst */
609 
610           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
611           "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average odd 6 */
612 
613           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
614           "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average odd 7 */
615 
616           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
617           "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 8 */
618 
619           "sb               %[qload2],    11(%[dst])                   \n\t" /* store odd 6 to dst */
620           "sb               %[qload3],    13(%[dst])                   \n\t" /* store odd 7 to dst */
621           "sb               %[qload1],    15(%[dst])                   \n\t" /* store odd 8 to dst */
622 
623           : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2),
624             [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
625             [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
626             [qload3] "=&r" (qload3), [p5] "=&r" (p5),
627             [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
628           : [filter12] "r" (filter12), [filter34] "r" (filter34),
629             [filter56] "r" (filter56), [filter78] "r" (filter78),
630             [vector_64] "r" (vector_64),
631             [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
632       );
633 
634       src += 16;
635       dst += 16;
636     }
637 
638     /* Next row... */
639     src_ptr += src_stride;
640     dst_ptr += dst_stride;
641   }
642 }
643 
convolve_avg_horiz_64_dspr2(const uint8_t * src_ptr,int32_t src_stride,uint8_t * dst_ptr,int32_t dst_stride,const int16_t * filter_x0,int32_t h)644 static void convolve_avg_horiz_64_dspr2(const uint8_t *src_ptr,
645                                         int32_t src_stride,
646                                         uint8_t *dst_ptr,
647                                         int32_t dst_stride,
648                                         const int16_t *filter_x0,
649                                         int32_t h) {
650   int32_t y, c;
651   const uint8_t *src;
652   uint8_t *dst;
653   uint8_t *cm = vp9_ff_cropTbl;
654   uint32_t vector_64 = 64;
655   int32_t filter12, filter34, filter56, filter78;
656   int32_t Temp1, Temp2, Temp3;
657   uint32_t qload1, qload2, qload3;
658   uint32_t p1, p2, p3, p4, p5;
659   uint32_t st1, st2, st3;
660 
661   filter12 = ((const int32_t *)filter_x0)[0];
662   filter34 = ((const int32_t *)filter_x0)[1];
663   filter56 = ((const int32_t *)filter_x0)[2];
664   filter78 = ((const int32_t *)filter_x0)[3];
665 
666   for (y = h; y--;) {
667     src = src_ptr;
668     dst = dst_ptr;
669 
670     /* prefetch data to cache memory */
671     vp9_prefetch_load(src_ptr + src_stride);
672     vp9_prefetch_load(src_ptr + src_stride + 32);
673     vp9_prefetch_load(src_ptr + src_stride + 64);
674     vp9_prefetch_store(dst_ptr + dst_stride);
675     vp9_prefetch_store(dst_ptr + dst_stride + 32);
676 
677     for (c = 0; c < 4; c++) {
678       __asm__ __volatile__ (
679           "ulw              %[qload1],    0(%[src])                    \n\t"
680           "ulw              %[qload2],    4(%[src])                    \n\t"
681 
682           /* even 1. pixel */
683           "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
684           "mthi             $zero,        $ac1                         \n\t"
685           "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
686           "mthi             $zero,        $ac2                         \n\t"
687           "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
688           "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
689           "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
690           "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
691           "ulw              %[qload3],    8(%[src])                    \n\t"
692           "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* even 1 */
693           "dpa.w.ph         $ac1,         %[p2],          %[filter34]  \n\t" /* even 1 */
694           "dpa.w.ph         $ac1,         %[p3],          %[filter56]  \n\t" /* even 1 */
695           "dpa.w.ph         $ac1,         %[p4],          %[filter78]  \n\t" /* even 1 */
696           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
697           "lbu              %[st2],       0(%[dst])                    \n\t" /* load even 1 from dst */
698 
699           /* even 2. pixel */
700           "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
701           "mthi             $zero,        $ac3                         \n\t"
702           "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
703           "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
704           "ulw              %[qload1],    12(%[src])                   \n\t"
705           "dpa.w.ph         $ac2,         %[p2],          %[filter12]  \n\t" /* even 1 */
706           "dpa.w.ph         $ac2,         %[p3],          %[filter34]  \n\t" /* even 1 */
707           "dpa.w.ph         $ac2,         %[p4],          %[filter56]  \n\t" /* even 1 */
708           "dpa.w.ph         $ac2,         %[p1],          %[filter78]  \n\t" /* even 1 */
709           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
710           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
711 
712           "lbu              %[qload3],    2(%[dst])                    \n\t" /* load even 2 from dst */
713 
714           /* even 3. pixel */
715           "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
716           "mthi             $zero,        $ac1                         \n\t"
717           "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 1 */
718           "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
719           "sb               %[st2],       0(%[dst])                    \n\t" /* store even 1 to dst */
720           "dpa.w.ph         $ac3,         %[p3],          %[filter12]  \n\t" /* even 3 */
721           "dpa.w.ph         $ac3,         %[p4],          %[filter34]  \n\t" /* even 3 */
722           "dpa.w.ph         $ac3,         %[p1],          %[filter56]  \n\t" /* even 3 */
723           "dpa.w.ph         $ac3,         %[p5],          %[filter78]  \n\t" /* even 3 */
724           "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
725           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
726 
727           /* even 4. pixel */
728           "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
729           "mthi             $zero,        $ac2                         \n\t"
730           "addqh_r.w        %[qload3],    %[qload3],      %[st2]       \n\t" /* average even 2 */
731           "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
732           "sb               %[qload3],    2(%[dst])                    \n\t" /* store even 2 to dst */
733           "ulw              %[qload2],    16(%[src])                   \n\t"
734           "lbu              %[qload3],    4(%[dst])                    \n\t" /* load even 3 from dst */
735           "lbu              %[qload1],    6(%[dst])                    \n\t" /* load even 4 from dst */
736           "dpa.w.ph         $ac1,         %[p4],          %[filter12]  \n\t" /* even 4 */
737           "dpa.w.ph         $ac1,         %[p1],          %[filter34]  \n\t" /* even 4 */
738           "dpa.w.ph         $ac1,         %[p5],          %[filter56]  \n\t" /* even 4 */
739           "dpa.w.ph         $ac1,         %[p2],          %[filter78]  \n\t" /* even 4 */
740           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
741           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
742 
743           /* even 5. pixel */
744           "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
745           "mthi             $zero,        $ac3                         \n\t"
746           "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 3 */
747           "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
748           "sb               %[qload3],    4(%[dst])                    \n\t" /* store even 3 to dst */
749           "dpa.w.ph         $ac2,         %[p1],          %[filter12]  \n\t" /* even 5 */
750           "dpa.w.ph         $ac2,         %[p5],          %[filter34]  \n\t" /* even 5 */
751           "dpa.w.ph         $ac2,         %[p2],          %[filter56]  \n\t" /* even 5 */
752           "dpa.w.ph         $ac2,         %[p3],          %[filter78]  \n\t" /* even 5 */
753           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
754           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
755 
756           /* even 6. pixel */
757           "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
758           "mthi             $zero,        $ac1                         \n\t"
759           "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average even 4 */
760           "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
761           "sb               %[qload1],    6(%[dst])                    \n\t" /* store even 4 to dst */
762           "ulw              %[qload3],    20(%[src])                   \n\t"
763           "dpa.w.ph         $ac3,         %[p5],          %[filter12]  \n\t" /* even 6 */
764           "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* even 6 */
765           "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* even 6 */
766           "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* even 6 */
767           "lbu              %[qload2],    8(%[dst])                    \n\t" /* load even 5 from dst */
768           "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
769           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
770 
771           /* even 7. pixel */
772           "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
773           "mthi             $zero,        $ac2                         \n\t"
774           "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 5 */
775           "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
776           "sb               %[qload2],    8(%[dst])                    \n\t" /* store even 5 to dst */
777           "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* even 7 */
778           "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* even 7 */
779           "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* even 7 */
780           "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* even 7 */
781           "lbu              %[qload3],    10(%[dst])                   \n\t" /* load even 6 from dst */
782           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
783           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
784 
785           "lbu              %[st2],       12(%[dst])                   \n\t" /* load even 7 from dst */
786 
787           /* even 8. pixel */
788           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
789           "mthi             $zero,        $ac3                         \n\t"
790           "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average even 6 */
791           "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* even 8 */
792           "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* even 8 */
793           "sb               %[qload3],    10(%[dst])                   \n\t" /* store even 6 to dst */
794           "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* even 8 */
795           "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* even 8 */
796           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
797           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
798 
799           /* ODD pixels */
800           "ulw              %[qload1],    1(%[src])                   \n\t"
801           "ulw              %[qload2],    5(%[src])                    \n\t"
802 
803           "addqh_r.w        %[st2],       %[st2],         %[st1]       \n\t" /* average even 7 */
804 
805           /* odd 1. pixel */
806           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
807           "mthi             $zero,        $ac1                         \n\t"
808           "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
809           "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
810           "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
811           "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
812           "sb               %[st2],       12(%[dst])                   \n\t" /* store even 7 to dst */
813           "ulw              %[qload3],    9(%[src])                    \n\t"
814           "dpa.w.ph         $ac3,         %[p1],          %[filter12]  \n\t" /* odd 1 */
815           "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* odd 1 */
816           "lbu              %[qload2],    14(%[dst])                   \n\t" /* load even 8 from dst */
817           "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* odd 1 */
818           "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* odd 1 */
819           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
820           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
821 
822           "lbu              %[st1],       1(%[dst])                    \n\t" /* load odd 1 from dst */
823 
824           /* odd 2. pixel */
825           "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
826           "mthi             $zero,        $ac2                         \n\t"
827           "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average even 8 */
828           "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
829           "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
830           "sb               %[qload2],    14(%[dst])                   \n\t" /* store even 8 to dst */
831           "ulw              %[qload1],    13(%[src])                   \n\t"
832           "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* odd 2 */
833           "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* odd 2 */
834           "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* odd 2 */
835           "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* odd 2 */
836           "lbu              %[qload3],    3(%[dst])                    \n\t" /* load odd 2 from dst */
837           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
838           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
839 
840           /* odd 3. pixel */
841           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
842           "mthi             $zero,        $ac3                         \n\t"
843           "addqh_r.w        %[st3],       %[st3],         %[st1]       \n\t" /* average odd 1 */
844           "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
845           "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* odd 3 */
846           "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* odd 3 */
847           "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* odd 3 */
848           "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* odd 3 */
849           "sb               %[st3],       1(%[dst])                    \n\t" /* store odd 1 to dst */
850           "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
851           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
852 
853           /* odd 4. pixel */
854           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
855           "mthi             $zero,        $ac1                         \n\t"
856           "addqh_r.w        %[qload3],    %[qload3],      %[st1]       \n\t" /* average odd 2 */
857           "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
858           "sb               %[qload3],    3(%[dst])                    \n\t" /* store odd 2 to dst */
859           "lbu              %[qload1],    5(%[dst])                    \n\t" /* load odd 3 from dst */
860           "ulw              %[qload2],    17(%[src])                   \n\t"
861           "dpa.w.ph         $ac3,         %[p4],          %[filter12]  \n\t" /* odd 4 */
862           "dpa.w.ph         $ac3,         %[p1],          %[filter34]  \n\t" /* odd 4 */
863           "dpa.w.ph         $ac3,         %[p5],          %[filter56]  \n\t" /* odd 4 */
864           "dpa.w.ph         $ac3,         %[p2],          %[filter78]  \n\t" /* odd 4 */
865           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
866           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
867 
868           "lbu              %[st1],       7(%[dst])                    \n\t" /* load odd 4 from dst */
869 
870           /* odd 5. pixel */
871           "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
872           "mthi             $zero,        $ac2                         \n\t"
873           "addqh_r.w        %[qload1],    %[qload1],      %[st2]       \n\t" /* average odd 3 */
874           "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
875           "sb               %[qload1],    5(%[dst])                    \n\t" /* store odd 3 to dst */
876           "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* odd 5 */
877           "dpa.w.ph         $ac1,         %[p5],          %[filter34]  \n\t" /* odd 5 */
878           "dpa.w.ph         $ac1,         %[p2],          %[filter56]  \n\t" /* odd 5 */
879           "dpa.w.ph         $ac1,         %[p3],          %[filter78]  \n\t" /* odd 5 */
880           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
881           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
882 
883           "lbu              %[qload1],    9(%[dst])                    \n\t" /* load odd 5 from dst */
884 
885           /* odd 6. pixel */
886           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
887           "mthi             $zero,        $ac3                         \n\t"
888           "addqh_r.w        %[st1],       %[st1],         %[st3]       \n\t" /* average odd 4 */
889           "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
890           "sb               %[st1],       7(%[dst])                    \n\t" /* store odd 4 to dst */
891           "ulw              %[qload3],    21(%[src])                   \n\t"
892           "dpa.w.ph         $ac2,         %[p5],          %[filter12]  \n\t" /* odd 6 */
893           "dpa.w.ph         $ac2,         %[p2],          %[filter34]  \n\t" /* odd 6 */
894           "dpa.w.ph         $ac2,         %[p3],          %[filter56]  \n\t" /* odd 6 */
895           "dpa.w.ph         $ac2,         %[p4],          %[filter78]  \n\t" /* odd 6 */
896           "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
897           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
898 
899           /* odd 7. pixel */
900           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
901           "mthi             $zero,        $ac1                         \n\t"
902           "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 5 */
903           "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
904           "sb               %[qload1],    9(%[dst])                    \n\t" /* store odd 5 to dst */
905           "lbu              %[qload2],    11(%[dst])                   \n\t" /* load odd 6 from dst */
906           "dpa.w.ph         $ac3,         %[p2],          %[filter12]  \n\t" /* odd 7 */
907           "dpa.w.ph         $ac3,         %[p3],          %[filter34]  \n\t" /* odd 7 */
908           "dpa.w.ph         $ac3,         %[p4],          %[filter56]  \n\t" /* odd 7 */
909           "dpa.w.ph         $ac3,         %[p1],          %[filter78]  \n\t" /* odd 7 */
910           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
911 
912           "lbu              %[qload3],    13(%[dst])                   \n\t" /* load odd 7 from dst */
913 
914           /* odd 8. pixel */
915           "dpa.w.ph         $ac1,         %[p3],          %[filter12]  \n\t" /* odd 8 */
916           "dpa.w.ph         $ac1,         %[p4],          %[filter34]  \n\t" /* odd 8 */
917           "dpa.w.ph         $ac1,         %[p1],          %[filter56]  \n\t" /* odd 8 */
918           "dpa.w.ph         $ac1,         %[p5],          %[filter78]  \n\t" /* odd 8 */
919           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
920 
921           "lbu              %[qload1],    15(%[dst])                   \n\t" /* load odd 8 from dst */
922 
923           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
924           "addqh_r.w        %[qload2],    %[qload2],      %[st2]       \n\t" /* average odd 6 */
925 
926           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
927           "addqh_r.w        %[qload3],    %[qload3],      %[st3]       \n\t" /* average odd 7 */
928 
929           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
930           "addqh_r.w        %[qload1],    %[qload1],      %[st1]       \n\t" /* average odd 8 */
931 
932           "sb               %[qload2],    11(%[dst])                   \n\t" /* store odd 6 to dst */
933           "sb               %[qload3],    13(%[dst])                   \n\t" /* store odd 7 to dst */
934           "sb               %[qload1],    15(%[dst])                   \n\t" /* store odd 8 to dst */
935 
936           : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2),
937             [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
938             [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
939             [qload3] "=&r" (qload3), [p5] "=&r" (p5),
940             [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
941           : [filter12] "r" (filter12), [filter34] "r" (filter34),
942             [filter56] "r" (filter56), [filter78] "r" (filter78),
943             [vector_64] "r" (vector_64),
944             [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
945       );
946 
947       src += 16;
948       dst += 16;
949     }
950 
951     /* Next row... */
952     src_ptr += src_stride;
953     dst_ptr += dst_stride;
954   }
955 }
956 
/*
 * Horizontal convolve-and-average entry point for MIPS DSPr2.
 *
 * Picks an implementation from the horizontal step, the shape of the kernel
 * at filter_x, and the block width:
 *   - x_step_q4 != 16            -> generic C implementation;
 *   - second 32-bit word of the
 *     kernel equals 0x800000     -> vp9_convolve_avg();
 *   - first 32-bit word is zero  -> vp9_convolve2_avg_horiz_dspr2();
 *   - otherwise                  -> width-specialized 8-tap DSPr2 kernels for
 *                                   w in {4, 8, 16, 32, 64}, generic C for
 *                                   any other width.
 *
 * src/src_stride, dst/dst_stride  source and destination pointers and their
 *                                 row strides.
 * filter_x, x_step_q4             horizontal kernel (read as four 32-bit
 *                                 words) and horizontal step; 16 is the only
 *                                 step the DSPr2 kernels are used for.
 * filter_y, y_step_q4             not used here; forwarded unchanged to the
 *                                 functions that take the full argument list.
 * w, h                            block width and height.
 */
void vp9_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const int16_t *filter_x, int x_step_q4,
                                   const int16_t *filter_y, int y_step_q4,
                                   int w, int h) {
  if (16 != x_step_q4) {
    /* Every shortcut below inspects only the single kernel at filter_x, so
     * none of them may be taken for a non-unit step. Check the step first and
     * let the generic C implementation handle all such calls.
     * NOTE(review): assumes the C routine is the reference for scaled steps,
     * as the pre-existing fallback already implied. */
    vp9_convolve8_avg_horiz_c(src, src_stride,
                              dst, dst_stride,
                              filter_x, x_step_q4,
                              filter_y, y_step_q4,
                              w, h);
    return;
  }

  if (((const int32_t *)filter_x)[1] == 0x800000) {
    /* Taps 2..3 read as one word equal 0x800000 (tap[2] == 0, tap[3] == 128
     * on a little-endian target) - presumably the pass-through kernel, so
     * only the averaging with dst is required. */
    vp9_convolve_avg(src, src_stride,
                     dst, dst_stride,
                     filter_x, x_step_q4,
                     filter_y, y_step_q4,
                     w, h);
  } else if (((const int32_t *)filter_x)[0] == 0) {
    /* The two leading taps are zero: hand over to the 2-tap variant. */
    vp9_convolve2_avg_horiz_dspr2(src, src_stride,
                                  dst, dst_stride,
                                  filter_x, x_step_q4,
                                  filter_y, y_step_q4,
                                  w, h);
  } else {
    uint32_t pos = 38;

    /* The width-specialized kernels are handed a pointer three pixels to the
     * left of the caller's src. */
    src -= 3;

    /* bit position for extract from acc */
    __asm__ __volatile__ (
      "wrdsp      %[pos],     1           \n\t"
      :
      : [pos] "r" (pos)
    );

    /* prefetch data to cache memory */
    vp9_prefetch_load(src);
    vp9_prefetch_load(src + 32);
    vp9_prefetch_store(dst);

    switch (w) {
      case 4:
        convolve_avg_horiz_4_dspr2(src, src_stride,
                                   dst, dst_stride,
                                   filter_x, h);
        break;
      case 8:
        convolve_avg_horiz_8_dspr2(src, src_stride,
                                   dst, dst_stride,
                                   filter_x, h);
        break;
      case 16:
        /* Last argument: presumably the number of 16-pixel groups per row. */
        convolve_avg_horiz_16_dspr2(src, src_stride,
                                    dst, dst_stride,
                                    filter_x, h, 1);
        break;
      case 32:
        convolve_avg_horiz_16_dspr2(src, src_stride,
                                    dst, dst_stride,
                                    filter_x, h, 2);
        break;
      case 64:
        /* Wider rows: extend the prefetch beyond what was issued above. */
        vp9_prefetch_load(src + 64);
        vp9_prefetch_store(dst + 32);

        convolve_avg_horiz_64_dspr2(src, src_stride,
                                    dst, dst_stride,
                                    filter_x, h);
        break;
      default:
        /* No specialized kernel for this width; undo the -3 adjustment so the
         * C implementation receives the caller's original src. */
        vp9_convolve8_avg_horiz_c(src + 3, src_stride,
                                  dst, dst_stride,
                                  filter_x, x_step_q4,
                                  filter_y, y_step_q4,
                                  w, h);
        break;
    }
  }
}
1038 #endif
1039