/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <stdio.h>

#include "./vpx_config.h"
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_common.h"
#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"
#include "vp9/common/vp9_convolve.h"
#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"

#if HAVE_DSPR2
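/* Vertical 8-tap filter with a rounding average against the pixels already
 * in dst.  Handles widths of 4, 8, 16 and 32; the width-64 case has a
 * dedicated routine below. */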
static void convolve_avg_vert_4_dspr2(const uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int16_t *filter_y,
                                      int32_t w,
                                      int32_t h) {
  int32_t       x, y;
  const uint8_t *src_ptr;
  uint8_t       *dst_ptr;
  uint8_t       *cm = vp9_ff_cropTbl;
  uint32_t      vector4a = 64;
  uint32_t      load1, load2, load3, load4;
  uint32_t      p1, p2;
  uint32_t      n1, n2;
  uint32_t      scratch1, scratch2;
  uint32_t      store1, store2;
  int32_t       vector1b, vector2b, vector3b, vector4b;
  int32_t       Temp1, Temp2;

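  /* The eight 16-bit taps are read as four 32-bit words; each word packs a
   * pair of adjacent taps so dpa.w.ph can multiply-accumulate two taps per
   * instruction. */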
  vector1b = ((const int32_t *)filter_y)[0];
  vector2b = ((const int32_t *)filter_y)[1];
  vector3b = ((const int32_t *)filter_y)[2];
  vector4b = ((const int32_t *)filter_y)[3];

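  /* Back up three rows: the 8-tap vertical window spans rows -3 .. +4
   * relative to the output row. */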
  src -= 3 * src_stride;

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    vp9_prefetch_store(dst + dst_stride);

    for (x = 0; x < w; x += 4) {
      src_ptr = src + x;
      dst_ptr = dst + x;

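      /* Four adjacent output pixels per pass, one per DSP accumulator
       * ($ac0-$ac3).  extp pulls the filtered sum out of the accumulator,
       * the cm (cropTbl) lookup clamps it to [0, 255], and addqh_r.w forms
       * the rounded average with the byte already in dst. */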
      __asm__ __volatile__ (
          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load4],     0(%[src_ptr])                   \n\t"

          "mtlo             %[vector4a],  $ac0                            \n\t"
          "mtlo             %[vector4a],  $ac1                            \n\t"
          "mtlo             %[vector4a],  $ac2                            \n\t"
          "mtlo             %[vector4a],  $ac3                            \n\t"
          "mthi             $zero,        $ac0                            \n\t"
          "mthi             $zero,        $ac1                            \n\t"
          "mthi             $zero,        $ac2                            \n\t"
          "mthi             $zero,        $ac3                            \n\t"

          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac0,         %[p1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac0,         %[p2],          %[vector2b]     \n\t"
          "dpa.w.ph         $ac1,         %[n1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac1,         %[n2],          %[vector2b]     \n\t"

          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac2,         %[p1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac2,         %[p2],          %[vector2b]     \n\t"
          "dpa.w.ph         $ac3,         %[n1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac3,         %[n2],          %[vector2b]     \n\t"

          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load4],     0(%[src_ptr])                   \n\t"

          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac0,         %[p1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac0,         %[p2],          %[vector4b]     \n\t"
          "extp             %[Temp1],     $ac0,           31              \n\t"
          "dpa.w.ph         $ac1,         %[n1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac1,         %[n2],          %[vector4b]     \n\t"
          "extp             %[Temp2],     $ac1,           31              \n\t"

          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "lbu              %[scratch1],  0(%[dst_ptr])                   \n\t"
          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
          "lbu              %[scratch2],  1(%[dst_ptr])                   \n\t"

          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
          "dpa.w.ph         $ac2,         %[p1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac2,         %[p2],          %[vector4b]     \n\t"
          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 1 */
          "extp             %[Temp1],     $ac2,           31              \n\t"

          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
          "dpa.w.ph         $ac3,         %[n1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac3,         %[n2],          %[vector4b]     \n\t"
          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 2 */
          "extp             %[Temp2],     $ac3,           31              \n\t"
          "lbu              %[scratch1],  2(%[dst_ptr])                   \n\t"

          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
          "sb               %[store2],    1(%[dst_ptr])                   \n\t"
          "lbu              %[scratch2],  3(%[dst_ptr])                   \n\t"

          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 3 */
          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 4 */

          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
          "sb               %[store2],    3(%[dst_ptr])                   \n\t"

          : [load1] "=&r" (load1), [load2] "=&r" (load2),
            [load3] "=&r" (load3), [load4] "=&r" (load4),
            [p1] "=&r" (p1), [p2] "=&r" (p2), [n1] "=&r" (n1), [n2] "=&r" (n2),
            [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
            [store1] "=&r" (store1), [store2] "=&r" (store2),
            [src_ptr] "+r" (src_ptr)
          : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
            [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
            [vector4a] "r" (vector4a),
            [src_stride] "r" (src_stride), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
      );
    }

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}

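/* Same vertical 8-tap average filter as above, specialized for w == 64 so
 * that both halves of the destination row can be prefetched. */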
static void convolve_avg_vert_64_dspr2(const uint8_t *src,
                                       int32_t src_stride,
                                       uint8_t *dst,
                                       int32_t dst_stride,
                                       const int16_t *filter_y,
                                       int32_t h) {
  int32_t       x, y;
  const uint8_t *src_ptr;
  uint8_t       *dst_ptr;
  uint8_t       *cm = vp9_ff_cropTbl;
  uint32_t      vector4a = 64;
  uint32_t      load1, load2, load3, load4;
  uint32_t      p1, p2;
  uint32_t      n1, n2;
  uint32_t      scratch1, scratch2;
  uint32_t      store1, store2;
  int32_t       vector1b, vector2b, vector3b, vector4b;
  int32_t       Temp1, Temp2;

  vector1b = ((const int32_t *)filter_y)[0];
  vector2b = ((const int32_t *)filter_y)[1];
  vector3b = ((const int32_t *)filter_y)[2];
  vector4b = ((const int32_t *)filter_y)[3];

  src -= 3 * src_stride;

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    vp9_prefetch_store(dst + dst_stride);
    vp9_prefetch_store(dst + dst_stride + 32);

    for (x = 0; x < 64; x += 4) {
      src_ptr = src + x;
      dst_ptr = dst + x;

      __asm__ __volatile__ (
          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load4],     0(%[src_ptr])                   \n\t"

          "mtlo             %[vector4a],  $ac0                            \n\t"
          "mtlo             %[vector4a],  $ac1                            \n\t"
          "mtlo             %[vector4a],  $ac2                            \n\t"
          "mtlo             %[vector4a],  $ac3                            \n\t"
          "mthi             $zero,        $ac0                            \n\t"
          "mthi             $zero,        $ac1                            \n\t"
          "mthi             $zero,        $ac2                            \n\t"
          "mthi             $zero,        $ac3                            \n\t"

          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac0,         %[p1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac0,         %[p2],          %[vector2b]     \n\t"
          "dpa.w.ph         $ac1,         %[n1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac1,         %[n2],          %[vector2b]     \n\t"

          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac2,         %[p1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac2,         %[p2],          %[vector2b]     \n\t"
          "dpa.w.ph         $ac3,         %[n1],          %[vector1b]     \n\t"
          "dpa.w.ph         $ac3,         %[n2],          %[vector2b]     \n\t"

          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load1],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load2],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load3],     0(%[src_ptr])                   \n\t"
          "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
          "ulw              %[load4],     0(%[src_ptr])                   \n\t"

          "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */

          "dpa.w.ph         $ac0,         %[p1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac0,         %[p2],          %[vector4b]     \n\t"
          "extp             %[Temp1],     $ac0,           31              \n\t"
          "dpa.w.ph         $ac1,         %[n1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac1,         %[n2],          %[vector4b]     \n\t"
          "extp             %[Temp2],     $ac1,           31              \n\t"

          "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
          "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
          "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
          "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
          "lbu              %[scratch1],  0(%[dst_ptr])                   \n\t"
          "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
          "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
          "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
          "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
          "lbu              %[scratch2],  1(%[dst_ptr])                   \n\t"

          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
          "dpa.w.ph         $ac2,         %[p1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac2,         %[p2],          %[vector4b]     \n\t"
          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 1 */
          "extp             %[Temp1],     $ac2,           31              \n\t"

          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
          "dpa.w.ph         $ac3,         %[n1],          %[vector3b]     \n\t"
          "dpa.w.ph         $ac3,         %[n2],          %[vector4b]     \n\t"
          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 2 */
          "extp             %[Temp2],     $ac3,           31              \n\t"
          "lbu              %[scratch1],  2(%[dst_ptr])                   \n\t"

          "sb               %[store1],    0(%[dst_ptr])                   \n\t"
          "sb               %[store2],    1(%[dst_ptr])                   \n\t"
          "lbu              %[scratch2],  3(%[dst_ptr])                   \n\t"

          "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
          "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
          "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 3 */
          "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 4 */

          "sb               %[store1],    2(%[dst_ptr])                   \n\t"
          "sb               %[store2],    3(%[dst_ptr])                   \n\t"

          : [load1] "=&r" (load1), [load2] "=&r" (load2),
            [load3] "=&r" (load3), [load4] "=&r" (load4),
            [p1] "=&r" (p1), [p2] "=&r" (p2), [n1] "=&r" (n1), [n2] "=&r" (n2),
            [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
            [store1] "=&r" (store1), [store2] "=&r" (store2),
            [src_ptr] "+r" (src_ptr)
          : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
            [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
            [vector4a] "r" (vector4a),
            [src_stride] "r" (src_stride), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
      );
    }

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}

void vp9_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int x_step_q4,
                                  const int16_t *filter_y, int y_step_q4,
                                  int w, int h) {
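  /* The taps are examined as packed 16-bit pairs: a second pair equal to
   * 0x800000 (a lone 128 tap with this packing) selects the plain averaging
   * copy, and a zero first pair selects the cheaper 2-tap kernel. */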
  if (((const int32_t *)filter_y)[1] == 0x800000) {
    vp9_convolve_avg(src, src_stride,
                     dst, dst_stride,
                     filter_x, x_step_q4,
                     filter_y, y_step_q4,
                     w, h);
  } else if (((const int32_t *)filter_y)[0] == 0) {
    vp9_convolve2_avg_vert_dspr2(src, src_stride,
                                 dst, dst_stride,
                                 filter_x, x_step_q4,
                                 filter_y, y_step_q4,
                                 w, h);
  } else {
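    /* The DSPr2 kernels only cover the unscaled case (y_step_q4 == 16);
     * scaled convolutions fall back to the C implementation. */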
    if (16 == y_step_q4) {
      uint32_t pos = 38;

      /* bit position for extract from acc: pos = 38 makes extp return the
         accumulator shifted right by the 7-bit filter precision */
      __asm__ __volatile__ (
        "wrdsp      %[pos],     1           \n\t"
        :
        : [pos] "r" (pos)
      );

      vp9_prefetch_store(dst);

      switch (w) {
        case 4:
        case 8:
        case 16:
        case 32:
          convolve_avg_vert_4_dspr2(src, src_stride,
                                    dst, dst_stride,
                                    filter_y, w, h);
          break;
        case 64:
          vp9_prefetch_store(dst + 32);
          convolve_avg_vert_64_dspr2(src, src_stride,
                                     dst, dst_stride,
                                     filter_y, h);
          break;
        default:
          vp9_convolve8_avg_vert_c(src, src_stride,
                                   dst, dst_stride,
                                   filter_x, x_step_q4,
                                   filter_y, y_step_q4,
                                   w, h);
          break;
      }
    } else {
      vp9_convolve8_avg_vert_c(src, src_stride,
                               dst, dst_stride,
                               filter_x, x_step_q4,
                               filter_y, y_step_q4,
                               w, h);
    }
  }
}

void vp9_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x, int x_step_q4,
                             const int16_t *filter_y, int y_step_q4,
                             int w, int h) {
  /* Fixed size intermediate buffer places limits on parameters. */
  DECLARE_ALIGNED_ARRAY(32, uint8_t, temp, 64 * 135);
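  /* Seven extra rows cover the vertical 8-tap footprint: three above and
   * four below each output row. */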
  int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7;

  assert(w <= 64);
  assert(h <= 64);

  if (intermediate_height < h)
    intermediate_height = h;

  if (x_step_q4 != 16 || y_step_q4 != 16)
    return vp9_convolve8_avg_c(src, src_stride,
                               dst, dst_stride,
                               filter_x, x_step_q4,
                               filter_y, y_step_q4,
                               w, h);

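  /* Two-pass filtering: horizontal into the 64-wide temp buffer (including
   * the extra rows), then vertical with averaging into dst. */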
  vp9_convolve8_horiz(src - (src_stride * 3), src_stride,
                      temp, 64,
                      filter_x, x_step_q4,
                      filter_y, y_step_q4,
                      w, intermediate_height);

  vp9_convolve8_avg_vert(temp + 64 * 3, 64,
                         dst, dst_stride,
                         filter_x, x_step_q4,
                         filter_y, y_step_q4,
                         w, h);
}

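/* Rounding average of src into dst with no filtering.  adduh_r.qb averages
 * four bytes per instruction; each width case unrolls one full row. */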
void vp9_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const int16_t *filter_x, int filter_x_stride,
                            const int16_t *filter_y, int filter_y_stride,
                            int w, int h) {
  int x, y;
  uint32_t tp1, tp2, tn1;
  uint32_t tp3, tp4, tn2;

  /* prefetch data to cache memory */
  vp9_prefetch_load(src);
  vp9_prefetch_load(src + 32);
  vp9_prefetch_store(dst);

  switch (w) {
    case 4:
      /* 1 word storage */
      for (y = h; y--; ) {
        vp9_prefetch_load(src + src_stride);
        vp9_prefetch_load(src + src_stride + 32);
        vp9_prefetch_store(dst + dst_stride);

        __asm__ __volatile__ (
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         0(%[dst])      \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "sw               %[tn1],         0(%[dst])      \n\t"  /* store */

            : [tn1] "=&r" (tn1), [tp1] "=&r" (tp1),
              [tp2] "=&r" (tp2)
            : [src] "r" (src), [dst] "r" (dst)
        );

        src += src_stride;
        dst += dst_stride;
      }
      break;
    case 8:
      /* 2 word storage */
      for (y = h; y--; ) {
        vp9_prefetch_load(src + src_stride);
        vp9_prefetch_load(src + src_stride + 32);
        vp9_prefetch_store(dst + dst_stride);

        __asm__ __volatile__ (
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         0(%[dst])      \n\t"
            "ulw              %[tp3],         4(%[src])      \n\t"
            "ulw              %[tp4],         4(%[dst])      \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "sw               %[tn1],         0(%[dst])      \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         4(%[dst])      \n\t"  /* store */

            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
              [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
            : [src] "r" (src), [dst] "r" (dst)
        );

        src += src_stride;
        dst += dst_stride;
      }
      break;
    case 16:
      /* 4 word storage */
      for (y = h; y--; ) {
        vp9_prefetch_load(src + src_stride);
        vp9_prefetch_load(src + src_stride + 32);
        vp9_prefetch_store(dst + dst_stride);

        __asm__ __volatile__ (
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         0(%[dst])      \n\t"
            "ulw              %[tp3],         4(%[src])      \n\t"
            "ulw              %[tp4],         4(%[dst])      \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         8(%[src])      \n\t"
            "ulw              %[tp2],         8(%[dst])      \n\t"
            "sw               %[tn1],         0(%[dst])      \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         4(%[dst])      \n\t"  /* store */
            "ulw              %[tp3],         12(%[src])     \n\t"
            "ulw              %[tp4],         12(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "sw               %[tn1],         8(%[dst])      \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         12(%[dst])     \n\t"  /* store */

            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
              [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
            : [src] "r" (src), [dst] "r" (dst)
        );

        src += src_stride;
        dst += dst_stride;
      }
      break;
    case 32:
      /* 8 word storage */
      for (y = h; y--; ) {
        vp9_prefetch_load(src + src_stride);
        vp9_prefetch_load(src + src_stride + 32);
        vp9_prefetch_store(dst + dst_stride);

        __asm__ __volatile__ (
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         0(%[dst])      \n\t"
            "ulw              %[tp3],         4(%[src])      \n\t"
            "ulw              %[tp4],         4(%[dst])      \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         8(%[src])      \n\t"
            "ulw              %[tp2],         8(%[dst])      \n\t"
            "sw               %[tn1],         0(%[dst])      \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         4(%[dst])      \n\t"  /* store */
            "ulw              %[tp3],         12(%[src])     \n\t"
            "ulw              %[tp4],         12(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         16(%[src])     \n\t"
            "ulw              %[tp2],         16(%[dst])     \n\t"
            "sw               %[tn1],         8(%[dst])      \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         12(%[dst])     \n\t"  /* store */
            "ulw              %[tp3],         20(%[src])     \n\t"
            "ulw              %[tp4],         20(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         24(%[src])     \n\t"
            "ulw              %[tp2],         24(%[dst])     \n\t"
            "sw               %[tn1],         16(%[dst])     \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         20(%[dst])     \n\t"  /* store */
            "ulw              %[tp3],         28(%[src])     \n\t"
            "ulw              %[tp4],         28(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "sw               %[tn1],         24(%[dst])     \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         28(%[dst])     \n\t"  /* store */

            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
              [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
            : [src] "r" (src), [dst] "r" (dst)
        );

        src += src_stride;
        dst += dst_stride;
      }
      break;
    case 64:
      vp9_prefetch_load(src + 64);
      vp9_prefetch_store(dst + 32);

      /* 16 word storage */
      for (y = h; y--; ) {
        vp9_prefetch_load(src + src_stride);
        vp9_prefetch_load(src + src_stride + 32);
        vp9_prefetch_load(src + src_stride + 64);
        vp9_prefetch_store(dst + dst_stride);
        vp9_prefetch_store(dst + dst_stride + 32);

        __asm__ __volatile__ (
            "ulw              %[tp1],         0(%[src])      \n\t"
            "ulw              %[tp2],         0(%[dst])      \n\t"
            "ulw              %[tp3],         4(%[src])      \n\t"
            "ulw              %[tp4],         4(%[dst])      \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         8(%[src])      \n\t"
            "ulw              %[tp2],         8(%[dst])      \n\t"
            "sw               %[tn1],         0(%[dst])      \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         4(%[dst])      \n\t"  /* store */
            "ulw              %[tp3],         12(%[src])     \n\t"
            "ulw              %[tp4],         12(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         16(%[src])     \n\t"
            "ulw              %[tp2],         16(%[dst])     \n\t"
            "sw               %[tn1],         8(%[dst])      \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         12(%[dst])     \n\t"  /* store */
            "ulw              %[tp3],         20(%[src])     \n\t"
            "ulw              %[tp4],         20(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         24(%[src])     \n\t"
            "ulw              %[tp2],         24(%[dst])     \n\t"
            "sw               %[tn1],         16(%[dst])     \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         20(%[dst])     \n\t"  /* store */
            "ulw              %[tp3],         28(%[src])     \n\t"
            "ulw              %[tp4],         28(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         32(%[src])     \n\t"
            "ulw              %[tp2],         32(%[dst])     \n\t"
            "sw               %[tn1],         24(%[dst])     \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         28(%[dst])     \n\t"  /* store */
            "ulw              %[tp3],         36(%[src])     \n\t"
            "ulw              %[tp4],         36(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         40(%[src])     \n\t"
            "ulw              %[tp2],         40(%[dst])     \n\t"
            "sw               %[tn1],         32(%[dst])     \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         36(%[dst])     \n\t"  /* store */
            "ulw              %[tp3],         44(%[src])     \n\t"
            "ulw              %[tp4],         44(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         48(%[src])     \n\t"
            "ulw              %[tp2],         48(%[dst])     \n\t"
            "sw               %[tn1],         40(%[dst])     \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         44(%[dst])     \n\t"  /* store */
            "ulw              %[tp3],         52(%[src])     \n\t"
            "ulw              %[tp4],         52(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "ulw              %[tp1],         56(%[src])     \n\t"
            "ulw              %[tp2],         56(%[dst])     \n\t"
            "sw               %[tn1],         48(%[dst])     \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         52(%[dst])     \n\t"  /* store */
            "ulw              %[tp3],         60(%[src])     \n\t"
            "ulw              %[tp4],         60(%[dst])     \n\t"
            "adduh_r.qb       %[tn1], %[tp2], %[tp1]         \n\t"  /* average */
            "sw               %[tn1],         56(%[dst])     \n\t"  /* store */
            "adduh_r.qb       %[tn2], %[tp3], %[tp4]         \n\t"  /* average */
            "sw               %[tn2],         60(%[dst])     \n\t"  /* store */

            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
              [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
            : [src] "r" (src), [dst] "r" (dst)
        );

        src += src_stride;
        dst += dst_stride;
      }
      break;
    default:
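      /* Any other width: plain scalar rounding average. */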
      for (y = h; y > 0; --y) {
        for (x = 0; x < w; ++x) {
          dst[x] = (dst[x] + src[x] + 1) >> 1;
        }

        src += src_stride;
        dst += dst_stride;
      }
      break;
  }
}
#endif