1 /*
2  *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <assert.h>
12 #include <stdio.h>
13 
14 #include "./vpx_config.h"
15 #include "./vp9_rtcd.h"
16 #include "vp9/common/vp9_common.h"
17 #include "vpx/vpx_integer.h"
18 #include "vpx_ports/mem.h"
19 #include "vp9/common/vp9_filter.h"
20 #include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
21 
22 #if HAVE_DSPR2
23 uint8_t vp9_ff_cropTbl_a[256 + 2 * CROP_WIDTH];
24 uint8_t *vp9_ff_cropTbl;
25 
vp9_dsputil_static_init(void)26 void vp9_dsputil_static_init(void) {
27   int i;
28 
29   for (i = 0; i < 256; i++) vp9_ff_cropTbl_a[i + CROP_WIDTH] = i;
30 
31   for (i = 0; i < CROP_WIDTH; i++) {
32     vp9_ff_cropTbl_a[i] = 0;
33     vp9_ff_cropTbl_a[i + CROP_WIDTH + 256] = 255;
34   }
35 
36   vp9_ff_cropTbl = &vp9_ff_cropTbl_a[CROP_WIDTH];
37 }
38 
convolve_horiz_4_transposed_dspr2(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int16_t * filter_x0,int32_t h)39 static void convolve_horiz_4_transposed_dspr2(const uint8_t *src,
40                                               int32_t src_stride,
41                                               uint8_t *dst,
42                                               int32_t dst_stride,
43                                               const int16_t *filter_x0,
44                                               int32_t h) {
45   int32_t y;
46   uint8_t *cm = vp9_ff_cropTbl;
47   uint8_t *dst_ptr;
48   int32_t vector1b, vector2b, vector3b, vector4b;
49   int32_t Temp1, Temp2, Temp3, Temp4;
50   uint32_t vector4a = 64;
51   uint32_t tp1, tp2;
52   uint32_t p1, p2, p3, p4;
53   uint32_t tn1, tn2;
54 
55   vector1b = ((const int32_t *)filter_x0)[0];
56   vector2b = ((const int32_t *)filter_x0)[1];
57   vector3b = ((const int32_t *)filter_x0)[2];
58   vector4b = ((const int32_t *)filter_x0)[3];
59 
60   for (y = h; y--;) {
61     dst_ptr = dst;
62     /* prefetch data to cache memory */
63     vp9_prefetch_load(src + src_stride);
64     vp9_prefetch_load(src + src_stride + 32);
65 
66     __asm__ __volatile__ (
67         "ulw              %[tp1],         0(%[src])                      \n\t"
68         "ulw              %[tp2],         4(%[src])                      \n\t"
69 
70         /* even 1. pixel */
71         "mtlo             %[vector4a],    $ac3                           \n\t"
72         "mthi             $zero,          $ac3                           \n\t"
73         "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
74         "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
75         "preceu.ph.qbr    %[p3],          %[tp2]                         \n\t"
76         "preceu.ph.qbl    %[p4],          %[tp2]                         \n\t"
77         "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
78         "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
79         "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
80         "ulw              %[tn2],         8(%[src])                      \n\t"
81         "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
82         "extp             %[Temp1],       $ac3,           31             \n\t"
83 
84         /* even 2. pixel */
85         "mtlo             %[vector4a],    $ac2                           \n\t"
86         "mthi             $zero,          $ac2                           \n\t"
87         "preceu.ph.qbr    %[p1],          %[tn2]                         \n\t"
88         "balign           %[tn1],         %[tn2],         3              \n\t"
89         "balign           %[tn2],         %[tp2],         3              \n\t"
90         "balign           %[tp2],         %[tp1],         3              \n\t"
91         "dpa.w.ph         $ac2,           %[p2],          %[vector1b]    \n\t"
92         "dpa.w.ph         $ac2,           %[p3],          %[vector2b]    \n\t"
93         "dpa.w.ph         $ac2,           %[p4],          %[vector3b]    \n\t"
94         "dpa.w.ph         $ac2,           %[p1],          %[vector4b]    \n\t"
95         "extp             %[Temp3],       $ac2,           31             \n\t"
96 
97         /* odd 1. pixel */
98         "lbux             %[tp1],         %[Temp1](%[cm])                \n\t"
99         "mtlo             %[vector4a],    $ac3                           \n\t"
100         "mthi             $zero,          $ac3                           \n\t"
101         "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"
102         "preceu.ph.qbl    %[p2],          %[tp2]                         \n\t"
103         "preceu.ph.qbr    %[p3],          %[tn2]                         \n\t"
104         "preceu.ph.qbl    %[p4],          %[tn2]                         \n\t"
105         "dpa.w.ph         $ac3,           %[p1],          %[vector1b]    \n\t"
106         "dpa.w.ph         $ac3,           %[p2],          %[vector2b]    \n\t"
107         "dpa.w.ph         $ac3,           %[p3],          %[vector3b]    \n\t"
108         "dpa.w.ph         $ac3,           %[p4],          %[vector4b]    \n\t"
109         "extp             %[Temp2],       $ac3,           31             \n\t"
110 
111         /* odd 2. pixel */
112         "lbux             %[tp2],         %[Temp3](%[cm])                \n\t"
113         "mtlo             %[vector4a],    $ac2                           \n\t"
114         "mthi             $zero,          $ac2                           \n\t"
115         "preceu.ph.qbr    %[p1],          %[tn1]                         \n\t"
116         "dpa.w.ph         $ac2,           %[p2],          %[vector1b]    \n\t"
117         "dpa.w.ph         $ac2,           %[p3],          %[vector2b]    \n\t"
118         "dpa.w.ph         $ac2,           %[p4],          %[vector3b]    \n\t"
119         "dpa.w.ph         $ac2,           %[p1],          %[vector4b]    \n\t"
120         "extp             %[Temp4],       $ac2,           31             \n\t"
121 
122         /* clamp */
123         "lbux             %[tn1],         %[Temp2](%[cm])                \n\t"
124         "lbux             %[p2],          %[Temp4](%[cm])                \n\t"
125 
126         /* store bytes */
127         "sb               %[tp1],         0(%[dst_ptr])                  \n\t"
128         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
129 
130         "sb               %[tn1],         0(%[dst_ptr])                  \n\t"
131         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
132 
133         "sb               %[tp2],         0(%[dst_ptr])                  \n\t"
134         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
135 
136         "sb               %[p2],          0(%[dst_ptr])                  \n\t"
137         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
138 
139         : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tn1] "=&r" (tn1), [tn2] "=&r" (tn2),
140           [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
141           [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4),
142           [dst_ptr] "+r" (dst_ptr)
143         : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
144           [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
145           [vector4a] "r" (vector4a),
146           [cm] "r" (cm), [src] "r" (src), [dst_stride] "r" (dst_stride)
147     );
148 
149     /* Next row... */
150     src += src_stride;
151     dst += 1;
152   }
153 }
154 
convolve_horiz_8_transposed_dspr2(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int16_t * filter_x0,int32_t h)155 static void convolve_horiz_8_transposed_dspr2(const uint8_t *src,
156                                               int32_t src_stride,
157                                               uint8_t *dst,
158                                               int32_t dst_stride,
159                                               const int16_t *filter_x0,
160                                               int32_t h) {
161   int32_t y;
162   uint8_t *cm = vp9_ff_cropTbl;
163   uint8_t *dst_ptr;
164   uint32_t vector4a = 64;
165   int32_t vector1b, vector2b, vector3b, vector4b;
166   int32_t Temp1, Temp2, Temp3;
167   uint32_t tp1, tp2, tp3;
168   uint32_t p1, p2, p3, p4, n1;
169   uint8_t *odd_dst;
170   uint32_t dst_pitch_2 = (dst_stride << 1);
171 
172   vector1b = ((const int32_t *)filter_x0)[0];
173   vector2b = ((const int32_t *)filter_x0)[1];
174   vector3b = ((const int32_t *)filter_x0)[2];
175   vector4b = ((const int32_t *)filter_x0)[3];
176 
177   for (y = h; y--;) {
178     /* prefetch data to cache memory */
179     vp9_prefetch_load(src + src_stride);
180     vp9_prefetch_load(src + src_stride + 32);
181 
182     dst_ptr = dst;
183     odd_dst = (dst_ptr + dst_stride);
184 
185     __asm__ __volatile__ (
186         "ulw              %[tp2],         0(%[src])                       \n\t"
187         "ulw              %[tp1],         4(%[src])                       \n\t"
188 
189         /* even 1. pixel */
190         "mtlo             %[vector4a],    $ac3                            \n\t"
191         "mthi             $zero,          $ac3                            \n\t"
192         "mtlo             %[vector4a],    $ac2                            \n\t"
193         "mthi             $zero,          $ac2                            \n\t"
194         "preceu.ph.qbr    %[p1],          %[tp2]                          \n\t"
195         "preceu.ph.qbl    %[p2],          %[tp2]                          \n\t"
196         "preceu.ph.qbr    %[p3],          %[tp1]                          \n\t"
197         "preceu.ph.qbl    %[p4],          %[tp1]                          \n\t"
198         "ulw              %[tp3],         8(%[src])                       \n\t"
199         "dpa.w.ph         $ac3,           %[p1],          %[vector1b]     \n\t"
200         "dpa.w.ph         $ac3,           %[p2],          %[vector2b]     \n\t"
201         "dpa.w.ph         $ac3,           %[p3],          %[vector3b]     \n\t"
202         "dpa.w.ph         $ac3,           %[p4],          %[vector4b]     \n\t"
203         "extp             %[Temp1],       $ac3,           31              \n\t"
204 
205         /* even 2. pixel */
206         "preceu.ph.qbr    %[p1],          %[tp3]                          \n\t"
207         "preceu.ph.qbl    %[n1],          %[tp3]                          \n\t"
208         "ulw              %[tp2],         12(%[src])                      \n\t"
209         "dpa.w.ph         $ac2,           %[p2],          %[vector1b]     \n\t"
210         "dpa.w.ph         $ac2,           %[p3],          %[vector2b]     \n\t"
211         "dpa.w.ph         $ac2,           %[p4],          %[vector3b]     \n\t"
212         "dpa.w.ph         $ac2,           %[p1],          %[vector4b]     \n\t"
213         "extp             %[Temp3],       $ac2,           31              \n\t"
214 
215         /* even 3. pixel */
216         "lbux             %[Temp2],       %[Temp1](%[cm])                 \n\t"
217         "mtlo             %[vector4a],    $ac1                            \n\t"
218         "mthi             $zero,          $ac1                            \n\t"
219         "preceu.ph.qbr    %[p2],          %[tp2]                          \n\t"
220         "dpa.w.ph         $ac1,           %[p3],          %[vector1b]     \n\t"
221         "dpa.w.ph         $ac1,           %[p4],          %[vector2b]     \n\t"
222         "dpa.w.ph         $ac1,           %[p1],          %[vector3b]     \n\t"
223         "lbux             %[tp3],         %[Temp3](%[cm])                 \n\t"
224         "dpa.w.ph         $ac1,           %[n1],          %[vector4b]     \n\t"
225         "extp             %[p3],          $ac1,           31              \n\t"
226 
227         /* even 4. pixel */
228         "mtlo             %[vector4a],    $ac2                            \n\t"
229         "mthi             $zero,          $ac2                            \n\t"
230         "mtlo             %[vector4a],    $ac3                            \n\t"
231         "mthi             $zero,          $ac3                            \n\t"
232         "sb               %[Temp2],       0(%[dst_ptr])                   \n\t"
233         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
234         "sb               %[tp3],         0(%[dst_ptr])                   \n\t"
235         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
236 
237         "ulw              %[tp1],         1(%[src])                       \n\t"
238         "ulw              %[tp3],         5(%[src])                       \n\t"
239 
240         "dpa.w.ph         $ac2,           %[p4],          %[vector1b]     \n\t"
241         "dpa.w.ph         $ac2,           %[p1],          %[vector2b]     \n\t"
242         "dpa.w.ph         $ac2,           %[n1],          %[vector3b]     \n\t"
243         "dpa.w.ph         $ac2,           %[p2],          %[vector4b]     \n\t"
244         "extp             %[Temp3],       $ac2,           31              \n\t"
245 
246         "lbux             %[tp2],         %[p3](%[cm])                    \n\t"
247 
248         /* odd 1. pixel */
249         "mtlo             %[vector4a],    $ac1                            \n\t"
250         "mthi             $zero,          $ac1                            \n\t"
251         "preceu.ph.qbr    %[p1],          %[tp1]                          \n\t"
252         "preceu.ph.qbl    %[p2],          %[tp1]                          \n\t"
253         "preceu.ph.qbr    %[p3],          %[tp3]                          \n\t"
254         "preceu.ph.qbl    %[p4],          %[tp3]                          \n\t"
255         "sb               %[tp2],         0(%[dst_ptr])                   \n\t"
256         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
257         "ulw              %[tp2],         9(%[src])                       \n\t"
258 
259         "dpa.w.ph         $ac3,           %[p1],          %[vector1b]     \n\t"
260         "dpa.w.ph         $ac3,           %[p2],          %[vector2b]     \n\t"
261         "dpa.w.ph         $ac3,           %[p3],          %[vector3b]     \n\t"
262         "dpa.w.ph         $ac3,           %[p4],          %[vector4b]     \n\t"
263         "extp             %[Temp2],       $ac3,           31              \n\t"
264 
265         /* odd 2. pixel */
266         "lbux             %[tp1],         %[Temp3](%[cm])                 \n\t"
267         "mtlo             %[vector4a],    $ac3                            \n\t"
268         "mthi             $zero,          $ac3                            \n\t"
269         "mtlo             %[vector4a],    $ac2                            \n\t"
270         "mthi             $zero,          $ac2                            \n\t"
271         "preceu.ph.qbr    %[p1],          %[tp2]                          \n\t"
272         "preceu.ph.qbl    %[n1],          %[tp2]                          \n\t"
273         "ulw              %[Temp1],       13(%[src])                      \n\t"
274         "dpa.w.ph         $ac1,           %[p2],          %[vector1b]     \n\t"
275         "sb               %[tp1],         0(%[dst_ptr])                   \n\t"
276         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
277         "dpa.w.ph         $ac1,           %[p3],          %[vector2b]     \n\t"
278         "dpa.w.ph         $ac1,           %[p4],          %[vector3b]     \n\t"
279         "dpa.w.ph         $ac1,           %[p1],          %[vector4b]     \n\t"
280         "extp             %[Temp3],       $ac1,           31              \n\t"
281 
282         /* odd 3. pixel */
283         "lbux             %[tp3],         %[Temp2](%[cm])                 \n\t"
284         "preceu.ph.qbr    %[p2],          %[Temp1]                        \n\t"
285         "dpa.w.ph         $ac3,           %[p3],          %[vector1b]     \n\t"
286         "dpa.w.ph         $ac3,           %[p4],          %[vector2b]     \n\t"
287         "dpa.w.ph         $ac3,           %[p1],          %[vector3b]     \n\t"
288         "dpa.w.ph         $ac3,           %[n1],          %[vector4b]     \n\t"
289         "extp             %[Temp2],       $ac3,           31              \n\t"
290 
291         /* odd 4. pixel */
292         "sb               %[tp3],         0(%[odd_dst])                   \n\t"
293         "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
294         "dpa.w.ph         $ac2,           %[p4],          %[vector1b]     \n\t"
295         "dpa.w.ph         $ac2,           %[p1],          %[vector2b]     \n\t"
296         "dpa.w.ph         $ac2,           %[n1],          %[vector3b]     \n\t"
297         "dpa.w.ph         $ac2,           %[p2],          %[vector4b]     \n\t"
298         "extp             %[Temp1],       $ac2,           31              \n\t"
299 
300         /* clamp */
301         "lbux             %[p4],          %[Temp3](%[cm])                 \n\t"
302         "lbux             %[p2],          %[Temp2](%[cm])                 \n\t"
303         "lbux             %[n1],          %[Temp1](%[cm])                 \n\t"
304 
305         /* store bytes */
306         "sb               %[p4],          0(%[odd_dst])                   \n\t"
307         "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
308 
309         "sb               %[p2],          0(%[odd_dst])                   \n\t"
310         "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
311 
312         "sb               %[n1],          0(%[odd_dst])                   \n\t"
313 
314         : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3),
315           [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
316           [n1] "=&r" (n1),
317           [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
318           [dst_ptr] "+r" (dst_ptr), [odd_dst] "+r" (odd_dst)
319         : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
320           [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
321           [vector4a] "r" (vector4a), [cm] "r" (cm),
322           [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
323     );
324 
325     /* Next row... */
326     src += src_stride;
327     dst += 1;
328   }
329 }
330 
convolve_horiz_16_transposed_dspr2(const uint8_t * src_ptr,int32_t src_stride,uint8_t * dst_ptr,int32_t dst_stride,const int16_t * filter_x0,int32_t h,int32_t count)331 static void convolve_horiz_16_transposed_dspr2(const uint8_t *src_ptr,
332                                                int32_t src_stride,
333                                                uint8_t *dst_ptr,
334                                                int32_t dst_stride,
335                                                const int16_t *filter_x0,
336                                                int32_t h,
337                                                int32_t count) {
338   int32_t c, y;
339   const uint8_t *src;
340   uint8_t *dst;
341   uint8_t *cm = vp9_ff_cropTbl;
342   uint32_t vector_64 = 64;
343   int32_t  filter12, filter34, filter56, filter78;
344   int32_t  Temp1, Temp2, Temp3;
345   uint32_t qload1, qload2;
346   uint32_t p1, p2, p3, p4, p5;
347   uint32_t st1, st2, st3;
348   uint32_t dst_pitch_2 = (dst_stride << 1);
349   uint8_t  *odd_dst;
350 
351   filter12 = ((const int32_t *)filter_x0)[0];
352   filter34 = ((const int32_t *)filter_x0)[1];
353   filter56 = ((const int32_t *)filter_x0)[2];
354   filter78 = ((const int32_t *)filter_x0)[3];
355 
356   for (y = h; y--;) {
357     /* prefetch data to cache memory */
358     vp9_prefetch_load(src_ptr + src_stride);
359     vp9_prefetch_load(src_ptr + src_stride + 32);
360 
361     src = src_ptr;
362     dst = dst_ptr;
363 
364     odd_dst = (dst + dst_stride);
365 
366     for (c = 0; c < count; c++) {
367       __asm__ __volatile__ (
368           "ulw              %[qload1],        0(%[src])                       \n\t"
369           "ulw              %[qload2],        4(%[src])                       \n\t"
370 
371           /* even 1. pixel */
372           "mtlo             %[vector_64],     $ac1                            \n\t" /* even 1 */
373           "mthi             $zero,            $ac1                            \n\t"
374           "mtlo             %[vector_64],     $ac2                            \n\t" /* even 2 */
375           "mthi             $zero,            $ac2                            \n\t"
376           "preceu.ph.qbr    %[p3],            %[qload2]                       \n\t"
377           "preceu.ph.qbl    %[p4],            %[qload2]                       \n\t"
378           "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
379           "preceu.ph.qbl    %[p2],            %[qload1]                       \n\t"
380           "ulw              %[qload2],        8(%[src])                       \n\t"
381           "dpa.w.ph         $ac1,             %[p1],          %[filter12]     \n\t" /* even 1 */
382           "dpa.w.ph         $ac1,             %[p2],          %[filter34]     \n\t" /* even 1 */
383           "dpa.w.ph         $ac1,             %[p3],          %[filter56]     \n\t" /* even 1 */
384           "dpa.w.ph         $ac1,             %[p4],          %[filter78]     \n\t" /* even 1 */
385           "extp             %[Temp1],         $ac1,           31              \n\t" /* even 1 */
386 
387           /* even 2. pixel */
388           "mtlo             %[vector_64],     $ac3                            \n\t" /* even 3 */
389           "mthi             $zero,            $ac3                            \n\t"
390           "preceu.ph.qbr    %[p1],            %[qload2]                       \n\t"
391           "preceu.ph.qbl    %[p5],            %[qload2]                       \n\t"
392           "ulw              %[qload1],        12(%[src])                      \n\t"
393           "dpa.w.ph         $ac2,             %[p2],          %[filter12]     \n\t" /* even 1 */
394           "dpa.w.ph         $ac2,             %[p3],          %[filter34]     \n\t" /* even 1 */
395           "dpa.w.ph         $ac2,             %[p4],          %[filter56]     \n\t" /* even 1 */
396           "dpa.w.ph         $ac2,             %[p1],          %[filter78]     \n\t" /* even 1 */
397           "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 1 */
398           "extp             %[Temp2],         $ac2,           31              \n\t" /* even 1 */
399 
400           /* even 3. pixel */
401           "mtlo             %[vector_64],     $ac1                            \n\t" /* even 4 */
402           "mthi             $zero,            $ac1                            \n\t"
403           "preceu.ph.qbr    %[p2],            %[qload1]                       \n\t"
404           "sb               %[st1],           0(%[dst])                       \n\t" /* even 1 */
405           "addu             %[dst],           %[dst],         %[dst_pitch_2]             \n\t"
406           "dpa.w.ph         $ac3,             %[p3],          %[filter12]     \n\t" /* even 3 */
407           "dpa.w.ph         $ac3,             %[p4],          %[filter34]     \n\t" /* even 3 */
408           "dpa.w.ph         $ac3,             %[p1],          %[filter56]     \n\t" /* even 3 */
409           "dpa.w.ph         $ac3,             %[p5],          %[filter78]     \n\t" /* even 3 */
410           "extp             %[Temp3],         $ac3,           31              \n\t" /* even 3 */
411           "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 1 */
412 
413           /* even 4. pixel */
414           "mtlo             %[vector_64],     $ac2                            \n\t" /* even 5 */
415           "mthi             $zero,            $ac2                            \n\t"
416           "preceu.ph.qbl    %[p3],            %[qload1]                       \n\t"
417           "sb               %[st2],           0(%[dst])                       \n\t" /* even 2 */
418           "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
419           "ulw              %[qload2],        16(%[src])                      \n\t"
420           "dpa.w.ph         $ac1,             %[p4],          %[filter12]     \n\t" /* even 4 */
421           "dpa.w.ph         $ac1,             %[p1],          %[filter34]     \n\t" /* even 4 */
422           "dpa.w.ph         $ac1,             %[p5],          %[filter56]     \n\t" /* even 4 */
423           "dpa.w.ph         $ac1,             %[p2],          %[filter78]     \n\t" /* even 4 */
424           "extp             %[Temp1],         $ac1,           31              \n\t" /* even 4 */
425           "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* even 3 */
426 
427           /* even 5. pixel */
428           "mtlo             %[vector_64],     $ac3                            \n\t" /* even 6 */
429           "mthi             $zero,            $ac3                            \n\t"
430           "preceu.ph.qbr    %[p4],            %[qload2]                       \n\t"
431           "sb               %[st3],           0(%[dst])                       \n\t" /* even 3 */
432           "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
433           "dpa.w.ph         $ac2,             %[p1],          %[filter12]     \n\t" /* even 5 */
434           "dpa.w.ph         $ac2,             %[p5],          %[filter34]     \n\t" /* even 5 */
435           "dpa.w.ph         $ac2,             %[p2],          %[filter56]     \n\t" /* even 5 */
436           "dpa.w.ph         $ac2,             %[p3],          %[filter78]     \n\t" /* even 5 */
437           "extp             %[Temp2],         $ac2,           31              \n\t" /* even 5 */
438           "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 4 */
439 
440           /* even 6. pixel */
441           "mtlo             %[vector_64],     $ac1                            \n\t" /* even 7 */
442           "mthi             $zero,            $ac1                            \n\t"
443           "preceu.ph.qbl    %[p1],            %[qload2]                       \n\t"
444           "sb               %[st1],           0(%[dst])                       \n\t" /* even 4 */
445           "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
446           "ulw              %[qload1],        20(%[src])                      \n\t"
447           "dpa.w.ph         $ac3,             %[p5],          %[filter12]     \n\t" /* even 6 */
448           "dpa.w.ph         $ac3,             %[p2],          %[filter34]     \n\t" /* even 6 */
449           "dpa.w.ph         $ac3,             %[p3],          %[filter56]     \n\t" /* even 6 */
450           "dpa.w.ph         $ac3,             %[p4],          %[filter78]     \n\t" /* even 6 */
451           "extp             %[Temp3],         $ac3,           31              \n\t" /* even 6 */
452           "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 5 */
453 
454           /* even 7. pixel */
455           "mtlo             %[vector_64],     $ac2                            \n\t" /* even 8 */
456           "mthi             $zero,            $ac2                            \n\t"
457           "preceu.ph.qbr    %[p5],            %[qload1]                       \n\t"
458           "sb               %[st2],           0(%[dst])                       \n\t" /* even 5 */
459           "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
460           "dpa.w.ph         $ac1,             %[p2],          %[filter12]     \n\t" /* even 7 */
461           "dpa.w.ph         $ac1,             %[p3],          %[filter34]     \n\t" /* even 7 */
462           "dpa.w.ph         $ac1,             %[p4],          %[filter56]     \n\t" /* even 7 */
463           "dpa.w.ph         $ac1,             %[p1],          %[filter78]     \n\t" /* even 7 */
464           "extp             %[Temp1],         $ac1,           31              \n\t" /* even 7 */
465           "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* even 6 */
466 
467           /* even 8. pixel */
468           "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 1 */
469           "mthi             $zero,            $ac3                            \n\t"
470           "dpa.w.ph         $ac2,             %[p3],          %[filter12]     \n\t" /* even 8 */
471           "dpa.w.ph         $ac2,             %[p4],          %[filter34]     \n\t" /* even 8 */
472           "sb               %[st3],           0(%[dst])                       \n\t" /* even 6 */
473           "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
474           "dpa.w.ph         $ac2,             %[p1],          %[filter56]     \n\t" /* even 8 */
475           "dpa.w.ph         $ac2,             %[p5],          %[filter78]     \n\t" /* even 8 */
476           "extp             %[Temp2],         $ac2,           31              \n\t" /* even 8 */
477           "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 7 */
478 
479           /* ODD pixels */
480           "ulw              %[qload1],        1(%[src])                       \n\t"
481           "ulw              %[qload2],        5(%[src])                       \n\t"
482 
483           /* odd 1. pixel */
484           "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 2 */
485           "mthi             $zero,            $ac1                            \n\t"
486           "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
487           "preceu.ph.qbl    %[p2],            %[qload1]                       \n\t"
488           "preceu.ph.qbr    %[p3],            %[qload2]                       \n\t"
489           "preceu.ph.qbl    %[p4],            %[qload2]                       \n\t"
490           "sb               %[st1],           0(%[dst])                       \n\t" /* even 7 */
491           "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
492           "ulw              %[qload2],        9(%[src])                       \n\t"
493           "dpa.w.ph         $ac3,             %[p1],          %[filter12]     \n\t" /* odd 1 */
494           "dpa.w.ph         $ac3,             %[p2],          %[filter34]     \n\t" /* odd 1 */
495           "dpa.w.ph         $ac3,             %[p3],          %[filter56]     \n\t" /* odd 1 */
496           "dpa.w.ph         $ac3,             %[p4],          %[filter78]     \n\t" /* odd 1 */
497           "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 1 */
498           "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 8 */
499 
500           /* odd 2. pixel */
501           "mtlo             %[vector_64],     $ac2                            \n\t" /* odd 3 */
502           "mthi             $zero,            $ac2                            \n\t"
503           "preceu.ph.qbr    %[p1],            %[qload2]                       \n\t"
504           "preceu.ph.qbl    %[p5],            %[qload2]                       \n\t"
505           "sb               %[st2],           0(%[dst])                       \n\t" /* even 8 */
506           "ulw              %[qload1],        13(%[src])                      \n\t"
507           "dpa.w.ph         $ac1,             %[p2],          %[filter12]     \n\t" /* odd 2 */
508           "dpa.w.ph         $ac1,             %[p3],          %[filter34]     \n\t" /* odd 2 */
509           "dpa.w.ph         $ac1,             %[p4],          %[filter56]     \n\t" /* odd 2 */
510           "dpa.w.ph         $ac1,             %[p1],          %[filter78]     \n\t" /* odd 2 */
511           "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 2 */
512           "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 1 */
513 
514           /* odd 3. pixel */
515           "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 4 */
516           "mthi             $zero,            $ac3                            \n\t"
517           "preceu.ph.qbr    %[p2],            %[qload1]                       \n\t"
518           "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 1 */
519           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
520           "dpa.w.ph         $ac2,             %[p3],          %[filter12]     \n\t" /* odd 3 */
521           "dpa.w.ph         $ac2,             %[p4],          %[filter34]     \n\t" /* odd 3 */
522           "dpa.w.ph         $ac2,             %[p1],          %[filter56]     \n\t" /* odd 3 */
523           "dpa.w.ph         $ac2,             %[p5],          %[filter78]     \n\t" /* odd 3 */
524           "extp             %[Temp2],         $ac2,           31              \n\t" /* odd 3 */
525           "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 2 */
526 
527           /* odd 4. pixel */
528           "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 5 */
529           "mthi             $zero,            $ac1                            \n\t"
530           "preceu.ph.qbl    %[p3],            %[qload1]                       \n\t"
531           "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 2 */
532           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
533           "ulw              %[qload2],        17(%[src])                      \n\t"
534           "dpa.w.ph         $ac3,             %[p4],          %[filter12]     \n\t" /* odd 4 */
535           "dpa.w.ph         $ac3,             %[p1],          %[filter34]     \n\t" /* odd 4 */
536           "dpa.w.ph         $ac3,             %[p5],          %[filter56]     \n\t" /* odd 4 */
537           "dpa.w.ph         $ac3,             %[p2],          %[filter78]     \n\t" /* odd 4 */
538           "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 4 */
539           "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* odd 3 */
540 
541           /* odd 5. pixel */
542           "mtlo             %[vector_64],     $ac2                            \n\t" /* odd 6 */
543           "mthi             $zero,            $ac2                            \n\t"
544           "preceu.ph.qbr    %[p4],            %[qload2]                       \n\t"
545           "sb               %[st2],           0(%[odd_dst])                   \n\t" /* odd 3 */
546           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
547           "dpa.w.ph         $ac1,             %[p1],          %[filter12]     \n\t" /* odd 5 */
548           "dpa.w.ph         $ac1,             %[p5],          %[filter34]     \n\t" /* odd 5 */
549           "dpa.w.ph         $ac1,             %[p2],          %[filter56]     \n\t" /* odd 5 */
550           "dpa.w.ph         $ac1,             %[p3],          %[filter78]     \n\t" /* odd 5 */
551           "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 5 */
552           "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 4 */
553 
554           /* odd 6. pixel */
555           "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 7 */
556           "mthi             $zero,            $ac3                            \n\t"
557           "preceu.ph.qbl    %[p1],            %[qload2]                       \n\t"
558           "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 4 */
559           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
560           "ulw              %[qload1],        21(%[src])                      \n\t"
561           "dpa.w.ph         $ac2,             %[p5],          %[filter12]     \n\t" /* odd 6 */
562           "dpa.w.ph         $ac2,             %[p2],          %[filter34]     \n\t" /* odd 6 */
563           "dpa.w.ph         $ac2,             %[p3],          %[filter56]     \n\t" /* odd 6 */
564           "dpa.w.ph         $ac2,             %[p4],          %[filter78]     \n\t" /* odd 6 */
565           "extp             %[Temp2],         $ac2,           31              \n\t" /* odd 6 */
566           "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 5 */
567 
568           /* odd 7. pixel */
569           "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 8 */
570           "mthi             $zero,            $ac1                            \n\t"
571           "preceu.ph.qbr    %[p5],            %[qload1]                       \n\t"
572           "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 5 */
573           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
574           "dpa.w.ph         $ac3,             %[p2],          %[filter12]     \n\t" /* odd 7 */
575           "dpa.w.ph         $ac3,             %[p3],          %[filter34]     \n\t" /* odd 7 */
576           "dpa.w.ph         $ac3,             %[p4],          %[filter56]     \n\t" /* odd 7 */
577           "dpa.w.ph         $ac3,             %[p1],          %[filter78]     \n\t" /* odd 7 */
578           "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 7 */
579 
580           /* odd 8. pixel */
581           "dpa.w.ph         $ac1,             %[p3],          %[filter12]     \n\t" /* odd 8 */
582           "dpa.w.ph         $ac1,             %[p4],          %[filter34]     \n\t" /* odd 8 */
583           "dpa.w.ph         $ac1,             %[p1],          %[filter56]     \n\t" /* odd 8 */
584           "dpa.w.ph         $ac1,             %[p5],          %[filter78]     \n\t" /* odd 8 */
585           "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 8 */
586 
587           "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* odd 6 */
588           "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 7 */
589           "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 8 */
590 
591           "sb               %[st2],           0(%[odd_dst])                   \n\t" /* odd 6 */
592           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
593 
594           "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 7 */
595           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
596 
597           "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 8 */
598 
599           : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5),
600             [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
601             [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
602             [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
603             [dst] "+r" (dst), [odd_dst] "+r" (odd_dst)
604           : [filter12] "r" (filter12), [filter34] "r" (filter34),
605             [filter56] "r" (filter56), [filter78] "r" (filter78),
606             [vector_64] "r" (vector_64), [cm] "r" (cm),
607             [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
608       );
609 
610       src += 16;
611       dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
612       odd_dst = (dst + dst_stride);
613     }
614 
615     /* Next row... */
616     src_ptr += src_stride;
617 
618     dst_ptr += 1;
619   }
620 }
621 
convolve_horiz_64_transposed_dspr2(const uint8_t * src_ptr,int32_t src_stride,uint8_t * dst_ptr,int32_t dst_stride,const int16_t * filter_x0,int32_t h)622 static void convolve_horiz_64_transposed_dspr2(const uint8_t *src_ptr,
623                                                int32_t src_stride,
624                                                uint8_t *dst_ptr,
625                                                int32_t dst_stride,
626                                                const int16_t *filter_x0,
627                                                int32_t h) {
628   int32_t c, y;
629   const uint8_t *src;
630   uint8_t *dst;
631   uint8_t *cm = vp9_ff_cropTbl;
632   uint32_t vector_64 = 64;
633   int32_t  filter12, filter34, filter56, filter78;
634   int32_t  Temp1, Temp2, Temp3;
635   uint32_t qload1, qload2;
636   uint32_t p1, p2, p3, p4, p5;
637   uint32_t st1, st2, st3;
638   uint32_t dst_pitch_2 = (dst_stride << 1);
639   uint8_t  *odd_dst;
640 
641   filter12 = ((const int32_t *)filter_x0)[0];
642   filter34 = ((const int32_t *)filter_x0)[1];
643   filter56 = ((const int32_t *)filter_x0)[2];
644   filter78 = ((const int32_t *)filter_x0)[3];
645 
646   for (y = h; y--;) {
647     /* prefetch data to cache memory */
648     vp9_prefetch_load(src_ptr + src_stride);
649     vp9_prefetch_load(src_ptr + src_stride + 32);
650     vp9_prefetch_load(src_ptr + src_stride + 64);
651 
652     src = src_ptr;
653     dst = dst_ptr;
654 
655     odd_dst = (dst + dst_stride);
656 
657     for (c = 0; c < 4; c++) {
658       __asm__ __volatile__ (
659           "ulw              %[qload1],        0(%[src])                       \n\t"
660           "ulw              %[qload2],        4(%[src])                       \n\t"
661 
662           /* even 1. pixel */
663           "mtlo             %[vector_64],     $ac1                            \n\t" /* even 1 */
664           "mthi             $zero,            $ac1                            \n\t"
665           "mtlo             %[vector_64],     $ac2                            \n\t" /* even 2 */
666           "mthi             $zero,            $ac2                            \n\t"
667           "preceu.ph.qbr    %[p3],            %[qload2]                       \n\t"
668           "preceu.ph.qbl    %[p4],            %[qload2]                       \n\t"
669           "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
670           "preceu.ph.qbl    %[p2],            %[qload1]                       \n\t"
671           "ulw              %[qload2],        8(%[src])                       \n\t"
672           "dpa.w.ph         $ac1,             %[p1],          %[filter12]     \n\t" /* even 1 */
673           "dpa.w.ph         $ac1,             %[p2],          %[filter34]     \n\t" /* even 1 */
674           "dpa.w.ph         $ac1,             %[p3],          %[filter56]     \n\t" /* even 1 */
675           "dpa.w.ph         $ac1,             %[p4],          %[filter78]     \n\t" /* even 1 */
676           "extp             %[Temp1],         $ac1,           31              \n\t" /* even 1 */
677 
678           /* even 2. pixel */
679           "mtlo             %[vector_64],     $ac3                            \n\t" /* even 3 */
680           "mthi             $zero,            $ac3                            \n\t"
681           "preceu.ph.qbr    %[p1],            %[qload2]                       \n\t"
682           "preceu.ph.qbl    %[p5],            %[qload2]                       \n\t"
683           "ulw              %[qload1],        12(%[src])                      \n\t"
684           "dpa.w.ph         $ac2,             %[p2],          %[filter12]     \n\t" /* even 1 */
685           "dpa.w.ph         $ac2,             %[p3],          %[filter34]     \n\t" /* even 1 */
686           "dpa.w.ph         $ac2,             %[p4],          %[filter56]     \n\t" /* even 1 */
687           "dpa.w.ph         $ac2,             %[p1],          %[filter78]     \n\t" /* even 1 */
688           "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 1 */
689           "extp             %[Temp2],         $ac2,           31              \n\t" /* even 1 */
690 
691           /* even 3. pixel */
692           "mtlo             %[vector_64],     $ac1                            \n\t" /* even 4 */
693           "mthi             $zero,            $ac1                            \n\t"
694           "preceu.ph.qbr    %[p2],            %[qload1]                       \n\t"
695           "sb               %[st1],           0(%[dst])                       \n\t" /* even 1 */
696           "addu             %[dst],           %[dst],         %[dst_pitch_2]             \n\t"
697           "dpa.w.ph         $ac3,             %[p3],          %[filter12]     \n\t" /* even 3 */
698           "dpa.w.ph         $ac3,             %[p4],          %[filter34]     \n\t" /* even 3 */
699           "dpa.w.ph         $ac3,             %[p1],          %[filter56]     \n\t" /* even 3 */
700           "dpa.w.ph         $ac3,             %[p5],          %[filter78]     \n\t" /* even 3 */
701           "extp             %[Temp3],         $ac3,           31              \n\t" /* even 3 */
702           "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 1 */
703 
704           /* even 4. pixel */
705           "mtlo             %[vector_64],     $ac2                            \n\t" /* even 5 */
706           "mthi             $zero,            $ac2                            \n\t"
707           "preceu.ph.qbl    %[p3],            %[qload1]                       \n\t"
708           "sb               %[st2],           0(%[dst])                       \n\t" /* even 2 */
709           "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
710           "ulw              %[qload2],        16(%[src])                      \n\t"
711           "dpa.w.ph         $ac1,             %[p4],          %[filter12]     \n\t" /* even 4 */
712           "dpa.w.ph         $ac1,             %[p1],          %[filter34]     \n\t" /* even 4 */
713           "dpa.w.ph         $ac1,             %[p5],          %[filter56]     \n\t" /* even 4 */
714           "dpa.w.ph         $ac1,             %[p2],          %[filter78]     \n\t" /* even 4 */
715           "extp             %[Temp1],         $ac1,           31              \n\t" /* even 4 */
716           "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* even 3 */
717 
718           /* even 5. pixel */
719           "mtlo             %[vector_64],     $ac3                            \n\t" /* even 6 */
720           "mthi             $zero,            $ac3                            \n\t"
721           "preceu.ph.qbr    %[p4],            %[qload2]                       \n\t"
722           "sb               %[st3],           0(%[dst])                       \n\t" /* even 3 */
723           "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
724           "dpa.w.ph         $ac2,             %[p1],          %[filter12]     \n\t" /* even 5 */
725           "dpa.w.ph         $ac2,             %[p5],          %[filter34]     \n\t" /* even 5 */
726           "dpa.w.ph         $ac2,             %[p2],          %[filter56]     \n\t" /* even 5 */
727           "dpa.w.ph         $ac2,             %[p3],          %[filter78]     \n\t" /* even 5 */
728           "extp             %[Temp2],         $ac2,           31              \n\t" /* even 5 */
729           "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 4 */
730 
731           /* even 6. pixel */
732           "mtlo             %[vector_64],     $ac1                            \n\t" /* even 7 */
733           "mthi             $zero,            $ac1                            \n\t"
734           "preceu.ph.qbl    %[p1],            %[qload2]                       \n\t"
735           "sb               %[st1],           0(%[dst])                       \n\t" /* even 4 */
736           "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
737           "ulw              %[qload1],        20(%[src])                      \n\t"
738           "dpa.w.ph         $ac3,             %[p5],          %[filter12]     \n\t" /* even 6 */
739           "dpa.w.ph         $ac3,             %[p2],          %[filter34]     \n\t" /* even 6 */
740           "dpa.w.ph         $ac3,             %[p3],          %[filter56]     \n\t" /* even 6 */
741           "dpa.w.ph         $ac3,             %[p4],          %[filter78]     \n\t" /* even 6 */
742           "extp             %[Temp3],         $ac3,           31              \n\t" /* even 6 */
743           "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 5 */
744 
745           /* even 7. pixel */
746           "mtlo             %[vector_64],     $ac2                            \n\t" /* even 8 */
747           "mthi             $zero,            $ac2                            \n\t"
748           "preceu.ph.qbr    %[p5],            %[qload1]                       \n\t"
749           "sb               %[st2],           0(%[dst])                       \n\t" /* even 5 */
750           "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
751           "dpa.w.ph         $ac1,             %[p2],          %[filter12]     \n\t" /* even 7 */
752           "dpa.w.ph         $ac1,             %[p3],          %[filter34]     \n\t" /* even 7 */
753           "dpa.w.ph         $ac1,             %[p4],          %[filter56]     \n\t" /* even 7 */
754           "dpa.w.ph         $ac1,             %[p1],          %[filter78]     \n\t" /* even 7 */
755           "extp             %[Temp1],         $ac1,           31              \n\t" /* even 7 */
756           "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* even 6 */
757 
758           /* even 8. pixel */
759           "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 1 */
760           "mthi             $zero,            $ac3                            \n\t"
761           "dpa.w.ph         $ac2,             %[p3],          %[filter12]     \n\t" /* even 8 */
762           "dpa.w.ph         $ac2,             %[p4],          %[filter34]     \n\t" /* even 8 */
763           "sb               %[st3],           0(%[dst])                       \n\t" /* even 6 */
764           "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
765           "dpa.w.ph         $ac2,             %[p1],          %[filter56]     \n\t" /* even 8 */
766           "dpa.w.ph         $ac2,             %[p5],          %[filter78]     \n\t" /* even 8 */
767           "extp             %[Temp2],         $ac2,           31              \n\t" /* even 8 */
768           "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 7 */
769 
770           /* ODD pixels */
771           "ulw              %[qload1],        1(%[src])                       \n\t"
772           "ulw              %[qload2],        5(%[src])                       \n\t"
773 
774           /* odd 1. pixel */
775           "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 2 */
776           "mthi             $zero,            $ac1                            \n\t"
777           "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
778           "preceu.ph.qbl    %[p2],            %[qload1]                       \n\t"
779           "preceu.ph.qbr    %[p3],            %[qload2]                       \n\t"
780           "preceu.ph.qbl    %[p4],            %[qload2]                       \n\t"
781           "sb               %[st1],           0(%[dst])                       \n\t" /* even 7 */
782           "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
783           "ulw              %[qload2],        9(%[src])                       \n\t"
784           "dpa.w.ph         $ac3,             %[p1],          %[filter12]     \n\t" /* odd 1 */
785           "dpa.w.ph         $ac3,             %[p2],          %[filter34]     \n\t" /* odd 1 */
786           "dpa.w.ph         $ac3,             %[p3],          %[filter56]     \n\t" /* odd 1 */
787           "dpa.w.ph         $ac3,             %[p4],          %[filter78]     \n\t" /* odd 1 */
788           "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 1 */
789           "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 8 */
790 
791           /* odd 2. pixel */
792           "mtlo             %[vector_64],     $ac2                            \n\t" /* odd 3 */
793           "mthi             $zero,            $ac2                            \n\t"
794           "preceu.ph.qbr    %[p1],            %[qload2]                       \n\t"
795           "preceu.ph.qbl    %[p5],            %[qload2]                       \n\t"
796           "sb               %[st2],           0(%[dst])                       \n\t" /* even 8 */
797           "ulw              %[qload1],        13(%[src])                      \n\t"
798           "dpa.w.ph         $ac1,             %[p2],          %[filter12]     \n\t" /* odd 2 */
799           "dpa.w.ph         $ac1,             %[p3],          %[filter34]     \n\t" /* odd 2 */
800           "dpa.w.ph         $ac1,             %[p4],          %[filter56]     \n\t" /* odd 2 */
801           "dpa.w.ph         $ac1,             %[p1],          %[filter78]     \n\t" /* odd 2 */
802           "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 2 */
803           "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 1 */
804 
805           /* odd 3. pixel */
806           "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 4 */
807           "mthi             $zero,            $ac3                            \n\t"
808           "preceu.ph.qbr    %[p2],            %[qload1]                       \n\t"
809           "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 1 */
810           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
811           "dpa.w.ph         $ac2,             %[p3],          %[filter12]     \n\t" /* odd 3 */
812           "dpa.w.ph         $ac2,             %[p4],          %[filter34]     \n\t" /* odd 3 */
813           "dpa.w.ph         $ac2,             %[p1],          %[filter56]     \n\t" /* odd 3 */
814           "dpa.w.ph         $ac2,             %[p5],          %[filter78]     \n\t" /* odd 3 */
815           "extp             %[Temp2],         $ac2,           31              \n\t" /* odd 3 */
816           "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 2 */
817 
818           /* odd 4. pixel */
819           "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 5 */
820           "mthi             $zero,            $ac1                            \n\t"
821           "preceu.ph.qbl    %[p3],            %[qload1]                       \n\t"
822           "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 2 */
823           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
824           "ulw              %[qload2],        17(%[src])                      \n\t"
825           "dpa.w.ph         $ac3,             %[p4],          %[filter12]     \n\t" /* odd 4 */
826           "dpa.w.ph         $ac3,             %[p1],          %[filter34]     \n\t" /* odd 4 */
827           "dpa.w.ph         $ac3,             %[p5],          %[filter56]     \n\t" /* odd 4 */
828           "dpa.w.ph         $ac3,             %[p2],          %[filter78]     \n\t" /* odd 4 */
829           "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 4 */
830           "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* odd 3 */
831 
832           /* odd 5. pixel */
833           "mtlo             %[vector_64],     $ac2                            \n\t" /* odd 6 */
834           "mthi             $zero,            $ac2                            \n\t"
835           "preceu.ph.qbr    %[p4],            %[qload2]                       \n\t"
836           "sb               %[st2],           0(%[odd_dst])                   \n\t" /* odd 3 */
837           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
838           "dpa.w.ph         $ac1,             %[p1],          %[filter12]     \n\t" /* odd 5 */
839           "dpa.w.ph         $ac1,             %[p5],          %[filter34]     \n\t" /* odd 5 */
840           "dpa.w.ph         $ac1,             %[p2],          %[filter56]     \n\t" /* odd 5 */
841           "dpa.w.ph         $ac1,             %[p3],          %[filter78]     \n\t" /* odd 5 */
842           "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 5 */
843           "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 4 */
844 
845           /* odd 6. pixel */
846           "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 7 */
847           "mthi             $zero,            $ac3                            \n\t"
848           "preceu.ph.qbl    %[p1],            %[qload2]                       \n\t"
849           "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 4 */
850           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
851           "ulw              %[qload1],        21(%[src])                      \n\t"
852           "dpa.w.ph         $ac2,             %[p5],          %[filter12]     \n\t" /* odd 6 */
853           "dpa.w.ph         $ac2,             %[p2],          %[filter34]     \n\t" /* odd 6 */
854           "dpa.w.ph         $ac2,             %[p3],          %[filter56]     \n\t" /* odd 6 */
855           "dpa.w.ph         $ac2,             %[p4],          %[filter78]     \n\t" /* odd 6 */
856           "extp             %[Temp2],         $ac2,           31              \n\t" /* odd 6 */
857           "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 5 */
858 
859           /* odd 7. pixel */
860           "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 8 */
861           "mthi             $zero,            $ac1                            \n\t"
862           "preceu.ph.qbr    %[p5],            %[qload1]                       \n\t"
863           "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 5 */
864           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
865           "dpa.w.ph         $ac3,             %[p2],          %[filter12]     \n\t" /* odd 7 */
866           "dpa.w.ph         $ac3,             %[p3],          %[filter34]     \n\t" /* odd 7 */
867           "dpa.w.ph         $ac3,             %[p4],          %[filter56]     \n\t" /* odd 7 */
868           "dpa.w.ph         $ac3,             %[p1],          %[filter78]     \n\t" /* odd 7 */
869           "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 7 */
870 
871           /* odd 8. pixel */
872           "dpa.w.ph         $ac1,             %[p3],          %[filter12]     \n\t" /* odd 8 */
873           "dpa.w.ph         $ac1,             %[p4],          %[filter34]     \n\t" /* odd 8 */
874           "dpa.w.ph         $ac1,             %[p1],          %[filter56]     \n\t" /* odd 8 */
875           "dpa.w.ph         $ac1,             %[p5],          %[filter78]     \n\t" /* odd 8 */
876           "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 8 */
877 
878           "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* odd 6 */
879           "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 7 */
880           "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 8 */
881 
882           "sb               %[st2],           0(%[odd_dst])                   \n\t" /* odd 6 */
883           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
884 
885           "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 7 */
886           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
887 
888           "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 8 */
889 
890           : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5),
891             [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
892             [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
893             [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
894             [dst] "+r" (dst), [odd_dst] "+r" (odd_dst)
895           : [filter12] "r" (filter12), [filter34] "r" (filter34),
896             [filter56] "r" (filter56), [filter78] "r" (filter78),
897             [vector_64] "r" (vector_64), [cm] "r" (cm),
898             [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
899       );
900 
901       src += 16;
902       dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
903       odd_dst = (dst + dst_stride);
904     }
905 
906     /* Next row... */
907     src_ptr += src_stride;
908 
909     dst_ptr += 1;
910   }
911 }
912 
convolve_horiz_transposed(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter,int w,int h)913 void convolve_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
914                                uint8_t *dst, ptrdiff_t dst_stride,
915                                const int16_t *filter, int w, int h) {
916   int x, y, k;
917 
918   for (y = 0; y < h; ++y) {
919     for (x = 0; x < w; ++x) {
920       int sum = 0;
921 
922       for (k = 0; k < 8; ++k)
923         sum += src[x + k] * filter[k];
924 
925       dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
926     }
927 
928     src += src_stride;
929     dst += 1;
930   }
931 }
932 
copy_horiz_transposed(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,int w,int h)933 void copy_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
934                            uint8_t *dst, ptrdiff_t dst_stride,
935                            int w, int h) {
936   int x, y;
937 
938   for (y = 0; y < h; ++y) {
939     for (x = 0; x < w; ++x) {
940       dst[x * dst_stride] = src[x];
941     }
942 
943     src += src_stride;
944     dst += 1;
945   }
946 }
947 
vp9_convolve8_dspr2(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h)948 void vp9_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride,
949                          uint8_t *dst, ptrdiff_t dst_stride,
950                          const int16_t *filter_x, int x_step_q4,
951                          const int16_t *filter_y, int y_step_q4,
952                          int w, int h) {
953   DECLARE_ALIGNED_ARRAY(32, uint8_t, temp, 64 * 135);
954   int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7;
955   uint32_t pos = 38;
956 
957   /* bit positon for extract from acc */
958   __asm__ __volatile__ (
959     "wrdsp      %[pos],     1           \n\t"
960     :
961     : [pos] "r" (pos)
962   );
963 
964   if (intermediate_height < h)
965     intermediate_height = h;
966 
967   if (x_step_q4 != 16 || y_step_q4 != 16)
968     return vp9_convolve8_c(src, src_stride,
969                            dst, dst_stride,
970                            filter_x, x_step_q4,
971                            filter_y, y_step_q4,
972                            w, h);
973 
974   if ((((const int32_t *)filter_x)[1] == 0x800000)
975       && (((const int32_t *)filter_y)[1] == 0x800000))
976     return vp9_convolve_copy(src, src_stride,
977                              dst, dst_stride,
978                              filter_x, x_step_q4,
979                              filter_y, y_step_q4,
980                              w, h);
981 
982   /* copy the src to dst */
983   if (filter_x[3] == 0x80) {
984     copy_horiz_transposed(src - src_stride * 3, src_stride,
985                           temp, intermediate_height,
986                           w, intermediate_height);
987   } else if (((const int32_t *)filter_x)[0] == 0) {
988     vp9_convolve2_dspr2(src - src_stride * 3, src_stride,
989                         temp, intermediate_height,
990                         filter_x,
991                         w, intermediate_height);
992   } else {
993     src -= (src_stride * 3 + 3);
994 
995     /* prefetch data to cache memory */
996     vp9_prefetch_load(src);
997     vp9_prefetch_load(src + 32);
998 
999     switch (w) {
1000       case 4:
1001         convolve_horiz_4_transposed_dspr2(src, src_stride,
1002                                           temp, intermediate_height,
1003                                           filter_x, intermediate_height);
1004         break;
1005       case 8:
1006         convolve_horiz_8_transposed_dspr2(src, src_stride,
1007                                           temp, intermediate_height,
1008                                           filter_x, intermediate_height);
1009         break;
1010       case 16:
1011       case 32:
1012         convolve_horiz_16_transposed_dspr2(src, src_stride,
1013                                            temp, intermediate_height,
1014                                            filter_x, intermediate_height,
1015                                            (w/16));
1016         break;
1017       case 64:
1018         vp9_prefetch_load(src + 32);
1019         convolve_horiz_64_transposed_dspr2(src, src_stride,
1020                                            temp, intermediate_height,
1021                                            filter_x, intermediate_height);
1022         break;
1023       default:
1024         convolve_horiz_transposed(src, src_stride,
1025                                   temp, intermediate_height,
1026                                   filter_x, w, intermediate_height);
1027         break;
1028     }
1029   }
1030 
1031   /* copy the src to dst */
1032   if (filter_y[3] == 0x80) {
1033     copy_horiz_transposed(temp + 3, intermediate_height,
1034                           dst, dst_stride,
1035                           h, w);
1036   } else if (((const int32_t *)filter_y)[0] == 0) {
1037     vp9_convolve2_dspr2(temp + 3, intermediate_height,
1038                         dst, dst_stride,
1039                         filter_y,
1040                         h, w);
1041   } else {
1042     switch (h) {
1043       case 4:
1044         convolve_horiz_4_transposed_dspr2(temp, intermediate_height,
1045                                           dst, dst_stride,
1046                                           filter_y, w);
1047         break;
1048       case 8:
1049         convolve_horiz_8_transposed_dspr2(temp, intermediate_height,
1050                                           dst, dst_stride,
1051                                           filter_y, w);
1052         break;
1053       case 16:
1054       case 32:
1055         convolve_horiz_16_transposed_dspr2(temp, intermediate_height,
1056                                            dst, dst_stride,
1057                                            filter_y, w, (h/16));
1058         break;
1059       case 64:
1060         convolve_horiz_64_transposed_dspr2(temp, intermediate_height,
1061                                            dst, dst_stride,
1062                                            filter_y, w);
1063         break;
1064       default:
1065         convolve_horiz_transposed(temp, intermediate_height,
1066                                   dst, dst_stride,
1067                                   filter_y, h, w);
1068         break;
1069     }
1070   }
1071 }
1072 
/*
 * Copies a w x h block of bytes from src to dst.
 *
 * Widths 4, 8, 16, 32 and 64 are copied one row at a time with word loads
 * (ulw) and word stores (sw) in inline assembly, while the next source and
 * destination rows are prefetched.  Any other width falls back to a
 * byte-by-byte loop.
 *
 * filter_x, filter_x_stride, filter_y and filter_y_stride are not used; they
 * are only present in the signature.
 *
 * Each asm statement reads memory through src and writes memory through dst
 * without naming those objects as operands, so it carries a "memory" clobber
 * to keep the compiler from caching or reordering ordinary loads and stores
 * across it.
 *
 * NOTE(review): source words are read with ulw but destination words are
 * written with plain sw, so dst rows are presumably expected to be 4-byte
 * aligned for the word-copy widths -- confirm against the callers.
 */
void vp9_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x, int filter_x_stride,
                             const int16_t *filter_y, int filter_y_stride,
                             int w, int h) {
  int x, y;

  /* prefetch data to cache memory */
  vp9_prefetch_load(src);
  vp9_prefetch_load(src + 32);
  vp9_prefetch_store(dst);

  switch (w) {
    case 4:
      {
        uint32_t tp1;

        /* 1 word storage */
        for (y = h; y--; ) {
          /* Prefetch the next row while this one is copied. */
          vp9_prefetch_load(src + src_stride);
          vp9_prefetch_load(src + src_stride + 32);
          vp9_prefetch_store(dst + dst_stride);

          __asm__ __volatile__ (
              "ulw              %[tp1],         (%[src])      \n\t"
              "sw               %[tp1],         (%[dst])      \n\t"  /* store */

              : [tp1] "=&r" (tp1)
              : [src] "r" (src), [dst] "r" (dst)
              : "memory"
          );

          src += src_stride;
          dst += dst_stride;
        }
      }
      break;
    case 8:
      {
        uint32_t tp1, tp2;

        /* 2 word storage */
        for (y = h; y--; ) {
          /* Prefetch the next row while this one is copied. */
          vp9_prefetch_load(src + src_stride);
          vp9_prefetch_load(src + src_stride + 32);
          vp9_prefetch_store(dst + dst_stride);

          __asm__ __volatile__ (
              "ulw              %[tp1],         0(%[src])      \n\t"
              "ulw              %[tp2],         4(%[src])      \n\t"
              "sw               %[tp1],         0(%[dst])      \n\t"  /* store */
              "sw               %[tp2],         4(%[dst])      \n\t"  /* store */

              : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2)
              : [src] "r" (src), [dst] "r" (dst)
              : "memory"
          );

          src += src_stride;
          dst += dst_stride;
        }
      }
      break;
    case 16:
      {
        uint32_t tp1, tp2, tp3, tp4;

        /* 4 word storage */
        for (y = h; y--; ) {
          /* Prefetch the next row while this one is copied. */
          vp9_prefetch_load(src + src_stride);
          vp9_prefetch_load(src + src_stride + 32);
          vp9_prefetch_store(dst + dst_stride);

          __asm__ __volatile__ (
              "ulw              %[tp1],         0(%[src])      \n\t"
              "ulw              %[tp2],         4(%[src])      \n\t"
              "ulw              %[tp3],         8(%[src])      \n\t"
              "ulw              %[tp4],         12(%[src])     \n\t"

              "sw               %[tp1],         0(%[dst])      \n\t"  /* store */
              "sw               %[tp2],         4(%[dst])      \n\t"  /* store */
              "sw               %[tp3],         8(%[dst])      \n\t"  /* store */
              "sw               %[tp4],         12(%[dst])     \n\t"  /* store */

              : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
                [tp3] "=&r" (tp3), [tp4] "=&r" (tp4)
              : [src] "r" (src), [dst] "r" (dst)
              : "memory"
          );

          src += src_stride;
          dst += dst_stride;
        }
      }
      break;
    case 32:
      {
        uint32_t tp1, tp2, tp3, tp4;
        uint32_t tp5, tp6, tp7, tp8;

        /* 8 word storage */
        for (y = h; y--; ) {
          /* Prefetch the next row while this one is copied. */
          vp9_prefetch_load(src + src_stride);
          vp9_prefetch_load(src + src_stride + 32);
          vp9_prefetch_store(dst + dst_stride);

          __asm__ __volatile__ (
              "ulw              %[tp1],         0(%[src])      \n\t"
              "ulw              %[tp2],         4(%[src])      \n\t"
              "ulw              %[tp3],         8(%[src])      \n\t"
              "ulw              %[tp4],         12(%[src])     \n\t"
              "ulw              %[tp5],         16(%[src])     \n\t"
              "ulw              %[tp6],         20(%[src])     \n\t"
              "ulw              %[tp7],         24(%[src])     \n\t"
              "ulw              %[tp8],         28(%[src])     \n\t"

              "sw               %[tp1],         0(%[dst])      \n\t"  /* store */
              "sw               %[tp2],         4(%[dst])      \n\t"  /* store */
              "sw               %[tp3],         8(%[dst])      \n\t"  /* store */
              "sw               %[tp4],         12(%[dst])     \n\t"  /* store */
              "sw               %[tp5],         16(%[dst])     \n\t"  /* store */
              "sw               %[tp6],         20(%[dst])     \n\t"  /* store */
              "sw               %[tp7],         24(%[dst])     \n\t"  /* store */
              "sw               %[tp8],         28(%[dst])     \n\t"  /* store */

              : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
                [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
                [tp5] "=&r" (tp5), [tp6] "=&r" (tp6),
                [tp7] "=&r" (tp7), [tp8] "=&r" (tp8)
              : [src] "r" (src), [dst] "r" (dst)
              : "memory"
          );

          src += src_stride;
          dst += dst_stride;
        }
      }
      break;
    case 64:
      {
        uint32_t tp1, tp2, tp3, tp4;
        uint32_t tp5, tp6, tp7, tp8;

        /* A 64-byte row reaches further than the prefetches issued above. */
        vp9_prefetch_load(src + 64);
        vp9_prefetch_store(dst + 32);

        /* 16 word storage */
        for (y = h; y--; ) {
          /* Prefetch the next row while this one is copied. */
          vp9_prefetch_load(src + src_stride);
          vp9_prefetch_load(src + src_stride + 32);
          vp9_prefetch_load(src + src_stride + 64);
          vp9_prefetch_store(dst + dst_stride);
          vp9_prefetch_store(dst + dst_stride + 32);

          /* Two batches of 8 words, reusing the same 8 temporaries. */
          __asm__ __volatile__ (
              "ulw              %[tp1],         0(%[src])      \n\t"
              "ulw              %[tp2],         4(%[src])      \n\t"
              "ulw              %[tp3],         8(%[src])      \n\t"
              "ulw              %[tp4],         12(%[src])     \n\t"
              "ulw              %[tp5],         16(%[src])     \n\t"
              "ulw              %[tp6],         20(%[src])     \n\t"
              "ulw              %[tp7],         24(%[src])     \n\t"
              "ulw              %[tp8],         28(%[src])     \n\t"

              "sw               %[tp1],         0(%[dst])      \n\t"  /* store */
              "sw               %[tp2],         4(%[dst])      \n\t"  /* store */
              "sw               %[tp3],         8(%[dst])      \n\t"  /* store */
              "sw               %[tp4],         12(%[dst])     \n\t"  /* store */
              "sw               %[tp5],         16(%[dst])     \n\t"  /* store */
              "sw               %[tp6],         20(%[dst])     \n\t"  /* store */
              "sw               %[tp7],         24(%[dst])     \n\t"  /* store */
              "sw               %[tp8],         28(%[dst])     \n\t"  /* store */

              "ulw              %[tp1],         32(%[src])     \n\t"
              "ulw              %[tp2],         36(%[src])     \n\t"
              "ulw              %[tp3],         40(%[src])     \n\t"
              "ulw              %[tp4],         44(%[src])     \n\t"
              "ulw              %[tp5],         48(%[src])     \n\t"
              "ulw              %[tp6],         52(%[src])     \n\t"
              "ulw              %[tp7],         56(%[src])     \n\t"
              "ulw              %[tp8],         60(%[src])     \n\t"

              "sw               %[tp1],         32(%[dst])     \n\t"  /* store */
              "sw               %[tp2],         36(%[dst])     \n\t"  /* store */
              "sw               %[tp3],         40(%[dst])     \n\t"  /* store */
              "sw               %[tp4],         44(%[dst])     \n\t"  /* store */
              "sw               %[tp5],         48(%[dst])     \n\t"  /* store */
              "sw               %[tp6],         52(%[dst])     \n\t"  /* store */
              "sw               %[tp7],         56(%[dst])     \n\t"  /* store */
              "sw               %[tp8],         60(%[dst])     \n\t"  /* store */

              : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
                [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
                [tp5] "=&r" (tp5), [tp6] "=&r" (tp6),
                [tp7] "=&r" (tp7), [tp8] "=&r" (tp8)
              : [src] "r" (src), [dst] "r" (dst)
              : "memory"
          );

          src += src_stride;
          dst += dst_stride;
        }
      }
      break;
    default:
      /* Any other width: plain byte copy, one row at a time. */
      for (y = h; y--; ) {
        for (x = 0; x < w; ++x) {
          dst[x] = src[x];
        }

        src += src_stride;
        dst += dst_stride;
      }
      break;
  }
}
1284 #endif
1285