1 /*
2  *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <assert.h>
12 #include <stdio.h>
13 
14 #include "./vpx_config.h"
15 #include "./vp9_rtcd.h"
16 #include "vp9/common/vp9_common.h"
17 #include "vpx/vpx_integer.h"
18 #include "vpx_ports/mem.h"
19 #include "vp9/common/vp9_convolve.h"
20 #include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
21 
22 #if HAVE_DSPR2
convolve_horiz_4_dspr2(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int16_t * filter_x0,int32_t h)23 static void convolve_horiz_4_dspr2(const uint8_t *src,
24                                    int32_t src_stride,
25                                    uint8_t *dst,
26                                    int32_t dst_stride,
27                                    const int16_t *filter_x0,
28                                    int32_t h) {
29   int32_t y;
30   uint8_t *cm = vp9_ff_cropTbl;
31   int32_t vector1b, vector2b, vector3b, vector4b;
32   int32_t Temp1, Temp2, Temp3, Temp4;
33   uint32_t vector4a = 64;
34   uint32_t tp1, tp2;
35   uint32_t p1, p2, p3, p4;
36   uint32_t n1, n2, n3, n4;
37   uint32_t tn1, tn2;
38 
39   vector1b = ((const int32_t *)filter_x0)[0];
40   vector2b = ((const int32_t *)filter_x0)[1];
41   vector3b = ((const int32_t *)filter_x0)[2];
42   vector4b = ((const int32_t *)filter_x0)[3];
43 
44   for (y = h; y--;) {
45     /* prefetch data to cache memory */
46     vp9_prefetch_load(src + src_stride);
47     vp9_prefetch_load(src + src_stride + 32);
48     vp9_prefetch_store(dst + dst_stride);
49 
50     __asm__ __volatile__ (
51         "ulw              %[tp1],      0(%[src])                      \n\t"
52         "ulw              %[tp2],      4(%[src])                      \n\t"
53 
54         /* even 1. pixel */
55         "mtlo             %[vector4a], $ac3                           \n\t"
56         "mthi             $zero,       $ac3                           \n\t"
57         "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
58         "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
59         "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
60         "preceu.ph.qbl    %[p4],       %[tp2]                         \n\t"
61         "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
62         "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
63         "dpa.w.ph         $ac3,        %[p3],          %[vector3b]    \n\t"
64         "ulw              %[tn2],      8(%[src])                      \n\t"
65         "dpa.w.ph         $ac3,        %[p4],          %[vector4b]    \n\t"
66         "extp             %[Temp1],    $ac3,           31             \n\t"
67 
68         /* even 2. pixel */
69         "mtlo             %[vector4a], $ac2                           \n\t"
70         "mthi             $zero,       $ac2                           \n\t"
71         "preceu.ph.qbr    %[p1],       %[tn2]                         \n\t"
72         "balign           %[tn1],      %[tn2],         3              \n\t"
73         "balign           %[tn2],      %[tp2],         3              \n\t"
74         "balign           %[tp2],      %[tp1],         3              \n\t"
75         "dpa.w.ph         $ac2,        %[p2],          %[vector1b]    \n\t"
76         "dpa.w.ph         $ac2,        %[p3],          %[vector2b]    \n\t"
77         "dpa.w.ph         $ac2,        %[p4],          %[vector3b]    \n\t"
78         "dpa.w.ph         $ac2,        %[p1],          %[vector4b]    \n\t"
79         "extp             %[Temp3],    $ac2,           31             \n\t"
80 
81         /* odd 1. pixel */
82         "lbux             %[tp1],      %[Temp1](%[cm])                \n\t"
83         "mtlo             %[vector4a], $ac3                           \n\t"
84         "mthi             $zero,       $ac3                           \n\t"
85         "preceu.ph.qbr    %[n1],       %[tp2]                         \n\t"
86         "preceu.ph.qbl    %[n2],       %[tp2]                         \n\t"
87         "preceu.ph.qbr    %[n3],       %[tn2]                         \n\t"
88         "preceu.ph.qbl    %[n4],       %[tn2]                         \n\t"
89         "dpa.w.ph         $ac3,        %[n1],          %[vector1b]    \n\t"
90         "dpa.w.ph         $ac3,        %[n2],          %[vector2b]    \n\t"
91         "dpa.w.ph         $ac3,        %[n3],          %[vector3b]    \n\t"
92         "dpa.w.ph         $ac3,        %[n4],          %[vector4b]    \n\t"
93         "extp             %[Temp2],    $ac3,           31             \n\t"
94 
95         /* odd 2. pixel */
96         "lbux             %[tp2],      %[Temp3](%[cm])                \n\t"
97         "mtlo             %[vector4a], $ac2                           \n\t"
98         "mthi             $zero,       $ac2                           \n\t"
99         "preceu.ph.qbr    %[n1],       %[tn1]                         \n\t"
100         "dpa.w.ph         $ac2,        %[n2],          %[vector1b]    \n\t"
101         "dpa.w.ph         $ac2,        %[n3],          %[vector2b]    \n\t"
102         "dpa.w.ph         $ac2,        %[n4],          %[vector3b]    \n\t"
103         "dpa.w.ph         $ac2,        %[n1],          %[vector4b]    \n\t"
104         "extp             %[Temp4],    $ac2,           31             \n\t"
105 
106         /* clamp */
107         "lbux             %[tn1],      %[Temp2](%[cm])                \n\t"
108         "lbux             %[n2],       %[Temp4](%[cm])                \n\t"
109 
110         /* store bytes */
111         "sb               %[tp1],      0(%[dst])                      \n\t"
112         "sb               %[tn1],      1(%[dst])                      \n\t"
113         "sb               %[tp2],      2(%[dst])                      \n\t"
114         "sb               %[n2],       3(%[dst])                      \n\t"
115 
116         : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
117           [tn1] "=&r" (tn1), [tn2] "=&r" (tn2),
118           [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
119           [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3), [n4] "=&r" (n4),
120           [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
121           [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
122         : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
123           [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
124           [vector4a] "r" (vector4a),
125           [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
126     );
127 
128     /* Next row... */
129     src += src_stride;
130     dst += dst_stride;
131   }
132 }
133 
convolve_horiz_8_dspr2(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int16_t * filter_x0,int32_t h)134 static void convolve_horiz_8_dspr2(const uint8_t *src,
135                                    int32_t src_stride,
136                                    uint8_t *dst,
137                                    int32_t dst_stride,
138                                    const int16_t *filter_x0,
139                                    int32_t h) {
140   int32_t y;
141   uint8_t *cm = vp9_ff_cropTbl;
142   uint32_t vector4a = 64;
143   int32_t vector1b, vector2b, vector3b, vector4b;
144   int32_t Temp1, Temp2, Temp3;
145   uint32_t tp1, tp2;
146   uint32_t p1, p2, p3, p4, n1;
147   uint32_t tn1, tn2, tn3;
148   uint32_t st0, st1;
149 
150   vector1b = ((const int32_t *)filter_x0)[0];
151   vector2b = ((const int32_t *)filter_x0)[1];
152   vector3b = ((const int32_t *)filter_x0)[2];
153   vector4b = ((const int32_t *)filter_x0)[3];
154 
155   for (y = h; y--;) {
156     /* prefetch data to cache memory */
157     vp9_prefetch_load(src + src_stride);
158     vp9_prefetch_load(src + src_stride + 32);
159     vp9_prefetch_store(dst + dst_stride);
160 
161     __asm__ __volatile__ (
162         "ulw              %[tp1],      0(%[src])                      \n\t"
163         "ulw              %[tp2],      4(%[src])                      \n\t"
164 
165         /* even 1. pixel */
166         "mtlo             %[vector4a], $ac3                           \n\t"
167         "mthi             $zero,       $ac3                           \n\t"
168         "mtlo             %[vector4a], $ac2                           \n\t"
169         "mthi             $zero,       $ac2                           \n\t"
170         "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
171         "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
172         "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
173         "preceu.ph.qbl    %[p4],       %[tp2]                         \n\t"
174         "ulw              %[tn2],      8(%[src])                      \n\t"
175         "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
176         "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
177         "dpa.w.ph         $ac3,        %[p3],          %[vector3b]    \n\t"
178         "dpa.w.ph         $ac3,        %[p4],          %[vector4b]    \n\t"
179         "extp             %[Temp1],    $ac3,           31             \n\t"
180 
181         /* even 2. pixel */
182         "preceu.ph.qbr    %[p1],       %[tn2]                         \n\t"
183         "preceu.ph.qbl    %[n1],       %[tn2]                         \n\t"
184         "ulw              %[tn1],      12(%[src])                     \n\t"
185         "dpa.w.ph         $ac2,        %[p2],          %[vector1b]    \n\t"
186         "dpa.w.ph         $ac2,        %[p3],          %[vector2b]    \n\t"
187         "dpa.w.ph         $ac2,        %[p4],          %[vector3b]    \n\t"
188         "dpa.w.ph         $ac2,        %[p1],          %[vector4b]    \n\t"
189         "extp             %[Temp3],    $ac2,           31             \n\t"
190 
191         /* even 3. pixel */
192         "lbux             %[st0],      %[Temp1](%[cm])                \n\t"
193         "mtlo             %[vector4a], $ac1                           \n\t"
194         "mthi             $zero,       $ac1                           \n\t"
195         "preceu.ph.qbr    %[p2],       %[tn1]                         \n\t"
196         "dpa.w.ph         $ac1,        %[p3],          %[vector1b]    \n\t"
197         "dpa.w.ph         $ac1,        %[p4],          %[vector2b]    \n\t"
198         "dpa.w.ph         $ac1,        %[p1],          %[vector3b]    \n\t"
199         "dpa.w.ph         $ac1,        %[n1],          %[vector4b]    \n\t"
200         "extp             %[Temp1],    $ac1,           31             \n\t"
201 
202         /* even 4. pixel */
203         "mtlo             %[vector4a], $ac2                           \n\t"
204         "mthi             $zero,       $ac2                           \n\t"
205         "mtlo             %[vector4a], $ac3                           \n\t"
206         "mthi             $zero,       $ac3                           \n\t"
207         "sb               %[st0],      0(%[dst])                      \n\t"
208         "lbux             %[st1],      %[Temp3](%[cm])                \n\t"
209 
210         "balign           %[tn3],      %[tn1],         3              \n\t"
211         "balign           %[tn1],      %[tn2],         3              \n\t"
212         "balign           %[tn2],      %[tp2],         3              \n\t"
213         "balign           %[tp2],      %[tp1],         3              \n\t"
214 
215         "dpa.w.ph         $ac2,        %[p4],          %[vector1b]    \n\t"
216         "dpa.w.ph         $ac2,        %[p1],          %[vector2b]    \n\t"
217         "dpa.w.ph         $ac2,        %[n1],          %[vector3b]    \n\t"
218         "dpa.w.ph         $ac2,        %[p2],          %[vector4b]    \n\t"
219         "extp             %[Temp3],    $ac2,           31             \n\t"
220 
221         "lbux             %[st0],      %[Temp1](%[cm])                \n\t"
222 
223         /* odd 1. pixel */
224         "mtlo             %[vector4a], $ac1                           \n\t"
225         "mthi             $zero,       $ac1                           \n\t"
226         "sb               %[st1],      2(%[dst])                      \n\t"
227         "preceu.ph.qbr    %[p1],       %[tp2]                         \n\t"
228         "preceu.ph.qbl    %[p2],       %[tp2]                         \n\t"
229         "preceu.ph.qbr    %[p3],       %[tn2]                         \n\t"
230         "preceu.ph.qbl    %[p4],       %[tn2]                         \n\t"
231         "sb               %[st0],      4(%[dst])                      \n\t"
232         "dpa.w.ph         $ac3,        %[p1],          %[vector1b]    \n\t"
233         "dpa.w.ph         $ac3,        %[p2],          %[vector2b]    \n\t"
234         "dpa.w.ph         $ac3,        %[p3],          %[vector3b]    \n\t"
235         "dpa.w.ph         $ac3,        %[p4],          %[vector4b]    \n\t"
236         "extp             %[Temp2],    $ac3,           31             \n\t"
237 
238         /* odd 2. pixel */
239         "mtlo             %[vector4a], $ac3                           \n\t"
240         "mthi             $zero,       $ac3                           \n\t"
241         "mtlo             %[vector4a], $ac2                           \n\t"
242         "mthi             $zero,       $ac2                           \n\t"
243         "preceu.ph.qbr    %[p1],       %[tn1]                         \n\t"
244         "preceu.ph.qbl    %[n1],       %[tn1]                         \n\t"
245         "lbux             %[st0],      %[Temp3](%[cm])                \n\t"
246         "dpa.w.ph         $ac1,        %[p2],          %[vector1b]    \n\t"
247         "dpa.w.ph         $ac1,        %[p3],          %[vector2b]    \n\t"
248         "dpa.w.ph         $ac1,        %[p4],          %[vector3b]    \n\t"
249         "dpa.w.ph         $ac1,        %[p1],          %[vector4b]    \n\t"
250         "extp             %[Temp3],    $ac1,           31             \n\t"
251 
252         /* odd 3. pixel */
253         "lbux             %[st1],      %[Temp2](%[cm])                \n\t"
254         "preceu.ph.qbr    %[p2],       %[tn3]                         \n\t"
255         "dpa.w.ph         $ac3,        %[p3],          %[vector1b]    \n\t"
256         "dpa.w.ph         $ac3,        %[p4],          %[vector2b]    \n\t"
257         "dpa.w.ph         $ac3,        %[p1],          %[vector3b]    \n\t"
258         "dpa.w.ph         $ac3,        %[n1],          %[vector4b]    \n\t"
259         "extp             %[Temp2],    $ac3,           31             \n\t"
260 
261         /* odd 4. pixel */
262         "sb               %[st1],      1(%[dst])                      \n\t"
263         "sb               %[st0],      6(%[dst])                      \n\t"
264         "dpa.w.ph         $ac2,        %[p4],          %[vector1b]    \n\t"
265         "dpa.w.ph         $ac2,        %[p1],          %[vector2b]    \n\t"
266         "dpa.w.ph         $ac2,        %[n1],          %[vector3b]    \n\t"
267         "dpa.w.ph         $ac2,        %[p2],          %[vector4b]    \n\t"
268         "extp             %[Temp1],    $ac2,           31             \n\t"
269 
270         /* clamp */
271         "lbux             %[p4],       %[Temp3](%[cm])                \n\t"
272         "lbux             %[p2],       %[Temp2](%[cm])                \n\t"
273         "lbux             %[n1],       %[Temp1](%[cm])                \n\t"
274 
275         /* store bytes */
276         "sb               %[p4],       3(%[dst])                      \n\t"
277         "sb               %[p2],       5(%[dst])                      \n\t"
278         "sb               %[n1],       7(%[dst])                      \n\t"
279 
280         : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
281           [tn1] "=&r" (tn1), [tn2] "=&r" (tn2), [tn3] "=&r" (tn3),
282           [st0] "=&r" (st0), [st1] "=&r" (st1),
283           [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
284           [n1] "=&r" (n1),
285           [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
286         : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
287           [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
288           [vector4a] "r" (vector4a),
289           [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
290     );
291 
292     /* Next row... */
293     src += src_stride;
294     dst += dst_stride;
295   }
296 }
297 
convolve_horiz_16_dspr2(const uint8_t * src_ptr,int32_t src_stride,uint8_t * dst_ptr,int32_t dst_stride,const int16_t * filter_x0,int32_t h,int32_t count)298 static void convolve_horiz_16_dspr2(const uint8_t *src_ptr,
299                                     int32_t src_stride,
300                                     uint8_t *dst_ptr,
301                                     int32_t dst_stride,
302                                     const int16_t *filter_x0,
303                                     int32_t h,
304                                     int32_t count) {
305   int32_t y, c;
306   const uint8_t *src;
307   uint8_t *dst;
308   uint8_t *cm = vp9_ff_cropTbl;
309   uint32_t vector_64 = 64;
310   int32_t filter12, filter34, filter56, filter78;
311   int32_t Temp1, Temp2, Temp3;
312   uint32_t qload1, qload2, qload3;
313   uint32_t p1, p2, p3, p4, p5;
314   uint32_t st1, st2, st3;
315 
316   filter12 = ((const int32_t *)filter_x0)[0];
317   filter34 = ((const int32_t *)filter_x0)[1];
318   filter56 = ((const int32_t *)filter_x0)[2];
319   filter78 = ((const int32_t *)filter_x0)[3];
320 
321   for (y = h; y--;) {
322     src = src_ptr;
323     dst = dst_ptr;
324 
325     /* prefetch data to cache memory */
326     vp9_prefetch_load(src_ptr + src_stride);
327     vp9_prefetch_load(src_ptr + src_stride + 32);
328     vp9_prefetch_store(dst_ptr + dst_stride);
329 
330     for (c = 0; c < count; c++) {
331       __asm__ __volatile__ (
332           "ulw              %[qload1],    0(%[src])                    \n\t"
333           "ulw              %[qload2],    4(%[src])                    \n\t"
334 
335           /* even 1. pixel */
336           "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
337           "mthi             $zero,        $ac1                         \n\t"
338           "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
339           "mthi             $zero,        $ac2                         \n\t"
340           "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
341           "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
342           "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
343           "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
344           "ulw              %[qload3],    8(%[src])                    \n\t"
345           "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* even 1 */
346           "dpa.w.ph         $ac1,         %[p2],          %[filter34]  \n\t" /* even 1 */
347           "dpa.w.ph         $ac1,         %[p3],          %[filter56]  \n\t" /* even 1 */
348           "dpa.w.ph         $ac1,         %[p4],          %[filter78]  \n\t" /* even 1 */
349           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
350 
351           /* even 2. pixel */
352           "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
353           "mthi             $zero,        $ac3                         \n\t"
354           "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
355           "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
356           "ulw              %[qload1],    12(%[src])                   \n\t"
357           "dpa.w.ph         $ac2,         %[p2],          %[filter12]  \n\t" /* even 1 */
358           "dpa.w.ph         $ac2,         %[p3],          %[filter34]  \n\t" /* even 1 */
359           "dpa.w.ph         $ac2,         %[p4],          %[filter56]  \n\t" /* even 1 */
360           "dpa.w.ph         $ac2,         %[p1],          %[filter78]  \n\t" /* even 1 */
361           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
362           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
363 
364           /* even 3. pixel */
365           "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
366           "mthi             $zero,        $ac1                         \n\t"
367           "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
368           "sb               %[st1],       0(%[dst])                    \n\t" /* even 1 */
369           "dpa.w.ph         $ac3,         %[p3],          %[filter12]  \n\t" /* even 3 */
370           "dpa.w.ph         $ac3,         %[p4],          %[filter34]  \n\t" /* even 3 */
371           "dpa.w.ph         $ac3,         %[p1],          %[filter56]  \n\t" /* even 3 */
372           "dpa.w.ph         $ac3,         %[p5],          %[filter78]  \n\t" /* even 3 */
373           "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
374           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
375 
376           /* even 4. pixel */
377           "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
378           "mthi             $zero,        $ac2                         \n\t"
379           "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
380           "sb               %[st2],       2(%[dst])                    \n\t" /* even 1 */
381           "ulw              %[qload2],    16(%[src])                   \n\t"
382           "dpa.w.ph         $ac1,         %[p4],          %[filter12]  \n\t" /* even 4 */
383           "dpa.w.ph         $ac1,         %[p1],          %[filter34]  \n\t" /* even 4 */
384           "dpa.w.ph         $ac1,         %[p5],          %[filter56]  \n\t" /* even 4 */
385           "dpa.w.ph         $ac1,         %[p2],          %[filter78]  \n\t" /* even 4 */
386           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
387           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
388 
389           /* even 5. pixel */
390           "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
391           "mthi             $zero,        $ac3                         \n\t"
392           "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
393           "sb               %[st3],       4(%[dst])                    \n\t" /* even 3 */
394           "dpa.w.ph         $ac2,         %[p1],          %[filter12]  \n\t" /* even 5 */
395           "dpa.w.ph         $ac2,         %[p5],          %[filter34]  \n\t" /* even 5 */
396           "dpa.w.ph         $ac2,         %[p2],          %[filter56]  \n\t" /* even 5 */
397           "dpa.w.ph         $ac2,         %[p3],          %[filter78]  \n\t" /* even 5 */
398           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
399           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
400 
401           /* even 6. pixel */
402           "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
403           "mthi             $zero,        $ac1                         \n\t"
404           "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
405           "sb               %[st1],       6(%[dst])                    \n\t" /* even 4 */
406           "ulw              %[qload3],    20(%[src])                   \n\t"
407           "dpa.w.ph         $ac3,         %[p5],          %[filter12]  \n\t" /* even 6 */
408           "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* even 6 */
409           "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* even 6 */
410           "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* even 6 */
411           "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
412           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
413 
414           /* even 7. pixel */
415           "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
416           "mthi             $zero,        $ac2                         \n\t"
417           "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
418           "sb               %[st2],       8(%[dst])                    \n\t" /* even 5 */
419           "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* even 7 */
420           "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* even 7 */
421           "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* even 7 */
422           "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* even 7 */
423           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
424           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
425 
426           /* even 8. pixel */
427           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
428           "mthi             $zero,        $ac3                         \n\t"
429           "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* even 8 */
430           "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* even 8 */
431           "sb               %[st3],       10(%[dst])                   \n\t" /* even 6 */
432           "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* even 8 */
433           "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* even 8 */
434           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
435           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
436 
437           /* ODD pixels */
438           "ulw              %[qload1],    1(%[src])                    \n\t"
439           "ulw              %[qload2],    5(%[src])                    \n\t"
440 
441           /* odd 1. pixel */
442           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
443           "mthi             $zero,        $ac1                         \n\t"
444           "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
445           "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
446           "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
447           "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
448           "sb               %[st1],       12(%[dst])                   \n\t" /* even 7 */
449           "ulw              %[qload3],    9(%[src])                    \n\t"
450           "dpa.w.ph         $ac3,         %[p1],          %[filter12]  \n\t" /* odd 1 */
451           "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* odd 1 */
452           "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* odd 1 */
453           "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* odd 1 */
454           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
455           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
456 
457           /* odd 2. pixel */
458           "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
459           "mthi             $zero,        $ac2                         \n\t"
460           "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
461           "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
462           "sb               %[st2],       14(%[dst])                   \n\t" /* even 8 */
463           "ulw              %[qload1],    13(%[src])                   \n\t"
464           "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* odd 2 */
465           "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* odd 2 */
466           "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* odd 2 */
467           "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* odd 2 */
468           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
469           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
470 
471           /* odd 3. pixel */
472           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
473           "mthi             $zero,        $ac3                         \n\t"
474           "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
475           "sb               %[st3],       1(%[dst])                    \n\t" /* odd 1 */
476           "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* odd 3 */
477           "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* odd 3 */
478           "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* odd 3 */
479           "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* odd 3 */
480           "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
481           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
482 
483           /* odd 4. pixel */
484           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
485           "mthi             $zero,        $ac1                         \n\t"
486           "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
487           "sb               %[st1],       3(%[dst])                    \n\t" /* odd 2 */
488           "ulw              %[qload2],    17(%[src])                   \n\t"
489           "dpa.w.ph         $ac3,         %[p4],          %[filter12]  \n\t" /* odd 4 */
490           "dpa.w.ph         $ac3,         %[p1],          %[filter34]  \n\t" /* odd 4 */
491           "dpa.w.ph         $ac3,         %[p5],          %[filter56]  \n\t" /* odd 4 */
492           "dpa.w.ph         $ac3,         %[p2],          %[filter78]  \n\t" /* odd 4 */
493           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
494           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
495 
496           /* odd 5. pixel */
497           "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
498           "mthi             $zero,        $ac2                         \n\t"
499           "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
500           "sb               %[st2],       5(%[dst])                    \n\t" /* odd 3 */
501           "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* odd 5 */
502           "dpa.w.ph         $ac1,         %[p5],          %[filter34]  \n\t" /* odd 5 */
503           "dpa.w.ph         $ac1,         %[p2],          %[filter56]  \n\t" /* odd 5 */
504           "dpa.w.ph         $ac1,         %[p3],          %[filter78]  \n\t" /* odd 5 */
505           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
506           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
507 
508           /* odd 6. pixel */
509           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
510           "mthi             $zero,        $ac3                         \n\t"
511           "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
512           "sb               %[st3],       7(%[dst])                    \n\t" /* odd 4 */
513           "ulw              %[qload3],    21(%[src])                   \n\t"
514           "dpa.w.ph         $ac2,         %[p5],          %[filter12]  \n\t" /* odd 6 */
515           "dpa.w.ph         $ac2,         %[p2],          %[filter34]  \n\t" /* odd 6 */
516           "dpa.w.ph         $ac2,         %[p3],          %[filter56]  \n\t" /* odd 6 */
517           "dpa.w.ph         $ac2,         %[p4],          %[filter78]  \n\t" /* odd 6 */
518           "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
519           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
520 
521           /* odd 7. pixel */
522           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
523           "mthi             $zero,        $ac1                         \n\t"
524           "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
525           "sb               %[st1],       9(%[dst])                    \n\t" /* odd 5 */
526           "dpa.w.ph         $ac3,         %[p2],          %[filter12]  \n\t" /* odd 7 */
527           "dpa.w.ph         $ac3,         %[p3],          %[filter34]  \n\t" /* odd 7 */
528           "dpa.w.ph         $ac3,         %[p4],          %[filter56]  \n\t" /* odd 7 */
529           "dpa.w.ph         $ac3,         %[p1],          %[filter78]  \n\t" /* odd 7 */
530           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
531 
532           /* odd 8. pixel */
533           "dpa.w.ph         $ac1,         %[p3],          %[filter12]  \n\t" /* odd 8 */
534           "dpa.w.ph         $ac1,         %[p4],          %[filter34]  \n\t" /* odd 8 */
535           "dpa.w.ph         $ac1,         %[p1],          %[filter56]  \n\t" /* odd 8 */
536           "dpa.w.ph         $ac1,         %[p5],          %[filter78]  \n\t" /* odd 8 */
537           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
538 
539           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
540           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
541           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
542 
543           "sb               %[st2],       11(%[dst])                   \n\t" /* odd 6 */
544           "sb               %[st3],       13(%[dst])                   \n\t" /* odd 7 */
545           "sb               %[st1],       15(%[dst])                   \n\t" /* odd 8 */
546 
547           : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3),
548             [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
549             [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
550             [p5] "=&r" (p5),
551             [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
552           : [filter12] "r" (filter12), [filter34] "r" (filter34),
553             [filter56] "r" (filter56), [filter78] "r" (filter78),
554             [vector_64] "r" (vector_64),
555             [cm] "r" (cm), [dst] "r" (dst),
556             [src] "r" (src)
557       );
558 
559       src += 16;
560       dst += 16;
561     }
562 
563     /* Next row... */
564     src_ptr += src_stride;
565     dst_ptr += dst_stride;
566   }
567 }
568 
convolve_horiz_64_dspr2(const uint8_t * src_ptr,int32_t src_stride,uint8_t * dst_ptr,int32_t dst_stride,const int16_t * filter_x0,int32_t h)569 static void convolve_horiz_64_dspr2(const uint8_t *src_ptr,
570                                     int32_t src_stride,
571                                     uint8_t *dst_ptr,
572                                     int32_t dst_stride,
573                                     const int16_t *filter_x0,
574                                     int32_t h) {
575   int32_t y, c;
576   const uint8_t *src;
577   uint8_t *dst;
578   uint8_t *cm = vp9_ff_cropTbl;
579   uint32_t vector_64 = 64;
580   int32_t filter12, filter34, filter56, filter78;
581   int32_t Temp1, Temp2, Temp3;
582   uint32_t qload1, qload2, qload3;
583   uint32_t p1, p2, p3, p4, p5;
584   uint32_t st1, st2, st3;
585 
586   filter12 = ((const int32_t *)filter_x0)[0];
587   filter34 = ((const int32_t *)filter_x0)[1];
588   filter56 = ((const int32_t *)filter_x0)[2];
589   filter78 = ((const int32_t *)filter_x0)[3];
590 
591   for (y = h; y--;) {
592     src = src_ptr;
593     dst = dst_ptr;
594 
595     /* prefetch data to cache memory */
596     vp9_prefetch_load(src_ptr + src_stride);
597     vp9_prefetch_load(src_ptr + src_stride + 32);
598     vp9_prefetch_load(src_ptr + src_stride + 64);
599     vp9_prefetch_store(dst_ptr + dst_stride);
600     vp9_prefetch_store(dst_ptr + dst_stride + 32);
601 
602     for (c = 0; c < 4; c++) {
603       __asm__ __volatile__ (
604           "ulw              %[qload1],    0(%[src])                    \n\t"
605           "ulw              %[qload2],    4(%[src])                    \n\t"
606 
607           /* even 1. pixel */
608           "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
609           "mthi             $zero,        $ac1                         \n\t"
610           "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
611           "mthi             $zero,        $ac2                         \n\t"
612           "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
613           "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
614           "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
615           "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
616           "ulw              %[qload3],    8(%[src])                    \n\t"
617           "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* even 1 */
618           "dpa.w.ph         $ac1,         %[p2],          %[filter34]  \n\t" /* even 1 */
619           "dpa.w.ph         $ac1,         %[p3],          %[filter56]  \n\t" /* even 1 */
620           "dpa.w.ph         $ac1,         %[p4],          %[filter78]  \n\t" /* even 1 */
621           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
622 
623           /* even 2. pixel */
624           "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
625           "mthi             $zero,        $ac3                         \n\t"
626           "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
627           "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
628           "ulw              %[qload1],    12(%[src])                   \n\t"
629           "dpa.w.ph         $ac2,         %[p2],          %[filter12]  \n\t" /* even 1 */
630           "dpa.w.ph         $ac2,         %[p3],          %[filter34]  \n\t" /* even 1 */
631           "dpa.w.ph         $ac2,         %[p4],          %[filter56]  \n\t" /* even 1 */
632           "dpa.w.ph         $ac2,         %[p1],          %[filter78]  \n\t" /* even 1 */
633           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
634           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
635 
636           /* even 3. pixel */
637           "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
638           "mthi             $zero,        $ac1                         \n\t"
639           "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
640           "sb               %[st1],       0(%[dst])                    \n\t" /* even 1 */
641           "dpa.w.ph         $ac3,         %[p3],          %[filter12]  \n\t" /* even 3 */
642           "dpa.w.ph         $ac3,         %[p4],          %[filter34]  \n\t" /* even 3 */
643           "dpa.w.ph         $ac3,         %[p1],          %[filter56]  \n\t" /* even 3 */
644           "dpa.w.ph         $ac3,         %[p5],          %[filter78]  \n\t" /* even 3 */
645           "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
646           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
647 
648           /* even 4. pixel */
649           "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
650           "mthi             $zero,        $ac2                         \n\t"
651           "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
652           "sb               %[st2],       2(%[dst])                    \n\t" /* even 1 */
653           "ulw              %[qload2],    16(%[src])                   \n\t"
654           "dpa.w.ph         $ac1,         %[p4],          %[filter12]  \n\t" /* even 4 */
655           "dpa.w.ph         $ac1,         %[p1],          %[filter34]  \n\t" /* even 4 */
656           "dpa.w.ph         $ac1,         %[p5],          %[filter56]  \n\t" /* even 4 */
657           "dpa.w.ph         $ac1,         %[p2],          %[filter78]  \n\t" /* even 4 */
658           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
659           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
660 
661           /* even 5. pixel */
662           "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
663           "mthi             $zero,        $ac3                         \n\t"
664           "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
665           "sb               %[st3],       4(%[dst])                    \n\t" /* even 3 */
666           "dpa.w.ph         $ac2,         %[p1],          %[filter12]  \n\t" /* even 5 */
667           "dpa.w.ph         $ac2,         %[p5],          %[filter34]  \n\t" /* even 5 */
668           "dpa.w.ph         $ac2,         %[p2],          %[filter56]  \n\t" /* even 5 */
669           "dpa.w.ph         $ac2,         %[p3],          %[filter78]  \n\t" /* even 5 */
670           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
671           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
672 
673           /* even 6. pixel */
674           "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
675           "mthi             $zero,        $ac1                         \n\t"
676           "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
677           "sb               %[st1],       6(%[dst])                    \n\t" /* even 4 */
678           "ulw              %[qload3],    20(%[src])                   \n\t"
679           "dpa.w.ph         $ac3,         %[p5],          %[filter12]  \n\t" /* even 6 */
680           "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* even 6 */
681           "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* even 6 */
682           "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* even 6 */
683           "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
684           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
685 
686           /* even 7. pixel */
687           "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
688           "mthi             $zero,        $ac2                         \n\t"
689           "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
690           "sb               %[st2],       8(%[dst])                    \n\t" /* even 5 */
691           "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* even 7 */
692           "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* even 7 */
693           "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* even 7 */
694           "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* even 7 */
695           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
696           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
697 
698           /* even 8. pixel */
699           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
700           "mthi             $zero,        $ac3                         \n\t"
701           "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* even 8 */
702           "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* even 8 */
703           "sb               %[st3],       10(%[dst])                   \n\t" /* even 6 */
704           "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* even 8 */
705           "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* even 8 */
706           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
707           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
708 
709           /* ODD pixels */
710           "ulw              %[qload1],    1(%[src])                    \n\t"
711           "ulw              %[qload2],    5(%[src])                    \n\t"
712 
713           /* odd 1. pixel */
714           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
715           "mthi             $zero,        $ac1                         \n\t"
716           "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
717           "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
718           "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
719           "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
720           "sb               %[st1],       12(%[dst])                   \n\t" /* even 7 */
721           "ulw              %[qload3],    9(%[src])                    \n\t"
722           "dpa.w.ph         $ac3,         %[p1],          %[filter12]  \n\t" /* odd 1 */
723           "dpa.w.ph         $ac3,         %[p2],          %[filter34]  \n\t" /* odd 1 */
724           "dpa.w.ph         $ac3,         %[p3],          %[filter56]  \n\t" /* odd 1 */
725           "dpa.w.ph         $ac3,         %[p4],          %[filter78]  \n\t" /* odd 1 */
726           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
727           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
728 
729           /* odd 2. pixel */
730           "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
731           "mthi             $zero,        $ac2                         \n\t"
732           "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
733           "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
734           "sb               %[st2],       14(%[dst])                   \n\t" /* even 8 */
735           "ulw              %[qload1],    13(%[src])                   \n\t"
736           "dpa.w.ph         $ac1,         %[p2],          %[filter12]  \n\t" /* odd 2 */
737           "dpa.w.ph         $ac1,         %[p3],          %[filter34]  \n\t" /* odd 2 */
738           "dpa.w.ph         $ac1,         %[p4],          %[filter56]  \n\t" /* odd 2 */
739           "dpa.w.ph         $ac1,         %[p1],          %[filter78]  \n\t" /* odd 2 */
740           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
741           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
742 
743           /* odd 3. pixel */
744           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
745           "mthi             $zero,        $ac3                         \n\t"
746           "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
747           "sb               %[st3],       1(%[dst])                    \n\t" /* odd 1 */
748           "dpa.w.ph         $ac2,         %[p3],          %[filter12]  \n\t" /* odd 3 */
749           "dpa.w.ph         $ac2,         %[p4],          %[filter34]  \n\t" /* odd 3 */
750           "dpa.w.ph         $ac2,         %[p1],          %[filter56]  \n\t" /* odd 3 */
751           "dpa.w.ph         $ac2,         %[p5],          %[filter78]  \n\t" /* odd 3 */
752           "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
753           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
754 
755           /* odd 4. pixel */
756           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
757           "mthi             $zero,        $ac1                         \n\t"
758           "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
759           "sb               %[st1],       3(%[dst])                    \n\t" /* odd 2 */
760           "ulw              %[qload2],    17(%[src])                   \n\t"
761           "dpa.w.ph         $ac3,         %[p4],          %[filter12]  \n\t" /* odd 4 */
762           "dpa.w.ph         $ac3,         %[p1],          %[filter34]  \n\t" /* odd 4 */
763           "dpa.w.ph         $ac3,         %[p5],          %[filter56]  \n\t" /* odd 4 */
764           "dpa.w.ph         $ac3,         %[p2],          %[filter78]  \n\t" /* odd 4 */
765           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
766           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
767 
768           /* odd 5. pixel */
769           "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
770           "mthi             $zero,        $ac2                         \n\t"
771           "preceu.ph.qbr    %[p4],        %[qload2]                    \n\t"
772           "sb               %[st2],       5(%[dst])                    \n\t" /* odd 3 */
773           "dpa.w.ph         $ac1,         %[p1],          %[filter12]  \n\t" /* odd 5 */
774           "dpa.w.ph         $ac1,         %[p5],          %[filter34]  \n\t" /* odd 5 */
775           "dpa.w.ph         $ac1,         %[p2],          %[filter56]  \n\t" /* odd 5 */
776           "dpa.w.ph         $ac1,         %[p3],          %[filter78]  \n\t" /* odd 5 */
777           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
778           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
779 
780           /* odd 6. pixel */
781           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
782           "mthi             $zero,        $ac3                         \n\t"
783           "preceu.ph.qbl    %[p1],        %[qload2]                    \n\t"
784           "sb               %[st3],       7(%[dst])                    \n\t" /* odd 4 */
785           "ulw              %[qload3],    21(%[src])                   \n\t"
786           "dpa.w.ph         $ac2,         %[p5],          %[filter12]  \n\t" /* odd 6 */
787           "dpa.w.ph         $ac2,         %[p2],          %[filter34]  \n\t" /* odd 6 */
788           "dpa.w.ph         $ac2,         %[p3],          %[filter56]  \n\t" /* odd 6 */
789           "dpa.w.ph         $ac2,         %[p4],          %[filter78]  \n\t" /* odd 6 */
790           "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
791           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
792 
793           /* odd 7. pixel */
794           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
795           "mthi             $zero,        $ac1                         \n\t"
796           "preceu.ph.qbr    %[p5],        %[qload3]                    \n\t"
797           "sb               %[st1],       9(%[dst])                    \n\t" /* odd 5 */
798           "dpa.w.ph         $ac3,         %[p2],          %[filter12]  \n\t" /* odd 7 */
799           "dpa.w.ph         $ac3,         %[p3],          %[filter34]  \n\t" /* odd 7 */
800           "dpa.w.ph         $ac3,         %[p4],          %[filter56]  \n\t" /* odd 7 */
801           "dpa.w.ph         $ac3,         %[p1],          %[filter78]  \n\t" /* odd 7 */
802           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
803 
804           /* odd 8. pixel */
805           "dpa.w.ph         $ac1,         %[p3],          %[filter12]  \n\t" /* odd 8 */
806           "dpa.w.ph         $ac1,         %[p4],          %[filter34]  \n\t" /* odd 8 */
807           "dpa.w.ph         $ac1,         %[p1],          %[filter56]  \n\t" /* odd 8 */
808           "dpa.w.ph         $ac1,         %[p5],          %[filter78]  \n\t" /* odd 8 */
809           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
810 
811           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
812           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
813           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
814 
815           "sb               %[st2],       11(%[dst])                   \n\t" /* odd 6 */
816           "sb               %[st3],       13(%[dst])                   \n\t" /* odd 7 */
817           "sb               %[st1],       15(%[dst])                   \n\t" /* odd 8 */
818 
819           : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3),
820             [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
821             [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
822             [p5] "=&r" (p5),
823             [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
824           : [filter12] "r" (filter12), [filter34] "r" (filter34),
825             [filter56] "r" (filter56), [filter78] "r" (filter78),
826             [vector_64] "r" (vector_64),
827             [cm] "r" (cm), [dst] "r" (dst),
828             [src] "r" (src)
829       );
830 
831       src += 16;
832       dst += 16;
833     }
834 
835     /* Next row... */
836     src_ptr += src_stride;
837     dst_ptr += dst_stride;
838   }
839 }
840 
/*
 * Horizontal 8-tap convolution entry point for MIPS DSPr2.
 *
 * src, src_stride   source block and its row pitch
 * dst, dst_stride   destination block and its row pitch
 * filter_x          eight 16-bit horizontal taps
 * x_step_q4         horizontal step; 16 selects the unscaled DSPr2 paths
 * filter_y,
 * y_step_q4         not used here, only forwarded to the delegated routines
 * w, h              block width and height in pixels
 *
 * Dispatch:
 *   - x_step_q4 != 16: always handled by vp9_convolve8_horiz_c.  This check
 *     comes first so that the kernel-based shortcuts below, which inspect
 *     only the kernel at filter_x, are never taken for a stepped call;
 *     vp9_convolve_copy in particular cannot honour the step.
 *   - taps 2..3, read as one 32-bit word, equal 0x800000 (on a little-endian
 *     layout: tap[2] == 0 and tap[3] == 128): plain copy.
 *   - taps 0..1 are both zero: delegated to vp9_convolve2_horiz_dspr2.
 *   - otherwise: width-specialised DSPr2 kernels for w = 4, 8, 16, 32, 64 and
 *     the C implementation for any other width.
 */
void vp9_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int x_step_q4,
                               const int16_t *filter_y, int y_step_q4,
                               int w, int h) {
  if (16 != x_step_q4) {
    /* Non-unit step: only the generic C routine is used for this case. */
    vp9_convolve8_horiz_c(src, src_stride,
                          dst, dst_stride,
                          filter_x, x_step_q4,
                          filter_y, y_step_q4,
                          w, h);
    return;
  }

  if (((const int32_t *)filter_x)[1] == 0x800000) {
    vp9_convolve_copy(src, src_stride,
                      dst, dst_stride,
                      filter_x, x_step_q4,
                      filter_y, y_step_q4,
                      w, h);
  } else if (((const int32_t *)filter_x)[0] == 0) {
    vp9_convolve2_horiz_dspr2(src, src_stride,
                              dst, dst_stride,
                              filter_x, x_step_q4,
                              filter_y, y_step_q4,
                              w, h);
  } else {
    uint32_t pos = 38;

    vp9_prefetch_load((const uint8_t *)filter_x);

    /* The width-specialised kernels expect src to point 3 pixels to the
     * left of the first output column. */
    src -= 3;

    /* bit position for extract from acc */
    __asm__ __volatile__ (
      "wrdsp      %[pos],     1           \n\t"
      :
      : [pos] "r" (pos)
    );

    /* prefetch data to cache memory */
    vp9_prefetch_load(src);
    vp9_prefetch_load(src + 32);
    vp9_prefetch_store(dst);

    switch (w) {
      case 4:
        convolve_horiz_4_dspr2(src, (int32_t)src_stride,
                               dst, (int32_t)dst_stride,
                               filter_x, (int32_t)h);
        break;
      case 8:
        convolve_horiz_8_dspr2(src, (int32_t)src_stride,
                               dst, (int32_t)dst_stride,
                               filter_x, (int32_t)h);
        break;
      case 16:
        convolve_horiz_16_dspr2(src, (int32_t)src_stride,
                                dst, (int32_t)dst_stride,
                                filter_x, (int32_t)h, 1);
        break;
      case 32:
        convolve_horiz_16_dspr2(src, (int32_t)src_stride,
                                dst, (int32_t)dst_stride,
                                filter_x, (int32_t)h, 2);
        break;
      case 64:
        vp9_prefetch_load(src + 64);
        vp9_prefetch_store(dst + 32);

        convolve_horiz_64_dspr2(src, (int32_t)src_stride,
                                dst, (int32_t)dst_stride,
                                filter_x, (int32_t)h);
        break;
      default:
        /* Undo the -3 adjustment: the C routine takes the block start. */
        vp9_convolve8_horiz_c(src + 3, src_stride,
                              dst, dst_stride,
                              filter_x, x_step_q4,
                              filter_y, y_step_q4,
                              w, h);
        break;
    }
  }
}
923 #endif
924