1 /*
2  *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <assert.h>
12 #include <stdio.h>
13 
14 #include "./vpx_config.h"
15 #include "./vp9_rtcd.h"
16 #include "vp9/common/vp9_common.h"
17 #include "vpx/vpx_integer.h"
18 #include "vpx_ports/mem.h"
19 #include "vp9/common/vp9_convolve.h"
20 #include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
21 
22 #if HAVE_DSPR2
convolve_bi_horiz_4_dspr2(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int16_t * filter_x0,int32_t h)23 static void convolve_bi_horiz_4_dspr2(const uint8_t *src,
24                                       int32_t src_stride,
25                                       uint8_t *dst,
26                                       int32_t dst_stride,
27                                       const int16_t *filter_x0,
28                                       int32_t h) {
29   int32_t y;
30   uint8_t *cm = vp9_ff_cropTbl;
31   int32_t Temp1, Temp2, Temp3, Temp4;
32   uint32_t vector4a = 64;
33   uint32_t tp1, tp2;
34   uint32_t p1, p2;
35   const int16_t *filter = &filter_x0[3];
36   uint32_t filter45;;
37 
38   filter45 = ((const int32_t *)filter)[0];
39 
40   for (y = h; y--;) {
41     /* prefetch data to cache memory */
42     vp9_prefetch_load(src + src_stride);
43     vp9_prefetch_load(src + src_stride + 32);
44     vp9_prefetch_store(dst + dst_stride);
45 
46     __asm__ __volatile__ (
47         "ulw              %[tp1],      0(%[src])                      \n\t"
48         "ulw              %[tp2],      4(%[src])                      \n\t"
49 
50         /* even 1. pixel */
51         "mtlo             %[vector4a], $ac3                           \n\t"
52         "mthi             $zero,       $ac3                           \n\t"
53         "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
54         "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
55         "dpa.w.ph         $ac3,        %[p1],          %[filter45]    \n\t"
56         "extp             %[Temp1],    $ac3,           31             \n\t"
57 
58         /* even 2. pixel */
59         "mtlo             %[vector4a], $ac2                           \n\t"
60         "mthi             $zero,       $ac2                           \n\t"
61         "balign           %[tp2],      %[tp1],         3              \n\t"
62         "dpa.w.ph         $ac2,        %[p2],          %[filter45]    \n\t"
63         "extp             %[Temp3],    $ac2,           31             \n\t"
64 
65         /* odd 1. pixel */
66         "lbux             %[tp1],      %[Temp1](%[cm])                \n\t"
67         "mtlo             %[vector4a], $ac3                           \n\t"
68         "mthi             $zero,       $ac3                           \n\t"
69         "preceu.ph.qbr    %[p1],       %[tp2]                         \n\t"
70         "preceu.ph.qbl    %[p2],       %[tp2]                         \n\t"
71         "dpa.w.ph         $ac3,        %[p1],          %[filter45]    \n\t"
72         "extp             %[Temp2],    $ac3,           31             \n\t"
73 
74         /* odd 2. pixel */
75         "lbux             %[tp2],      %[Temp3](%[cm])                \n\t"
76         "mtlo             %[vector4a], $ac2                           \n\t"
77         "mthi             $zero,       $ac2                           \n\t"
78         "dpa.w.ph         $ac2,        %[p2],          %[filter45]    \n\t"
79         "extp             %[Temp4],    $ac2,           31             \n\t"
80 
81         /* clamp */
82         "lbux             %[p1],       %[Temp2](%[cm])                \n\t"
83         "lbux             %[p2],       %[Temp4](%[cm])                \n\t"
84 
85         /* store bytes */
86         "sb               %[tp1],      0(%[dst])                      \n\t"
87         "sb               %[p1],       1(%[dst])                      \n\t"
88         "sb               %[tp2],      2(%[dst])                      \n\t"
89         "sb               %[p2],       3(%[dst])                      \n\t"
90 
91         : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
92           [p1] "=&r" (p1), [p2] "=&r" (p2),
93           [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
94           [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
95         : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
96           [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
97     );
98 
99     /* Next row... */
100     src += src_stride;
101     dst += dst_stride;
102   }
103 }
104 
convolve_bi_horiz_8_dspr2(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int16_t * filter_x0,int32_t h)105 static void convolve_bi_horiz_8_dspr2(const uint8_t *src,
106                                       int32_t src_stride,
107                                       uint8_t *dst,
108                                       int32_t dst_stride,
109                                       const int16_t *filter_x0,
110                                       int32_t h) {
111   int32_t y;
112   uint8_t *cm = vp9_ff_cropTbl;
113   uint32_t vector4a = 64;
114   int32_t Temp1, Temp2, Temp3;
115   uint32_t tp1, tp2, tp3;
116   uint32_t p1, p2, p3, p4;
117   uint32_t st0, st1;
118   const int16_t *filter = &filter_x0[3];
119   uint32_t filter45;;
120 
121   filter45 = ((const int32_t *)filter)[0];
122 
123   for (y = h; y--;) {
124     /* prefetch data to cache memory */
125     vp9_prefetch_load(src + src_stride);
126     vp9_prefetch_load(src + src_stride + 32);
127     vp9_prefetch_store(dst + dst_stride);
128 
129     __asm__ __volatile__ (
130         "ulw              %[tp1],      0(%[src])                      \n\t"
131         "ulw              %[tp2],      4(%[src])                      \n\t"
132 
133         /* even 1. pixel */
134         "mtlo             %[vector4a], $ac3                           \n\t"
135         "mthi             $zero,       $ac3                           \n\t"
136         "mtlo             %[vector4a], $ac2                           \n\t"
137         "mthi             $zero,       $ac2                           \n\t"
138         "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
139         "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
140         "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
141         "preceu.ph.qbl    %[p4],       %[tp2]                         \n\t"
142         "ulw              %[tp3],      8(%[src])                      \n\t"
143         "dpa.w.ph         $ac3,        %[p1],          %[filter45]    \n\t"
144         "extp             %[Temp1],    $ac3,           31             \n\t"
145 
146         /* even 2. pixel */
147         "dpa.w.ph         $ac2,        %[p2],          %[filter45]    \n\t"
148         "extp             %[Temp3],    $ac2,           31             \n\t"
149 
150         /* even 3. pixel */
151         "lbux             %[st0],      %[Temp1](%[cm])                \n\t"
152         "mtlo             %[vector4a], $ac1                           \n\t"
153         "mthi             $zero,       $ac1                           \n\t"
154         "dpa.w.ph         $ac1,        %[p3],          %[filter45]    \n\t"
155         "extp             %[Temp1],    $ac1,           31             \n\t"
156 
157         /* even 4. pixel */
158         "mtlo             %[vector4a], $ac2                           \n\t"
159         "mthi             $zero,       $ac2                           \n\t"
160         "mtlo             %[vector4a], $ac3                           \n\t"
161         "mthi             $zero,       $ac3                           \n\t"
162         "sb               %[st0],      0(%[dst])                      \n\t"
163         "lbux             %[st1],      %[Temp3](%[cm])                \n\t"
164 
165         "balign           %[tp3],      %[tp2],         3              \n\t"
166         "balign           %[tp2],      %[tp1],         3              \n\t"
167 
168         "dpa.w.ph         $ac2,        %[p4],          %[filter45]    \n\t"
169         "extp             %[Temp3],    $ac2,           31             \n\t"
170 
171         "lbux             %[st0],      %[Temp1](%[cm])                \n\t"
172 
173         /* odd 1. pixel */
174         "mtlo             %[vector4a], $ac1                           \n\t"
175         "mthi             $zero,       $ac1                           \n\t"
176         "sb               %[st1],      2(%[dst])                      \n\t"
177         "preceu.ph.qbr    %[p1],       %[tp2]                         \n\t"
178         "preceu.ph.qbl    %[p2],       %[tp2]                         \n\t"
179         "preceu.ph.qbr    %[p3],       %[tp3]                         \n\t"
180         "preceu.ph.qbl    %[p4],       %[tp3]                         \n\t"
181         "sb               %[st0],      4(%[dst])                      \n\t"
182         "dpa.w.ph         $ac3,        %[p1],          %[filter45]    \n\t"
183         "extp             %[Temp2],    $ac3,           31             \n\t"
184 
185         /* odd 2. pixel */
186         "mtlo             %[vector4a], $ac3                           \n\t"
187         "mthi             $zero,       $ac3                           \n\t"
188         "mtlo             %[vector4a], $ac2                           \n\t"
189         "mthi             $zero,       $ac2                           \n\t"
190         "lbux             %[st0],      %[Temp3](%[cm])                \n\t"
191         "dpa.w.ph         $ac1,        %[p2],          %[filter45]    \n\t"
192         "extp             %[Temp3],    $ac1,           31             \n\t"
193 
194         /* odd 3. pixel */
195         "lbux             %[st1],      %[Temp2](%[cm])                \n\t"
196         "dpa.w.ph         $ac3,        %[p3],          %[filter45]    \n\t"
197         "extp             %[Temp2],    $ac3,           31             \n\t"
198 
199         /* odd 4. pixel */
200         "sb               %[st1],      1(%[dst])                      \n\t"
201         "sb               %[st0],      6(%[dst])                      \n\t"
202         "dpa.w.ph         $ac2,        %[p4],          %[filter45]    \n\t"
203         "extp             %[Temp1],    $ac2,           31             \n\t"
204 
205         /* clamp */
206         "lbux             %[p4],       %[Temp3](%[cm])                \n\t"
207         "lbux             %[p2],       %[Temp2](%[cm])                \n\t"
208         "lbux             %[p1],       %[Temp1](%[cm])                \n\t"
209 
210         /* store bytes */
211         "sb               %[p4],       3(%[dst])                      \n\t"
212         "sb               %[p2],       5(%[dst])                      \n\t"
213         "sb               %[p1],       7(%[dst])                      \n\t"
214 
215         : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3),
216           [st0] "=&r" (st0), [st1] "=&r" (st1),
217           [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
218           [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
219         : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
220           [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
221     );
222 
223     /* Next row... */
224     src += src_stride;
225     dst += dst_stride;
226   }
227 }
228 
convolve_bi_horiz_16_dspr2(const uint8_t * src_ptr,int32_t src_stride,uint8_t * dst_ptr,int32_t dst_stride,const int16_t * filter_x0,int32_t h,int32_t count)229 static void convolve_bi_horiz_16_dspr2(const uint8_t *src_ptr,
230                                        int32_t src_stride,
231                                        uint8_t *dst_ptr,
232                                        int32_t dst_stride,
233                                        const int16_t *filter_x0,
234                                        int32_t h,
235                                        int32_t count) {
236   int32_t y, c;
237   const uint8_t *src;
238   uint8_t *dst;
239   uint8_t *cm = vp9_ff_cropTbl;
240   uint32_t vector_64 = 64;
241   int32_t Temp1, Temp2, Temp3;
242   uint32_t qload1, qload2, qload3;
243   uint32_t p1, p2, p3, p4, p5;
244   uint32_t st1, st2, st3;
245   const int16_t *filter = &filter_x0[3];
246   uint32_t filter45;;
247 
248   filter45 = ((const int32_t *)filter)[0];
249 
250   for (y = h; y--;) {
251     src = src_ptr;
252     dst = dst_ptr;
253 
254     /* prefetch data to cache memory */
255     vp9_prefetch_load(src_ptr + src_stride);
256     vp9_prefetch_load(src_ptr + src_stride + 32);
257     vp9_prefetch_store(dst_ptr + dst_stride);
258 
259     for (c = 0; c < count; c++) {
260       __asm__ __volatile__ (
261           "ulw              %[qload1],    0(%[src])                    \n\t"
262           "ulw              %[qload2],    4(%[src])                    \n\t"
263 
264           /* even 1. pixel */
265           "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
266           "mthi             $zero,        $ac1                         \n\t"
267           "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
268           "mthi             $zero,        $ac2                         \n\t"
269           "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
270           "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
271           "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
272           "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
273           "ulw              %[qload3],    8(%[src])                    \n\t"
274           "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* even 1 */
275           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
276 
277           /* even 2. pixel */
278           "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
279           "mthi             $zero,        $ac3                         \n\t"
280           "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
281           "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
282           "ulw              %[qload1],    12(%[src])                   \n\t"
283           "dpa.w.ph         $ac2,         %[p2],          %[filter45]  \n\t" /* even 1 */
284           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
285           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
286 
287           /* even 3. pixel */
288           "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
289           "mthi             $zero,        $ac1                         \n\t"
290           "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
291           "sb               %[st1],       0(%[dst])                    \n\t" /* even 1 */
292           "dpa.w.ph         $ac3,         %[p3],          %[filter45]  \n\t" /* even 3 */
293           "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
294           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
295 
296           /* even 4. pixel */
297           "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
298           "mthi             $zero,        $ac2                         \n\t"
299           "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
300           "sb               %[st2],       2(%[dst])                    \n\t" /* even 1 */
301           "dpa.w.ph         $ac1,         %[p4],          %[filter45]  \n\t" /* even 4 */
302           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
303           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
304 
305           /* even 5. pixel */
306           "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
307           "mthi             $zero,        $ac3                         \n\t"
308           "sb               %[st3],       4(%[dst])                    \n\t" /* even 3 */
309           "dpa.w.ph         $ac2,         %[p1],          %[filter45]  \n\t" /* even 5 */
310           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
311           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
312 
313           /* even 6. pixel */
314           "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
315           "mthi             $zero,        $ac1                         \n\t"
316           "sb               %[st1],       6(%[dst])                    \n\t" /* even 4 */
317           "dpa.w.ph         $ac3,         %[p5],          %[filter45]  \n\t" /* even 6 */
318           "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
319           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
320 
321           /* even 7. pixel */
322           "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
323           "mthi             $zero,        $ac2                         \n\t"
324           "sb               %[st2],       8(%[dst])                    \n\t" /* even 5 */
325           "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* even 7 */
326           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
327           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
328 
329           /* even 8. pixel */
330           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
331           "mthi             $zero,        $ac3                         \n\t"
332           "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* even 8 */
333           "sb               %[st3],       10(%[dst])                   \n\t" /* even 6 */
334           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
335           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
336 
337           /* ODD pixels */
338           "ulw              %[qload1],    1(%[src])                    \n\t"
339           "ulw              %[qload2],    5(%[src])                    \n\t"
340 
341           /* odd 1. pixel */
342           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
343           "mthi             $zero,        $ac1                         \n\t"
344           "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
345           "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
346           "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
347           "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
348           "sb               %[st1],       12(%[dst])                   \n\t" /* even 7 */
349           "ulw              %[qload3],    9(%[src])                    \n\t"
350           "dpa.w.ph         $ac3,         %[p1],          %[filter45]  \n\t" /* odd 1 */
351           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
352           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
353 
354           /* odd 2. pixel */
355           "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
356           "mthi             $zero,        $ac2                         \n\t"
357           "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
358           "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
359           "sb               %[st2],       14(%[dst])                   \n\t" /* even 8 */
360           "ulw              %[qload1],    13(%[src])                   \n\t"
361           "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* odd 2 */
362           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
363           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
364 
365           /* odd 3. pixel */
366           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
367           "mthi             $zero,        $ac3                         \n\t"
368           "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
369           "sb               %[st3],       1(%[dst])                    \n\t" /* odd 1 */
370           "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* odd 3 */
371           "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
372           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
373 
374           /* odd 4. pixel */
375           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
376           "mthi             $zero,        $ac1                         \n\t"
377           "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
378           "sb               %[st1],       3(%[dst])                    \n\t" /* odd 2 */
379           "dpa.w.ph         $ac3,         %[p4],          %[filter45]  \n\t" /* odd 4 */
380           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
381           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
382 
383           /* odd 5. pixel */
384           "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
385           "mthi             $zero,        $ac2                         \n\t"
386           "sb               %[st2],       5(%[dst])                    \n\t" /* odd 3 */
387           "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* odd 5 */
388           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
389           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
390 
391           /* odd 6. pixel */
392           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
393           "mthi             $zero,        $ac3                         \n\t"
394           "sb               %[st3],       7(%[dst])                    \n\t" /* odd 4 */
395           "dpa.w.ph         $ac2,         %[p5],          %[filter45]  \n\t" /* odd 6 */
396           "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
397           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
398 
399           /* odd 7. pixel */
400           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
401           "mthi             $zero,        $ac1                         \n\t"
402           "sb               %[st1],       9(%[dst])                    \n\t" /* odd 5 */
403           "dpa.w.ph         $ac3,         %[p2],          %[filter45]  \n\t" /* odd 7 */
404           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
405 
406           /* odd 8. pixel */
407           "dpa.w.ph         $ac1,         %[p3],          %[filter45]  \n\t" /* odd 8 */
408           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
409 
410           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
411           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
412           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
413 
414           "sb               %[st2],       11(%[dst])                   \n\t" /* odd 6 */
415           "sb               %[st3],       13(%[dst])                   \n\t" /* odd 7 */
416           "sb               %[st1],       15(%[dst])                   \n\t" /* odd 8 */
417 
418           : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3),
419             [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
420             [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
421             [p5] "=&r" (p5),
422             [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
423           : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
424             [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
425       );
426 
427       src += 16;
428       dst += 16;
429     }
430 
431     /* Next row... */
432     src_ptr += src_stride;
433     dst_ptr += dst_stride;
434   }
435 }
436 
convolve_bi_horiz_64_dspr2(const uint8_t * src_ptr,int32_t src_stride,uint8_t * dst_ptr,int32_t dst_stride,const int16_t * filter_x0,int32_t h)437 static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr,
438                                        int32_t src_stride,
439                                        uint8_t *dst_ptr,
440                                        int32_t dst_stride,
441                                        const int16_t *filter_x0,
442                                        int32_t h) {
443   int32_t y, c;
444   const uint8_t *src;
445   uint8_t *dst;
446   uint8_t *cm = vp9_ff_cropTbl;
447   uint32_t vector_64 = 64;
448   int32_t Temp1, Temp2, Temp3;
449   uint32_t qload1, qload2, qload3;
450   uint32_t p1, p2, p3, p4, p5;
451   uint32_t st1, st2, st3;
452   const int16_t *filter = &filter_x0[3];
453   uint32_t filter45;;
454 
455   filter45 = ((const int32_t *)filter)[0];
456 
457   for (y = h; y--;) {
458     src = src_ptr;
459     dst = dst_ptr;
460 
461     /* prefetch data to cache memory */
462     vp9_prefetch_load(src_ptr + src_stride);
463     vp9_prefetch_load(src_ptr + src_stride + 32);
464     vp9_prefetch_load(src_ptr + src_stride + 64);
465     vp9_prefetch_store(dst_ptr + dst_stride);
466     vp9_prefetch_store(dst_ptr + dst_stride + 32);
467 
468     for (c = 0; c < 4; c++) {
469       __asm__ __volatile__ (
470           "ulw              %[qload1],    0(%[src])                    \n\t"
471           "ulw              %[qload2],    4(%[src])                    \n\t"
472 
473           /* even 1. pixel */
474           "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
475           "mthi             $zero,        $ac1                         \n\t"
476           "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
477           "mthi             $zero,        $ac2                         \n\t"
478           "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
479           "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
480           "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
481           "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
482           "ulw              %[qload3],    8(%[src])                    \n\t"
483           "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* even 1 */
484           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
485 
486           /* even 2. pixel */
487           "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
488           "mthi             $zero,        $ac3                         \n\t"
489           "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
490           "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
491           "ulw              %[qload1],    12(%[src])                   \n\t"
492           "dpa.w.ph         $ac2,         %[p2],          %[filter45]  \n\t" /* even 1 */
493           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
494           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
495 
496           /* even 3. pixel */
497           "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
498           "mthi             $zero,        $ac1                         \n\t"
499           "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
500           "sb               %[st1],       0(%[dst])                    \n\t" /* even 1 */
501           "dpa.w.ph         $ac3,         %[p3],          %[filter45]  \n\t" /* even 3 */
502           "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
503           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
504 
505           /* even 4. pixel */
506           "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
507           "mthi             $zero,        $ac2                         \n\t"
508           "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
509           "sb               %[st2],       2(%[dst])                    \n\t" /* even 1 */
510           "dpa.w.ph         $ac1,         %[p4],          %[filter45]  \n\t" /* even 4 */
511           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
512           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
513 
514           /* even 5. pixel */
515           "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
516           "mthi             $zero,        $ac3                         \n\t"
517           "sb               %[st3],       4(%[dst])                    \n\t" /* even 3 */
518           "dpa.w.ph         $ac2,         %[p1],          %[filter45]  \n\t" /* even 5 */
519           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
520           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
521 
522           /* even 6. pixel */
523           "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
524           "mthi             $zero,        $ac1                         \n\t"
525           "sb               %[st1],       6(%[dst])                    \n\t" /* even 4 */
526           "dpa.w.ph         $ac3,         %[p5],          %[filter45]  \n\t" /* even 6 */
527           "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
528           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
529 
530           /* even 7. pixel */
531           "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
532           "mthi             $zero,        $ac2                         \n\t"
533           "sb               %[st2],       8(%[dst])                    \n\t" /* even 5 */
534           "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* even 7 */
535           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
536           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
537 
538           /* even 8. pixel */
539           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
540           "mthi             $zero,        $ac3                         \n\t"
541           "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* even 8 */
542           "sb               %[st3],       10(%[dst])                   \n\t" /* even 6 */
543           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
544           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
545 
546           /* ODD pixels */
547           "ulw              %[qload1],    1(%[src])                    \n\t"
548           "ulw              %[qload2],    5(%[src])                    \n\t"
549 
550           /* odd 1. pixel */
551           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
552           "mthi             $zero,        $ac1                         \n\t"
553           "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
554           "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
555           "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
556           "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
557           "sb               %[st1],       12(%[dst])                   \n\t" /* even 7 */
558           "ulw              %[qload3],    9(%[src])                    \n\t"
559           "dpa.w.ph         $ac3,         %[p1],          %[filter45]  \n\t" /* odd 1 */
560           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
561           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
562 
563           /* odd 2. pixel */
564           "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
565           "mthi             $zero,        $ac2                         \n\t"
566           "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
567           "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
568           "sb               %[st2],       14(%[dst])                   \n\t" /* even 8 */
569           "ulw              %[qload1],    13(%[src])                   \n\t"
570           "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* odd 2 */
571           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
572           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
573 
574           /* odd 3. pixel */
575           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
576           "mthi             $zero,        $ac3                         \n\t"
577           "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
578           "sb               %[st3],       1(%[dst])                    \n\t" /* odd 1 */
579           "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* odd 3 */
580           "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
581           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
582 
583           /* odd 4. pixel */
584           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
585           "mthi             $zero,        $ac1                         \n\t"
586           "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
587           "sb               %[st1],       3(%[dst])                    \n\t" /* odd 2 */
588           "dpa.w.ph         $ac3,         %[p4],          %[filter45]  \n\t" /* odd 4 */
589           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
590           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
591 
592           /* odd 5. pixel */
593           "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
594           "mthi             $zero,        $ac2                         \n\t"
595           "sb               %[st2],       5(%[dst])                    \n\t" /* odd 3 */
596           "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* odd 5 */
597           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
598           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
599 
600           /* odd 6. pixel */
601           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
602           "mthi             $zero,        $ac3                         \n\t"
603           "sb               %[st3],       7(%[dst])                    \n\t" /* odd 4 */
604           "dpa.w.ph         $ac2,         %[p5],          %[filter45]  \n\t" /* odd 6 */
605           "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
606           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
607 
608           /* odd 7. pixel */
609           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
610           "mthi             $zero,        $ac1                         \n\t"
611           "sb               %[st1],       9(%[dst])                    \n\t" /* odd 5 */
612           "dpa.w.ph         $ac3,         %[p2],          %[filter45]  \n\t" /* odd 7 */
613           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
614 
615           /* odd 8. pixel */
616           "dpa.w.ph         $ac1,         %[p3],          %[filter45]  \n\t" /* odd 8 */
617           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
618 
619           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
620           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
621           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
622 
623           "sb               %[st2],       11(%[dst])                   \n\t" /* odd 6 */
624           "sb               %[st3],       13(%[dst])                   \n\t" /* odd 7 */
625           "sb               %[st1],       15(%[dst])                   \n\t" /* odd 8 */
626 
627           : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3),
628             [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
629             [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
630             [p5] "=&r" (p5),
631             [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
632           : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
633             [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
634       );
635 
636       src += 16;
637       dst += 16;
638     }
639 
640     /* Next row... */
641     src_ptr += src_stride;
642     dst_ptr += dst_stride;
643   }
644 }
645 
vp9_convolve2_horiz_dspr2(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter_x,int x_step_q4,const int16_t * filter_y,int y_step_q4,int w,int h)646 void vp9_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
647                                uint8_t *dst, ptrdiff_t dst_stride,
648                                const int16_t *filter_x, int x_step_q4,
649                                const int16_t *filter_y, int y_step_q4,
650                                int w, int h) {
651   if (16 == x_step_q4) {
652     uint32_t pos = 38;
653 
654     vp9_prefetch_load((const uint8_t *)filter_x);
655 
656     /* bit positon for extract from acc */
657     __asm__ __volatile__ (
658       "wrdsp      %[pos],     1           \n\t"
659       :
660       : [pos] "r" (pos)
661     );
662 
663     /* prefetch data to cache memory */
664     vp9_prefetch_load(src);
665     vp9_prefetch_load(src + 32);
666     vp9_prefetch_store(dst);
667 
668     switch (w) {
669       case 4:
670         convolve_bi_horiz_4_dspr2(src, (int32_t)src_stride,
671                                   dst, (int32_t)dst_stride,
672                                   filter_x, (int32_t)h);
673         break;
674       case 8:
675         convolve_bi_horiz_8_dspr2(src, (int32_t)src_stride,
676                                   dst, (int32_t)dst_stride,
677                                   filter_x, (int32_t)h);
678         break;
679       case 16:
680         convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride,
681                                    dst, (int32_t)dst_stride,
682                                    filter_x, (int32_t)h, 1);
683         break;
684       case 32:
685         convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride,
686                                    dst, (int32_t)dst_stride,
687                                    filter_x, (int32_t)h, 2);
688         break;
689       case 64:
690         vp9_prefetch_load(src + 64);
691         vp9_prefetch_store(dst + 32);
692 
693         convolve_bi_horiz_64_dspr2(src, (int32_t)src_stride,
694                                    dst, (int32_t)dst_stride,
695                                    filter_x, (int32_t)h);
696         break;
697       default:
698         vp9_convolve8_horiz_c(src, src_stride,
699                               dst, dst_stride,
700                               filter_x, x_step_q4,
701                               filter_y, y_step_q4,
702                               w, h);
703         break;
704     }
705   } else {
706     vp9_convolve8_horiz_c(src, src_stride,
707                           dst, dst_stride,
708                           filter_x, x_step_q4,
709                           filter_y, y_step_q4,
710                           w, h);
711   }
712 }
713 #endif
714