1 /*
2  *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
#include <assert.h>
#include <stdio.h>
#include <string.h>

#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/convolve_common_dspr2.h"
#include "vpx_dsp/vpx_convolve.h"
#include "vpx_dsp/vpx_dsp_common.h"
#include "vpx_ports/mem.h"
19 
20 #if HAVE_DSPR2
convolve_bi_horiz_4_dspr2(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int16_t * filter_x0,int32_t h)21 static void convolve_bi_horiz_4_dspr2(const uint8_t *src,
22                                       int32_t src_stride,
23                                       uint8_t *dst,
24                                       int32_t dst_stride,
25                                       const int16_t *filter_x0,
26                                       int32_t h) {
27   int32_t y;
28   uint8_t *cm = vpx_ff_cropTbl;
29   int32_t Temp1, Temp2, Temp3, Temp4;
30   uint32_t vector4a = 64;
31   uint32_t tp1, tp2;
32   uint32_t p1, p2;
33   const int16_t *filter = &filter_x0[3];
34   uint32_t filter45;;
35 
36   filter45 = ((const int32_t *)filter)[0];
37 
38   for (y = h; y--;) {
39     /* prefetch data to cache memory */
40     prefetch_load(src + src_stride);
41     prefetch_load(src + src_stride + 32);
42     prefetch_store(dst + dst_stride);
43 
44     __asm__ __volatile__ (
45         "ulw              %[tp1],      0(%[src])                      \n\t"
46         "ulw              %[tp2],      4(%[src])                      \n\t"
47 
48         /* even 1. pixel */
49         "mtlo             %[vector4a], $ac3                           \n\t"
50         "mthi             $zero,       $ac3                           \n\t"
51         "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
52         "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
53         "dpa.w.ph         $ac3,        %[p1],          %[filter45]    \n\t"
54         "extp             %[Temp1],    $ac3,           31             \n\t"
55 
56         /* even 2. pixel */
57         "mtlo             %[vector4a], $ac2                           \n\t"
58         "mthi             $zero,       $ac2                           \n\t"
59         "balign           %[tp2],      %[tp1],         3              \n\t"
60         "dpa.w.ph         $ac2,        %[p2],          %[filter45]    \n\t"
61         "extp             %[Temp3],    $ac2,           31             \n\t"
62 
63         /* odd 1. pixel */
64         "lbux             %[tp1],      %[Temp1](%[cm])                \n\t"
65         "mtlo             %[vector4a], $ac3                           \n\t"
66         "mthi             $zero,       $ac3                           \n\t"
67         "preceu.ph.qbr    %[p1],       %[tp2]                         \n\t"
68         "preceu.ph.qbl    %[p2],       %[tp2]                         \n\t"
69         "dpa.w.ph         $ac3,        %[p1],          %[filter45]    \n\t"
70         "extp             %[Temp2],    $ac3,           31             \n\t"
71 
72         /* odd 2. pixel */
73         "lbux             %[tp2],      %[Temp3](%[cm])                \n\t"
74         "mtlo             %[vector4a], $ac2                           \n\t"
75         "mthi             $zero,       $ac2                           \n\t"
76         "dpa.w.ph         $ac2,        %[p2],          %[filter45]    \n\t"
77         "extp             %[Temp4],    $ac2,           31             \n\t"
78 
79         /* clamp */
80         "lbux             %[p1],       %[Temp2](%[cm])                \n\t"
81         "lbux             %[p2],       %[Temp4](%[cm])                \n\t"
82 
83         /* store bytes */
84         "sb               %[tp1],      0(%[dst])                      \n\t"
85         "sb               %[p1],       1(%[dst])                      \n\t"
86         "sb               %[tp2],      2(%[dst])                      \n\t"
87         "sb               %[p2],       3(%[dst])                      \n\t"
88 
89         : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
90           [p1] "=&r" (p1), [p2] "=&r" (p2),
91           [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
92           [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
93         : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
94           [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
95     );
96 
97     /* Next row... */
98     src += src_stride;
99     dst += dst_stride;
100   }
101 }
102 
convolve_bi_horiz_8_dspr2(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int16_t * filter_x0,int32_t h)103 static void convolve_bi_horiz_8_dspr2(const uint8_t *src,
104                                       int32_t src_stride,
105                                       uint8_t *dst,
106                                       int32_t dst_stride,
107                                       const int16_t *filter_x0,
108                                       int32_t h) {
109   int32_t y;
110   uint8_t *cm = vpx_ff_cropTbl;
111   uint32_t vector4a = 64;
112   int32_t Temp1, Temp2, Temp3;
113   uint32_t tp1, tp2, tp3;
114   uint32_t p1, p2, p3, p4;
115   uint32_t st0, st1;
116   const int16_t *filter = &filter_x0[3];
117   uint32_t filter45;;
118 
119   filter45 = ((const int32_t *)filter)[0];
120 
121   for (y = h; y--;) {
122     /* prefetch data to cache memory */
123     prefetch_load(src + src_stride);
124     prefetch_load(src + src_stride + 32);
125     prefetch_store(dst + dst_stride);
126 
127     __asm__ __volatile__ (
128         "ulw              %[tp1],      0(%[src])                      \n\t"
129         "ulw              %[tp2],      4(%[src])                      \n\t"
130 
131         /* even 1. pixel */
132         "mtlo             %[vector4a], $ac3                           \n\t"
133         "mthi             $zero,       $ac3                           \n\t"
134         "mtlo             %[vector4a], $ac2                           \n\t"
135         "mthi             $zero,       $ac2                           \n\t"
136         "preceu.ph.qbr    %[p1],       %[tp1]                         \n\t"
137         "preceu.ph.qbl    %[p2],       %[tp1]                         \n\t"
138         "preceu.ph.qbr    %[p3],       %[tp2]                         \n\t"
139         "preceu.ph.qbl    %[p4],       %[tp2]                         \n\t"
140         "ulw              %[tp3],      8(%[src])                      \n\t"
141         "dpa.w.ph         $ac3,        %[p1],          %[filter45]    \n\t"
142         "extp             %[Temp1],    $ac3,           31             \n\t"
143 
144         /* even 2. pixel */
145         "dpa.w.ph         $ac2,        %[p2],          %[filter45]    \n\t"
146         "extp             %[Temp3],    $ac2,           31             \n\t"
147 
148         /* even 3. pixel */
149         "lbux             %[st0],      %[Temp1](%[cm])                \n\t"
150         "mtlo             %[vector4a], $ac1                           \n\t"
151         "mthi             $zero,       $ac1                           \n\t"
152         "dpa.w.ph         $ac1,        %[p3],          %[filter45]    \n\t"
153         "extp             %[Temp1],    $ac1,           31             \n\t"
154 
155         /* even 4. pixel */
156         "mtlo             %[vector4a], $ac2                           \n\t"
157         "mthi             $zero,       $ac2                           \n\t"
158         "mtlo             %[vector4a], $ac3                           \n\t"
159         "mthi             $zero,       $ac3                           \n\t"
160         "sb               %[st0],      0(%[dst])                      \n\t"
161         "lbux             %[st1],      %[Temp3](%[cm])                \n\t"
162 
163         "balign           %[tp3],      %[tp2],         3              \n\t"
164         "balign           %[tp2],      %[tp1],         3              \n\t"
165 
166         "dpa.w.ph         $ac2,        %[p4],          %[filter45]    \n\t"
167         "extp             %[Temp3],    $ac2,           31             \n\t"
168 
169         "lbux             %[st0],      %[Temp1](%[cm])                \n\t"
170 
171         /* odd 1. pixel */
172         "mtlo             %[vector4a], $ac1                           \n\t"
173         "mthi             $zero,       $ac1                           \n\t"
174         "sb               %[st1],      2(%[dst])                      \n\t"
175         "preceu.ph.qbr    %[p1],       %[tp2]                         \n\t"
176         "preceu.ph.qbl    %[p2],       %[tp2]                         \n\t"
177         "preceu.ph.qbr    %[p3],       %[tp3]                         \n\t"
178         "preceu.ph.qbl    %[p4],       %[tp3]                         \n\t"
179         "sb               %[st0],      4(%[dst])                      \n\t"
180         "dpa.w.ph         $ac3,        %[p1],          %[filter45]    \n\t"
181         "extp             %[Temp2],    $ac3,           31             \n\t"
182 
183         /* odd 2. pixel */
184         "mtlo             %[vector4a], $ac3                           \n\t"
185         "mthi             $zero,       $ac3                           \n\t"
186         "mtlo             %[vector4a], $ac2                           \n\t"
187         "mthi             $zero,       $ac2                           \n\t"
188         "lbux             %[st0],      %[Temp3](%[cm])                \n\t"
189         "dpa.w.ph         $ac1,        %[p2],          %[filter45]    \n\t"
190         "extp             %[Temp3],    $ac1,           31             \n\t"
191 
192         /* odd 3. pixel */
193         "lbux             %[st1],      %[Temp2](%[cm])                \n\t"
194         "dpa.w.ph         $ac3,        %[p3],          %[filter45]    \n\t"
195         "extp             %[Temp2],    $ac3,           31             \n\t"
196 
197         /* odd 4. pixel */
198         "sb               %[st1],      1(%[dst])                      \n\t"
199         "sb               %[st0],      6(%[dst])                      \n\t"
200         "dpa.w.ph         $ac2,        %[p4],          %[filter45]    \n\t"
201         "extp             %[Temp1],    $ac2,           31             \n\t"
202 
203         /* clamp */
204         "lbux             %[p4],       %[Temp3](%[cm])                \n\t"
205         "lbux             %[p2],       %[Temp2](%[cm])                \n\t"
206         "lbux             %[p1],       %[Temp1](%[cm])                \n\t"
207 
208         /* store bytes */
209         "sb               %[p4],       3(%[dst])                      \n\t"
210         "sb               %[p2],       5(%[dst])                      \n\t"
211         "sb               %[p1],       7(%[dst])                      \n\t"
212 
213         : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3),
214           [st0] "=&r" (st0), [st1] "=&r" (st1),
215           [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
216           [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
217         : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
218           [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
219     );
220 
221     /* Next row... */
222     src += src_stride;
223     dst += dst_stride;
224   }
225 }
226 
convolve_bi_horiz_16_dspr2(const uint8_t * src_ptr,int32_t src_stride,uint8_t * dst_ptr,int32_t dst_stride,const int16_t * filter_x0,int32_t h,int32_t count)227 static void convolve_bi_horiz_16_dspr2(const uint8_t *src_ptr,
228                                        int32_t src_stride,
229                                        uint8_t *dst_ptr,
230                                        int32_t dst_stride,
231                                        const int16_t *filter_x0,
232                                        int32_t h,
233                                        int32_t count) {
234   int32_t y, c;
235   const uint8_t *src;
236   uint8_t *dst;
237   uint8_t *cm = vpx_ff_cropTbl;
238   uint32_t vector_64 = 64;
239   int32_t Temp1, Temp2, Temp3;
240   uint32_t qload1, qload2, qload3;
241   uint32_t p1, p2, p3, p4, p5;
242   uint32_t st1, st2, st3;
243   const int16_t *filter = &filter_x0[3];
244   uint32_t filter45;;
245 
246   filter45 = ((const int32_t *)filter)[0];
247 
248   for (y = h; y--;) {
249     src = src_ptr;
250     dst = dst_ptr;
251 
252     /* prefetch data to cache memory */
253     prefetch_load(src_ptr + src_stride);
254     prefetch_load(src_ptr + src_stride + 32);
255     prefetch_store(dst_ptr + dst_stride);
256 
257     for (c = 0; c < count; c++) {
258       __asm__ __volatile__ (
259           "ulw              %[qload1],    0(%[src])                    \n\t"
260           "ulw              %[qload2],    4(%[src])                    \n\t"
261 
262           /* even 1. pixel */
263           "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
264           "mthi             $zero,        $ac1                         \n\t"
265           "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
266           "mthi             $zero,        $ac2                         \n\t"
267           "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
268           "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
269           "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
270           "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
271           "ulw              %[qload3],    8(%[src])                    \n\t"
272           "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* even 1 */
273           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
274 
275           /* even 2. pixel */
276           "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
277           "mthi             $zero,        $ac3                         \n\t"
278           "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
279           "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
280           "ulw              %[qload1],    12(%[src])                   \n\t"
281           "dpa.w.ph         $ac2,         %[p2],          %[filter45]  \n\t" /* even 1 */
282           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
283           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
284 
285           /* even 3. pixel */
286           "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
287           "mthi             $zero,        $ac1                         \n\t"
288           "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
289           "sb               %[st1],       0(%[dst])                    \n\t" /* even 1 */
290           "dpa.w.ph         $ac3,         %[p3],          %[filter45]  \n\t" /* even 3 */
291           "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
292           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
293 
294           /* even 4. pixel */
295           "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
296           "mthi             $zero,        $ac2                         \n\t"
297           "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
298           "sb               %[st2],       2(%[dst])                    \n\t" /* even 1 */
299           "dpa.w.ph         $ac1,         %[p4],          %[filter45]  \n\t" /* even 4 */
300           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
301           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
302 
303           /* even 5. pixel */
304           "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
305           "mthi             $zero,        $ac3                         \n\t"
306           "sb               %[st3],       4(%[dst])                    \n\t" /* even 3 */
307           "dpa.w.ph         $ac2,         %[p1],          %[filter45]  \n\t" /* even 5 */
308           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
309           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
310 
311           /* even 6. pixel */
312           "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
313           "mthi             $zero,        $ac1                         \n\t"
314           "sb               %[st1],       6(%[dst])                    \n\t" /* even 4 */
315           "dpa.w.ph         $ac3,         %[p5],          %[filter45]  \n\t" /* even 6 */
316           "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
317           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
318 
319           /* even 7. pixel */
320           "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
321           "mthi             $zero,        $ac2                         \n\t"
322           "sb               %[st2],       8(%[dst])                    \n\t" /* even 5 */
323           "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* even 7 */
324           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
325           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
326 
327           /* even 8. pixel */
328           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
329           "mthi             $zero,        $ac3                         \n\t"
330           "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* even 8 */
331           "sb               %[st3],       10(%[dst])                   \n\t" /* even 6 */
332           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
333           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
334 
335           /* ODD pixels */
336           "ulw              %[qload1],    1(%[src])                    \n\t"
337           "ulw              %[qload2],    5(%[src])                    \n\t"
338 
339           /* odd 1. pixel */
340           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
341           "mthi             $zero,        $ac1                         \n\t"
342           "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
343           "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
344           "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
345           "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
346           "sb               %[st1],       12(%[dst])                   \n\t" /* even 7 */
347           "ulw              %[qload3],    9(%[src])                    \n\t"
348           "dpa.w.ph         $ac3,         %[p1],          %[filter45]  \n\t" /* odd 1 */
349           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
350           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
351 
352           /* odd 2. pixel */
353           "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
354           "mthi             $zero,        $ac2                         \n\t"
355           "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
356           "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
357           "sb               %[st2],       14(%[dst])                   \n\t" /* even 8 */
358           "ulw              %[qload1],    13(%[src])                   \n\t"
359           "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* odd 2 */
360           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
361           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
362 
363           /* odd 3. pixel */
364           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
365           "mthi             $zero,        $ac3                         \n\t"
366           "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
367           "sb               %[st3],       1(%[dst])                    \n\t" /* odd 1 */
368           "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* odd 3 */
369           "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
370           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
371 
372           /* odd 4. pixel */
373           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
374           "mthi             $zero,        $ac1                         \n\t"
375           "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
376           "sb               %[st1],       3(%[dst])                    \n\t" /* odd 2 */
377           "dpa.w.ph         $ac3,         %[p4],          %[filter45]  \n\t" /* odd 4 */
378           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
379           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
380 
381           /* odd 5. pixel */
382           "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
383           "mthi             $zero,        $ac2                         \n\t"
384           "sb               %[st2],       5(%[dst])                    \n\t" /* odd 3 */
385           "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* odd 5 */
386           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
387           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
388 
389           /* odd 6. pixel */
390           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
391           "mthi             $zero,        $ac3                         \n\t"
392           "sb               %[st3],       7(%[dst])                    \n\t" /* odd 4 */
393           "dpa.w.ph         $ac2,         %[p5],          %[filter45]  \n\t" /* odd 6 */
394           "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
395           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
396 
397           /* odd 7. pixel */
398           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
399           "mthi             $zero,        $ac1                         \n\t"
400           "sb               %[st1],       9(%[dst])                    \n\t" /* odd 5 */
401           "dpa.w.ph         $ac3,         %[p2],          %[filter45]  \n\t" /* odd 7 */
402           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
403 
404           /* odd 8. pixel */
405           "dpa.w.ph         $ac1,         %[p3],          %[filter45]  \n\t" /* odd 8 */
406           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
407 
408           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
409           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
410           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
411 
412           "sb               %[st2],       11(%[dst])                   \n\t" /* odd 6 */
413           "sb               %[st3],       13(%[dst])                   \n\t" /* odd 7 */
414           "sb               %[st1],       15(%[dst])                   \n\t" /* odd 8 */
415 
416           : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3),
417             [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
418             [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
419             [p5] "=&r" (p5),
420             [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
421           : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
422             [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
423       );
424 
425       src += 16;
426       dst += 16;
427     }
428 
429     /* Next row... */
430     src_ptr += src_stride;
431     dst_ptr += dst_stride;
432   }
433 }
434 
convolve_bi_horiz_64_dspr2(const uint8_t * src_ptr,int32_t src_stride,uint8_t * dst_ptr,int32_t dst_stride,const int16_t * filter_x0,int32_t h)435 static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr,
436                                        int32_t src_stride,
437                                        uint8_t *dst_ptr,
438                                        int32_t dst_stride,
439                                        const int16_t *filter_x0,
440                                        int32_t h) {
441   int32_t y, c;
442   const uint8_t *src;
443   uint8_t *dst;
444   uint8_t *cm = vpx_ff_cropTbl;
445   uint32_t vector_64 = 64;
446   int32_t Temp1, Temp2, Temp3;
447   uint32_t qload1, qload2, qload3;
448   uint32_t p1, p2, p3, p4, p5;
449   uint32_t st1, st2, st3;
450   const int16_t *filter = &filter_x0[3];
451   uint32_t filter45;;
452 
453   filter45 = ((const int32_t *)filter)[0];
454 
455   for (y = h; y--;) {
456     src = src_ptr;
457     dst = dst_ptr;
458 
459     /* prefetch data to cache memory */
460     prefetch_load(src_ptr + src_stride);
461     prefetch_load(src_ptr + src_stride + 32);
462     prefetch_load(src_ptr + src_stride + 64);
463     prefetch_store(dst_ptr + dst_stride);
464     prefetch_store(dst_ptr + dst_stride + 32);
465 
466     for (c = 0; c < 4; c++) {
467       __asm__ __volatile__ (
468           "ulw              %[qload1],    0(%[src])                    \n\t"
469           "ulw              %[qload2],    4(%[src])                    \n\t"
470 
471           /* even 1. pixel */
472           "mtlo             %[vector_64], $ac1                         \n\t" /* even 1 */
473           "mthi             $zero,        $ac1                         \n\t"
474           "mtlo             %[vector_64], $ac2                         \n\t" /* even 2 */
475           "mthi             $zero,        $ac2                         \n\t"
476           "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
477           "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
478           "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
479           "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
480           "ulw              %[qload3],    8(%[src])                    \n\t"
481           "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* even 1 */
482           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 1 */
483 
484           /* even 2. pixel */
485           "mtlo             %[vector_64], $ac3                         \n\t" /* even 3 */
486           "mthi             $zero,        $ac3                         \n\t"
487           "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
488           "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
489           "ulw              %[qload1],    12(%[src])                   \n\t"
490           "dpa.w.ph         $ac2,         %[p2],          %[filter45]  \n\t" /* even 1 */
491           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 1 */
492           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 1 */
493 
494           /* even 3. pixel */
495           "mtlo             %[vector_64], $ac1                         \n\t" /* even 4 */
496           "mthi             $zero,        $ac1                         \n\t"
497           "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
498           "sb               %[st1],       0(%[dst])                    \n\t" /* even 1 */
499           "dpa.w.ph         $ac3,         %[p3],          %[filter45]  \n\t" /* even 3 */
500           "extp             %[Temp3],     $ac3,           31           \n\t" /* even 3 */
501           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 1 */
502 
503           /* even 4. pixel */
504           "mtlo             %[vector_64], $ac2                         \n\t" /* even 5 */
505           "mthi             $zero,        $ac2                         \n\t"
506           "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
507           "sb               %[st2],       2(%[dst])                    \n\t" /* even 1 */
508           "dpa.w.ph         $ac1,         %[p4],          %[filter45]  \n\t" /* even 4 */
509           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 4 */
510           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 3 */
511 
512           /* even 5. pixel */
513           "mtlo             %[vector_64], $ac3                         \n\t" /* even 6 */
514           "mthi             $zero,        $ac3                         \n\t"
515           "sb               %[st3],       4(%[dst])                    \n\t" /* even 3 */
516           "dpa.w.ph         $ac2,         %[p1],          %[filter45]  \n\t" /* even 5 */
517           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 5 */
518           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 4 */
519 
520           /* even 6. pixel */
521           "mtlo             %[vector_64], $ac1                         \n\t" /* even 7 */
522           "mthi             $zero,        $ac1                         \n\t"
523           "sb               %[st1],       6(%[dst])                    \n\t" /* even 4 */
524           "dpa.w.ph         $ac3,         %[p5],          %[filter45]  \n\t" /* even 6 */
525           "extp             %[Temp3],     $ac3,           31           \n\t" /* even 6 */
526           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 5 */
527 
528           /* even 7. pixel */
529           "mtlo             %[vector_64], $ac2                         \n\t" /* even 8 */
530           "mthi             $zero,        $ac2                         \n\t"
531           "sb               %[st2],       8(%[dst])                    \n\t" /* even 5 */
532           "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* even 7 */
533           "extp             %[Temp1],     $ac1,           31           \n\t" /* even 7 */
534           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* even 6 */
535 
536           /* even 8. pixel */
537           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 1 */
538           "mthi             $zero,        $ac3                         \n\t"
539           "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* even 8 */
540           "sb               %[st3],       10(%[dst])                   \n\t" /* even 6 */
541           "extp             %[Temp2],     $ac2,           31           \n\t" /* even 8 */
542           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* even 7 */
543 
544           /* ODD pixels */
545           "ulw              %[qload1],    1(%[src])                    \n\t"
546           "ulw              %[qload2],    5(%[src])                    \n\t"
547 
548           /* odd 1. pixel */
549           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 2 */
550           "mthi             $zero,        $ac1                         \n\t"
551           "preceu.ph.qbr    %[p1],        %[qload1]                    \n\t"
552           "preceu.ph.qbl    %[p2],        %[qload1]                    \n\t"
553           "preceu.ph.qbr    %[p3],        %[qload2]                    \n\t"
554           "preceu.ph.qbl    %[p4],        %[qload2]                    \n\t"
555           "sb               %[st1],       12(%[dst])                   \n\t" /* even 7 */
556           "ulw              %[qload3],    9(%[src])                    \n\t"
557           "dpa.w.ph         $ac3,         %[p1],          %[filter45]  \n\t" /* odd 1 */
558           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 1 */
559           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* even 8 */
560 
561           /* odd 2. pixel */
562           "mtlo             %[vector_64], $ac2                         \n\t" /* odd 3 */
563           "mthi             $zero,        $ac2                         \n\t"
564           "preceu.ph.qbr    %[p1],        %[qload3]                    \n\t"
565           "preceu.ph.qbl    %[p5],        %[qload3]                    \n\t"
566           "sb               %[st2],       14(%[dst])                   \n\t" /* even 8 */
567           "ulw              %[qload1],    13(%[src])                   \n\t"
568           "dpa.w.ph         $ac1,         %[p2],          %[filter45]  \n\t" /* odd 2 */
569           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 2 */
570           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 1 */
571 
572           /* odd 3. pixel */
573           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 4 */
574           "mthi             $zero,        $ac3                         \n\t"
575           "preceu.ph.qbr    %[p2],        %[qload1]                    \n\t"
576           "sb               %[st3],       1(%[dst])                    \n\t" /* odd 1 */
577           "dpa.w.ph         $ac2,         %[p3],          %[filter45]  \n\t" /* odd 3 */
578           "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 3 */
579           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 2 */
580 
581           /* odd 4. pixel */
582           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 5 */
583           "mthi             $zero,        $ac1                         \n\t"
584           "preceu.ph.qbl    %[p3],        %[qload1]                    \n\t"
585           "sb               %[st1],       3(%[dst])                    \n\t" /* odd 2 */
586           "dpa.w.ph         $ac3,         %[p4],          %[filter45]  \n\t" /* odd 4 */
587           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 4 */
588           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 3 */
589 
590           /* odd 5. pixel */
591           "mtlo             %[vector_64], $ac2                         \n\t" /* odd 6 */
592           "mthi             $zero,        $ac2                         \n\t"
593           "sb               %[st2],       5(%[dst])                    \n\t" /* odd 3 */
594           "dpa.w.ph         $ac1,         %[p1],          %[filter45]  \n\t" /* odd 5 */
595           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 5 */
596           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 4 */
597 
598           /* odd 6. pixel */
599           "mtlo             %[vector_64], $ac3                         \n\t" /* odd 7 */
600           "mthi             $zero,        $ac3                         \n\t"
601           "sb               %[st3],       7(%[dst])                    \n\t" /* odd 4 */
602           "dpa.w.ph         $ac2,         %[p5],          %[filter45]  \n\t" /* odd 6 */
603           "extp             %[Temp2],     $ac2,           31           \n\t" /* odd 6 */
604           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 5 */
605 
606           /* odd 7. pixel */
607           "mtlo             %[vector_64], $ac1                         \n\t" /* odd 8 */
608           "mthi             $zero,        $ac1                         \n\t"
609           "sb               %[st1],       9(%[dst])                    \n\t" /* odd 5 */
610           "dpa.w.ph         $ac3,         %[p2],          %[filter45]  \n\t" /* odd 7 */
611           "extp             %[Temp3],     $ac3,           31           \n\t" /* odd 7 */
612 
613           /* odd 8. pixel */
614           "dpa.w.ph         $ac1,         %[p3],          %[filter45]  \n\t" /* odd 8 */
615           "extp             %[Temp1],     $ac1,           31           \n\t" /* odd 8 */
616 
617           "lbux             %[st2],       %[Temp2](%[cm])              \n\t" /* odd 6 */
618           "lbux             %[st3],       %[Temp3](%[cm])              \n\t" /* odd 7 */
619           "lbux             %[st1],       %[Temp1](%[cm])              \n\t" /* odd 8 */
620 
621           "sb               %[st2],       11(%[dst])                   \n\t" /* odd 6 */
622           "sb               %[st3],       13(%[dst])                   \n\t" /* odd 7 */
623           "sb               %[st1],       15(%[dst])                   \n\t" /* odd 8 */
624 
625           : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [qload3] "=&r" (qload3),
626             [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
627             [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
628             [p5] "=&r" (p5),
629             [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
630           : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
631             [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
632       );
633 
634       src += 16;
635       dst += 16;
636     }
637 
638     /* Next row... */
639     src_ptr += src_stride;
640     dst_ptr += dst_stride;
641   }
642 }
643 
/*
 * Horizontal "convolve2" entry point for MIPS DSPr2.
 *
 * Programs the DSP control register once, warms the cache for the first
 * source/destination rows, and then hands the work to the width-specific
 * convolve_bi_horiz_*_dspr2 helper. Widths other than 4, 8, 16, 32 and 64
 * are forwarded unchanged to vpx_convolve8_horiz_c.
 *
 * src, src_stride    source pixels and distance between source rows
 * dst, dst_stride    destination pixels and distance between output rows
 * filter_x           horizontal filter kernel handed to the helpers
 * x_step_q4          must be 16 (checked with assert only)
 * filter_y,
 * y_step_q4          not used here except when forwarded to the C fallback
 * w, h               block width and height in pixels
 *
 * Note: the strides are narrowed to int32_t before being passed to the
 * DSPr2 helpers; the C fallback receives the original ptrdiff_t values.
 */
void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int x_step_q4,
                               const int16_t *filter_y, int y_step_q4,
                               int w, int h) {
  const int32_t src_pitch = (int32_t)src_stride;
  const int32_t dst_pitch = (int32_t)dst_stride;
  const int32_t rows = (int32_t)h;
  uint32_t extract_pos = 38;

  assert(x_step_q4 == 16);

  prefetch_load((const uint8_t *)filter_x);

  /* bit position for extract from acc */
  __asm__ __volatile__ (
    "wrdsp      %[extract_pos],     1           \n\t"
    :
    : [extract_pos] "r" (extract_pos)
  );

  /* prefetch data to cache memory */
  prefetch_load(src);
  prefetch_load(src + 32);
  prefetch_store(dst);

  switch (w) {
    case 4:
      convolve_bi_horiz_4_dspr2(src, src_pitch, dst, dst_pitch,
                                filter_x, rows);
      break;

    case 8:
      convolve_bi_horiz_8_dspr2(src, src_pitch, dst, dst_pitch,
                                filter_x, rows);
      break;

    case 16:
    case 32:
      /* Last argument is 1 for w == 16 and 2 for w == 32 (presumably the
       * number of 16-pixel column groups per row; confirm against helper). */
      convolve_bi_horiz_16_dspr2(src, src_pitch, dst, dst_pitch,
                                 filter_x, rows, w / 16);
      break;

    case 64:
      /* Wide rows: also touch the next cache lines before starting. */
      prefetch_load(src + 64);
      prefetch_store(dst + 32);

      convolve_bi_horiz_64_dspr2(src, src_pitch, dst, dst_pitch,
                                 filter_x, rows);
      break;

    default:
      /* No DSPr2 kernel for this width; use the generic C implementation. */
      vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride,
                            filter_x, x_step_q4, filter_y, y_step_q4,
                            w, h);
      break;
  }
}
705 #endif
706