/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10 
11 #include <assert.h>
12 #include <stdio.h>
13 
14 #include "./vpx_dsp_rtcd.h"
15 #include "vpx_dsp/mips/convolve_common_dspr2.h"
16 #include "vpx_dsp/vpx_dsp_common.h"
17 #include "vpx_dsp/vpx_filter.h"
18 #include "vpx_ports/mem.h"
19 
20 #if HAVE_DSPR2
convolve_bi_horiz_4_transposed_dspr2(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int16_t * filter_x0,int32_t h)21 static void convolve_bi_horiz_4_transposed_dspr2(const uint8_t *src,
22                                                  int32_t src_stride,
23                                                  uint8_t *dst,
24                                                  int32_t dst_stride,
25                                                  const int16_t *filter_x0,
26                                                  int32_t h) {
27   int32_t       y;
28   uint8_t       *cm = vpx_ff_cropTbl;
29   uint8_t       *dst_ptr;
30   int32_t       Temp1, Temp2;
31   uint32_t      vector4a = 64;
32   uint32_t      tp1, tp2;
33   uint32_t      p1, p2;
34   const int16_t *filter = &filter_x0[3];
35   uint32_t      filter45;
36 
37   filter45 = ((const int32_t *)filter)[0];
38 
39   for (y = h; y--;) {
40     dst_ptr = dst;
41     /* prefetch data to cache memory */
42     prefetch_load(src + src_stride);
43     prefetch_load(src + src_stride + 32);
44 
45     __asm__ __volatile__ (
46         "ulw              %[tp1],         0(%[src])                      \n\t"
47         "ulw              %[tp2],         4(%[src])                      \n\t"
48 
49         /* even 1. pixel */
50         "mtlo             %[vector4a],    $ac3                           \n\t"
51         "mthi             $zero,          $ac3                           \n\t"
52         "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
53         "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
54         "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
55         "extp             %[Temp1],       $ac3,           31             \n\t"
56 
57         /* even 2. pixel */
58         "mtlo             %[vector4a],    $ac2                           \n\t"
59         "mthi             $zero,          $ac2                           \n\t"
60         "balign           %[tp2],         %[tp1],         3              \n\t"
61         "dpa.w.ph         $ac2,           %[p2],          %[filter45]    \n\t"
62         "extp             %[Temp2],       $ac2,           31             \n\t"
63 
64         /* odd 1. pixel */
65         "lbux             %[tp1],         %[Temp1](%[cm])                \n\t"
66         "mtlo             %[vector4a],    $ac3                           \n\t"
67         "mthi             $zero,          $ac3                           \n\t"
68         "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"
69         "preceu.ph.qbl    %[p2],          %[tp2]                         \n\t"
70         "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
71         "extp             %[Temp1],       $ac3,           31             \n\t"
72 
73         /* odd 2. pixel */
74         "lbux             %[tp2],         %[Temp2](%[cm])                \n\t"
75         "mtlo             %[vector4a],    $ac2                           \n\t"
76         "mthi             $zero,          $ac2                           \n\t"
77         "dpa.w.ph         $ac2,           %[p2],          %[filter45]    \n\t"
78         "extp             %[Temp2],       $ac2,           31             \n\t"
79 
80         /* clamp */
81         "lbux             %[p1],          %[Temp1](%[cm])                \n\t"
82         "lbux             %[p2],          %[Temp2](%[cm])                \n\t"
83 
84         /* store bytes */
85         "sb               %[tp1],         0(%[dst_ptr])                  \n\t"
86         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
87 
88         "sb               %[p1],          0(%[dst_ptr])                  \n\t"
89         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
90 
91         "sb               %[tp2],         0(%[dst_ptr])                  \n\t"
92         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
93 
94         "sb               %[p2],          0(%[dst_ptr])                  \n\t"
95         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
96 
97         : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
98           [p1] "=&r" (p1), [p2] "=&r" (p2),
99           [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
100           [dst_ptr] "+r" (dst_ptr)
101         : [filter45] "r" (filter45),[vector4a] "r" (vector4a),
102           [cm] "r" (cm), [src] "r" (src), [dst_stride] "r" (dst_stride)
103     );
104 
105     /* Next row... */
106     src += src_stride;
107     dst += 1;
108   }
109 }
110 
convolve_bi_horiz_8_transposed_dspr2(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int16_t * filter_x0,int32_t h)111 static void convolve_bi_horiz_8_transposed_dspr2(const uint8_t *src,
112                                                  int32_t src_stride,
113                                                  uint8_t *dst,
114                                                  int32_t dst_stride,
115                                                  const int16_t *filter_x0,
116                                                  int32_t h) {
117   int32_t y;
118   uint8_t *cm = vpx_ff_cropTbl;
119   uint8_t *dst_ptr;
120   uint32_t vector4a = 64;
121   int32_t Temp1, Temp2, Temp3;
122   uint32_t tp1, tp2, tp3;
123   uint32_t p1, p2, p3, p4;
124   uint8_t *odd_dst;
125   uint32_t dst_pitch_2 = (dst_stride << 1);
126   const int16_t *filter = &filter_x0[3];
127   uint32_t      filter45;
128 
129   filter45 = ((const int32_t *)filter)[0];
130 
131   for (y = h; y--;) {
132     /* prefetch data to cache memory */
133     prefetch_load(src + src_stride);
134     prefetch_load(src + src_stride + 32);
135 
136     dst_ptr = dst;
137     odd_dst = (dst_ptr + dst_stride);
138 
139     __asm__ __volatile__ (
140         "ulw              %[tp1],         0(%[src])                       \n\t"
141         "ulw              %[tp2],         4(%[src])                       \n\t"
142 
143         /* even 1. pixel */
144         "mtlo             %[vector4a],    $ac3                            \n\t"
145         "mthi             $zero,          $ac3                            \n\t"
146         "mtlo             %[vector4a],    $ac2                            \n\t"
147         "mthi             $zero,          $ac2                            \n\t"
148         "preceu.ph.qbr    %[p1],          %[tp1]                          \n\t"
149         "preceu.ph.qbl    %[p2],          %[tp1]                          \n\t"
150         "preceu.ph.qbr    %[p3],          %[tp2]                          \n\t"
151         "preceu.ph.qbl    %[p4],          %[tp2]                          \n\t"
152         "ulw              %[tp3],         8(%[src])                       \n\t"
153         "dpa.w.ph         $ac3,           %[p1],          %[filter45]     \n\t"
154         "extp             %[Temp1],       $ac3,           31              \n\t"
155 
156         /* even 2. pixel */
157         "dpa.w.ph         $ac2,           %[p2],          %[filter45]     \n\t"
158         "extp             %[Temp3],       $ac2,           31              \n\t"
159 
160         /* even 3. pixel */
161         "lbux             %[Temp2],       %[Temp1](%[cm])                 \n\t"
162         "mtlo             %[vector4a],    $ac1                            \n\t"
163         "mthi             $zero,          $ac1                            \n\t"
164         "balign           %[tp3],         %[tp2],         3              \n\t"
165         "balign           %[tp2],         %[tp1],         3              \n\t"
166         "dpa.w.ph         $ac1,           %[p3],          %[filter45]     \n\t"
167         "lbux             %[tp1],         %[Temp3](%[cm])                 \n\t"
168         "extp             %[p3],          $ac1,           31              \n\t"
169 
170         /* even 4. pixel */
171         "mtlo             %[vector4a],    $ac2                            \n\t"
172         "mthi             $zero,          $ac2                            \n\t"
173         "mtlo             %[vector4a],    $ac3                            \n\t"
174         "mthi             $zero,          $ac3                            \n\t"
175         "sb               %[Temp2],       0(%[dst_ptr])                   \n\t"
176         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
177         "sb               %[tp1],         0(%[dst_ptr])                   \n\t"
178         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
179 
180         "dpa.w.ph         $ac2,           %[p4],          %[filter45]     \n\t"
181         "extp             %[Temp3],       $ac2,           31              \n\t"
182 
183         "lbux             %[Temp1],         %[p3](%[cm])                    \n\t"
184 
185         /* odd 1. pixel */
186         "mtlo             %[vector4a],    $ac1                            \n\t"
187         "mthi             $zero,          $ac1                            \n\t"
188         "preceu.ph.qbr    %[p1],          %[tp2]                          \n\t"
189         "preceu.ph.qbl    %[p2],          %[tp2]                          \n\t"
190         "preceu.ph.qbr    %[p3],          %[tp3]                          \n\t"
191         "preceu.ph.qbl    %[p4],          %[tp3]                          \n\t"
192         "sb               %[Temp1],       0(%[dst_ptr])                   \n\t"
193         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
194 
195         "dpa.w.ph         $ac3,           %[p1],          %[filter45]     \n\t"
196         "extp             %[Temp2],       $ac3,           31              \n\t"
197 
198         /* odd 2. pixel */
199         "lbux             %[tp1],         %[Temp3](%[cm])                 \n\t"
200         "mtlo             %[vector4a],    $ac3                            \n\t"
201         "mthi             $zero,          $ac3                            \n\t"
202         "mtlo             %[vector4a],    $ac2                            \n\t"
203         "mthi             $zero,          $ac2                            \n\t"
204         "dpa.w.ph         $ac1,           %[p2],          %[filter45]     \n\t"
205         "sb               %[tp1],         0(%[dst_ptr])                   \n\t"
206         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
207         "extp             %[Temp3],       $ac1,           31              \n\t"
208 
209         /* odd 3. pixel */
210         "lbux             %[tp3],         %[Temp2](%[cm])                 \n\t"
211         "dpa.w.ph         $ac3,           %[p3],          %[filter45]     \n\t"
212         "extp             %[Temp2],       $ac3,           31              \n\t"
213 
214         /* odd 4. pixel */
215         "sb               %[tp3],         0(%[odd_dst])                   \n\t"
216         "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
217         "dpa.w.ph         $ac2,           %[p4],          %[filter45]     \n\t"
218         "extp             %[Temp1],       $ac2,           31              \n\t"
219 
220         /* clamp */
221         "lbux             %[p4],          %[Temp3](%[cm])                 \n\t"
222         "lbux             %[p2],          %[Temp2](%[cm])                 \n\t"
223         "lbux             %[p1],          %[Temp1](%[cm])                 \n\t"
224 
225         /* store bytes */
226         "sb               %[p4],          0(%[odd_dst])                   \n\t"
227         "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
228 
229         "sb               %[p2],          0(%[odd_dst])                   \n\t"
230         "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
231 
232         "sb               %[p1],          0(%[odd_dst])                   \n\t"
233 
234         : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3),
235           [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
236           [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
237           [dst_ptr] "+r" (dst_ptr), [odd_dst] "+r" (odd_dst)
238         : [filter45] "r" (filter45),[vector4a] "r" (vector4a), [cm] "r" (cm),
239           [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
240     );
241 
242     /* Next row... */
243     src += src_stride;
244     dst += 1;
245   }
246 }
247 
convolve_bi_horiz_16_transposed_dspr2(const uint8_t * src_ptr,int32_t src_stride,uint8_t * dst_ptr,int32_t dst_stride,const int16_t * filter_x0,int32_t h,int32_t count)248 static void convolve_bi_horiz_16_transposed_dspr2(const uint8_t *src_ptr,
249                                                   int32_t src_stride,
250                                                   uint8_t *dst_ptr,
251                                                   int32_t dst_stride,
252                                                   const int16_t *filter_x0,
253                                                   int32_t h,
254                                                   int32_t count) {
255   int32_t       c, y;
256   const uint8_t *src;
257   uint8_t       *dst;
258   uint8_t       *cm = vpx_ff_cropTbl;
259   uint32_t      vector_64 = 64;
260   int32_t       Temp1, Temp2, Temp3;
261   uint32_t      qload1, qload2;
262   uint32_t      p1, p2, p3, p4, p5;
263   uint32_t      st1, st2, st3;
264   uint32_t      dst_pitch_2 = (dst_stride << 1);
265   uint8_t       *odd_dst;
266   const int16_t *filter = &filter_x0[3];
267   uint32_t      filter45;
268 
269   filter45 = ((const int32_t *)filter)[0];
270 
271   for (y = h; y--;) {
272     /* prefetch data to cache memory */
273     prefetch_load(src_ptr + src_stride);
274     prefetch_load(src_ptr + src_stride + 32);
275 
276     src = src_ptr;
277     dst = dst_ptr;
278 
279     odd_dst = (dst + dst_stride);
280 
281     for (c = 0; c < count; c++) {
282       __asm__ __volatile__ (
283           "ulw              %[qload1],        0(%[src])                       \n\t"
284           "ulw              %[qload2],        4(%[src])                       \n\t"
285 
286           /* even 1. pixel */
287           "mtlo             %[vector_64],     $ac1                            \n\t" /* even 1 */
288           "mthi             $zero,            $ac1                            \n\t"
289           "mtlo             %[vector_64],     $ac2                            \n\t" /* even 2 */
290           "mthi             $zero,            $ac2                            \n\t"
291           "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
292           "preceu.ph.qbl    %[p2],            %[qload1]                       \n\t"
293           "preceu.ph.qbr    %[p3],            %[qload2]                       \n\t"
294           "preceu.ph.qbl    %[p4],            %[qload2]                       \n\t"
295           "ulw              %[qload1],        8(%[src])                       \n\t"
296           "dpa.w.ph         $ac1,             %[p1],          %[filter45]     \n\t" /* even 1 */
297           "extp             %[Temp1],         $ac1,           31              \n\t" /* even 1 */
298 
299           /* even 2. pixel */
300           "mtlo             %[vector_64],     $ac3                            \n\t" /* even 3 */
301           "mthi             $zero,            $ac3                            \n\t"
302           "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
303           "preceu.ph.qbl    %[p5],            %[qload1]                       \n\t"
304           "ulw              %[qload2],        12(%[src])                      \n\t"
305           "dpa.w.ph         $ac2,             %[p2],          %[filter45]     \n\t" /* even 1 */
306           "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 1 */
307           "extp             %[Temp2],         $ac2,           31              \n\t" /* even 1 */
308 
309           /* even 3. pixel */
310           "mtlo             %[vector_64],     $ac1                            \n\t" /* even 4 */
311           "mthi             $zero,            $ac1                            \n\t"
312           "preceu.ph.qbr    %[p2],            %[qload2]                       \n\t"
313           "sb               %[st1],           0(%[dst])                       \n\t" /* even 1 */
314           "addu             %[dst],           %[dst],         %[dst_pitch_2]             \n\t"
315           "dpa.w.ph         $ac3,             %[p3],          %[filter45]     \n\t" /* even 3 */
316           "extp             %[Temp3],         $ac3,           31              \n\t" /* even 3 */
317           "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 1 */
318 
319           /* even 4. pixel */
320           "mtlo             %[vector_64],     $ac2                            \n\t" /* even 5 */
321           "mthi             $zero,            $ac2                            \n\t"
322           "preceu.ph.qbl    %[p3],            %[qload2]                       \n\t"
323           "sb               %[st2],           0(%[dst])                       \n\t" /* even 2 */
324           "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
325           "dpa.w.ph         $ac1,             %[p4],          %[filter45]     \n\t" /* even 4 */
326           "extp             %[Temp1],         $ac1,           31              \n\t" /* even 4 */
327           "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* even 3 */
328 
329           /* even 5. pixel */
330           "mtlo             %[vector_64],     $ac3                            \n\t" /* even 6 */
331           "mthi             $zero,            $ac3                            \n\t"
332           "sb               %[st3],           0(%[dst])                       \n\t" /* even 3 */
333           "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
334           "dpa.w.ph         $ac2,             %[p1],          %[filter45]     \n\t" /* even 5 */
335           "extp             %[Temp2],         $ac2,           31              \n\t" /* even 5 */
336           "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 4 */
337 
338           /* even 6. pixel */
339           "mtlo             %[vector_64],     $ac1                            \n\t" /* even 7 */
340           "mthi             $zero,            $ac1                            \n\t"
341           "sb               %[st1],           0(%[dst])                       \n\t" /* even 4 */
342           "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
343           "ulw              %[qload1],        20(%[src])                      \n\t"
344           "dpa.w.ph         $ac3,             %[p5],          %[filter45]     \n\t" /* even 6 */
345           "extp             %[Temp3],         $ac3,           31              \n\t" /* even 6 */
346           "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 5 */
347 
348           /* even 7. pixel */
349           "mtlo             %[vector_64],     $ac2                            \n\t" /* even 8 */
350           "mthi             $zero,            $ac2                            \n\t"
351           "preceu.ph.qbr    %[p5],            %[qload1]                       \n\t"
352           "sb               %[st2],           0(%[dst])                       \n\t" /* even 5 */
353           "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
354           "dpa.w.ph         $ac1,             %[p2],          %[filter45]     \n\t" /* even 7 */
355           "extp             %[Temp1],         $ac1,           31              \n\t" /* even 7 */
356           "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* even 6 */
357 
358           /* even 8. pixel */
359           "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 1 */
360           "mthi             $zero,            $ac3                            \n\t"
361           "dpa.w.ph         $ac2,             %[p3],          %[filter45]     \n\t" /* even 8 */
362           "sb               %[st3],           0(%[dst])                       \n\t" /* even 6 */
363           "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
364           "extp             %[Temp2],         $ac2,           31              \n\t" /* even 8 */
365           "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 7 */
366 
367           /* ODD pixels */
368           "ulw              %[qload1],        1(%[src])                       \n\t"
369           "ulw              %[qload2],        5(%[src])                       \n\t"
370 
371           /* odd 1. pixel */
372           "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 2 */
373           "mthi             $zero,            $ac1                            \n\t"
374           "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
375           "preceu.ph.qbl    %[p2],            %[qload1]                       \n\t"
376           "preceu.ph.qbr    %[p3],            %[qload2]                       \n\t"
377           "preceu.ph.qbl    %[p4],            %[qload2]                       \n\t"
378           "sb               %[st1],           0(%[dst])                       \n\t" /* even 7 */
379           "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
380           "ulw              %[qload2],        9(%[src])                       \n\t"
381           "dpa.w.ph         $ac3,             %[p1],          %[filter45]     \n\t" /* odd 1 */
382           "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 1 */
383           "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 8 */
384 
385           /* odd 2. pixel */
386           "mtlo             %[vector_64],     $ac2                            \n\t" /* odd 3 */
387           "mthi             $zero,            $ac2                            \n\t"
388           "preceu.ph.qbr    %[p1],            %[qload2]                       \n\t"
389           "preceu.ph.qbl    %[p5],            %[qload2]                       \n\t"
390           "sb               %[st2],           0(%[dst])                       \n\t" /* even 8 */
391           "ulw              %[qload1],        13(%[src])                      \n\t"
392           "dpa.w.ph         $ac1,             %[p2],          %[filter45]     \n\t" /* odd 2 */
393           "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 2 */
394           "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 1 */
395 
396           /* odd 3. pixel */
397           "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 4 */
398           "mthi             $zero,            $ac3                            \n\t"
399           "preceu.ph.qbr    %[p2],            %[qload1]                       \n\t"
400           "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 1 */
401           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
402           "dpa.w.ph         $ac2,             %[p3],          %[filter45]     \n\t" /* odd 3 */
403           "extp             %[Temp2],         $ac2,           31              \n\t" /* odd 3 */
404           "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 2 */
405 
406           /* odd 4. pixel */
407           "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 5 */
408           "mthi             $zero,            $ac1                            \n\t"
409           "preceu.ph.qbl    %[p3],            %[qload1]                       \n\t"
410           "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 2 */
411           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
412           "dpa.w.ph         $ac3,             %[p4],          %[filter45]     \n\t" /* odd 4 */
413           "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 4 */
414           "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* odd 3 */
415 
416           /* odd 5. pixel */
417           "mtlo             %[vector_64],     $ac2                            \n\t" /* odd 6 */
418           "mthi             $zero,            $ac2                            \n\t"
419           "sb               %[st2],           0(%[odd_dst])                   \n\t" /* odd 3 */
420           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
421           "dpa.w.ph         $ac1,             %[p1],          %[filter45]     \n\t" /* odd 5 */
422           "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 5 */
423           "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 4 */
424 
425           /* odd 6. pixel */
426           "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 7 */
427           "mthi             $zero,            $ac3                            \n\t"
428           "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 4 */
429           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
430           "ulw              %[qload1],        21(%[src])                      \n\t"
431           "dpa.w.ph         $ac2,             %[p5],          %[filter45]     \n\t" /* odd 6 */
432           "extp             %[Temp2],         $ac2,           31              \n\t" /* odd 6 */
433           "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 5 */
434 
435           /* odd 7. pixel */
436           "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 8 */
437           "mthi             $zero,            $ac1                            \n\t"
438           "preceu.ph.qbr    %[p5],            %[qload1]                       \n\t"
439           "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 5 */
440           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
441           "dpa.w.ph         $ac3,             %[p2],          %[filter45]     \n\t" /* odd 7 */
442           "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 7 */
443 
444           /* odd 8. pixel */
445           "dpa.w.ph         $ac1,             %[p3],          %[filter45]     \n\t" /* odd 8 */
446           "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 8 */
447 
448           "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* odd 6 */
449           "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 7 */
450           "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 8 */
451 
452           "sb               %[st2],           0(%[odd_dst])                   \n\t" /* odd 6 */
453           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
454 
455           "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 7 */
456           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
457 
458           "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 8 */
459 
460           : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5),
461             [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
462             [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
463             [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
464             [dst] "+r" (dst), [odd_dst] "+r" (odd_dst)
465           : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
466             [cm] "r" (cm),
467             [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
468       );
469 
470       src += 16;
471       dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
472       odd_dst = (dst + dst_stride);
473     }
474 
475     /* Next row... */
476     src_ptr += src_stride;
477     dst_ptr += 1;
478   }
479 }
480 
convolve_bi_horiz_64_transposed_dspr2(const uint8_t * src_ptr,int32_t src_stride,uint8_t * dst_ptr,int32_t dst_stride,const int16_t * filter_x0,int32_t h)481 static void convolve_bi_horiz_64_transposed_dspr2(const uint8_t *src_ptr,
482                                                   int32_t src_stride,
483                                                   uint8_t *dst_ptr,
484                                                   int32_t dst_stride,
485                                                   const int16_t *filter_x0,
486                                                   int32_t h) {
487   int32_t       c, y;
488   const uint8_t *src;
489   uint8_t       *dst;
490   uint8_t       *cm = vpx_ff_cropTbl;
491   uint32_t      vector_64 = 64;
492   int32_t       Temp1, Temp2, Temp3;
493   uint32_t      qload1, qload2;
494   uint32_t      p1, p2, p3, p4, p5;
495   uint32_t      st1, st2, st3;
496   uint32_t      dst_pitch_2 = (dst_stride << 1);
497   uint8_t       *odd_dst;
498   const int16_t *filter = &filter_x0[3];
499   uint32_t      filter45;
500 
501   filter45 = ((const int32_t *)filter)[0];
502 
503   for (y = h; y--;) {
504     /* prefetch data to cache memory */
505     prefetch_load(src_ptr + src_stride);
506     prefetch_load(src_ptr + src_stride + 32);
507     prefetch_load(src_ptr + src_stride + 64);
508 
509     src = src_ptr;
510     dst = dst_ptr;
511 
512     odd_dst = (dst + dst_stride);
513 
514     for (c = 0; c < 4; c++) {
515       __asm__ __volatile__ (
516           "ulw              %[qload1],        0(%[src])                       \n\t"
517           "ulw              %[qload2],        4(%[src])                       \n\t"
518 
519           /* even 1. pixel */
520           "mtlo             %[vector_64],     $ac1                            \n\t" /* even 1 */
521           "mthi             $zero,            $ac1                            \n\t"
522           "mtlo             %[vector_64],     $ac2                            \n\t" /* even 2 */
523           "mthi             $zero,            $ac2                            \n\t"
524           "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
525           "preceu.ph.qbl    %[p2],            %[qload1]                       \n\t"
526           "preceu.ph.qbr    %[p3],            %[qload2]                       \n\t"
527           "preceu.ph.qbl    %[p4],            %[qload2]                       \n\t"
528           "ulw              %[qload1],        8(%[src])                       \n\t"
529           "dpa.w.ph         $ac1,             %[p1],          %[filter45]     \n\t" /* even 1 */
530           "extp             %[Temp1],         $ac1,           31              \n\t" /* even 1 */
531 
532           /* even 2. pixel */
533           "mtlo             %[vector_64],     $ac3                            \n\t" /* even 3 */
534           "mthi             $zero,            $ac3                            \n\t"
535           "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
536           "preceu.ph.qbl    %[p5],            %[qload1]                       \n\t"
537           "ulw              %[qload2],        12(%[src])                      \n\t"
538           "dpa.w.ph         $ac2,             %[p2],          %[filter45]     \n\t" /* even 1 */
539           "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 1 */
540           "extp             %[Temp2],         $ac2,           31              \n\t" /* even 1 */
541 
542           /* even 3. pixel */
543           "mtlo             %[vector_64],     $ac1                            \n\t" /* even 4 */
544           "mthi             $zero,            $ac1                            \n\t"
545           "preceu.ph.qbr    %[p2],            %[qload2]                       \n\t"
546           "sb               %[st1],           0(%[dst])                       \n\t" /* even 1 */
547           "addu             %[dst],           %[dst],         %[dst_pitch_2]             \n\t"
548           "dpa.w.ph         $ac3,             %[p3],          %[filter45]     \n\t" /* even 3 */
549           "extp             %[Temp3],         $ac3,           31              \n\t" /* even 3 */
550           "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 1 */
551 
552           /* even 4. pixel */
553           "mtlo             %[vector_64],     $ac2                            \n\t" /* even 5 */
554           "mthi             $zero,            $ac2                            \n\t"
555           "preceu.ph.qbl    %[p3],            %[qload2]                       \n\t"
556           "sb               %[st2],           0(%[dst])                       \n\t" /* even 2 */
557           "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
558           "dpa.w.ph         $ac1,             %[p4],          %[filter45]     \n\t" /* even 4 */
559           "extp             %[Temp1],         $ac1,           31              \n\t" /* even 4 */
560           "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* even 3 */
561 
562           /* even 5. pixel */
563           "mtlo             %[vector_64],     $ac3                            \n\t" /* even 6 */
564           "mthi             $zero,            $ac3                            \n\t"
565           "sb               %[st3],           0(%[dst])                       \n\t" /* even 3 */
566           "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
567           "dpa.w.ph         $ac2,             %[p1],          %[filter45]     \n\t" /* even 5 */
568           "extp             %[Temp2],         $ac2,           31              \n\t" /* even 5 */
569           "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 4 */
570 
571           /* even 6. pixel */
572           "mtlo             %[vector_64],     $ac1                            \n\t" /* even 7 */
573           "mthi             $zero,            $ac1                            \n\t"
574           "sb               %[st1],           0(%[dst])                       \n\t" /* even 4 */
575           "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
576           "ulw              %[qload1],        20(%[src])                      \n\t"
577           "dpa.w.ph         $ac3,             %[p5],          %[filter45]     \n\t" /* even 6 */
578           "extp             %[Temp3],         $ac3,           31              \n\t" /* even 6 */
579           "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 5 */
580 
581           /* even 7. pixel */
582           "mtlo             %[vector_64],     $ac2                            \n\t" /* even 8 */
583           "mthi             $zero,            $ac2                            \n\t"
584           "preceu.ph.qbr    %[p5],            %[qload1]                       \n\t"
585           "sb               %[st2],           0(%[dst])                       \n\t" /* even 5 */
586           "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
587           "dpa.w.ph         $ac1,             %[p2],          %[filter45]     \n\t" /* even 7 */
588           "extp             %[Temp1],         $ac1,           31              \n\t" /* even 7 */
589           "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* even 6 */
590 
591           /* even 8. pixel */
592           "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 1 */
593           "mthi             $zero,            $ac3                            \n\t"
594           "dpa.w.ph         $ac2,             %[p3],          %[filter45]     \n\t" /* even 8 */
595           "sb               %[st3],           0(%[dst])                       \n\t" /* even 6 */
596           "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
597           "extp             %[Temp2],         $ac2,           31              \n\t" /* even 8 */
598           "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 7 */
599 
600           /* ODD pixels */
601           "ulw              %[qload1],        1(%[src])                       \n\t"
602           "ulw              %[qload2],        5(%[src])                       \n\t"
603 
604           /* odd 1. pixel */
605           "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 2 */
606           "mthi             $zero,            $ac1                            \n\t"
607           "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
608           "preceu.ph.qbl    %[p2],            %[qload1]                       \n\t"
609           "preceu.ph.qbr    %[p3],            %[qload2]                       \n\t"
610           "preceu.ph.qbl    %[p4],            %[qload2]                       \n\t"
611           "sb               %[st1],           0(%[dst])                       \n\t" /* even 7 */
612           "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
613           "ulw              %[qload2],        9(%[src])                       \n\t"
614           "dpa.w.ph         $ac3,             %[p1],          %[filter45]     \n\t" /* odd 1 */
615           "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 1 */
616           "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 8 */
617 
618           /* odd 2. pixel */
619           "mtlo             %[vector_64],     $ac2                            \n\t" /* odd 3 */
620           "mthi             $zero,            $ac2                            \n\t"
621           "preceu.ph.qbr    %[p1],            %[qload2]                       \n\t"
622           "preceu.ph.qbl    %[p5],            %[qload2]                       \n\t"
623           "sb               %[st2],           0(%[dst])                       \n\t" /* even 8 */
624           "ulw              %[qload1],        13(%[src])                      \n\t"
625           "dpa.w.ph         $ac1,             %[p2],          %[filter45]     \n\t" /* odd 2 */
626           "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 2 */
627           "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 1 */
628 
629           /* odd 3. pixel */
630           "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 4 */
631           "mthi             $zero,            $ac3                            \n\t"
632           "preceu.ph.qbr    %[p2],            %[qload1]                       \n\t"
633           "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 1 */
634           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
635           "dpa.w.ph         $ac2,             %[p3],          %[filter45]     \n\t" /* odd 3 */
636           "extp             %[Temp2],         $ac2,           31              \n\t" /* odd 3 */
637           "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 2 */
638 
639           /* odd 4. pixel */
640           "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 5 */
641           "mthi             $zero,            $ac1                            \n\t"
642           "preceu.ph.qbl    %[p3],            %[qload1]                       \n\t"
643           "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 2 */
644           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
645           "dpa.w.ph         $ac3,             %[p4],          %[filter45]     \n\t" /* odd 4 */
646           "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 4 */
647           "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* odd 3 */
648 
649           /* odd 5. pixel */
650           "mtlo             %[vector_64],     $ac2                            \n\t" /* odd 6 */
651           "mthi             $zero,            $ac2                            \n\t"
652           "sb               %[st2],           0(%[odd_dst])                   \n\t" /* odd 3 */
653           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
654           "dpa.w.ph         $ac1,             %[p1],          %[filter45]     \n\t" /* odd 5 */
655           "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 5 */
656           "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 4 */
657 
658           /* odd 6. pixel */
659           "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 7 */
660           "mthi             $zero,            $ac3                            \n\t"
661           "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 4 */
662           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
663           "ulw              %[qload1],        21(%[src])                      \n\t"
664           "dpa.w.ph         $ac2,             %[p5],          %[filter45]     \n\t" /* odd 6 */
665           "extp             %[Temp2],         $ac2,           31              \n\t" /* odd 6 */
666           "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 5 */
667 
668           /* odd 7. pixel */
669           "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 8 */
670           "mthi             $zero,            $ac1                            \n\t"
671           "preceu.ph.qbr    %[p5],            %[qload1]                       \n\t"
672           "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 5 */
673           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
674           "dpa.w.ph         $ac3,             %[p2],          %[filter45]     \n\t" /* odd 7 */
675           "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 7 */
676 
677           /* odd 8. pixel */
678           "dpa.w.ph         $ac1,             %[p3],          %[filter45]     \n\t" /* odd 8 */
679           "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 8 */
680 
681           "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* odd 6 */
682           "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 7 */
683           "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 8 */
684 
685           "sb               %[st2],           0(%[odd_dst])                   \n\t" /* odd 6 */
686           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
687 
688           "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 7 */
689           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
690 
691           "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 8 */
692 
693           : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5),
694             [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
695             [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
696             [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
697             [dst] "+r" (dst), [odd_dst] "+r" (odd_dst)
698           : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
699             [cm] "r" (cm),
700             [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
701       );
702 
703       src += 16;
704       dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
705       odd_dst = (dst + dst_stride);
706     }
707 
708     /* Next row... */
709     src_ptr += src_stride;
710     dst_ptr += 1;
711   }
712 }
713 
convolve_bi_horiz_transposed(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter,int w,int h)714 void convolve_bi_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
715                                   uint8_t *dst, ptrdiff_t dst_stride,
716                                   const int16_t *filter, int w, int h) {
717   int x, y;
718 
719   for (y = 0; y < h; ++y) {
720     for (x = 0; x < w; ++x) {
721       int sum = 0;
722 
723       sum += src[x] * filter[3];
724       sum += src[x + 1] * filter[4];
725 
726       dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
727     }
728 
729     src += src_stride;
730     dst += 1;
731   }
732 }
733 
/* Entry point for the 2-tap transposed horizontal convolution (DSPr2).
 *
 * Programs the DSP control register, issues prefetches for the start of the
 * source, then hands the work to a width-specific kernel:
 *   w == 4        -> convolve_bi_horiz_4_transposed_dspr2
 *   w == 8        -> convolve_bi_horiz_8_transposed_dspr2
 *   w == 16 or 32 -> convolve_bi_horiz_16_transposed_dspr2 (last arg w / 16)
 *   w == 64       -> convolve_bi_horiz_64_transposed_dspr2
 *   anything else -> convolve_bi_horiz_transposed (plain C)
 *
 * src/src_stride, dst/dst_stride, filter and h are forwarded unchanged.
 */
void vpx_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                         uint8_t *dst, ptrdiff_t dst_stride,
                         const int16_t *filter,
                         int w, int h) {
  uint32_t extract_pos = 38;

  /* bit position for extract from acc; must be set before any kernel runs */
  __asm__ __volatile__ (
    "wrdsp      %[pos],     1           \n\t"
    :
    : [pos] "r" (extract_pos)
  );

  /* prefetch data to cache memory */
  prefetch_load(src);
  prefetch_load(src + 32);

  if (w == 4) {
    convolve_bi_horiz_4_transposed_dspr2(src, src_stride,
                                         dst, dst_stride,
                                         filter, h);
  } else if (w == 8) {
    convolve_bi_horiz_8_transposed_dspr2(src, src_stride,
                                         dst, dst_stride,
                                         filter, h);
  } else if (w == 16 || w == 32) {
    convolve_bi_horiz_16_transposed_dspr2(src, src_stride,
                                          dst, dst_stride,
                                          filter, h,
                                          w / 16);
  } else if (w == 64) {
    /* NOTE(review): src + 32 was already prefetched above, so this repeats
     * that request; possibly src + 64 was intended - confirm before changing.
     */
    prefetch_load(src + 32);
    convolve_bi_horiz_64_transposed_dspr2(src, src_stride,
                                          dst, dst_stride,
                                          filter, h);
  } else {
    /* No specialised kernel for this width: use the C implementation. */
    convolve_bi_horiz_transposed(src, src_stride,
                                 dst, dst_stride,
                                 filter, w, h);
  }
}
782 #endif
783