/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10 
11 #include <assert.h>
12 #include <stdio.h>
13 
14 #include "./vpx_config.h"
15 #include "./vp9_rtcd.h"
16 #include "vp9/common/vp9_common.h"
17 #include "vpx/vpx_integer.h"
18 #include "vpx_ports/mem.h"
19 #include "vp9/common/vp9_filter.h"
20 #include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
21 
22 #if HAVE_DSPR2
convolve_bi_horiz_4_transposed_dspr2(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int16_t * filter_x0,int32_t h)23 static void convolve_bi_horiz_4_transposed_dspr2(const uint8_t *src,
24                                                  int32_t src_stride,
25                                                  uint8_t *dst,
26                                                  int32_t dst_stride,
27                                                  const int16_t *filter_x0,
28                                                  int32_t h) {
29   int32_t       y;
30   uint8_t       *cm = vp9_ff_cropTbl;
31   uint8_t       *dst_ptr;
32   int32_t       Temp1, Temp2;
33   uint32_t      vector4a = 64;
34   uint32_t      tp1, tp2;
35   uint32_t      p1, p2;
36   const int16_t *filter = &filter_x0[3];
37   uint32_t      filter45;
38 
39   filter45 = ((const int32_t *)filter)[0];
40 
41   for (y = h; y--;) {
42     dst_ptr = dst;
43     /* prefetch data to cache memory */
44     vp9_prefetch_load(src + src_stride);
45     vp9_prefetch_load(src + src_stride + 32);
46 
47     __asm__ __volatile__ (
48         "ulw              %[tp1],         0(%[src])                      \n\t"
49         "ulw              %[tp2],         4(%[src])                      \n\t"
50 
51         /* even 1. pixel */
52         "mtlo             %[vector4a],    $ac3                           \n\t"
53         "mthi             $zero,          $ac3                           \n\t"
54         "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
55         "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
56         "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
57         "extp             %[Temp1],       $ac3,           31             \n\t"
58 
59         /* even 2. pixel */
60         "mtlo             %[vector4a],    $ac2                           \n\t"
61         "mthi             $zero,          $ac2                           \n\t"
62         "balign           %[tp2],         %[tp1],         3              \n\t"
63         "dpa.w.ph         $ac2,           %[p2],          %[filter45]    \n\t"
64         "extp             %[Temp2],       $ac2,           31             \n\t"
65 
66         /* odd 1. pixel */
67         "lbux             %[tp1],         %[Temp1](%[cm])                \n\t"
68         "mtlo             %[vector4a],    $ac3                           \n\t"
69         "mthi             $zero,          $ac3                           \n\t"
70         "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"
71         "preceu.ph.qbl    %[p2],          %[tp2]                         \n\t"
72         "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
73         "extp             %[Temp1],       $ac3,           31             \n\t"
74 
75         /* odd 2. pixel */
76         "lbux             %[tp2],         %[Temp2](%[cm])                \n\t"
77         "mtlo             %[vector4a],    $ac2                           \n\t"
78         "mthi             $zero,          $ac2                           \n\t"
79         "dpa.w.ph         $ac2,           %[p2],          %[filter45]    \n\t"
80         "extp             %[Temp2],       $ac2,           31             \n\t"
81 
82         /* clamp */
83         "lbux             %[p1],          %[Temp1](%[cm])                \n\t"
84         "lbux             %[p2],          %[Temp2](%[cm])                \n\t"
85 
86         /* store bytes */
87         "sb               %[tp1],         0(%[dst_ptr])                  \n\t"
88         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
89 
90         "sb               %[p1],          0(%[dst_ptr])                  \n\t"
91         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
92 
93         "sb               %[tp2],         0(%[dst_ptr])                  \n\t"
94         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
95 
96         "sb               %[p2],          0(%[dst_ptr])                  \n\t"
97         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_stride]  \n\t"
98 
99         : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
100           [p1] "=&r" (p1), [p2] "=&r" (p2),
101           [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
102           [dst_ptr] "+r" (dst_ptr)
103         : [filter45] "r" (filter45),[vector4a] "r" (vector4a),
104           [cm] "r" (cm), [src] "r" (src), [dst_stride] "r" (dst_stride)
105     );
106 
107     /* Next row... */
108     src += src_stride;
109     dst += 1;
110   }
111 }
112 
convolve_bi_horiz_8_transposed_dspr2(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int16_t * filter_x0,int32_t h)113 static void convolve_bi_horiz_8_transposed_dspr2(const uint8_t *src,
114                                                  int32_t src_stride,
115                                                  uint8_t *dst,
116                                                  int32_t dst_stride,
117                                                  const int16_t *filter_x0,
118                                                  int32_t h) {
119   int32_t y;
120   uint8_t *cm = vp9_ff_cropTbl;
121   uint8_t *dst_ptr;
122   uint32_t vector4a = 64;
123   int32_t Temp1, Temp2, Temp3;
124   uint32_t tp1, tp2, tp3;
125   uint32_t p1, p2, p3, p4;
126   uint8_t *odd_dst;
127   uint32_t dst_pitch_2 = (dst_stride << 1);
128   const int16_t *filter = &filter_x0[3];
129   uint32_t      filter45;
130 
131   filter45 = ((const int32_t *)filter)[0];
132 
133   for (y = h; y--;) {
134     /* prefetch data to cache memory */
135     vp9_prefetch_load(src + src_stride);
136     vp9_prefetch_load(src + src_stride + 32);
137 
138     dst_ptr = dst;
139     odd_dst = (dst_ptr + dst_stride);
140 
141     __asm__ __volatile__ (
142         "ulw              %[tp1],         0(%[src])                       \n\t"
143         "ulw              %[tp2],         4(%[src])                       \n\t"
144 
145         /* even 1. pixel */
146         "mtlo             %[vector4a],    $ac3                            \n\t"
147         "mthi             $zero,          $ac3                            \n\t"
148         "mtlo             %[vector4a],    $ac2                            \n\t"
149         "mthi             $zero,          $ac2                            \n\t"
150         "preceu.ph.qbr    %[p1],          %[tp1]                          \n\t"
151         "preceu.ph.qbl    %[p2],          %[tp1]                          \n\t"
152         "preceu.ph.qbr    %[p3],          %[tp2]                          \n\t"
153         "preceu.ph.qbl    %[p4],          %[tp2]                          \n\t"
154         "ulw              %[tp3],         8(%[src])                       \n\t"
155         "dpa.w.ph         $ac3,           %[p1],          %[filter45]     \n\t"
156         "extp             %[Temp1],       $ac3,           31              \n\t"
157 
158         /* even 2. pixel */
159         "dpa.w.ph         $ac2,           %[p2],          %[filter45]     \n\t"
160         "extp             %[Temp3],       $ac2,           31              \n\t"
161 
162         /* even 3. pixel */
163         "lbux             %[Temp2],       %[Temp1](%[cm])                 \n\t"
164         "mtlo             %[vector4a],    $ac1                            \n\t"
165         "mthi             $zero,          $ac1                            \n\t"
166         "balign           %[tp3],         %[tp2],         3              \n\t"
167         "balign           %[tp2],         %[tp1],         3              \n\t"
168         "dpa.w.ph         $ac1,           %[p3],          %[filter45]     \n\t"
169         "lbux             %[tp1],         %[Temp3](%[cm])                 \n\t"
170         "extp             %[p3],          $ac1,           31              \n\t"
171 
172         /* even 4. pixel */
173         "mtlo             %[vector4a],    $ac2                            \n\t"
174         "mthi             $zero,          $ac2                            \n\t"
175         "mtlo             %[vector4a],    $ac3                            \n\t"
176         "mthi             $zero,          $ac3                            \n\t"
177         "sb               %[Temp2],       0(%[dst_ptr])                   \n\t"
178         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
179         "sb               %[tp1],         0(%[dst_ptr])                   \n\t"
180         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
181 
182         "dpa.w.ph         $ac2,           %[p4],          %[filter45]     \n\t"
183         "extp             %[Temp3],       $ac2,           31              \n\t"
184 
185         "lbux             %[Temp1],         %[p3](%[cm])                    \n\t"
186 
187         /* odd 1. pixel */
188         "mtlo             %[vector4a],    $ac1                            \n\t"
189         "mthi             $zero,          $ac1                            \n\t"
190         "preceu.ph.qbr    %[p1],          %[tp2]                          \n\t"
191         "preceu.ph.qbl    %[p2],          %[tp2]                          \n\t"
192         "preceu.ph.qbr    %[p3],          %[tp3]                          \n\t"
193         "preceu.ph.qbl    %[p4],          %[tp3]                          \n\t"
194         "sb               %[Temp1],       0(%[dst_ptr])                   \n\t"
195         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
196 
197         "dpa.w.ph         $ac3,           %[p1],          %[filter45]     \n\t"
198         "extp             %[Temp2],       $ac3,           31              \n\t"
199 
200         /* odd 2. pixel */
201         "lbux             %[tp1],         %[Temp3](%[cm])                 \n\t"
202         "mtlo             %[vector4a],    $ac3                            \n\t"
203         "mthi             $zero,          $ac3                            \n\t"
204         "mtlo             %[vector4a],    $ac2                            \n\t"
205         "mthi             $zero,          $ac2                            \n\t"
206         "dpa.w.ph         $ac1,           %[p2],          %[filter45]     \n\t"
207         "sb               %[tp1],         0(%[dst_ptr])                   \n\t"
208         "addu             %[dst_ptr],     %[dst_ptr],     %[dst_pitch_2]  \n\t"
209         "extp             %[Temp3],       $ac1,           31              \n\t"
210 
211         /* odd 3. pixel */
212         "lbux             %[tp3],         %[Temp2](%[cm])                 \n\t"
213         "dpa.w.ph         $ac3,           %[p3],          %[filter45]     \n\t"
214         "extp             %[Temp2],       $ac3,           31              \n\t"
215 
216         /* odd 4. pixel */
217         "sb               %[tp3],         0(%[odd_dst])                   \n\t"
218         "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
219         "dpa.w.ph         $ac2,           %[p4],          %[filter45]     \n\t"
220         "extp             %[Temp1],       $ac2,           31              \n\t"
221 
222         /* clamp */
223         "lbux             %[p4],          %[Temp3](%[cm])                 \n\t"
224         "lbux             %[p2],          %[Temp2](%[cm])                 \n\t"
225         "lbux             %[p1],          %[Temp1](%[cm])                 \n\t"
226 
227         /* store bytes */
228         "sb               %[p4],          0(%[odd_dst])                   \n\t"
229         "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
230 
231         "sb               %[p2],          0(%[odd_dst])                   \n\t"
232         "addu             %[odd_dst],     %[odd_dst],     %[dst_pitch_2]  \n\t"
233 
234         "sb               %[p1],          0(%[odd_dst])                   \n\t"
235 
236         : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3),
237           [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
238           [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
239           [dst_ptr] "+r" (dst_ptr), [odd_dst] "+r" (odd_dst)
240         : [filter45] "r" (filter45),[vector4a] "r" (vector4a), [cm] "r" (cm),
241           [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
242     );
243 
244     /* Next row... */
245     src += src_stride;
246     dst += 1;
247   }
248 }
249 
convolve_bi_horiz_16_transposed_dspr2(const uint8_t * src_ptr,int32_t src_stride,uint8_t * dst_ptr,int32_t dst_stride,const int16_t * filter_x0,int32_t h,int32_t count)250 static void convolve_bi_horiz_16_transposed_dspr2(const uint8_t *src_ptr,
251                                                   int32_t src_stride,
252                                                   uint8_t *dst_ptr,
253                                                   int32_t dst_stride,
254                                                   const int16_t *filter_x0,
255                                                   int32_t h,
256                                                   int32_t count) {
257   int32_t       c, y;
258   const uint8_t *src;
259   uint8_t       *dst;
260   uint8_t       *cm = vp9_ff_cropTbl;
261   uint32_t      vector_64 = 64;
262   int32_t       Temp1, Temp2, Temp3;
263   uint32_t      qload1, qload2;
264   uint32_t      p1, p2, p3, p4, p5;
265   uint32_t      st1, st2, st3;
266   uint32_t      dst_pitch_2 = (dst_stride << 1);
267   uint8_t       *odd_dst;
268   const int16_t *filter = &filter_x0[3];
269   uint32_t      filter45;
270 
271   filter45 = ((const int32_t *)filter)[0];
272 
273   for (y = h; y--;) {
274     /* prefetch data to cache memory */
275     vp9_prefetch_load(src_ptr + src_stride);
276     vp9_prefetch_load(src_ptr + src_stride + 32);
277 
278     src = src_ptr;
279     dst = dst_ptr;
280 
281     odd_dst = (dst + dst_stride);
282 
283     for (c = 0; c < count; c++) {
284       __asm__ __volatile__ (
285           "ulw              %[qload1],        0(%[src])                       \n\t"
286           "ulw              %[qload2],        4(%[src])                       \n\t"
287 
288           /* even 1. pixel */
289           "mtlo             %[vector_64],     $ac1                            \n\t" /* even 1 */
290           "mthi             $zero,            $ac1                            \n\t"
291           "mtlo             %[vector_64],     $ac2                            \n\t" /* even 2 */
292           "mthi             $zero,            $ac2                            \n\t"
293           "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
294           "preceu.ph.qbl    %[p2],            %[qload1]                       \n\t"
295           "preceu.ph.qbr    %[p3],            %[qload2]                       \n\t"
296           "preceu.ph.qbl    %[p4],            %[qload2]                       \n\t"
297           "ulw              %[qload1],        8(%[src])                       \n\t"
298           "dpa.w.ph         $ac1,             %[p1],          %[filter45]     \n\t" /* even 1 */
299           "extp             %[Temp1],         $ac1,           31              \n\t" /* even 1 */
300 
301           /* even 2. pixel */
302           "mtlo             %[vector_64],     $ac3                            \n\t" /* even 3 */
303           "mthi             $zero,            $ac3                            \n\t"
304           "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
305           "preceu.ph.qbl    %[p5],            %[qload1]                       \n\t"
306           "ulw              %[qload2],        12(%[src])                      \n\t"
307           "dpa.w.ph         $ac2,             %[p2],          %[filter45]     \n\t" /* even 1 */
308           "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 1 */
309           "extp             %[Temp2],         $ac2,           31              \n\t" /* even 1 */
310 
311           /* even 3. pixel */
312           "mtlo             %[vector_64],     $ac1                            \n\t" /* even 4 */
313           "mthi             $zero,            $ac1                            \n\t"
314           "preceu.ph.qbr    %[p2],            %[qload2]                       \n\t"
315           "sb               %[st1],           0(%[dst])                       \n\t" /* even 1 */
316           "addu             %[dst],           %[dst],         %[dst_pitch_2]             \n\t"
317           "dpa.w.ph         $ac3,             %[p3],          %[filter45]     \n\t" /* even 3 */
318           "extp             %[Temp3],         $ac3,           31              \n\t" /* even 3 */
319           "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 1 */
320 
321           /* even 4. pixel */
322           "mtlo             %[vector_64],     $ac2                            \n\t" /* even 5 */
323           "mthi             $zero,            $ac2                            \n\t"
324           "preceu.ph.qbl    %[p3],            %[qload2]                       \n\t"
325           "sb               %[st2],           0(%[dst])                       \n\t" /* even 2 */
326           "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
327           "dpa.w.ph         $ac1,             %[p4],          %[filter45]     \n\t" /* even 4 */
328           "extp             %[Temp1],         $ac1,           31              \n\t" /* even 4 */
329           "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* even 3 */
330 
331           /* even 5. pixel */
332           "mtlo             %[vector_64],     $ac3                            \n\t" /* even 6 */
333           "mthi             $zero,            $ac3                            \n\t"
334           "sb               %[st3],           0(%[dst])                       \n\t" /* even 3 */
335           "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
336           "dpa.w.ph         $ac2,             %[p1],          %[filter45]     \n\t" /* even 5 */
337           "extp             %[Temp2],         $ac2,           31              \n\t" /* even 5 */
338           "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 4 */
339 
340           /* even 6. pixel */
341           "mtlo             %[vector_64],     $ac1                            \n\t" /* even 7 */
342           "mthi             $zero,            $ac1                            \n\t"
343           "sb               %[st1],           0(%[dst])                       \n\t" /* even 4 */
344           "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
345           "ulw              %[qload1],        20(%[src])                      \n\t"
346           "dpa.w.ph         $ac3,             %[p5],          %[filter45]     \n\t" /* even 6 */
347           "extp             %[Temp3],         $ac3,           31              \n\t" /* even 6 */
348           "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 5 */
349 
350           /* even 7. pixel */
351           "mtlo             %[vector_64],     $ac2                            \n\t" /* even 8 */
352           "mthi             $zero,            $ac2                            \n\t"
353           "preceu.ph.qbr    %[p5],            %[qload1]                       \n\t"
354           "sb               %[st2],           0(%[dst])                       \n\t" /* even 5 */
355           "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
356           "dpa.w.ph         $ac1,             %[p2],          %[filter45]     \n\t" /* even 7 */
357           "extp             %[Temp1],         $ac1,           31              \n\t" /* even 7 */
358           "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* even 6 */
359 
360           /* even 8. pixel */
361           "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 1 */
362           "mthi             $zero,            $ac3                            \n\t"
363           "dpa.w.ph         $ac2,             %[p3],          %[filter45]     \n\t" /* even 8 */
364           "sb               %[st3],           0(%[dst])                       \n\t" /* even 6 */
365           "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
366           "extp             %[Temp2],         $ac2,           31              \n\t" /* even 8 */
367           "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 7 */
368 
369           /* ODD pixels */
370           "ulw              %[qload1],        1(%[src])                       \n\t"
371           "ulw              %[qload2],        5(%[src])                       \n\t"
372 
373           /* odd 1. pixel */
374           "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 2 */
375           "mthi             $zero,            $ac1                            \n\t"
376           "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
377           "preceu.ph.qbl    %[p2],            %[qload1]                       \n\t"
378           "preceu.ph.qbr    %[p3],            %[qload2]                       \n\t"
379           "preceu.ph.qbl    %[p4],            %[qload2]                       \n\t"
380           "sb               %[st1],           0(%[dst])                       \n\t" /* even 7 */
381           "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
382           "ulw              %[qload2],        9(%[src])                       \n\t"
383           "dpa.w.ph         $ac3,             %[p1],          %[filter45]     \n\t" /* odd 1 */
384           "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 1 */
385           "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 8 */
386 
387           /* odd 2. pixel */
388           "mtlo             %[vector_64],     $ac2                            \n\t" /* odd 3 */
389           "mthi             $zero,            $ac2                            \n\t"
390           "preceu.ph.qbr    %[p1],            %[qload2]                       \n\t"
391           "preceu.ph.qbl    %[p5],            %[qload2]                       \n\t"
392           "sb               %[st2],           0(%[dst])                       \n\t" /* even 8 */
393           "ulw              %[qload1],        13(%[src])                      \n\t"
394           "dpa.w.ph         $ac1,             %[p2],          %[filter45]     \n\t" /* odd 2 */
395           "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 2 */
396           "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 1 */
397 
398           /* odd 3. pixel */
399           "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 4 */
400           "mthi             $zero,            $ac3                            \n\t"
401           "preceu.ph.qbr    %[p2],            %[qload1]                       \n\t"
402           "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 1 */
403           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
404           "dpa.w.ph         $ac2,             %[p3],          %[filter45]     \n\t" /* odd 3 */
405           "extp             %[Temp2],         $ac2,           31              \n\t" /* odd 3 */
406           "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 2 */
407 
408           /* odd 4. pixel */
409           "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 5 */
410           "mthi             $zero,            $ac1                            \n\t"
411           "preceu.ph.qbl    %[p3],            %[qload1]                       \n\t"
412           "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 2 */
413           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
414           "dpa.w.ph         $ac3,             %[p4],          %[filter45]     \n\t" /* odd 4 */
415           "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 4 */
416           "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* odd 3 */
417 
418           /* odd 5. pixel */
419           "mtlo             %[vector_64],     $ac2                            \n\t" /* odd 6 */
420           "mthi             $zero,            $ac2                            \n\t"
421           "sb               %[st2],           0(%[odd_dst])                   \n\t" /* odd 3 */
422           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
423           "dpa.w.ph         $ac1,             %[p1],          %[filter45]     \n\t" /* odd 5 */
424           "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 5 */
425           "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 4 */
426 
427           /* odd 6. pixel */
428           "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 7 */
429           "mthi             $zero,            $ac3                            \n\t"
430           "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 4 */
431           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
432           "ulw              %[qload1],        21(%[src])                      \n\t"
433           "dpa.w.ph         $ac2,             %[p5],          %[filter45]     \n\t" /* odd 6 */
434           "extp             %[Temp2],         $ac2,           31              \n\t" /* odd 6 */
435           "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 5 */
436 
437           /* odd 7. pixel */
438           "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 8 */
439           "mthi             $zero,            $ac1                            \n\t"
440           "preceu.ph.qbr    %[p5],            %[qload1]                       \n\t"
441           "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 5 */
442           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
443           "dpa.w.ph         $ac3,             %[p2],          %[filter45]     \n\t" /* odd 7 */
444           "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 7 */
445 
446           /* odd 8. pixel */
447           "dpa.w.ph         $ac1,             %[p3],          %[filter45]     \n\t" /* odd 8 */
448           "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 8 */
449 
450           "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* odd 6 */
451           "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 7 */
452           "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 8 */
453 
454           "sb               %[st2],           0(%[odd_dst])                   \n\t" /* odd 6 */
455           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
456 
457           "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 7 */
458           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
459 
460           "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 8 */
461 
462           : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5),
463             [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
464             [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
465             [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
466             [dst] "+r" (dst), [odd_dst] "+r" (odd_dst)
467           : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
468             [cm] "r" (cm),
469             [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
470       );
471 
472       src += 16;
473       dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
474       odd_dst = (dst + dst_stride);
475     }
476 
477     /* Next row... */
478     src_ptr += src_stride;
479     dst_ptr += 1;
480   }
481 }
482 
convolve_bi_horiz_64_transposed_dspr2(const uint8_t * src_ptr,int32_t src_stride,uint8_t * dst_ptr,int32_t dst_stride,const int16_t * filter_x0,int32_t h)483 static void convolve_bi_horiz_64_transposed_dspr2(const uint8_t *src_ptr,
484                                                   int32_t src_stride,
485                                                   uint8_t *dst_ptr,
486                                                   int32_t dst_stride,
487                                                   const int16_t *filter_x0,
488                                                   int32_t h) {
489   int32_t       c, y;
490   const uint8_t *src;
491   uint8_t       *dst;
492   uint8_t       *cm = vp9_ff_cropTbl;
493   uint32_t      vector_64 = 64;
494   int32_t       Temp1, Temp2, Temp3;
495   uint32_t      qload1, qload2;
496   uint32_t      p1, p2, p3, p4, p5;
497   uint32_t      st1, st2, st3;
498   uint32_t      dst_pitch_2 = (dst_stride << 1);
499   uint8_t       *odd_dst;
500   const int16_t *filter = &filter_x0[3];
501   uint32_t      filter45;
502 
503   filter45 = ((const int32_t *)filter)[0];
504 
505   for (y = h; y--;) {
506     /* prefetch data to cache memory */
507     vp9_prefetch_load(src_ptr + src_stride);
508     vp9_prefetch_load(src_ptr + src_stride + 32);
509     vp9_prefetch_load(src_ptr + src_stride + 64);
510 
511     src = src_ptr;
512     dst = dst_ptr;
513 
514     odd_dst = (dst + dst_stride);
515 
516     for (c = 0; c < 4; c++) {
517       __asm__ __volatile__ (
518           "ulw              %[qload1],        0(%[src])                       \n\t"
519           "ulw              %[qload2],        4(%[src])                       \n\t"
520 
521           /* even 1. pixel */
522           "mtlo             %[vector_64],     $ac1                            \n\t" /* even 1 */
523           "mthi             $zero,            $ac1                            \n\t"
524           "mtlo             %[vector_64],     $ac2                            \n\t" /* even 2 */
525           "mthi             $zero,            $ac2                            \n\t"
526           "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
527           "preceu.ph.qbl    %[p2],            %[qload1]                       \n\t"
528           "preceu.ph.qbr    %[p3],            %[qload2]                       \n\t"
529           "preceu.ph.qbl    %[p4],            %[qload2]                       \n\t"
530           "ulw              %[qload1],        8(%[src])                       \n\t"
531           "dpa.w.ph         $ac1,             %[p1],          %[filter45]     \n\t" /* even 1 */
532           "extp             %[Temp1],         $ac1,           31              \n\t" /* even 1 */
533 
534           /* even 2. pixel */
535           "mtlo             %[vector_64],     $ac3                            \n\t" /* even 3 */
536           "mthi             $zero,            $ac3                            \n\t"
537           "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
538           "preceu.ph.qbl    %[p5],            %[qload1]                       \n\t"
539           "ulw              %[qload2],        12(%[src])                      \n\t"
540           "dpa.w.ph         $ac2,             %[p2],          %[filter45]     \n\t" /* even 1 */
541           "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 1 */
542           "extp             %[Temp2],         $ac2,           31              \n\t" /* even 1 */
543 
544           /* even 3. pixel */
545           "mtlo             %[vector_64],     $ac1                            \n\t" /* even 4 */
546           "mthi             $zero,            $ac1                            \n\t"
547           "preceu.ph.qbr    %[p2],            %[qload2]                       \n\t"
548           "sb               %[st1],           0(%[dst])                       \n\t" /* even 1 */
549           "addu             %[dst],           %[dst],         %[dst_pitch_2]             \n\t"
550           "dpa.w.ph         $ac3,             %[p3],          %[filter45]     \n\t" /* even 3 */
551           "extp             %[Temp3],         $ac3,           31              \n\t" /* even 3 */
552           "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 1 */
553 
554           /* even 4. pixel */
555           "mtlo             %[vector_64],     $ac2                            \n\t" /* even 5 */
556           "mthi             $zero,            $ac2                            \n\t"
557           "preceu.ph.qbl    %[p3],            %[qload2]                       \n\t"
558           "sb               %[st2],           0(%[dst])                       \n\t" /* even 2 */
559           "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
560           "dpa.w.ph         $ac1,             %[p4],          %[filter45]     \n\t" /* even 4 */
561           "extp             %[Temp1],         $ac1,           31              \n\t" /* even 4 */
562           "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* even 3 */
563 
564           /* even 5. pixel */
565           "mtlo             %[vector_64],     $ac3                            \n\t" /* even 6 */
566           "mthi             $zero,            $ac3                            \n\t"
567           "sb               %[st3],           0(%[dst])                       \n\t" /* even 3 */
568           "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
569           "dpa.w.ph         $ac2,             %[p1],          %[filter45]     \n\t" /* even 5 */
570           "extp             %[Temp2],         $ac2,           31              \n\t" /* even 5 */
571           "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 4 */
572 
573           /* even 6. pixel */
574           "mtlo             %[vector_64],     $ac1                            \n\t" /* even 7 */
575           "mthi             $zero,            $ac1                            \n\t"
576           "sb               %[st1],           0(%[dst])                       \n\t" /* even 4 */
577           "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
578           "ulw              %[qload1],        20(%[src])                      \n\t"
579           "dpa.w.ph         $ac3,             %[p5],          %[filter45]     \n\t" /* even 6 */
580           "extp             %[Temp3],         $ac3,           31              \n\t" /* even 6 */
581           "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 5 */
582 
583           /* even 7. pixel */
584           "mtlo             %[vector_64],     $ac2                            \n\t" /* even 8 */
585           "mthi             $zero,            $ac2                            \n\t"
586           "preceu.ph.qbr    %[p5],            %[qload1]                       \n\t"
587           "sb               %[st2],           0(%[dst])                       \n\t" /* even 5 */
588           "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
589           "dpa.w.ph         $ac1,             %[p2],          %[filter45]     \n\t" /* even 7 */
590           "extp             %[Temp1],         $ac1,           31              \n\t" /* even 7 */
591           "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* even 6 */
592 
593           /* even 8. pixel */
594           "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 1 */
595           "mthi             $zero,            $ac3                            \n\t"
596           "dpa.w.ph         $ac2,             %[p3],          %[filter45]     \n\t" /* even 8 */
597           "sb               %[st3],           0(%[dst])                       \n\t" /* even 6 */
598           "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
599           "extp             %[Temp2],         $ac2,           31              \n\t" /* even 8 */
600           "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* even 7 */
601 
602           /* ODD pixels */
603           "ulw              %[qload1],        1(%[src])                       \n\t"
604           "ulw              %[qload2],        5(%[src])                       \n\t"
605 
606           /* odd 1. pixel */
607           "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 2 */
608           "mthi             $zero,            $ac1                            \n\t"
609           "preceu.ph.qbr    %[p1],            %[qload1]                       \n\t"
610           "preceu.ph.qbl    %[p2],            %[qload1]                       \n\t"
611           "preceu.ph.qbr    %[p3],            %[qload2]                       \n\t"
612           "preceu.ph.qbl    %[p4],            %[qload2]                       \n\t"
613           "sb               %[st1],           0(%[dst])                       \n\t" /* even 7 */
614           "addu             %[dst],           %[dst],         %[dst_pitch_2]  \n\t"
615           "ulw              %[qload2],        9(%[src])                       \n\t"
616           "dpa.w.ph         $ac3,             %[p1],          %[filter45]     \n\t" /* odd 1 */
617           "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 1 */
618           "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* even 8 */
619 
620           /* odd 2. pixel */
621           "mtlo             %[vector_64],     $ac2                            \n\t" /* odd 3 */
622           "mthi             $zero,            $ac2                            \n\t"
623           "preceu.ph.qbr    %[p1],            %[qload2]                       \n\t"
624           "preceu.ph.qbl    %[p5],            %[qload2]                       \n\t"
625           "sb               %[st2],           0(%[dst])                       \n\t" /* even 8 */
626           "ulw              %[qload1],        13(%[src])                      \n\t"
627           "dpa.w.ph         $ac1,             %[p2],          %[filter45]     \n\t" /* odd 2 */
628           "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 2 */
629           "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 1 */
630 
631           /* odd 3. pixel */
632           "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 4 */
633           "mthi             $zero,            $ac3                            \n\t"
634           "preceu.ph.qbr    %[p2],            %[qload1]                       \n\t"
635           "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 1 */
636           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
637           "dpa.w.ph         $ac2,             %[p3],          %[filter45]     \n\t" /* odd 3 */
638           "extp             %[Temp2],         $ac2,           31              \n\t" /* odd 3 */
639           "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 2 */
640 
641           /* odd 4. pixel */
642           "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 5 */
643           "mthi             $zero,            $ac1                            \n\t"
644           "preceu.ph.qbl    %[p3],            %[qload1]                       \n\t"
645           "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 2 */
646           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
647           "dpa.w.ph         $ac3,             %[p4],          %[filter45]     \n\t" /* odd 4 */
648           "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 4 */
649           "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* odd 3 */
650 
651           /* odd 5. pixel */
652           "mtlo             %[vector_64],     $ac2                            \n\t" /* odd 6 */
653           "mthi             $zero,            $ac2                            \n\t"
654           "sb               %[st2],           0(%[odd_dst])                   \n\t" /* odd 3 */
655           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
656           "dpa.w.ph         $ac1,             %[p1],          %[filter45]     \n\t" /* odd 5 */
657           "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 5 */
658           "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 4 */
659 
660           /* odd 6. pixel */
661           "mtlo             %[vector_64],     $ac3                            \n\t" /* odd 7 */
662           "mthi             $zero,            $ac3                            \n\t"
663           "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 4 */
664           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
665           "ulw              %[qload1],        21(%[src])                      \n\t"
666           "dpa.w.ph         $ac2,             %[p5],          %[filter45]     \n\t" /* odd 6 */
667           "extp             %[Temp2],         $ac2,           31              \n\t" /* odd 6 */
668           "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 5 */
669 
670           /* odd 7. pixel */
671           "mtlo             %[vector_64],     $ac1                            \n\t" /* odd 8 */
672           "mthi             $zero,            $ac1                            \n\t"
673           "preceu.ph.qbr    %[p5],            %[qload1]                       \n\t"
674           "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 5 */
675           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
676           "dpa.w.ph         $ac3,             %[p2],          %[filter45]     \n\t" /* odd 7 */
677           "extp             %[Temp3],         $ac3,           31              \n\t" /* odd 7 */
678 
679           /* odd 8. pixel */
680           "dpa.w.ph         $ac1,             %[p3],          %[filter45]     \n\t" /* odd 8 */
681           "extp             %[Temp1],         $ac1,           31              \n\t" /* odd 8 */
682 
683           "lbux             %[st2],           %[Temp2](%[cm])                 \n\t" /* odd 6 */
684           "lbux             %[st3],           %[Temp3](%[cm])                 \n\t" /* odd 7 */
685           "lbux             %[st1],           %[Temp1](%[cm])                 \n\t" /* odd 8 */
686 
687           "sb               %[st2],           0(%[odd_dst])                   \n\t" /* odd 6 */
688           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
689 
690           "sb               %[st3],           0(%[odd_dst])                   \n\t" /* odd 7 */
691           "addu             %[odd_dst],       %[odd_dst],     %[dst_pitch_2]  \n\t"
692 
693           "sb               %[st1],           0(%[odd_dst])                   \n\t" /* odd 8 */
694 
695           : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5),
696             [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
697             [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
698             [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
699             [dst] "+r" (dst), [odd_dst] "+r" (odd_dst)
700           : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
701             [cm] "r" (cm),
702             [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
703       );
704 
705       src += 16;
706       dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
707       odd_dst = (dst + dst_stride);
708     }
709 
710     /* Next row... */
711     src_ptr += src_stride;
712     dst_ptr += 1;
713   }
714 }
715 
convolve_bi_horiz_transposed(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter,int w,int h)716 void convolve_bi_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
717                                   uint8_t *dst, ptrdiff_t dst_stride,
718                                   const int16_t *filter, int w, int h) {
719   int x, y;
720 
721   for (y = 0; y < h; ++y) {
722     for (x = 0; x < w; ++x) {
723       int sum = 0;
724 
725       sum += src[x] * filter[3];
726       sum += src[x + 1] * filter[4];
727 
728       dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
729     }
730 
731     src += src_stride;
732     dst += 1;
733   }
734 }
735 
/*
 * DSPr2 entry point for the 2-tap horizontal convolution with transposed
 * output.
 *
 * Programs the DSP control register so that the `extp` instructions used by
 * the assembly kernels extract from the expected accumulator bit position,
 * warms the cache for the first source row, and then dispatches on the block
 * width to the matching hand-written kernel.  Widths without a dedicated
 * kernel are handled by the plain C implementation
 * convolve_bi_horiz_transposed().
 *
 * src        - first pixel of the first input row
 * src_stride - distance between consecutive input rows
 * dst        - first pixel of the transposed output
 * dst_stride - distance between consecutive output rows
 * filter     - filter kernel; the 2-tap code paths use taps 3 and 4
 * w, h       - width and height of the input block
 */
void vp9_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                         uint8_t *dst, ptrdiff_t dst_stride,
                         const int16_t *filter,
                         int w, int h) {
  uint32_t pos = 38;

  /* bit position for extract from acc */
  __asm__ __volatile__ (
    "wrdsp      %[pos],     1           \n\t"
    :
    : [pos] "r" (pos)
  );

  /* prefetch data to cache memory */
  vp9_prefetch_load(src);
  vp9_prefetch_load(src + 32);

  switch (w) {
    case 4:
      convolve_bi_horiz_4_transposed_dspr2(src, src_stride,
                                           dst, dst_stride,
                                           filter, h);
      break;
    case 8:
      convolve_bi_horiz_8_transposed_dspr2(src, src_stride,
                                           dst, dst_stride,
                                           filter, h);
      break;
    case 16:
    case 32:
      /* The 16-wide kernel is told how many 16-pixel groups make up a row. */
      convolve_bi_horiz_16_transposed_dspr2(src, src_stride,
                                            dst, dst_stride,
                                            filter, h,
                                            (w / 16));
      break;
    case 64:
      /*
       * src and src + 32 were already prefetched above; a 64-pixel row also
       * reaches past src + 64, so hint that part of the first row as well
       * (the kernel itself uses the same +0/+32/+64 pattern for later rows).
       */
      vp9_prefetch_load(src + 64);
      convolve_bi_horiz_64_transposed_dspr2(src, src_stride,
                                            dst, dst_stride,
                                            filter, h);
      break;
    default:
      convolve_bi_horiz_transposed(src, src_stride,
                                   dst, dst_stride,
                                   filter, w, h);
      break;
  }
}
784 #endif
785