1 /*
2  *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include <assert.h>
12 #include <stdio.h>
13 
14 #include "./vpx_config.h"
15 #include "./vp9_rtcd.h"
16 #include "vp9/common/vp9_common.h"
17 #include "vpx/vpx_integer.h"
18 #include "vpx_ports/mem.h"
19 #include "vp9/common/vp9_convolve.h"
20 #include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
21 
22 #if HAVE_DSPR2
convolve_vert_4_dspr2(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int16_t * filter_y,int32_t w,int32_t h)23 static void convolve_vert_4_dspr2(const uint8_t *src,
24                                   int32_t src_stride,
25                                   uint8_t *dst,
26                                   int32_t dst_stride,
27                                   const int16_t *filter_y,
28                                   int32_t w,
29                                   int32_t h) {
30   int32_t x, y;
31   const uint8_t *src_ptr;
32   uint8_t *dst_ptr;
33   uint8_t *cm = vp9_ff_cropTbl;
34   uint32_t vector4a = 64;
35   uint32_t load1, load2, load3, load4;
36   uint32_t p1, p2;
37   uint32_t n1, n2;
38   uint32_t scratch1, scratch2;
39   uint32_t store1, store2;
40   int32_t vector1b, vector2b, vector3b, vector4b;
41   int32_t Temp1, Temp2;
42 
43   vector1b = ((const int32_t *)filter_y)[0];
44   vector2b = ((const int32_t *)filter_y)[1];
45   vector3b = ((const int32_t *)filter_y)[2];
46   vector4b = ((const int32_t *)filter_y)[3];
47 
48   src -= 3 * src_stride;
49 
50   for (y = h; y--;) {
51     /* prefetch data to cache memory */
52     vp9_prefetch_store(dst + dst_stride);
53 
54     for (x = 0; x < w; x += 4) {
55       src_ptr = src + x;
56       dst_ptr = dst + x;
57 
58       __asm__ __volatile__ (
59           "ulw              %[load1],     0(%[src_ptr])                   \n\t"
60           "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
61           "ulw              %[load2],     0(%[src_ptr])                   \n\t"
62           "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
63           "ulw              %[load3],     0(%[src_ptr])                   \n\t"
64           "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
65           "ulw              %[load4],     0(%[src_ptr])                   \n\t"
66 
67           "mtlo             %[vector4a],  $ac0                            \n\t"
68           "mtlo             %[vector4a],  $ac1                            \n\t"
69           "mtlo             %[vector4a],  $ac2                            \n\t"
70           "mtlo             %[vector4a],  $ac3                            \n\t"
71           "mthi             $zero,        $ac0                            \n\t"
72           "mthi             $zero,        $ac1                            \n\t"
73           "mthi             $zero,        $ac2                            \n\t"
74           "mthi             $zero,        $ac3                            \n\t"
75 
76           "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
77           "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
78           "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
79           "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
80           "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
81           "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
82           "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
83           "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
84 
85           "dpa.w.ph         $ac0,         %[p1],          %[vector1b]     \n\t"
86           "dpa.w.ph         $ac0,         %[p2],          %[vector2b]     \n\t"
87           "dpa.w.ph         $ac1,         %[n1],          %[vector1b]     \n\t"
88           "dpa.w.ph         $ac1,         %[n2],          %[vector2b]     \n\t"
89 
90           "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
91           "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
92           "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
93           "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
94           "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
95           "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
96           "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
97           "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
98 
99           "dpa.w.ph         $ac2,         %[p1],          %[vector1b]     \n\t"
100           "dpa.w.ph         $ac2,         %[p2],          %[vector2b]     \n\t"
101           "dpa.w.ph         $ac3,         %[n1],          %[vector1b]     \n\t"
102           "dpa.w.ph         $ac3,         %[n2],          %[vector2b]     \n\t"
103 
104           "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
105           "ulw              %[load1],     0(%[src_ptr])                   \n\t"
106           "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
107           "ulw              %[load2],     0(%[src_ptr])                   \n\t"
108           "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
109           "ulw              %[load3],     0(%[src_ptr])                   \n\t"
110           "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
111           "ulw              %[load4],     0(%[src_ptr])                   \n\t"
112 
113           "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
114           "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
115           "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
116           "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
117           "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
118           "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
119           "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
120           "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
121 
122           "dpa.w.ph         $ac0,         %[p1],          %[vector3b]     \n\t"
123           "dpa.w.ph         $ac0,         %[p2],          %[vector4b]     \n\t"
124           "extp             %[Temp1],     $ac0,           31              \n\t"
125           "dpa.w.ph         $ac1,         %[n1],          %[vector3b]     \n\t"
126           "dpa.w.ph         $ac1,         %[n2],          %[vector4b]     \n\t"
127           "extp             %[Temp2],     $ac1,           31              \n\t"
128 
129           "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
130           "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
131           "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
132           "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
133           "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
134           "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
135           "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
136           "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
137 
138           "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
139           "dpa.w.ph         $ac2,         %[p1],          %[vector3b]     \n\t"
140           "dpa.w.ph         $ac2,         %[p2],          %[vector4b]     \n\t"
141           "extp             %[Temp1],     $ac2,           31              \n\t"
142 
143           "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
144           "dpa.w.ph         $ac3,         %[n1],          %[vector3b]     \n\t"
145           "dpa.w.ph         $ac3,         %[n2],          %[vector4b]     \n\t"
146           "extp             %[Temp2],     $ac3,           31              \n\t"
147 
148           "sb               %[store1],    0(%[dst_ptr])                   \n\t"
149           "sb               %[store2],    1(%[dst_ptr])                   \n\t"
150 
151           "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
152           "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
153 
154           "sb               %[store1],    2(%[dst_ptr])                   \n\t"
155           "sb               %[store2],    3(%[dst_ptr])                   \n\t"
156 
157           : [load1] "=&r" (load1), [load2] "=&r" (load2),
158             [load3] "=&r" (load3), [load4] "=&r" (load4),
159             [p1] "=&r" (p1), [p2] "=&r" (p2),
160             [n1] "=&r" (n1), [n2] "=&r" (n2),
161             [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
162             [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
163             [store1] "=&r" (store1), [store2] "=&r" (store2),
164             [src_ptr] "+r" (src_ptr)
165           : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
166             [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
167             [vector4a] "r" (vector4a), [src_stride] "r" (src_stride),
168             [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
169       );
170     }
171 
172     /* Next row... */
173     src += src_stride;
174     dst += dst_stride;
175   }
176 }
177 
convolve_vert_64_dspr2(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int16_t * filter_y,int32_t h)178 static void convolve_vert_64_dspr2(const uint8_t *src,
179                                    int32_t src_stride,
180                                    uint8_t *dst,
181                                    int32_t dst_stride,
182                                    const int16_t *filter_y,
183                                    int32_t h) {
184   int32_t x, y;
185   const uint8_t *src_ptr;
186   uint8_t *dst_ptr;
187   uint8_t *cm = vp9_ff_cropTbl;
188   uint32_t vector4a = 64;
189   uint32_t load1, load2, load3, load4;
190   uint32_t p1, p2;
191   uint32_t n1, n2;
192   uint32_t scratch1, scratch2;
193   uint32_t store1, store2;
194   int32_t vector1b, vector2b, vector3b, vector4b;
195   int32_t Temp1, Temp2;
196 
197   vector1b = ((const int32_t *)filter_y)[0];
198   vector2b = ((const int32_t *)filter_y)[1];
199   vector3b = ((const int32_t *)filter_y)[2];
200   vector4b = ((const int32_t *)filter_y)[3];
201 
202   src -= 3 * src_stride;
203 
204   for (y = h; y--;) {
205     /* prefetch data to cache memory */
206     vp9_prefetch_store(dst + dst_stride);
207     vp9_prefetch_store(dst + dst_stride + 32);
208 
209     for (x = 0; x < 64; x += 4) {
210       src_ptr = src + x;
211       dst_ptr = dst + x;
212 
213       __asm__ __volatile__ (
214           "ulw              %[load1],     0(%[src_ptr])                   \n\t"
215           "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
216           "ulw              %[load2],     0(%[src_ptr])                   \n\t"
217           "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
218           "ulw              %[load3],     0(%[src_ptr])                   \n\t"
219           "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
220           "ulw              %[load4],     0(%[src_ptr])                   \n\t"
221 
222           "mtlo             %[vector4a],  $ac0                            \n\t"
223           "mtlo             %[vector4a],  $ac1                            \n\t"
224           "mtlo             %[vector4a],  $ac2                            \n\t"
225           "mtlo             %[vector4a],  $ac3                            \n\t"
226           "mthi             $zero,        $ac0                            \n\t"
227           "mthi             $zero,        $ac1                            \n\t"
228           "mthi             $zero,        $ac2                            \n\t"
229           "mthi             $zero,        $ac3                            \n\t"
230 
231           "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
232           "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
233           "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
234           "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
235           "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
236           "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
237           "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
238           "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
239 
240           "dpa.w.ph         $ac0,         %[p1],          %[vector1b]     \n\t"
241           "dpa.w.ph         $ac0,         %[p2],          %[vector2b]     \n\t"
242           "dpa.w.ph         $ac1,         %[n1],          %[vector1b]     \n\t"
243           "dpa.w.ph         $ac1,         %[n2],          %[vector2b]     \n\t"
244 
245           "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
246           "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
247           "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
248           "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
249           "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
250           "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
251           "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
252           "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
253 
254           "dpa.w.ph         $ac2,         %[p1],          %[vector1b]     \n\t"
255           "dpa.w.ph         $ac2,         %[p2],          %[vector2b]     \n\t"
256           "dpa.w.ph         $ac3,         %[n1],          %[vector1b]     \n\t"
257           "dpa.w.ph         $ac3,         %[n2],          %[vector2b]     \n\t"
258 
259           "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
260           "ulw              %[load1],     0(%[src_ptr])                   \n\t"
261           "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
262           "ulw              %[load2],     0(%[src_ptr])                   \n\t"
263           "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
264           "ulw              %[load3],     0(%[src_ptr])                   \n\t"
265           "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
266           "ulw              %[load4],     0(%[src_ptr])                   \n\t"
267 
268           "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
269           "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
270           "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
271           "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
272           "preceu.ph.qbr    %[scratch2],  %[load3]                        \n\t"
273           "preceu.ph.qbr    %[p2],        %[load4]                        \n\t"
274           "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
275           "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
276 
277           "dpa.w.ph         $ac0,         %[p1],          %[vector3b]     \n\t"
278           "dpa.w.ph         $ac0,         %[p2],          %[vector4b]     \n\t"
279           "extp             %[Temp1],     $ac0,           31              \n\t"
280           "dpa.w.ph         $ac1,         %[n1],          %[vector3b]     \n\t"
281           "dpa.w.ph         $ac1,         %[n2],          %[vector4b]     \n\t"
282           "extp             %[Temp2],     $ac1,           31              \n\t"
283 
284           "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
285           "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
286           "precrq.ph.w      %[n1],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
287           "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
288           "preceu.ph.qbl    %[scratch2],  %[load3]                        \n\t"
289           "preceu.ph.qbl    %[p2],        %[load4]                        \n\t"
290           "precrq.ph.w      %[n2],        %[p2],          %[scratch2]     \n\t" /* pixel 2 */
291           "append           %[p2],        %[scratch2],    16              \n\t" /* pixel 1 */
292 
293           "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
294           "dpa.w.ph         $ac2,         %[p1],          %[vector3b]     \n\t"
295           "dpa.w.ph         $ac2,         %[p2],          %[vector4b]     \n\t"
296           "extp             %[Temp1],     $ac2,           31              \n\t"
297 
298           "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
299           "dpa.w.ph         $ac3,         %[n1],          %[vector3b]     \n\t"
300           "dpa.w.ph         $ac3,         %[n2],          %[vector4b]     \n\t"
301           "extp             %[Temp2],     $ac3,           31              \n\t"
302 
303           "sb               %[store1],    0(%[dst_ptr])                   \n\t"
304           "sb               %[store2],    1(%[dst_ptr])                   \n\t"
305 
306           "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
307           "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
308 
309           "sb               %[store1],    2(%[dst_ptr])                   \n\t"
310           "sb               %[store2],    3(%[dst_ptr])                   \n\t"
311 
312           : [load1] "=&r" (load1), [load2] "=&r" (load2),
313             [load3] "=&r" (load3), [load4] "=&r" (load4),
314             [p1] "=&r" (p1), [p2] "=&r" (p2),
315             [n1] "=&r" (n1), [n2] "=&r" (n2),
316             [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
317             [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
318             [store1] "=&r" (store1), [store2] "=&r" (store2),
319             [src_ptr] "+r" (src_ptr)
320           : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
321             [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
322             [vector4a] "r" (vector4a), [src_stride] "r" (src_stride),
323             [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
324       );
325     }
326 
327     /* Next row... */
328     src += src_stride;
329     dst += dst_stride;
330   }
331 }
332 
/*
 * Vertical 8-tap convolution entry point for MIPS DSPr2.
 *
 * Chooses an implementation from the filter taps, the vertical step and the
 * block width:
 *   - second tap word equal to 0x800000      -> vp9_convolve_copy()
 *   - first tap word equal to 0              -> vp9_convolve2_vert_dspr2()
 *   - y_step_q4 == 16 and w in {4,8,16,32}   -> convolve_vert_4_dspr2()
 *   - y_step_q4 == 16 and w == 64            -> convolve_vert_64_dspr2()
 *   - anything else                          -> vp9_convolve8_vert_c()
 *
 * filter_x and x_step_q4 are only forwarded to the routines that take them.
 */
void vp9_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const int16_t *filter_x, int x_step_q4,
                              const int16_t *filter_y, int y_step_q4,
                              int w, int h) {
  /* The 16-bit taps viewed two at a time as 32-bit words. */
  const int32_t *const tap_words = (const int32_t *)filter_y;
  /* Value written to the DSPControl pos field for the extp instructions. */
  uint32_t pos = 38;

  if (tap_words[1] == 0x800000) {
    vp9_convolve_copy(src, src_stride, dst, dst_stride,
                      filter_x, x_step_q4, filter_y, y_step_q4, w, h);
    return;
  }

  if (tap_words[0] == 0) {
    vp9_convolve2_vert_dspr2(src, src_stride, dst, dst_stride,
                             filter_x, x_step_q4, filter_y, y_step_q4, w, h);
    return;
  }

  if (y_step_q4 != 16) {
    /* The DSPr2 kernels are only used for a step of 16. */
    vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
                         filter_x, x_step_q4, filter_y, y_step_q4, w, h);
    return;
  }

  /* bit position for extract from acc */
  __asm__ __volatile__ (
    "wrdsp      %[pos],     1           \n\t"
    :
    : [pos] "r" (pos)
  );

  vp9_prefetch_store(dst);

  if (w == 64) {
    vp9_prefetch_store(dst + 32);
    convolve_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h);
  } else if (w == 4 || w == 8 || w == 16 || w == 32) {
    convolve_vert_4_dspr2(src, src_stride, dst, dst_stride, filter_y, w, h);
  } else {
    /* Widths without a DSPr2 kernel use the C implementation. */
    vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
                         filter_x, x_step_q4, filter_y, y_step_q4, w, h);
  }
}
395 
396 #endif
397