1 /*
2  *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS.  All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
#include <assert.h>
#include <stdio.h>
#include <string.h>

#include "./vpx_config.h"
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_common.h"
#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"
#include "vp9/common/vp9_convolve.h"
#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
21 
22 #if HAVE_DSPR2
convolve_bi_avg_vert_4_dspr2(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int16_t * filter_y,int32_t w,int32_t h)23 static void convolve_bi_avg_vert_4_dspr2(const uint8_t *src,
24                                          int32_t src_stride,
25                                          uint8_t *dst,
26                                          int32_t dst_stride,
27                                          const int16_t *filter_y,
28                                          int32_t w,
29                                          int32_t h) {
30   int32_t       x, y;
31   const uint8_t *src_ptr;
32   uint8_t       *dst_ptr;
33   uint8_t       *cm = vp9_ff_cropTbl;
34   uint32_t      vector4a = 64;
35   uint32_t      load1, load2;
36   uint32_t      p1, p2;
37   uint32_t      scratch1, scratch2;
38   uint32_t      store1, store2;
39   int32_t       Temp1, Temp2;
40   const int16_t *filter = &filter_y[3];
41   uint32_t      filter45;
42 
43   filter45 = ((const int32_t *)filter)[0];
44 
45   for (y = h; y--;) {
46     /* prefetch data to cache memory */
47     vp9_prefetch_store(dst + dst_stride);
48 
49     for (x = 0; x < w; x += 4) {
50       src_ptr = src + x;
51       dst_ptr = dst + x;
52 
53       __asm__ __volatile__ (
54           "ulw              %[load1],     0(%[src_ptr])                   \n\t"
55           "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
56           "ulw              %[load2],     0(%[src_ptr])                   \n\t"
57 
58           "mtlo             %[vector4a],  $ac0                            \n\t"
59           "mtlo             %[vector4a],  $ac1                            \n\t"
60           "mtlo             %[vector4a],  $ac2                            \n\t"
61           "mtlo             %[vector4a],  $ac3                            \n\t"
62           "mthi             $zero,        $ac0                            \n\t"
63           "mthi             $zero,        $ac1                            \n\t"
64           "mthi             $zero,        $ac2                            \n\t"
65           "mthi             $zero,        $ac3                            \n\t"
66 
67           "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
68           "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
69           "precrq.ph.w      %[p2],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
70           "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
71 
72           "dpa.w.ph         $ac0,         %[p1],          %[filter45]     \n\t"
73           "dpa.w.ph         $ac1,         %[p2],          %[filter45]     \n\t"
74 
75           "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
76           "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
77           "precrq.ph.w      %[p2],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
78           "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
79 
80           "dpa.w.ph         $ac2,         %[p1],          %[filter45]     \n\t"
81           "dpa.w.ph         $ac3,         %[p2],          %[filter45]     \n\t"
82 
83           "extp             %[Temp1],     $ac0,           31              \n\t"
84           "extp             %[Temp2],     $ac1,           31              \n\t"
85 
86           "lbu              %[scratch1],  0(%[dst_ptr])                   \n\t"
87           "lbu              %[scratch2],  1(%[dst_ptr])                   \n\t"
88 
89           "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
90           "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 1 */
91           "extp             %[Temp1],     $ac2,           31              \n\t"
92 
93           "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
94           "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 2 */
95           "extp             %[Temp2],     $ac3,           31              \n\t"
96           "lbu              %[scratch1],  2(%[dst_ptr])                   \n\t"
97 
98           "sb               %[store1],    0(%[dst_ptr])                   \n\t"
99           "sb               %[store2],    1(%[dst_ptr])                   \n\t"
100           "lbu              %[scratch2],  3(%[dst_ptr])                   \n\t"
101 
102           "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
103           "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
104           "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 3 */
105           "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 4 */
106 
107           "sb               %[store1],    2(%[dst_ptr])                   \n\t"
108           "sb               %[store2],    3(%[dst_ptr])                   \n\t"
109 
110           : [load1] "=&r" (load1), [load2] "=&r" (load2),
111             [p1] "=&r" (p1), [p2] "=&r" (p2),
112             [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
113             [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
114             [store1] "=&r" (store1), [store2] "=&r" (store2),
115             [src_ptr] "+r" (src_ptr)
116           : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
117             [src_stride] "r" (src_stride), [cm] "r" (cm),
118             [dst_ptr] "r" (dst_ptr)
119       );
120     }
121 
122     /* Next row... */
123     src += src_stride;
124     dst += dst_stride;
125   }
126 }
127 
convolve_bi_avg_vert_64_dspr2(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int16_t * filter_y,int32_t h)128 static void convolve_bi_avg_vert_64_dspr2(const uint8_t *src,
129                                           int32_t src_stride,
130                                           uint8_t *dst,
131                                           int32_t dst_stride,
132                                           const int16_t *filter_y,
133                                           int32_t h) {
134   int32_t       x, y;
135   const uint8_t *src_ptr;
136   uint8_t       *dst_ptr;
137   uint8_t       *cm = vp9_ff_cropTbl;
138   uint32_t      vector4a = 64;
139   uint32_t      load1, load2;
140   uint32_t      p1, p2;
141   uint32_t      scratch1, scratch2;
142   uint32_t      store1, store2;
143   int32_t       Temp1, Temp2;
144   const int16_t *filter = &filter_y[3];
145   uint32_t filter45;;
146 
147   filter45 = ((const int32_t *)filter)[0];
148 
149   for (y = h; y--;) {
150     /* prefetch data to cache memory */
151     vp9_prefetch_store(dst + dst_stride);
152     vp9_prefetch_store(dst + dst_stride + 32);
153 
154     for (x = 0; x < 64; x += 4) {
155       src_ptr = src + x;
156       dst_ptr = dst + x;
157 
158       __asm__ __volatile__ (
159           "ulw              %[load1],     0(%[src_ptr])                   \n\t"
160           "add              %[src_ptr],   %[src_ptr],     %[src_stride]   \n\t"
161           "ulw              %[load2],     0(%[src_ptr])                   \n\t"
162 
163           "mtlo             %[vector4a],  $ac0                            \n\t"
164           "mtlo             %[vector4a],  $ac1                            \n\t"
165           "mtlo             %[vector4a],  $ac2                            \n\t"
166           "mtlo             %[vector4a],  $ac3                            \n\t"
167           "mthi             $zero,        $ac0                            \n\t"
168           "mthi             $zero,        $ac1                            \n\t"
169           "mthi             $zero,        $ac2                            \n\t"
170           "mthi             $zero,        $ac3                            \n\t"
171 
172           "preceu.ph.qbr    %[scratch1],  %[load1]                        \n\t"
173           "preceu.ph.qbr    %[p1],        %[load2]                        \n\t"
174           "precrq.ph.w      %[p2],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
175           "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
176 
177           "dpa.w.ph         $ac0,         %[p1],          %[filter45]     \n\t"
178           "dpa.w.ph         $ac1,         %[p2],          %[filter45]     \n\t"
179 
180           "preceu.ph.qbl    %[scratch1],  %[load1]                        \n\t"
181           "preceu.ph.qbl    %[p1],        %[load2]                        \n\t"
182           "precrq.ph.w      %[p2],        %[p1],          %[scratch1]     \n\t" /* pixel 2 */
183           "append           %[p1],        %[scratch1],    16              \n\t" /* pixel 1 */
184 
185           "dpa.w.ph         $ac2,         %[p1],          %[filter45]     \n\t"
186           "dpa.w.ph         $ac3,         %[p2],          %[filter45]     \n\t"
187 
188           "extp             %[Temp1],     $ac0,           31              \n\t"
189           "extp             %[Temp2],     $ac1,           31              \n\t"
190 
191           "lbu              %[scratch1],  0(%[dst_ptr])                   \n\t"
192           "lbu              %[scratch2],  1(%[dst_ptr])                   \n\t"
193 
194           "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
195           "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 1 */
196           "extp             %[Temp1],     $ac2,           31              \n\t"
197 
198           "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
199           "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 2 */
200           "extp             %[Temp2],     $ac3,           31              \n\t"
201           "lbu              %[scratch1],  2(%[dst_ptr])                   \n\t"
202 
203           "sb               %[store1],    0(%[dst_ptr])                   \n\t"
204           "sb               %[store2],    1(%[dst_ptr])                   \n\t"
205           "lbu              %[scratch2],  3(%[dst_ptr])                   \n\t"
206 
207           "lbux             %[store1],    %[Temp1](%[cm])                 \n\t"
208           "lbux             %[store2],    %[Temp2](%[cm])                 \n\t"
209           "addqh_r.w        %[store1],    %[store1],      %[scratch1]     \n\t" /* pixel 3 */
210           "addqh_r.w        %[store2],    %[store2],      %[scratch2]     \n\t" /* pixel 4 */
211 
212           "sb               %[store1],    2(%[dst_ptr])                   \n\t"
213           "sb               %[store2],    3(%[dst_ptr])                   \n\t"
214 
215           : [load1] "=&r" (load1), [load2] "=&r" (load2),
216             [p1] "=&r" (p1), [p2] "=&r" (p2),
217             [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
218             [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
219             [store1] "=&r" (store1), [store2] "=&r" (store2),
220             [src_ptr] "+r" (src_ptr)
221           : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
222             [src_stride] "r" (src_stride), [cm] "r" (cm),
223             [dst_ptr] "r" (dst_ptr)
224       );
225     }
226 
227     /* Next row... */
228     src += src_stride;
229     dst += dst_stride;
230   }
231 }
232 
/*
 * Entry point for the vertical 2-tap averaging convolution on DSPr2.
 *
 * When y_step_q4 is 16 and w is one of 4, 8, 16, 32 or 64 the work is done
 * by the DSPr2 helpers in this file; every other combination is forwarded
 * unchanged to vp9_convolve8_avg_vert_c().  filter_x and x_step_q4 are only
 * used by that fallback.
 */
void vp9_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int x_step_q4,
                                  const int16_t *filter_y, int y_step_q4,
                                  int w, int h) {
  uint32_t pos;

  /* Any step other than 16 goes straight to the C version. */
  if (16 != y_step_q4) {
    vp9_convolve8_avg_vert_c(src, src_stride,
                             dst, dst_stride,
                             filter_x, x_step_q4,
                             filter_y, y_step_q4,
                             w, h);
    return;
  }

  /* bit position for extract from acc */
  pos = 38;
  __asm__ __volatile__ (
    "wrdsp      %[pos],     1           \n\t"
    :
    : [pos] "r" (pos)
  );

  vp9_prefetch_store(dst);

  if (64 == w) {
    /* a 64-pixel row gets a second prefetch 32 bytes further on */
    vp9_prefetch_store(dst + 32);
    convolve_bi_avg_vert_64_dspr2(src, src_stride,
                                  dst, dst_stride,
                                  filter_y, h);
  } else if (4 == w || 8 == w || 16 == w || 32 == w) {
    convolve_bi_avg_vert_4_dspr2(src, src_stride,
                                 dst, dst_stride,
                                 filter_y, w, h);
  } else {
    /* unsupported width: use the C version */
    vp9_convolve8_avg_vert_c(src, src_stride,
                             dst, dst_stride,
                             filter_x, x_step_q4,
                             filter_y, y_step_q4,
                             w, h);
  }
}
281 #endif
282