/*
 * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <stdio.h>

#include "./vpx_config.h"
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_common.h"
#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"
#include "vp9/common/vp9_convolve.h"
#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"

#if HAVE_DSPR2
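/* Note: the helpers below implement the 2-tap (bilinear) horizontal filter,
 * i.e. only taps 3 and 4 of the 8-tap kernel are assumed to be non-zero.
 * The two centre taps are loaded as one packed 32-bit word (filter45) so a
 * single dpa.w.ph multiply-accumulate produces each output pixel.  Each
 * accumulator is pre-loaded with the rounding constant 64, the result is
 * extracted with extp (the DSPControl pos field is programmed once in
 * vp9_convolve2_horiz_dspr2), and vp9_ff_cropTbl clamps the value to the
 * 0..255 range.  Odd output pixels reuse the same source data realigned by
 * one byte, either via balign or via loads at offset 1. */
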
static void convolve_bi_horiz_4_dspr2(const uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int16_t *filter_x0,
                                      int32_t h) {
  int32_t y;
  uint8_t *cm = vp9_ff_cropTbl;
  int32_t Temp1, Temp2, Temp3, Temp4;
  uint32_t vector4a = 64;
  uint32_t tp1, tp2;
  uint32_t p1, p2;
  const int16_t *filter = &filter_x0[3];
  uint32_t filter45;

  filter45 = ((const int32_t *)filter)[0];

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    vp9_prefetch_load(src + src_stride);
    vp9_prefetch_load(src + src_stride + 32);
    vp9_prefetch_store(dst + dst_stride);

    __asm__ __volatile__ (
        "ulw              %[tp1],         0(%[src])                      \n\t"
        "ulw              %[tp2],         4(%[src])                      \n\t"

        /* even 1. pixel */
        "mtlo             %[vector4a],    $ac3                           \n\t"
        "mthi             $zero,          $ac3                           \n\t"
        "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
        "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
        "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
        "extp             %[Temp1],       $ac3,           31             \n\t"

        /* even 2. pixel */
        "mtlo             %[vector4a],    $ac2                           \n\t"
        "mthi             $zero,          $ac2                           \n\t"
        "balign           %[tp2],         %[tp1],         3              \n\t"
        "dpa.w.ph         $ac2,           %[p2],          %[filter45]    \n\t"
        "extp             %[Temp3],       $ac2,           31             \n\t"

        /* odd 1. pixel */
        "lbux             %[tp1],         %[Temp1](%[cm])                \n\t"
        "mtlo             %[vector4a],    $ac3                           \n\t"
        "mthi             $zero,          $ac3                           \n\t"
        "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"
        "preceu.ph.qbl    %[p2],          %[tp2]                         \n\t"
        "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
        "extp             %[Temp2],       $ac3,           31             \n\t"

        /* odd 2. pixel */
        "lbux             %[tp2],         %[Temp3](%[cm])                \n\t"
        "mtlo             %[vector4a],    $ac2                           \n\t"
        "mthi             $zero,          $ac2                           \n\t"
        "dpa.w.ph         $ac2,           %[p2],          %[filter45]    \n\t"
        "extp             %[Temp4],       $ac2,           31             \n\t"

        /* clamp */
        "lbux             %[p1],          %[Temp2](%[cm])                \n\t"
        "lbux             %[p2],          %[Temp4](%[cm])                \n\t"

        /* store bytes */
        "sb               %[tp1],         0(%[dst])                      \n\t"
        "sb               %[p1],          1(%[dst])                      \n\t"
        "sb               %[tp2],         2(%[dst])                      \n\t"
        "sb               %[p2],          3(%[dst])                      \n\t"

        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
          [p1] "=&r" (p1), [p2] "=&r" (p2),
          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
          [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
        : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
          [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
    );

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}

static void convolve_bi_horiz_8_dspr2(const uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int16_t *filter_x0,
                                      int32_t h) {
  int32_t y;
  uint8_t *cm = vp9_ff_cropTbl;
  uint32_t vector4a = 64;
  int32_t Temp1, Temp2, Temp3;
  uint32_t tp1, tp2, tp3;
  uint32_t p1, p2, p3, p4;
  uint32_t st0, st1;
  const int16_t *filter = &filter_x0[3];
  uint32_t filter45;

  filter45 = ((const int32_t *)filter)[0];

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    vp9_prefetch_load(src + src_stride);
    vp9_prefetch_load(src + src_stride + 32);
    vp9_prefetch_store(dst + dst_stride);

    __asm__ __volatile__ (
        "ulw              %[tp1],         0(%[src])                      \n\t"
        "ulw              %[tp2],         4(%[src])                      \n\t"

        /* even 1. pixel */
        "mtlo             %[vector4a],    $ac3                           \n\t"
        "mthi             $zero,          $ac3                           \n\t"
        "mtlo             %[vector4a],    $ac2                           \n\t"
        "mthi             $zero,          $ac2                           \n\t"
        "preceu.ph.qbr    %[p1],          %[tp1]                         \n\t"
        "preceu.ph.qbl    %[p2],          %[tp1]                         \n\t"
        "preceu.ph.qbr    %[p3],          %[tp2]                         \n\t"
        "preceu.ph.qbl    %[p4],          %[tp2]                         \n\t"
        "ulw              %[tp3],         8(%[src])                      \n\t"
        "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
        "extp             %[Temp1],       $ac3,           31             \n\t"

        /* even 2. pixel */
        "dpa.w.ph         $ac2,           %[p2],          %[filter45]    \n\t"
        "extp             %[Temp3],       $ac2,           31             \n\t"

        /* even 3. pixel */
        "lbux             %[st0],         %[Temp1](%[cm])                \n\t"
        "mtlo             %[vector4a],    $ac1                           \n\t"
        "mthi             $zero,          $ac1                           \n\t"
        "dpa.w.ph         $ac1,           %[p3],          %[filter45]    \n\t"
        "extp             %[Temp1],       $ac1,           31             \n\t"

        /* even 4. pixel */
        "mtlo             %[vector4a],    $ac2                           \n\t"
        "mthi             $zero,          $ac2                           \n\t"
        "mtlo             %[vector4a],    $ac3                           \n\t"
        "mthi             $zero,          $ac3                           \n\t"
        "sb               %[st0],         0(%[dst])                      \n\t"
        "lbux             %[st1],         %[Temp3](%[cm])                \n\t"

        "balign           %[tp3],         %[tp2],         3              \n\t"
        "balign           %[tp2],         %[tp1],         3              \n\t"

        "dpa.w.ph         $ac2,           %[p4],          %[filter45]    \n\t"
        "extp             %[Temp3],       $ac2,           31             \n\t"

        "lbux             %[st0],         %[Temp1](%[cm])                \n\t"

        /* odd 1. pixel */
        "mtlo             %[vector4a],    $ac1                           \n\t"
        "mthi             $zero,          $ac1                           \n\t"
        "sb               %[st1],         2(%[dst])                      \n\t"
        "preceu.ph.qbr    %[p1],          %[tp2]                         \n\t"
        "preceu.ph.qbl    %[p2],          %[tp2]                         \n\t"
        "preceu.ph.qbr    %[p3],          %[tp3]                         \n\t"
        "preceu.ph.qbl    %[p4],          %[tp3]                         \n\t"
        "sb               %[st0],         4(%[dst])                      \n\t"
        "dpa.w.ph         $ac3,           %[p1],          %[filter45]    \n\t"
        "extp             %[Temp2],       $ac3,           31             \n\t"

        /* odd 2. pixel */
        "mtlo             %[vector4a],    $ac3                           \n\t"
        "mthi             $zero,          $ac3                           \n\t"
        "mtlo             %[vector4a],    $ac2                           \n\t"
        "mthi             $zero,          $ac2                           \n\t"
        "lbux             %[st0],         %[Temp3](%[cm])                \n\t"
        "dpa.w.ph         $ac1,           %[p2],          %[filter45]    \n\t"
        "extp             %[Temp3],       $ac1,           31             \n\t"

        /* odd 3. pixel */
        "lbux             %[st1],         %[Temp2](%[cm])                \n\t"
        "dpa.w.ph         $ac3,           %[p3],          %[filter45]    \n\t"
        "extp             %[Temp2],       $ac3,           31             \n\t"

        /* odd 4. pixel */
        "sb               %[st1],         1(%[dst])                      \n\t"
        "sb               %[st0],         6(%[dst])                      \n\t"
        "dpa.w.ph         $ac2,           %[p4],          %[filter45]    \n\t"
        "extp             %[Temp1],       $ac2,           31             \n\t"

        /* clamp */
        "lbux             %[p4],          %[Temp3](%[cm])                \n\t"
        "lbux             %[p2],          %[Temp2](%[cm])                \n\t"
        "lbux             %[p1],          %[Temp1](%[cm])                \n\t"

        /* store bytes */
        "sb               %[p4],          3(%[dst])                      \n\t"
        "sb               %[p2],          5(%[dst])                      \n\t"
        "sb               %[p1],          7(%[dst])                      \n\t"

        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3),
          [st0] "=&r" (st0), [st1] "=&r" (st1),
          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
        : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
          [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
    );

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}

static void convolve_bi_horiz_16_dspr2(const uint8_t *src_ptr,
                                       int32_t src_stride,
                                       uint8_t *dst_ptr,
                                       int32_t dst_stride,
                                       const int16_t *filter_x0,
                                       int32_t h,
                                       int32_t count) {
  int32_t y, c;
  const uint8_t *src;
  uint8_t *dst;
  uint8_t *cm = vp9_ff_cropTbl;
  uint32_t vector_64 = 64;
  int32_t Temp1, Temp2, Temp3;
  uint32_t qload1, qload2, qload3;
  uint32_t p1, p2, p3, p4, p5;
  uint32_t st1, st2, st3;
  const int16_t *filter = &filter_x0[3];
  uint32_t filter45;

  filter45 = ((const int32_t *)filter)[0];

  for (y = h; y--;) {
    src = src_ptr;
    dst = dst_ptr;

    /* prefetch data to cache memory */
    vp9_prefetch_load(src_ptr + src_stride);
    vp9_prefetch_load(src_ptr + src_stride + 32);
    vp9_prefetch_store(dst_ptr + dst_stride);

    for (c = 0; c < count; c++) {
      __asm__ __volatile__ (
          "ulw              %[qload1],      0(%[src])                    \n\t"
          "ulw              %[qload2],      4(%[src])                    \n\t"

          /* even 1. pixel */
          "mtlo             %[vector_64],   $ac1                         \n\t" /* even 1 */
          "mthi             $zero,          $ac1                         \n\t"
          "mtlo             %[vector_64],   $ac2                         \n\t" /* even 2 */
          "mthi             $zero,          $ac2                         \n\t"
          "preceu.ph.qbr    %[p1],          %[qload1]                    \n\t"
          "preceu.ph.qbl    %[p2],          %[qload1]                    \n\t"
          "preceu.ph.qbr    %[p3],          %[qload2]                    \n\t"
          "preceu.ph.qbl    %[p4],          %[qload2]                    \n\t"
          "ulw              %[qload3],      8(%[src])                    \n\t"
          "dpa.w.ph         $ac1,           %[p1],         %[filter45]   \n\t" /* even 1 */
          "extp             %[Temp1],       $ac1,          31            \n\t" /* even 1 */

          /* even 2. pixel */
          "mtlo             %[vector_64],   $ac3                         \n\t" /* even 3 */
          "mthi             $zero,          $ac3                         \n\t"
          "preceu.ph.qbr    %[p1],          %[qload3]                    \n\t"
          "preceu.ph.qbl    %[p5],          %[qload3]                    \n\t"
          "ulw              %[qload1],      12(%[src])                   \n\t"
          "dpa.w.ph         $ac2,           %[p2],         %[filter45]   \n\t" /* even 1 */
          "extp             %[Temp2],       $ac2,          31            \n\t" /* even 1 */
          "lbux             %[st1],         %[Temp1](%[cm])              \n\t" /* even 1 */

          /* even 3. pixel */
          "mtlo             %[vector_64],   $ac1                         \n\t" /* even 4 */
          "mthi             $zero,          $ac1                         \n\t"
          "preceu.ph.qbr    %[p2],          %[qload1]                    \n\t"
          "sb               %[st1],         0(%[dst])                    \n\t" /* even 1 */
          "dpa.w.ph         $ac3,           %[p3],         %[filter45]   \n\t" /* even 3 */
          "extp             %[Temp3],       $ac3,          31            \n\t" /* even 3 */
          "lbux             %[st2],         %[Temp2](%[cm])              \n\t" /* even 1 */

          /* even 4. pixel */
          "mtlo             %[vector_64],   $ac2                         \n\t" /* even 5 */
          "mthi             $zero,          $ac2                         \n\t"
          "preceu.ph.qbl    %[p3],          %[qload1]                    \n\t"
          "sb               %[st2],         2(%[dst])                    \n\t" /* even 1 */
          "dpa.w.ph         $ac1,           %[p4],         %[filter45]   \n\t" /* even 4 */
          "extp             %[Temp1],       $ac1,          31            \n\t" /* even 4 */
          "lbux             %[st3],         %[Temp3](%[cm])              \n\t" /* even 3 */

          /* even 5. pixel */
          "mtlo             %[vector_64],   $ac3                         \n\t" /* even 6 */
          "mthi             $zero,          $ac3                         \n\t"
          "sb               %[st3],         4(%[dst])                    \n\t" /* even 3 */
          "dpa.w.ph         $ac2,           %[p1],         %[filter45]   \n\t" /* even 5 */
          "extp             %[Temp2],       $ac2,          31            \n\t" /* even 5 */
          "lbux             %[st1],         %[Temp1](%[cm])              \n\t" /* even 4 */

          /* even 6. pixel */
          "mtlo             %[vector_64],   $ac1                         \n\t" /* even 7 */
          "mthi             $zero,          $ac1                         \n\t"
          "sb               %[st1],         6(%[dst])                    \n\t" /* even 4 */
          "dpa.w.ph         $ac3,           %[p5],         %[filter45]   \n\t" /* even 6 */
          "extp             %[Temp3],       $ac3,          31            \n\t" /* even 6 */
          "lbux             %[st2],         %[Temp2](%[cm])              \n\t" /* even 5 */

          /* even 7. pixel */
          "mtlo             %[vector_64],   $ac2                         \n\t" /* even 8 */
          "mthi             $zero,          $ac2                         \n\t"
          "sb               %[st2],         8(%[dst])                    \n\t" /* even 5 */
          "dpa.w.ph         $ac1,           %[p2],         %[filter45]   \n\t" /* even 7 */
          "extp             %[Temp1],       $ac1,          31            \n\t" /* even 7 */
          "lbux             %[st3],         %[Temp3](%[cm])              \n\t" /* even 6 */

          /* even 8. pixel */
          "mtlo             %[vector_64],   $ac3                         \n\t" /* odd 1 */
          "mthi             $zero,          $ac3                         \n\t"
          "dpa.w.ph         $ac2,           %[p3],         %[filter45]   \n\t" /* even 8 */
          "sb               %[st3],         10(%[dst])                   \n\t" /* even 6 */
          "extp             %[Temp2],       $ac2,          31            \n\t" /* even 8 */
          "lbux             %[st1],         %[Temp1](%[cm])              \n\t" /* even 7 */

          /* ODD pixels */
          "ulw              %[qload1],      1(%[src])                    \n\t"
          "ulw              %[qload2],      5(%[src])                    \n\t"

          /* odd 1. pixel */
          "mtlo             %[vector_64],   $ac1                         \n\t" /* odd 2 */
          "mthi             $zero,          $ac1                         \n\t"
          "preceu.ph.qbr    %[p1],          %[qload1]                    \n\t"
          "preceu.ph.qbl    %[p2],          %[qload1]                    \n\t"
          "preceu.ph.qbr    %[p3],          %[qload2]                    \n\t"
          "preceu.ph.qbl    %[p4],          %[qload2]                    \n\t"
          "sb               %[st1],         12(%[dst])                   \n\t" /* even 7 */
          "ulw              %[qload3],      9(%[src])                    \n\t"
          "dpa.w.ph         $ac3,           %[p1],         %[filter45]   \n\t" /* odd 1 */
          "extp             %[Temp3],       $ac3,          31            \n\t" /* odd 1 */
          "lbux             %[st2],         %[Temp2](%[cm])              \n\t" /* even 8 */

          /* odd 2. pixel */
          "mtlo             %[vector_64],   $ac2                         \n\t" /* odd 3 */
          "mthi             $zero,          $ac2                         \n\t"
          "preceu.ph.qbr    %[p1],          %[qload3]                    \n\t"
          "preceu.ph.qbl    %[p5],          %[qload3]                    \n\t"
          "sb               %[st2],         14(%[dst])                   \n\t" /* even 8 */
          "ulw              %[qload1],      13(%[src])                   \n\t"
          "dpa.w.ph         $ac1,           %[p2],         %[filter45]   \n\t" /* odd 2 */
          "extp             %[Temp1],       $ac1,          31            \n\t" /* odd 2 */
          "lbux             %[st3],         %[Temp3](%[cm])              \n\t" /* odd 1 */

          /* odd 3. pixel */
          "mtlo             %[vector_64],   $ac3                         \n\t" /* odd 4 */
          "mthi             $zero,          $ac3                         \n\t"
          "preceu.ph.qbr    %[p2],          %[qload1]                    \n\t"
          "sb               %[st3],         1(%[dst])                    \n\t" /* odd 1 */
          "dpa.w.ph         $ac2,           %[p3],         %[filter45]   \n\t" /* odd 3 */
          "extp             %[Temp2],       $ac2,          31            \n\t" /* odd 3 */
          "lbux             %[st1],         %[Temp1](%[cm])              \n\t" /* odd 2 */

          /* odd 4. pixel */
          "mtlo             %[vector_64],   $ac1                         \n\t" /* odd 5 */
          "mthi             $zero,          $ac1                         \n\t"
          "preceu.ph.qbl    %[p3],          %[qload1]                    \n\t"
          "sb               %[st1],         3(%[dst])                    \n\t" /* odd 2 */
          "dpa.w.ph         $ac3,           %[p4],         %[filter45]   \n\t" /* odd 4 */
          "extp             %[Temp3],       $ac3,          31            \n\t" /* odd 4 */
          "lbux             %[st2],         %[Temp2](%[cm])              \n\t" /* odd 3 */

          /* odd 5. pixel */
          "mtlo             %[vector_64],   $ac2                         \n\t" /* odd 6 */
          "mthi             $zero,          $ac2                         \n\t"
          "sb               %[st2],         5(%[dst])                    \n\t" /* odd 3 */
          "dpa.w.ph         $ac1,           %[p1],         %[filter45]   \n\t" /* odd 5 */
          "extp             %[Temp1],       $ac1,          31            \n\t" /* odd 5 */
          "lbux             %[st3],         %[Temp3](%[cm])              \n\t" /* odd 4 */

          /* odd 6. pixel */
          "mtlo             %[vector_64],   $ac3                         \n\t" /* odd 7 */
          "mthi             $zero,          $ac3                         \n\t"
          "sb               %[st3],         7(%[dst])                    \n\t" /* odd 4 */
          "dpa.w.ph         $ac2,           %[p5],         %[filter45]   \n\t" /* odd 6 */
          "extp             %[Temp2],       $ac2,          31            \n\t" /* odd 6 */
          "lbux             %[st1],         %[Temp1](%[cm])              \n\t" /* odd 5 */

          /* odd 7. pixel */
          "mtlo             %[vector_64],   $ac1                         \n\t" /* odd 8 */
          "mthi             $zero,          $ac1                         \n\t"
          "sb               %[st1],         9(%[dst])                    \n\t" /* odd 5 */
          "dpa.w.ph         $ac3,           %[p2],         %[filter45]   \n\t" /* odd 7 */
          "extp             %[Temp3],       $ac3,          31            \n\t" /* odd 7 */

          /* odd 8. pixel */
          "dpa.w.ph         $ac1,           %[p3],         %[filter45]   \n\t" /* odd 8 */
          "extp             %[Temp1],       $ac1,          31            \n\t" /* odd 8 */

          "lbux             %[st2],         %[Temp2](%[cm])              \n\t" /* odd 6 */
          "lbux             %[st3],         %[Temp3](%[cm])              \n\t" /* odd 7 */
          "lbux             %[st1],         %[Temp1](%[cm])              \n\t" /* odd 8 */

          "sb               %[st2],         11(%[dst])                   \n\t" /* odd 6 */
          "sb               %[st3],         13(%[dst])                   \n\t" /* odd 7 */
          "sb               %[st1],         15(%[dst])                   \n\t" /* odd 8 */

          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2),
            [qload3] "=&r" (qload3),
            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
            [p5] "=&r" (p5),
            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
          : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
            [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
      );

      src += 16;
      dst += 16;
    }

    /* Next row... */
    src_ptr += src_stride;
    dst_ptr += dst_stride;
  }
}

static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr,
                                       int32_t src_stride,
                                       uint8_t *dst_ptr,
                                       int32_t dst_stride,
                                       const int16_t *filter_x0,
                                       int32_t h) {
  int32_t y, c;
  const uint8_t *src;
  uint8_t *dst;
  uint8_t *cm = vp9_ff_cropTbl;
  uint32_t vector_64 = 64;
  int32_t Temp1, Temp2, Temp3;
  uint32_t qload1, qload2, qload3;
  uint32_t p1, p2, p3, p4, p5;
  uint32_t st1, st2, st3;
  const int16_t *filter = &filter_x0[3];
  uint32_t filter45;

  filter45 = ((const int32_t *)filter)[0];

  for (y = h; y--;) {
    src = src_ptr;
    dst = dst_ptr;

    /* prefetch data to cache memory */
    vp9_prefetch_load(src_ptr + src_stride);
    vp9_prefetch_load(src_ptr + src_stride + 32);
    vp9_prefetch_load(src_ptr + src_stride + 64);
    vp9_prefetch_store(dst_ptr + dst_stride);
    vp9_prefetch_store(dst_ptr + dst_stride + 32);

    for (c = 0; c < 4; c++) {
      __asm__ __volatile__ (
          "ulw              %[qload1],      0(%[src])                    \n\t"
          "ulw              %[qload2],      4(%[src])                    \n\t"

          /* even 1. pixel */
          "mtlo             %[vector_64],   $ac1                         \n\t" /* even 1 */
          "mthi             $zero,          $ac1                         \n\t"
          "mtlo             %[vector_64],   $ac2                         \n\t" /* even 2 */
          "mthi             $zero,          $ac2                         \n\t"
          "preceu.ph.qbr    %[p1],          %[qload1]                    \n\t"
          "preceu.ph.qbl    %[p2],          %[qload1]                    \n\t"
          "preceu.ph.qbr    %[p3],          %[qload2]                    \n\t"
          "preceu.ph.qbl    %[p4],          %[qload2]                    \n\t"
          "ulw              %[qload3],      8(%[src])                    \n\t"
          "dpa.w.ph         $ac1,           %[p1],         %[filter45]   \n\t" /* even 1 */
          "extp             %[Temp1],       $ac1,          31            \n\t" /* even 1 */

          /* even 2. pixel */
          "mtlo             %[vector_64],   $ac3                         \n\t" /* even 3 */
          "mthi             $zero,          $ac3                         \n\t"
          "preceu.ph.qbr    %[p1],          %[qload3]                    \n\t"
          "preceu.ph.qbl    %[p5],          %[qload3]                    \n\t"
          "ulw              %[qload1],      12(%[src])                   \n\t"
          "dpa.w.ph         $ac2,           %[p2],         %[filter45]   \n\t" /* even 1 */
          "extp             %[Temp2],       $ac2,          31            \n\t" /* even 1 */
          "lbux             %[st1],         %[Temp1](%[cm])              \n\t" /* even 1 */

          /* even 3. pixel */
          "mtlo             %[vector_64],   $ac1                         \n\t" /* even 4 */
          "mthi             $zero,          $ac1                         \n\t"
          "preceu.ph.qbr    %[p2],          %[qload1]                    \n\t"
          "sb               %[st1],         0(%[dst])                    \n\t" /* even 1 */
          "dpa.w.ph         $ac3,           %[p3],         %[filter45]   \n\t" /* even 3 */
          "extp             %[Temp3],       $ac3,          31            \n\t" /* even 3 */
          "lbux             %[st2],         %[Temp2](%[cm])              \n\t" /* even 1 */

          /* even 4. pixel */
          "mtlo             %[vector_64],   $ac2                         \n\t" /* even 5 */
          "mthi             $zero,          $ac2                         \n\t"
          "preceu.ph.qbl    %[p3],          %[qload1]                    \n\t"
          "sb               %[st2],         2(%[dst])                    \n\t" /* even 1 */
          "dpa.w.ph         $ac1,           %[p4],         %[filter45]   \n\t" /* even 4 */
          "extp             %[Temp1],       $ac1,          31            \n\t" /* even 4 */
          "lbux             %[st3],         %[Temp3](%[cm])              \n\t" /* even 3 */

          /* even 5. pixel */
          "mtlo             %[vector_64],   $ac3                         \n\t" /* even 6 */
          "mthi             $zero,          $ac3                         \n\t"
          "sb               %[st3],         4(%[dst])                    \n\t" /* even 3 */
          "dpa.w.ph         $ac2,           %[p1],         %[filter45]   \n\t" /* even 5 */
          "extp             %[Temp2],       $ac2,          31            \n\t" /* even 5 */
          "lbux             %[st1],         %[Temp1](%[cm])              \n\t" /* even 4 */

          /* even 6. pixel */
          "mtlo             %[vector_64],   $ac1                         \n\t" /* even 7 */
          "mthi             $zero,          $ac1                         \n\t"
          "sb               %[st1],         6(%[dst])                    \n\t" /* even 4 */
          "dpa.w.ph         $ac3,           %[p5],         %[filter45]   \n\t" /* even 6 */
          "extp             %[Temp3],       $ac3,          31            \n\t" /* even 6 */
          "lbux             %[st2],         %[Temp2](%[cm])              \n\t" /* even 5 */

          /* even 7. pixel */
          "mtlo             %[vector_64],   $ac2                         \n\t" /* even 8 */
          "mthi             $zero,          $ac2                         \n\t"
          "sb               %[st2],         8(%[dst])                    \n\t" /* even 5 */
          "dpa.w.ph         $ac1,           %[p2],         %[filter45]   \n\t" /* even 7 */
          "extp             %[Temp1],       $ac1,          31            \n\t" /* even 7 */
          "lbux             %[st3],         %[Temp3](%[cm])              \n\t" /* even 6 */

          /* even 8. pixel */
          "mtlo             %[vector_64],   $ac3                         \n\t" /* odd 1 */
          "mthi             $zero,          $ac3                         \n\t"
          "dpa.w.ph         $ac2,           %[p3],         %[filter45]   \n\t" /* even 8 */
          "sb               %[st3],         10(%[dst])                   \n\t" /* even 6 */
          "extp             %[Temp2],       $ac2,          31            \n\t" /* even 8 */
          "lbux             %[st1],         %[Temp1](%[cm])              \n\t" /* even 7 */

          /* ODD pixels */
          "ulw              %[qload1],      1(%[src])                    \n\t"
          "ulw              %[qload2],      5(%[src])                    \n\t"

          /* odd 1. pixel */
          "mtlo             %[vector_64],   $ac1                         \n\t" /* odd 2 */
          "mthi             $zero,          $ac1                         \n\t"
          "preceu.ph.qbr    %[p1],          %[qload1]                    \n\t"
          "preceu.ph.qbl    %[p2],          %[qload1]                    \n\t"
          "preceu.ph.qbr    %[p3],          %[qload2]                    \n\t"
          "preceu.ph.qbl    %[p4],          %[qload2]                    \n\t"
          "sb               %[st1],         12(%[dst])                   \n\t" /* even 7 */
          "ulw              %[qload3],      9(%[src])                    \n\t"
          "dpa.w.ph         $ac3,           %[p1],         %[filter45]   \n\t" /* odd 1 */
          "extp             %[Temp3],       $ac3,          31            \n\t" /* odd 1 */
          "lbux             %[st2],         %[Temp2](%[cm])              \n\t" /* even 8 */

          /* odd 2. pixel */
          "mtlo             %[vector_64],   $ac2                         \n\t" /* odd 3 */
          "mthi             $zero,          $ac2                         \n\t"
          "preceu.ph.qbr    %[p1],          %[qload3]                    \n\t"
          "preceu.ph.qbl    %[p5],          %[qload3]                    \n\t"
          "sb               %[st2],         14(%[dst])                   \n\t" /* even 8 */
          "ulw              %[qload1],      13(%[src])                   \n\t"
          "dpa.w.ph         $ac1,           %[p2],         %[filter45]   \n\t" /* odd 2 */
          "extp             %[Temp1],       $ac1,          31            \n\t" /* odd 2 */
          "lbux             %[st3],         %[Temp3](%[cm])              \n\t" /* odd 1 */

          /* odd 3. pixel */
          "mtlo             %[vector_64],   $ac3                         \n\t" /* odd 4 */
          "mthi             $zero,          $ac3                         \n\t"
          "preceu.ph.qbr    %[p2],          %[qload1]                    \n\t"
          "sb               %[st3],         1(%[dst])                    \n\t" /* odd 1 */
          "dpa.w.ph         $ac2,           %[p3],         %[filter45]   \n\t" /* odd 3 */
          "extp             %[Temp2],       $ac2,          31            \n\t" /* odd 3 */
          "lbux             %[st1],         %[Temp1](%[cm])              \n\t" /* odd 2 */

          /* odd 4. pixel */
          "mtlo             %[vector_64],   $ac1                         \n\t" /* odd 5 */
          "mthi             $zero,          $ac1                         \n\t"
          "preceu.ph.qbl    %[p3],          %[qload1]                    \n\t"
          "sb               %[st1],         3(%[dst])                    \n\t" /* odd 2 */
          "dpa.w.ph         $ac3,           %[p4],         %[filter45]   \n\t" /* odd 4 */
          "extp             %[Temp3],       $ac3,          31            \n\t" /* odd 4 */
          "lbux             %[st2],         %[Temp2](%[cm])              \n\t" /* odd 3 */

          /* odd 5. pixel */
          "mtlo             %[vector_64],   $ac2                         \n\t" /* odd 6 */
          "mthi             $zero,          $ac2                         \n\t"
          "sb               %[st2],         5(%[dst])                    \n\t" /* odd 3 */
          "dpa.w.ph         $ac1,           %[p1],         %[filter45]   \n\t" /* odd 5 */
          "extp             %[Temp1],       $ac1,          31            \n\t" /* odd 5 */
          "lbux             %[st3],         %[Temp3](%[cm])              \n\t" /* odd 4 */

          /* odd 6. pixel */
          "mtlo             %[vector_64],   $ac3                         \n\t" /* odd 7 */
          "mthi             $zero,          $ac3                         \n\t"
          "sb               %[st3],         7(%[dst])                    \n\t" /* odd 4 */
          "dpa.w.ph         $ac2,           %[p5],         %[filter45]   \n\t" /* odd 6 */
          "extp             %[Temp2],       $ac2,          31            \n\t" /* odd 6 */
          "lbux             %[st1],         %[Temp1](%[cm])              \n\t" /* odd 5 */

          /* odd 7. pixel */
          "mtlo             %[vector_64],   $ac1                         \n\t" /* odd 8 */
          "mthi             $zero,          $ac1                         \n\t"
          "sb               %[st1],         9(%[dst])                    \n\t" /* odd 5 */
          "dpa.w.ph         $ac3,           %[p2],         %[filter45]   \n\t" /* odd 7 */
          "extp             %[Temp3],       $ac3,          31            \n\t" /* odd 7 */

          /* odd 8. pixel */
          "dpa.w.ph         $ac1,           %[p3],         %[filter45]   \n\t" /* odd 8 */
          "extp             %[Temp1],       $ac1,          31            \n\t" /* odd 8 */

          "lbux             %[st2],         %[Temp2](%[cm])              \n\t" /* odd 6 */
          "lbux             %[st3],         %[Temp3](%[cm])              \n\t" /* odd 7 */
          "lbux             %[st1],         %[Temp1](%[cm])              \n\t" /* odd 8 */

          "sb               %[st2],         11(%[dst])                   \n\t" /* odd 6 */
          "sb               %[st3],         13(%[dst])                   \n\t" /* odd 7 */
          "sb               %[st1],         15(%[dst])                   \n\t" /* odd 8 */

          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2),
            [qload3] "=&r" (qload3),
            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
            [p5] "=&r" (p5),
            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
          : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
            [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
      );

      src += 16;
      dst += 16;
    }

    /* Next row... */
    src_ptr += src_stride;
    dst_ptr += dst_stride;
  }
}

void vp9_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int x_step_q4,
                               const int16_t *filter_y, int y_step_q4,
                               int w, int h) {
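  /* The DSPR2 fast path below handles only unscaled horizontal filtering
   * (x_step_q4 == 16) at the block widths produced by VP9; anything else
   * falls back to the generic C convolution. */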
  if (16 == x_step_q4) {
    uint32_t pos = 38;

    vp9_prefetch_load((const uint8_t *)filter_x);

    /* bit position for extract from acc */
    __asm__ __volatile__ (
        "wrdsp            %[pos],         1                              \n\t"
        :
        : [pos] "r" (pos)
    );
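
    /* With pos == 38, the extp instructions in the helpers above return
     * accumulator bits 38..7 -- effectively the 64-rounded filter sum
     * shifted right by FILTER_BITS (7) before it is clamped through
     * vp9_ff_cropTbl. */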

    /* prefetch data to cache memory */
    vp9_prefetch_load(src);
    vp9_prefetch_load(src + 32);
    vp9_prefetch_store(dst);

    switch (w) {
      case 4:
        convolve_bi_horiz_4_dspr2(src, (int32_t)src_stride,
                                  dst, (int32_t)dst_stride,
                                  filter_x, (int32_t)h);
        break;
      case 8:
        convolve_bi_horiz_8_dspr2(src, (int32_t)src_stride,
                                  dst, (int32_t)dst_stride,
                                  filter_x, (int32_t)h);
        break;
      case 16:
        convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride,
                                   dst, (int32_t)dst_stride,
                                   filter_x, (int32_t)h, 1);
        break;
      case 32:
        convolve_bi_horiz_16_dspr2(src, (int32_t)src_stride,
                                   dst, (int32_t)dst_stride,
                                   filter_x, (int32_t)h, 2);
        break;
      case 64:
        vp9_prefetch_load(src + 64);
        vp9_prefetch_store(dst + 32);

        convolve_bi_horiz_64_dspr2(src, (int32_t)src_stride,
                                   dst, (int32_t)dst_stride,
                                   filter_x, (int32_t)h);
        break;
      default:
        vp9_convolve8_horiz_c(src, src_stride,
                              dst, dst_stride,
                              filter_x, x_step_q4,
                              filter_y, y_step_q4,
                              w, h);
        break;
    }
  } else {
    vp9_convolve8_horiz_c(src, src_stride,
                          dst, dst_stride,
                          filter_x, x_step_q4,
                          filter_y, y_step_q4,
                          w, h);
  }
}
#endif