/*
 * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */
10
11 #include <assert.h>
12 #include <stdio.h>
13
14 #include "./vpx_config.h"
15 #include "./vp9_rtcd.h"
16 #include "vp9/common/vp9_common.h"
17 #include "vpx/vpx_integer.h"
18 #include "vpx_ports/mem.h"
19 #include "vp9/common/vp9_filter.h"
20 #include "vp9/common/mips/dspr2/vp9_common_dspr2.h"
21
22 #if HAVE_DSPR2
convolve_bi_horiz_4_transposed_dspr2(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int16_t * filter_x0,int32_t h)23 static void convolve_bi_horiz_4_transposed_dspr2(const uint8_t *src,
24 int32_t src_stride,
25 uint8_t *dst,
26 int32_t dst_stride,
27 const int16_t *filter_x0,
28 int32_t h) {
29 int32_t y;
30 uint8_t *cm = vp9_ff_cropTbl;
31 uint8_t *dst_ptr;
32 int32_t Temp1, Temp2;
33 uint32_t vector4a = 64;
34 uint32_t tp1, tp2;
35 uint32_t p1, p2;
36 const int16_t *filter = &filter_x0[3];
37 uint32_t filter45;
38
39 filter45 = ((const int32_t *)filter)[0];
40
41 for (y = h; y--;) {
42 dst_ptr = dst;
43 /* prefetch data to cache memory */
44 vp9_prefetch_load(src + src_stride);
45 vp9_prefetch_load(src + src_stride + 32);
46
47 __asm__ __volatile__ (
48 "ulw %[tp1], 0(%[src]) \n\t"
49 "ulw %[tp2], 4(%[src]) \n\t"
50
51 /* even 1. pixel */
52 "mtlo %[vector4a], $ac3 \n\t"
53 "mthi $zero, $ac3 \n\t"
54 "preceu.ph.qbr %[p1], %[tp1] \n\t"
55 "preceu.ph.qbl %[p2], %[tp1] \n\t"
56 "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
57 "extp %[Temp1], $ac3, 31 \n\t"
58
59 /* even 2. pixel */
60 "mtlo %[vector4a], $ac2 \n\t"
61 "mthi $zero, $ac2 \n\t"
62 "balign %[tp2], %[tp1], 3 \n\t"
63 "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
64 "extp %[Temp2], $ac2, 31 \n\t"
65
66 /* odd 1. pixel */
67 "lbux %[tp1], %[Temp1](%[cm]) \n\t"
68 "mtlo %[vector4a], $ac3 \n\t"
69 "mthi $zero, $ac3 \n\t"
70 "preceu.ph.qbr %[p1], %[tp2] \n\t"
71 "preceu.ph.qbl %[p2], %[tp2] \n\t"
72 "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
73 "extp %[Temp1], $ac3, 31 \n\t"
74
75 /* odd 2. pixel */
76 "lbux %[tp2], %[Temp2](%[cm]) \n\t"
77 "mtlo %[vector4a], $ac2 \n\t"
78 "mthi $zero, $ac2 \n\t"
79 "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
80 "extp %[Temp2], $ac2, 31 \n\t"
81
82 /* clamp */
83 "lbux %[p1], %[Temp1](%[cm]) \n\t"
84 "lbux %[p2], %[Temp2](%[cm]) \n\t"
85
86 /* store bytes */
87 "sb %[tp1], 0(%[dst_ptr]) \n\t"
88 "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
89
90 "sb %[p1], 0(%[dst_ptr]) \n\t"
91 "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
92
93 "sb %[tp2], 0(%[dst_ptr]) \n\t"
94 "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
95
96 "sb %[p2], 0(%[dst_ptr]) \n\t"
97 "addu %[dst_ptr], %[dst_ptr], %[dst_stride] \n\t"
98
99 : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
100 [p1] "=&r" (p1), [p2] "=&r" (p2),
101 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
102 [dst_ptr] "+r" (dst_ptr)
103 : [filter45] "r" (filter45),[vector4a] "r" (vector4a),
104 [cm] "r" (cm), [src] "r" (src), [dst_stride] "r" (dst_stride)
105 );
106
107 /* Next row... */
108 src += src_stride;
109 dst += 1;
110 }
111 }
112
convolve_bi_horiz_8_transposed_dspr2(const uint8_t * src,int32_t src_stride,uint8_t * dst,int32_t dst_stride,const int16_t * filter_x0,int32_t h)113 static void convolve_bi_horiz_8_transposed_dspr2(const uint8_t *src,
114 int32_t src_stride,
115 uint8_t *dst,
116 int32_t dst_stride,
117 const int16_t *filter_x0,
118 int32_t h) {
119 int32_t y;
120 uint8_t *cm = vp9_ff_cropTbl;
121 uint8_t *dst_ptr;
122 uint32_t vector4a = 64;
123 int32_t Temp1, Temp2, Temp3;
124 uint32_t tp1, tp2, tp3;
125 uint32_t p1, p2, p3, p4;
126 uint8_t *odd_dst;
127 uint32_t dst_pitch_2 = (dst_stride << 1);
128 const int16_t *filter = &filter_x0[3];
129 uint32_t filter45;
130
131 filter45 = ((const int32_t *)filter)[0];
132
133 for (y = h; y--;) {
134 /* prefetch data to cache memory */
135 vp9_prefetch_load(src + src_stride);
136 vp9_prefetch_load(src + src_stride + 32);
137
138 dst_ptr = dst;
139 odd_dst = (dst_ptr + dst_stride);
140
141 __asm__ __volatile__ (
142 "ulw %[tp1], 0(%[src]) \n\t"
143 "ulw %[tp2], 4(%[src]) \n\t"
144
145 /* even 1. pixel */
146 "mtlo %[vector4a], $ac3 \n\t"
147 "mthi $zero, $ac3 \n\t"
148 "mtlo %[vector4a], $ac2 \n\t"
149 "mthi $zero, $ac2 \n\t"
150 "preceu.ph.qbr %[p1], %[tp1] \n\t"
151 "preceu.ph.qbl %[p2], %[tp1] \n\t"
152 "preceu.ph.qbr %[p3], %[tp2] \n\t"
153 "preceu.ph.qbl %[p4], %[tp2] \n\t"
154 "ulw %[tp3], 8(%[src]) \n\t"
155 "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
156 "extp %[Temp1], $ac3, 31 \n\t"
157
158 /* even 2. pixel */
159 "dpa.w.ph $ac2, %[p2], %[filter45] \n\t"
160 "extp %[Temp3], $ac2, 31 \n\t"
161
162 /* even 3. pixel */
163 "lbux %[Temp2], %[Temp1](%[cm]) \n\t"
164 "mtlo %[vector4a], $ac1 \n\t"
165 "mthi $zero, $ac1 \n\t"
166 "balign %[tp3], %[tp2], 3 \n\t"
167 "balign %[tp2], %[tp1], 3 \n\t"
168 "dpa.w.ph $ac1, %[p3], %[filter45] \n\t"
169 "lbux %[tp1], %[Temp3](%[cm]) \n\t"
170 "extp %[p3], $ac1, 31 \n\t"
171
172 /* even 4. pixel */
173 "mtlo %[vector4a], $ac2 \n\t"
174 "mthi $zero, $ac2 \n\t"
175 "mtlo %[vector4a], $ac3 \n\t"
176 "mthi $zero, $ac3 \n\t"
177 "sb %[Temp2], 0(%[dst_ptr]) \n\t"
178 "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
179 "sb %[tp1], 0(%[dst_ptr]) \n\t"
180 "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
181
182 "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
183 "extp %[Temp3], $ac2, 31 \n\t"
184
185 "lbux %[Temp1], %[p3](%[cm]) \n\t"
186
187 /* odd 1. pixel */
188 "mtlo %[vector4a], $ac1 \n\t"
189 "mthi $zero, $ac1 \n\t"
190 "preceu.ph.qbr %[p1], %[tp2] \n\t"
191 "preceu.ph.qbl %[p2], %[tp2] \n\t"
192 "preceu.ph.qbr %[p3], %[tp3] \n\t"
193 "preceu.ph.qbl %[p4], %[tp3] \n\t"
194 "sb %[Temp1], 0(%[dst_ptr]) \n\t"
195 "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
196
197 "dpa.w.ph $ac3, %[p1], %[filter45] \n\t"
198 "extp %[Temp2], $ac3, 31 \n\t"
199
200 /* odd 2. pixel */
201 "lbux %[tp1], %[Temp3](%[cm]) \n\t"
202 "mtlo %[vector4a], $ac3 \n\t"
203 "mthi $zero, $ac3 \n\t"
204 "mtlo %[vector4a], $ac2 \n\t"
205 "mthi $zero, $ac2 \n\t"
206 "dpa.w.ph $ac1, %[p2], %[filter45] \n\t"
207 "sb %[tp1], 0(%[dst_ptr]) \n\t"
208 "addu %[dst_ptr], %[dst_ptr], %[dst_pitch_2] \n\t"
209 "extp %[Temp3], $ac1, 31 \n\t"
210
211 /* odd 3. pixel */
212 "lbux %[tp3], %[Temp2](%[cm]) \n\t"
213 "dpa.w.ph $ac3, %[p3], %[filter45] \n\t"
214 "extp %[Temp2], $ac3, 31 \n\t"
215
216 /* odd 4. pixel */
217 "sb %[tp3], 0(%[odd_dst]) \n\t"
218 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
219 "dpa.w.ph $ac2, %[p4], %[filter45] \n\t"
220 "extp %[Temp1], $ac2, 31 \n\t"
221
222 /* clamp */
223 "lbux %[p4], %[Temp3](%[cm]) \n\t"
224 "lbux %[p2], %[Temp2](%[cm]) \n\t"
225 "lbux %[p1], %[Temp1](%[cm]) \n\t"
226
227 /* store bytes */
228 "sb %[p4], 0(%[odd_dst]) \n\t"
229 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
230
231 "sb %[p2], 0(%[odd_dst]) \n\t"
232 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
233
234 "sb %[p1], 0(%[odd_dst]) \n\t"
235
236 : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2), [tp3] "=&r" (tp3),
237 [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
238 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
239 [dst_ptr] "+r" (dst_ptr), [odd_dst] "+r" (odd_dst)
240 : [filter45] "r" (filter45),[vector4a] "r" (vector4a), [cm] "r" (cm),
241 [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
242 );
243
244 /* Next row... */
245 src += src_stride;
246 dst += 1;
247 }
248 }
249
convolve_bi_horiz_16_transposed_dspr2(const uint8_t * src_ptr,int32_t src_stride,uint8_t * dst_ptr,int32_t dst_stride,const int16_t * filter_x0,int32_t h,int32_t count)250 static void convolve_bi_horiz_16_transposed_dspr2(const uint8_t *src_ptr,
251 int32_t src_stride,
252 uint8_t *dst_ptr,
253 int32_t dst_stride,
254 const int16_t *filter_x0,
255 int32_t h,
256 int32_t count) {
257 int32_t c, y;
258 const uint8_t *src;
259 uint8_t *dst;
260 uint8_t *cm = vp9_ff_cropTbl;
261 uint32_t vector_64 = 64;
262 int32_t Temp1, Temp2, Temp3;
263 uint32_t qload1, qload2;
264 uint32_t p1, p2, p3, p4, p5;
265 uint32_t st1, st2, st3;
266 uint32_t dst_pitch_2 = (dst_stride << 1);
267 uint8_t *odd_dst;
268 const int16_t *filter = &filter_x0[3];
269 uint32_t filter45;
270
271 filter45 = ((const int32_t *)filter)[0];
272
273 for (y = h; y--;) {
274 /* prefetch data to cache memory */
275 vp9_prefetch_load(src_ptr + src_stride);
276 vp9_prefetch_load(src_ptr + src_stride + 32);
277
278 src = src_ptr;
279 dst = dst_ptr;
280
281 odd_dst = (dst + dst_stride);
282
283 for (c = 0; c < count; c++) {
284 __asm__ __volatile__ (
285 "ulw %[qload1], 0(%[src]) \n\t"
286 "ulw %[qload2], 4(%[src]) \n\t"
287
288 /* even 1. pixel */
289 "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
290 "mthi $zero, $ac1 \n\t"
291 "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
292 "mthi $zero, $ac2 \n\t"
293 "preceu.ph.qbr %[p1], %[qload1] \n\t"
294 "preceu.ph.qbl %[p2], %[qload1] \n\t"
295 "preceu.ph.qbr %[p3], %[qload2] \n\t"
296 "preceu.ph.qbl %[p4], %[qload2] \n\t"
297 "ulw %[qload1], 8(%[src]) \n\t"
298 "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */
299 "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
300
301 /* even 2. pixel */
302 "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
303 "mthi $zero, $ac3 \n\t"
304 "preceu.ph.qbr %[p1], %[qload1] \n\t"
305 "preceu.ph.qbl %[p5], %[qload1] \n\t"
306 "ulw %[qload2], 12(%[src]) \n\t"
307 "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */
308 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
309 "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
310
311 /* even 3. pixel */
312 "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
313 "mthi $zero, $ac1 \n\t"
314 "preceu.ph.qbr %[p2], %[qload2] \n\t"
315 "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
316 "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
317 "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */
318 "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
319 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
320
321 /* even 4. pixel */
322 "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
323 "mthi $zero, $ac2 \n\t"
324 "preceu.ph.qbl %[p3], %[qload2] \n\t"
325 "sb %[st2], 0(%[dst]) \n\t" /* even 2 */
326 "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
327 "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */
328 "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
329 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
330
331 /* even 5. pixel */
332 "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
333 "mthi $zero, $ac3 \n\t"
334 "sb %[st3], 0(%[dst]) \n\t" /* even 3 */
335 "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
336 "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */
337 "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
338 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
339
340 /* even 6. pixel */
341 "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
342 "mthi $zero, $ac1 \n\t"
343 "sb %[st1], 0(%[dst]) \n\t" /* even 4 */
344 "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
345 "ulw %[qload1], 20(%[src]) \n\t"
346 "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */
347 "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
348 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
349
350 /* even 7. pixel */
351 "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
352 "mthi $zero, $ac2 \n\t"
353 "preceu.ph.qbr %[p5], %[qload1] \n\t"
354 "sb %[st2], 0(%[dst]) \n\t" /* even 5 */
355 "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
356 "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */
357 "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
358 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
359
360 /* even 8. pixel */
361 "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
362 "mthi $zero, $ac3 \n\t"
363 "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */
364 "sb %[st3], 0(%[dst]) \n\t" /* even 6 */
365 "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
366 "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
367 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
368
369 /* ODD pixels */
370 "ulw %[qload1], 1(%[src]) \n\t"
371 "ulw %[qload2], 5(%[src]) \n\t"
372
373 /* odd 1. pixel */
374 "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
375 "mthi $zero, $ac1 \n\t"
376 "preceu.ph.qbr %[p1], %[qload1] \n\t"
377 "preceu.ph.qbl %[p2], %[qload1] \n\t"
378 "preceu.ph.qbr %[p3], %[qload2] \n\t"
379 "preceu.ph.qbl %[p4], %[qload2] \n\t"
380 "sb %[st1], 0(%[dst]) \n\t" /* even 7 */
381 "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
382 "ulw %[qload2], 9(%[src]) \n\t"
383 "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */
384 "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
385 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
386
387 /* odd 2. pixel */
388 "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
389 "mthi $zero, $ac2 \n\t"
390 "preceu.ph.qbr %[p1], %[qload2] \n\t"
391 "preceu.ph.qbl %[p5], %[qload2] \n\t"
392 "sb %[st2], 0(%[dst]) \n\t" /* even 8 */
393 "ulw %[qload1], 13(%[src]) \n\t"
394 "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */
395 "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
396 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
397
398 /* odd 3. pixel */
399 "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
400 "mthi $zero, $ac3 \n\t"
401 "preceu.ph.qbr %[p2], %[qload1] \n\t"
402 "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 1 */
403 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
404 "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */
405 "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
406 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
407
408 /* odd 4. pixel */
409 "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
410 "mthi $zero, $ac1 \n\t"
411 "preceu.ph.qbl %[p3], %[qload1] \n\t"
412 "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 2 */
413 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
414 "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */
415 "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
416 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
417
418 /* odd 5. pixel */
419 "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
420 "mthi $zero, $ac2 \n\t"
421 "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 3 */
422 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
423 "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */
424 "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
425 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
426
427 /* odd 6. pixel */
428 "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
429 "mthi $zero, $ac3 \n\t"
430 "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 4 */
431 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
432 "ulw %[qload1], 21(%[src]) \n\t"
433 "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */
434 "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
435 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
436
437 /* odd 7. pixel */
438 "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
439 "mthi $zero, $ac1 \n\t"
440 "preceu.ph.qbr %[p5], %[qload1] \n\t"
441 "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 5 */
442 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
443 "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */
444 "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
445
446 /* odd 8. pixel */
447 "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */
448 "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
449
450 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
451 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
452 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
453
454 "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 6 */
455 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
456
457 "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 7 */
458 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
459
460 "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 8 */
461
462 : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5),
463 [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
464 [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
465 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
466 [dst] "+r" (dst), [odd_dst] "+r" (odd_dst)
467 : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
468 [cm] "r" (cm),
469 [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
470 );
471
472 src += 16;
473 dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
474 odd_dst = (dst + dst_stride);
475 }
476
477 /* Next row... */
478 src_ptr += src_stride;
479 dst_ptr += 1;
480 }
481 }
482
convolve_bi_horiz_64_transposed_dspr2(const uint8_t * src_ptr,int32_t src_stride,uint8_t * dst_ptr,int32_t dst_stride,const int16_t * filter_x0,int32_t h)483 static void convolve_bi_horiz_64_transposed_dspr2(const uint8_t *src_ptr,
484 int32_t src_stride,
485 uint8_t *dst_ptr,
486 int32_t dst_stride,
487 const int16_t *filter_x0,
488 int32_t h) {
489 int32_t c, y;
490 const uint8_t *src;
491 uint8_t *dst;
492 uint8_t *cm = vp9_ff_cropTbl;
493 uint32_t vector_64 = 64;
494 int32_t Temp1, Temp2, Temp3;
495 uint32_t qload1, qload2;
496 uint32_t p1, p2, p3, p4, p5;
497 uint32_t st1, st2, st3;
498 uint32_t dst_pitch_2 = (dst_stride << 1);
499 uint8_t *odd_dst;
500 const int16_t *filter = &filter_x0[3];
501 uint32_t filter45;
502
503 filter45 = ((const int32_t *)filter)[0];
504
505 for (y = h; y--;) {
506 /* prefetch data to cache memory */
507 vp9_prefetch_load(src_ptr + src_stride);
508 vp9_prefetch_load(src_ptr + src_stride + 32);
509 vp9_prefetch_load(src_ptr + src_stride + 64);
510
511 src = src_ptr;
512 dst = dst_ptr;
513
514 odd_dst = (dst + dst_stride);
515
516 for (c = 0; c < 4; c++) {
517 __asm__ __volatile__ (
518 "ulw %[qload1], 0(%[src]) \n\t"
519 "ulw %[qload2], 4(%[src]) \n\t"
520
521 /* even 1. pixel */
522 "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
523 "mthi $zero, $ac1 \n\t"
524 "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
525 "mthi $zero, $ac2 \n\t"
526 "preceu.ph.qbr %[p1], %[qload1] \n\t"
527 "preceu.ph.qbl %[p2], %[qload1] \n\t"
528 "preceu.ph.qbr %[p3], %[qload2] \n\t"
529 "preceu.ph.qbl %[p4], %[qload2] \n\t"
530 "ulw %[qload1], 8(%[src]) \n\t"
531 "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* even 1 */
532 "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */
533
534 /* even 2. pixel */
535 "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
536 "mthi $zero, $ac3 \n\t"
537 "preceu.ph.qbr %[p1], %[qload1] \n\t"
538 "preceu.ph.qbl %[p5], %[qload1] \n\t"
539 "ulw %[qload2], 12(%[src]) \n\t"
540 "dpa.w.ph $ac2, %[p2], %[filter45] \n\t" /* even 1 */
541 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */
542 "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
543
544 /* even 3. pixel */
545 "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
546 "mthi $zero, $ac1 \n\t"
547 "preceu.ph.qbr %[p2], %[qload2] \n\t"
548 "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
549 "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
550 "dpa.w.ph $ac3, %[p3], %[filter45] \n\t" /* even 3 */
551 "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
552 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */
553
554 /* even 4. pixel */
555 "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
556 "mthi $zero, $ac2 \n\t"
557 "preceu.ph.qbl %[p3], %[qload2] \n\t"
558 "sb %[st2], 0(%[dst]) \n\t" /* even 2 */
559 "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
560 "dpa.w.ph $ac1, %[p4], %[filter45] \n\t" /* even 4 */
561 "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
562 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */
563
564 /* even 5. pixel */
565 "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
566 "mthi $zero, $ac3 \n\t"
567 "sb %[st3], 0(%[dst]) \n\t" /* even 3 */
568 "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
569 "dpa.w.ph $ac2, %[p1], %[filter45] \n\t" /* even 5 */
570 "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
571 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */
572
573 /* even 6. pixel */
574 "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
575 "mthi $zero, $ac1 \n\t"
576 "sb %[st1], 0(%[dst]) \n\t" /* even 4 */
577 "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
578 "ulw %[qload1], 20(%[src]) \n\t"
579 "dpa.w.ph $ac3, %[p5], %[filter45] \n\t" /* even 6 */
580 "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
581 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */
582
583 /* even 7. pixel */
584 "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
585 "mthi $zero, $ac2 \n\t"
586 "preceu.ph.qbr %[p5], %[qload1] \n\t"
587 "sb %[st2], 0(%[dst]) \n\t" /* even 5 */
588 "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
589 "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* even 7 */
590 "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
591 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */
592
593 /* even 8. pixel */
594 "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
595 "mthi $zero, $ac3 \n\t"
596 "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* even 8 */
597 "sb %[st3], 0(%[dst]) \n\t" /* even 6 */
598 "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
599 "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
600 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */
601
602 /* ODD pixels */
603 "ulw %[qload1], 1(%[src]) \n\t"
604 "ulw %[qload2], 5(%[src]) \n\t"
605
606 /* odd 1. pixel */
607 "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
608 "mthi $zero, $ac1 \n\t"
609 "preceu.ph.qbr %[p1], %[qload1] \n\t"
610 "preceu.ph.qbl %[p2], %[qload1] \n\t"
611 "preceu.ph.qbr %[p3], %[qload2] \n\t"
612 "preceu.ph.qbl %[p4], %[qload2] \n\t"
613 "sb %[st1], 0(%[dst]) \n\t" /* even 7 */
614 "addu %[dst], %[dst], %[dst_pitch_2] \n\t"
615 "ulw %[qload2], 9(%[src]) \n\t"
616 "dpa.w.ph $ac3, %[p1], %[filter45] \n\t" /* odd 1 */
617 "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
618 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */
619
620 /* odd 2. pixel */
621 "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
622 "mthi $zero, $ac2 \n\t"
623 "preceu.ph.qbr %[p1], %[qload2] \n\t"
624 "preceu.ph.qbl %[p5], %[qload2] \n\t"
625 "sb %[st2], 0(%[dst]) \n\t" /* even 8 */
626 "ulw %[qload1], 13(%[src]) \n\t"
627 "dpa.w.ph $ac1, %[p2], %[filter45] \n\t" /* odd 2 */
628 "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
629 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */
630
631 /* odd 3. pixel */
632 "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
633 "mthi $zero, $ac3 \n\t"
634 "preceu.ph.qbr %[p2], %[qload1] \n\t"
635 "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 1 */
636 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
637 "dpa.w.ph $ac2, %[p3], %[filter45] \n\t" /* odd 3 */
638 "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
639 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */
640
641 /* odd 4. pixel */
642 "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
643 "mthi $zero, $ac1 \n\t"
644 "preceu.ph.qbl %[p3], %[qload1] \n\t"
645 "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 2 */
646 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
647 "dpa.w.ph $ac3, %[p4], %[filter45] \n\t" /* odd 4 */
648 "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
649 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */
650
651 /* odd 5. pixel */
652 "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
653 "mthi $zero, $ac2 \n\t"
654 "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 3 */
655 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
656 "dpa.w.ph $ac1, %[p1], %[filter45] \n\t" /* odd 5 */
657 "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
658 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */
659
660 /* odd 6. pixel */
661 "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
662 "mthi $zero, $ac3 \n\t"
663 "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 4 */
664 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
665 "ulw %[qload1], 21(%[src]) \n\t"
666 "dpa.w.ph $ac2, %[p5], %[filter45] \n\t" /* odd 6 */
667 "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
668 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */
669
670 /* odd 7. pixel */
671 "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
672 "mthi $zero, $ac1 \n\t"
673 "preceu.ph.qbr %[p5], %[qload1] \n\t"
674 "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 5 */
675 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
676 "dpa.w.ph $ac3, %[p2], %[filter45] \n\t" /* odd 7 */
677 "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */
678
679 /* odd 8. pixel */
680 "dpa.w.ph $ac1, %[p3], %[filter45] \n\t" /* odd 8 */
681 "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */
682
683 "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
684 "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
685 "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */
686
687 "sb %[st2], 0(%[odd_dst]) \n\t" /* odd 6 */
688 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
689
690 "sb %[st3], 0(%[odd_dst]) \n\t" /* odd 7 */
691 "addu %[odd_dst], %[odd_dst], %[dst_pitch_2] \n\t"
692
693 "sb %[st1], 0(%[odd_dst]) \n\t" /* odd 8 */
694
695 : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2), [p5] "=&r" (p5),
696 [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
697 [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
698 [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3),
699 [dst] "+r" (dst), [odd_dst] "+r" (odd_dst)
700 : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
701 [cm] "r" (cm),
702 [src] "r" (src), [dst_pitch_2] "r" (dst_pitch_2)
703 );
704
705 src += 16;
706 dst = (dst_ptr + ((c + 1) * 16 * dst_stride));
707 odd_dst = (dst + dst_stride);
708 }
709
710 /* Next row... */
711 src_ptr += src_stride;
712 dst_ptr += 1;
713 }
714 }
715
convolve_bi_horiz_transposed(const uint8_t * src,ptrdiff_t src_stride,uint8_t * dst,ptrdiff_t dst_stride,const int16_t * filter,int w,int h)716 void convolve_bi_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride,
717 uint8_t *dst, ptrdiff_t dst_stride,
718 const int16_t *filter, int w, int h) {
719 int x, y;
720
721 for (y = 0; y < h; ++y) {
722 for (x = 0; x < w; ++x) {
723 int sum = 0;
724
725 sum += src[x] * filter[3];
726 sum += src[x + 1] * filter[4];
727
728 dst[x * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
729 }
730
731 src += src_stride;
732 dst += 1;
733 }
734 }
735
/*
 * 2-tap horizontal convolution with transposed output.  Programs the DSP
 * control register for the extp instructions used by the kernels, then
 * dispatches on the block width: 4, 8, 16/32 and 64 have dedicated DSPr2
 * kernels, every other width goes through the plain-C implementation.
 *
 * src, src_stride: source pixels and the step between source rows.
 * dst, dst_stride: destination and the step between the consecutive
 *                  outputs of one source row.
 * filter:          filter kernel; the kernels read taps [3] and [4] only.
 * w, h:            width and height of the source block in pixels.
 */
void vp9_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                         uint8_t *dst, ptrdiff_t dst_stride,
                         const int16_t *filter,
                         int w, int h) {
  uint32_t pos = 38;

  /* bit position for extract from acc */
  __asm__ __volatile__ (
    "wrdsp      %[pos],     1           \n\t"
    :
    : [pos] "r" (pos)
  );

  /* prefetch data to cache memory */
  vp9_prefetch_load(src);
  vp9_prefetch_load(src + 32);

  switch (w) {
    case 4:
      convolve_bi_horiz_4_transposed_dspr2(src, src_stride,
                                           dst, dst_stride,
                                           filter, h);
      break;
    case 8:
      convolve_bi_horiz_8_transposed_dspr2(src, src_stride,
                                           dst, dst_stride,
                                           filter, h);
      break;
    case 16:
    case 32:
      /* one or two 16-pixel groups per row */
      convolve_bi_horiz_16_transposed_dspr2(src, src_stride,
                                            dst, dst_stride,
                                            filter, h,
                                            (w / 16));
      break;
    case 64:
      /* src and src + 32 were prefetched above; a 64-wide row also reads
       * src[64], so fetch the following chunk instead of repeating
       * src + 32. */
      vp9_prefetch_load(src + 64);
      convolve_bi_horiz_64_transposed_dspr2(src, src_stride,
                                            dst, dst_stride,
                                            filter, h);
      break;
    default:
      convolve_bi_horiz_transposed(src, src_stride,
                                   dst, dst_stride,
                                   filter, w, h);
      break;
  }
}
784 #endif
785