/*
 *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <stdio.h>

#include "./vpx_config.h"
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_common.h"
#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"
#include "vp9/common/vp9_convolve.h"
#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"

#if HAVE_DSPR2
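/*
 * This file implements the horizontal 2-tap (bilinear) convolve-and-average
 * path for VP9 using MIPS DSPr2 inline assembly.  As orientation, the
 * plain-C sketch below shows the per-pixel arithmetic the assembly blocks
 * are believed to implement; it is excluded from the build, and
 * convolve_bi_avg_horiz_ref is an illustrative name, not a library symbol.
 */
#if 0
static void convolve_bi_avg_horiz_ref(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int16_t *filter_x, int32_t w,
                                      int32_t h) {
  int32_t x, y;
  for (y = 0; y < h; ++y) {
    for (x = 0; x < w; ++x) {
      /* 2-tap filter: the taps live at filter_x[3] and filter_x[4]; the
         accumulator is rounded with +64 and shifted by 7 (FILTER_BITS). */
      const int32_t sum = src[x] * filter_x[3] + src[x + 1] * filter_x[4];
      const int32_t px = clip_pixel(ROUND_POWER_OF_TWO(sum, 7));
      /* Round-to-nearest average with the pixel already in dst; this is
         what addqh_r.w does in the assembly below. */
      dst[x] = (uint8_t)ROUND_POWER_OF_TWO(dst[x] + px, 1);
    }
    src += src_stride;
    dst += dst_stride;
  }
}
#endif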
static void convolve_bi_avg_horiz_4_dspr2(const uint8_t *src,
                                          int32_t src_stride,
                                          uint8_t *dst,
                                          int32_t dst_stride,
                                          const int16_t *filter_x0,
                                          int32_t h) {
  int32_t y;
  uint8_t *cm = vp9_ff_cropTbl;
  int32_t Temp1, Temp2, Temp3, Temp4;
  uint32_t vector4a = 64;
  uint32_t tp1, tp2;
  uint32_t p1, p2, p3;
  uint32_t tn1, tn2;
  const int16_t *filter = &filter_x0[3];
  uint32_t filter45;

  filter45 = ((const int32_t *)filter)[0];
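  /* filter45 packs the two 16-bit taps filter_x0[3] and filter_x0[4] into
     one 32-bit word, so each dpa.w.ph below multiply-accumulates both taps
     against a pixel pair in a single instruction. */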

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    vp9_prefetch_load(src + src_stride);
    vp9_prefetch_load(src + src_stride + 32);
    vp9_prefetch_store(dst + dst_stride);

    __asm__ __volatile__ (
        "ulw              %[tp1],      0(%[src])                  \n\t"
        "ulw              %[tp2],      4(%[src])                  \n\t"

        /* even 1. pixel */
        "mtlo             %[vector4a], $ac3                       \n\t"
        "mthi             $zero,       $ac3                       \n\t"
        "preceu.ph.qbr    %[p1],       %[tp1]                     \n\t"
        "preceu.ph.qbl    %[p2],       %[tp1]                     \n\t"
        "dpa.w.ph         $ac3,        %[p1],      %[filter45]    \n\t"
        "extp             %[Temp1],    $ac3,       31             \n\t"

        /* even 2. pixel */
        "mtlo             %[vector4a], $ac2                       \n\t"
        "mthi             $zero,       $ac2                       \n\t"
        "balign           %[tp2],      %[tp1],     3              \n\t"
        "dpa.w.ph         $ac2,        %[p2],      %[filter45]    \n\t"
        "extp             %[Temp3],    $ac2,       31             \n\t"

        "lbu              %[p2],       3(%[dst])                  \n\t"  /* load odd 2 */

        /* odd 1. pixel */
        "lbux             %[tp1],      %[Temp1](%[cm])            \n\t"  /* even 1 */
        "mtlo             %[vector4a], $ac3                       \n\t"
        "mthi             $zero,       $ac3                       \n\t"
        "lbu              %[Temp1],    1(%[dst])                  \n\t"  /* load odd 1 */
        "preceu.ph.qbr    %[p1],       %[tp2]                     \n\t"
        "preceu.ph.qbl    %[p3],       %[tp2]                     \n\t"
        "dpa.w.ph         $ac3,        %[p1],      %[filter45]    \n\t"
        "extp             %[Temp2],    $ac3,       31             \n\t"

        "lbu              %[tn2],      0(%[dst])                  \n\t"  /* load even 1 */

        /* odd 2. pixel */
        "lbux             %[tp2],      %[Temp3](%[cm])            \n\t"  /* even 2 */
        "mtlo             %[vector4a], $ac2                       \n\t"
        "mthi             $zero,       $ac2                       \n\t"
        "lbux             %[tn1],      %[Temp2](%[cm])            \n\t"  /* odd 1 */
        "addqh_r.w        %[tn2],      %[tn2],     %[tp1]         \n\t"  /* average even 1 */
        "dpa.w.ph         $ac2,        %[p3],      %[filter45]    \n\t"
        "extp             %[Temp4],    $ac2,       31             \n\t"

        "lbu              %[tp1],      2(%[dst])                  \n\t"  /* load even 2 */
        "sb               %[tn2],      0(%[dst])                  \n\t"  /* store even 1 */

        /* clamp */
        "addqh_r.w        %[Temp1],    %[Temp1],   %[tn1]         \n\t"  /* average odd 1 */
        "lbux             %[p3],       %[Temp4](%[cm])            \n\t"  /* odd 2 */
        "sb               %[Temp1],    1(%[dst])                  \n\t"  /* store odd 1 */

        "addqh_r.w        %[tp1],      %[tp1],     %[tp2]         \n\t"  /* average even 2 */
        "sb               %[tp1],      2(%[dst])                  \n\t"  /* store even 2 */

        "addqh_r.w        %[p2],       %[p2],      %[p3]          \n\t"  /* average odd 2 */
        "sb               %[p2],       3(%[dst])                  \n\t"  /* store odd 2 */

        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
          [tn1] "=&r" (tn1), [tn2] "=&r" (tn2),
          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3),
          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
          [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
        : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
          [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
    );

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}

static void convolve_bi_avg_horiz_8_dspr2(const uint8_t *src,
                                          int32_t src_stride,
                                          uint8_t *dst,
                                          int32_t dst_stride,
                                          const int16_t *filter_x0,
                                          int32_t h) {
  int32_t y;
  uint8_t *cm = vp9_ff_cropTbl;
  uint32_t vector4a = 64;
  int32_t Temp1, Temp2, Temp3;
  uint32_t tp1, tp2, tp3, tp4;
  uint32_t p1, p2, p3, p4, n1;
  uint32_t st0, st1;
  const int16_t *filter = &filter_x0[3];
  uint32_t filter45;

  filter45 = ((const int32_t *)filter)[0];

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    vp9_prefetch_load(src + src_stride);
    vp9_prefetch_load(src + src_stride + 32);
    vp9_prefetch_store(dst + dst_stride);

    __asm__ __volatile__ (
        "ulw              %[tp1],      0(%[src])                  \n\t"
        "ulw              %[tp2],      4(%[src])                  \n\t"

        /* even 1. pixel */
        "mtlo             %[vector4a], $ac3                       \n\t"
        "mthi             $zero,       $ac3                       \n\t"
        "mtlo             %[vector4a], $ac2                       \n\t"
        "mthi             $zero,       $ac2                       \n\t"
        "preceu.ph.qbr    %[p1],       %[tp1]                     \n\t"
        "preceu.ph.qbl    %[p2],       %[tp1]                     \n\t"
        "preceu.ph.qbr    %[p3],       %[tp2]                     \n\t"
        "preceu.ph.qbl    %[p4],       %[tp2]                     \n\t"
        "ulw              %[tp3],      8(%[src])                  \n\t"
        "dpa.w.ph         $ac3,        %[p1],      %[filter45]    \n\t"
        "extp             %[Temp1],    $ac3,       31             \n\t"
        "lbu              %[Temp2],    0(%[dst])                  \n\t"
        "lbu              %[tp4],      2(%[dst])                  \n\t"

        /* even 2. pixel */
        "dpa.w.ph         $ac2,        %[p2],      %[filter45]    \n\t"
        "extp             %[Temp3],    $ac2,       31             \n\t"

        /* even 3. pixel */
        "lbux             %[st0],      %[Temp1](%[cm])            \n\t"
        "mtlo             %[vector4a], $ac1                       \n\t"
        "mthi             $zero,       $ac1                       \n\t"
        "lbux             %[st1],      %[Temp3](%[cm])            \n\t"
        "dpa.w.ph         $ac1,        %[p3],      %[filter45]    \n\t"
        "extp             %[Temp1],    $ac1,       31             \n\t"

        "addqh_r.w        %[Temp2],    %[Temp2],   %[st0]         \n\t"
        "addqh_r.w        %[tp4],      %[tp4],     %[st1]         \n\t"
        "sb               %[Temp2],    0(%[dst])                  \n\t"
        "sb               %[tp4],      2(%[dst])                  \n\t"

        /* even 4. pixel */
        "mtlo             %[vector4a], $ac2                       \n\t"
        "mthi             $zero,       $ac2                       \n\t"
        "mtlo             %[vector4a], $ac3                       \n\t"
        "mthi             $zero,       $ac3                       \n\t"

        "balign           %[tp3],      %[tp2],     3              \n\t"
        "balign           %[tp2],      %[tp1],     3              \n\t"

        "lbux             %[st0],      %[Temp1](%[cm])            \n\t"
        "lbu              %[Temp2],    4(%[dst])                  \n\t"
        "addqh_r.w        %[Temp2],    %[Temp2],   %[st0]         \n\t"

        "dpa.w.ph         $ac2,        %[p4],      %[filter45]    \n\t"
        "extp             %[Temp3],    $ac2,       31             \n\t"

        /* odd 1. pixel */
        "mtlo             %[vector4a], $ac1                       \n\t"
        "mthi             $zero,       $ac1                       \n\t"
        "sb               %[Temp2],    4(%[dst])                  \n\t"
        "preceu.ph.qbr    %[p1],       %[tp2]                     \n\t"
        "preceu.ph.qbl    %[p2],       %[tp2]                     \n\t"
        "preceu.ph.qbr    %[p3],       %[tp3]                     \n\t"
        "preceu.ph.qbl    %[p4],       %[tp3]                     \n\t"
        "dpa.w.ph         $ac3,        %[p1],      %[filter45]    \n\t"
        "extp             %[Temp2],    $ac3,       31             \n\t"

        "lbu              %[tp1],      6(%[dst])                  \n\t"

        /* odd 2. pixel */
        "mtlo             %[vector4a], $ac3                       \n\t"
        "mthi             $zero,       $ac3                       \n\t"
        "mtlo             %[vector4a], $ac2                       \n\t"
        "mthi             $zero,       $ac2                       \n\t"
        "lbux             %[st0],      %[Temp3](%[cm])            \n\t"
        "dpa.w.ph         $ac1,        %[p2],      %[filter45]    \n\t"
        "extp             %[Temp3],    $ac1,       31             \n\t"

        "lbu              %[tp2],      1(%[dst])                  \n\t"
        "lbu              %[tp3],      3(%[dst])                  \n\t"
        "addqh_r.w        %[tp1],      %[tp1],     %[st0]         \n\t"

        /* odd 3. pixel */
        "lbux             %[st1],      %[Temp2](%[cm])            \n\t"
        "dpa.w.ph         $ac3,        %[p3],      %[filter45]    \n\t"
        "addqh_r.w        %[tp2],      %[tp2],     %[st1]         \n\t"
        "extp             %[Temp2],    $ac3,       31             \n\t"

        "lbu              %[tp4],      5(%[dst])                  \n\t"

        /* odd 4. pixel */
        "sb               %[tp2],      1(%[dst])                  \n\t"
        "sb               %[tp1],      6(%[dst])                  \n\t"
        "dpa.w.ph         $ac2,        %[p4],      %[filter45]    \n\t"
        "extp             %[Temp1],    $ac2,       31             \n\t"

        "lbu              %[tp1],      7(%[dst])                  \n\t"

        /* clamp */
        "lbux             %[p4],       %[Temp3](%[cm])            \n\t"
        "addqh_r.w        %[tp3],      %[tp3],     %[p4]          \n\t"

        "lbux             %[p2],       %[Temp2](%[cm])            \n\t"
        "addqh_r.w        %[tp4],      %[tp4],     %[p2]          \n\t"

        "lbux             %[p1],       %[Temp1](%[cm])            \n\t"
        "addqh_r.w        %[tp1],      %[tp1],     %[p1]          \n\t"

        /* store bytes */
        "sb               %[tp3],      3(%[dst])                  \n\t"
        "sb               %[tp4],      5(%[dst])                  \n\t"
        "sb               %[tp1],      7(%[dst])                  \n\t"

        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
          [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
          [st0] "=&r" (st0), [st1] "=&r" (st1),
          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
          [n1] "=&r" (n1),
          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
        : [filter45] "r" (filter45), [vector4a] "r" (vector4a),
          [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
    );

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}

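/* The 16-wide variant interleaves eight even and eight odd output pixels
   per 16-pixel chunk, rotating accumulators $ac1..$ac3 so that each extp
   result is consumed while the next dpa.w.ph is already in flight; `count`
   selects how many 16-pixel chunks make up one row (1 for w == 16, 2 for
   w == 32). */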
static void convolve_bi_avg_horiz_16_dspr2(const uint8_t *src_ptr,
                                           int32_t src_stride,
                                           uint8_t *dst_ptr,
                                           int32_t dst_stride,
                                           const int16_t *filter_x0,
                                           int32_t h,
                                           int32_t count) {
  int32_t y, c;
  const uint8_t *src;
  uint8_t *dst;
  uint8_t *cm = vp9_ff_cropTbl;
  uint32_t vector_64 = 64;
  int32_t Temp1, Temp2, Temp3;
  uint32_t qload1, qload2, qload3;
  uint32_t p1, p2, p3, p4, p5;
  uint32_t st1, st2, st3;
  const int16_t *filter = &filter_x0[3];
  uint32_t filter45;

  filter45 = ((const int32_t *)filter)[0];

  for (y = h; y--;) {
    src = src_ptr;
    dst = dst_ptr;

    /* prefetch data to cache memory */
    vp9_prefetch_load(src_ptr + src_stride);
    vp9_prefetch_load(src_ptr + src_stride + 32);
    vp9_prefetch_store(dst_ptr + dst_stride);

    for (c = 0; c < count; c++) {
      __asm__ __volatile__ (
          "ulw              %[qload1],    0(%[src])                  \n\t"
          "ulw              %[qload2],    4(%[src])                  \n\t"

          /* even 1. pixel */
          "mtlo             %[vector_64], $ac1                       \n\t" /* even 1 */
          "mthi             $zero,        $ac1                       \n\t"
          "mtlo             %[vector_64], $ac2                       \n\t" /* even 2 */
          "mthi             $zero,        $ac2                       \n\t"
          "preceu.ph.qbr    %[p1],        %[qload1]                  \n\t"
          "preceu.ph.qbl    %[p2],        %[qload1]                  \n\t"
          "preceu.ph.qbr    %[p3],        %[qload2]                  \n\t"
          "preceu.ph.qbl    %[p4],        %[qload2]                  \n\t"
          "ulw              %[qload3],    8(%[src])                  \n\t"
          "dpa.w.ph         $ac1,         %[p1],      %[filter45]    \n\t" /* even 1 */
          "extp             %[Temp1],     $ac1,       31             \n\t" /* even 1 */
          "lbu              %[st2],       0(%[dst])                  \n\t" /* load even 1 from dst */

          /* even 2. pixel */
          "mtlo             %[vector_64], $ac3                       \n\t" /* even 3 */
          "mthi             $zero,        $ac3                       \n\t"
          "preceu.ph.qbr    %[p1],        %[qload3]                  \n\t"
          "preceu.ph.qbl    %[p5],        %[qload3]                  \n\t"
          "ulw              %[qload1],    12(%[src])                 \n\t"
          "dpa.w.ph         $ac2,         %[p2],      %[filter45]    \n\t" /* even 2 */
          "extp             %[Temp2],     $ac2,       31             \n\t" /* even 2 */
          "lbux             %[st1],       %[Temp1](%[cm])            \n\t" /* even 1 */

          "lbu              %[qload3],    2(%[dst])                  \n\t" /* load even 2 from dst */

          /* even 3. pixel */
          "mtlo             %[vector_64], $ac1                       \n\t" /* even 4 */
          "mthi             $zero,        $ac1                       \n\t"
          "addqh_r.w        %[st2],       %[st2],     %[st1]         \n\t" /* average even 1 */
          "preceu.ph.qbr    %[p2],        %[qload1]                  \n\t"
          "sb               %[st2],       0(%[dst])                  \n\t" /* store even 1 to dst */
          "dpa.w.ph         $ac3,         %[p3],      %[filter45]    \n\t" /* even 3 */
          "extp             %[Temp3],     $ac3,       31             \n\t" /* even 3 */
          "lbux             %[st2],       %[Temp2](%[cm])            \n\t" /* even 2 */

          /* even 4. pixel */
          "mtlo             %[vector_64], $ac2                       \n\t" /* even 5 */
          "mthi             $zero,        $ac2                       \n\t"
          "addqh_r.w        %[qload3],    %[qload3],  %[st2]         \n\t" /* average even 2 */
          "preceu.ph.qbl    %[p3],        %[qload1]                  \n\t"
          "sb               %[qload3],    2(%[dst])                  \n\t" /* store even 2 to dst */
          "lbu              %[qload3],    4(%[dst])                  \n\t" /* load even 3 from dst */
          "lbu              %[qload1],    6(%[dst])                  \n\t" /* load even 4 from dst */
          "dpa.w.ph         $ac1,         %[p4],      %[filter45]    \n\t" /* even 4 */
          "extp             %[Temp1],     $ac1,       31             \n\t" /* even 4 */
          "lbux             %[st3],       %[Temp3](%[cm])            \n\t" /* even 3 */

          /* even 5. pixel */
          "mtlo             %[vector_64], $ac3                       \n\t" /* even 6 */
          "mthi             $zero,        $ac3                       \n\t"
          "addqh_r.w        %[qload3],    %[qload3],  %[st3]         \n\t" /* average even 3 */
          "sb               %[qload3],    4(%[dst])                  \n\t" /* store even 3 to dst */
          "dpa.w.ph         $ac2,         %[p1],      %[filter45]    \n\t" /* even 5 */
          "extp             %[Temp2],     $ac2,       31             \n\t" /* even 5 */
          "lbux             %[st1],       %[Temp1](%[cm])            \n\t" /* even 4 */

          /* even 6. pixel */
          "mtlo             %[vector_64], $ac1                       \n\t" /* even 7 */
          "mthi             $zero,        $ac1                       \n\t"
          "addqh_r.w        %[qload1],    %[qload1],  %[st1]         \n\t" /* average even 4 */
          "sb               %[qload1],    6(%[dst])                  \n\t" /* store even 4 to dst */
          "dpa.w.ph         $ac3,         %[p5],      %[filter45]    \n\t" /* even 6 */
          "lbu              %[qload2],    8(%[dst])                  \n\t" /* load even 5 from dst */
          "extp             %[Temp3],     $ac3,       31             \n\t" /* even 6 */
          "lbux             %[st2],       %[Temp2](%[cm])            \n\t" /* even 5 */

          /* even 7. pixel */
          "mtlo             %[vector_64], $ac2                       \n\t" /* even 8 */
          "mthi             $zero,        $ac2                       \n\t"
          "addqh_r.w        %[qload2],    %[qload2],  %[st2]         \n\t" /* average even 5 */
          "sb               %[qload2],    8(%[dst])                  \n\t" /* store even 5 to dst */
          "dpa.w.ph         $ac1,         %[p2],      %[filter45]    \n\t" /* even 7 */
          "lbu              %[qload3],    10(%[dst])                 \n\t" /* load even 6 from dst */
          "extp             %[Temp1],     $ac1,       31             \n\t" /* even 7 */
          "lbux             %[st3],       %[Temp3](%[cm])            \n\t" /* even 6 */

          "lbu              %[st2],       12(%[dst])                 \n\t" /* load even 7 from dst */

          /* even 8. pixel */
          "mtlo             %[vector_64], $ac3                       \n\t" /* odd 1 */
          "mthi             $zero,        $ac3                       \n\t"
          "addqh_r.w        %[qload3],    %[qload3],  %[st3]         \n\t" /* average even 6 */
          "dpa.w.ph         $ac2,         %[p3],      %[filter45]    \n\t" /* even 8 */
          "sb               %[qload3],    10(%[dst])                 \n\t" /* store even 6 to dst */
          "extp             %[Temp2],     $ac2,       31             \n\t" /* even 8 */
          "lbux             %[st1],       %[Temp1](%[cm])            \n\t" /* even 7 */

          /* ODD pixels */
          "ulw              %[qload1],    1(%[src])                  \n\t"
          "ulw              %[qload2],    5(%[src])                  \n\t"

          "addqh_r.w        %[st2],       %[st2],     %[st1]         \n\t" /* average even 7 */

          /* odd 1. pixel */
          "mtlo             %[vector_64], $ac1                       \n\t" /* odd 2 */
          "mthi             $zero,        $ac1                       \n\t"
          "preceu.ph.qbr    %[p1],        %[qload1]                  \n\t"
          "preceu.ph.qbl    %[p2],        %[qload1]                  \n\t"
          "preceu.ph.qbr    %[p3],        %[qload2]                  \n\t"
          "preceu.ph.qbl    %[p4],        %[qload2]                  \n\t"
          "sb               %[st2],       12(%[dst])                 \n\t" /* store even 7 to dst */
          "ulw              %[qload3],    9(%[src])                  \n\t"
          "dpa.w.ph         $ac3,         %[p1],      %[filter45]    \n\t" /* odd 1 */
          "lbu              %[qload2],    14(%[dst])                 \n\t" /* load even 8 from dst */
          "extp             %[Temp3],     $ac3,       31             \n\t" /* odd 1 */
          "lbux             %[st2],       %[Temp2](%[cm])            \n\t" /* even 8 */

          "lbu              %[st1],       1(%[dst])                  \n\t" /* load odd 1 from dst */

          /* odd 2. pixel */
          "mtlo             %[vector_64], $ac2                       \n\t" /* odd 3 */
          "mthi             $zero,        $ac2                       \n\t"
          "addqh_r.w        %[qload2],    %[qload2],  %[st2]         \n\t" /* average even 8 */
          "preceu.ph.qbr    %[p1],        %[qload3]                  \n\t"
          "preceu.ph.qbl    %[p5],        %[qload3]                  \n\t"
          "sb               %[qload2],    14(%[dst])                 \n\t" /* store even 8 to dst */
          "ulw              %[qload1],    13(%[src])                 \n\t"
          "dpa.w.ph         $ac1,         %[p2],      %[filter45]    \n\t" /* odd 2 */
          "lbu              %[qload3],    3(%[dst])                  \n\t" /* load odd 2 from dst */
          "extp             %[Temp1],     $ac1,       31             \n\t" /* odd 2 */
          "lbux             %[st3],       %[Temp3](%[cm])            \n\t" /* odd 1 */

          /* odd 3. pixel */
          "mtlo             %[vector_64], $ac3                       \n\t" /* odd 4 */
          "mthi             $zero,        $ac3                       \n\t"
          "addqh_r.w        %[st3],       %[st3],     %[st1]         \n\t" /* average odd 1 */
          "preceu.ph.qbr    %[p2],        %[qload1]                  \n\t"
          "dpa.w.ph         $ac2,         %[p3],      %[filter45]    \n\t" /* odd 3 */
          "sb               %[st3],       1(%[dst])                  \n\t" /* store odd 1 to dst */
          "extp             %[Temp2],     $ac2,       31             \n\t" /* odd 3 */
          "lbux             %[st1],       %[Temp1](%[cm])            \n\t" /* odd 2 */

          /* odd 4. pixel */
          "mtlo             %[vector_64], $ac1                       \n\t" /* odd 5 */
          "mthi             $zero,        $ac1                       \n\t"
          "addqh_r.w        %[qload3],    %[qload3],  %[st1]         \n\t" /* average odd 2 */
          "preceu.ph.qbl    %[p3],        %[qload1]                  \n\t"
          "sb               %[qload3],    3(%[dst])                  \n\t" /* store odd 2 to dst */
          "lbu              %[qload1],    5(%[dst])                  \n\t" /* load odd 3 from dst */
          "dpa.w.ph         $ac3,         %[p4],      %[filter45]    \n\t" /* odd 4 */
          "extp             %[Temp3],     $ac3,       31             \n\t" /* odd 4 */
          "lbux             %[st2],       %[Temp2](%[cm])            \n\t" /* odd 3 */

          "lbu              %[st1],       7(%[dst])                  \n\t" /* load odd 4 from dst */

          /* odd 5. pixel */
          "mtlo             %[vector_64], $ac2                       \n\t" /* odd 6 */
          "mthi             $zero,        $ac2                       \n\t"
          "addqh_r.w        %[qload1],    %[qload1],  %[st2]         \n\t" /* average odd 3 */
          "sb               %[qload1],    5(%[dst])                  \n\t" /* store odd 3 to dst */
          "dpa.w.ph         $ac1,         %[p1],      %[filter45]    \n\t" /* odd 5 */
          "extp             %[Temp1],     $ac1,       31             \n\t" /* odd 5 */
          "lbux             %[st3],       %[Temp3](%[cm])            \n\t" /* odd 4 */

          "lbu              %[qload1],    9(%[dst])                  \n\t" /* load odd 5 from dst */

          /* odd 6. pixel */
          "mtlo             %[vector_64], $ac3                       \n\t" /* odd 7 */
          "mthi             $zero,        $ac3                       \n\t"
          "addqh_r.w        %[st1],       %[st1],     %[st3]         \n\t" /* average odd 4 */
          "sb               %[st1],       7(%[dst])                  \n\t" /* store odd 4 to dst */
          "dpa.w.ph         $ac2,         %[p5],      %[filter45]    \n\t" /* odd 6 */
          "extp             %[Temp2],     $ac2,       31             \n\t" /* odd 6 */
          "lbux             %[st1],       %[Temp1](%[cm])            \n\t" /* odd 5 */

          /* odd 7. pixel */
          "mtlo             %[vector_64], $ac1                       \n\t" /* odd 8 */
          "mthi             $zero,        $ac1                       \n\t"
          "addqh_r.w        %[qload1],    %[qload1],  %[st1]         \n\t" /* average odd 5 */
          "sb               %[qload1],    9(%[dst])                  \n\t" /* store odd 5 to dst */
          "lbu              %[qload2],    11(%[dst])                 \n\t" /* load odd 6 from dst */
          "dpa.w.ph         $ac3,         %[p2],      %[filter45]    \n\t" /* odd 7 */
          "extp             %[Temp3],     $ac3,       31             \n\t" /* odd 7 */

          "lbu              %[qload3],    13(%[dst])                 \n\t" /* load odd 7 from dst */

          /* odd 8. pixel */
          "dpa.w.ph         $ac1,         %[p3],      %[filter45]    \n\t" /* odd 8 */
          "extp             %[Temp1],     $ac1,       31             \n\t" /* odd 8 */

          "lbu              %[qload1],    15(%[dst])                 \n\t" /* load odd 8 from dst */

          "lbux             %[st2],       %[Temp2](%[cm])            \n\t" /* odd 6 */
          "addqh_r.w        %[qload2],    %[qload2],  %[st2]         \n\t" /* average odd 6 */

          "lbux             %[st3],       %[Temp3](%[cm])            \n\t" /* odd 7 */
          "addqh_r.w        %[qload3],    %[qload3],  %[st3]         \n\t" /* average odd 7 */

          "lbux             %[st1],       %[Temp1](%[cm])            \n\t" /* odd 8 */
          "addqh_r.w        %[qload1],    %[qload1],  %[st1]         \n\t" /* average odd 8 */

          "sb               %[qload2],    11(%[dst])                 \n\t" /* store odd 6 to dst */
          "sb               %[qload3],    13(%[dst])                 \n\t" /* store odd 7 to dst */
          "sb               %[qload1],    15(%[dst])                 \n\t" /* store odd 8 to dst */

          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2),
            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
            [qload3] "=&r" (qload3), [p5] "=&r" (p5),
            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
          : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
            [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
      );

      src += 16;
      dst += 16;
    }

    /* Next row... */
    src_ptr += src_stride;
    dst_ptr += dst_stride;
  }
}

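/* The 64-wide variant runs the same inner loop as the 16-wide version for a
   fixed four 16-pixel chunks per row, with extra prefetches to cover the
   wider source and destination rows. */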
static void convolve_bi_avg_horiz_64_dspr2(const uint8_t *src_ptr,
                                           int32_t src_stride,
                                           uint8_t *dst_ptr,
                                           int32_t dst_stride,
                                           const int16_t *filter_x0,
                                           int32_t h) {
  int32_t y, c;
  const uint8_t *src;
  uint8_t *dst;
  uint8_t *cm = vp9_ff_cropTbl;
  uint32_t vector_64 = 64;
  int32_t Temp1, Temp2, Temp3;
  uint32_t qload1, qload2, qload3;
  uint32_t p1, p2, p3, p4, p5;
  uint32_t st1, st2, st3;
  const int16_t *filter = &filter_x0[3];
  uint32_t filter45;

  filter45 = ((const int32_t *)filter)[0];

  for (y = h; y--;) {
    src = src_ptr;
    dst = dst_ptr;

    /* prefetch data to cache memory */
    vp9_prefetch_load(src_ptr + src_stride);
    vp9_prefetch_load(src_ptr + src_stride + 32);
    vp9_prefetch_load(src_ptr + src_stride + 64);
    vp9_prefetch_store(dst_ptr + dst_stride);
    vp9_prefetch_store(dst_ptr + dst_stride + 32);

    for (c = 0; c < 4; c++) {
      __asm__ __volatile__ (
          "ulw              %[qload1],    0(%[src])                  \n\t"
          "ulw              %[qload2],    4(%[src])                  \n\t"

          /* even 1. pixel */
          "mtlo             %[vector_64], $ac1                       \n\t" /* even 1 */
          "mthi             $zero,        $ac1                       \n\t"
          "mtlo             %[vector_64], $ac2                       \n\t" /* even 2 */
          "mthi             $zero,        $ac2                       \n\t"
          "preceu.ph.qbr    %[p1],        %[qload1]                  \n\t"
          "preceu.ph.qbl    %[p2],        %[qload1]                  \n\t"
          "preceu.ph.qbr    %[p3],        %[qload2]                  \n\t"
          "preceu.ph.qbl    %[p4],        %[qload2]                  \n\t"
          "ulw              %[qload3],    8(%[src])                  \n\t"
          "dpa.w.ph         $ac1,         %[p1],      %[filter45]    \n\t" /* even 1 */
          "extp             %[Temp1],     $ac1,       31             \n\t" /* even 1 */
          "lbu              %[st2],       0(%[dst])                  \n\t" /* load even 1 from dst */

          /* even 2. pixel */
          "mtlo             %[vector_64], $ac3                       \n\t" /* even 3 */
          "mthi             $zero,        $ac3                       \n\t"
          "preceu.ph.qbr    %[p1],        %[qload3]                  \n\t"
          "preceu.ph.qbl    %[p5],        %[qload3]                  \n\t"
          "ulw              %[qload1],    12(%[src])                 \n\t"
          "dpa.w.ph         $ac2,         %[p2],      %[filter45]    \n\t" /* even 2 */
          "extp             %[Temp2],     $ac2,       31             \n\t" /* even 2 */
          "lbux             %[st1],       %[Temp1](%[cm])            \n\t" /* even 1 */

          "lbu              %[qload3],    2(%[dst])                  \n\t" /* load even 2 from dst */

          /* even 3. pixel */
          "mtlo             %[vector_64], $ac1                       \n\t" /* even 4 */
          "mthi             $zero,        $ac1                       \n\t"
          "addqh_r.w        %[st2],       %[st2],     %[st1]         \n\t" /* average even 1 */
          "preceu.ph.qbr    %[p2],        %[qload1]                  \n\t"
          "sb               %[st2],       0(%[dst])                  \n\t" /* store even 1 to dst */
          "dpa.w.ph         $ac3,         %[p3],      %[filter45]    \n\t" /* even 3 */
          "extp             %[Temp3],     $ac3,       31             \n\t" /* even 3 */
          "lbux             %[st2],       %[Temp2](%[cm])            \n\t" /* even 2 */

          /* even 4. pixel */
          "mtlo             %[vector_64], $ac2                       \n\t" /* even 5 */
          "mthi             $zero,        $ac2                       \n\t"
          "addqh_r.w        %[qload3],    %[qload3],  %[st2]         \n\t" /* average even 2 */
          "preceu.ph.qbl    %[p3],        %[qload1]                  \n\t"
          "sb               %[qload3],    2(%[dst])                  \n\t" /* store even 2 to dst */
          "lbu              %[qload3],    4(%[dst])                  \n\t" /* load even 3 from dst */
          "lbu              %[qload1],    6(%[dst])                  \n\t" /* load even 4 from dst */
          "dpa.w.ph         $ac1,         %[p4],      %[filter45]    \n\t" /* even 4 */
          "extp             %[Temp1],     $ac1,       31             \n\t" /* even 4 */
          "lbux             %[st3],       %[Temp3](%[cm])            \n\t" /* even 3 */

          /* even 5. pixel */
          "mtlo             %[vector_64], $ac3                       \n\t" /* even 6 */
          "mthi             $zero,        $ac3                       \n\t"
          "addqh_r.w        %[qload3],    %[qload3],  %[st3]         \n\t" /* average even 3 */
          "sb               %[qload3],    4(%[dst])                  \n\t" /* store even 3 to dst */
          "dpa.w.ph         $ac2,         %[p1],      %[filter45]    \n\t" /* even 5 */
          "extp             %[Temp2],     $ac2,       31             \n\t" /* even 5 */
          "lbux             %[st1],       %[Temp1](%[cm])            \n\t" /* even 4 */

          /* even 6. pixel */
          "mtlo             %[vector_64], $ac1                       \n\t" /* even 7 */
          "mthi             $zero,        $ac1                       \n\t"
          "addqh_r.w        %[qload1],    %[qload1],  %[st1]         \n\t" /* average even 4 */
          "sb               %[qload1],    6(%[dst])                  \n\t" /* store even 4 to dst */
          "dpa.w.ph         $ac3,         %[p5],      %[filter45]    \n\t" /* even 6 */
          "lbu              %[qload2],    8(%[dst])                  \n\t" /* load even 5 from dst */
          "extp             %[Temp3],     $ac3,       31             \n\t" /* even 6 */
          "lbux             %[st2],       %[Temp2](%[cm])            \n\t" /* even 5 */

          /* even 7. pixel */
          "mtlo             %[vector_64], $ac2                       \n\t" /* even 8 */
          "mthi             $zero,        $ac2                       \n\t"
          "addqh_r.w        %[qload2],    %[qload2],  %[st2]         \n\t" /* average even 5 */
          "sb               %[qload2],    8(%[dst])                  \n\t" /* store even 5 to dst */
          "dpa.w.ph         $ac1,         %[p2],      %[filter45]    \n\t" /* even 7 */
          "lbu              %[qload3],    10(%[dst])                 \n\t" /* load even 6 from dst */
          "extp             %[Temp1],     $ac1,       31             \n\t" /* even 7 */
          "lbux             %[st3],       %[Temp3](%[cm])            \n\t" /* even 6 */

          "lbu              %[st2],       12(%[dst])                 \n\t" /* load even 7 from dst */

          /* even 8. pixel */
          "mtlo             %[vector_64], $ac3                       \n\t" /* odd 1 */
          "mthi             $zero,        $ac3                       \n\t"
          "addqh_r.w        %[qload3],    %[qload3],  %[st3]         \n\t" /* average even 6 */
          "dpa.w.ph         $ac2,         %[p3],      %[filter45]    \n\t" /* even 8 */
          "sb               %[qload3],    10(%[dst])                 \n\t" /* store even 6 to dst */
          "extp             %[Temp2],     $ac2,       31             \n\t" /* even 8 */
          "lbux             %[st1],       %[Temp1](%[cm])            \n\t" /* even 7 */

          /* ODD pixels */
          "ulw              %[qload1],    1(%[src])                  \n\t"
          "ulw              %[qload2],    5(%[src])                  \n\t"

          "addqh_r.w        %[st2],       %[st2],     %[st1]         \n\t" /* average even 7 */

          /* odd 1. pixel */
          "mtlo             %[vector_64], $ac1                       \n\t" /* odd 2 */
          "mthi             $zero,        $ac1                       \n\t"
          "preceu.ph.qbr    %[p1],        %[qload1]                  \n\t"
          "preceu.ph.qbl    %[p2],        %[qload1]                  \n\t"
          "preceu.ph.qbr    %[p3],        %[qload2]                  \n\t"
          "preceu.ph.qbl    %[p4],        %[qload2]                  \n\t"
          "sb               %[st2],       12(%[dst])                 \n\t" /* store even 7 to dst */
          "ulw              %[qload3],    9(%[src])                  \n\t"
          "dpa.w.ph         $ac3,         %[p1],      %[filter45]    \n\t" /* odd 1 */
          "lbu              %[qload2],    14(%[dst])                 \n\t" /* load even 8 from dst */
          "extp             %[Temp3],     $ac3,       31             \n\t" /* odd 1 */
          "lbux             %[st2],       %[Temp2](%[cm])            \n\t" /* even 8 */

          "lbu              %[st1],       1(%[dst])                  \n\t" /* load odd 1 from dst */

          /* odd 2. pixel */
          "mtlo             %[vector_64], $ac2                       \n\t" /* odd 3 */
          "mthi             $zero,        $ac2                       \n\t"
          "addqh_r.w        %[qload2],    %[qload2],  %[st2]         \n\t" /* average even 8 */
          "preceu.ph.qbr    %[p1],        %[qload3]                  \n\t"
          "preceu.ph.qbl    %[p5],        %[qload3]                  \n\t"
          "sb               %[qload2],    14(%[dst])                 \n\t" /* store even 8 to dst */
          "ulw              %[qload1],    13(%[src])                 \n\t"
          "dpa.w.ph         $ac1,         %[p2],      %[filter45]    \n\t" /* odd 2 */
          "lbu              %[qload3],    3(%[dst])                  \n\t" /* load odd 2 from dst */
          "extp             %[Temp1],     $ac1,       31             \n\t" /* odd 2 */
          "lbux             %[st3],       %[Temp3](%[cm])            \n\t" /* odd 1 */

          /* odd 3. pixel */
          "mtlo             %[vector_64], $ac3                       \n\t" /* odd 4 */
          "mthi             $zero,        $ac3                       \n\t"
          "addqh_r.w        %[st3],       %[st3],     %[st1]         \n\t" /* average odd 1 */
          "preceu.ph.qbr    %[p2],        %[qload1]                  \n\t"
          "dpa.w.ph         $ac2,         %[p3],      %[filter45]    \n\t" /* odd 3 */
          "sb               %[st3],       1(%[dst])                  \n\t" /* store odd 1 to dst */
          "extp             %[Temp2],     $ac2,       31             \n\t" /* odd 3 */
          "lbux             %[st1],       %[Temp1](%[cm])            \n\t" /* odd 2 */

          /* odd 4. pixel */
          "mtlo             %[vector_64], $ac1                       \n\t" /* odd 5 */
          "mthi             $zero,        $ac1                       \n\t"
          "addqh_r.w        %[qload3],    %[qload3],  %[st1]         \n\t" /* average odd 2 */
          "preceu.ph.qbl    %[p3],        %[qload1]                  \n\t"
          "sb               %[qload3],    3(%[dst])                  \n\t" /* store odd 2 to dst */
          "lbu              %[qload1],    5(%[dst])                  \n\t" /* load odd 3 from dst */
          "dpa.w.ph         $ac3,         %[p4],      %[filter45]    \n\t" /* odd 4 */
          "extp             %[Temp3],     $ac3,       31             \n\t" /* odd 4 */
          "lbux             %[st2],       %[Temp2](%[cm])            \n\t" /* odd 3 */

          "lbu              %[st1],       7(%[dst])                  \n\t" /* load odd 4 from dst */

          /* odd 5. pixel */
          "mtlo             %[vector_64], $ac2                       \n\t" /* odd 6 */
          "mthi             $zero,        $ac2                       \n\t"
          "addqh_r.w        %[qload1],    %[qload1],  %[st2]         \n\t" /* average odd 3 */
          "sb               %[qload1],    5(%[dst])                  \n\t" /* store odd 3 to dst */
          "dpa.w.ph         $ac1,         %[p1],      %[filter45]    \n\t" /* odd 5 */
          "extp             %[Temp1],     $ac1,       31             \n\t" /* odd 5 */
          "lbux             %[st3],       %[Temp3](%[cm])            \n\t" /* odd 4 */

          "lbu              %[qload1],    9(%[dst])                  \n\t" /* load odd 5 from dst */

          /* odd 6. pixel */
          "mtlo             %[vector_64], $ac3                       \n\t" /* odd 7 */
          "mthi             $zero,        $ac3                       \n\t"
          "addqh_r.w        %[st1],       %[st1],     %[st3]         \n\t" /* average odd 4 */
          "sb               %[st1],       7(%[dst])                  \n\t" /* store odd 4 to dst */
          "dpa.w.ph         $ac2,         %[p5],      %[filter45]    \n\t" /* odd 6 */
          "extp             %[Temp2],     $ac2,       31             \n\t" /* odd 6 */
          "lbux             %[st1],       %[Temp1](%[cm])            \n\t" /* odd 5 */

          /* odd 7. pixel */
          "mtlo             %[vector_64], $ac1                       \n\t" /* odd 8 */
          "mthi             $zero,        $ac1                       \n\t"
          "addqh_r.w        %[qload1],    %[qload1],  %[st1]         \n\t" /* average odd 5 */
          "sb               %[qload1],    9(%[dst])                  \n\t" /* store odd 5 to dst */
          "lbu              %[qload2],    11(%[dst])                 \n\t" /* load odd 6 from dst */
          "dpa.w.ph         $ac3,         %[p2],      %[filter45]    \n\t" /* odd 7 */
          "extp             %[Temp3],     $ac3,       31             \n\t" /* odd 7 */

          "lbu              %[qload3],    13(%[dst])                 \n\t" /* load odd 7 from dst */

          /* odd 8. pixel */
          "dpa.w.ph         $ac1,         %[p3],      %[filter45]    \n\t" /* odd 8 */
          "extp             %[Temp1],     $ac1,       31             \n\t" /* odd 8 */

          "lbu              %[qload1],    15(%[dst])                 \n\t" /* load odd 8 from dst */

          "lbux             %[st2],       %[Temp2](%[cm])            \n\t" /* odd 6 */
          "addqh_r.w        %[qload2],    %[qload2],  %[st2]         \n\t" /* average odd 6 */

          "lbux             %[st3],       %[Temp3](%[cm])            \n\t" /* odd 7 */
          "addqh_r.w        %[qload3],    %[qload3],  %[st3]         \n\t" /* average odd 7 */

          "lbux             %[st1],       %[Temp1](%[cm])            \n\t" /* odd 8 */
          "addqh_r.w        %[qload1],    %[qload1],  %[st1]         \n\t" /* average odd 8 */

          "sb               %[qload2],    11(%[dst])                 \n\t" /* store odd 6 to dst */
          "sb               %[qload3],    13(%[dst])                 \n\t" /* store odd 7 to dst */
          "sb               %[qload1],    15(%[dst])                 \n\t" /* store odd 8 to dst */

          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2),
            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
            [qload3] "=&r" (qload3), [p5] "=&r" (p5),
            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
          : [filter45] "r" (filter45), [vector_64] "r" (vector_64),
            [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
      );

      src += 16;
      dst += 16;
    }

    /* Next row... */
    src_ptr += src_stride;
    dst_ptr += dst_stride;
  }
}

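/* Dispatch on output width.  The DSPr2 fast paths only cover the unscaled
   case (x_step_q4 == 16) and the block widths below; everything else falls
   back to the generic C implementation. */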
void vp9_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const int16_t *filter_x, int x_step_q4,
                                   const int16_t *filter_y, int y_step_q4,
                                   int w, int h) {
  if (16 == x_step_q4) {
    uint32_t pos = 38;

    /* bit position for extract from acc */
    __asm__ __volatile__ (
      "wrdsp      %[pos],     1           \n\t"
      :
      : [pos] "r" (pos)
    );

    /* prefetch data to cache memory */
    vp9_prefetch_load(src);
    vp9_prefetch_load(src + 32);
    vp9_prefetch_store(dst);

    switch (w) {
      case 4:
        convolve_bi_avg_horiz_4_dspr2(src, src_stride,
                                      dst, dst_stride,
                                      filter_x, h);
        break;
      case 8:
        convolve_bi_avg_horiz_8_dspr2(src, src_stride,
                                      dst, dst_stride,
                                      filter_x, h);
        break;
      case 16:
        convolve_bi_avg_horiz_16_dspr2(src, src_stride,
                                       dst, dst_stride,
                                       filter_x, h, 1);
        break;
      case 32:
        convolve_bi_avg_horiz_16_dspr2(src, src_stride,
                                       dst, dst_stride,
                                       filter_x, h, 2);
        break;
      case 64:
        vp9_prefetch_load(src + 64);
        vp9_prefetch_store(dst + 32);

        convolve_bi_avg_horiz_64_dspr2(src, src_stride,
                                       dst, dst_stride,
                                       filter_x, h);
        break;
      default:
        vp9_convolve8_avg_horiz_c(src, src_stride,
                                  dst, dst_stride,
                                  filter_x, x_step_q4,
                                  filter_y, y_step_q4,
                                  w, h);
        break;
    }
  } else {
    vp9_convolve8_avg_horiz_c(src, src_stride,
                              dst, dst_stride,
                              filter_x, x_step_q4,
                              filter_y, y_step_q4,
                              w, h);
  }
}
#endif