/*
 * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <stdio.h>

#include "./vpx_config.h"
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_common.h"
#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"
#include "vp9/common/vp9_convolve.h"
#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"

#if HAVE_DSPR2
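/* Horizontal 8-tap filter plus rounded average for a 4-pixel-wide block:
 * each of the h rows filters four outputs with the taps in filter_x0,
 * clamps them through the crop table and averages them (with rounding)
 * into the pixels already present in dst. */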
static void convolve_avg_horiz_4_dspr2(const uint8_t *src,
                                       int32_t src_stride,
                                       uint8_t *dst,
                                       int32_t dst_stride,
                                       const int16_t *filter_x0,
                                       int32_t h) {
  int32_t y;
  uint8_t *cm = vp9_ff_cropTbl;
  int32_t vector1b, vector2b, vector3b, vector4b;
  int32_t Temp1, Temp2, Temp3, Temp4;
  uint32_t vector4a = 64;
  uint32_t tp1, tp2;
  uint32_t p1, p2, p3, p4;
  uint32_t n1, n2, n3, n4;
  uint32_t tn1, tn2;

  vector1b = ((const int32_t *)filter_x0)[0];
  vector2b = ((const int32_t *)filter_x0)[1];
  vector3b = ((const int32_t *)filter_x0)[2];
  vector4b = ((const int32_t *)filter_x0)[3];

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    vp9_prefetch_load(src + src_stride);
    vp9_prefetch_load(src + src_stride + 32);
    vp9_prefetch_store(dst + dst_stride);
    __asm__ __volatile__ (
        "ulw             %[tp1], 0(%[src])  \n\t"
        "ulw             %[tp2], 4(%[src])  \n\t"

        /* even 1. pixel */
        "mtlo            %[vector4a], $ac3  \n\t"
        "mthi            $zero, $ac3  \n\t"
        "preceu.ph.qbr   %[p1], %[tp1]  \n\t"
        "preceu.ph.qbl   %[p2], %[tp1]  \n\t"
        "preceu.ph.qbr   %[p3], %[tp2]  \n\t"
        "preceu.ph.qbl   %[p4], %[tp2]  \n\t"
        "dpa.w.ph        $ac3, %[p1], %[vector1b]  \n\t"
        "dpa.w.ph        $ac3, %[p2], %[vector2b]  \n\t"
        "dpa.w.ph        $ac3, %[p3], %[vector3b]  \n\t"
        "ulw             %[tn2], 8(%[src])  \n\t"
        "dpa.w.ph        $ac3, %[p4], %[vector4b]  \n\t"
        "extp            %[Temp1], $ac3, 31  \n\t"

        /* even 2. pixel */
        "mtlo            %[vector4a], $ac2  \n\t"
        "mthi            $zero, $ac2  \n\t"
        "preceu.ph.qbr   %[p1], %[tn2]  \n\t"
        "balign          %[tn1], %[tn2], 3  \n\t"
        "balign          %[tn2], %[tp2], 3  \n\t"
        "balign          %[tp2], %[tp1], 3  \n\t"
        "dpa.w.ph        $ac2, %[p2], %[vector1b]  \n\t"
        "dpa.w.ph        $ac2, %[p3], %[vector2b]  \n\t"
        "dpa.w.ph        $ac2, %[p4], %[vector3b]  \n\t"
        "dpa.w.ph        $ac2, %[p1], %[vector4b]  \n\t"
        "extp            %[Temp3], $ac2, 31  \n\t"

        "lbu             %[p2], 3(%[dst])  \n\t" /* load odd 2 */

        /* odd 1. pixel */
        "lbux            %[tp1], %[Temp1](%[cm])  \n\t" /* even 1 */
        "mtlo            %[vector4a], $ac3  \n\t"
        "mthi            $zero, $ac3  \n\t"
        "lbu             %[Temp1], 1(%[dst])  \n\t" /* load odd 1 */
        "preceu.ph.qbr   %[n1], %[tp2]  \n\t"
        "preceu.ph.qbl   %[n2], %[tp2]  \n\t"
        "preceu.ph.qbr   %[n3], %[tn2]  \n\t"
        "preceu.ph.qbl   %[n4], %[tn2]  \n\t"
        "dpa.w.ph        $ac3, %[n1], %[vector1b]  \n\t"
        "dpa.w.ph        $ac3, %[n2], %[vector2b]  \n\t"
        "dpa.w.ph        $ac3, %[n3], %[vector3b]  \n\t"
        "dpa.w.ph        $ac3, %[n4], %[vector4b]  \n\t"
        "extp            %[Temp2], $ac3, 31  \n\t"

        "lbu             %[tn2], 0(%[dst])  \n\t" /* load even 1 */

        /* odd 2. pixel */
        "lbux            %[tp2], %[Temp3](%[cm])  \n\t" /* even 2 */
        "mtlo            %[vector4a], $ac2  \n\t"
        "mthi            $zero, $ac2  \n\t"
        "preceu.ph.qbr   %[n1], %[tn1]  \n\t"
        "lbux            %[tn1], %[Temp2](%[cm])  \n\t" /* odd 1 */
        "addqh_r.w       %[tn2], %[tn2], %[tp1]  \n\t" /* average even 1 */
        "dpa.w.ph        $ac2, %[n2], %[vector1b]  \n\t"
        "dpa.w.ph        $ac2, %[n3], %[vector2b]  \n\t"
        "dpa.w.ph        $ac2, %[n4], %[vector3b]  \n\t"
        "dpa.w.ph        $ac2, %[n1], %[vector4b]  \n\t"
        "extp            %[Temp4], $ac2, 31  \n\t"

        "lbu             %[tp1], 2(%[dst])  \n\t" /* load even 2 */
        "sb              %[tn2], 0(%[dst])  \n\t" /* store even 1 */

        /* clamp */
        "addqh_r.w       %[Temp1], %[Temp1], %[tn1]  \n\t" /* average odd 1 */
        "lbux            %[n2], %[Temp4](%[cm])  \n\t" /* odd 2 */
        "sb              %[Temp1], 1(%[dst])  \n\t" /* store odd 1 */

        "addqh_r.w       %[tp1], %[tp1], %[tp2]  \n\t" /* average even 2 */
        "sb              %[tp1], 2(%[dst])  \n\t" /* store even 2 */

        "addqh_r.w       %[p2], %[p2], %[n2]  \n\t" /* average odd 2 */
        "sb              %[p2], 3(%[dst])  \n\t" /* store odd 2 */

        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
          [tn1] "=&r" (tn1), [tn2] "=&r" (tn2),
          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
          [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3), [n4] "=&r" (n4),
          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
          [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
        : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
          [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
          [vector4a] "r" (vector4a),
          [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
    );

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}

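/* 8-pixel-wide variant of the kernel above; like the 4-pixel version,
 * the odd-phase input words are built from the even-phase loads with
 * balign rather than separate unaligned loads. */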
static void convolve_avg_horiz_8_dspr2(const uint8_t *src,
                                       int32_t src_stride,
                                       uint8_t *dst,
                                       int32_t dst_stride,
                                       const int16_t *filter_x0,
                                       int32_t h) {
  int32_t y;
  uint8_t *cm = vp9_ff_cropTbl;
  uint32_t vector4a = 64;
  int32_t vector1b, vector2b, vector3b, vector4b;
  int32_t Temp1, Temp2, Temp3;
  uint32_t tp1, tp2;
  uint32_t p1, p2, p3, p4, n1;
  uint32_t tn1, tn2, tn3;
  uint32_t st0, st1;

  vector1b = ((const int32_t *)filter_x0)[0];
  vector2b = ((const int32_t *)filter_x0)[1];
  vector3b = ((const int32_t *)filter_x0)[2];
  vector4b = ((const int32_t *)filter_x0)[3];

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    vp9_prefetch_load(src + src_stride);
    vp9_prefetch_load(src + src_stride + 32);
    vp9_prefetch_store(dst + dst_stride);

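    /* Three accumulators ($ac1..$ac3) are kept busy so that filtering,
     * clamping and the rounded average with dst overlap across the
     * eight output pixels. */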
    __asm__ __volatile__ (
        "ulw             %[tp1], 0(%[src])  \n\t"
        "ulw             %[tp2], 4(%[src])  \n\t"

        /* even 1. pixel */
        "mtlo            %[vector4a], $ac3  \n\t"
        "mthi            $zero, $ac3  \n\t"
        "mtlo            %[vector4a], $ac2  \n\t"
        "mthi            $zero, $ac2  \n\t"
        "preceu.ph.qbr   %[p1], %[tp1]  \n\t"
        "preceu.ph.qbl   %[p2], %[tp1]  \n\t"
        "preceu.ph.qbr   %[p3], %[tp2]  \n\t"
        "preceu.ph.qbl   %[p4], %[tp2]  \n\t"
        "ulw             %[tn2], 8(%[src])  \n\t"
        "dpa.w.ph        $ac3, %[p1], %[vector1b]  \n\t"
        "dpa.w.ph        $ac3, %[p2], %[vector2b]  \n\t"
        "dpa.w.ph        $ac3, %[p3], %[vector3b]  \n\t"
        "dpa.w.ph        $ac3, %[p4], %[vector4b]  \n\t"
        "extp            %[Temp1], $ac3, 31  \n\t"
        "lbu             %[Temp2], 0(%[dst])  \n\t"
        "lbu             %[tn3], 2(%[dst])  \n\t"

        /* even 2. pixel */
        "preceu.ph.qbr   %[p1], %[tn2]  \n\t"
        "preceu.ph.qbl   %[n1], %[tn2]  \n\t"
        "ulw             %[tn1], 12(%[src])  \n\t"
        "dpa.w.ph        $ac2, %[p2], %[vector1b]  \n\t"
        "dpa.w.ph        $ac2, %[p3], %[vector2b]  \n\t"
        "dpa.w.ph        $ac2, %[p4], %[vector3b]  \n\t"
        "dpa.w.ph        $ac2, %[p1], %[vector4b]  \n\t"
        "extp            %[Temp3], $ac2, 31  \n\t"

        /* even 3. pixel */
        "lbux            %[st0], %[Temp1](%[cm])  \n\t"
        "mtlo            %[vector4a], $ac1  \n\t"
        "mthi            $zero, $ac1  \n\t"
        "preceu.ph.qbr   %[p2], %[tn1]  \n\t"
        "lbux            %[st1], %[Temp3](%[cm])  \n\t"
        "dpa.w.ph        $ac1, %[p3], %[vector1b]  \n\t"
        "dpa.w.ph        $ac1, %[p4], %[vector2b]  \n\t"
        "dpa.w.ph        $ac1, %[p1], %[vector3b]  \n\t"
        "dpa.w.ph        $ac1, %[n1], %[vector4b]  \n\t"
        "extp            %[Temp1], $ac1, 31  \n\t"

        "addqh_r.w       %[Temp2], %[Temp2], %[st0]  \n\t"
        "addqh_r.w       %[tn3], %[tn3], %[st1]  \n\t"
        "sb              %[Temp2], 0(%[dst])  \n\t"
        "sb              %[tn3], 2(%[dst])  \n\t"

        /* even 4. pixel */
        "mtlo            %[vector4a], $ac2  \n\t"
        "mthi            $zero, $ac2  \n\t"
        "mtlo            %[vector4a], $ac3  \n\t"
        "mthi            $zero, $ac3  \n\t"

        "balign          %[tn3], %[tn1], 3  \n\t"
        "balign          %[tn1], %[tn2], 3  \n\t"
        "balign          %[tn2], %[tp2], 3  \n\t"
        "balign          %[tp2], %[tp1], 3  \n\t"

        "lbux            %[st0], %[Temp1](%[cm])  \n\t"
        "lbu             %[Temp2], 4(%[dst])  \n\t"
        "addqh_r.w       %[Temp2], %[Temp2], %[st0]  \n\t"

        "dpa.w.ph        $ac2, %[p4], %[vector1b]  \n\t"
        "dpa.w.ph        $ac2, %[p1], %[vector2b]  \n\t"
        "dpa.w.ph        $ac2, %[n1], %[vector3b]  \n\t"
        "dpa.w.ph        $ac2, %[p2], %[vector4b]  \n\t"
        "extp            %[Temp3], $ac2, 31  \n\t"

        /* odd 1. pixel */
        "mtlo            %[vector4a], $ac1  \n\t"
        "mthi            $zero, $ac1  \n\t"
        "sb              %[Temp2], 4(%[dst])  \n\t"
        "preceu.ph.qbr   %[p1], %[tp2]  \n\t"
        "preceu.ph.qbl   %[p2], %[tp2]  \n\t"
        "preceu.ph.qbr   %[p3], %[tn2]  \n\t"
        "preceu.ph.qbl   %[p4], %[tn2]  \n\t"
        "dpa.w.ph        $ac3, %[p1], %[vector1b]  \n\t"
        "dpa.w.ph        $ac3, %[p2], %[vector2b]  \n\t"
        "dpa.w.ph        $ac3, %[p3], %[vector3b]  \n\t"
        "dpa.w.ph        $ac3, %[p4], %[vector4b]  \n\t"
        "extp            %[Temp2], $ac3, 31  \n\t"

        "lbu             %[tp1], 6(%[dst])  \n\t"

        /* odd 2. pixel */
        "mtlo            %[vector4a], $ac3  \n\t"
        "mthi            $zero, $ac3  \n\t"
        "mtlo            %[vector4a], $ac2  \n\t"
        "mthi            $zero, $ac2  \n\t"
        "preceu.ph.qbr   %[p1], %[tn1]  \n\t"
        "preceu.ph.qbl   %[n1], %[tn1]  \n\t"
        "lbux            %[st0], %[Temp3](%[cm])  \n\t"
        "dpa.w.ph        $ac1, %[p2], %[vector1b]  \n\t"
        "dpa.w.ph        $ac1, %[p3], %[vector2b]  \n\t"
        "dpa.w.ph        $ac1, %[p4], %[vector3b]  \n\t"
        "dpa.w.ph        $ac1, %[p1], %[vector4b]  \n\t"
        "extp            %[Temp3], $ac1, 31  \n\t"

        "lbu             %[tp2], 1(%[dst])  \n\t"
        "lbu             %[tn2], 3(%[dst])  \n\t"
        "addqh_r.w       %[tp1], %[tp1], %[st0]  \n\t"

        /* odd 3. pixel */
        "lbux            %[st1], %[Temp2](%[cm])  \n\t"
        "preceu.ph.qbr   %[p2], %[tn3]  \n\t"
        "dpa.w.ph        $ac3, %[p3], %[vector1b]  \n\t"
        "dpa.w.ph        $ac3, %[p4], %[vector2b]  \n\t"
        "dpa.w.ph        $ac3, %[p1], %[vector3b]  \n\t"
        "dpa.w.ph        $ac3, %[n1], %[vector4b]  \n\t"
        "addqh_r.w       %[tp2], %[tp2], %[st1]  \n\t"
        "extp            %[Temp2], $ac3, 31  \n\t"

        "lbu             %[tn3], 5(%[dst])  \n\t"

        /* odd 4. pixel */
        "sb              %[tp2], 1(%[dst])  \n\t"
        "sb              %[tp1], 6(%[dst])  \n\t"
        "dpa.w.ph        $ac2, %[p4], %[vector1b]  \n\t"
        "dpa.w.ph        $ac2, %[p1], %[vector2b]  \n\t"
        "dpa.w.ph        $ac2, %[n1], %[vector3b]  \n\t"
        "dpa.w.ph        $ac2, %[p2], %[vector4b]  \n\t"
        "extp            %[Temp1], $ac2, 31  \n\t"

        "lbu             %[tn1], 7(%[dst])  \n\t"

        /* clamp */
        "lbux            %[p4], %[Temp3](%[cm])  \n\t"
        "addqh_r.w       %[tn2], %[tn2], %[p4]  \n\t"

        "lbux            %[p2], %[Temp2](%[cm])  \n\t"
        "addqh_r.w       %[tn3], %[tn3], %[p2]  \n\t"

        "lbux            %[n1], %[Temp1](%[cm])  \n\t"
        "addqh_r.w       %[tn1], %[tn1], %[n1]  \n\t"

        /* store bytes */
        "sb              %[tn2], 3(%[dst])  \n\t"
        "sb              %[tn3], 5(%[dst])  \n\t"
        "sb              %[tn1], 7(%[dst])  \n\t"

        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
          [tn1] "=&r" (tn1), [tn2] "=&r" (tn2), [tn3] "=&r" (tn3),
          [st0] "=&r" (st0), [st1] "=&r" (st1),
          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
          [n1] "=&r" (n1),
          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
        : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
          [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
          [vector4a] "r" (vector4a),
          [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
    );

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}

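/* Filter-and-average kernel for 16-pixel-wide tiles; count gives the
 * number of 16-pixel tiles per row (1 for w == 16, 2 for w == 32).
 * Even and odd output phases are computed in two passes per tile. */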
static void convolve_avg_horiz_16_dspr2(const uint8_t *src_ptr,
                                        int32_t src_stride,
                                        uint8_t *dst_ptr,
                                        int32_t dst_stride,
                                        const int16_t *filter_x0,
                                        int32_t h,
                                        int32_t count) {
  int32_t y, c;
  const uint8_t *src;
  uint8_t *dst;
  uint8_t *cm = vp9_ff_cropTbl;
  uint32_t vector_64 = 64;
  int32_t filter12, filter34, filter56, filter78;
  int32_t Temp1, Temp2, Temp3;
  uint32_t qload1, qload2, qload3;
  uint32_t p1, p2, p3, p4, p5;
  uint32_t st1, st2, st3;

  filter12 = ((const int32_t *)filter_x0)[0];
  filter34 = ((const int32_t *)filter_x0)[1];
  filter56 = ((const int32_t *)filter_x0)[2];
  filter78 = ((const int32_t *)filter_x0)[3];

  for (y = h; y--;) {
    src = src_ptr;
    dst = dst_ptr;

    /* prefetch data to cache memory */
    vp9_prefetch_load(src_ptr + src_stride);
    vp9_prefetch_load(src_ptr + src_stride + 32);
    vp9_prefetch_store(dst_ptr + dst_stride);

    for (c = 0; c < count; c++) {
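      /* Eight even outputs are filtered first (unaligned loads from
       * offset 0), then eight odd outputs (loads from offset 1); each
       * result is clamped via the crop table and averaged into dst
       * with addqh_r.w before being stored. */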
      __asm__ __volatile__ (
          "ulw             %[qload1], 0(%[src])  \n\t"
          "ulw             %[qload2], 4(%[src])  \n\t"

          /* even 1. pixel */
          "mtlo            %[vector_64], $ac1  \n\t" /* even 1 */
          "mthi            $zero, $ac1  \n\t"
          "mtlo            %[vector_64], $ac2  \n\t" /* even 2 */
          "mthi            $zero, $ac2  \n\t"
          "preceu.ph.qbr   %[p1], %[qload1]  \n\t"
          "preceu.ph.qbl   %[p2], %[qload1]  \n\t"
          "preceu.ph.qbr   %[p3], %[qload2]  \n\t"
          "preceu.ph.qbl   %[p4], %[qload2]  \n\t"
          "ulw             %[qload3], 8(%[src])  \n\t"
          "dpa.w.ph        $ac1, %[p1], %[filter12]  \n\t" /* even 1 */
          "dpa.w.ph        $ac1, %[p2], %[filter34]  \n\t" /* even 1 */
          "dpa.w.ph        $ac1, %[p3], %[filter56]  \n\t" /* even 1 */
          "dpa.w.ph        $ac1, %[p4], %[filter78]  \n\t" /* even 1 */
          "extp            %[Temp1], $ac1, 31  \n\t" /* even 1 */
          "lbu             %[st2], 0(%[dst])  \n\t" /* load even 1 from dst */

          /* even 2. pixel */
          "mtlo            %[vector_64], $ac3  \n\t" /* even 3 */
          "mthi            $zero, $ac3  \n\t"
          "preceu.ph.qbr   %[p1], %[qload3]  \n\t"
          "preceu.ph.qbl   %[p5], %[qload3]  \n\t"
          "ulw             %[qload1], 12(%[src])  \n\t"
          "dpa.w.ph        $ac2, %[p2], %[filter12]  \n\t" /* even 2 */
          "dpa.w.ph        $ac2, %[p3], %[filter34]  \n\t" /* even 2 */
          "dpa.w.ph        $ac2, %[p4], %[filter56]  \n\t" /* even 2 */
          "dpa.w.ph        $ac2, %[p1], %[filter78]  \n\t" /* even 2 */
          "extp            %[Temp2], $ac2, 31  \n\t" /* even 2 */
          "lbux            %[st1], %[Temp1](%[cm])  \n\t" /* even 1 */

          "lbu             %[qload3], 2(%[dst])  \n\t" /* load even 2 from dst */

          /* even 3. pixel */
          "mtlo            %[vector_64], $ac1  \n\t" /* even 4 */
          "mthi            $zero, $ac1  \n\t"
          "addqh_r.w       %[st2], %[st2], %[st1]  \n\t" /* average even 1 */
          "preceu.ph.qbr   %[p2], %[qload1]  \n\t"
          "sb              %[st2], 0(%[dst])  \n\t" /* store even 1 to dst */
          "dpa.w.ph        $ac3, %[p3], %[filter12]  \n\t" /* even 3 */
          "dpa.w.ph        $ac3, %[p4], %[filter34]  \n\t" /* even 3 */
          "dpa.w.ph        $ac3, %[p1], %[filter56]  \n\t" /* even 3 */
          "dpa.w.ph        $ac3, %[p5], %[filter78]  \n\t" /* even 3 */
          "extp            %[Temp3], $ac3, 31  \n\t" /* even 3 */
          "lbux            %[st2], %[Temp2](%[cm])  \n\t" /* even 2 */

          /* even 4. pixel */
          "mtlo            %[vector_64], $ac2  \n\t" /* even 5 */
          "mthi            $zero, $ac2  \n\t"
          "addqh_r.w       %[qload3], %[qload3], %[st2]  \n\t" /* average even 2 */
          "preceu.ph.qbl   %[p3], %[qload1]  \n\t"
          "sb              %[qload3], 2(%[dst])  \n\t" /* store even 2 to dst */
          "ulw             %[qload2], 16(%[src])  \n\t"
          "lbu             %[qload3], 4(%[dst])  \n\t" /* load even 3 from dst */
          "lbu             %[qload1], 6(%[dst])  \n\t" /* load even 4 from dst */
          "dpa.w.ph        $ac1, %[p4], %[filter12]  \n\t" /* even 4 */
          "dpa.w.ph        $ac1, %[p1], %[filter34]  \n\t" /* even 4 */
          "dpa.w.ph        $ac1, %[p5], %[filter56]  \n\t" /* even 4 */
          "dpa.w.ph        $ac1, %[p2], %[filter78]  \n\t" /* even 4 */
          "extp            %[Temp1], $ac1, 31  \n\t" /* even 4 */
          "lbux            %[st3], %[Temp3](%[cm])  \n\t" /* even 3 */

          /* even 5. pixel */
          "mtlo            %[vector_64], $ac3  \n\t" /* even 6 */
          "mthi            $zero, $ac3  \n\t"
          "addqh_r.w       %[qload3], %[qload3], %[st3]  \n\t" /* average even 3 */
          "preceu.ph.qbr   %[p4], %[qload2]  \n\t"
          "sb              %[qload3], 4(%[dst])  \n\t" /* store even 3 to dst */
          "dpa.w.ph        $ac2, %[p1], %[filter12]  \n\t" /* even 5 */
          "dpa.w.ph        $ac2, %[p5], %[filter34]  \n\t" /* even 5 */
          "dpa.w.ph        $ac2, %[p2], %[filter56]  \n\t" /* even 5 */
          "dpa.w.ph        $ac2, %[p3], %[filter78]  \n\t" /* even 5 */
          "extp            %[Temp2], $ac2, 31  \n\t" /* even 5 */
          "lbux            %[st1], %[Temp1](%[cm])  \n\t" /* even 4 */

          /* even 6. pixel */
          "mtlo            %[vector_64], $ac1  \n\t" /* even 7 */
          "mthi            $zero, $ac1  \n\t"
          "addqh_r.w       %[qload1], %[qload1], %[st1]  \n\t" /* average even 4 */
          "preceu.ph.qbl   %[p1], %[qload2]  \n\t"
          "sb              %[qload1], 6(%[dst])  \n\t" /* store even 4 to dst */
          "ulw             %[qload3], 20(%[src])  \n\t"
          "dpa.w.ph        $ac3, %[p5], %[filter12]  \n\t" /* even 6 */
          "dpa.w.ph        $ac3, %[p2], %[filter34]  \n\t" /* even 6 */
          "dpa.w.ph        $ac3, %[p3], %[filter56]  \n\t" /* even 6 */
          "dpa.w.ph        $ac3, %[p4], %[filter78]  \n\t" /* even 6 */
          "lbu             %[qload2], 8(%[dst])  \n\t" /* load even 5 from dst */
          "extp            %[Temp3], $ac3, 31  \n\t" /* even 6 */
          "lbux            %[st2], %[Temp2](%[cm])  \n\t" /* even 5 */

          /* even 7. pixel */
          "mtlo            %[vector_64], $ac2  \n\t" /* even 8 */
          "mthi            $zero, $ac2  \n\t"
          "addqh_r.w       %[qload2], %[qload2], %[st2]  \n\t" /* average even 5 */
          "preceu.ph.qbr   %[p5], %[qload3]  \n\t"
          "sb              %[qload2], 8(%[dst])  \n\t" /* store even 5 to dst */
          "dpa.w.ph        $ac1, %[p2], %[filter12]  \n\t" /* even 7 */
          "dpa.w.ph        $ac1, %[p3], %[filter34]  \n\t" /* even 7 */
          "dpa.w.ph        $ac1, %[p4], %[filter56]  \n\t" /* even 7 */
          "dpa.w.ph        $ac1, %[p1], %[filter78]  \n\t" /* even 7 */
          "lbu             %[qload3], 10(%[dst])  \n\t" /* load even 6 from dst */
          "extp            %[Temp1], $ac1, 31  \n\t" /* even 7 */
          "lbux            %[st3], %[Temp3](%[cm])  \n\t" /* even 6 */

          "lbu             %[st2], 12(%[dst])  \n\t" /* load even 7 from dst */

          /* even 8. pixel */
          "mtlo            %[vector_64], $ac3  \n\t" /* odd 1 */
          "mthi            $zero, $ac3  \n\t"
          "addqh_r.w       %[qload3], %[qload3], %[st3]  \n\t" /* average even 6 */
          "dpa.w.ph        $ac2, %[p3], %[filter12]  \n\t" /* even 8 */
          "dpa.w.ph        $ac2, %[p4], %[filter34]  \n\t" /* even 8 */
          "sb              %[qload3], 10(%[dst])  \n\t" /* store even 6 to dst */
          "dpa.w.ph        $ac2, %[p1], %[filter56]  \n\t" /* even 8 */
          "dpa.w.ph        $ac2, %[p5], %[filter78]  \n\t" /* even 8 */
          "extp            %[Temp2], $ac2, 31  \n\t" /* even 8 */
          "lbux            %[st1], %[Temp1](%[cm])  \n\t" /* even 7 */

          /* ODD pixels */
          "ulw             %[qload1], 1(%[src])  \n\t"
          "ulw             %[qload2], 5(%[src])  \n\t"

          "addqh_r.w       %[st2], %[st2], %[st1]  \n\t" /* average even 7 */

          /* odd 1. pixel */
          "mtlo            %[vector_64], $ac1  \n\t" /* odd 2 */
          "mthi            $zero, $ac1  \n\t"
          "preceu.ph.qbr   %[p1], %[qload1]  \n\t"
          "preceu.ph.qbl   %[p2], %[qload1]  \n\t"
          "preceu.ph.qbr   %[p3], %[qload2]  \n\t"
          "preceu.ph.qbl   %[p4], %[qload2]  \n\t"
          "sb              %[st2], 12(%[dst])  \n\t" /* store even 7 to dst */
          "ulw             %[qload3], 9(%[src])  \n\t"
          "dpa.w.ph        $ac3, %[p1], %[filter12]  \n\t" /* odd 1 */
          "dpa.w.ph        $ac3, %[p2], %[filter34]  \n\t" /* odd 1 */
          "lbu             %[qload2], 14(%[dst])  \n\t" /* load even 8 from dst */
          "dpa.w.ph        $ac3, %[p3], %[filter56]  \n\t" /* odd 1 */
          "dpa.w.ph        $ac3, %[p4], %[filter78]  \n\t" /* odd 1 */
          "extp            %[Temp3], $ac3, 31  \n\t" /* odd 1 */
          "lbux            %[st2], %[Temp2](%[cm])  \n\t" /* even 8 */

          "lbu             %[st1], 1(%[dst])  \n\t" /* load odd 1 from dst */

          /* odd 2. pixel */
          "mtlo            %[vector_64], $ac2  \n\t" /* odd 3 */
          "mthi            $zero, $ac2  \n\t"
          "addqh_r.w       %[qload2], %[qload2], %[st2]  \n\t" /* average even 8 */
          "preceu.ph.qbr   %[p1], %[qload3]  \n\t"
          "preceu.ph.qbl   %[p5], %[qload3]  \n\t"
          "sb              %[qload2], 14(%[dst])  \n\t" /* store even 8 to dst */
          "ulw             %[qload1], 13(%[src])  \n\t"
          "dpa.w.ph        $ac1, %[p2], %[filter12]  \n\t" /* odd 2 */
          "dpa.w.ph        $ac1, %[p3], %[filter34]  \n\t" /* odd 2 */
          "dpa.w.ph        $ac1, %[p4], %[filter56]  \n\t" /* odd 2 */
          "dpa.w.ph        $ac1, %[p1], %[filter78]  \n\t" /* odd 2 */
          "lbu             %[qload3], 3(%[dst])  \n\t" /* load odd 2 from dst */
          "extp            %[Temp1], $ac1, 31  \n\t" /* odd 2 */
          "lbux            %[st3], %[Temp3](%[cm])  \n\t" /* odd 1 */

          /* odd 3. pixel */
          "mtlo            %[vector_64], $ac3  \n\t" /* odd 4 */
          "mthi            $zero, $ac3  \n\t"
          "addqh_r.w       %[st3], %[st3], %[st1]  \n\t" /* average odd 1 */
          "preceu.ph.qbr   %[p2], %[qload1]  \n\t"
          "dpa.w.ph        $ac2, %[p3], %[filter12]  \n\t" /* odd 3 */
          "dpa.w.ph        $ac2, %[p4], %[filter34]  \n\t" /* odd 3 */
          "dpa.w.ph        $ac2, %[p1], %[filter56]  \n\t" /* odd 3 */
          "dpa.w.ph        $ac2, %[p5], %[filter78]  \n\t" /* odd 3 */
          "sb              %[st3], 1(%[dst])  \n\t" /* store odd 1 to dst */
          "extp            %[Temp2], $ac2, 31  \n\t" /* odd 3 */
          "lbux            %[st1], %[Temp1](%[cm])  \n\t" /* odd 2 */

          /* odd 4. pixel */
          "mtlo            %[vector_64], $ac1  \n\t" /* odd 5 */
          "mthi            $zero, $ac1  \n\t"
          "addqh_r.w       %[qload3], %[qload3], %[st1]  \n\t" /* average odd 2 */
          "preceu.ph.qbl   %[p3], %[qload1]  \n\t"
          "sb              %[qload3], 3(%[dst])  \n\t" /* store odd 2 to dst */
          "lbu             %[qload1], 5(%[dst])  \n\t" /* load odd 3 from dst */
          "ulw             %[qload2], 17(%[src])  \n\t"
          "dpa.w.ph        $ac3, %[p4], %[filter12]  \n\t" /* odd 4 */
          "dpa.w.ph        $ac3, %[p1], %[filter34]  \n\t" /* odd 4 */
          "dpa.w.ph        $ac3, %[p5], %[filter56]  \n\t" /* odd 4 */
          "dpa.w.ph        $ac3, %[p2], %[filter78]  \n\t" /* odd 4 */
          "extp            %[Temp3], $ac3, 31  \n\t" /* odd 4 */
          "lbux            %[st2], %[Temp2](%[cm])  \n\t" /* odd 3 */

          "lbu             %[st1], 7(%[dst])  \n\t" /* load odd 4 from dst */

          /* odd 5. pixel */
          "mtlo            %[vector_64], $ac2  \n\t" /* odd 6 */
          "mthi            $zero, $ac2  \n\t"
          "addqh_r.w       %[qload1], %[qload1], %[st2]  \n\t" /* average odd 3 */
          "preceu.ph.qbr   %[p4], %[qload2]  \n\t"
          "sb              %[qload1], 5(%[dst])  \n\t" /* store odd 3 to dst */
          "dpa.w.ph        $ac1, %[p1], %[filter12]  \n\t" /* odd 5 */
          "dpa.w.ph        $ac1, %[p5], %[filter34]  \n\t" /* odd 5 */
          "dpa.w.ph        $ac1, %[p2], %[filter56]  \n\t" /* odd 5 */
          "dpa.w.ph        $ac1, %[p3], %[filter78]  \n\t" /* odd 5 */
          "extp            %[Temp1], $ac1, 31  \n\t" /* odd 5 */
          "lbux            %[st3], %[Temp3](%[cm])  \n\t" /* odd 4 */

          "lbu             %[qload1], 9(%[dst])  \n\t" /* load odd 5 from dst */

          /* odd 6. pixel */
          "mtlo            %[vector_64], $ac3  \n\t" /* odd 7 */
          "mthi            $zero, $ac3  \n\t"
          "addqh_r.w       %[st1], %[st1], %[st3]  \n\t" /* average odd 4 */
          "preceu.ph.qbl   %[p1], %[qload2]  \n\t"
          "sb              %[st1], 7(%[dst])  \n\t" /* store odd 4 to dst */
          "ulw             %[qload3], 21(%[src])  \n\t"
          "dpa.w.ph        $ac2, %[p5], %[filter12]  \n\t" /* odd 6 */
          "dpa.w.ph        $ac2, %[p2], %[filter34]  \n\t" /* odd 6 */
          "dpa.w.ph        $ac2, %[p3], %[filter56]  \n\t" /* odd 6 */
          "dpa.w.ph        $ac2, %[p4], %[filter78]  \n\t" /* odd 6 */
          "extp            %[Temp2], $ac2, 31  \n\t" /* odd 6 */
          "lbux            %[st1], %[Temp1](%[cm])  \n\t" /* odd 5 */

          /* odd 7. pixel */
          "mtlo            %[vector_64], $ac1  \n\t" /* odd 8 */
          "mthi            $zero, $ac1  \n\t"
          "addqh_r.w       %[qload1], %[qload1], %[st1]  \n\t" /* average odd 5 */
          "preceu.ph.qbr   %[p5], %[qload3]  \n\t"
          "sb              %[qload1], 9(%[dst])  \n\t" /* store odd 5 to dst */
          "lbu             %[qload2], 11(%[dst])  \n\t" /* load odd 6 from dst */
          "dpa.w.ph        $ac3, %[p2], %[filter12]  \n\t" /* odd 7 */
          "dpa.w.ph        $ac3, %[p3], %[filter34]  \n\t" /* odd 7 */
          "dpa.w.ph        $ac3, %[p4], %[filter56]  \n\t" /* odd 7 */
          "dpa.w.ph        $ac3, %[p1], %[filter78]  \n\t" /* odd 7 */
          "extp            %[Temp3], $ac3, 31  \n\t" /* odd 7 */

          "lbu             %[qload3], 13(%[dst])  \n\t" /* load odd 7 from dst */

          /* odd 8. pixel */
          "dpa.w.ph        $ac1, %[p3], %[filter12]  \n\t" /* odd 8 */
          "dpa.w.ph        $ac1, %[p4], %[filter34]  \n\t" /* odd 8 */
          "dpa.w.ph        $ac1, %[p1], %[filter56]  \n\t" /* odd 8 */
          "dpa.w.ph        $ac1, %[p5], %[filter78]  \n\t" /* odd 8 */
          "extp            %[Temp1], $ac1, 31  \n\t" /* odd 8 */

          "lbu             %[qload1], 15(%[dst])  \n\t" /* load odd 8 from dst */

          "lbux            %[st2], %[Temp2](%[cm])  \n\t" /* odd 6 */
          "addqh_r.w       %[qload2], %[qload2], %[st2]  \n\t" /* average odd 6 */

          "lbux            %[st3], %[Temp3](%[cm])  \n\t" /* odd 7 */
          "addqh_r.w       %[qload3], %[qload3], %[st3]  \n\t" /* average odd 7 */

          "lbux            %[st1], %[Temp1](%[cm])  \n\t" /* odd 8 */
          "addqh_r.w       %[qload1], %[qload1], %[st1]  \n\t" /* average odd 8 */

          "sb              %[qload2], 11(%[dst])  \n\t" /* store odd 6 to dst */
          "sb              %[qload3], 13(%[dst])  \n\t" /* store odd 7 to dst */
          "sb              %[qload1], 15(%[dst])  \n\t" /* store odd 8 to dst */

          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2),
            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
            [qload3] "=&r" (qload3), [p5] "=&r" (p5),
            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
          : [filter12] "r" (filter12), [filter34] "r" (filter34),
            [filter56] "r" (filter56), [filter78] "r" (filter78),
            [vector_64] "r" (vector_64),
            [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
      );

      src += 16;
      dst += 16;
    }

    /* Next row... */
    src_ptr += src_stride;
    dst_ptr += dst_stride;
  }
}

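/* 64-pixel-wide variant: the same per-tile kernel as the 16-pixel
 * version, applied to four fixed 16-pixel tiles per row, with wider
 * prefetching ahead of the next row. */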
static void convolve_avg_horiz_64_dspr2(const uint8_t *src_ptr,
                                        int32_t src_stride,
                                        uint8_t *dst_ptr,
                                        int32_t dst_stride,
                                        const int16_t *filter_x0,
                                        int32_t h) {
  int32_t y, c;
  const uint8_t *src;
  uint8_t *dst;
  uint8_t *cm = vp9_ff_cropTbl;
  uint32_t vector_64 = 64;
  int32_t filter12, filter34, filter56, filter78;
  int32_t Temp1, Temp2, Temp3;
  uint32_t qload1, qload2, qload3;
  uint32_t p1, p2, p3, p4, p5;
  uint32_t st1, st2, st3;

  filter12 = ((const int32_t *)filter_x0)[0];
  filter34 = ((const int32_t *)filter_x0)[1];
  filter56 = ((const int32_t *)filter_x0)[2];
  filter78 = ((const int32_t *)filter_x0)[3];

  for (y = h; y--;) {
    src = src_ptr;
    dst = dst_ptr;

    /* prefetch data to cache memory */
    vp9_prefetch_load(src_ptr + src_stride);
    vp9_prefetch_load(src_ptr + src_stride + 32);
    vp9_prefetch_load(src_ptr + src_stride + 64);
    vp9_prefetch_store(dst_ptr + dst_stride);
    vp9_prefetch_store(dst_ptr + dst_stride + 32);

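    /* Process the 64-pixel row as four consecutive 16-pixel tiles. */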
    for (c = 0; c < 4; c++) {
      __asm__ __volatile__ (
          "ulw             %[qload1], 0(%[src])  \n\t"
          "ulw             %[qload2], 4(%[src])  \n\t"

          /* even 1. pixel */
          "mtlo            %[vector_64], $ac1  \n\t" /* even 1 */
          "mthi            $zero, $ac1  \n\t"
          "mtlo            %[vector_64], $ac2  \n\t" /* even 2 */
          "mthi            $zero, $ac2  \n\t"
          "preceu.ph.qbr   %[p1], %[qload1]  \n\t"
          "preceu.ph.qbl   %[p2], %[qload1]  \n\t"
          "preceu.ph.qbr   %[p3], %[qload2]  \n\t"
          "preceu.ph.qbl   %[p4], %[qload2]  \n\t"
          "ulw             %[qload3], 8(%[src])  \n\t"
          "dpa.w.ph        $ac1, %[p1], %[filter12]  \n\t" /* even 1 */
          "dpa.w.ph        $ac1, %[p2], %[filter34]  \n\t" /* even 1 */
          "dpa.w.ph        $ac1, %[p3], %[filter56]  \n\t" /* even 1 */
          "dpa.w.ph        $ac1, %[p4], %[filter78]  \n\t" /* even 1 */
          "extp            %[Temp1], $ac1, 31  \n\t" /* even 1 */
          "lbu             %[st2], 0(%[dst])  \n\t" /* load even 1 from dst */

          /* even 2. pixel */
          "mtlo            %[vector_64], $ac3  \n\t" /* even 3 */
          "mthi            $zero, $ac3  \n\t"
          "preceu.ph.qbr   %[p1], %[qload3]  \n\t"
          "preceu.ph.qbl   %[p5], %[qload3]  \n\t"
          "ulw             %[qload1], 12(%[src])  \n\t"
          "dpa.w.ph        $ac2, %[p2], %[filter12]  \n\t" /* even 2 */
          "dpa.w.ph        $ac2, %[p3], %[filter34]  \n\t" /* even 2 */
          "dpa.w.ph        $ac2, %[p4], %[filter56]  \n\t" /* even 2 */
          "dpa.w.ph        $ac2, %[p1], %[filter78]  \n\t" /* even 2 */
          "extp            %[Temp2], $ac2, 31  \n\t" /* even 2 */
          "lbux            %[st1], %[Temp1](%[cm])  \n\t" /* even 1 */

          "lbu             %[qload3], 2(%[dst])  \n\t" /* load even 2 from dst */

          /* even 3. pixel */
          "mtlo            %[vector_64], $ac1  \n\t" /* even 4 */
          "mthi            $zero, $ac1  \n\t"
          "addqh_r.w       %[st2], %[st2], %[st1]  \n\t" /* average even 1 */
          "preceu.ph.qbr   %[p2], %[qload1]  \n\t"
          "sb              %[st2], 0(%[dst])  \n\t" /* store even 1 to dst */
          "dpa.w.ph        $ac3, %[p3], %[filter12]  \n\t" /* even 3 */
          "dpa.w.ph        $ac3, %[p4], %[filter34]  \n\t" /* even 3 */
          "dpa.w.ph        $ac3, %[p1], %[filter56]  \n\t" /* even 3 */
          "dpa.w.ph        $ac3, %[p5], %[filter78]  \n\t" /* even 3 */
          "extp            %[Temp3], $ac3, 31  \n\t" /* even 3 */
          "lbux            %[st2], %[Temp2](%[cm])  \n\t" /* even 2 */

          /* even 4. pixel */
          "mtlo            %[vector_64], $ac2  \n\t" /* even 5 */
          "mthi            $zero, $ac2  \n\t"
          "addqh_r.w       %[qload3], %[qload3], %[st2]  \n\t" /* average even 2 */
          "preceu.ph.qbl   %[p3], %[qload1]  \n\t"
          "sb              %[qload3], 2(%[dst])  \n\t" /* store even 2 to dst */
          "ulw             %[qload2], 16(%[src])  \n\t"
          "lbu             %[qload3], 4(%[dst])  \n\t" /* load even 3 from dst */
          "lbu             %[qload1], 6(%[dst])  \n\t" /* load even 4 from dst */
          "dpa.w.ph        $ac1, %[p4], %[filter12]  \n\t" /* even 4 */
          "dpa.w.ph        $ac1, %[p1], %[filter34]  \n\t" /* even 4 */
          "dpa.w.ph        $ac1, %[p5], %[filter56]  \n\t" /* even 4 */
          "dpa.w.ph        $ac1, %[p2], %[filter78]  \n\t" /* even 4 */
          "extp            %[Temp1], $ac1, 31  \n\t" /* even 4 */
          "lbux            %[st3], %[Temp3](%[cm])  \n\t" /* even 3 */

          /* even 5. pixel */
          "mtlo            %[vector_64], $ac3  \n\t" /* even 6 */
          "mthi            $zero, $ac3  \n\t"
          "addqh_r.w       %[qload3], %[qload3], %[st3]  \n\t" /* average even 3 */
          "preceu.ph.qbr   %[p4], %[qload2]  \n\t"
          "sb              %[qload3], 4(%[dst])  \n\t" /* store even 3 to dst */
          "dpa.w.ph        $ac2, %[p1], %[filter12]  \n\t" /* even 5 */
          "dpa.w.ph        $ac2, %[p5], %[filter34]  \n\t" /* even 5 */
          "dpa.w.ph        $ac2, %[p2], %[filter56]  \n\t" /* even 5 */
          "dpa.w.ph        $ac2, %[p3], %[filter78]  \n\t" /* even 5 */
          "extp            %[Temp2], $ac2, 31  \n\t" /* even 5 */
          "lbux            %[st1], %[Temp1](%[cm])  \n\t" /* even 4 */

          /* even 6. pixel */
          "mtlo            %[vector_64], $ac1  \n\t" /* even 7 */
          "mthi            $zero, $ac1  \n\t"
          "addqh_r.w       %[qload1], %[qload1], %[st1]  \n\t" /* average even 4 */
          "preceu.ph.qbl   %[p1], %[qload2]  \n\t"
          "sb              %[qload1], 6(%[dst])  \n\t" /* store even 4 to dst */
          "ulw             %[qload3], 20(%[src])  \n\t"
          "dpa.w.ph        $ac3, %[p5], %[filter12]  \n\t" /* even 6 */
          "dpa.w.ph        $ac3, %[p2], %[filter34]  \n\t" /* even 6 */
          "dpa.w.ph        $ac3, %[p3], %[filter56]  \n\t" /* even 6 */
          "dpa.w.ph        $ac3, %[p4], %[filter78]  \n\t" /* even 6 */
          "lbu             %[qload2], 8(%[dst])  \n\t" /* load even 5 from dst */
          "extp            %[Temp3], $ac3, 31  \n\t" /* even 6 */
          "lbux            %[st2], %[Temp2](%[cm])  \n\t" /* even 5 */

          /* even 7. pixel */
          "mtlo            %[vector_64], $ac2  \n\t" /* even 8 */
          "mthi            $zero, $ac2  \n\t"
          "addqh_r.w       %[qload2], %[qload2], %[st2]  \n\t" /* average even 5 */
          "preceu.ph.qbr   %[p5], %[qload3]  \n\t"
          "sb              %[qload2], 8(%[dst])  \n\t" /* store even 5 to dst */
          "dpa.w.ph        $ac1, %[p2], %[filter12]  \n\t" /* even 7 */
          "dpa.w.ph        $ac1, %[p3], %[filter34]  \n\t" /* even 7 */
          "dpa.w.ph        $ac1, %[p4], %[filter56]  \n\t" /* even 7 */
          "dpa.w.ph        $ac1, %[p1], %[filter78]  \n\t" /* even 7 */
          "lbu             %[qload3], 10(%[dst])  \n\t" /* load even 6 from dst */
          "extp            %[Temp1], $ac1, 31  \n\t" /* even 7 */
          "lbux            %[st3], %[Temp3](%[cm])  \n\t" /* even 6 */

          "lbu             %[st2], 12(%[dst])  \n\t" /* load even 7 from dst */

          /* even 8. pixel */
          "mtlo            %[vector_64], $ac3  \n\t" /* odd 1 */
          "mthi            $zero, $ac3  \n\t"
          "addqh_r.w       %[qload3], %[qload3], %[st3]  \n\t" /* average even 6 */
          "dpa.w.ph        $ac2, %[p3], %[filter12]  \n\t" /* even 8 */
          "dpa.w.ph        $ac2, %[p4], %[filter34]  \n\t" /* even 8 */
          "sb              %[qload3], 10(%[dst])  \n\t" /* store even 6 to dst */
          "dpa.w.ph        $ac2, %[p1], %[filter56]  \n\t" /* even 8 */
          "dpa.w.ph        $ac2, %[p5], %[filter78]  \n\t" /* even 8 */
          "extp            %[Temp2], $ac2, 31  \n\t" /* even 8 */
          "lbux            %[st1], %[Temp1](%[cm])  \n\t" /* even 7 */

          /* ODD pixels */
          "ulw             %[qload1], 1(%[src])  \n\t"
          "ulw             %[qload2], 5(%[src])  \n\t"

          "addqh_r.w       %[st2], %[st2], %[st1]  \n\t" /* average even 7 */

          /* odd 1. pixel */
          "mtlo            %[vector_64], $ac1  \n\t" /* odd 2 */
          "mthi            $zero, $ac1  \n\t"
          "preceu.ph.qbr   %[p1], %[qload1]  \n\t"
          "preceu.ph.qbl   %[p2], %[qload1]  \n\t"
          "preceu.ph.qbr   %[p3], %[qload2]  \n\t"
          "preceu.ph.qbl   %[p4], %[qload2]  \n\t"
          "sb              %[st2], 12(%[dst])  \n\t" /* store even 7 to dst */
          "ulw             %[qload3], 9(%[src])  \n\t"
          "dpa.w.ph        $ac3, %[p1], %[filter12]  \n\t" /* odd 1 */
          "dpa.w.ph        $ac3, %[p2], %[filter34]  \n\t" /* odd 1 */
          "lbu             %[qload2], 14(%[dst])  \n\t" /* load even 8 from dst */
          "dpa.w.ph        $ac3, %[p3], %[filter56]  \n\t" /* odd 1 */
          "dpa.w.ph        $ac3, %[p4], %[filter78]  \n\t" /* odd 1 */
          "extp            %[Temp3], $ac3, 31  \n\t" /* odd 1 */
          "lbux            %[st2], %[Temp2](%[cm])  \n\t" /* even 8 */

          "lbu             %[st1], 1(%[dst])  \n\t" /* load odd 1 from dst */

          /* odd 2. pixel */
          "mtlo            %[vector_64], $ac2  \n\t" /* odd 3 */
          "mthi            $zero, $ac2  \n\t"
          "addqh_r.w       %[qload2], %[qload2], %[st2]  \n\t" /* average even 8 */
          "preceu.ph.qbr   %[p1], %[qload3]  \n\t"
          "preceu.ph.qbl   %[p5], %[qload3]  \n\t"
          "sb              %[qload2], 14(%[dst])  \n\t" /* store even 8 to dst */
          "ulw             %[qload1], 13(%[src])  \n\t"
          "dpa.w.ph        $ac1, %[p2], %[filter12]  \n\t" /* odd 2 */
          "dpa.w.ph        $ac1, %[p3], %[filter34]  \n\t" /* odd 2 */
          "dpa.w.ph        $ac1, %[p4], %[filter56]  \n\t" /* odd 2 */
          "dpa.w.ph        $ac1, %[p1], %[filter78]  \n\t" /* odd 2 */
          "lbu             %[qload3], 3(%[dst])  \n\t" /* load odd 2 from dst */
          "extp            %[Temp1], $ac1, 31  \n\t" /* odd 2 */
          "lbux            %[st3], %[Temp3](%[cm])  \n\t" /* odd 1 */

          /* odd 3. pixel */
          "mtlo            %[vector_64], $ac3  \n\t" /* odd 4 */
          "mthi            $zero, $ac3  \n\t"
          "addqh_r.w       %[st3], %[st3], %[st1]  \n\t" /* average odd 1 */
          "preceu.ph.qbr   %[p2], %[qload1]  \n\t"
          "dpa.w.ph        $ac2, %[p3], %[filter12]  \n\t" /* odd 3 */
          "dpa.w.ph        $ac2, %[p4], %[filter34]  \n\t" /* odd 3 */
          "dpa.w.ph        $ac2, %[p1], %[filter56]  \n\t" /* odd 3 */
          "dpa.w.ph        $ac2, %[p5], %[filter78]  \n\t" /* odd 3 */
          "sb              %[st3], 1(%[dst])  \n\t" /* store odd 1 to dst */
          "extp            %[Temp2], $ac2, 31  \n\t" /* odd 3 */
          "lbux            %[st1], %[Temp1](%[cm])  \n\t" /* odd 2 */

          /* odd 4. pixel */
          "mtlo            %[vector_64], $ac1  \n\t" /* odd 5 */
          "mthi            $zero, $ac1  \n\t"
          "addqh_r.w       %[qload3], %[qload3], %[st1]  \n\t" /* average odd 2 */
          "preceu.ph.qbl   %[p3], %[qload1]  \n\t"
          "sb              %[qload3], 3(%[dst])  \n\t" /* store odd 2 to dst */
          "lbu             %[qload1], 5(%[dst])  \n\t" /* load odd 3 from dst */
          "ulw             %[qload2], 17(%[src])  \n\t"
          "dpa.w.ph        $ac3, %[p4], %[filter12]  \n\t" /* odd 4 */
          "dpa.w.ph        $ac3, %[p1], %[filter34]  \n\t" /* odd 4 */
          "dpa.w.ph        $ac3, %[p5], %[filter56]  \n\t" /* odd 4 */
          "dpa.w.ph        $ac3, %[p2], %[filter78]  \n\t" /* odd 4 */
          "extp            %[Temp3], $ac3, 31  \n\t" /* odd 4 */
          "lbux            %[st2], %[Temp2](%[cm])  \n\t" /* odd 3 */

          "lbu             %[st1], 7(%[dst])  \n\t" /* load odd 4 from dst */

          /* odd 5. pixel */
          "mtlo            %[vector_64], $ac2  \n\t" /* odd 6 */
          "mthi            $zero, $ac2  \n\t"
          "addqh_r.w       %[qload1], %[qload1], %[st2]  \n\t" /* average odd 3 */
          "preceu.ph.qbr   %[p4], %[qload2]  \n\t"
          "sb              %[qload1], 5(%[dst])  \n\t" /* store odd 3 to dst */
          "dpa.w.ph        $ac1, %[p1], %[filter12]  \n\t" /* odd 5 */
          "dpa.w.ph        $ac1, %[p5], %[filter34]  \n\t" /* odd 5 */
          "dpa.w.ph        $ac1, %[p2], %[filter56]  \n\t" /* odd 5 */
          "dpa.w.ph        $ac1, %[p3], %[filter78]  \n\t" /* odd 5 */
          "extp            %[Temp1], $ac1, 31  \n\t" /* odd 5 */
          "lbux            %[st3], %[Temp3](%[cm])  \n\t" /* odd 4 */

          "lbu             %[qload1], 9(%[dst])  \n\t" /* load odd 5 from dst */

          /* odd 6. pixel */
          "mtlo            %[vector_64], $ac3  \n\t" /* odd 7 */
          "mthi            $zero, $ac3  \n\t"
          "addqh_r.w       %[st1], %[st1], %[st3]  \n\t" /* average odd 4 */
          "preceu.ph.qbl   %[p1], %[qload2]  \n\t"
          "sb              %[st1], 7(%[dst])  \n\t" /* store odd 4 to dst */
          "ulw             %[qload3], 21(%[src])  \n\t"
          "dpa.w.ph        $ac2, %[p5], %[filter12]  \n\t" /* odd 6 */
          "dpa.w.ph        $ac2, %[p2], %[filter34]  \n\t" /* odd 6 */
          "dpa.w.ph        $ac2, %[p3], %[filter56]  \n\t" /* odd 6 */
          "dpa.w.ph        $ac2, %[p4], %[filter78]  \n\t" /* odd 6 */
          "extp            %[Temp2], $ac2, 31  \n\t" /* odd 6 */
          "lbux            %[st1], %[Temp1](%[cm])  \n\t" /* odd 5 */

          /* odd 7. pixel */
          "mtlo            %[vector_64], $ac1  \n\t" /* odd 8 */
          "mthi            $zero, $ac1  \n\t"
          "addqh_r.w       %[qload1], %[qload1], %[st1]  \n\t" /* average odd 5 */
          "preceu.ph.qbr   %[p5], %[qload3]  \n\t"
          "sb              %[qload1], 9(%[dst])  \n\t" /* store odd 5 to dst */
          "lbu             %[qload2], 11(%[dst])  \n\t" /* load odd 6 from dst */
          "dpa.w.ph        $ac3, %[p2], %[filter12]  \n\t" /* odd 7 */
          "dpa.w.ph        $ac3, %[p3], %[filter34]  \n\t" /* odd 7 */
          "dpa.w.ph        $ac3, %[p4], %[filter56]  \n\t" /* odd 7 */
          "dpa.w.ph        $ac3, %[p1], %[filter78]  \n\t" /* odd 7 */
          "extp            %[Temp3], $ac3, 31  \n\t" /* odd 7 */

          "lbu             %[qload3], 13(%[dst])  \n\t" /* load odd 7 from dst */

          /* odd 8. pixel */
          "dpa.w.ph        $ac1, %[p3], %[filter12]  \n\t" /* odd 8 */
          "dpa.w.ph        $ac1, %[p4], %[filter34]  \n\t" /* odd 8 */
          "dpa.w.ph        $ac1, %[p1], %[filter56]  \n\t" /* odd 8 */
          "dpa.w.ph        $ac1, %[p5], %[filter78]  \n\t" /* odd 8 */
          "extp            %[Temp1], $ac1, 31  \n\t" /* odd 8 */

          "lbu             %[qload1], 15(%[dst])  \n\t" /* load odd 8 from dst */

          "lbux            %[st2], %[Temp2](%[cm])  \n\t" /* odd 6 */
          "addqh_r.w       %[qload2], %[qload2], %[st2]  \n\t" /* average odd 6 */

          "lbux            %[st3], %[Temp3](%[cm])  \n\t" /* odd 7 */
          "addqh_r.w       %[qload3], %[qload3], %[st3]  \n\t" /* average odd 7 */

          "lbux            %[st1], %[Temp1](%[cm])  \n\t" /* odd 8 */
          "addqh_r.w       %[qload1], %[qload1], %[st1]  \n\t" /* average odd 8 */

          "sb              %[qload2], 11(%[dst])  \n\t" /* store odd 6 to dst */
          "sb              %[qload3], 13(%[dst])  \n\t" /* store odd 7 to dst */
          "sb              %[qload1], 15(%[dst])  \n\t" /* store odd 8 to dst */

          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2),
            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
            [qload3] "=&r" (qload3), [p5] "=&r" (p5),
            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
          : [filter12] "r" (filter12), [filter34] "r" (filter34),
            [filter56] "r" (filter56), [filter78] "r" (filter78),
            [vector_64] "r" (vector_64),
            [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
      );

      src += 16;
      dst += 16;
    }

    /* Next row... */
    src_ptr += src_stride;
    dst_ptr += dst_stride;
  }
}

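/* Entry point for the horizontal 8-tap convolve-and-average. A filter
 * whose taps 2/3 read as the word 0x800000 (a pass-through filter with
 * center tap 128) is routed to vp9_convolve_avg, one whose first two
 * taps are zero to the 2-tap vp9_convolve2_avg_horiz_dspr2, and only
 * the unscaled case (x_step_q4 == 16) with a supported width runs the
 * DSPr2 kernels above; anything else falls back to the C version. */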
void vp9_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const int16_t *filter_x, int x_step_q4,
                                   const int16_t *filter_y, int y_step_q4,
                                   int w, int h) {
  if (((const int32_t *)filter_x)[1] == 0x800000) {
    vp9_convolve_avg(src, src_stride,
                     dst, dst_stride,
                     filter_x, x_step_q4,
                     filter_y, y_step_q4,
                     w, h);
  } else if (((const int32_t *)filter_x)[0] == 0) {
    vp9_convolve2_avg_horiz_dspr2(src, src_stride,
                                  dst, dst_stride,
                                  filter_x, x_step_q4,
                                  filter_y, y_step_q4,
                                  w, h);
  } else {
    if (16 == x_step_q4) {
      uint32_t pos = 38;

      src -= 3;
      /* Set the bit position for extract from acc: pos = 38 = 31 + 7,
         so extp with size 31 yields the 32-bit result shifted right by
         FILTER_BITS (7). */
      __asm__ __volatile__ (
          "wrdsp           %[pos], 1  \n\t"
          :
          : [pos] "r" (pos)
      );

      /* prefetch data to cache memory */
      vp9_prefetch_load(src);
      vp9_prefetch_load(src + 32);
      vp9_prefetch_store(dst);

      switch (w) {
        case 4:
          convolve_avg_horiz_4_dspr2(src, src_stride,
                                     dst, dst_stride,
                                     filter_x, h);
          break;
        case 8:
          convolve_avg_horiz_8_dspr2(src, src_stride,
                                     dst, dst_stride,
                                     filter_x, h);
          break;
        case 16:
          convolve_avg_horiz_16_dspr2(src, src_stride,
                                      dst, dst_stride,
                                      filter_x, h, 1);
          break;
        case 32:
          convolve_avg_horiz_16_dspr2(src, src_stride,
                                      dst, dst_stride,
                                      filter_x, h, 2);
          break;
        case 64:
          vp9_prefetch_load(src + 64);
          vp9_prefetch_store(dst + 32);

          convolve_avg_horiz_64_dspr2(src, src_stride,
                                      dst, dst_stride,
                                      filter_x, h);
          break;
        default:
          vp9_convolve8_avg_horiz_c(src + 3, src_stride,
                                    dst, dst_stride,
                                    filter_x, x_step_q4,
                                    filter_y, y_step_q4,
                                    w, h);
          break;
      }
    } else {
      vp9_convolve8_avg_horiz_c(src, src_stride,
                                dst, dst_stride,
                                filter_x, x_step_q4,
                                filter_y, y_step_q4,
                                w, h);
    }
  }
}
#endif