/*
 * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <stdio.h>

#include "./vpx_config.h"
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_common.h"
#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"
#include "vp9/common/vp9_convolve.h"
#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"

#if HAVE_DSPR2
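/* Horizontal 8-tap convolution of a 4-pixel-wide block. For each of the h
 * rows, 12 source bytes are loaded, two even-phase and two odd-phase outputs
 * are accumulated on the DSP accumulators with the packed filter taps,
 * rounded (the +64 bias plus the extp extraction effectively computes
 * (sum + 64) >> 7) and clamped through the crop table before being stored. */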
static void convolve_horiz_4_dspr2(const uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int16_t *filter_x0,
                                   int32_t h) {
  int32_t y;
  uint8_t *cm = vp9_ff_cropTbl;
  int32_t vector1b, vector2b, vector3b, vector4b;
  int32_t Temp1, Temp2, Temp3, Temp4;
  uint32_t vector4a = 64;
  uint32_t tp1, tp2;
  uint32_t p1, p2, p3, p4;
  uint32_t n1, n2, n3, n4;
  uint32_t tn1, tn2;

  vector1b = ((const int32_t *)filter_x0)[0];
  vector2b = ((const int32_t *)filter_x0)[1];
  vector3b = ((const int32_t *)filter_x0)[2];
  vector4b = ((const int32_t *)filter_x0)[3];

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    vp9_prefetch_load(src + src_stride);
    vp9_prefetch_load(src + src_stride + 32);
    vp9_prefetch_store(dst + dst_stride);

    __asm__ __volatile__ (
        "ulw %[tp1], 0(%[src]) \n\t"
        "ulw %[tp2], 4(%[src]) \n\t"

        /* even 1. pixel */
        "mtlo %[vector4a], $ac3 \n\t"
        "mthi $zero, $ac3 \n\t"
        "preceu.ph.qbr %[p1], %[tp1] \n\t"
        "preceu.ph.qbl %[p2], %[tp1] \n\t"
        "preceu.ph.qbr %[p3], %[tp2] \n\t"
        "preceu.ph.qbl %[p4], %[tp2] \n\t"
        "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
        "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
        "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
        "ulw %[tn2], 8(%[src]) \n\t"
        "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
        "extp %[Temp1], $ac3, 31 \n\t"

        /* even 2. pixel */
        "mtlo %[vector4a], $ac2 \n\t"
        "mthi $zero, $ac2 \n\t"
        "preceu.ph.qbr %[p1], %[tn2] \n\t"
        "balign %[tn1], %[tn2], 3 \n\t"
        "balign %[tn2], %[tp2], 3 \n\t"
        "balign %[tp2], %[tp1], 3 \n\t"
        "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
        "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
        "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
        "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t"
        "extp %[Temp3], $ac2, 31 \n\t"

        /* odd 1. pixel */
        "lbux %[tp1], %[Temp1](%[cm]) \n\t"
        "mtlo %[vector4a], $ac3 \n\t"
        "mthi $zero, $ac3 \n\t"
        "preceu.ph.qbr %[n1], %[tp2] \n\t"
        "preceu.ph.qbl %[n2], %[tp2] \n\t"
        "preceu.ph.qbr %[n3], %[tn2] \n\t"
        "preceu.ph.qbl %[n4], %[tn2] \n\t"
        "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
        "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"
        "dpa.w.ph $ac3, %[n3], %[vector3b] \n\t"
        "dpa.w.ph $ac3, %[n4], %[vector4b] \n\t"
        "extp %[Temp2], $ac3, 31 \n\t"

        /* odd 2. pixel */
        "lbux %[tp2], %[Temp3](%[cm]) \n\t"
        "mtlo %[vector4a], $ac2 \n\t"
        "mthi $zero, $ac2 \n\t"
        "preceu.ph.qbr %[n1], %[tn1] \n\t"
        "dpa.w.ph $ac2, %[n2], %[vector1b] \n\t"
        "dpa.w.ph $ac2, %[n3], %[vector2b] \n\t"
        "dpa.w.ph $ac2, %[n4], %[vector3b] \n\t"
        "dpa.w.ph $ac2, %[n1], %[vector4b] \n\t"
        "extp %[Temp4], $ac2, 31 \n\t"

        /* clamp */
        "lbux %[tn1], %[Temp2](%[cm]) \n\t"
        "lbux %[n2], %[Temp4](%[cm]) \n\t"

        /* store bytes */
        "sb %[tp1], 0(%[dst]) \n\t"
        "sb %[tn1], 1(%[dst]) \n\t"
        "sb %[tp2], 2(%[dst]) \n\t"
        "sb %[n2], 3(%[dst]) \n\t"

        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
          [tn1] "=&r" (tn1), [tn2] "=&r" (tn2),
          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
          [n1] "=&r" (n1), [n2] "=&r" (n2), [n3] "=&r" (n3), [n4] "=&r" (n4),
          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
          [Temp3] "=&r" (Temp3), [Temp4] "=&r" (Temp4)
        : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
          [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
          [vector4a] "r" (vector4a),
          [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
    );

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}

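/* Same 8-tap horizontal filter as above, but producing 8 output pixels per
 * row (four even and four odd phases interleaved across three DSP
 * accumulators). */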
static void convolve_horiz_8_dspr2(const uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int16_t *filter_x0,
                                   int32_t h) {
  int32_t y;
  uint8_t *cm = vp9_ff_cropTbl;
  uint32_t vector4a = 64;
  int32_t vector1b, vector2b, vector3b, vector4b;
  int32_t Temp1, Temp2, Temp3;
  uint32_t tp1, tp2;
  uint32_t p1, p2, p3, p4, n1;
  uint32_t tn1, tn2, tn3;
  uint32_t st0, st1;

  vector1b = ((const int32_t *)filter_x0)[0];
  vector2b = ((const int32_t *)filter_x0)[1];
  vector3b = ((const int32_t *)filter_x0)[2];
  vector4b = ((const int32_t *)filter_x0)[3];

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    vp9_prefetch_load(src + src_stride);
    vp9_prefetch_load(src + src_stride + 32);
    vp9_prefetch_store(dst + dst_stride);

    __asm__ __volatile__ (
        "ulw %[tp1], 0(%[src]) \n\t"
        "ulw %[tp2], 4(%[src]) \n\t"

        /* even 1. pixel */
        "mtlo %[vector4a], $ac3 \n\t"
        "mthi $zero, $ac3 \n\t"
        "mtlo %[vector4a], $ac2 \n\t"
        "mthi $zero, $ac2 \n\t"
        "preceu.ph.qbr %[p1], %[tp1] \n\t"
        "preceu.ph.qbl %[p2], %[tp1] \n\t"
        "preceu.ph.qbr %[p3], %[tp2] \n\t"
        "preceu.ph.qbl %[p4], %[tp2] \n\t"
        "ulw %[tn2], 8(%[src]) \n\t"
        "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
        "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
        "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
        "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
        "extp %[Temp1], $ac3, 31 \n\t"

        /* even 2. pixel */
        "preceu.ph.qbr %[p1], %[tn2] \n\t"
        "preceu.ph.qbl %[n1], %[tn2] \n\t"
        "ulw %[tn1], 12(%[src]) \n\t"
        "dpa.w.ph $ac2, %[p2], %[vector1b] \n\t"
        "dpa.w.ph $ac2, %[p3], %[vector2b] \n\t"
        "dpa.w.ph $ac2, %[p4], %[vector3b] \n\t"
        "dpa.w.ph $ac2, %[p1], %[vector4b] \n\t"
        "extp %[Temp3], $ac2, 31 \n\t"

        /* even 3. pixel */
        "lbux %[st0], %[Temp1](%[cm]) \n\t"
        "mtlo %[vector4a], $ac1 \n\t"
        "mthi $zero, $ac1 \n\t"
        "preceu.ph.qbr %[p2], %[tn1] \n\t"
        "dpa.w.ph $ac1, %[p3], %[vector1b] \n\t"
        "dpa.w.ph $ac1, %[p4], %[vector2b] \n\t"
        "dpa.w.ph $ac1, %[p1], %[vector3b] \n\t"
        "dpa.w.ph $ac1, %[n1], %[vector4b] \n\t"
        "extp %[Temp1], $ac1, 31 \n\t"

        /* even 4. pixel */
        "mtlo %[vector4a], $ac2 \n\t"
        "mthi $zero, $ac2 \n\t"
        "mtlo %[vector4a], $ac3 \n\t"
        "mthi $zero, $ac3 \n\t"
        "sb %[st0], 0(%[dst]) \n\t"
        "lbux %[st1], %[Temp3](%[cm]) \n\t"

        "balign %[tn3], %[tn1], 3 \n\t"
        "balign %[tn1], %[tn2], 3 \n\t"
        "balign %[tn2], %[tp2], 3 \n\t"
        "balign %[tp2], %[tp1], 3 \n\t"

        "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
        "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t"
        "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
        "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
        "extp %[Temp3], $ac2, 31 \n\t"

        "lbux %[st0], %[Temp1](%[cm]) \n\t"

        /* odd 1. pixel */
        "mtlo %[vector4a], $ac1 \n\t"
        "mthi $zero, $ac1 \n\t"
        "sb %[st1], 2(%[dst]) \n\t"
        "preceu.ph.qbr %[p1], %[tp2] \n\t"
        "preceu.ph.qbl %[p2], %[tp2] \n\t"
        "preceu.ph.qbr %[p3], %[tn2] \n\t"
        "preceu.ph.qbl %[p4], %[tn2] \n\t"
        "sb %[st0], 4(%[dst]) \n\t"
        "dpa.w.ph $ac3, %[p1], %[vector1b] \n\t"
        "dpa.w.ph $ac3, %[p2], %[vector2b] \n\t"
        "dpa.w.ph $ac3, %[p3], %[vector3b] \n\t"
        "dpa.w.ph $ac3, %[p4], %[vector4b] \n\t"
        "extp %[Temp2], $ac3, 31 \n\t"

        /* odd 2. pixel */
        "mtlo %[vector4a], $ac3 \n\t"
        "mthi $zero, $ac3 \n\t"
        "mtlo %[vector4a], $ac2 \n\t"
        "mthi $zero, $ac2 \n\t"
        "preceu.ph.qbr %[p1], %[tn1] \n\t"
        "preceu.ph.qbl %[n1], %[tn1] \n\t"
        "lbux %[st0], %[Temp3](%[cm]) \n\t"
        "dpa.w.ph $ac1, %[p2], %[vector1b] \n\t"
        "dpa.w.ph $ac1, %[p3], %[vector2b] \n\t"
        "dpa.w.ph $ac1, %[p4], %[vector3b] \n\t"
        "dpa.w.ph $ac1, %[p1], %[vector4b] \n\t"
        "extp %[Temp3], $ac1, 31 \n\t"

        /* odd 3. pixel */
        "lbux %[st1], %[Temp2](%[cm]) \n\t"
        "preceu.ph.qbr %[p2], %[tn3] \n\t"
        "dpa.w.ph $ac3, %[p3], %[vector1b] \n\t"
        "dpa.w.ph $ac3, %[p4], %[vector2b] \n\t"
        "dpa.w.ph $ac3, %[p1], %[vector3b] \n\t"
        "dpa.w.ph $ac3, %[n1], %[vector4b] \n\t"
        "extp %[Temp2], $ac3, 31 \n\t"

        /* odd 4. pixel */
        "sb %[st1], 1(%[dst]) \n\t"
        "sb %[st0], 6(%[dst]) \n\t"
        "dpa.w.ph $ac2, %[p4], %[vector1b] \n\t"
        "dpa.w.ph $ac2, %[p1], %[vector2b] \n\t"
        "dpa.w.ph $ac2, %[n1], %[vector3b] \n\t"
        "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
        "extp %[Temp1], $ac2, 31 \n\t"

        /* clamp */
        "lbux %[p4], %[Temp3](%[cm]) \n\t"
        "lbux %[p2], %[Temp2](%[cm]) \n\t"
        "lbux %[n1], %[Temp1](%[cm]) \n\t"

        /* store bytes */
        "sb %[p4], 3(%[dst]) \n\t"
        "sb %[p2], 5(%[dst]) \n\t"
        "sb %[n1], 7(%[dst]) \n\t"

        : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
          [tn1] "=&r" (tn1), [tn2] "=&r" (tn2), [tn3] "=&r" (tn3),
          [st0] "=&r" (st0), [st1] "=&r" (st1),
          [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
          [n1] "=&r" (n1),
          [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
        : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
          [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
          [vector4a] "r" (vector4a),
          [cm] "r" (cm), [dst] "r" (dst), [src] "r" (src)
    );

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}

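/* 8-tap horizontal filter for wide blocks: each inner iteration produces 16
 * output pixels (eight even phases at even dst offsets, eight odd phases at
 * odd offsets); 'count' selects how many 16-pixel groups make up a row
 * (1 for w == 16, 2 for w == 32). */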
static void convolve_horiz_16_dspr2(const uint8_t *src_ptr,
                                    int32_t src_stride,
                                    uint8_t *dst_ptr,
                                    int32_t dst_stride,
                                    const int16_t *filter_x0,
                                    int32_t h,
                                    int32_t count) {
  int32_t y, c;
  const uint8_t *src;
  uint8_t *dst;
  uint8_t *cm = vp9_ff_cropTbl;
  uint32_t vector_64 = 64;
  int32_t filter12, filter34, filter56, filter78;
  int32_t Temp1, Temp2, Temp3;
  uint32_t qload1, qload2, qload3;
  uint32_t p1, p2, p3, p4, p5;
  uint32_t st1, st2, st3;

  filter12 = ((const int32_t *)filter_x0)[0];
  filter34 = ((const int32_t *)filter_x0)[1];
  filter56 = ((const int32_t *)filter_x0)[2];
  filter78 = ((const int32_t *)filter_x0)[3];

  for (y = h; y--;) {
    src = src_ptr;
    dst = dst_ptr;

    /* prefetch data to cache memory */
    vp9_prefetch_load(src_ptr + src_stride);
    vp9_prefetch_load(src_ptr + src_stride + 32);
    vp9_prefetch_store(dst_ptr + dst_stride);

    for (c = 0; c < count; c++) {
      __asm__ __volatile__ (
          "ulw %[qload1], 0(%[src]) \n\t"
          "ulw %[qload2], 4(%[src]) \n\t"

          /* even 1. pixel */
          "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
          "mthi $zero, $ac1 \n\t"
          "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
          "mthi $zero, $ac2 \n\t"
          "preceu.ph.qbr %[p1], %[qload1] \n\t"
          "preceu.ph.qbl %[p2], %[qload1] \n\t"
          "preceu.ph.qbr %[p3], %[qload2] \n\t"
          "preceu.ph.qbl %[p4], %[qload2] \n\t"
          "ulw %[qload3], 8(%[src]) \n\t"
          "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */
          "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */
          "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */
          "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */
          "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */

          /* even 2. pixel */
          "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
          "mthi $zero, $ac3 \n\t"
          "preceu.ph.qbr %[p1], %[qload3] \n\t"
          "preceu.ph.qbl %[p5], %[qload3] \n\t"
          "ulw %[qload1], 12(%[src]) \n\t"
          "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */
          "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */
          "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */
          "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */
          "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
          "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */

          /* even 3. pixel */
          "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
          "mthi $zero, $ac1 \n\t"
          "preceu.ph.qbr %[p2], %[qload1] \n\t"
          "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
          "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */
          "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */
          "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */
          "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */
          "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
          "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */

          /* even 4. pixel */
          "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
          "mthi $zero, $ac2 \n\t"
          "preceu.ph.qbl %[p3], %[qload1] \n\t"
          "sb %[st2], 2(%[dst]) \n\t" /* even 1 */
          "ulw %[qload2], 16(%[src]) \n\t"
          "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */
          "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */
          "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */
          "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */
          "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
          "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */

          /* even 5. pixel */
          "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
          "mthi $zero, $ac3 \n\t"
          "preceu.ph.qbr %[p4], %[qload2] \n\t"
          "sb %[st3], 4(%[dst]) \n\t" /* even 3 */
          "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */
          "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */
          "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */
          "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */
          "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
          "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */

          /* even 6. pixel */
          "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
          "mthi $zero, $ac1 \n\t"
          "preceu.ph.qbl %[p1], %[qload2] \n\t"
          "sb %[st1], 6(%[dst]) \n\t" /* even 4 */
          "ulw %[qload3], 20(%[src]) \n\t"
          "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */
          "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */
          "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */
          "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */
          "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
          "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */

          /* even 7. pixel */
          "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
          "mthi $zero, $ac2 \n\t"
          "preceu.ph.qbr %[p5], %[qload3] \n\t"
          "sb %[st2], 8(%[dst]) \n\t" /* even 5 */
          "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */
          "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */
          "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */
          "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */
          "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
          "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */

          /* even 8. pixel */
          "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
          "mthi $zero, $ac3 \n\t"
          "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */
          "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */
          "sb %[st3], 10(%[dst]) \n\t" /* even 6 */
          "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */
          "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */
          "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
          "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */

          /* ODD pixels */
          "ulw %[qload1], 1(%[src]) \n\t"
          "ulw %[qload2], 5(%[src]) \n\t"

          /* odd 1. pixel */
          "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
          "mthi $zero, $ac1 \n\t"
          "preceu.ph.qbr %[p1], %[qload1] \n\t"
          "preceu.ph.qbl %[p2], %[qload1] \n\t"
          "preceu.ph.qbr %[p3], %[qload2] \n\t"
          "preceu.ph.qbl %[p4], %[qload2] \n\t"
          "sb %[st1], 12(%[dst]) \n\t" /* even 7 */
          "ulw %[qload3], 9(%[src]) \n\t"
          "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */
          "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */
          "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */
          "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */
          "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
          "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */

          /* odd 2. pixel */
          "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
          "mthi $zero, $ac2 \n\t"
          "preceu.ph.qbr %[p1], %[qload3] \n\t"
          "preceu.ph.qbl %[p5], %[qload3] \n\t"
          "sb %[st2], 14(%[dst]) \n\t" /* even 8 */
          "ulw %[qload1], 13(%[src]) \n\t"
          "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */
          "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */
          "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */
          "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */
          "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
          "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */

          /* odd 3. pixel */
          "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
          "mthi $zero, $ac3 \n\t"
          "preceu.ph.qbr %[p2], %[qload1] \n\t"
          "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */
          "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */
          "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */
          "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */
          "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */
          "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
          "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */

          /* odd 4. pixel */
          "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
          "mthi $zero, $ac1 \n\t"
          "preceu.ph.qbl %[p3], %[qload1] \n\t"
          "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */
          "ulw %[qload2], 17(%[src]) \n\t"
          "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */
          "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */
          "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */
          "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */
          "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
          "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */

          /* odd 5. pixel */
          "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
          "mthi $zero, $ac2 \n\t"
          "preceu.ph.qbr %[p4], %[qload2] \n\t"
          "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */
          "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */
          "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */
          "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */
          "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */
          "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
          "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */

          /* odd 6. pixel */
          "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
          "mthi $zero, $ac3 \n\t"
          "preceu.ph.qbl %[p1], %[qload2] \n\t"
          "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */
          "ulw %[qload3], 21(%[src]) \n\t"
          "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */
          "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */
          "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */
          "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */
          "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
          "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */

          /* odd 7. pixel */
          "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
          "mthi $zero, $ac1 \n\t"
          "preceu.ph.qbr %[p5], %[qload3] \n\t"
          "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */
          "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */
          "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */
          "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */
          "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */
          "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */

          /* odd 8. pixel */
          "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */
          "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */
          "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */
          "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */
          "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */

          "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
          "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
          "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */

          "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */
          "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */
          "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */

          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2),
            [qload3] "=&r" (qload3),
            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
            [p5] "=&r" (p5),
            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
          : [filter12] "r" (filter12), [filter34] "r" (filter34),
            [filter56] "r" (filter56), [filter78] "r" (filter78),
            [vector_64] "r" (vector_64),
            [cm] "r" (cm), [dst] "r" (dst),
            [src] "r" (src)
      );

      src += 16;
      dst += 16;
    }

    /* Next row... */
    src_ptr += src_stride;
    dst_ptr += dst_stride;
  }
}

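/* Variant of the 16-wide loop above with a fixed four inner iterations
 * (64 output pixels per row) and additional prefetching to cover the wider
 * rows. */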
static void convolve_horiz_64_dspr2(const uint8_t *src_ptr,
                                    int32_t src_stride,
                                    uint8_t *dst_ptr,
                                    int32_t dst_stride,
                                    const int16_t *filter_x0,
                                    int32_t h) {
  int32_t y, c;
  const uint8_t *src;
  uint8_t *dst;
  uint8_t *cm = vp9_ff_cropTbl;
  uint32_t vector_64 = 64;
  int32_t filter12, filter34, filter56, filter78;
  int32_t Temp1, Temp2, Temp3;
  uint32_t qload1, qload2, qload3;
  uint32_t p1, p2, p3, p4, p5;
  uint32_t st1, st2, st3;

  filter12 = ((const int32_t *)filter_x0)[0];
  filter34 = ((const int32_t *)filter_x0)[1];
  filter56 = ((const int32_t *)filter_x0)[2];
  filter78 = ((const int32_t *)filter_x0)[3];

  for (y = h; y--;) {
    src = src_ptr;
    dst = dst_ptr;

    /* prefetch data to cache memory */
    vp9_prefetch_load(src_ptr + src_stride);
    vp9_prefetch_load(src_ptr + src_stride + 32);
    vp9_prefetch_load(src_ptr + src_stride + 64);
    vp9_prefetch_store(dst_ptr + dst_stride);
    vp9_prefetch_store(dst_ptr + dst_stride + 32);

    for (c = 0; c < 4; c++) {
      __asm__ __volatile__ (
          "ulw %[qload1], 0(%[src]) \n\t"
          "ulw %[qload2], 4(%[src]) \n\t"

          /* even 1. pixel */
          "mtlo %[vector_64], $ac1 \n\t" /* even 1 */
          "mthi $zero, $ac1 \n\t"
          "mtlo %[vector_64], $ac2 \n\t" /* even 2 */
          "mthi $zero, $ac2 \n\t"
          "preceu.ph.qbr %[p1], %[qload1] \n\t"
          "preceu.ph.qbl %[p2], %[qload1] \n\t"
          "preceu.ph.qbr %[p3], %[qload2] \n\t"
          "preceu.ph.qbl %[p4], %[qload2] \n\t"
          "ulw %[qload3], 8(%[src]) \n\t"
          "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* even 1 */
          "dpa.w.ph $ac1, %[p2], %[filter34] \n\t" /* even 1 */
          "dpa.w.ph $ac1, %[p3], %[filter56] \n\t" /* even 1 */
          "dpa.w.ph $ac1, %[p4], %[filter78] \n\t" /* even 1 */
          "extp %[Temp1], $ac1, 31 \n\t" /* even 1 */

          /* even 2. pixel */
          "mtlo %[vector_64], $ac3 \n\t" /* even 3 */
          "mthi $zero, $ac3 \n\t"
          "preceu.ph.qbr %[p1], %[qload3] \n\t"
          "preceu.ph.qbl %[p5], %[qload3] \n\t"
          "ulw %[qload1], 12(%[src]) \n\t"
          "dpa.w.ph $ac2, %[p2], %[filter12] \n\t" /* even 1 */
          "dpa.w.ph $ac2, %[p3], %[filter34] \n\t" /* even 1 */
          "dpa.w.ph $ac2, %[p4], %[filter56] \n\t" /* even 1 */
          "dpa.w.ph $ac2, %[p1], %[filter78] \n\t" /* even 1 */
          "extp %[Temp2], $ac2, 31 \n\t" /* even 1 */
          "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 1 */

          /* even 3. pixel */
          "mtlo %[vector_64], $ac1 \n\t" /* even 4 */
          "mthi $zero, $ac1 \n\t"
          "preceu.ph.qbr %[p2], %[qload1] \n\t"
          "sb %[st1], 0(%[dst]) \n\t" /* even 1 */
          "dpa.w.ph $ac3, %[p3], %[filter12] \n\t" /* even 3 */
          "dpa.w.ph $ac3, %[p4], %[filter34] \n\t" /* even 3 */
          "dpa.w.ph $ac3, %[p1], %[filter56] \n\t" /* even 3 */
          "dpa.w.ph $ac3, %[p5], %[filter78] \n\t" /* even 3 */
          "extp %[Temp3], $ac3, 31 \n\t" /* even 3 */
          "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 1 */

          /* even 4. pixel */
          "mtlo %[vector_64], $ac2 \n\t" /* even 5 */
          "mthi $zero, $ac2 \n\t"
          "preceu.ph.qbl %[p3], %[qload1] \n\t"
          "sb %[st2], 2(%[dst]) \n\t" /* even 1 */
          "ulw %[qload2], 16(%[src]) \n\t"
          "dpa.w.ph $ac1, %[p4], %[filter12] \n\t" /* even 4 */
          "dpa.w.ph $ac1, %[p1], %[filter34] \n\t" /* even 4 */
          "dpa.w.ph $ac1, %[p5], %[filter56] \n\t" /* even 4 */
          "dpa.w.ph $ac1, %[p2], %[filter78] \n\t" /* even 4 */
          "extp %[Temp1], $ac1, 31 \n\t" /* even 4 */
          "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 3 */

          /* even 5. pixel */
          "mtlo %[vector_64], $ac3 \n\t" /* even 6 */
          "mthi $zero, $ac3 \n\t"
          "preceu.ph.qbr %[p4], %[qload2] \n\t"
          "sb %[st3], 4(%[dst]) \n\t" /* even 3 */
          "dpa.w.ph $ac2, %[p1], %[filter12] \n\t" /* even 5 */
          "dpa.w.ph $ac2, %[p5], %[filter34] \n\t" /* even 5 */
          "dpa.w.ph $ac2, %[p2], %[filter56] \n\t" /* even 5 */
          "dpa.w.ph $ac2, %[p3], %[filter78] \n\t" /* even 5 */
          "extp %[Temp2], $ac2, 31 \n\t" /* even 5 */
          "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 4 */

          /* even 6. pixel */
          "mtlo %[vector_64], $ac1 \n\t" /* even 7 */
          "mthi $zero, $ac1 \n\t"
          "preceu.ph.qbl %[p1], %[qload2] \n\t"
          "sb %[st1], 6(%[dst]) \n\t" /* even 4 */
          "ulw %[qload3], 20(%[src]) \n\t"
          "dpa.w.ph $ac3, %[p5], %[filter12] \n\t" /* even 6 */
          "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* even 6 */
          "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* even 6 */
          "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* even 6 */
          "extp %[Temp3], $ac3, 31 \n\t" /* even 6 */
          "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 5 */

          /* even 7. pixel */
          "mtlo %[vector_64], $ac2 \n\t" /* even 8 */
          "mthi $zero, $ac2 \n\t"
          "preceu.ph.qbr %[p5], %[qload3] \n\t"
          "sb %[st2], 8(%[dst]) \n\t" /* even 5 */
          "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* even 7 */
          "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* even 7 */
          "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* even 7 */
          "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* even 7 */
          "extp %[Temp1], $ac1, 31 \n\t" /* even 7 */
          "lbux %[st3], %[Temp3](%[cm]) \n\t" /* even 6 */

          /* even 8. pixel */
          "mtlo %[vector_64], $ac3 \n\t" /* odd 1 */
          "mthi $zero, $ac3 \n\t"
          "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* even 8 */
          "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* even 8 */
          "sb %[st3], 10(%[dst]) \n\t" /* even 6 */
          "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* even 8 */
          "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* even 8 */
          "extp %[Temp2], $ac2, 31 \n\t" /* even 8 */
          "lbux %[st1], %[Temp1](%[cm]) \n\t" /* even 7 */

          /* ODD pixels */
          "ulw %[qload1], 1(%[src]) \n\t"
          "ulw %[qload2], 5(%[src]) \n\t"

          /* odd 1. pixel */
          "mtlo %[vector_64], $ac1 \n\t" /* odd 2 */
          "mthi $zero, $ac1 \n\t"
          "preceu.ph.qbr %[p1], %[qload1] \n\t"
          "preceu.ph.qbl %[p2], %[qload1] \n\t"
          "preceu.ph.qbr %[p3], %[qload2] \n\t"
          "preceu.ph.qbl %[p4], %[qload2] \n\t"
          "sb %[st1], 12(%[dst]) \n\t" /* even 7 */
          "ulw %[qload3], 9(%[src]) \n\t"
          "dpa.w.ph $ac3, %[p1], %[filter12] \n\t" /* odd 1 */
          "dpa.w.ph $ac3, %[p2], %[filter34] \n\t" /* odd 1 */
          "dpa.w.ph $ac3, %[p3], %[filter56] \n\t" /* odd 1 */
          "dpa.w.ph $ac3, %[p4], %[filter78] \n\t" /* odd 1 */
          "extp %[Temp3], $ac3, 31 \n\t" /* odd 1 */
          "lbux %[st2], %[Temp2](%[cm]) \n\t" /* even 8 */

          /* odd 2. pixel */
          "mtlo %[vector_64], $ac2 \n\t" /* odd 3 */
          "mthi $zero, $ac2 \n\t"
          "preceu.ph.qbr %[p1], %[qload3] \n\t"
          "preceu.ph.qbl %[p5], %[qload3] \n\t"
          "sb %[st2], 14(%[dst]) \n\t" /* even 8 */
          "ulw %[qload1], 13(%[src]) \n\t"
          "dpa.w.ph $ac1, %[p2], %[filter12] \n\t" /* odd 2 */
          "dpa.w.ph $ac1, %[p3], %[filter34] \n\t" /* odd 2 */
          "dpa.w.ph $ac1, %[p4], %[filter56] \n\t" /* odd 2 */
          "dpa.w.ph $ac1, %[p1], %[filter78] \n\t" /* odd 2 */
          "extp %[Temp1], $ac1, 31 \n\t" /* odd 2 */
          "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 1 */

          /* odd 3. pixel */
          "mtlo %[vector_64], $ac3 \n\t" /* odd 4 */
          "mthi $zero, $ac3 \n\t"
          "preceu.ph.qbr %[p2], %[qload1] \n\t"
          "sb %[st3], 1(%[dst]) \n\t" /* odd 1 */
          "dpa.w.ph $ac2, %[p3], %[filter12] \n\t" /* odd 3 */
          "dpa.w.ph $ac2, %[p4], %[filter34] \n\t" /* odd 3 */
          "dpa.w.ph $ac2, %[p1], %[filter56] \n\t" /* odd 3 */
          "dpa.w.ph $ac2, %[p5], %[filter78] \n\t" /* odd 3 */
          "extp %[Temp2], $ac2, 31 \n\t" /* odd 3 */
          "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 2 */

          /* odd 4. pixel */
          "mtlo %[vector_64], $ac1 \n\t" /* odd 5 */
          "mthi $zero, $ac1 \n\t"
          "preceu.ph.qbl %[p3], %[qload1] \n\t"
          "sb %[st1], 3(%[dst]) \n\t" /* odd 2 */
          "ulw %[qload2], 17(%[src]) \n\t"
          "dpa.w.ph $ac3, %[p4], %[filter12] \n\t" /* odd 4 */
          "dpa.w.ph $ac3, %[p1], %[filter34] \n\t" /* odd 4 */
          "dpa.w.ph $ac3, %[p5], %[filter56] \n\t" /* odd 4 */
          "dpa.w.ph $ac3, %[p2], %[filter78] \n\t" /* odd 4 */
          "extp %[Temp3], $ac3, 31 \n\t" /* odd 4 */
          "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 3 */

          /* odd 5. pixel */
          "mtlo %[vector_64], $ac2 \n\t" /* odd 6 */
          "mthi $zero, $ac2 \n\t"
          "preceu.ph.qbr %[p4], %[qload2] \n\t"
          "sb %[st2], 5(%[dst]) \n\t" /* odd 3 */
          "dpa.w.ph $ac1, %[p1], %[filter12] \n\t" /* odd 5 */
          "dpa.w.ph $ac1, %[p5], %[filter34] \n\t" /* odd 5 */
          "dpa.w.ph $ac1, %[p2], %[filter56] \n\t" /* odd 5 */
          "dpa.w.ph $ac1, %[p3], %[filter78] \n\t" /* odd 5 */
          "extp %[Temp1], $ac1, 31 \n\t" /* odd 5 */
          "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 4 */

          /* odd 6. pixel */
          "mtlo %[vector_64], $ac3 \n\t" /* odd 7 */
          "mthi $zero, $ac3 \n\t"
          "preceu.ph.qbl %[p1], %[qload2] \n\t"
          "sb %[st3], 7(%[dst]) \n\t" /* odd 4 */
          "ulw %[qload3], 21(%[src]) \n\t"
          "dpa.w.ph $ac2, %[p5], %[filter12] \n\t" /* odd 6 */
          "dpa.w.ph $ac2, %[p2], %[filter34] \n\t" /* odd 6 */
          "dpa.w.ph $ac2, %[p3], %[filter56] \n\t" /* odd 6 */
          "dpa.w.ph $ac2, %[p4], %[filter78] \n\t" /* odd 6 */
          "extp %[Temp2], $ac2, 31 \n\t" /* odd 6 */
          "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 5 */

          /* odd 7. pixel */
          "mtlo %[vector_64], $ac1 \n\t" /* odd 8 */
          "mthi $zero, $ac1 \n\t"
          "preceu.ph.qbr %[p5], %[qload3] \n\t"
          "sb %[st1], 9(%[dst]) \n\t" /* odd 5 */
          "dpa.w.ph $ac3, %[p2], %[filter12] \n\t" /* odd 7 */
          "dpa.w.ph $ac3, %[p3], %[filter34] \n\t" /* odd 7 */
          "dpa.w.ph $ac3, %[p4], %[filter56] \n\t" /* odd 7 */
          "dpa.w.ph $ac3, %[p1], %[filter78] \n\t" /* odd 7 */
          "extp %[Temp3], $ac3, 31 \n\t" /* odd 7 */

          /* odd 8. pixel */
          "dpa.w.ph $ac1, %[p3], %[filter12] \n\t" /* odd 8 */
          "dpa.w.ph $ac1, %[p4], %[filter34] \n\t" /* odd 8 */
          "dpa.w.ph $ac1, %[p1], %[filter56] \n\t" /* odd 8 */
          "dpa.w.ph $ac1, %[p5], %[filter78] \n\t" /* odd 8 */
          "extp %[Temp1], $ac1, 31 \n\t" /* odd 8 */

          "lbux %[st2], %[Temp2](%[cm]) \n\t" /* odd 6 */
          "lbux %[st3], %[Temp3](%[cm]) \n\t" /* odd 7 */
          "lbux %[st1], %[Temp1](%[cm]) \n\t" /* odd 8 */

          "sb %[st2], 11(%[dst]) \n\t" /* odd 6 */
          "sb %[st3], 13(%[dst]) \n\t" /* odd 7 */
          "sb %[st1], 15(%[dst]) \n\t" /* odd 8 */

          : [qload1] "=&r" (qload1), [qload2] "=&r" (qload2),
            [qload3] "=&r" (qload3),
            [st1] "=&r" (st1), [st2] "=&r" (st2), [st3] "=&r" (st3),
            [p1] "=&r" (p1), [p2] "=&r" (p2), [p3] "=&r" (p3), [p4] "=&r" (p4),
            [p5] "=&r" (p5),
            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2), [Temp3] "=&r" (Temp3)
          : [filter12] "r" (filter12), [filter34] "r" (filter34),
            [filter56] "r" (filter56), [filter78] "r" (filter78),
            [vector_64] "r" (vector_64),
            [cm] "r" (cm), [dst] "r" (dst),
            [src] "r" (src)
      );

      src += 16;
      dst += 16;
    }

    /* Next row... */
    src_ptr += src_stride;
    dst_ptr += dst_stride;
  }
}

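/* Dispatcher for the horizontal 8-tap convolution. The packed-tap checks
 * below route the identity filter (which, on this little-endian target, shows
 * up as the 0x800000 word holding taps 2/3) to a plain copy, and an all-zero
 * first tap pair to the 2-tap path. Otherwise, when x_step_q4 == 16 (no
 * horizontal scaling), the DSPControl extract position is programmed and one
 * of the DSPr2 kernels above is selected by width; anything else falls back
 * to the C implementation. */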
void vp9_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int x_step_q4,
                               const int16_t *filter_y, int y_step_q4,
                               int w, int h) {
  if (((const int32_t *)filter_x)[1] == 0x800000) {
    vp9_convolve_copy(src, src_stride,
                      dst, dst_stride,
                      filter_x, x_step_q4,
                      filter_y, y_step_q4,
                      w, h);
  } else if (((const int32_t *)filter_x)[0] == 0) {
    vp9_convolve2_horiz_dspr2(src, src_stride,
                              dst, dst_stride,
                              filter_x, x_step_q4,
                              filter_y, y_step_q4,
                              w, h);
  } else {
    if (16 == x_step_q4) {
      uint32_t pos = 38;

      vp9_prefetch_load((const uint8_t *)filter_x);
      src -= 3;

      /* bit position for extract from acc */
      __asm__ __volatile__ (
          "wrdsp %[pos], 1 \n\t"
          :
          : [pos] "r" (pos)
      );

      /* prefetch data to cache memory */
      vp9_prefetch_load(src);
      vp9_prefetch_load(src + 32);
      vp9_prefetch_store(dst);

      switch (w) {
        case 4:
          convolve_horiz_4_dspr2(src, (int32_t)src_stride,
                                 dst, (int32_t)dst_stride,
                                 filter_x, (int32_t)h);
          break;
        case 8:
          convolve_horiz_8_dspr2(src, (int32_t)src_stride,
                                 dst, (int32_t)dst_stride,
                                 filter_x, (int32_t)h);
          break;
        case 16:
          convolve_horiz_16_dspr2(src, (int32_t)src_stride,
                                  dst, (int32_t)dst_stride,
                                  filter_x, (int32_t)h, 1);
          break;
        case 32:
          convolve_horiz_16_dspr2(src, (int32_t)src_stride,
                                  dst, (int32_t)dst_stride,
                                  filter_x, (int32_t)h, 2);
          break;
        case 64:
          vp9_prefetch_load(src + 64);
          vp9_prefetch_store(dst + 32);

          convolve_horiz_64_dspr2(src, (int32_t)src_stride,
                                  dst, (int32_t)dst_stride,
                                  filter_x, (int32_t)h);
          break;
        default:
          vp9_convolve8_horiz_c(src + 3, src_stride,
                                dst, dst_stride,
                                filter_x, x_step_q4,
                                filter_y, y_step_q4,
                                w, h);
          break;
      }
    } else {
      vp9_convolve8_horiz_c(src, src_stride,
                            dst, dst_stride,
                            filter_x, x_step_q4,
                            filter_y, y_step_q4,
                            w, h);
    }
  }
}
#endif