/*
 * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
 */

#include <assert.h>
#include <stdio.h>

#include "./vpx_config.h"
#include "./vp9_rtcd.h"
#include "vp9/common/vp9_common.h"
#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"
#include "vp9/common/vp9_convolve.h"
#include "vp9/common/mips/dspr2/vp9_common_dspr2.h"

#if HAVE_DSPR2
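/*
 * 8-tap vertical convolve-and-average for block widths 4..32. The four
 * DSPr2 accumulators ($ac0..$ac3) each build the 8-tap dot product for one
 * of the 4 pixels produced per inner-loop pass; the filter is read as four
 * packed halfword pairs (vector1b..vector4b), the accumulator seed of 64
 * together with the "extp ..., 31" extract implements the round-and-shift
 * by FILTER_BITS (7), the crop-table lookup (lbux through cm) clips to
 * 8 bits, and addqh_r.w forms the rounded average with the pixel already
 * in dst.
 *
 * Roughly equivalent scalar C for one output row (illustrative sketch
 * only; "src" here is the pointer after the src -= 3 * src_stride
 * adjustment below):
 *
 *   for (x = 0; x < w; ++x) {
 *     int sum = 0;
 *     for (k = 0; k < 8; ++k)
 *       sum += src[k * src_stride + x] * filter_y[k];
 *     sum = clip_pixel(ROUND_POWER_OF_TWO(sum, 7));
 *     dst[x] = ROUND_POWER_OF_TWO(dst[x] + sum, 1);
 *   }
 */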
static void convolve_avg_vert_4_dspr2(const uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int16_t *filter_y,
                                      int32_t w,
                                      int32_t h) {
  int32_t x, y;
  const uint8_t *src_ptr;
  uint8_t *dst_ptr;
  uint8_t *cm = vp9_ff_cropTbl;
  uint32_t vector4a = 64;
  uint32_t load1, load2, load3, load4;
  uint32_t p1, p2;
  uint32_t n1, n2;
  uint32_t scratch1, scratch2;
  uint32_t store1, store2;
  int32_t vector1b, vector2b, vector3b, vector4b;
  int32_t Temp1, Temp2;

  vector1b = ((const int32_t *)filter_y)[0];
  vector2b = ((const int32_t *)filter_y)[1];
  vector3b = ((const int32_t *)filter_y)[2];
  vector4b = ((const int32_t *)filter_y)[3];

  src -= 3 * src_stride;

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    vp9_prefetch_store(dst + dst_stride);

    for (x = 0; x < w; x += 4) {
      src_ptr = src + x;
      dst_ptr = dst + x;

      __asm__ __volatile__ (
          "ulw %[load1], 0(%[src_ptr]) \n\t"
          "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
          "ulw %[load2], 0(%[src_ptr]) \n\t"
          "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
          "ulw %[load3], 0(%[src_ptr]) \n\t"
          "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
          "ulw %[load4], 0(%[src_ptr]) \n\t"

          "mtlo %[vector4a], $ac0 \n\t"
          "mtlo %[vector4a], $ac1 \n\t"
          "mtlo %[vector4a], $ac2 \n\t"
          "mtlo %[vector4a], $ac3 \n\t"
          "mthi $zero, $ac0 \n\t"
          "mthi $zero, $ac1 \n\t"
          "mthi $zero, $ac2 \n\t"
          "mthi $zero, $ac3 \n\t"

          "preceu.ph.qbr %[scratch1], %[load1] \n\t"
          "preceu.ph.qbr %[p1], %[load2] \n\t"
          "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
          "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
          "preceu.ph.qbr %[scratch2], %[load3] \n\t"
          "preceu.ph.qbr %[p2], %[load4] \n\t"
          "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
          "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */

          "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t"
          "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t"
          "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t"
          "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t"

          "preceu.ph.qbl %[scratch1], %[load1] \n\t"
          "preceu.ph.qbl %[p1], %[load2] \n\t"
          "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
          "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
          "preceu.ph.qbl %[scratch2], %[load3] \n\t"
          "preceu.ph.qbl %[p2], %[load4] \n\t"
          "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
          "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */

          "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t"
          "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t"
          "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
          "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"

          "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
          "ulw %[load1], 0(%[src_ptr]) \n\t"
          "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
          "ulw %[load2], 0(%[src_ptr]) \n\t"
          "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
          "ulw %[load3], 0(%[src_ptr]) \n\t"
          "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
          "ulw %[load4], 0(%[src_ptr]) \n\t"

          "preceu.ph.qbr %[scratch1], %[load1] \n\t"
          "preceu.ph.qbr %[p1], %[load2] \n\t"
          "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
          "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
          "preceu.ph.qbr %[scratch2], %[load3] \n\t"
          "preceu.ph.qbr %[p2], %[load4] \n\t"
          "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
          "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */

          "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t"
          "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t"
          "extp %[Temp1], $ac0, 31 \n\t"
          "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t"
          "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t"
          "extp %[Temp2], $ac1, 31 \n\t"

          "preceu.ph.qbl %[scratch1], %[load1] \n\t"
          "preceu.ph.qbl %[p1], %[load2] \n\t"
          "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
          "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
          "lbu %[scratch1], 0(%[dst_ptr]) \n\t"
          "preceu.ph.qbl %[scratch2], %[load3] \n\t"
          "preceu.ph.qbl %[p2], %[load4] \n\t"
          "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
          "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
          "lbu %[scratch2], 1(%[dst_ptr]) \n\t"

          "lbux %[store1], %[Temp1](%[cm]) \n\t"
          "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t"
          "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
          "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */
          "extp %[Temp1], $ac2, 31 \n\t"

          "lbux %[store2], %[Temp2](%[cm]) \n\t"
          "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t"
          "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t"
          "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */
          "extp %[Temp2], $ac3, 31 \n\t"
          "lbu %[scratch1], 2(%[dst_ptr]) \n\t"

          "sb %[store1], 0(%[dst_ptr]) \n\t"
          "sb %[store2], 1(%[dst_ptr]) \n\t"
          "lbu %[scratch2], 3(%[dst_ptr]) \n\t"

          "lbux %[store1], %[Temp1](%[cm]) \n\t"
          "lbux %[store2], %[Temp2](%[cm]) \n\t"
          "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */
          "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */

          "sb %[store1], 2(%[dst_ptr]) \n\t"
          "sb %[store2], 3(%[dst_ptr]) \n\t"

          : [load1] "=&r" (load1), [load2] "=&r" (load2),
            [load3] "=&r" (load3), [load4] "=&r" (load4),
            [p1] "=&r" (p1), [p2] "=&r" (p2), [n1] "=&r" (n1), [n2] "=&r" (n2),
            [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
            [store1] "=&r" (store1), [store2] "=&r" (store2),
            [src_ptr] "+r" (src_ptr)
          : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
            [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
            [vector4a] "r" (vector4a),
            [src_stride] "r" (src_stride), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
      );
    }

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}

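/*
 * Width-64 variant of convolve_avg_vert_4_dspr2: the same filtering and
 * averaging with the inner loop fixed at 64 pixels per row and an extra
 * vp9_prefetch_store() covering the second half of each destination row.
 */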
static void convolve_avg_vert_64_dspr2(const uint8_t *src,
                                       int32_t src_stride,
                                       uint8_t *dst,
                                       int32_t dst_stride,
                                       const int16_t *filter_y,
                                       int32_t h) {
  int32_t x, y;
  const uint8_t *src_ptr;
  uint8_t *dst_ptr;
  uint8_t *cm = vp9_ff_cropTbl;
  uint32_t vector4a = 64;
  uint32_t load1, load2, load3, load4;
  uint32_t p1, p2;
  uint32_t n1, n2;
  uint32_t scratch1, scratch2;
  uint32_t store1, store2;
  int32_t vector1b, vector2b, vector3b, vector4b;
  int32_t Temp1, Temp2;

  vector1b = ((const int32_t *)filter_y)[0];
  vector2b = ((const int32_t *)filter_y)[1];
  vector3b = ((const int32_t *)filter_y)[2];
  vector4b = ((const int32_t *)filter_y)[3];

  src -= 3 * src_stride;

  for (y = h; y--;) {
    /* prefetch data to cache memory */
    vp9_prefetch_store(dst + dst_stride);
    vp9_prefetch_store(dst + dst_stride + 32);

    for (x = 0; x < 64; x += 4) {
      src_ptr = src + x;
      dst_ptr = dst + x;

      __asm__ __volatile__ (
          "ulw %[load1], 0(%[src_ptr]) \n\t"
          "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
          "ulw %[load2], 0(%[src_ptr]) \n\t"
          "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
          "ulw %[load3], 0(%[src_ptr]) \n\t"
          "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
          "ulw %[load4], 0(%[src_ptr]) \n\t"

          "mtlo %[vector4a], $ac0 \n\t"
          "mtlo %[vector4a], $ac1 \n\t"
          "mtlo %[vector4a], $ac2 \n\t"
          "mtlo %[vector4a], $ac3 \n\t"
          "mthi $zero, $ac0 \n\t"
          "mthi $zero, $ac1 \n\t"
          "mthi $zero, $ac2 \n\t"
          "mthi $zero, $ac3 \n\t"

          "preceu.ph.qbr %[scratch1], %[load1] \n\t"
          "preceu.ph.qbr %[p1], %[load2] \n\t"
          "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
          "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
          "preceu.ph.qbr %[scratch2], %[load3] \n\t"
          "preceu.ph.qbr %[p2], %[load4] \n\t"
          "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
          "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */

          "dpa.w.ph $ac0, %[p1], %[vector1b] \n\t"
          "dpa.w.ph $ac0, %[p2], %[vector2b] \n\t"
          "dpa.w.ph $ac1, %[n1], %[vector1b] \n\t"
          "dpa.w.ph $ac1, %[n2], %[vector2b] \n\t"

          "preceu.ph.qbl %[scratch1], %[load1] \n\t"
          "preceu.ph.qbl %[p1], %[load2] \n\t"
          "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
          "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
          "preceu.ph.qbl %[scratch2], %[load3] \n\t"
          "preceu.ph.qbl %[p2], %[load4] \n\t"
          "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
          "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */

          "dpa.w.ph $ac2, %[p1], %[vector1b] \n\t"
          "dpa.w.ph $ac2, %[p2], %[vector2b] \n\t"
          "dpa.w.ph $ac3, %[n1], %[vector1b] \n\t"
          "dpa.w.ph $ac3, %[n2], %[vector2b] \n\t"

          "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
          "ulw %[load1], 0(%[src_ptr]) \n\t"
          "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
          "ulw %[load2], 0(%[src_ptr]) \n\t"
          "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
          "ulw %[load3], 0(%[src_ptr]) \n\t"
          "add %[src_ptr], %[src_ptr], %[src_stride] \n\t"
          "ulw %[load4], 0(%[src_ptr]) \n\t"

          "preceu.ph.qbr %[scratch1], %[load1] \n\t"
          "preceu.ph.qbr %[p1], %[load2] \n\t"
          "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
          "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
          "preceu.ph.qbr %[scratch2], %[load3] \n\t"
          "preceu.ph.qbr %[p2], %[load4] \n\t"
          "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
          "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */

          "dpa.w.ph $ac0, %[p1], %[vector3b] \n\t"
          "dpa.w.ph $ac0, %[p2], %[vector4b] \n\t"
          "extp %[Temp1], $ac0, 31 \n\t"
          "dpa.w.ph $ac1, %[n1], %[vector3b] \n\t"
          "dpa.w.ph $ac1, %[n2], %[vector4b] \n\t"
          "extp %[Temp2], $ac1, 31 \n\t"

          "preceu.ph.qbl %[scratch1], %[load1] \n\t"
          "preceu.ph.qbl %[p1], %[load2] \n\t"
          "precrq.ph.w %[n1], %[p1], %[scratch1] \n\t" /* pixel 2 */
          "append %[p1], %[scratch1], 16 \n\t" /* pixel 1 */
          "lbu %[scratch1], 0(%[dst_ptr]) \n\t"
          "preceu.ph.qbl %[scratch2], %[load3] \n\t"
          "preceu.ph.qbl %[p2], %[load4] \n\t"
          "precrq.ph.w %[n2], %[p2], %[scratch2] \n\t" /* pixel 2 */
          "append %[p2], %[scratch2], 16 \n\t" /* pixel 1 */
          "lbu %[scratch2], 1(%[dst_ptr]) \n\t"

          "lbux %[store1], %[Temp1](%[cm]) \n\t"
          "dpa.w.ph $ac2, %[p1], %[vector3b] \n\t"
          "dpa.w.ph $ac2, %[p2], %[vector4b] \n\t"
          "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 1 */
          "extp %[Temp1], $ac2, 31 \n\t"

          "lbux %[store2], %[Temp2](%[cm]) \n\t"
          "dpa.w.ph $ac3, %[n1], %[vector3b] \n\t"
          "dpa.w.ph $ac3, %[n2], %[vector4b] \n\t"
          "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 2 */
          "extp %[Temp2], $ac3, 31 \n\t"
          "lbu %[scratch1], 2(%[dst_ptr]) \n\t"

          "sb %[store1], 0(%[dst_ptr]) \n\t"
          "sb %[store2], 1(%[dst_ptr]) \n\t"
          "lbu %[scratch2], 3(%[dst_ptr]) \n\t"

          "lbux %[store1], %[Temp1](%[cm]) \n\t"
          "lbux %[store2], %[Temp2](%[cm]) \n\t"
          "addqh_r.w %[store1], %[store1], %[scratch1] \n\t" /* pixel 3 */
          "addqh_r.w %[store2], %[store2], %[scratch2] \n\t" /* pixel 4 */

          "sb %[store1], 2(%[dst_ptr]) \n\t"
          "sb %[store2], 3(%[dst_ptr]) \n\t"

          : [load1] "=&r" (load1), [load2] "=&r" (load2),
            [load3] "=&r" (load3), [load4] "=&r" (load4),
            [p1] "=&r" (p1), [p2] "=&r" (p2), [n1] "=&r" (n1), [n2] "=&r" (n2),
            [scratch1] "=&r" (scratch1), [scratch2] "=&r" (scratch2),
            [Temp1] "=&r" (Temp1), [Temp2] "=&r" (Temp2),
            [store1] "=&r" (store1), [store2] "=&r" (store2),
            [src_ptr] "+r" (src_ptr)
          : [vector1b] "r" (vector1b), [vector2b] "r" (vector2b),
            [vector3b] "r" (vector3b), [vector4b] "r" (vector4b),
            [vector4a] "r" (vector4a),
            [src_stride] "r" (src_stride), [cm] "r" (cm), [dst_ptr] "r" (dst_ptr)
      );
    }

    /* Next row... */
    src += src_stride;
    dst += dst_stride;
  }
}

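/*
 * Dispatch for the vertical convolve-and-average. The packed-halfword
 * tests on filter_y pick a cheaper path when possible: 0x800000 in the
 * second 32-bit word means (on little-endian MIPS) tap 2 == 0 and
 * tap 3 == 128, i.e. effectively the identity filter, so plain
 * vp9_convolve_avg() suffices; a zero first word means taps 0 and 1 are
 * zero and the short-kernel vp9_convolve2_avg_vert_dspr2() handles it.
 * Otherwise, for the unscaled case (y_step_q4 == 16), wrdsp sets the
 * accumulator extract position so that the "extp ..., 31" in the workers
 * above returns the filtered sum shifted right by FILTER_BITS.
 */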
void vp9_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                                  uint8_t *dst, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int x_step_q4,
                                  const int16_t *filter_y, int y_step_q4,
                                  int w, int h) {
  if (((const int32_t *)filter_y)[1] == 0x800000) {
    vp9_convolve_avg(src, src_stride,
                     dst, dst_stride,
                     filter_x, x_step_q4,
                     filter_y, y_step_q4,
                     w, h);
  } else if (((const int32_t *)filter_y)[0] == 0) {
    vp9_convolve2_avg_vert_dspr2(src, src_stride,
                                 dst, dst_stride,
                                 filter_x, x_step_q4,
                                 filter_y, y_step_q4,
                                 w, h);
  } else {
    if (16 == y_step_q4) {
      uint32_t pos = 38;

      /* bit position for extract from acc */
      __asm__ __volatile__ (
        "wrdsp %[pos], 1 \n\t"
        :
        : [pos] "r" (pos)
      );

      vp9_prefetch_store(dst);

      switch (w) {
        case 4:
        case 8:
        case 16:
        case 32:
          convolve_avg_vert_4_dspr2(src, src_stride,
                                    dst, dst_stride,
                                    filter_y, w, h);
          break;
        case 64:
          vp9_prefetch_store(dst + 32);
          convolve_avg_vert_64_dspr2(src, src_stride,
                                     dst, dst_stride,
                                     filter_y, h);
          break;
        default:
          vp9_convolve8_avg_vert_c(src, src_stride,
                                   dst, dst_stride,
                                   filter_x, x_step_q4,
                                   filter_y, y_step_q4,
                                   w, h);
          break;
      }
    } else {
      vp9_convolve8_avg_vert_c(src, src_stride,
                               dst, dst_stride,
                               filter_x, x_step_q4,
                               filter_y, y_step_q4,
                               w, h);
    }
  }
}

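/*
 * Two-pass 2-D convolve-and-average: the horizontal pass filters into a
 * fixed 64 x 135 intermediate buffer, starting 3 rows above the block so
 * the vertical filter has its full 8-row support, and the vertical pass
 * then filters from row 3 of that buffer and averages the result into
 * dst. Only the unscaled case (x_step_q4 == y_step_q4 == 16) is handled
 * here; anything else falls back to vp9_convolve8_avg_c().
 */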
void vp9_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x, int x_step_q4,
                             const int16_t *filter_y, int y_step_q4,
                             int w, int h) {
  /* Fixed size intermediate buffer places limits on parameters. */
  DECLARE_ALIGNED_ARRAY(32, uint8_t, temp, 64 * 135);
  int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7;

  assert(w <= 64);
  assert(h <= 64);

  if (intermediate_height < h)
    intermediate_height = h;

  if (x_step_q4 != 16 || y_step_q4 != 16)
    return vp9_convolve8_avg_c(src, src_stride,
                               dst, dst_stride,
                               filter_x, x_step_q4,
                               filter_y, y_step_q4,
                               w, h);

  vp9_convolve8_horiz(src - (src_stride * 3), src_stride,
                      temp, 64,
                      filter_x, x_step_q4,
                      filter_y, y_step_q4,
                      w, intermediate_height);

  vp9_convolve8_avg_vert(temp + 64 * 3, 64,
                         dst, dst_stride,
                         filter_x, x_step_q4,
                         filter_y, y_step_q4,
                         w, h);
}

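/*
 * Plain average of src into dst (the filter arguments are unused). For
 * the common block widths each row is processed a word at a time with
 * adduh_r.qb, which rounds and averages four unsigned bytes in parallel;
 * other widths fall through to the scalar loop in the default case,
 * which computes dst[x] = (dst[x] + src[x] + 1) >> 1.
 */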
void vp9_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const int16_t *filter_x, int filter_x_stride,
                            const int16_t *filter_y, int filter_y_stride,
                            int w, int h) {
  int x, y;
  uint32_t tp1, tp2, tn1;
  uint32_t tp3, tp4, tn2;

  /* prefetch data to cache memory */
  vp9_prefetch_load(src);
  vp9_prefetch_load(src + 32);
  vp9_prefetch_store(dst);

  switch (w) {
    case 4:
      /* 1 word storage */
      for (y = h; y--; ) {
        vp9_prefetch_load(src + src_stride);
        vp9_prefetch_load(src + src_stride + 32);
        vp9_prefetch_store(dst + dst_stride);

        __asm__ __volatile__ (
            "ulw %[tp1], 0(%[src]) \n\t"
            "ulw %[tp2], 0(%[dst]) \n\t"
            "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
            "sw %[tn1], 0(%[dst]) \n\t" /* store */

            : [tn1] "=&r" (tn1), [tp1] "=&r" (tp1),
              [tp2] "=&r" (tp2)
            : [src] "r" (src), [dst] "r" (dst)
        );

        src += src_stride;
        dst += dst_stride;
      }
      break;
    case 8:
      /* 2 word storage */
      for (y = h; y--; ) {
        vp9_prefetch_load(src + src_stride);
        vp9_prefetch_load(src + src_stride + 32);
        vp9_prefetch_store(dst + dst_stride);

        __asm__ __volatile__ (
            "ulw %[tp1], 0(%[src]) \n\t"
            "ulw %[tp2], 0(%[dst]) \n\t"
            "ulw %[tp3], 4(%[src]) \n\t"
            "ulw %[tp4], 4(%[dst]) \n\t"
            "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
            "sw %[tn1], 0(%[dst]) \n\t" /* store */
            "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
            "sw %[tn2], 4(%[dst]) \n\t" /* store */

            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
              [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
            : [src] "r" (src), [dst] "r" (dst)
        );

        src += src_stride;
        dst += dst_stride;
      }
      break;
    case 16:
      /* 4 word storage */
      for (y = h; y--; ) {
        vp9_prefetch_load(src + src_stride);
        vp9_prefetch_load(src + src_stride + 32);
        vp9_prefetch_store(dst + dst_stride);

        __asm__ __volatile__ (
            "ulw %[tp1], 0(%[src]) \n\t"
            "ulw %[tp2], 0(%[dst]) \n\t"
            "ulw %[tp3], 4(%[src]) \n\t"
            "ulw %[tp4], 4(%[dst]) \n\t"
            "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
            "ulw %[tp1], 8(%[src]) \n\t"
            "ulw %[tp2], 8(%[dst]) \n\t"
            "sw %[tn1], 0(%[dst]) \n\t" /* store */
            "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
            "sw %[tn2], 4(%[dst]) \n\t" /* store */
            "ulw %[tp3], 12(%[src]) \n\t"
            "ulw %[tp4], 12(%[dst]) \n\t"
            "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
            "sw %[tn1], 8(%[dst]) \n\t" /* store */
            "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
            "sw %[tn2], 12(%[dst]) \n\t" /* store */

            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
              [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
            : [src] "r" (src), [dst] "r" (dst)
        );

        src += src_stride;
        dst += dst_stride;
      }
      break;
    case 32:
      /* 8 word storage */
      for (y = h; y--; ) {
        vp9_prefetch_load(src + src_stride);
        vp9_prefetch_load(src + src_stride + 32);
        vp9_prefetch_store(dst + dst_stride);

        __asm__ __volatile__ (
            "ulw %[tp1], 0(%[src]) \n\t"
            "ulw %[tp2], 0(%[dst]) \n\t"
            "ulw %[tp3], 4(%[src]) \n\t"
            "ulw %[tp4], 4(%[dst]) \n\t"
            "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
            "ulw %[tp1], 8(%[src]) \n\t"
            "ulw %[tp2], 8(%[dst]) \n\t"
            "sw %[tn1], 0(%[dst]) \n\t" /* store */
            "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
            "sw %[tn2], 4(%[dst]) \n\t" /* store */
            "ulw %[tp3], 12(%[src]) \n\t"
            "ulw %[tp4], 12(%[dst]) \n\t"
            "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
            "ulw %[tp1], 16(%[src]) \n\t"
            "ulw %[tp2], 16(%[dst]) \n\t"
            "sw %[tn1], 8(%[dst]) \n\t" /* store */
            "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
            "sw %[tn2], 12(%[dst]) \n\t" /* store */
            "ulw %[tp3], 20(%[src]) \n\t"
            "ulw %[tp4], 20(%[dst]) \n\t"
            "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
            "ulw %[tp1], 24(%[src]) \n\t"
            "ulw %[tp2], 24(%[dst]) \n\t"
            "sw %[tn1], 16(%[dst]) \n\t" /* store */
            "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
            "sw %[tn2], 20(%[dst]) \n\t" /* store */
            "ulw %[tp3], 28(%[src]) \n\t"
            "ulw %[tp4], 28(%[dst]) \n\t"
            "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
            "sw %[tn1], 24(%[dst]) \n\t" /* store */
            "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
            "sw %[tn2], 28(%[dst]) \n\t" /* store */

            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
              [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
            : [src] "r" (src), [dst] "r" (dst)
        );

        src += src_stride;
        dst += dst_stride;
      }
      break;
    case 64:
      vp9_prefetch_load(src + 64);
      vp9_prefetch_store(dst + 32);

      /* 16 word storage */
      for (y = h; y--; ) {
        vp9_prefetch_load(src + src_stride);
        vp9_prefetch_load(src + src_stride + 32);
        vp9_prefetch_load(src + src_stride + 64);
        vp9_prefetch_store(dst + dst_stride);
        vp9_prefetch_store(dst + dst_stride + 32);

        __asm__ __volatile__ (
            "ulw %[tp1], 0(%[src]) \n\t"
            "ulw %[tp2], 0(%[dst]) \n\t"
            "ulw %[tp3], 4(%[src]) \n\t"
            "ulw %[tp4], 4(%[dst]) \n\t"
            "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
            "ulw %[tp1], 8(%[src]) \n\t"
            "ulw %[tp2], 8(%[dst]) \n\t"
            "sw %[tn1], 0(%[dst]) \n\t" /* store */
            "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
            "sw %[tn2], 4(%[dst]) \n\t" /* store */
            "ulw %[tp3], 12(%[src]) \n\t"
            "ulw %[tp4], 12(%[dst]) \n\t"
            "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
            "ulw %[tp1], 16(%[src]) \n\t"
            "ulw %[tp2], 16(%[dst]) \n\t"
            "sw %[tn1], 8(%[dst]) \n\t" /* store */
            "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
            "sw %[tn2], 12(%[dst]) \n\t" /* store */
            "ulw %[tp3], 20(%[src]) \n\t"
            "ulw %[tp4], 20(%[dst]) \n\t"
            "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
            "ulw %[tp1], 24(%[src]) \n\t"
            "ulw %[tp2], 24(%[dst]) \n\t"
            "sw %[tn1], 16(%[dst]) \n\t" /* store */
            "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
            "sw %[tn2], 20(%[dst]) \n\t" /* store */
            "ulw %[tp3], 28(%[src]) \n\t"
            "ulw %[tp4], 28(%[dst]) \n\t"
            "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
            "ulw %[tp1], 32(%[src]) \n\t"
            "ulw %[tp2], 32(%[dst]) \n\t"
            "sw %[tn1], 24(%[dst]) \n\t" /* store */
            "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
            "sw %[tn2], 28(%[dst]) \n\t" /* store */
            "ulw %[tp3], 36(%[src]) \n\t"
            "ulw %[tp4], 36(%[dst]) \n\t"
            "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
            "ulw %[tp1], 40(%[src]) \n\t"
            "ulw %[tp2], 40(%[dst]) \n\t"
            "sw %[tn1], 32(%[dst]) \n\t" /* store */
            "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
            "sw %[tn2], 36(%[dst]) \n\t" /* store */
            "ulw %[tp3], 44(%[src]) \n\t"
            "ulw %[tp4], 44(%[dst]) \n\t"
            "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
            "ulw %[tp1], 48(%[src]) \n\t"
            "ulw %[tp2], 48(%[dst]) \n\t"
            "sw %[tn1], 40(%[dst]) \n\t" /* store */
            "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
            "sw %[tn2], 44(%[dst]) \n\t" /* store */
            "ulw %[tp3], 52(%[src]) \n\t"
            "ulw %[tp4], 52(%[dst]) \n\t"
            "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
            "ulw %[tp1], 56(%[src]) \n\t"
            "ulw %[tp2], 56(%[dst]) \n\t"
            "sw %[tn1], 48(%[dst]) \n\t" /* store */
            "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
            "sw %[tn2], 52(%[dst]) \n\t" /* store */
            "ulw %[tp3], 60(%[src]) \n\t"
            "ulw %[tp4], 60(%[dst]) \n\t"
            "adduh_r.qb %[tn1], %[tp2], %[tp1] \n\t" /* average */
            "sw %[tn1], 56(%[dst]) \n\t" /* store */
            "adduh_r.qb %[tn2], %[tp3], %[tp4] \n\t" /* average */
            "sw %[tn2], 60(%[dst]) \n\t" /* store */

            : [tp1] "=&r" (tp1), [tp2] "=&r" (tp2),
              [tp3] "=&r" (tp3), [tp4] "=&r" (tp4),
              [tn1] "=&r" (tn1), [tn2] "=&r" (tn2)
            : [src] "r" (src), [dst] "r" (dst)
        );

        src += src_stride;
        dst += dst_stride;
      }
      break;
    default:
      for (y = h; y > 0; --y) {
        for (x = 0; x < w; ++x) {
          dst[x] = (dst[x] + src[x] + 1) >> 1;
        }

        src += src_stride;
        dst += dst_stride;
      }
      break;
  }
}
#endif