1 // Copyright 2014 Google Inc. All Rights Reserved.
2 //
3 // Use of this source code is governed by a BSD-style license
4 // that can be found in the COPYING file in the root of the source
5 // tree. An additional intellectual property rights grant can be found
6 // in the file PATENTS. All contributing project authors may
7 // be found in the AUTHORS file in the root of the source tree.
8 // -----------------------------------------------------------------------------
9 //
10 // MIPS version of dsp functions
11 //
12 // Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
13 // Jovan Zelincevic (jovan.zelincevic@imgtec.com)
14
15 #include "./dsp.h"
16
17 #if defined(WEBP_USE_MIPS_DSP_R2)
18
19 #include "./mips_macro.h"
20
// Fixed-point multipliers for the inverse transforms below, scaled by 2^16.
// kC1 carries an extra (1 << 16), so with an arithmetic shift MUL(x, kC1)
// works out to x + ((x * 20091) >> 16).
21 static const int kC1 = 20091 + (1 << 16);
22 static const int kC2 = 35468;
23
// MUL(a, b): product of a and b shifted right by 16 bits (b is expected to be
// one of the 2^16-scaled constants above).
24 #define MUL(a, b) (((a) * (b)) >> 16)
25
// DC-only inverse transform: only in[0] is read from 'in'.
// The asm computes (in[0] + 4) >> 3, duplicates it into both 16-bit halves of
// temp5, and passes that value together with four words taken from 'dst'
// (row offsets 0..3, row stride BPS) to the helper macros below.
// NOTE(review): LOAD_WITH_OFFSET_X4, CONVERT_2_BYTES_TO_HALF, STORE_SAT_SUM_X2
// and OUTPUT_EARLY_CLOBBER_REGS_10 are defined in mips_macro.h, which is not
// visible here. Presumably they load the 4x4 dst block, widen its bytes to
// halfwords, add the DC value with saturation and store the rows back --
// confirm against that header.
TransformDC(const int16_t * in,uint8_t * dst)26 static void TransformDC(const int16_t* in, uint8_t* dst) {
27 int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9, temp10;
28
29 __asm__ volatile (
30 LOAD_WITH_OFFSET_X4(temp1, temp2, temp3, temp4, dst,
31 0, 0, 0, 0,
32 0, 1, 2, 3,
33 BPS)
    // temp5 = in[0] (signed 16-bit load), then + 4 as rounding bias.
34 "lh %[temp5], 0(%[in]) \n\t"
35 "addiu %[temp5], %[temp5], 4 \n\t"
    // Copy the low halfword of temp5 into its upper halfword, then shift both
    // halfwords right (arithmetic) by 3.
36 "ins %[temp5], %[temp5], 16, 16 \n\t"
37 "shra.ph %[temp5], %[temp5], 3 \n\t"
38 CONVERT_2_BYTES_TO_HALF(temp6, temp7, temp8, temp9, temp10, temp1, temp2,
39 temp3, temp1, temp2, temp3, temp4)
40 STORE_SAT_SUM_X2(temp6, temp7, temp8, temp9, temp10, temp1, temp2, temp3,
41 temp5, temp5, temp5, temp5, temp5, temp5, temp5, temp5,
42 dst, 0, 1, 2, 3, BPS)
43
44 OUTPUT_EARLY_CLOBBER_REGS_10()
45 : [in]"r"(in), [dst]"r"(dst)
46 : "memory"
47 );
48 }
49
// Inverse transform for blocks where only in[0], in[1] and in[4] are used
// (no other coefficient is read here).
// The C prologue computes a = in[0] + 4 and the kC1/kC2 products of in[1] and
// in[4]; the asm then packs and replicates those values into paired halfwords
// and combines them with four words loaded from 'dst' (rows 0..3, stride BPS).
// 'c4' is deliberately non-const: it is a read-write asm operand ("+&r") and
// receives d4 in its upper halfword.
// NOTE(review): ADD_SUB_HALVES, SHIFT_R_SUM_X2, LOAD_WITH_OFFSET_X4,
// CONVERT_2_BYTES_TO_HALF, PACK_2_HALVES_TO_WORD, STORE_SAT_SUM_X2 and
// OUTPUT_EARLY_CLOBBER_REGS_18 come from mips_macro.h (not visible here);
// their exact semantics should be confirmed against that header.
TransformAC3(const int16_t * in,uint8_t * dst)50 static void TransformAC3(const int16_t* in, uint8_t* dst) {
51 const int a = in[0] + 4;
52 int c4 = MUL(in[4], kC2);
53 const int d4 = MUL(in[4], kC1);
54 const int c1 = MUL(in[1], kC2);
55 const int d1 = MUL(in[1], kC1);
56 int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
57 int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;
58
59 __asm__ volatile (
    // c4 = (d4 << 16) | (c4 & 0xffff): both in[4] products in one register.
60 "ins %[c4], %[d4], 16, 16 \n\t"
    // Replicate a and d1 into both halfwords of temp1 / temp4.
61 "replv.ph %[temp1], %[a] \n\t"
62 "replv.ph %[temp4], %[d1] \n\t"
63 ADD_SUB_HALVES(temp2, temp3, temp1, c4)
    // Replicate c1 into both halfwords of temp5.
64 "replv.ph %[temp5], %[c1] \n\t"
65 SHIFT_R_SUM_X2(temp1, temp6, temp7, temp8, temp2, temp9, temp10, temp4,
66 temp2, temp2, temp3, temp3, temp4, temp5, temp4, temp5)
67 LOAD_WITH_OFFSET_X4(temp3, temp5, temp11, temp12, dst,
68 0, 0, 0, 0,
69 0, 1, 2, 3,
70 BPS)
71 CONVERT_2_BYTES_TO_HALF(temp13, temp14, temp3, temp15, temp5, temp16,
72 temp11, temp17, temp3, temp5, temp11, temp12)
73 PACK_2_HALVES_TO_WORD(temp12, temp18, temp7, temp6, temp1, temp8, temp2,
74 temp4, temp7, temp6, temp10, temp9)
75 STORE_SAT_SUM_X2(temp13, temp14, temp3, temp15, temp5, temp16, temp11,
76 temp17, temp12, temp18, temp1, temp8, temp2, temp4,
77 temp7, temp6, dst, 0, 1, 2, 3, BPS)
78
79 OUTPUT_EARLY_CLOBBER_REGS_18(),
80 [c4]"+&r"(c4)
81 : [dst]"r"(dst), [a]"r"(a), [d1]"r"(d1), [d4]"r"(d4), [c1]"r"(c1)
82 : "memory"
83 );
84 }
85
// Full inverse transform of one block of coefficients at 'in', with the
// result combined into four rows of 'dst' (row offsets 0..3, stride BPS).
// The code runs a first pass over the coefficients, then the pass marked
// "horizontal" below, which also adds the rounding constant 4 (repl.ph 0x4).
// Coefficients are fetched with unaligned word loads at byte offsets 0, 4, 16
// and 20 of 'in' and through LOAD_IN_X2 at offsets 8..14 and 24..30.
// kC1 and kC2 are passed in as register operands, and the "hi"/"lo" clobbers
// indicate that the multiply/accumulate registers are used.
// NOTE(review): presumably the multiplications happen inside MUL_SHIFT_SUM.
// That macro, LOAD_IN_X2, ADD_SUB_HALVES, INSERT_HALF_X2, SRA_16 and the
// other helpers are defined in mips_macro.h, which is not visible here --
// confirm their semantics there.
TransformOne(const int16_t * in,uint8_t * dst)86 static void TransformOne(const int16_t* in, uint8_t* dst) {
87 int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
88 int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;
89
90 __asm__ volatile (
91 "ulw %[temp1], 0(%[in]) \n\t"
92 "ulw %[temp2], 16(%[in]) \n\t"
93 LOAD_IN_X2(temp5, temp6, 24, 26)
94 ADD_SUB_HALVES(temp3, temp4, temp1, temp2)
95 LOAD_IN_X2(temp1, temp2, 8, 10)
96 MUL_SHIFT_SUM(temp7, temp8, temp9, temp10, temp11, temp12, temp13, temp14,
97 temp10, temp8, temp9, temp7, temp1, temp2, temp5, temp6,
98 temp13, temp11, temp14, temp12)
99 INSERT_HALF_X2(temp8, temp7, temp10, temp9)
100 "ulw %[temp17], 4(%[in]) \n\t"
101 "ulw %[temp18], 20(%[in]) \n\t"
102 ADD_SUB_HALVES(temp1, temp2, temp3, temp8)
103 ADD_SUB_HALVES(temp5, temp6, temp4, temp7)
104 ADD_SUB_HALVES(temp7, temp8, temp17, temp18)
105 LOAD_IN_X2(temp17, temp18, 12, 14)
106 LOAD_IN_X2(temp9, temp10, 28, 30)
107 MUL_SHIFT_SUM(temp11, temp12, temp13, temp14, temp15, temp16, temp4, temp17,
108 temp12, temp14, temp11, temp13, temp17, temp18, temp9, temp10,
109 temp15, temp4, temp16, temp17)
110 INSERT_HALF_X2(temp11, temp12, temp13, temp14)
111 ADD_SUB_HALVES(temp17, temp8, temp8, temp11)
112 ADD_SUB_HALVES(temp3, temp4, temp7, temp12)
113
114 // horizontal
115 SRA_16(temp9, temp10, temp11, temp12, temp1, temp2, temp5, temp6)
116 INSERT_HALF_X2(temp1, temp6, temp5, temp2)
117 SRA_16(temp13, temp14, temp15, temp16, temp3, temp4, temp17, temp8)
    // temp2 = 4 in both halfwords; added to temp1 and temp6 just below.
118 "repl.ph %[temp2], 0x4 \n\t"
119 INSERT_HALF_X2(temp3, temp8, temp17, temp4)
120 "addq.ph %[temp1], %[temp1], %[temp2] \n\t"
121 "addq.ph %[temp6], %[temp6], %[temp2] \n\t"
122 ADD_SUB_HALVES(temp2, temp4, temp1, temp3)
123 ADD_SUB_HALVES(temp5, temp7, temp6, temp8)
124 MUL_SHIFT_SUM(temp1, temp3, temp6, temp8, temp9, temp13, temp17, temp18,
125 temp3, temp13, temp1, temp9, temp9, temp13, temp11, temp15,
126 temp6, temp17, temp8, temp18)
127 MUL_SHIFT_SUM(temp6, temp8, temp18, temp17, temp11, temp15, temp12, temp16,
128 temp8, temp15, temp6, temp11, temp12, temp16, temp10, temp14,
129 temp18, temp12, temp17, temp16)
130 INSERT_HALF_X2(temp1, temp3, temp9, temp13)
131 INSERT_HALF_X2(temp6, temp8, temp11, temp15)
132 SHIFT_R_SUM_X2(temp9, temp10, temp11, temp12, temp13, temp14, temp15,
133 temp16, temp2, temp4, temp5, temp7, temp3, temp1, temp8,
134 temp6)
135 PACK_2_HALVES_TO_WORD(temp1, temp2, temp3, temp4, temp9, temp12, temp13,
136 temp16, temp11, temp10, temp15, temp14)
137 LOAD_WITH_OFFSET_X4(temp10, temp11, temp14, temp15, dst,
138 0, 0, 0, 0,
139 0, 1, 2, 3,
140 BPS)
141 CONVERT_2_BYTES_TO_HALF(temp5, temp6, temp7, temp8, temp17, temp18, temp10,
142 temp11, temp10, temp11, temp14, temp15)
143 STORE_SAT_SUM_X2(temp5, temp6, temp7, temp8, temp17, temp18, temp10, temp11,
144 temp9, temp12, temp1, temp2, temp13, temp16, temp3, temp4,
145 dst, 0, 1, 2, 3, BPS)
146
147 OUTPUT_EARLY_CLOBBER_REGS_18()
148 : [dst]"r"(dst), [in]"r"(in), [kC1]"r"(kC1), [kC2]"r"(kC2)
149 : "memory", "hi", "lo"
150 );
151 }
152
// Runs TransformOne() on the block at 'in' / 'dst'. When 'do_two' is non-zero
// a second block is transformed as well: its coefficients start 16 entries
// further into 'in' and its output starts 4 bytes further into 'dst'.
static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
  const int num_blocks = do_two ? 2 : 1;
  int n;
  for (n = 0; n < num_blocks; ++n) {
    TransformOne(in + 16 * n, dst + 4 * n);
  }
}
159
// In-loop edge filter that rewrites either 2 or 6 pixels per position
// (presumably the origin of the "26" suffix).
//   p          : first position on the edge; advanced by 'vstride' per step.
//   hstride    : distance between the samples taken across the edge. The
//                eight samples p[-4*hstride] .. p[3*hstride] are read.
//   vstride    : distance between consecutive positions along the edge.
//   size       : number of positions. The body runs before 'size' is tested,
//                so at least one position is always processed.
//   thresh     : edge limit; 4*|p0-q0| + |p1-q1| must not exceed 2*thresh+1.
//   ithresh    : interior limit applied to neighbouring-sample differences.
//   hev_thresh : if |p1-p0| or |q1-q0| exceeds it, only p0/q0 are modified;
//                otherwise p2, p1, p0, q0, q1, q2 are all modified.
// Sample naming used in the comments below:
//   p3=p[-4h] p2=p[-3h] p1=p[-2h] p0=p[-h] q0=p[0] q1=p[h] q2=p[2h] q3=p[3h]
// Final pixel values are read from the VP8kclip1 table, indexed by the
// unclamped result.
// NOTE(review): the index can be negative or above 255, so VP8kclip1
// presumably points into the middle of a larger clamp table -- confirm.
// ".set noreorder" is in effect: the instruction after each branch (written
// with a leading space) is its delay slot and runs whether or not the branch
// is taken.
FilterLoop26(uint8_t * p,int hstride,int vstride,int size,int thresh,int ithresh,int hev_thresh)160 static WEBP_INLINE void FilterLoop26(uint8_t* p,
161 int hstride, int vstride, int size,
162 int thresh, int ithresh, int hev_thresh) {
163 const int thresh2 = 2 * thresh + 1;
164 int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
165 int temp10, temp11, temp12, temp13, temp14, temp15;
166
167 __asm__ volatile (
168 ".set push \n\t"
169 ".set noreorder \n\t"
170 "1: \n\t"
    // Build the offsets -h, 2h, -2h, 3h, -3h, -4h and load the eight samples:
    // temp7=q0 temp8=p2 temp9=p1 temp10=p0 temp11=p3 temp12=q1 temp13=q2
    // temp14=q3.
171 "negu %[temp1], %[hstride] \n\t"
172 "addiu %[size], %[size], -1 \n\t"
173 "sll %[temp2], %[hstride], 1 \n\t"
174 "sll %[temp3], %[temp1], 1 \n\t"
175 "addu %[temp4], %[temp2], %[hstride] \n\t"
176 "addu %[temp5], %[temp3], %[temp1] \n\t"
177 "lbu %[temp7], 0(%[p]) \n\t"
178 "sll %[temp6], %[temp3], 1 \n\t"
179 "lbux %[temp8], %[temp5](%[p]) \n\t"
180 "lbux %[temp9], %[temp3](%[p]) \n\t"
181 "lbux %[temp10], %[temp1](%[p]) \n\t"
182 "lbux %[temp11], %[temp6](%[p]) \n\t"
183 "lbux %[temp12], %[hstride](%[p]) \n\t"
184 "lbux %[temp13], %[temp2](%[p]) \n\t"
185 "lbux %[temp14], %[temp4](%[p]) \n\t"
    // Edge test: skip (label 3) if 4*|p0-q0| + |p1-q1| > thresh2.
    // Side products: temp1 = q0-p0, temp6 = 2*(q0-p0), temp2 = p1-q1.
186 "subu %[temp1], %[temp10], %[temp7] \n\t"
187 "subu %[temp2], %[temp9], %[temp12] \n\t"
188 "absq_s.w %[temp3], %[temp1] \n\t"
189 "absq_s.w %[temp4], %[temp2] \n\t"
190 "negu %[temp1], %[temp1] \n\t"
191 "sll %[temp3], %[temp3], 2 \n\t"
192 "addu %[temp15], %[temp3], %[temp4] \n\t"
193 "subu %[temp3], %[temp15], %[thresh2] \n\t"
194 "sll %[temp6], %[temp1], 1 \n\t"
195 "bgtz %[temp3], 3f \n\t"
196 " subu %[temp4], %[temp11], %[temp8] \n\t"
    // Interior tests: skip if any of |p3-p2|, |p2-p1|, |p1-p0|, |q3-q2|,
    // |q2-q1|, |q1-q0| exceeds ithresh. Interleaved with them:
    //   temp2/temp4 = (p1-q1) saturated to a signed 8-bit range,
    //   temp5 = (hev_thresh < |p1-p0|), temp15 = (hev_thresh < |q1-q0|).
197 "absq_s.w %[temp4], %[temp4] \n\t"
198 "shll_s.w %[temp2], %[temp2], 24 \n\t"
199 "subu %[temp4], %[temp4], %[ithresh] \n\t"
200 "bgtz %[temp4], 3f \n\t"
201 " subu %[temp3], %[temp8], %[temp9] \n\t"
202 "absq_s.w %[temp3], %[temp3] \n\t"
203 "subu %[temp3], %[temp3], %[ithresh] \n\t"
204 "bgtz %[temp3], 3f \n\t"
205 " subu %[temp5], %[temp9], %[temp10] \n\t"
206 "absq_s.w %[temp3], %[temp5] \n\t"
207 "absq_s.w %[temp5], %[temp5] \n\t"
208 "subu %[temp3], %[temp3], %[ithresh] \n\t"
209 "bgtz %[temp3], 3f \n\t"
210 " subu %[temp3], %[temp14], %[temp13] \n\t"
211 "absq_s.w %[temp3], %[temp3] \n\t"
212 "slt %[temp5], %[hev_thresh], %[temp5] \n\t"
213 "subu %[temp3], %[temp3], %[ithresh] \n\t"
214 "bgtz %[temp3], 3f \n\t"
215 " subu %[temp3], %[temp13], %[temp12] \n\t"
216 "absq_s.w %[temp3], %[temp3] \n\t"
217 "sra %[temp4], %[temp2], 24 \n\t"
218 "subu %[temp3], %[temp3], %[ithresh] \n\t"
219 "bgtz %[temp3], 3f \n\t"
220 " subu %[temp15], %[temp12], %[temp7] \n\t"
221 "absq_s.w %[temp3], %[temp15] \n\t"
222 "absq_s.w %[temp15], %[temp15] \n\t"
223 "subu %[temp3], %[temp3], %[ithresh] \n\t"
224 "bgtz %[temp3], 3f \n\t"
225 " slt %[temp15], %[hev_thresh], %[temp15] \n\t"
    // temp5 = a = 3*(q0-p0) + sat8(p1-q1); temp2 = high-edge-variance flag.
    // No hev -> label 4 (six-pixel filter). The delay slot already computes
    // temp1 = (a + 4) >> 3 (rounding shift) for the two-pixel path.
226 "addu %[temp3], %[temp6], %[temp1] \n\t"
227 "or %[temp2], %[temp5], %[temp15] \n\t"
228 "addu %[temp5], %[temp4], %[temp3] \n\t"
229 "beqz %[temp2], 4f \n\t"
230 " shra_r.w %[temp1], %[temp5], 3 \n\t"
    // Two-pixel path: temp2 = (a + 3) >> 3; both deltas are saturated to a
    // signed 5-bit range (shll_s.w 27 / sra 27), then
    // p[-h] = clip(p0 + temp2), p[0] = clip(q0 - temp1).
231 "addiu %[temp2], %[temp5], 3 \n\t"
232 "sra %[temp2], %[temp2], 3 \n\t"
233 "shll_s.w %[temp1], %[temp1], 27 \n\t"
234 "shll_s.w %[temp2], %[temp2], 27 \n\t"
235 "subu %[temp3], %[p], %[hstride] \n\t"
236 "sra %[temp1], %[temp1], 27 \n\t"
237 "sra %[temp2], %[temp2], 27 \n\t"
238 "subu %[temp1], %[temp7], %[temp1] \n\t"
239 "addu %[temp2], %[temp10], %[temp2] \n\t"
240 "lbux %[temp2], %[temp2](%[VP8kclip1]) \n\t"
241 "lbux %[temp1], %[temp1](%[VP8kclip1]) \n\t"
242 "sb %[temp2], 0(%[temp3]) \n\t"
243 "j 3f \n\t"
244 " sb %[temp1], 0(%[p]) \n\t"
245 "4: \n\t"
    // Six-pixel path: a is saturated to a signed 8-bit range (temp6), then
    //   temp2 = (9*a + 63) >> 7, temp3 = (18*a + 63) >> 7,
    //   temp4 = (27*a + 63) >> 7
    // are added to p2, p1, p0 and subtracted from q2, q1, q0 respectively.
    // temp14, temp11, temp15, temp10, temp12 become the store addresses
    // p-h, p-2h, p-3h, p+h, p+2h.
246 "shll_s.w %[temp5], %[temp5], 24 \n\t"
247 "subu %[temp14], %[p], %[hstride] \n\t"
248 "subu %[temp11], %[temp14], %[hstride] \n\t"
249 "sra %[temp6], %[temp5], 24 \n\t"
250 "sll %[temp1], %[temp6], 3 \n\t"
251 "subu %[temp15], %[temp11], %[hstride] \n\t"
252 "addu %[temp2], %[temp6], %[temp1] \n\t"
253 "sll %[temp3], %[temp2], 1 \n\t"
254 "addu %[temp4], %[temp3], %[temp2] \n\t"
255 "addiu %[temp2], %[temp2], 63 \n\t"
256 "addiu %[temp3], %[temp3], 63 \n\t"
257 "addiu %[temp4], %[temp4], 63 \n\t"
258 "sra %[temp2], %[temp2], 7 \n\t"
259 "sra %[temp3], %[temp3], 7 \n\t"
260 "sra %[temp4], %[temp4], 7 \n\t"
261 "addu %[temp1], %[temp8], %[temp2] \n\t"
262 "addu %[temp5], %[temp9], %[temp3] \n\t"
263 "addu %[temp6], %[temp10], %[temp4] \n\t"
264 "subu %[temp8], %[temp7], %[temp4] \n\t"
265 "subu %[temp7], %[temp12], %[temp3] \n\t"
266 "addu %[temp10], %[p], %[hstride] \n\t"
267 "subu %[temp9], %[temp13], %[temp2] \n\t"
268 "addu %[temp12], %[temp10], %[hstride] \n\t"
269 "lbux %[temp2], %[temp1](%[VP8kclip1]) \n\t"
270 "lbux %[temp3], %[temp5](%[VP8kclip1]) \n\t"
271 "lbux %[temp4], %[temp6](%[VP8kclip1]) \n\t"
272 "lbux %[temp5], %[temp8](%[VP8kclip1]) \n\t"
273 "lbux %[temp6], %[temp7](%[VP8kclip1]) \n\t"
274 "lbux %[temp8], %[temp9](%[VP8kclip1]) \n\t"
275 "sb %[temp2], 0(%[temp15]) \n\t"
276 "sb %[temp3], 0(%[temp11]) \n\t"
277 "sb %[temp4], 0(%[temp14]) \n\t"
278 "sb %[temp5], 0(%[p]) \n\t"
279 "sb %[temp6], 0(%[temp10]) \n\t"
280 "sb %[temp8], 0(%[temp12]) \n\t"
281 "3: \n\t"
    // Next position: loop while size > 0; p += vstride sits in the delay slot.
282 "bgtz %[size], 1b \n\t"
283 " addu %[p], %[p], %[vstride] \n\t"
284 ".set pop \n\t"
285 : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),[temp3]"=&r"(temp3),
286 [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp6]"=&r"(temp6),
287 [temp7]"=&r"(temp7),[temp8]"=&r"(temp8),[temp9]"=&r"(temp9),
288 [temp10]"=&r"(temp10),[temp11]"=&r"(temp11),[temp12]"=&r"(temp12),
289 [temp13]"=&r"(temp13),[temp14]"=&r"(temp14),[temp15]"=&r"(temp15),
290 [size]"+&r"(size), [p]"+&r"(p)
291 : [hstride]"r"(hstride), [thresh2]"r"(thresh2),
292 [ithresh]"r"(ithresh),[vstride]"r"(vstride), [hev_thresh]"r"(hev_thresh),
293 [VP8kclip1]"r"(VP8kclip1)
294 : "memory"
295 );
296 }
297
// In-loop edge filter that rewrites either 2 or 4 pixels per position
// (presumably the origin of the "24" suffix).
//   p          : first position on the edge; advanced by 'vstride' per step.
//   hstride    : distance between the samples taken across the edge. The
//                eight samples p[-4*hstride] .. p[3*hstride] are read.
//   vstride    : distance between consecutive positions along the edge.
//   size       : number of positions. A negative size skips the loop, but the
//                entry test is "bltz", so size == 0 still runs one iteration.
//   thresh     : edge limit; 4*|p0-q0| + |p1-q1| must not exceed 2*thresh+1.
//   ithresh    : interior limit applied to neighbouring-sample differences.
//   hev_thresh : if |p1-p0| or |q1-q0| exceeds it, only p0/q0 are modified;
//                otherwise p1, p0, q0 and q1 are modified.
// Register names follow the samples they are first loaded with:
//   p3=p[-4h] p2=p[-3h] p1=p[-2h] p0=p[-h] q0=p[0] q1=p[h] q2=p[2h] q3=p[3h]
// but p3 and q3 are later reused as scratch for |p1-p0| and |q1-q0|.
// Final pixel values are read from the VP8kclip1 table.
// NOTE(review): the table index can fall outside 0..255, so VP8kclip1
// presumably points into the middle of a larger clamp table -- confirm.
// ".set noreorder" is in effect: the instruction after each branch (written
// with a leading space) is its delay slot and always executes.
FilterLoop24(uint8_t * p,int hstride,int vstride,int size,int thresh,int ithresh,int hev_thresh)298 static WEBP_INLINE void FilterLoop24(uint8_t* p,
299 int hstride, int vstride, int size,
300 int thresh, int ithresh, int hev_thresh) {
301 int p0, q0, p1, q1, p2, q2, p3, q3;
302 int step1, step2, temp1, temp2, temp3, temp4;
303 uint8_t* pTemp0;
304 uint8_t* pTemp1;
305 const int thresh2 = 2 * thresh + 1;
306
307 __asm__ volatile (
308 ".set push \n\t"
309 ".set noreorder \n\t"
310 "bltz %[size], 3f \n\t"
311 " nop \n\t"
312 "2: \n\t"
    // Load q0, p0, q1, p1 and run the edge test:
    // skip (label 0) if 4*|p0-q0| + |p1-q1| > thresh2.
    // temp1 = p0-q0 and temp3 = p1-q1 are kept for later.
313 "negu %[step1], %[hstride] \n\t"
314 "lbu %[q0], 0(%[p]) \n\t"
315 "lbux %[p0], %[step1](%[p]) \n\t"
316 "subu %[step1], %[step1], %[hstride] \n\t"
317 "lbux %[q1], %[hstride](%[p]) \n\t"
318 "subu %[temp1], %[p0], %[q0] \n\t"
319 "lbux %[p1], %[step1](%[p]) \n\t"
320 "addu %[step2], %[hstride], %[hstride] \n\t"
321 "absq_s.w %[temp2], %[temp1] \n\t"
322 "subu %[temp3], %[p1], %[q1] \n\t"
323 "absq_s.w %[temp4], %[temp3] \n\t"
324 "sll %[temp2], %[temp2], 2 \n\t"
325 "addu %[temp2], %[temp2], %[temp4] \n\t"
326 "subu %[temp4], %[temp2], %[thresh2] \n\t"
327 "subu %[step1], %[step1], %[hstride] \n\t"
328 "bgtz %[temp4], 0f \n\t"
329 " lbux %[p2], %[step1](%[p]) \n\t"
    // Load q2, p3, q3 and run the interior tests: skip if any of |p2-p1|,
    // |p3-p2|, |p1-p0|, |q3-q2|, |q2-q1|, |q1-q0| exceeds ithresh.
    // Interleaved: temp1 becomes q0-p0 and then 3*(q0-p0); pTemp0 = p - h.
330 "subu %[step1], %[step1], %[hstride] \n\t"
331 "lbux %[q2], %[step2](%[p]) \n\t"
332 "lbux %[p3], %[step1](%[p]) \n\t"
333 "subu %[temp4], %[p2], %[p1] \n\t"
334 "addu %[step2], %[step2], %[hstride] \n\t"
335 "subu %[temp2], %[p3], %[p2] \n\t"
336 "absq_s.w %[temp4], %[temp4] \n\t"
337 "absq_s.w %[temp2], %[temp2] \n\t"
338 "lbux %[q3], %[step2](%[p]) \n\t"
339 "subu %[temp4], %[temp4], %[ithresh] \n\t"
340 "negu %[temp1], %[temp1] \n\t"
341 "bgtz %[temp4], 0f \n\t"
342 " subu %[temp2], %[temp2], %[ithresh] \n\t"
343 "subu %[p3], %[p1], %[p0] \n\t"
344 "bgtz %[temp2], 0f \n\t"
345 " absq_s.w %[p3], %[p3] \n\t"
346 "subu %[temp4], %[q3], %[q2] \n\t"
347 "subu %[pTemp0], %[p], %[hstride] \n\t"
348 "absq_s.w %[temp4], %[temp4] \n\t"
349 "subu %[temp2], %[p3], %[ithresh] \n\t"
350 "sll %[step1], %[temp1], 1 \n\t"
351 "bgtz %[temp2], 0f \n\t"
352 " subu %[temp4], %[temp4], %[ithresh] \n\t"
353 "subu %[temp2], %[q2], %[q1] \n\t"
354 "bgtz %[temp4], 0f \n\t"
355 " absq_s.w %[temp2], %[temp2] \n\t"
356 "subu %[q3], %[q1], %[q0] \n\t"
357 "absq_s.w %[q3], %[q3] \n\t"
358 "subu %[temp2], %[temp2], %[ithresh] \n\t"
359 "addu %[temp1], %[temp1], %[step1] \n\t"
360 "bgtz %[temp2], 0f \n\t"
361 " subu %[temp4], %[q3], %[ithresh] \n\t"
362 "slt %[p3], %[hev_thresh], %[p3] \n\t"
363 "bgtz %[temp4], 0f \n\t"
364 " slt %[q3], %[hev_thresh], %[q3] \n\t"
    // q3 = high-edge-variance flag; if set go to label 1 (two-pixel filter).
365 "or %[q3], %[q3], %[p3] \n\t"
366 "bgtz %[q3], 1f \n\t"
367 " shra_r.w %[temp2], %[temp1], 3 \n\t"
    // Four-pixel path, with a = 3*(q0-p0):
    //   temp2 = (a + 4) >> 3 and temp1 = (a + 3) >> 3, both saturated to a
    //   signed 5-bit range; step1 = (temp2 + 1) >> 1.
    //   p[-h] = clip(p0 + temp1), p[0]   = clip(q0 - temp2),
    //   p[h]  = clip(q1 - step1), p[-2h] = clip(p1 + step1).
368 "addiu %[temp1], %[temp1], 3 \n\t"
369 "sra %[temp1], %[temp1], 3 \n\t"
370 "shll_s.w %[temp2], %[temp2], 27 \n\t"
371 "shll_s.w %[temp1], %[temp1], 27 \n\t"
372 "addu %[pTemp1], %[p], %[hstride] \n\t"
373 "sra %[temp2], %[temp2], 27 \n\t"
374 "sra %[temp1], %[temp1], 27 \n\t"
375 "addiu %[step1], %[temp2], 1 \n\t"
376 "sra %[step1], %[step1], 1 \n\t"
377 "addu %[p0], %[p0], %[temp1] \n\t"
378 "addu %[p1], %[p1], %[step1] \n\t"
379 "subu %[q0], %[q0], %[temp2] \n\t"
380 "subu %[q1], %[q1], %[step1] \n\t"
381 "lbux %[temp2], %[p0](%[VP8kclip1]) \n\t"
382 "lbux %[temp3], %[q0](%[VP8kclip1]) \n\t"
383 "lbux %[temp4], %[q1](%[VP8kclip1]) \n\t"
384 "sb %[temp2], 0(%[pTemp0]) \n\t"
385 "lbux %[temp1], %[p1](%[VP8kclip1]) \n\t"
386 "subu %[pTemp0], %[pTemp0], %[hstride] \n\t"
387 "sb %[temp3], 0(%[p]) \n\t"
388 "sb %[temp4], 0(%[pTemp1]) \n\t"
389 "j 0f \n\t"
390 " sb %[temp1], 0(%[pTemp0]) \n\t"
391 "1: \n\t"
    // Two-pixel path, with a = 3*(q0-p0) + sat8(p1-q1):
    //   temp2 = (a + 4) >> 3 and temp1 = (a + 3) >> 3, both saturated to a
    //   signed 5-bit range.
    //   p[0] = clip(q0 - temp2), p[-h] = clip(p0 + temp1).
392 "shll_s.w %[temp3], %[temp3], 24 \n\t"
393 "sra %[temp3], %[temp3], 24 \n\t"
394 "addu %[temp1], %[temp1], %[temp3] \n\t"
395 "shra_r.w %[temp2], %[temp1], 3 \n\t"
396 "addiu %[temp1], %[temp1], 3 \n\t"
397 "shll_s.w %[temp2], %[temp2], 27 \n\t"
398 "sra %[temp1], %[temp1], 3 \n\t"
399 "shll_s.w %[temp1], %[temp1], 27 \n\t"
400 "sra %[temp2], %[temp2], 27 \n\t"
401 "sra %[temp1], %[temp1], 27 \n\t"
402 "addu %[p0], %[p0], %[temp1] \n\t"
403 "subu %[q0], %[q0], %[temp2] \n\t"
404 "lbux %[temp1], %[p0](%[VP8kclip1]) \n\t"
405 "lbux %[temp2], %[q0](%[VP8kclip1]) \n\t"
406 "sb %[temp2], 0(%[p]) \n\t"
407 "sb %[temp1], 0(%[pTemp0]) \n\t"
408 "0: \n\t"
    // Next position: --size, loop while size > 0; p += vstride in delay slot.
409 "subu %[size], %[size], 1 \n\t"
410 "bgtz %[size], 2b \n\t"
411 " addu %[p], %[p], %[vstride] \n\t"
412 "3: \n\t"
413 ".set pop \n\t"
414 : [p0]"=&r"(p0), [q0]"=&r"(q0), [p1]"=&r"(p1), [q1]"=&r"(q1),
415 [p2]"=&r"(p2), [q2]"=&r"(q2), [p3]"=&r"(p3), [q3]"=&r"(q3),
416 [step2]"=&r"(step2), [step1]"=&r"(step1), [temp1]"=&r"(temp1),
417 [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [temp4]"=&r"(temp4),
418 [pTemp0]"=&r"(pTemp0), [pTemp1]"=&r"(pTemp1), [p]"+&r"(p),
419 [size]"+&r"(size)
420 : [vstride]"r"(vstride), [ithresh]"r"(ithresh),
421 [hev_thresh]"r"(hev_thresh), [hstride]"r"(hstride),
422 [VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2)
423 : "memory"
424 );
425 }
426
427 // on macroblock edges
// Runs FilterLoop26() over 16 consecutive bytes starting at 'p'; the filter
// taps are taken 'stride' bytes apart.
static void VFilter16(uint8_t* p, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  const int tap_step = stride;  // distance between samples across the edge
  const int pos_step = 1;       // distance between filtered positions
  const int num_pos = 16;
  FilterLoop26(p, tap_step, pos_step, num_pos, thresh, ithresh, hev_thresh);
}
432
// Runs FilterLoop26() over 16 positions spaced 'stride' bytes apart, starting
// at 'p'; the filter taps are adjacent bytes.
static void HFilter16(uint8_t* p, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  const int tap_step = 1;       // distance between samples across the edge
  const int pos_step = stride;  // distance between filtered positions
  const int num_pos = 16;
  FilterLoop26(p, tap_step, pos_step, num_pos, thresh, ithresh, hev_thresh);
}
437
438 // 8-pixels wide variant, for chroma filtering
// Applies FilterLoop26() to 8 consecutive bytes of plane 'u', then to 8
// consecutive bytes of plane 'v'; taps are 'stride' bytes apart.
static void VFilter8(uint8_t* u, uint8_t* v, int stride,
                     int thresh, int ithresh, int hev_thresh) {
  uint8_t* const planes[2] = { u, v };
  int i;
  for (i = 0; i < 2; ++i) {
    FilterLoop26(planes[i], stride, 1, 8, thresh, ithresh, hev_thresh);
  }
}
444
// Applies FilterLoop26() to 8 positions of plane 'u' spaced 'stride' bytes
// apart, then does the same for plane 'v'; taps are adjacent bytes.
static void HFilter8(uint8_t* u, uint8_t* v, int stride,
                     int thresh, int ithresh, int hev_thresh) {
  uint8_t* const planes[2] = { u, v };
  int i;
  for (i = 0; i < 2; ++i) {
    FilterLoop26(planes[i], 1, stride, 8, thresh, ithresh, hev_thresh);
  }
}
450
451 // on three inner edges
// Applies FilterLoop24() on three edges located 4, 8 and 12 rows below 'p'
// (rows are 'stride' bytes apart), 16 bytes per edge.
static void VFilter16i(uint8_t* p, int stride,
                       int thresh, int ithresh, int hev_thresh) {
  uint8_t* edge = p;
  int remaining = 3;
  while (remaining-- > 0) {
    edge += 4 * stride;
    FilterLoop24(edge, stride, 1, 16, thresh, ithresh, hev_thresh);
  }
}
460
// Applies FilterLoop24() on three edges located 4, 8 and 12 bytes to the
// right of 'p', covering 16 positions spaced 'stride' bytes apart on each.
static void HFilter16i(uint8_t* p, int stride,
                       int thresh, int ithresh, int hev_thresh) {
  uint8_t* edge = p;
  int remaining = 3;
  while (remaining-- > 0) {
    edge += 4;
    FilterLoop24(edge, 1, stride, 16, thresh, ithresh, hev_thresh);
  }
}
469
// Applies FilterLoop24() on the edge 4 rows below the start of plane 'u',
// then on the same edge of plane 'v' (8 bytes each, rows 'stride' apart).
static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  uint8_t* const u_edge = u + 4 * stride;
  uint8_t* const v_edge = v + 4 * stride;
  FilterLoop24(u_edge, stride, 1, 8, thresh, ithresh, hev_thresh);
  FilterLoop24(v_edge, stride, 1, 8, thresh, ithresh, hev_thresh);
}
475
// Applies FilterLoop24() on the edge 4 bytes to the right of the start of
// plane 'u', then on the same edge of plane 'v' (8 positions each, spaced
// 'stride' bytes apart).
static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
                      int thresh, int ithresh, int hev_thresh) {
  uint8_t* const u_edge = u + 4;
  uint8_t* const v_edge = v + 4;
  FilterLoop24(u_edge, 1, stride, 8, thresh, ithresh, hev_thresh);
  FilterLoop24(v_edge, 1, stride, 8, thresh, ithresh, hev_thresh);
}
481
482 #undef MUL
483
484 //------------------------------------------------------------------------------
485 // Simple In-loop filtering (Paragraph 15.2)
486
// Simple loop filter over 16 consecutive bytes starting at 'p', with the
// filter taps taken 'stride' bytes apart.
// For each byte position, with p1 = p[-2*stride], p0 = p[-stride],
// q0 = p[0], q1 = p[stride]:
//   - nothing is written if 4*|p0-q0| + |p1-q1| > 2*thresh + 1;
//   - otherwise a = 3*(q0-p0) + sat8(p1-q1) and
//       p[-stride] = clip(p0 + sat5((a + 3) >> 3)),
//       p[0]       = clip(q0 - sat5((a + 4) >> 3)),
//     where sat8/sat5 are the saturating shift pairs (shll_s.w / sra by 24 or
//     27) and clip is a VP8kclip1 table lookup.
// The C variable 'p1' below is a pointer (p - stride) used as the store
// address for the p0 sample; it is advanced in step with 'p'.
// NOTE(review): VP8kclip1 is indexed with values that may lie outside 0..255,
// so it presumably points into the middle of a larger table -- confirm.
// 'i' is an output-only operand; it is initialised by the "li" instruction.
SimpleVFilter16(uint8_t * p,int stride,int thresh)487 static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
488 int i;
489 const int thresh2 = 2 * thresh + 1;
490 int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
491 uint8_t* p1 = p - stride;
492 __asm__ volatile (
493 ".set push \n\t"
494 ".set noreorder \n\t"
495 "li %[i], 16 \n\t"
496 "0: \n\t"
    // temp2=q0 temp3=q1 temp1=p0 temp0=p1
497 "negu %[temp4], %[stride] \n\t"
498 "sll %[temp5], %[temp4], 1 \n\t"
499 "lbu %[temp2], 0(%[p]) \n\t"
500 "lbux %[temp3], %[stride](%[p]) \n\t"
501 "lbux %[temp1], %[temp4](%[p]) \n\t"
502 "lbux %[temp0], %[temp5](%[p]) \n\t"
    // temp5 = 4*|p0-q0| + |p1-q1| - thresh2; temp8 = q0 - p0.
503 "subu %[temp7], %[temp1], %[temp2] \n\t"
504 "subu %[temp6], %[temp0], %[temp3] \n\t"
505 "absq_s.w %[temp4], %[temp7] \n\t"
506 "absq_s.w %[temp5], %[temp6] \n\t"
507 "sll %[temp4], %[temp4], 2 \n\t"
508 "subu %[temp5], %[temp5], %[thresh2] \n\t"
509 "addu %[temp5], %[temp4], %[temp5] \n\t"
510 "negu %[temp8], %[temp7] \n\t"
    // Skip the pixel if over threshold; the counter decrement sits in the
    // branch delay slot and therefore always executes.
511 "bgtz %[temp5], 1f \n\t"
512 " addiu %[i], %[i], -1 \n\t"
    // temp3 = a = 3*(q0-p0) + sat8(p1-q1)
513 "sll %[temp4], %[temp8], 1 \n\t"
514 "shll_s.w %[temp5], %[temp6], 24 \n\t"
515 "addu %[temp3], %[temp4], %[temp8] \n\t"
516 "sra %[temp5], %[temp5], 24 \n\t"
517 "addu %[temp3], %[temp3], %[temp5] \n\t"
    // temp0 = sat5((a + 3) >> 3), temp4 = sat5((a + 4) >> 3)
518 "addiu %[temp7], %[temp3], 3 \n\t"
519 "sra %[temp7], %[temp7], 3 \n\t"
520 "shra_r.w %[temp8], %[temp3], 3 \n\t"
521 "shll_s.w %[temp0], %[temp7], 27 \n\t"
522 "shll_s.w %[temp4], %[temp8], 27 \n\t"
523 "sra %[temp0], %[temp0], 27 \n\t"
524 "sra %[temp4], %[temp4], 27 \n\t"
525 "addu %[temp7], %[temp1], %[temp0] \n\t"
526 "subu %[temp2], %[temp2], %[temp4] \n\t"
527 "lbux %[temp3], %[temp7](%[VP8kclip1]) \n\t"
528 "lbux %[temp4], %[temp2](%[VP8kclip1]) \n\t"
529 "sb %[temp3], 0(%[p1]) \n\t"
530 "sb %[temp4], 0(%[p]) \n\t"
531 "1: \n\t"
    // Advance both pointers by one byte and loop while i > 0.
532 "addiu %[p1], %[p1], 1 \n\t"
533 "bgtz %[i], 0b \n\t"
534 " addiu %[p], %[p], 1 \n\t"
535 " .set pop \n\t"
536 : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
537 [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
538 [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
539 [p]"+&r"(p), [i]"=&r"(i), [p1]"+&r"(p1)
540 : [stride]"r"(stride), [VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2)
541 : "memory"
542 );
543 }
544
545 // TEMP0 = SRC[A + A1 * BPS]
546 // TEMP1 = SRC[B + B1 * BPS]
547 // TEMP2 = SRC[C + C1 * BPS]
548 // TEMP3 = SRC[D + D1 * BPS]
// Emits four "lbu" (load unsigned byte) instructions. Each offset is built as
// the assembler expression "<X>+<X1>*BPS", so X is a byte offset within a row
// and X1 a row index; TEMPn and SRC are asm operand names.
549 #define LOAD_4_BYTES(TEMP0, TEMP1, TEMP2, TEMP3, \
550 A, A1, B, B1, C, C1, D, D1, SRC) \
551 "lbu %[" #TEMP0 "], " #A "+" #A1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \
552 "lbu %[" #TEMP1 "], " #B "+" #B1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \
553 "lbu %[" #TEMP2 "], " #C "+" #C1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \
554 "lbu %[" #TEMP3 "], " #D "+" #D1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \
555
// Simple loop filter over 16 positions spaced 'stride' bytes apart, starting
// at 'p', with the filter taps taken from adjacent bytes.
// For each position, with p1 = p[-2], p0 = p[-1], q0 = p[0], q1 = p[1]:
//   - nothing is written if 4*|p0-q0| + |p1-q1| > 2*thresh + 1;
//   - otherwise a = 3*(q0-p0) + sat8(p1-q1) and
//       p[-1] = clip(p0 + sat5((a + 3) >> 3)),
//       p[0]  = clip(q0 - sat5((a + 4) >> 3)),
//     where sat8/sat5 are the saturating shift pairs (shll_s.w / sra by 24 or
//     27) and clip is a VP8kclip1 table lookup.
// NOTE(review): VP8kclip1 is indexed with values that may lie outside 0..255,
// so it presumably points into the middle of a larger table -- confirm.
// 'i' is an output-only operand; it is initialised by the "li" instruction.
SimpleHFilter16(uint8_t * p,int stride,int thresh)556 static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
557 int i;
558 const int thresh2 = 2 * thresh + 1;
559 int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
560 __asm__ volatile (
561 ".set push \n\t"
562 ".set noreorder \n\t"
563 "li %[i], 16 \n\t"
564 "0: \n\t"
    // temp0=p[-2] temp1=p[-1] temp2=p[0] temp3=p[1] (row index 0 in all four).
565 LOAD_4_BYTES(temp0, temp1, temp2, temp3, -2, 0, -1, 0, 0, 0, 1, 0, p)
    // temp5 = 4*|p0-q0| + |p1-q1| - thresh2; temp8 = q0 - p0.
566 "subu %[temp7], %[temp1], %[temp2] \n\t"
567 "subu %[temp6], %[temp0], %[temp3] \n\t"
568 "absq_s.w %[temp4], %[temp7] \n\t"
569 "absq_s.w %[temp5], %[temp6] \n\t"
570 "sll %[temp4], %[temp4], 2 \n\t"
571 "addu %[temp5], %[temp4], %[temp5] \n\t"
572 "subu %[temp5], %[temp5], %[thresh2] \n\t"
573 "negu %[temp8], %[temp7] \n\t"
    // Skip the pixel if over threshold; the counter decrement sits in the
    // branch delay slot and therefore always executes.
574 "bgtz %[temp5], 1f \n\t"
575 " addiu %[i], %[i], -1 \n\t"
    // temp3 = a = 3*(q0-p0) + sat8(p1-q1)
576 "sll %[temp4], %[temp8], 1 \n\t"
577 "shll_s.w %[temp5], %[temp6], 24 \n\t"
578 "addu %[temp3], %[temp4], %[temp8] \n\t"
579 "sra %[temp5], %[temp5], 24 \n\t"
580 "addu %[temp3], %[temp3], %[temp5] \n\t"
    // temp0 = sat5((a + 3) >> 3), temp4 = sat5((a + 4) >> 3)
581 "addiu %[temp7], %[temp3], 3 \n\t"
582 "sra %[temp7], %[temp7], 3 \n\t"
583 "shra_r.w %[temp8], %[temp3], 3 \n\t"
584 "shll_s.w %[temp0], %[temp7], 27 \n\t"
585 "shll_s.w %[temp4], %[temp8], 27 \n\t"
586 "sra %[temp0], %[temp0], 27 \n\t"
587 "sra %[temp4], %[temp4], 27 \n\t"
588 "addu %[temp7], %[temp1], %[temp0] \n\t"
589 "subu %[temp2], %[temp2], %[temp4] \n\t"
590 "lbux %[temp3], %[temp7](%[VP8kclip1]) \n\t"
591 "lbux %[temp4], %[temp2](%[VP8kclip1]) \n\t"
592 "sb %[temp3], -1(%[p]) \n\t"
593 "sb %[temp4], 0(%[p]) \n\t"
594 "1: \n\t"
    // Loop while i > 0; p += stride sits in the delay slot.
595 "bgtz %[i], 0b \n\t"
596 " addu %[p], %[p], %[stride] \n\t"
597 ".set pop \n\t"
598 : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
599 [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
600 [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
601 [p]"+&r"(p), [i]"=&r"(i)
602 : [stride]"r"(stride), [VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2)
603 : "memory"
604 );
605 }
606
// Calls SimpleVFilter16() on three edges located 4, 8 and 12 rows below 'p'
// (rows are 'stride' bytes apart).
static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
  uint8_t* edge = p;
  int remaining = 3;
  while (remaining-- > 0) {
    edge += 4 * stride;
    SimpleVFilter16(edge, stride, thresh);
  }
}
614
// Calls SimpleHFilter16() on three edges located 4, 8 and 12 bytes to the
// right of 'p'.
static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
  uint8_t* edge = p;
  int remaining = 3;
  while (remaining-- > 0) {
    edge += 4;
    SimpleHFilter16(edge, stride, thresh);
  }
}
622
623 // DST[A * BPS] = TEMP0
624 // DST[B + C * BPS] = TEMP1
// Emits two "usw" (unaligned store word) instructions, i.e. 4 bytes each:
// TEMP0 goes to byte offset A*BPS and TEMP1 to byte offset B + C*BPS from DST.
625 #define STORE_8_BYTES(TEMP0, TEMP1, A, B, C, DST) \
626 "usw %[" #TEMP0 "], " #A "*" XSTR(BPS) "(%[" #DST "]) \n\t" \
627 "usw %[" #TEMP1 "], " #B "+" #C "*" XSTR(BPS) "(%[" #DST "]) \n\t"
628
// 4x4 vertical prediction. Reads six bytes from the row above 'dst'
// (top[-1] .. top[4], where top = dst - BPS), combines neighbouring samples
// per halfword as (left + 2*centre + right + 2) >> 2 (shll.ph by 1, addq.ph,
// shra_r.ph by 2), packs the four results into one word and stores that same
// word to rows 0..3 of 'dst'.
// NOTE(review): the byte-to-halfword unpacking order (qbr/qbl, packrl.ph)
// appears to assume a little-endian layout -- confirm for the target.
VE4(uint8_t * dst)629 static void VE4(uint8_t* dst) { // vertical
630 const uint8_t* top = dst - BPS;
631 int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
632 __asm__ volatile (
    // temp0 = 4 bytes at top[-1..2], temp1 = 2 bytes at top[3..4].
633 "ulw %[temp0], -1(%[top]) \n\t"
634 "ulh %[temp1], 3(%[top]) \n\t"
    // Zero-extend byte pairs to halfword pairs.
635 "preceu.ph.qbr %[temp2], %[temp0] \n\t"
636 "preceu.ph.qbl %[temp3], %[temp0] \n\t"
637 "preceu.ph.qbr %[temp4], %[temp1] \n\t"
    // temp5 / temp6 hold the "centre" samples, doubled below.
638 "packrl.ph %[temp5], %[temp3], %[temp2] \n\t"
639 "packrl.ph %[temp6], %[temp4], %[temp3] \n\t"
640 "shll.ph %[temp5], %[temp5], 1 \n\t"
641 "shll.ph %[temp6], %[temp6], 1 \n\t"
642 "addq.ph %[temp2], %[temp5], %[temp2] \n\t"
643 "addq.ph %[temp6], %[temp6], %[temp4] \n\t"
644 "addq.ph %[temp2], %[temp2], %[temp3] \n\t"
645 "addq.ph %[temp6], %[temp6], %[temp3] \n\t"
    // Rounding shift right by 2, then pack four halfwords back into bytes.
646 "shra_r.ph %[temp2], %[temp2], 2 \n\t"
647 "shra_r.ph %[temp6], %[temp6], 2 \n\t"
648 "precr.qb.ph %[temp4], %[temp6], %[temp2] \n\t"
649 STORE_8_BYTES(temp4, temp4, 0, 0, 1, dst)
650 STORE_8_BYTES(temp4, temp4, 2, 0, 3, dst)
651 : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
652 [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
653 [temp6]"=&r"(temp6)
654 : [top]"r"(top), [dst]"r"(dst)
655 : "memory"
656 );
657 }
658
// 4x4 DC prediction. Sums the four bytes of the row above 'dst' (word at
// dst - BPS) and the four bytes of the column to its left (dst[-1 + k*BPS],
// k = 0..3), computes (sum + 4) >> 3 with a rounding shift, replicates that
// byte across a word and stores the word to rows 0..3 of 'dst'.
DC4(uint8_t * dst)659 static void DC4(uint8_t* dst) { // DC
660 int temp0, temp1, temp2, temp3, temp4;
661 __asm__ volatile (
662 "ulw %[temp0], -1*" XSTR(BPS) "(%[dst]) \n\t"
663 LOAD_4_BYTES(temp1, temp2, temp3, temp4, -1, 0, -1, 1, -1, 2, -1, 3, dst)
    // Pack the four left-column bytes into temp1 so one raddu.w.qb sums them.
664 "ins %[temp1], %[temp2], 8, 8 \n\t"
665 "ins %[temp1], %[temp3], 16, 8 \n\t"
666 "ins %[temp1], %[temp4], 24, 8 \n\t"
    // raddu.w.qb: sum of the four unsigned bytes of a word.
667 "raddu.w.qb %[temp0], %[temp0] \n\t"
668 "raddu.w.qb %[temp1], %[temp1] \n\t"
669 "addu %[temp0], %[temp0], %[temp1] \n\t"
670 "shra_r.w %[temp0], %[temp0], 3 \n\t"
671 "replv.qb %[temp0], %[temp0] \n\t"
672 STORE_8_BYTES(temp0, temp0, 0, 0, 1, dst)
673 STORE_8_BYTES(temp0, temp0, 2, 0, 3, dst)
674 : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
675 [temp3]"=&r"(temp3), [temp4]"=&r"(temp4)
676 : [dst]"r"(dst)
677 : "memory"
678 );
679 }
680
// 4x4 down-right diagonal prediction. Inputs are the left column
// (dst[-1 + k*BPS], k = 0..3), the four bytes starting at the top-left
// corner (dst[-1 - BPS] .. dst[2 - BPS]) and the byte dst[3 - BPS].
// Neighbouring samples are smoothed with the doubled-centre, rounding
// shift-by-2 pattern (shll.ph by 1, addq.ph, shra_r.ph by 2) and packed into
// words. Rows 3 and 1 are stored first; "prepend" then shifts one further
// byte into each word before rows 2 and 0 are stored.
// NOTE(review): byte ordering within the packed words appears to assume a
// little-endian target -- confirm.
RD4(uint8_t * dst)681 static void RD4(uint8_t* dst) { // Down-right
682 int temp0, temp1, temp2, temp3, temp4;
683 int temp5, temp6, temp7, temp8;
684 __asm__ volatile (
685 LOAD_4_BYTES(temp0, temp1, temp2, temp3, -1, 0, -1, 1, -1, 2, -1, 3, dst)
686 "ulw %[temp7], -1-" XSTR(BPS) "(%[dst]) \n\t"
687 "ins %[temp1], %[temp0], 16, 16 \n\t"
688 "preceu.ph.qbr %[temp5], %[temp7] \n\t"
689 "ins %[temp2], %[temp1], 16, 16 \n\t"
690 "preceu.ph.qbl %[temp4], %[temp7] \n\t"
691 "ins %[temp3], %[temp2], 16, 16 \n\t"
692 "shll.ph %[temp2], %[temp2], 1 \n\t"
693 "addq.ph %[temp3], %[temp3], %[temp1] \n\t"
694 "packrl.ph %[temp6], %[temp5], %[temp1] \n\t"
695 "addq.ph %[temp3], %[temp3], %[temp2] \n\t"
696 "addq.ph %[temp1], %[temp1], %[temp5] \n\t"
697 "shll.ph %[temp6], %[temp6], 1 \n\t"
698 "addq.ph %[temp1], %[temp1], %[temp6] \n\t"
699 "packrl.ph %[temp0], %[temp4], %[temp5] \n\t"
700 "addq.ph %[temp8], %[temp5], %[temp4] \n\t"
701 "shra_r.ph %[temp3], %[temp3], 2 \n\t"
702 "shll.ph %[temp0], %[temp0], 1 \n\t"
703 "shra_r.ph %[temp1], %[temp1], 2 \n\t"
704 "addq.ph %[temp8], %[temp0], %[temp8] \n\t"
705 "lbu %[temp5], 3-" XSTR(BPS) "(%[dst]) \n\t"
706 "precrq.ph.w %[temp7], %[temp7], %[temp7] \n\t"
707 "shra_r.ph %[temp8], %[temp8], 2 \n\t"
708 "ins %[temp7], %[temp5], 0, 8 \n\t"
709 "precr.qb.ph %[temp2], %[temp1], %[temp3] \n\t"
710 "raddu.w.qb %[temp4], %[temp7] \n\t"
711 "precr.qb.ph %[temp6], %[temp8], %[temp1] \n\t"
712 "shra_r.w %[temp4], %[temp4], 2 \n\t"
    // temp2 -> row 3, temp6 -> row 1.
713 STORE_8_BYTES(temp2, temp6, 3, 0, 1, dst)
714 "prepend %[temp2], %[temp8], 8 \n\t"
715 "prepend %[temp6], %[temp4], 8 \n\t"
    // temp2 -> row 2, temp6 -> row 0.
716 STORE_8_BYTES(temp2, temp6, 2, 0, 0, dst)
717 : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
718 [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
719 [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8)
720 : [dst]"r"(dst)
721 : "memory"
722 );
723 }
724
725 // TEMP0 = SRC[A * BPS]
726 // TEMP1 = SRC[B + C * BPS]
// Emits two "ulw" (unaligned load word) instructions, i.e. 4 bytes each:
// TEMP0 comes from byte offset A*BPS and TEMP1 from byte offset B + C*BPS
// relative to SRC.
727 #define LOAD_8_BYTES(TEMP0, TEMP1, A, B, C, SRC) \
728 "ulw %[" #TEMP0 "], " #A "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \
729 "ulw %[" #TEMP1 "], " #B "+" #C "*" XSTR(BPS) "(%[" #SRC "]) \n\t"
730
// 4x4 down-left diagonal prediction. Reads the eight bytes of the row above
// 'dst' (words at dst - BPS and dst + 4 - BPS), smooths neighbouring samples
// with the doubled-centre, rounding shift-by-2 pattern (shll.ph by 1,
// addq.ph, shra_r.ph by 2) and packs the results into words. Rows 0 and 2 are
// stored first; "prepend" then shifts one further byte into each word before
// rows 1 and 3 are stored.
// NOTE(review): byte ordering within the packed words appears to assume a
// little-endian target -- confirm.
LD4(uint8_t * dst)731 static void LD4(uint8_t* dst) { // Down-Left
732 int temp0, temp1, temp2, temp3, temp4;
733 int temp5, temp6, temp7, temp8, temp9;
734 __asm__ volatile (
735 LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst)
736 "preceu.ph.qbl %[temp2], %[temp0] \n\t"
737 "preceu.ph.qbr %[temp3], %[temp0] \n\t"
738 "preceu.ph.qbr %[temp4], %[temp1] \n\t"
739 "preceu.ph.qbl %[temp5], %[temp1] \n\t"
740 "packrl.ph %[temp6], %[temp2], %[temp3] \n\t"
741 "packrl.ph %[temp7], %[temp4], %[temp2] \n\t"
742 "packrl.ph %[temp8], %[temp5], %[temp4] \n\t"
743 "shll.ph %[temp6], %[temp6], 1 \n\t"
744 "addq.ph %[temp9], %[temp2], %[temp6] \n\t"
745 "shll.ph %[temp7], %[temp7], 1 \n\t"
746 "addq.ph %[temp9], %[temp9], %[temp3] \n\t"
747 "shll.ph %[temp8], %[temp8], 1 \n\t"
748 "shra_r.ph %[temp9], %[temp9], 2 \n\t"
749 "addq.ph %[temp3], %[temp4], %[temp7] \n\t"
750 "addq.ph %[temp0], %[temp5], %[temp8] \n\t"
751 "addq.ph %[temp3], %[temp3], %[temp2] \n\t"
752 "addq.ph %[temp0], %[temp0], %[temp4] \n\t"
753 "shra_r.ph %[temp3], %[temp3], 2 \n\t"
754 "shra_r.ph %[temp0], %[temp0], 2 \n\t"
    // Last output sample: (2 * top byte of temp1 + sum of the two halfwords
    // in temp5 + 2) >> 2.
755 "srl %[temp1], %[temp1], 24 \n\t"
756 "sll %[temp1], %[temp1], 1 \n\t"
757 "raddu.w.qb %[temp5], %[temp5] \n\t"
758 "precr.qb.ph %[temp9], %[temp3], %[temp9] \n\t"
759 "precr.qb.ph %[temp3], %[temp0], %[temp3] \n\t"
760 "addu %[temp1], %[temp1], %[temp5] \n\t"
761 "shra_r.w %[temp1], %[temp1], 2 \n\t"
    // temp9 -> row 0, temp3 -> row 2.
762 STORE_8_BYTES(temp9, temp3, 0, 0, 2, dst)
763 "prepend %[temp9], %[temp0], 8 \n\t"
764 "prepend %[temp3], %[temp1], 8 \n\t"
    // temp9 -> row 1, temp3 -> row 3.
765 STORE_8_BYTES(temp9, temp3, 1, 0, 3, dst)
766 : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
767 [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
768 [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
769 [temp9]"=&r"(temp9)
770 : [dst]"r"(dst)
771 : "memory"
772 );
773 }
774
775 //------------------------------------------------------------------------------
776 // Chroma
777
// 8x8 DC prediction using both neighbours. Sums the eight bytes of the row
// above 'dst' (words at dst - BPS and dst + 4 - BPS) and the eight bytes of
// the column to its left (dst[-1 + k*BPS], k = 0..7), computes
// (sum + 8) >> 4 with a rounding shift, replicates that byte across a word
// and stores it to byte offsets 0 and 4 of rows 0..7 of 'dst'.
DC8uv(uint8_t * dst)778 static void DC8uv(uint8_t* dst) { // DC
779 int temp0, temp1, temp2, temp3, temp4;
780 int temp5, temp6, temp7, temp8, temp9;
781 __asm__ volatile (
782 LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst)
783 LOAD_4_BYTES(temp2, temp3, temp4, temp5, -1, 0, -1, 1, -1, 2, -1, 3, dst)
784 LOAD_4_BYTES(temp6, temp7, temp8, temp9, -1, 4, -1, 5, -1, 6, -1, 7, dst)
    // raddu.w.qb sums the four bytes of each top word; the left-column bytes
    // are then added pairwise.
785 "raddu.w.qb %[temp0], %[temp0] \n\t"
786 "raddu.w.qb %[temp1], %[temp1] \n\t"
787 "addu %[temp2], %[temp2], %[temp3] \n\t"
788 "addu %[temp4], %[temp4], %[temp5] \n\t"
789 "addu %[temp6], %[temp6], %[temp7] \n\t"
790 "addu %[temp8], %[temp8], %[temp9] \n\t"
791 "addu %[temp0], %[temp0], %[temp1] \n\t"
792 "addu %[temp2], %[temp2], %[temp4] \n\t"
793 "addu %[temp6], %[temp6], %[temp8] \n\t"
794 "addu %[temp0], %[temp0], %[temp2] \n\t"
795 "addu %[temp0], %[temp0], %[temp6] \n\t"
796 "shra_r.w %[temp0], %[temp0], 4 \n\t"
797 "replv.qb %[temp0], %[temp0] \n\t"
798 STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst)
799 STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst)
800 STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst)
801 STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst)
802 STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst)
803 STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst)
804 STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst)
805 STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst)
806 : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
807 [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
808 [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
809 [temp9]"=&r"(temp9)
810 : [dst]"r"(dst)
811 : "memory"
812 );
813 }
814
DC8uvNoLeft(uint8_t * dst)815 static void DC8uvNoLeft(uint8_t* dst) { // DC with no left samples
816 int temp0, temp1;
817 __asm__ volatile (
818 LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst)
819 "raddu.w.qb %[temp0], %[temp0] \n\t"
820 "raddu.w.qb %[temp1], %[temp1] \n\t"
821 "addu %[temp0], %[temp0], %[temp1] \n\t"
822 "shra_r.w %[temp0], %[temp0], 3 \n\t"
823 "replv.qb %[temp0], %[temp0] \n\t"
824 STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst)
825 STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst)
826 STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst)
827 STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst)
828 STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst)
829 STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst)
830 STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst)
831 STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst)
832 : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1)
833 : [dst]"r"(dst)
834 : "memory"
835 );
836 }
837
DC8uvNoTop(uint8_t * dst)838 static void DC8uvNoTop(uint8_t* dst) { // DC with no top samples
839 int temp0, temp1, temp2, temp3, temp4;
840 int temp5, temp6, temp7, temp8;
841 __asm__ volatile (
842 LOAD_4_BYTES(temp2, temp3, temp4, temp5, -1, 0, -1, 1, -1, 2, -1, 3, dst)
843 LOAD_4_BYTES(temp6, temp7, temp8, temp1, -1, 4, -1, 5, -1, 6, -1, 7, dst)
844 "addu %[temp2], %[temp2], %[temp3] \n\t"
845 "addu %[temp4], %[temp4], %[temp5] \n\t"
846 "addu %[temp6], %[temp6], %[temp7] \n\t"
847 "addu %[temp8], %[temp8], %[temp1] \n\t"
848 "addu %[temp2], %[temp2], %[temp4] \n\t"
849 "addu %[temp6], %[temp6], %[temp8] \n\t"
850 "addu %[temp0], %[temp6], %[temp2] \n\t"
851 "shra_r.w %[temp0], %[temp0], 3 \n\t"
852 "replv.qb %[temp0], %[temp0] \n\t"
853 STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst)
854 STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst)
855 STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst)
856 STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst)
857 STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst)
858 STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst)
859 STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst)
860 STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst)
861 : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
862 [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
863 [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8)
864 : [dst]"r"(dst)
865 : "memory"
866 );
867 }
868
// The load/store helper macros have no further use below this point.
#undef LOAD_8_BYTES
#undef STORE_8_BYTES
#undef LOAD_4_BYTES
872
// Assembly fragment used by CLIP_8B_TO_DST; expands to a string literal.
// On entry %[temp0] (and %[temp1] when SIZE == 8) holds four packed bytes and
// %[dst_1] holds an offset in each halfword. Each byte is zero-extended to a
// halfword (%[temp2] / %[temp3] take the upper byte pairs), the offset is
// added, and the halfwords are packed back into bytes with saturation, the
// result being left in %[temp0] (and %[temp1]).
// SIZE is stringified into assembler '.if' directives, so the SIZE == 8 parts
// are selected when the code is assembled, not by the C preprocessor.
#define CLIPPING(SIZE)                                        \
  "preceu.ph.qbl %[temp2], %[temp0] \n\t"                     \
  "preceu.ph.qbr %[temp0], %[temp0] \n\t"                     \
  ".if " #SIZE " == 8 \n\t"                                   \
  "preceu.ph.qbl %[temp3], %[temp1] \n\t"                     \
  "preceu.ph.qbr %[temp1], %[temp1] \n\t"                     \
  ".endif \n\t"                                               \
  "addu.ph %[temp2], %[temp2], %[dst_1] \n\t"                 \
  "addu.ph %[temp0], %[temp0], %[dst_1] \n\t"                 \
  ".if " #SIZE " == 8 \n\t"                                   \
  "addu.ph %[temp3], %[temp3], %[dst_1] \n\t"                 \
  "addu.ph %[temp1], %[temp1], %[dst_1] \n\t"                 \
  ".endif \n\t"                                               \
  "shll_s.ph %[temp2], %[temp2], 7 \n\t"                      \
  "shll_s.ph %[temp0], %[temp0], 7 \n\t"                      \
  ".if " #SIZE " == 8 \n\t"                                   \
  "shll_s.ph %[temp3], %[temp3], 7 \n\t"                      \
  "shll_s.ph %[temp1], %[temp1], 7 \n\t"                      \
  ".endif \n\t"                                               \
  "precrqu_s.qb.ph %[temp0], %[temp2], %[temp0] \n\t"         \
  ".if " #SIZE " == 8 \n\t"                                   \
  "precrqu_s.qb.ph %[temp1], %[temp3], %[temp1] \n\t"         \
  ".endif \n\t"
896
897
// Produces one output row at DST from the bytes at TOP: 4 bytes when
// SIZE < 8, 8 bytes otherwise, and 16 bytes when SIZE == 16.
// dst_1 starts as DST[-1] replicated into both halfwords, has top_1
// subtracted per halfword, and is then the offset that CLIPPING adds to every
// byte read from TOP before the saturated result is stored to DST.
// 'top_1' is NOT a macro parameter: a variable of that name must be in scope
// where the macro is expanded (CLIP_TO_DST declares it).
// SIZE is stringified into assembler '.if' directives and must therefore be
// an assembly-time constant expression.
#define CLIP_8B_TO_DST(DST, TOP, SIZE) do {                   \
  int dst_1 = ((int)(DST)[-1] << 16) + (DST)[-1];             \
  int temp0, temp1, temp2, temp3;                             \
  __asm__ volatile (                                          \
  ".if " #SIZE " < 8 \n\t"                                    \
    "ulw %[temp0], 0(%[top]) \n\t"                            \
    "subu.ph %[dst_1], %[dst_1], %[top_1] \n\t"               \
    CLIPPING(4)                                               \
    "usw %[temp0], 0(%[dst]) \n\t"                            \
  ".else \n\t"                                                \
    "ulw %[temp0], 0(%[top]) \n\t"                            \
    "ulw %[temp1], 4(%[top]) \n\t"                            \
    "subu.ph %[dst_1], %[dst_1], %[top_1] \n\t"               \
    CLIPPING(8)                                               \
    "usw %[temp0], 0(%[dst]) \n\t"                            \
    "usw %[temp1], 4(%[dst]) \n\t"                            \
  ".if " #SIZE " == 16 \n\t"                                  \
    "ulw %[temp0], 8(%[top]) \n\t"                            \
    "ulw %[temp1], 12(%[top]) \n\t"                           \
    CLIPPING(8)                                               \
    "usw %[temp0], 8(%[dst]) \n\t"                            \
    "usw %[temp1], 12(%[dst]) \n\t"                           \
  ".endif \n\t"                                               \
  ".endif \n\t"                                               \
    : [dst_1]"+&r"(dst_1), [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), \
      [temp2]"=&r"(temp2), [temp3]"=&r"(temp3)                \
    : [top_1]"r"(top_1), [top]"r"((TOP)), [dst]"r"((DST))     \
    : "memory"                                                \
  );                                                          \
} while (0)
928
// Runs CLIP_8B_TO_DST once per row for SIZE rows.
// 'top' is fixed at the initial DST - BPS for every row and 'top_1' is top[-1]
// replicated into both halfwords; both names are consumed by CLIP_8B_TO_DST.
// DST must be a modifiable lvalue: it is advanced by BPS after each row, so
// on exit it has moved forward by SIZE * BPS.
#define CLIP_TO_DST(DST, SIZE) do {                           \
  int y;                                                      \
  const uint8_t* top = (DST) - BPS;                           \
  const int top_1 = ((int)top[-1] << 16) + top[-1];           \
  for (y = 0; y < (SIZE); ++y) {                              \
    CLIP_8B_TO_DST((DST), top, (SIZE));                       \
    (DST) += BPS;                                             \
  }                                                           \
} while (0)
938
// Defines 'static void TrueMotion<SIZE>(uint8_t* DST)' whose whole body is
// CLIP_TO_DST(DST, SIZE). DST names the function's parameter; SIZE is pasted
// into the function name and forwarded to CLIP_TO_DST.
#define TRUE_MOTION(DST, SIZE)                                \
static void TrueMotion##SIZE(uint8_t* (DST)) {                \
  CLIP_TO_DST((DST), (SIZE));                                 \
}
943
944 TRUE_MOTION(dst, 4)
945 TRUE_MOTION(dst, 8)
946 TRUE_MOTION(dst, 16)
947
948 #undef TRUE_MOTION
949 #undef CLIP_TO_DST
950 #undef CLIP_8B_TO_DST
951 #undef CLIPPING
952
//------------------------------------------------------------------------------
// Entry point
955
956 extern void VP8DspInitMIPSdspR2(void);
957
VP8DspInitMIPSdspR2(void)958 WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitMIPSdspR2(void) {
959 VP8TransformDC = TransformDC;
960 VP8TransformAC3 = TransformAC3;
961 VP8Transform = TransformTwo;
962
963 VP8VFilter16 = VFilter16;
964 VP8HFilter16 = HFilter16;
965 VP8VFilter8 = VFilter8;
966 VP8HFilter8 = HFilter8;
967 VP8VFilter16i = VFilter16i;
968 VP8HFilter16i = HFilter16i;
969 VP8VFilter8i = VFilter8i;
970 VP8HFilter8i = HFilter8i;
971 VP8SimpleVFilter16 = SimpleVFilter16;
972 VP8SimpleHFilter16 = SimpleHFilter16;
973 VP8SimpleVFilter16i = SimpleVFilter16i;
974 VP8SimpleHFilter16i = SimpleHFilter16i;
975
976 VP8PredLuma4[0] = DC4;
977 VP8PredLuma4[1] = TrueMotion4;
978 VP8PredLuma4[2] = VE4;
979 VP8PredLuma4[4] = RD4;
980 VP8PredLuma4[6] = LD4;
981
982 VP8PredChroma8[0] = DC8uv;
983 VP8PredChroma8[1] = TrueMotion8;
984 VP8PredChroma8[4] = DC8uvNoTop;
985 VP8PredChroma8[5] = DC8uvNoLeft;
986
987 VP8PredLuma16[1] = TrueMotion16;
988 }
989
990 #else // !WEBP_USE_MIPS_DSP_R2
991
992 WEBP_DSP_INIT_STUB(VP8DspInitMIPSdspR2)
993
994 #endif // WEBP_USE_MIPS_DSP_R2
995