1 /*
2  *  Copyright (c) 2012 The LibYuv project authors. All Rights Reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS. All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "libyuv/row.h"
12 
13 #ifdef __cplusplus
14 namespace libyuv {
15 extern "C" {
16 #endif
17 
18 // The following are available on Mips platforms:
19 #if !defined(LIBYUV_DISABLE_DSPR2) && defined(__mips__) && \
20     (_MIPS_SIM == _MIPS_SIM_ABI32)
21 
22 #ifdef HAS_COPYROW_MIPS
CopyRow_MIPS(const uint8 * src,uint8 * dst,int count)23 void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
24   __asm__ __volatile__(
25       ".set      noreorder                         \n"
26       ".set      noat                              \n"
27       "slti      $at, %[count], 8                  \n"
28       "bne       $at ,$zero, $last8                \n"
29       "xor       $t8, %[src], %[dst]               \n"
30       "andi      $t8, $t8, 0x3                     \n"
31 
32       "bne       $t8, $zero, unaligned             \n"
33       "negu      $a3, %[dst]                       \n"
34       // make dst/src aligned
35       "andi      $a3, $a3, 0x3                     \n"
36       "beq       $a3, $zero, $chk16w               \n"
37       // word-aligned now count is the remining bytes count
38       "subu     %[count], %[count], $a3            \n"
39 
40       "lwr       $t8, 0(%[src])                    \n"
41       "addu      %[src], %[src], $a3               \n"
42       "swr       $t8, 0(%[dst])                    \n"
43       "addu      %[dst], %[dst], $a3               \n"
44 
45       // Now the dst/src are mutually word-aligned with word-aligned addresses
46       "$chk16w:                                    \n"
47       "andi      $t8, %[count], 0x3f               \n"  // whole 64-B chunks?
48       // t8 is the byte count after 64-byte chunks
49       "beq       %[count], $t8, chk8w              \n"
50       // There will be at most 1 32-byte chunk after it
51       "subu      $a3, %[count], $t8                \n"  // the reminder
52       // Here a3 counts bytes in 16w chunks
53       "addu      $a3, %[dst], $a3                  \n"
54       // Now a3 is the final dst after 64-byte chunks
55       "addu      $t0, %[dst], %[count]             \n"
56       // t0 is the "past the end" address
57 
58       // When in the loop we exercise "pref 30,x(a1)", the a1+x should not be
59       // past
60       // the "t0-32" address
61       // This means: for x=128 the last "safe" a1 address is "t0-160"
62       // Alternatively, for x=64 the last "safe" a1 address is "t0-96"
63       // we will use "pref 30,128(a1)", so "t0-160" is the limit
64       "subu      $t9, $t0, 160                     \n"
65       // t9 is the "last safe pref 30,128(a1)" address
66       "pref      0, 0(%[src])                      \n"  // first line of src
67       "pref      0, 32(%[src])                     \n"  // second line of src
68       "pref      0, 64(%[src])                     \n"
69       "pref      30, 32(%[dst])                    \n"
70       // In case the a1 > t9 don't use "pref 30" at all
71       "sltu      $v1, $t9, %[dst]                  \n"
72       "bgtz      $v1, $loop16w                     \n"
73       "nop                                         \n"
74       // otherwise, start with using pref30
75       "pref      30, 64(%[dst])                    \n"
76       "$loop16w:                                    \n"
77       "pref      0, 96(%[src])                     \n"
78       "lw        $t0, 0(%[src])                    \n"
79       "bgtz      $v1, $skip_pref30_96              \n"  // skip
80       "lw        $t1, 4(%[src])                    \n"
81       "pref      30, 96(%[dst])                    \n"  // continue
82       "$skip_pref30_96:                            \n"
83       "lw        $t2, 8(%[src])                    \n"
84       "lw        $t3, 12(%[src])                   \n"
85       "lw        $t4, 16(%[src])                   \n"
86       "lw        $t5, 20(%[src])                   \n"
87       "lw        $t6, 24(%[src])                   \n"
88       "lw        $t7, 28(%[src])                   \n"
89       "pref      0, 128(%[src])                    \n"
90       //  bring the next lines of src, addr 128
91       "sw        $t0, 0(%[dst])                    \n"
92       "sw        $t1, 4(%[dst])                    \n"
93       "sw        $t2, 8(%[dst])                    \n"
94       "sw        $t3, 12(%[dst])                   \n"
95       "sw        $t4, 16(%[dst])                   \n"
96       "sw        $t5, 20(%[dst])                   \n"
97       "sw        $t6, 24(%[dst])                   \n"
98       "sw        $t7, 28(%[dst])                   \n"
99       "lw        $t0, 32(%[src])                   \n"
100       "bgtz      $v1, $skip_pref30_128             \n"  // skip pref 30,128(a1)
101       "lw        $t1, 36(%[src])                   \n"
102       "pref      30, 128(%[dst])                   \n"  // set dest, addr 128
103       "$skip_pref30_128:                           \n"
104       "lw        $t2, 40(%[src])                   \n"
105       "lw        $t3, 44(%[src])                   \n"
106       "lw        $t4, 48(%[src])                   \n"
107       "lw        $t5, 52(%[src])                   \n"
108       "lw        $t6, 56(%[src])                   \n"
109       "lw        $t7, 60(%[src])                   \n"
110       "pref      0, 160(%[src])                    \n"
111       // bring the next lines of src, addr 160
112       "sw        $t0, 32(%[dst])                   \n"
113       "sw        $t1, 36(%[dst])                   \n"
114       "sw        $t2, 40(%[dst])                   \n"
115       "sw        $t3, 44(%[dst])                   \n"
116       "sw        $t4, 48(%[dst])                   \n"
117       "sw        $t5, 52(%[dst])                   \n"
118       "sw        $t6, 56(%[dst])                   \n"
119       "sw        $t7, 60(%[dst])                   \n"
120 
121       "addiu     %[dst], %[dst], 64                \n"  // adding 64 to dest
122       "sltu      $v1, $t9, %[dst]                  \n"
123       "bne       %[dst], $a3, $loop16w             \n"
124       " addiu    %[src], %[src], 64                \n"  // adding 64 to src
125       "move      %[count], $t8                     \n"
126 
127       // Here we have src and dest word-aligned but less than 64-bytes to go
128 
129       "chk8w:                                      \n"
130       "pref      0, 0x0(%[src])                    \n"
131       "andi      $t8, %[count], 0x1f               \n"  // 32-byte chunk?
132       // the t8 is the reminder count past 32-bytes
133       "beq       %[count], $t8, chk1w              \n"
134       // count=t8,no 32-byte chunk
135       " nop                                        \n"
136 
137       "lw        $t0, 0(%[src])                    \n"
138       "lw        $t1, 4(%[src])                    \n"
139       "lw        $t2, 8(%[src])                    \n"
140       "lw        $t3, 12(%[src])                   \n"
141       "lw        $t4, 16(%[src])                   \n"
142       "lw        $t5, 20(%[src])                   \n"
143       "lw        $t6, 24(%[src])                   \n"
144       "lw        $t7, 28(%[src])                   \n"
145       "addiu     %[src], %[src], 32                \n"
146 
147       "sw        $t0, 0(%[dst])                    \n"
148       "sw        $t1, 4(%[dst])                    \n"
149       "sw        $t2, 8(%[dst])                    \n"
150       "sw        $t3, 12(%[dst])                   \n"
151       "sw        $t4, 16(%[dst])                   \n"
152       "sw        $t5, 20(%[dst])                   \n"
153       "sw        $t6, 24(%[dst])                   \n"
154       "sw        $t7, 28(%[dst])                   \n"
155       "addiu     %[dst], %[dst], 32                \n"
156 
157       "chk1w:                                      \n"
158       "andi      %[count], $t8, 0x3                \n"
159       // now count is the reminder past 1w chunks
160       "beq       %[count], $t8, $last8             \n"
161       " subu     $a3, $t8, %[count]                \n"
162       // a3 is count of bytes in 1w chunks
163       "addu      $a3, %[dst], $a3                  \n"
164       // now a3 is the dst address past the 1w chunks
165       // copying in words (4-byte chunks)
166       "$wordCopy_loop:                             \n"
167       "lw        $t3, 0(%[src])                    \n"
168       // the first t3 may be equal t0 ... optimize?
169       "addiu     %[src], %[src],4                  \n"
170       "addiu     %[dst], %[dst],4                  \n"
171       "bne       %[dst], $a3,$wordCopy_loop        \n"
172       " sw       $t3, -4(%[dst])                   \n"
173 
174       // For the last (<8) bytes
175       "$last8:                                     \n"
176       "blez      %[count], leave                   \n"
177       " addu     $a3, %[dst], %[count]             \n"  // a3 -last dst address
178       "$last8loop:                                 \n"
179       "lb        $v1, 0(%[src])                    \n"
180       "addiu     %[src], %[src], 1                 \n"
181       "addiu     %[dst], %[dst], 1                 \n"
182       "bne       %[dst], $a3, $last8loop           \n"
183       " sb       $v1, -1(%[dst])                   \n"
184 
185       "leave:                                      \n"
186       "  j       $ra                               \n"
187       "  nop                                       \n"
188 
189       //
190       // UNALIGNED case
191       //
192 
193       "unaligned:                                  \n"
194       // got here with a3="negu a1"
195       "andi      $a3, $a3, 0x3                     \n"  // a1 is word aligned?
196       "beqz      $a3, $ua_chk16w                   \n"
197       " subu     %[count], %[count], $a3           \n"
198       // bytes left after initial a3 bytes
199       "lwr       $v1, 0(%[src])                    \n"
200       "lwl       $v1, 3(%[src])                    \n"
201       "addu      %[src], %[src], $a3               \n"  // a3 may be 1, 2 or 3
202       "swr       $v1, 0(%[dst])                    \n"
203       "addu      %[dst], %[dst], $a3               \n"
204       // below the dst will be word aligned (NOTE1)
205       "$ua_chk16w:                                 \n"
206       "andi      $t8, %[count], 0x3f               \n"  // whole 64-B chunks?
207       // t8 is the byte count after 64-byte chunks
208       "beq       %[count], $t8, ua_chk8w           \n"
209       // if a2==t8, no 64-byte chunks
210       // There will be at most 1 32-byte chunk after it
211       "subu      $a3, %[count], $t8                \n"  // the reminder
212       // Here a3 counts bytes in 16w chunks
213       "addu      $a3, %[dst], $a3                  \n"
214       // Now a3 is the final dst after 64-byte chunks
215       "addu      $t0, %[dst], %[count]             \n"  // t0 "past the end"
216       "subu      $t9, $t0, 160                     \n"
217       // t9 is the "last safe pref 30,128(a1)" address
218       "pref      0, 0(%[src])                      \n"  // first line of src
219       "pref      0, 32(%[src])                     \n"  // second line  addr 32
220       "pref      0, 64(%[src])                     \n"
221       "pref      30, 32(%[dst])                    \n"
222       // safe, as we have at least 64 bytes ahead
223       // In case the a1 > t9 don't use "pref 30" at all
224       "sltu      $v1, $t9, %[dst]                  \n"
225       "bgtz      $v1, $ua_loop16w                  \n"
226       // skip "pref 30,64(a1)" for too short arrays
227       " nop                                        \n"
228       // otherwise, start with using pref30
229       "pref      30, 64(%[dst])                    \n"
230       "$ua_loop16w:                                \n"
231       "pref      0, 96(%[src])                     \n"
232       "lwr       $t0, 0(%[src])                    \n"
233       "lwl       $t0, 3(%[src])                    \n"
234       "lwr       $t1, 4(%[src])                    \n"
235       "bgtz      $v1, $ua_skip_pref30_96           \n"
236       " lwl      $t1, 7(%[src])                    \n"
237       "pref      30, 96(%[dst])                    \n"
238       // continue setting up the dest, addr 96
239       "$ua_skip_pref30_96:                         \n"
240       "lwr       $t2, 8(%[src])                    \n"
241       "lwl       $t2, 11(%[src])                   \n"
242       "lwr       $t3, 12(%[src])                   \n"
243       "lwl       $t3, 15(%[src])                   \n"
244       "lwr       $t4, 16(%[src])                   \n"
245       "lwl       $t4, 19(%[src])                   \n"
246       "lwr       $t5, 20(%[src])                   \n"
247       "lwl       $t5, 23(%[src])                   \n"
248       "lwr       $t6, 24(%[src])                   \n"
249       "lwl       $t6, 27(%[src])                   \n"
250       "lwr       $t7, 28(%[src])                   \n"
251       "lwl       $t7, 31(%[src])                   \n"
252       "pref      0, 128(%[src])                    \n"
253       // bring the next lines of src, addr 128
254       "sw        $t0, 0(%[dst])                    \n"
255       "sw        $t1, 4(%[dst])                    \n"
256       "sw        $t2, 8(%[dst])                    \n"
257       "sw        $t3, 12(%[dst])                   \n"
258       "sw        $t4, 16(%[dst])                   \n"
259       "sw        $t5, 20(%[dst])                   \n"
260       "sw        $t6, 24(%[dst])                   \n"
261       "sw        $t7, 28(%[dst])                   \n"
262       "lwr       $t0, 32(%[src])                   \n"
263       "lwl       $t0, 35(%[src])                   \n"
264       "lwr       $t1, 36(%[src])                   \n"
265       "bgtz      $v1, ua_skip_pref30_128           \n"
266       " lwl      $t1, 39(%[src])                   \n"
267       "pref      30, 128(%[dst])                   \n"
268       // continue setting up the dest, addr 128
269       "ua_skip_pref30_128:                         \n"
270 
271       "lwr       $t2, 40(%[src])                   \n"
272       "lwl       $t2, 43(%[src])                   \n"
273       "lwr       $t3, 44(%[src])                   \n"
274       "lwl       $t3, 47(%[src])                   \n"
275       "lwr       $t4, 48(%[src])                   \n"
276       "lwl       $t4, 51(%[src])                   \n"
277       "lwr       $t5, 52(%[src])                   \n"
278       "lwl       $t5, 55(%[src])                   \n"
279       "lwr       $t6, 56(%[src])                   \n"
280       "lwl       $t6, 59(%[src])                   \n"
281       "lwr       $t7, 60(%[src])                   \n"
282       "lwl       $t7, 63(%[src])                   \n"
283       "pref      0, 160(%[src])                    \n"
284       // bring the next lines of src, addr 160
285       "sw        $t0, 32(%[dst])                   \n"
286       "sw        $t1, 36(%[dst])                   \n"
287       "sw        $t2, 40(%[dst])                   \n"
288       "sw        $t3, 44(%[dst])                   \n"
289       "sw        $t4, 48(%[dst])                   \n"
290       "sw        $t5, 52(%[dst])                   \n"
291       "sw        $t6, 56(%[dst])                   \n"
292       "sw        $t7, 60(%[dst])                   \n"
293 
294       "addiu     %[dst],%[dst],64                  \n"  // adding 64 to dest
295       "sltu      $v1,$t9,%[dst]                    \n"
296       "bne       %[dst],$a3,$ua_loop16w            \n"
297       " addiu    %[src],%[src],64                  \n"  // adding 64 to src
298       "move      %[count],$t8                      \n"
299 
300       // Here we have src and dest word-aligned but less than 64-bytes to go
301 
302       "ua_chk8w:                                   \n"
303       "pref      0, 0x0(%[src])                    \n"
304       "andi      $t8, %[count], 0x1f               \n"  // 32-byte chunk?
305       // the t8 is the reminder count
306       "beq       %[count], $t8, $ua_chk1w          \n"
307       // when count==t8, no 32-byte chunk
308 
309       "lwr       $t0, 0(%[src])                    \n"
310       "lwl       $t0, 3(%[src])                    \n"
311       "lwr       $t1, 4(%[src])                    \n"
312       "lwl       $t1, 7(%[src])                    \n"
313       "lwr       $t2, 8(%[src])                    \n"
314       "lwl       $t2, 11(%[src])                   \n"
315       "lwr       $t3, 12(%[src])                   \n"
316       "lwl       $t3, 15(%[src])                   \n"
317       "lwr       $t4, 16(%[src])                   \n"
318       "lwl       $t4, 19(%[src])                   \n"
319       "lwr       $t5, 20(%[src])                   \n"
320       "lwl       $t5, 23(%[src])                   \n"
321       "lwr       $t6, 24(%[src])                   \n"
322       "lwl       $t6, 27(%[src])                   \n"
323       "lwr       $t7, 28(%[src])                   \n"
324       "lwl       $t7, 31(%[src])                   \n"
325       "addiu     %[src], %[src], 32                \n"
326 
327       "sw        $t0, 0(%[dst])                    \n"
328       "sw        $t1, 4(%[dst])                    \n"
329       "sw        $t2, 8(%[dst])                    \n"
330       "sw        $t3, 12(%[dst])                   \n"
331       "sw        $t4, 16(%[dst])                   \n"
332       "sw        $t5, 20(%[dst])                   \n"
333       "sw        $t6, 24(%[dst])                   \n"
334       "sw        $t7, 28(%[dst])                   \n"
335       "addiu     %[dst], %[dst], 32                \n"
336 
337       "$ua_chk1w:                                  \n"
338       "andi      %[count], $t8, 0x3                \n"
339       // now count is the reminder past 1w chunks
340       "beq       %[count], $t8, ua_smallCopy       \n"
341       "subu      $a3, $t8, %[count]                \n"
342       // a3 is count of bytes in 1w chunks
343       "addu      $a3, %[dst], $a3                  \n"
344       // now a3 is the dst address past the 1w chunks
345 
346       // copying in words (4-byte chunks)
347       "$ua_wordCopy_loop:                          \n"
348       "lwr       $v1, 0(%[src])                    \n"
349       "lwl       $v1, 3(%[src])                    \n"
350       "addiu     %[src], %[src], 4                 \n"
351       "addiu     %[dst], %[dst], 4                 \n"
352       // note: dst=a1 is word aligned here, see NOTE1
353       "bne       %[dst], $a3, $ua_wordCopy_loop    \n"
354       " sw       $v1,-4(%[dst])                    \n"
355 
356       // Now less than 4 bytes (value in count) left to copy
357       "ua_smallCopy:                               \n"
358       "beqz      %[count], leave                   \n"
359       " addu     $a3, %[dst], %[count]             \n"  // a3 = last dst address
360       "$ua_smallCopy_loop:                         \n"
361       "lb        $v1, 0(%[src])                    \n"
362       "addiu     %[src], %[src], 1                 \n"
363       "addiu     %[dst], %[dst], 1                 \n"
364       "bne       %[dst],$a3,$ua_smallCopy_loop     \n"
365       " sb       $v1, -1(%[dst])                   \n"
366 
367       "j         $ra                               \n"
368       " nop                                        \n"
369       ".set      at                                \n"
370       ".set      reorder                           \n"
371       : [dst] "+r"(dst), [src] "+r"(src)
372       : [count] "r"(count)
373       : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9", "a3", "v1",
374         "at");
375 }
376 #endif  // HAS_COPYROW_MIPS
377 
378 // DSPR2 functions
379 #if !defined(LIBYUV_DISABLE_DSPR2) && defined(__mips_dsp) &&   \
380     (__mips_dsp_rev >= 2) && (_MIPS_SIM == _MIPS_SIM_ABI32) && \
381     (__mips_isa_rev < 6)
382 
SplitUVRow_DSPR2(const uint8 * src_uv,uint8 * dst_u,uint8 * dst_v,int width)383 void SplitUVRow_DSPR2(const uint8* src_uv,
384                       uint8* dst_u,
385                       uint8* dst_v,
386                       int width) {
387   __asm__ __volatile__(
388       ".set push                                     \n"
389       ".set noreorder                                \n"
390       "srl             $t4, %[width], 4              \n"  // multiplies of 16
391       "blez            $t4, 2f                       \n"
392       " andi           %[width], %[width], 0xf       \n"  // residual
393 
394       "1:                                            \n"
395       "addiu           $t4, $t4, -1                  \n"
396       "lw              $t0, 0(%[src_uv])             \n"  // V1 | U1 | V0 | U0
397       "lw              $t1, 4(%[src_uv])             \n"  // V3 | U3 | V2 | U2
398       "lw              $t2, 8(%[src_uv])             \n"  // V5 | U5 | V4 | U4
399       "lw              $t3, 12(%[src_uv])            \n"  // V7 | U7 | V6 | U6
400       "lw              $t5, 16(%[src_uv])            \n"  // V9 | U9 | V8 | U8
401       "lw              $t6, 20(%[src_uv])            \n"  // V11 | U11 | V10 |
402                                                           // U10
403       "lw              $t7, 24(%[src_uv])            \n"  // V13 | U13 | V12 |
404                                                           // U12
405       "lw              $t8, 28(%[src_uv])            \n"  // V15 | U15 | V14 |
406                                                           // U14
407       "addiu           %[src_uv], %[src_uv], 32      \n"
408       "precrq.qb.ph    $t9, $t1, $t0                 \n"  // V3 | V2 | V1 | V0
409       "precr.qb.ph     $t0, $t1, $t0                 \n"  // U3 | U2 | U1 | U0
410       "precrq.qb.ph    $t1, $t3, $t2                 \n"  // V7 | V6 | V5 | V4
411       "precr.qb.ph     $t2, $t3, $t2                 \n"  // U7 | U6 | U5 | U4
412       "precrq.qb.ph    $t3, $t6, $t5                 \n"  // V11 | V10 | V9 | V8
413       "precr.qb.ph     $t5, $t6, $t5                 \n"  // U11 | U10 | U9 | U8
414       "precrq.qb.ph    $t6, $t8, $t7                 \n"  // V15 | V14 | V13 |
415                                                           // V12
416       "precr.qb.ph     $t7, $t8, $t7                 \n"  // U15 | U14 | U13 |
417                                                           // U12
418       "sw              $t9, 0(%[dst_v])              \n"
419       "sw              $t0, 0(%[dst_u])              \n"
420       "sw              $t1, 4(%[dst_v])              \n"
421       "sw              $t2, 4(%[dst_u])              \n"
422       "sw              $t3, 8(%[dst_v])              \n"
423       "sw              $t5, 8(%[dst_u])              \n"
424       "sw              $t6, 12(%[dst_v])             \n"
425       "sw              $t7, 12(%[dst_u])             \n"
426       "addiu           %[dst_v], %[dst_v], 16        \n"
427       "bgtz            $t4, 1b                       \n"
428       " addiu          %[dst_u], %[dst_u], 16        \n"
429 
430       "beqz            %[width], 3f                  \n"
431       " nop                                          \n"
432 
433       "2:                                              \n"
434       "lbu             $t0, 0(%[src_uv])             \n"
435       "lbu             $t1, 1(%[src_uv])             \n"
436       "addiu           %[src_uv], %[src_uv], 2       \n"
437       "addiu           %[width], %[width], -1        \n"
438       "sb              $t0, 0(%[dst_u])              \n"
439       "sb              $t1, 0(%[dst_v])              \n"
440       "addiu           %[dst_u], %[dst_u], 1         \n"
441       "bgtz            %[width], 2b                  \n"
442       " addiu          %[dst_v], %[dst_v], 1         \n"
443 
444       "3:                                              \n"
445       ".set pop                                      \n"
446       : [src_uv] "+r"(src_uv), [width] "+r"(width), [dst_u] "+r"(dst_u),
447         [dst_v] "+r"(dst_v)
448       :
449       : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
450 }
451 
// Writes the |width| bytes of |src| to |dst| in reverse order
// (dst[i] = src[width - 1 - i]).
//
// |src| is first advanced to one past the end of the row and then walked
// backwards. The main loop reverses 16 bytes per iteration: each loaded word
// is byte-reversed with wsbh (swap bytes inside each halfword) followed by
// rotr 16 (swap the halfwords), and the four words are stored in the opposite
// order. The remaining (width & 15) bytes are mirrored one at a time.
//
// NOTE(review): the 16-byte loop uses plain lw/sw, so it presumably expects
// word-aligned addresses whenever width >= 16 - confirm against the callers.
void MirrorRow_DSPR2(const uint8* src, uint8* dst, int width) {
  // The byte tail is a do-while style loop entered whenever width < 16, so an
  // empty (or negative) row must be rejected before any memory is touched.
  if (width <= 0) {
    return;
  }
  __asm__ __volatile__(
      ".set push                             \n"
      ".set noreorder                        \n"

      "srl       $t4, %[width], 4            \n"  // multiplies of 16
      "andi      $t5, %[width], 0xf          \n"  // residual bytes
      "blez      $t4, 2f                     \n"
      " addu     %[src], %[src], %[width]    \n"  // src += width

      "1:                                    \n"
      "lw        $t0, -16(%[src])            \n"  // |3|2|1|0|
      "lw        $t1, -12(%[src])            \n"  // |7|6|5|4|
      "lw        $t2, -8(%[src])             \n"  // |11|10|9|8|
      "lw        $t3, -4(%[src])             \n"  // |15|14|13|12|
      "wsbh      $t0, $t0                    \n"  // |2|3|0|1|
      "wsbh      $t1, $t1                    \n"  // |6|7|4|5|
      "wsbh      $t2, $t2                    \n"  // |10|11|8|9|
      "wsbh      $t3, $t3                    \n"  // |14|15|12|13|
      "rotr      $t0, $t0, 16                \n"  // |0|1|2|3|
      "rotr      $t1, $t1, 16                \n"  // |4|5|6|7|
      "rotr      $t2, $t2, 16                \n"  // |8|9|10|11|
      "rotr      $t3, $t3, 16                \n"  // |12|13|14|15|
      "addiu     %[src], %[src], -16         \n"
      "addiu     $t4, $t4, -1                \n"
      "sw        $t3, 0(%[dst])              \n"  // |15|14|13|12|
      "sw        $t2, 4(%[dst])              \n"  // |11|10|9|8|
      "sw        $t1, 8(%[dst])              \n"  // |7|6|5|4|
      "sw        $t0, 12(%[dst])             \n"  // |3|2|1|0|
      "bgtz      $t4, 1b                     \n"
      " addiu    %[dst], %[dst], 16          \n"
      // No residual bytes after the 16-byte loop: done.
      "beqz      $t5, 3f                     \n"
      " nop                                  \n"

      // Byte tail: t5 holds the number of bytes still to mirror.
      "2:                                    \n"
      "lbu       $t0, -1(%[src])             \n"
      "addiu     $t5, $t5, -1                \n"
      "addiu     %[src], %[src], -1          \n"
      "sb        $t0, 0(%[dst])              \n"
      // Loop only while bytes remain (t5 > 0). Testing ">= 0" here would run
      // one extra iteration and read/write one byte beyond the row.
      "bgtz      $t5, 2b                     \n"
      " addiu    %[dst], %[dst], 1           \n"

      "3:                                    \n"
      ".set pop                              \n"
      : [src] "+r"(src), [dst] "+r"(dst)
      : [width] "r"(width)
      // "memory": the asm reads and writes through the pointer operands.
      : "t0", "t1", "t2", "t3", "t4", "t5", "memory");
}
500 
MirrorUVRow_DSPR2(const uint8 * src_uv,uint8 * dst_u,uint8 * dst_v,int width)501 void MirrorUVRow_DSPR2(const uint8* src_uv,
502                        uint8* dst_u,
503                        uint8* dst_v,
504                        int width) {
505   int x;
506   int y;
507   __asm__ __volatile__(
508       ".set push                                    \n"
509       ".set noreorder                               \n"
510 
511       "addu            $t4, %[width], %[width]      \n"
512       "srl             %[x], %[width], 4            \n"
513       "andi            %[y], %[width], 0xf          \n"
514       "blez            %[x], 2f                     \n"
515       " addu           %[src_uv], %[src_uv], $t4    \n"
516 
517       "1:                                          \n"
518       "lw              $t0, -32(%[src_uv])          \n"  // |3|2|1|0|
519       "lw              $t1, -28(%[src_uv])          \n"  // |7|6|5|4|
520       "lw              $t2, -24(%[src_uv])          \n"  // |11|10|9|8|
521       "lw              $t3, -20(%[src_uv])          \n"  // |15|14|13|12|
522       "lw              $t4, -16(%[src_uv])          \n"  // |19|18|17|16|
523       "lw              $t6, -12(%[src_uv])          \n"  // |23|22|21|20|
524       "lw              $t7, -8(%[src_uv])           \n"  // |27|26|25|24|
525       "lw              $t8, -4(%[src_uv])           \n"  // |31|30|29|28|
526 
527       "rotr            $t0, $t0, 16                 \n"  // |1|0|3|2|
528       "rotr            $t1, $t1, 16                 \n"  // |5|4|7|6|
529       "rotr            $t2, $t2, 16                 \n"  // |9|8|11|10|
530       "rotr            $t3, $t3, 16                 \n"  // |13|12|15|14|
531       "rotr            $t4, $t4, 16                 \n"  // |17|16|19|18|
532       "rotr            $t6, $t6, 16                 \n"  // |21|20|23|22|
533       "rotr            $t7, $t7, 16                 \n"  // |25|24|27|26|
534       "rotr            $t8, $t8, 16                 \n"  // |29|28|31|30|
535       "precr.qb.ph     $t9, $t0, $t1                \n"  // |0|2|4|6|
536       "precrq.qb.ph    $t5, $t0, $t1                \n"  // |1|3|5|7|
537       "precr.qb.ph     $t0, $t2, $t3                \n"  // |8|10|12|14|
538       "precrq.qb.ph    $t1, $t2, $t3                \n"  // |9|11|13|15|
539       "precr.qb.ph     $t2, $t4, $t6                \n"  // |16|18|20|22|
540       "precrq.qb.ph    $t3, $t4, $t6                \n"  // |17|19|21|23|
541       "precr.qb.ph     $t4, $t7, $t8                \n"  // |24|26|28|30|
542       "precrq.qb.ph    $t6, $t7, $t8                \n"  // |25|27|29|31|
543       "addiu           %[src_uv], %[src_uv], -32    \n"
544       "addiu           %[x], %[x], -1               \n"
545       "swr             $t4, 0(%[dst_u])             \n"
546       "swl             $t4, 3(%[dst_u])             \n"  // |30|28|26|24|
547       "swr             $t6, 0(%[dst_v])             \n"
548       "swl             $t6, 3(%[dst_v])             \n"  // |31|29|27|25|
549       "swr             $t2, 4(%[dst_u])             \n"
550       "swl             $t2, 7(%[dst_u])             \n"  // |22|20|18|16|
551       "swr             $t3, 4(%[dst_v])             \n"
552       "swl             $t3, 7(%[dst_v])             \n"  // |23|21|19|17|
553       "swr             $t0, 8(%[dst_u])             \n"
554       "swl             $t0, 11(%[dst_u])            \n"  // |14|12|10|8|
555       "swr             $t1, 8(%[dst_v])             \n"
556       "swl             $t1, 11(%[dst_v])            \n"  // |15|13|11|9|
557       "swr             $t9, 12(%[dst_u])            \n"
558       "swl             $t9, 15(%[dst_u])            \n"  // |6|4|2|0|
559       "swr             $t5, 12(%[dst_v])            \n"
560       "swl             $t5, 15(%[dst_v])            \n"  // |7|5|3|1|
561       "addiu           %[dst_v], %[dst_v], 16       \n"
562       "bgtz            %[x], 1b                     \n"
563       " addiu          %[dst_u], %[dst_u], 16       \n"
564       "beqz            %[y], 3f                     \n"
565       " nop                                         \n"
566       "b               2f                           \n"
567       " nop                                         \n"
568 
569       "2:                                            \n"
570       "lbu             $t0, -2(%[src_uv])           \n"
571       "lbu             $t1, -1(%[src_uv])           \n"
572       "addiu           %[src_uv], %[src_uv], -2     \n"
573       "addiu           %[y], %[y], -1               \n"
574       "sb              $t0, 0(%[dst_u])             \n"
575       "sb              $t1, 0(%[dst_v])             \n"
576       "addiu           %[dst_u], %[dst_u], 1        \n"
577       "bgtz            %[y], 2b                     \n"
578       " addiu          %[dst_v], %[dst_v], 1        \n"
579 
580       "3:                                            \n"
581       ".set pop                                     \n"
582       : [src_uv] "+r"(src_uv), [dst_u] "+r"(dst_u), [dst_v] "+r"(dst_v),
583         [x] "=&r"(x), [y] "=&r"(y)
584       : [width] "r"(width)
585       : "t0", "t1", "t2", "t3", "t4", "t5", "t7", "t8", "t9");
586 }
587 
I422ToARGBRow_DSPR2(const uint8 * src_y,const uint8 * src_u,const uint8 * src_v,uint8 * rgb_buf,const struct YuvConstants * yuvconstants,int width)588 void I422ToARGBRow_DSPR2(const uint8* src_y,
589                          const uint8* src_u,
590                          const uint8* src_v,
591                          uint8* rgb_buf,
592                          const struct YuvConstants* yuvconstants,
593                          int width) {
594   int x;
595   uint32 tmp_ub = yuvconstants->kUVToB[0];
596   uint32 tmp_ug = yuvconstants->kUVToG[0];
597   uint32 tmp_vg = yuvconstants->kUVToG[1];
598   uint32 tmp_vr = yuvconstants->kUVToR[1];
599   uint32 tmp_bb = yuvconstants->kUVBiasB[0];
600   uint32 tmp_bg = yuvconstants->kUVBiasG[0];
601   uint32 tmp_br = yuvconstants->kUVBiasR[0];
602   uint32 yg = yuvconstants->kYToRgb[0];
603   uint32 tmp_yg;
604   uint32 tmp_mask = 0x7fff7fff;
605   tmp_bb = ((uint)(tmp_bb & 0xffff) << 16) | (tmp_bb & 0xffff);
606   tmp_bg = ((uint)(tmp_bg & 0xffff) << 16) | (tmp_bg & 0xffff);
607   tmp_br = ((uint)(tmp_br & 0xffff) << 16) | (tmp_br & 0xffff);
608   tmp_yg = ((uint)(yg & 0xffff) << 16) | (yg & 0xffff);
609   tmp_ub = ~(((uint)(tmp_ub & 0xffff) << 16) | (tmp_ub & 0xffff)) + 0x00010001;
610   tmp_ug = ((uint)(tmp_ug & 0xffff) << 16) | (tmp_ug & 0xffff);
611   tmp_vg = ((uint)(tmp_vg & 0xffff) << 16) | (tmp_vg & 0xffff);
612   tmp_vr = ~(((uint)(tmp_vr & 0xffff) << 16) | (tmp_vr & 0xffff)) + 0x00010001;
613   yg = yg * 0x0101;
614 
615   for (x = 0; x < width - 1; x += 2) {
616     uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
617     uint32 tmp_t6, tmp_t7, tmp_t8, tmp_t9;
618     __asm__ __volatile__(
619         ".set push                                             \n"
620         ".set noreorder                                        \n"
621         "lbu              %[tmp_t7], 0(%[src_y])               \n"
622         "lbu              %[tmp_t1], 1(%[src_y])               \n"
623         "mul              %[tmp_t7], %[tmp_t7],     %[yg]      \n"
624         "mul              %[tmp_t1], %[tmp_t1],     %[yg]      \n"
625         "lbu              %[tmp_t2], 0(%[src_u])               \n"
626         "lbu              %[tmp_t3], 0(%[src_v])               \n"
627         "replv.ph         %[tmp_t2], %[tmp_t2]                 \n"
628         "replv.ph         %[tmp_t3], %[tmp_t3]                 \n"
629         "mul.ph           %[tmp_t4], %[tmp_t2],     %[tmp_ub]  \n"
630         "mul.ph           %[tmp_t5], %[tmp_t2],     %[tmp_ug]  \n"
631         "mul.ph           %[tmp_t6], %[tmp_t3],     %[tmp_vr]  \n"
632         "mul.ph           %[tmp_t3], %[tmp_t3],     %[tmp_vg]  \n"
633         "srl              %[tmp_t7], %[tmp_t7],     16         \n"
634         "ins              %[tmp_t1], %[tmp_t7],     0,      16 \n"
635         "addq_s.ph        %[tmp_t7], %[tmp_t1],     %[tmp_bb]  \n"
636         "addq_s.ph        %[tmp_t8], %[tmp_t1],     %[tmp_bg]  \n"
637         "addq_s.ph        %[tmp_t9], %[tmp_t1],     %[tmp_br]  \n"
638         "addq_s.ph        %[tmp_t5], %[tmp_t5],     %[tmp_t3]  \n"
639         "addq_s.ph        %[tmp_t7], %[tmp_t7],     %[tmp_t4]  \n"
640         "subq_s.ph        %[tmp_t8], %[tmp_t8],     %[tmp_t5]  \n"
641         "addq_s.ph        %[tmp_t9], %[tmp_t9],     %[tmp_t6]  \n"
642         "shra.ph          %[tmp_t7], %[tmp_t7],     6          \n"
643         "shra.ph          %[tmp_t8], %[tmp_t8],     6          \n"
644         "shra.ph          %[tmp_t9], %[tmp_t9],     6          \n"
645         "shll_s.ph        %[tmp_t7], %[tmp_t7],     7          \n"
646         "shll_s.ph        %[tmp_t8], %[tmp_t8],     7          \n"
647         "shll_s.ph        %[tmp_t9], %[tmp_t9],     7          \n"
648         "precrqu_s.qb.ph  %[tmp_t8], %[tmp_mask],   %[tmp_t8]  \n"
649         "precrqu_s.qb.ph  %[tmp_t7], %[tmp_t9],     %[tmp_t7]  \n"
650         "precrq.ph.w      %[tmp_t9], %[tmp_t8],     %[tmp_t7]  \n"
651         "ins              %[tmp_t7], %[tmp_t8],     16,     16 \n"
652         "precr.qb.ph      %[tmp_t8], %[tmp_t9],     %[tmp_t7]  \n"
653         "precrq.qb.ph     %[tmp_t7], %[tmp_t9],     %[tmp_t7]  \n"
654         "sw               %[tmp_t8], 0(%[rgb_buf])             \n"
655         "sw               %[tmp_t7], 4(%[rgb_buf])             \n"
656         ".set pop                                              \n"
657         : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
658           [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
659           [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
660           [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [tmp_t9] "=&r"(tmp_t9)
661         : [src_y] "r"(src_y), [src_u] "r"(src_u), [src_v] "r"(src_v),
662           [tmp_ub] "r"(tmp_ub), [tmp_ug] "r"(tmp_ug), [yg] "r"(yg),
663           [tmp_vg] "r"(tmp_vg), [tmp_vr] "r"(tmp_vr), [tmp_bb] "r"(tmp_bb),
664           [tmp_bg] "r"(tmp_bg), [tmp_br] "r"(tmp_br), [tmp_yg] "r"(tmp_yg),
665           [rgb_buf] "r"(rgb_buf), [tmp_mask] "r"(tmp_mask));
666     src_y += 2;
667     src_u += 1;
668     src_v += 1;
669     rgb_buf += 8;  // Advance 4 pixels.
670   }
671 }
672 
673 // Bilinear filter 8x2 -> 8x1
InterpolateRow_DSPR2(uint8 * dst_ptr,const uint8 * src_ptr,ptrdiff_t src_stride,int dst_width,int source_y_fraction)674 void InterpolateRow_DSPR2(uint8* dst_ptr,
675                           const uint8* src_ptr,
676                           ptrdiff_t src_stride,
677                           int dst_width,
678                           int source_y_fraction) {
679   int y0_fraction = 256 - source_y_fraction;
680   const uint8* src_ptr1 = src_ptr + src_stride;
681 
682   __asm__ __volatile__(
683       ".set push                                           \n"
684       ".set noreorder                                      \n"
685 
686       "replv.ph          $t0, %[y0_fraction]               \n"
687       "replv.ph          $t1, %[source_y_fraction]         \n"
688 
689       "1:                                                    \n"
690       "lw                $t2, 0(%[src_ptr])                \n"
691       "lw                $t3, 0(%[src_ptr1])               \n"
692       "lw                $t4, 4(%[src_ptr])                \n"
693       "lw                $t5, 4(%[src_ptr1])               \n"
694       "muleu_s.ph.qbl    $t6, $t2, $t0                     \n"
695       "muleu_s.ph.qbr    $t7, $t2, $t0                     \n"
696       "muleu_s.ph.qbl    $t8, $t3, $t1                     \n"
697       "muleu_s.ph.qbr    $t9, $t3, $t1                     \n"
698       "muleu_s.ph.qbl    $t2, $t4, $t0                     \n"
699       "muleu_s.ph.qbr    $t3, $t4, $t0                     \n"
700       "muleu_s.ph.qbl    $t4, $t5, $t1                     \n"
701       "muleu_s.ph.qbr    $t5, $t5, $t1                     \n"
702       "addq.ph           $t6, $t6, $t8                     \n"
703       "addq.ph           $t7, $t7, $t9                     \n"
704       "addq.ph           $t2, $t2, $t4                     \n"
705       "addq.ph           $t3, $t3, $t5                     \n"
706       "shra_r.ph         $t6, $t6, 8                       \n"
707       "shra_r.ph         $t7, $t7, 8                       \n"
708       "shra_r.ph         $t2, $t2, 8                       \n"
709       "shra_r.ph         $t3, $t3, 8                       \n"
710       "precr.qb.ph       $t6, $t6, $t7                     \n"
711       "precr.qb.ph       $t2, $t2, $t3                     \n"
712       "addiu             %[src_ptr], %[src_ptr], 8         \n"
713       "addiu             %[src_ptr1], %[src_ptr1], 8       \n"
714       "addiu             %[dst_width], %[dst_width], -8    \n"
715       "sw                $t6, 0(%[dst_ptr])                \n"
716       "sw                $t2, 4(%[dst_ptr])                \n"
717       "bgtz              %[dst_width], 1b                  \n"
718       " addiu            %[dst_ptr], %[dst_ptr], 8         \n"
719 
720       ".set pop                                            \n"
721       : [dst_ptr] "+r"(dst_ptr), [src_ptr1] "+r"(src_ptr1),
722         [src_ptr] "+r"(src_ptr), [dst_width] "+r"(dst_width)
723       : [source_y_fraction] "r"(source_y_fraction),
724         [y0_fraction] "r"(y0_fraction), [src_stride] "r"(src_stride)
725       : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9");
726 }
727 #include <stdio.h>
RGB24ToARGBRow_DSPR2(const uint8 * src_rgb24,uint8 * dst_argb,int width)728 void RGB24ToARGBRow_DSPR2(const uint8* src_rgb24, uint8* dst_argb, int width) {
729   int x;
730   uint32 tmp_mask = 0xff;
731   uint32 tmp_t1;
732   for (x = 0; x < (width - 1); ++x) {
733     __asm__ __volatile__(
734         ".set push                                                  \n"
735         ".set noreorder                                             \n"
736         "ulw             %[tmp_t1],    0(%[src_rgb24])              \n"
737         "addiu           %[dst_argb],  %[dst_argb],     4           \n"
738         "addiu           %[src_rgb24], %[src_rgb24],    3           \n"
739         "ins             %[tmp_t1],    %[tmp_mask],     24,    8    \n"
740         "sw              %[tmp_t1],    -4(%[dst_argb])              \n"
741         ".set pop                                                   \n"
742         : [src_rgb24] "+r"(src_rgb24), [dst_argb] "+r"(dst_argb),
743           [tmp_t1] "=&r"(tmp_t1)
744         : [tmp_mask] "r"(tmp_mask)
745         : "memory");
746   }
747   uint8 b = src_rgb24[0];
748   uint8 g = src_rgb24[1];
749   uint8 r = src_rgb24[2];
750   dst_argb[0] = b;
751   dst_argb[1] = g;
752   dst_argb[2] = r;
753   dst_argb[3] = 255u;
754 }
755 
RAWToARGBRow_DSPR2(const uint8 * src_raw,uint8 * dst_argb,int width)756 void RAWToARGBRow_DSPR2(const uint8* src_raw, uint8* dst_argb, int width) {
757   int x;
758   uint32 tmp_mask = 0xff;
759   uint32 tmp_t1, tmp_t2;
760   for (x = 0; x < (width - 1); ++x) {
761     __asm__ __volatile__(
762         ".set push                                               \n"
763         ".set noreorder                                          \n"
764         "ulw               %[tmp_t1],   0(%[src_raw])            \n"
765         "addiu             %[dst_argb], %[dst_argb],      4      \n"
766         "addiu             %[src_raw],  %[src_raw],       3      \n"
767         "srl               %[tmp_t2],   %[tmp_t1],        16     \n"
768         "ins               %[tmp_t1],   %[tmp_mask],      24, 8  \n"
769         "ins               %[tmp_t1],   %[tmp_t1],        16, 8  \n"
770         "ins               %[tmp_t1],   %[tmp_t2],        0,  8  \n"
771         "sw                %[tmp_t1],   -4(%[dst_argb])          \n"
772         ".set pop                                                \n"
773         : [src_raw] "+r"(src_raw), [dst_argb] "+r"(dst_argb),
774           [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2)
775         : [tmp_mask] "r"(tmp_mask)
776         : "memory");
777   }
778   uint8 r = src_raw[0];
779   uint8 g = src_raw[1];
780   uint8 b = src_raw[2];
781   dst_argb[0] = b;
782   dst_argb[1] = g;
783   dst_argb[2] = r;
784   dst_argb[3] = 255u;
785 }
786 
RGB565ToARGBRow_DSPR2(const uint8 * src_rgb565,uint8 * dst_argb,int width)787 void RGB565ToARGBRow_DSPR2(const uint8* src_rgb565,
788                            uint8* dst_argb,
789                            int width) {
790   int x;
791   uint32 tmp_mask = 0xff;
792   uint32 tmp_t1, tmp_t2, tmp_t3;
793   for (x = 0; x < width; ++x) {
794     __asm__ __volatile__(
795         ".set push                                                   \n"
796         ".set noreorder                                              \n"
797         "lhu               %[tmp_t1],     0(%[src_rgb565])           \n"
798         "addiu             %[dst_argb],   %[dst_argb],      4        \n"
799         "addiu             %[src_rgb565], %[src_rgb565],    2        \n"
800         "sll               %[tmp_t2],     %[tmp_t1],        8        \n"
801         "ins               %[tmp_t2],     %[tmp_mask],      24,8     \n"
802         "ins               %[tmp_t2],     %[tmp_t1],        3, 16    \n"
803         "ins               %[tmp_t2],     %[tmp_t1],        5, 11    \n"
804         "srl               %[tmp_t3],     %[tmp_t1],        9        \n"
805         "ins               %[tmp_t2],     %[tmp_t3],        8, 2     \n"
806         "ins               %[tmp_t2],     %[tmp_t1],        3, 5     \n"
807         "srl               %[tmp_t3],     %[tmp_t1],        2        \n"
808         "ins               %[tmp_t2],     %[tmp_t3],        0, 3     \n"
809         "sw                %[tmp_t2],     -4(%[dst_argb])            \n"
810         ".set pop                                                    \n"
811         : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
812           [tmp_t3] "=&r"(tmp_t3), [src_rgb565] "+r"(src_rgb565),
813           [dst_argb] "+r"(dst_argb)
814         : [tmp_mask] "r"(tmp_mask));
815   }
816 }
817 
ARGB1555ToARGBRow_DSPR2(const uint8 * src_argb1555,uint8 * dst_argb,int width)818 void ARGB1555ToARGBRow_DSPR2(const uint8* src_argb1555,
819                              uint8* dst_argb,
820                              int width) {
821   int x;
822   uint32 tmp_t1, tmp_t2, tmp_t3;
823   for (x = 0; x < width; ++x) {
824     __asm__ __volatile__(
825         ".set push                                                   \n"
826         ".set noreorder                                              \n"
827         "lh                %[tmp_t1],       0(%[src_argb1555])       \n"
828         "addiu             %[dst_argb],     %[dst_argb],      4      \n"
829         "addiu             %[src_argb1555], %[src_argb1555],  2      \n"
830         "sll               %[tmp_t2],       %[tmp_t1],        9      \n"
831         "ins               %[tmp_t2],       %[tmp_t1],        4, 15  \n"
832         "ins               %[tmp_t2],       %[tmp_t1],        6, 10  \n"
833         "srl               %[tmp_t3],       %[tmp_t1],        7      \n"
834         "ins               %[tmp_t2],       %[tmp_t3],        8, 3   \n"
835         "ins               %[tmp_t2],       %[tmp_t1],        3, 5   \n"
836         "srl               %[tmp_t3],       %[tmp_t1],        2      \n"
837         "ins               %[tmp_t2],       %[tmp_t3],        0, 3   \n"
838         "sw                %[tmp_t2],       -4(%[dst_argb])          \n"
839         ".set pop                                                    \n"
840         : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
841           [tmp_t3] "=&r"(tmp_t3), [src_argb1555] "+r"(src_argb1555),
842           [dst_argb] "+r"(dst_argb)
843         :);
844   }
845 }
846 
ARGB4444ToARGBRow_DSPR2(const uint8 * src_argb4444,uint8 * dst_argb,int width)847 void ARGB4444ToARGBRow_DSPR2(const uint8* src_argb4444,
848                              uint8* dst_argb,
849                              int width) {
850   int x;
851   uint32 tmp_t1;
852   for (x = 0; x < width; ++x) {
853     __asm__ __volatile__(
854         ".set push                                                    \n"
855         ".set noreorder                                               \n"
856         "lh                %[tmp_t1],       0(%[src_argb4444])        \n"
857         "addiu             %[dst_argb],     %[dst_argb],       4      \n"
858         "addiu             %[src_argb4444], %[src_argb4444],   2      \n"
859         "ins               %[tmp_t1],       %[tmp_t1],         16, 16 \n"
860         "ins               %[tmp_t1],       %[tmp_t1],         12, 16 \n"
861         "ins               %[tmp_t1],       %[tmp_t1],         8,  12 \n"
862         "ins               %[tmp_t1],       %[tmp_t1],         4,  8  \n"
863         "sw                %[tmp_t1],       -4(%[dst_argb])           \n"
864         ".set pop                                                     \n"
865         : [src_argb4444] "+r"(src_argb4444), [dst_argb] "+r"(dst_argb),
866           [tmp_t1] "=&r"(tmp_t1));
867   }
868 }
869 
I444ToARGBRow_DSPR2(const uint8 * y_buf,const uint8 * u_buf,const uint8 * v_buf,uint8 * rgb_buf,const struct YuvConstants * yuvconstants,int width)870 void I444ToARGBRow_DSPR2(const uint8* y_buf,
871                          const uint8* u_buf,
872                          const uint8* v_buf,
873                          uint8* rgb_buf,
874                          const struct YuvConstants* yuvconstants,
875                          int width) {
876   int x;
877   uint32 tmp_ub = yuvconstants->kUVToB[0];
878   uint32 tmp_ug = yuvconstants->kUVToG[0];
879   uint32 tmp_vg = yuvconstants->kUVToG[1];
880   uint32 tmp_vr = yuvconstants->kUVToR[1];
881   uint32 tmp_bb = yuvconstants->kUVBiasB[0];
882   uint32 tmp_bg = yuvconstants->kUVBiasG[0];
883   uint32 tmp_br = yuvconstants->kUVBiasR[0];
884   uint32 yg = yuvconstants->kYToRgb[0];
885   uint32 tmp_mask = 0x7fff7fff;
886   uint32 tmp_yg;
887 
888   tmp_bb = ((uint)(tmp_bb & 0xffff) << 16) | (tmp_bb & 0xffff);
889   tmp_bg = ((uint)(tmp_bg & 0xffff) << 16) | (tmp_bg & 0xffff);
890   tmp_br = ((uint)(tmp_br & 0xffff) << 16) | (tmp_br & 0xffff);
891   tmp_yg = ((uint)(yg & 0xffff) << 16) | (yg & 0xffff);
892   tmp_ub = ~(((uint)(tmp_ub & 0xffff) << 16) | (tmp_ub & 0xffff)) + 0x00010001;
893   tmp_ug = ((uint)(tmp_ug & 0xffff) << 16) | (tmp_ug & 0xffff);
894   tmp_vg = ((uint)(tmp_vg & 0xffff) << 16) | (tmp_vg & 0xffff);
895   tmp_vr = ~(((uint)(tmp_vr & 0xffff) << 16) | (tmp_vr & 0xffff)) + 0x00010001;
896   yg = yg * 0x0101;
897 
898   for (x = 0; x < width - 1; x += 2) {
899     uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
900     uint32 tmp_t6, tmp_t7, tmp_t8, tmp_t9;
901     __asm__ __volatile__(
902         ".set push                                              \n"
903         ".set noreorder                                         \n"
904         "lbu              %[tmp_t7], 0(%[y_buf])               \n"
905         "lbu              %[tmp_t1], 1(%[y_buf])               \n"
906         "mul              %[tmp_t7], %[tmp_t7],     %[yg]      \n"
907         "mul              %[tmp_t1], %[tmp_t1],     %[yg]      \n"
908         "lh               %[tmp_t2], 0(%[u_buf])               \n"
909         "lh               %[tmp_t3], 0(%[v_buf])               \n"
910         "preceu.ph.qbr    %[tmp_t2], %[tmp_t2]                 \n"
911         "preceu.ph.qbr    %[tmp_t3], %[tmp_t3]                 \n"
912         "mul.ph           %[tmp_t4], %[tmp_t2],     %[tmp_ub]  \n"
913         "mul.ph           %[tmp_t5], %[tmp_t2],     %[tmp_ug]  \n"
914         "mul.ph           %[tmp_t6], %[tmp_t3],     %[tmp_vr]  \n"
915         "mul.ph           %[tmp_t3], %[tmp_t3],     %[tmp_vg]  \n"
916         "srl              %[tmp_t7], %[tmp_t7],     16         \n"
917         "ins              %[tmp_t1], %[tmp_t7],     0,      16 \n"
918         "addq_s.ph        %[tmp_t7], %[tmp_t1],     %[tmp_bb]  \n"
919         "addq_s.ph        %[tmp_t8], %[tmp_t1],     %[tmp_bg]  \n"
920         "addq_s.ph        %[tmp_t9], %[tmp_t1],     %[tmp_br]  \n"
921         "addq_s.ph        %[tmp_t5], %[tmp_t5],     %[tmp_t3]  \n"
922         "addq_s.ph        %[tmp_t7], %[tmp_t7],     %[tmp_t4]  \n"
923         "subq_s.ph        %[tmp_t8], %[tmp_t8],     %[tmp_t5]  \n"
924         "addq_s.ph        %[tmp_t9], %[tmp_t9],     %[tmp_t6]  \n"
925         "shra.ph          %[tmp_t7], %[tmp_t7],     6          \n"
926         "shra.ph          %[tmp_t8], %[tmp_t8],     6          \n"
927         "shra.ph          %[tmp_t9], %[tmp_t9],     6          \n"
928         "shll_s.ph        %[tmp_t7], %[tmp_t7],     7          \n"
929         "shll_s.ph        %[tmp_t8], %[tmp_t8],     7          \n"
930         "shll_s.ph        %[tmp_t9], %[tmp_t9],     7          \n"
931         "precrqu_s.qb.ph  %[tmp_t8], %[tmp_mask],   %[tmp_t8]  \n"
932         "precrqu_s.qb.ph  %[tmp_t7], %[tmp_t9],     %[tmp_t7]  \n"
933         "precrq.ph.w      %[tmp_t2], %[tmp_t8],     %[tmp_t7]  \n"
934         "ins              %[tmp_t7], %[tmp_t8],     16,     16 \n"
935         "precr.qb.ph      %[tmp_t8], %[tmp_t2],     %[tmp_t7]  \n"
936         "precrq.qb.ph     %[tmp_t7], %[tmp_t2],     %[tmp_t7]  \n"
937         "sw               %[tmp_t8], 0(%[rgb_buf])             \n"
938         "sw               %[tmp_t7], 4(%[rgb_buf])             \n"
939         ".set pop                                              \n"
940         : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
941           [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
942           [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
943           [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [tmp_t9] "=&r"(tmp_t9)
944         : [y_buf] "r"(y_buf), [yg] "r"(yg), [u_buf] "r"(u_buf),
945           [v_buf] "r"(v_buf), [tmp_ub] "r"(tmp_ub), [tmp_ug] "r"(tmp_ug),
946           [tmp_vg] "r"(tmp_vg), [tmp_vr] "r"(tmp_vr), [tmp_bb] "r"(tmp_bb),
947           [tmp_bg] "r"(tmp_bg), [tmp_br] "r"(tmp_br), [tmp_yg] "r"(tmp_yg),
948           [rgb_buf] "r"(rgb_buf), [tmp_mask] "r"(tmp_mask));
949     y_buf += 2;
950     u_buf += 2;
951     v_buf += 2;
952     rgb_buf += 8;  // Advance 1 pixel.
953   }
954 }
955 
I422ToARGB4444Row_DSPR2(const uint8 * src_y,const uint8 * src_u,const uint8 * src_v,uint8 * dst_argb4444,const struct YuvConstants * yuvconstants,int width)956 void I422ToARGB4444Row_DSPR2(const uint8* src_y,
957                              const uint8* src_u,
958                              const uint8* src_v,
959                              uint8* dst_argb4444,
960                              const struct YuvConstants* yuvconstants,
961                              int width) {
962   int x;
963   uint32 tmp_ub = yuvconstants->kUVToB[0];
964   uint32 tmp_ug = yuvconstants->kUVToG[0];
965   uint32 tmp_vg = yuvconstants->kUVToG[1];
966   uint32 tmp_vr = yuvconstants->kUVToR[1];
967   uint32 tmp_bb = yuvconstants->kUVBiasB[0];
968   uint32 tmp_bg = yuvconstants->kUVBiasG[0];
969   uint32 tmp_br = yuvconstants->kUVBiasR[0];
970   uint32 yg = yuvconstants->kYToRgb[0];
971   uint32 tmp_yg;
972   uint32 tmp_mask = 0x7fff7fff;
973   tmp_bb = ((uint)(tmp_bb & 0xffff) << 16) | (tmp_bb & 0xffff);
974   tmp_bg = ((uint)(tmp_bg & 0xffff) << 16) | (tmp_bg & 0xffff);
975   tmp_br = ((uint)(tmp_br & 0xffff) << 16) | (tmp_br & 0xffff);
976   tmp_yg = ((uint)(yg & 0xffff) << 16) | (yg & 0xffff);
977   tmp_ub = ~(((uint)(tmp_ub & 0xffff) << 16) | (tmp_ub & 0xffff)) + 0x00010001;
978   tmp_ug = ((uint)(tmp_ug & 0xffff) << 16) | (tmp_ug & 0xffff);
979   tmp_vg = ((uint)(tmp_vg & 0xffff) << 16) | (tmp_vg & 0xffff);
980   tmp_vr = ~(((uint)(tmp_vr & 0xffff) << 16) | (tmp_vr & 0xffff)) + 0x00010001;
981   yg = yg * 0x0101;
982 
983   for (x = 0; x < width - 1; x += 2) {
984     uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
985     uint32 tmp_t6, tmp_t7, tmp_t8, tmp_t9;
986     __asm__ __volatile__(
987         ".set push                                             \n"
988         ".set noreorder                                        \n"
989         "lbu              %[tmp_t7], 0(%[src_y])               \n"
990         "lbu              %[tmp_t1], 1(%[src_y])               \n"
991         "mul              %[tmp_t7], %[tmp_t7],     %[yg]      \n"
992         "mul              %[tmp_t1], %[tmp_t1],     %[yg]      \n"
993         "lbu              %[tmp_t2], 0(%[src_u])               \n"
994         "lbu              %[tmp_t3], 0(%[src_v])               \n"
995         "replv.ph         %[tmp_t2], %[tmp_t2]                 \n"
996         "replv.ph         %[tmp_t3], %[tmp_t3]                 \n"
997         "mul.ph           %[tmp_t4], %[tmp_t2],     %[tmp_ub]  \n"
998         "mul.ph           %[tmp_t5], %[tmp_t2],     %[tmp_ug]  \n"
999         "mul.ph           %[tmp_t6], %[tmp_t3],     %[tmp_vr]  \n"
1000         "mul.ph           %[tmp_t3], %[tmp_t3],     %[tmp_vg]  \n"
1001         "srl              %[tmp_t7], %[tmp_t7],     16         \n"
1002         "ins              %[tmp_t1], %[tmp_t7],     0,      16 \n"
1003         "addq_s.ph        %[tmp_t7], %[tmp_t1],     %[tmp_bb]  \n"
1004         "addq_s.ph        %[tmp_t8], %[tmp_t1],     %[tmp_bg]  \n"
1005         "addq_s.ph        %[tmp_t9], %[tmp_t1],     %[tmp_br]  \n"
1006         "addq_s.ph        %[tmp_t5], %[tmp_t5],     %[tmp_t3]  \n"
1007         "addq_s.ph        %[tmp_t7], %[tmp_t7],     %[tmp_t4]  \n"
1008         "subq_s.ph        %[tmp_t8], %[tmp_t8],     %[tmp_t5]  \n"
1009         "addq_s.ph        %[tmp_t9], %[tmp_t9],     %[tmp_t6]  \n"
1010         "shra.ph          %[tmp_t7], %[tmp_t7],     6          \n"
1011         "shra.ph          %[tmp_t8], %[tmp_t8],     6          \n"
1012         "shra.ph          %[tmp_t9], %[tmp_t9],     6          \n"
1013         "shll_s.ph        %[tmp_t7], %[tmp_t7],     7          \n"
1014         "shll_s.ph        %[tmp_t8], %[tmp_t8],     7          \n"
1015         "shll_s.ph        %[tmp_t9], %[tmp_t9],     7          \n"
1016         "precrqu_s.qb.ph  %[tmp_t8], %[tmp_mask],   %[tmp_t8]  \n"
1017         "precrqu_s.qb.ph  %[tmp_t7], %[tmp_t9],     %[tmp_t7]  \n"
1018         "precrq.ph.w      %[tmp_t2], %[tmp_t8],     %[tmp_t7]  \n"
1019         "ins              %[tmp_t7], %[tmp_t8],     16,     16 \n"
1020         "precr.qb.ph      %[tmp_t8], %[tmp_t2],     %[tmp_t7]  \n"
1021         "precrq.qb.ph     %[tmp_t7], %[tmp_t2],     %[tmp_t7]  \n"
1022         "shrl.qb          %[tmp_t1], %[tmp_t8],     4          \n"
1023         "shrl.qb          %[tmp_t2], %[tmp_t7],     4          \n"
1024         "shrl.ph          %[tmp_t8], %[tmp_t1],     4          \n"
1025         "shrl.ph          %[tmp_t7], %[tmp_t2],     4          \n"
1026         "or               %[tmp_t8], %[tmp_t8],     %[tmp_t1]  \n"
1027         "or               %[tmp_t7], %[tmp_t7],     %[tmp_t2]  \n"
1028         "precr.qb.ph      %[tmp_t8], %[tmp_t7],     %[tmp_t8]  \n"
1029         "sw               %[tmp_t8], 0(%[dst_argb4444])        \n"
1030         ".set pop                                              \n"
1031         : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
1032           [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
1033           [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
1034           [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [tmp_t9] "=&r"(tmp_t9)
1035         : [dst_argb4444] "r"(dst_argb4444), [yg] "r"(yg), [src_u] "r"(src_u),
1036           [src_v] "r"(src_v), [src_y] "r"(src_y), [tmp_ub] "r"(tmp_ub),
1037           [tmp_ug] "r"(tmp_ug), [tmp_vg] "r"(tmp_vg), [tmp_vr] "r"(tmp_vr),
1038           [tmp_bb] "r"(tmp_bb), [tmp_bg] "r"(tmp_bg), [tmp_br] "r"(tmp_br),
1039           [tmp_yg] "r"(tmp_yg), [tmp_mask] "r"(tmp_mask));
1040     src_y += 2;
1041     src_u += 1;
1042     src_v += 1;
1043     dst_argb4444 += 4;  // Advance 2 pixels.
1044   }
1045 }
1046 
I422ToARGB1555Row_DSPR2(const uint8 * src_y,const uint8 * src_u,const uint8 * src_v,uint8 * dst_argb1555,const struct YuvConstants * yuvconstants,int width)1047 void I422ToARGB1555Row_DSPR2(const uint8* src_y,
1048                              const uint8* src_u,
1049                              const uint8* src_v,
1050                              uint8* dst_argb1555,
1051                              const struct YuvConstants* yuvconstants,
1052                              int width) {
1053   int x;
1054   uint32 tmp_ub = yuvconstants->kUVToB[0];
1055   uint32 tmp_ug = yuvconstants->kUVToG[0];
1056   uint32 tmp_vg = yuvconstants->kUVToG[1];
1057   uint32 tmp_vr = yuvconstants->kUVToR[1];
1058   uint32 tmp_bb = yuvconstants->kUVBiasB[0];
1059   uint32 tmp_bg = yuvconstants->kUVBiasG[0];
1060   uint32 tmp_br = yuvconstants->kUVBiasR[0];
1061   uint32 yg = yuvconstants->kYToRgb[0];
1062   uint32 tmp_yg;
1063   uint32 tmp_mask = 0x80008000;
1064   tmp_bb = ((uint)(tmp_bb & 0xffff) << 16) | (tmp_bb & 0xffff);
1065   tmp_bg = ((uint)(tmp_bg & 0xffff) << 16) | (tmp_bg & 0xffff);
1066   tmp_br = ((uint)(tmp_br & 0xffff) << 16) | (tmp_br & 0xffff);
1067   tmp_yg = ((uint)(yg & 0xffff) << 16) | (yg & 0xffff);
1068   tmp_ub = ~(((uint)(tmp_ub & 0xffff) << 16) | (tmp_ub & 0xffff)) + 0x00010001;
1069   tmp_ug = ((uint)(tmp_ug & 0xffff) << 16) | (tmp_ug & 0xffff);
1070   tmp_vg = ((uint)(tmp_vg & 0xffff) << 16) | (tmp_vg & 0xffff);
1071   tmp_vr = ~(((uint)(tmp_vr & 0xffff) << 16) | (tmp_vr & 0xffff)) + 0x00010001;
1072   yg = yg * 0x0101;
1073 
1074   for (x = 0; x < width - 1; x += 2) {
1075     uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
1076     uint32 tmp_t6, tmp_t7, tmp_t8, tmp_t9;
1077     __asm__ __volatile__(
1078         ".set push                                             \n"
1079         ".set noreorder                                        \n"
1080         "lbu              %[tmp_t7], 0(%[src_y])               \n"
1081         "lbu              %[tmp_t1], 1(%[src_y])               \n"
1082         "mul              %[tmp_t7], %[tmp_t7],     %[yg]      \n"
1083         "mul              %[tmp_t1], %[tmp_t1],     %[yg]      \n"
1084         "lbu              %[tmp_t2], 0(%[src_u])               \n"
1085         "lbu              %[tmp_t3], 0(%[src_v])               \n"
1086         "replv.ph         %[tmp_t2], %[tmp_t2]                 \n"
1087         "replv.ph         %[tmp_t3], %[tmp_t3]                 \n"
1088         "mul.ph           %[tmp_t4], %[tmp_t2],     %[tmp_ub]  \n"
1089         "mul.ph           %[tmp_t5], %[tmp_t2],     %[tmp_ug]  \n"
1090         "mul.ph           %[tmp_t6], %[tmp_t3],     %[tmp_vr]  \n"
1091         "mul.ph           %[tmp_t3], %[tmp_t3],     %[tmp_vg]  \n"
1092         "srl              %[tmp_t7], %[tmp_t7],     16         \n"
1093         "ins              %[tmp_t1], %[tmp_t7],     0,      16 \n"
1094         "addq_s.ph        %[tmp_t7], %[tmp_t1],     %[tmp_bb]  \n"
1095         "addq_s.ph        %[tmp_t8], %[tmp_t1],     %[tmp_bg]  \n"
1096         "addq_s.ph        %[tmp_t9], %[tmp_t1],     %[tmp_br]  \n"
1097         "addq_s.ph        %[tmp_t5], %[tmp_t5],     %[tmp_t3]  \n"
1098         "addq_s.ph        %[tmp_t7], %[tmp_t7],     %[tmp_t4]  \n"
1099         "subq_s.ph        %[tmp_t8], %[tmp_t8],     %[tmp_t5]  \n"
1100         "addq_s.ph        %[tmp_t9], %[tmp_t9],     %[tmp_t6]  \n"
1101         "shra.ph          %[tmp_t7], %[tmp_t7],     6          \n"
1102         "shra.ph          %[tmp_t8], %[tmp_t8],     6          \n"
1103         "shra.ph          %[tmp_t9], %[tmp_t9],     6          \n"
1104         "shll_s.ph        %[tmp_t7], %[tmp_t7],     7          \n"
1105         "shll_s.ph        %[tmp_t8], %[tmp_t8],     7          \n"
1106         "shll_s.ph        %[tmp_t9], %[tmp_t9],     7          \n"
1107         "precrqu_s.qb.ph  %[tmp_t8], %[tmp_mask],   %[tmp_t8]  \n"
1108         "precrqu_s.qb.ph  %[tmp_t7], %[tmp_t9],     %[tmp_t7]  \n"
1109         "precrq.ph.w      %[tmp_t2], %[tmp_t8],     %[tmp_t7]  \n"
1110         "ins              %[tmp_t7], %[tmp_t8],     16,     16 \n"
1111         "precr.qb.ph      %[tmp_t8], %[tmp_t2],     %[tmp_t7]  \n"
1112         "precrq.qb.ph     %[tmp_t7], %[tmp_t2],     %[tmp_t7]  \n"
1113         "ins              %[tmp_t3], %[tmp_t8],     7,      24 \n"
1114         "ins              %[tmp_t3], %[tmp_t8],     10,     16 \n"
1115         "ins              %[tmp_t3], %[tmp_t8],     13,     8  \n"
1116         "ins              %[tmp_t4], %[tmp_t7],     7,      24 \n"
1117         "ins              %[tmp_t4], %[tmp_t7],     10,     16 \n"
1118         "ins              %[tmp_t4], %[tmp_t7],     13,     8  \n"
1119         "precrq.ph.w      %[tmp_t8], %[tmp_t4],     %[tmp_t3]  \n"
1120         "or               %[tmp_t8], %[tmp_t8],     %[tmp_mask]\n"
1121         "sw               %[tmp_t8], 0(%[dst_argb1555])        \n"
1122         ".set pop                                              \n"
1123         : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
1124           [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
1125           [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
1126           [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [tmp_t9] "=&r"(tmp_t9)
1127         : [dst_argb1555] "r"(dst_argb1555), [yg] "r"(yg), [src_u] "r"(src_u),
1128           [src_v] "r"(src_v), [src_y] "r"(src_y), [tmp_ub] "r"(tmp_ub),
1129           [tmp_ug] "r"(tmp_ug), [tmp_vg] "r"(tmp_vg), [tmp_vr] "r"(tmp_vr),
1130           [tmp_bb] "r"(tmp_bb), [tmp_bg] "r"(tmp_bg), [tmp_br] "r"(tmp_br),
1131           [tmp_yg] "r"(tmp_yg), [tmp_mask] "r"(tmp_mask));
1132     src_y += 2;
1133     src_u += 1;
1134     src_v += 1;
1135     dst_argb1555 += 4;  // Advance 2 pixels.
1136   }
1137 }
1138 
NV12ToARGBRow_DSPR2(const uint8 * src_y,const uint8 * src_uv,uint8 * rgb_buf,const struct YuvConstants * yuvconstants,int width)1139 void NV12ToARGBRow_DSPR2(const uint8* src_y,
1140                          const uint8* src_uv,
1141                          uint8* rgb_buf,
1142                          const struct YuvConstants* yuvconstants,
1143                          int width) {
1144   int x;
1145   uint32 tmp_ub = yuvconstants->kUVToB[0];
1146   uint32 tmp_ug = yuvconstants->kUVToG[0];
1147   uint32 tmp_vg = yuvconstants->kUVToG[1];
1148   uint32 tmp_vr = yuvconstants->kUVToR[1];
1149   uint32 tmp_bb = yuvconstants->kUVBiasB[0];
1150   uint32 tmp_bg = yuvconstants->kUVBiasG[0];
1151   uint32 tmp_br = yuvconstants->kUVBiasR[0];
1152   uint32 yg = yuvconstants->kYToRgb[0];
1153   uint32 tmp_mask = 0x7fff7fff;
1154   uint32 tmp_yg;
1155   tmp_bb = ((uint)(tmp_bb & 0xffff) << 16) | (tmp_bb & 0xffff);
1156   tmp_bg = ((uint)(tmp_bg & 0xffff) << 16) | (tmp_bg & 0xffff);
1157   tmp_br = ((uint)(tmp_br & 0xffff) << 16) | (tmp_br & 0xffff);
1158   tmp_yg = ((uint)(yg & 0xffff) << 16) | (yg & 0xffff);
1159   tmp_ub = ~(((uint)(tmp_ub & 0xffff) << 16) | (tmp_ub & 0xffff)) + 0x00010001;
1160   tmp_ug = ((uint)(tmp_ug & 0xffff) << 16) | (tmp_ug & 0xffff);
1161   tmp_vg = ((uint)(tmp_vg & 0xffff) << 16) | (tmp_vg & 0xffff);
1162   tmp_vr = ~(((uint)(tmp_vr & 0xffff) << 16) | (tmp_vr & 0xffff)) + 0x00010001;
1163   yg = yg * 0x0101;
1164 
1165   for (x = 0; x < width - 1; x += 2) {
1166     uint32 tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
1167     uint32 tmp_t6, tmp_t7, tmp_t8, tmp_t9;
1168     __asm__ __volatile__(
1169         ".set push                                             \n"
1170         ".set noreorder                                        \n"
1171         "lbu              %[tmp_t7], 0(%[src_y])               \n"
1172         "lbu              %[tmp_t1], 1(%[src_y])               \n"
1173         "mul              %[tmp_t7], %[tmp_t7],     %[yg]      \n"
1174         "mul              %[tmp_t1], %[tmp_t1],     %[yg]      \n"
1175         "lbu              %[tmp_t2], 0(%[src_uv])              \n"
1176         "lbu              %[tmp_t3], 1(%[src_uv])              \n"
1177         "replv.ph         %[tmp_t2], %[tmp_t2]                 \n"
1178         "replv.ph         %[tmp_t3], %[tmp_t3]                 \n"
1179         "mul.ph           %[tmp_t4], %[tmp_t2],     %[tmp_ub]  \n"
1180         "mul.ph           %[tmp_t5], %[tmp_t2],     %[tmp_ug]  \n"
1181         "mul.ph           %[tmp_t6], %[tmp_t3],     %[tmp_vr]  \n"
1182         "mul.ph           %[tmp_t3], %[tmp_t3],     %[tmp_vg]  \n"
1183         "srl              %[tmp_t7], %[tmp_t7],     16         \n"
1184         "ins              %[tmp_t1], %[tmp_t7],     0,      16 \n"
1185         "addq_s.ph        %[tmp_t7], %[tmp_t1],     %[tmp_bb]  \n"
1186         "addq_s.ph        %[tmp_t8], %[tmp_t1],     %[tmp_bg]  \n"
1187         "addq_s.ph        %[tmp_t9], %[tmp_t1],     %[tmp_br]  \n"
1188         "addq_s.ph        %[tmp_t5], %[tmp_t5],     %[tmp_t3]  \n"
1189         "addq_s.ph        %[tmp_t7], %[tmp_t7],     %[tmp_t4]  \n"
1190         "subq_s.ph        %[tmp_t8], %[tmp_t8],     %[tmp_t5]  \n"
1191         "addq_s.ph        %[tmp_t9], %[tmp_t9],     %[tmp_t6]  \n"
1192         "shra.ph          %[tmp_t7], %[tmp_t7],     6          \n"
1193         "shra.ph          %[tmp_t8], %[tmp_t8],     6          \n"
1194         "shra.ph          %[tmp_t9], %[tmp_t9],     6          \n"
1195         "shll_s.ph        %[tmp_t7], %[tmp_t7],     7          \n"
1196         "shll_s.ph        %[tmp_t8], %[tmp_t8],     7          \n"
1197         "shll_s.ph        %[tmp_t9], %[tmp_t9],     7          \n"
1198         "precrqu_s.qb.ph  %[tmp_t8], %[tmp_mask],   %[tmp_t8]  \n"
1199         "precrqu_s.qb.ph  %[tmp_t7], %[tmp_t9],     %[tmp_t7]  \n"
1200         "precrq.ph.w      %[tmp_t2], %[tmp_t8],     %[tmp_t7]  \n"
1201         "ins              %[tmp_t7], %[tmp_t8],     16,     16 \n"
1202         "precr.qb.ph      %[tmp_t8], %[tmp_t2],     %[tmp_t7]  \n"
1203         "precrq.qb.ph     %[tmp_t7], %[tmp_t2],     %[tmp_t7]  \n"
1204         "sw               %[tmp_t8], 0(%[rgb_buf])             \n"
1205         "sw               %[tmp_t7], 4(%[rgb_buf])             \n"
1206         ".set pop                                              \n"
1207         : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
1208           [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
1209           [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
1210           [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8), [tmp_t9] "=&r"(tmp_t9)
1211         : [src_y] "r"(src_y), [src_uv] "r"(src_uv), [yg] "r"(yg),
1212           [tmp_ub] "r"(tmp_ub), [tmp_ug] "r"(tmp_ug), [tmp_vg] "r"(tmp_vg),
1213           [tmp_vr] "r"(tmp_vr), [tmp_bb] "r"(tmp_bb), [tmp_bg] "r"(tmp_bg),
1214           [tmp_br] "r"(tmp_br), [tmp_yg] "r"(tmp_yg), [rgb_buf] "r"(rgb_buf),
1215           [tmp_mask] "r"(tmp_mask));
1216 
1217     src_y += 2;
1218     src_uv += 2;
1219     rgb_buf += 8;  // Advance 2 pixels.
1220   }
1221 }
1222 
BGRAToUVRow_DSPR2(const uint8 * src_rgb0,int src_stride_rgb,uint8 * dst_u,uint8 * dst_v,int width)1223 void BGRAToUVRow_DSPR2(const uint8* src_rgb0,
1224                        int src_stride_rgb,
1225                        uint8* dst_u,
1226                        uint8* dst_v,
1227                        int width) {
1228   const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;
1229   int x;
1230   int const1 = 0xffda0000;
1231   int const2 = 0x0070ffb6;
1232   int const3 = 0x00700000;
1233   int const4 = 0xffeeffa2;
1234   int const5 = 0x100;
1235   for (x = 0; x < width - 1; x += 2) {
1236     int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
1237     int tmp_t6, tmp_t7, tmp_t8;
1238     __asm__ __volatile__(
1239         ".set push                                                 \n"
1240         ".set noreorder                                            \n"
1241         "lw                %[tmp_t1],   0(%[src_rgb0])             \n"
1242         "lw                %[tmp_t2],   4(%[src_rgb0])             \n"
1243         "lw                %[tmp_t3],   0(%[src_rgb1])             \n"
1244         "lw                %[tmp_t4],   4(%[src_rgb1])             \n"
1245         "preceu.ph.qbr     %[tmp_t5],   %[tmp_t1]                  \n"
1246         "preceu.ph.qbl     %[tmp_t1],   %[tmp_t1]                  \n"
1247         "preceu.ph.qbr     %[tmp_t6],   %[tmp_t2]                  \n"
1248         "preceu.ph.qbl     %[tmp_t2],   %[tmp_t2]                  \n"
1249         "preceu.ph.qbr     %[tmp_t7],   %[tmp_t3]                  \n"
1250         "preceu.ph.qbl     %[tmp_t3],   %[tmp_t3]                  \n"
1251         "preceu.ph.qbr     %[tmp_t8],   %[tmp_t4]                  \n"
1252         "preceu.ph.qbl     %[tmp_t4],   %[tmp_t4]                  \n"
1253         "addu.ph           %[tmp_t5],   %[tmp_t5],     %[tmp_t6]   \n"
1254         "addu.ph           %[tmp_t7],   %[tmp_t7],     %[tmp_t8]   \n"
1255         "addu.ph           %[tmp_t1],   %[tmp_t1],     %[tmp_t2]   \n"
1256         "addu.ph           %[tmp_t3],   %[tmp_t3],     %[tmp_t4]   \n"
1257         "addu.ph           %[tmp_t5],   %[tmp_t5],     %[tmp_t7]   \n"
1258         "addu.ph           %[tmp_t1],   %[tmp_t1],     %[tmp_t3]   \n"
1259         "shrl.ph           %[tmp_t5],   %[tmp_t5],     2           \n"
1260         "shrl.ph           %[tmp_t1],   %[tmp_t1],     2           \n"
1261         "mult              $ac0,        %[const5],     %[const5]   \n"
1262         "mult              $ac1,        %[const5],     %[const5]   \n"
1263         "dpaq_s.w.ph       $ac0,        %[tmp_t5],     %[const1]   \n"
1264         "dpaq_s.w.ph       $ac1,        %[tmp_t5],     %[const3]   \n"
1265         "dpaq_s.w.ph       $ac0,        %[tmp_t1],     %[const2]   \n"
1266         "dpaq_s.w.ph       $ac1,        %[tmp_t1],     %[const4]   \n"
1267         "extr_r.w          %[tmp_t7],   $ac0,          9           \n"
1268         "extr_r.w          %[tmp_t8],   $ac1,          9           \n"
1269         "addiu             %[dst_u],    %[dst_u],    1             \n"
1270         "addiu             %[dst_v],    %[dst_v],    1             \n"
1271         "addiu             %[src_rgb0], %[src_rgb0], 8             \n"
1272         "addiu             %[src_rgb1], %[src_rgb1], 8             \n"
1273         "sb                %[tmp_t7],   -1(%[dst_u])               \n"
1274         "sb                %[tmp_t8],   -1(%[dst_v])               \n"
1275         ".set pop                                                  \n"
1276         : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
1277           [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
1278           [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
1279           [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
1280           [src_rgb0] "+r"(src_rgb0), [src_rgb1] "+r"(src_rgb1),
1281           [dst_u] "+r"(dst_u), [dst_v] "+r"(dst_v)
1282         : [const1] "r"(const1), [const2] "r"(const2), [const3] "r"(const3),
1283           [const4] "r"(const4), [const5] "r"(const5)
1284         : "hi", "lo", "$ac1lo", "$ac1hi");
1285   }
1286 }
1287 
BGRAToYRow_DSPR2(const uint8 * src_argb0,uint8 * dst_y,int width)1288 void BGRAToYRow_DSPR2(const uint8* src_argb0, uint8* dst_y, int width) {
1289   int x;
1290   int const1 = 0x00420000;
1291   int const2 = 0x00190081;
1292   int const5 = 0x40;
1293   for (x = 0; x < width; x += 4) {
1294     int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
1295     int tmp_t6, tmp_t7, tmp_t8;
1296     __asm__ __volatile__(
1297         ".set push                                                \n"
1298         ".set noreorder                                           \n"
1299         "lw                %[tmp_t1],   0(%[src_argb0])           \n"
1300         "lw                %[tmp_t2],   4(%[src_argb0])           \n"
1301         "lw                %[tmp_t3],   8(%[src_argb0])           \n"
1302         "lw                %[tmp_t4],   12(%[src_argb0])          \n"
1303         "preceu.ph.qbr     %[tmp_t5],   %[tmp_t1]                 \n"
1304         "preceu.ph.qbl     %[tmp_t1],   %[tmp_t1]                 \n"
1305         "preceu.ph.qbr     %[tmp_t6],   %[tmp_t2]                 \n"
1306         "preceu.ph.qbl     %[tmp_t2],   %[tmp_t2]                 \n"
1307         "preceu.ph.qbr     %[tmp_t7],   %[tmp_t3]                 \n"
1308         "preceu.ph.qbl     %[tmp_t3],   %[tmp_t3]                 \n"
1309         "preceu.ph.qbr     %[tmp_t8],   %[tmp_t4]                 \n"
1310         "preceu.ph.qbl     %[tmp_t4],   %[tmp_t4]                 \n"
1311         "mult              $ac0,        %[const5],     %[const5]  \n"
1312         "mult              $ac1,        %[const5],     %[const5]  \n"
1313         "mult              $ac2,        %[const5],     %[const5]  \n"
1314         "mult              $ac3,        %[const5],     %[const5]  \n"
1315         "dpa.w.ph          $ac0,        %[tmp_t5],     %[const1]  \n"
1316         "dpa.w.ph          $ac1,        %[tmp_t6],     %[const1]  \n"
1317         "dpa.w.ph          $ac2,        %[tmp_t7],     %[const1]  \n"
1318         "dpa.w.ph          $ac3,        %[tmp_t8],     %[const1]  \n"
1319         "dpa.w.ph          $ac0,        %[tmp_t1],     %[const2]  \n"
1320         "dpa.w.ph          $ac1,        %[tmp_t2],     %[const2]  \n"
1321         "dpa.w.ph          $ac2,        %[tmp_t3],     %[const2]  \n"
1322         "dpa.w.ph          $ac3,        %[tmp_t4],     %[const2]  \n"
1323         "extr_r.w          %[tmp_t1],   $ac0,          8          \n"
1324         "extr_r.w          %[tmp_t2],   $ac1,          8          \n"
1325         "extr_r.w          %[tmp_t3],   $ac2,          8          \n"
1326         "extr_r.w          %[tmp_t4],   $ac3,          8          \n"
1327         "addiu             %[src_argb0],%[src_argb0],  16         \n"
1328         "addiu             %[dst_y],    %[dst_y],      4          \n"
1329         "sb                %[tmp_t1],   -4(%[dst_y])              \n"
1330         "sb                %[tmp_t2],   -3(%[dst_y])              \n"
1331         "sb                %[tmp_t3],   -2(%[dst_y])              \n"
1332         "sb                %[tmp_t4],   -1(%[dst_y])              \n"
1333         ".set pop                                                 \n"
1334         : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
1335           [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
1336           [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
1337           [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
1338           [src_argb0] "+r"(src_argb0), [dst_y] "+r"(dst_y)
1339         : [const1] "r"(const1), [const2] "r"(const2), [const5] "r"(const5)
1340         : "hi", "lo", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo",
1341           "$ac3hi");
1342   }
1343 }
1344 
ABGRToUVRow_DSPR2(const uint8 * src_rgb0,int src_stride_rgb,uint8 * dst_u,uint8 * dst_v,int width)1345 void ABGRToUVRow_DSPR2(const uint8* src_rgb0,
1346                        int src_stride_rgb,
1347                        uint8* dst_u,
1348                        uint8* dst_v,
1349                        int width) {
1350   const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;
1351   int x;
1352   int const1 = 0xffb6ffda;
1353   int const2 = 0x00000070;
1354   int const3 = 0xffa20070;
1355   int const4 = 0x0000ffee;
1356   int const5 = 0x100;
1357 
1358   for (x = 0; x < width - 1; x += 2) {
1359     int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
1360     int tmp_t6, tmp_t7, tmp_t8;
1361     __asm__ __volatile__(
1362         ".set push                                                \n"
1363         ".set noreorder                                           \n"
1364         "lw                %[tmp_t1],   0(%[src_rgb0])            \n"
1365         "lw                %[tmp_t2],   4(%[src_rgb0])            \n"
1366         "lw                %[tmp_t3],   0(%[src_rgb1])            \n"
1367         "lw                %[tmp_t4],   4(%[src_rgb1])            \n"
1368         "preceu.ph.qbr     %[tmp_t5],   %[tmp_t1]                 \n"
1369         "preceu.ph.qbl     %[tmp_t1],   %[tmp_t1]                 \n"
1370         "preceu.ph.qbr     %[tmp_t6],   %[tmp_t2]                 \n"
1371         "preceu.ph.qbl     %[tmp_t2],   %[tmp_t2]                 \n"
1372         "preceu.ph.qbr     %[tmp_t7],   %[tmp_t3]                 \n"
1373         "preceu.ph.qbl     %[tmp_t3],   %[tmp_t3]                 \n"
1374         "preceu.ph.qbr     %[tmp_t8],   %[tmp_t4]                 \n"
1375         "preceu.ph.qbl     %[tmp_t4],   %[tmp_t4]                 \n"
1376         "addu.ph           %[tmp_t5],   %[tmp_t5],     %[tmp_t6]  \n"
1377         "addu.ph           %[tmp_t7],   %[tmp_t7],     %[tmp_t8]  \n"
1378         "addu.ph           %[tmp_t1],   %[tmp_t1],     %[tmp_t2]  \n"
1379         "addu.ph           %[tmp_t3],   %[tmp_t3],     %[tmp_t4]  \n"
1380         "addu.ph           %[tmp_t5],   %[tmp_t5],     %[tmp_t7]  \n"
1381         "addu.ph           %[tmp_t1],   %[tmp_t1],     %[tmp_t3]  \n"
1382         "shrl.ph           %[tmp_t5],   %[tmp_t5],     2          \n"
1383         "shrl.ph           %[tmp_t1],   %[tmp_t1],     2          \n"
1384         "mult              $ac0,        %[const5],     %[const5]  \n"
1385         "mult              $ac1,        %[const5],     %[const5]  \n"
1386         "dpaq_s.w.ph       $ac0,        %[tmp_t5],     %[const1]  \n"
1387         "dpaq_s.w.ph       $ac1,        %[tmp_t5],     %[const3]  \n"
1388         "dpaq_s.w.ph       $ac0,        %[tmp_t1],     %[const2]  \n"
1389         "dpaq_s.w.ph       $ac1,        %[tmp_t1],     %[const4]  \n"
1390         "extr_r.w          %[tmp_t7],   $ac0,          9          \n"
1391         "extr_r.w          %[tmp_t8],   $ac1,          9          \n"
1392         "addiu             %[dst_u],    %[dst_u],    1            \n"
1393         "addiu             %[dst_v],    %[dst_v],    1            \n"
1394         "addiu             %[src_rgb0], %[src_rgb0], 8            \n"
1395         "addiu             %[src_rgb1], %[src_rgb1], 8            \n"
1396         "sb                %[tmp_t7],   -1(%[dst_u])              \n"
1397         "sb                %[tmp_t8],   -1(%[dst_v])              \n"
1398         ".set pop                                                 \n"
1399         : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
1400           [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
1401           [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
1402           [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
1403           [src_rgb0] "+r"(src_rgb0), [src_rgb1] "+r"(src_rgb1),
1404           [dst_u] "+r"(dst_u), [dst_v] "+r"(dst_v)
1405         : [const1] "r"(const1), [const2] "r"(const2), [const3] "r"(const3),
1406           [const4] "r"(const4), [const5] "r"(const5)
1407         : "hi", "lo", "$ac1lo", "$ac1hi");
1408   }
1409 }
1410 
ARGBToYRow_DSPR2(const uint8 * src_argb0,uint8 * dst_y,int width)1411 void ARGBToYRow_DSPR2(const uint8* src_argb0, uint8* dst_y, int width) {
1412   int x;
1413   int const1 = 0x00810019;
1414   int const2 = 0x00000042;
1415   int const5 = 0x40;
1416   for (x = 0; x < width; x += 4) {
1417     int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
1418     int tmp_t6, tmp_t7, tmp_t8;
1419     __asm__ __volatile__(
1420         ".set push                                                \n"
1421         ".set noreorder                                           \n"
1422         "lw                %[tmp_t1],   0(%[src_argb0])           \n"
1423         "lw                %[tmp_t2],   4(%[src_argb0])           \n"
1424         "lw                %[tmp_t3],   8(%[src_argb0])           \n"
1425         "lw                %[tmp_t4],   12(%[src_argb0])          \n"
1426         "preceu.ph.qbr     %[tmp_t5],   %[tmp_t1]                 \n"
1427         "preceu.ph.qbl     %[tmp_t1],   %[tmp_t1]                 \n"
1428         "preceu.ph.qbr     %[tmp_t6],   %[tmp_t2]                 \n"
1429         "preceu.ph.qbl     %[tmp_t2],   %[tmp_t2]                 \n"
1430         "preceu.ph.qbr     %[tmp_t7],   %[tmp_t3]                 \n"
1431         "preceu.ph.qbl     %[tmp_t3],   %[tmp_t3]                 \n"
1432         "preceu.ph.qbr     %[tmp_t8],   %[tmp_t4]                 \n"
1433         "preceu.ph.qbl     %[tmp_t4],   %[tmp_t4]                 \n"
1434         "mult              $ac0,        %[const5],     %[const5]  \n"
1435         "mult              $ac1,        %[const5],     %[const5]  \n"
1436         "mult              $ac2,        %[const5],     %[const5]  \n"
1437         "mult              $ac3,        %[const5],     %[const5]  \n"
1438         "dpa.w.ph          $ac0,        %[tmp_t5],     %[const1]  \n"
1439         "dpa.w.ph          $ac1,        %[tmp_t6],     %[const1]  \n"
1440         "dpa.w.ph          $ac2,        %[tmp_t7],     %[const1]  \n"
1441         "dpa.w.ph          $ac3,        %[tmp_t8],     %[const1]  \n"
1442         "dpa.w.ph          $ac0,        %[tmp_t1],     %[const2]  \n"
1443         "dpa.w.ph          $ac1,        %[tmp_t2],     %[const2]  \n"
1444         "dpa.w.ph          $ac2,        %[tmp_t3],     %[const2]  \n"
1445         "dpa.w.ph          $ac3,        %[tmp_t4],     %[const2]  \n"
1446         "extr_r.w          %[tmp_t1],   $ac0,          8          \n"
1447         "extr_r.w          %[tmp_t2],   $ac1,          8          \n"
1448         "extr_r.w          %[tmp_t3],   $ac2,          8          \n"
1449         "extr_r.w          %[tmp_t4],   $ac3,          8          \n"
1450         "addiu             %[dst_y],    %[dst_y],      4          \n"
1451         "addiu             %[src_argb0],%[src_argb0],  16         \n"
1452         "sb                %[tmp_t1],   -4(%[dst_y])              \n"
1453         "sb                %[tmp_t2],   -3(%[dst_y])              \n"
1454         "sb                %[tmp_t3],   -2(%[dst_y])              \n"
1455         "sb                %[tmp_t4],   -1(%[dst_y])              \n"
1456         ".set pop                                                 \n"
1457         : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
1458           [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
1459           [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
1460           [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
1461           [src_argb0] "+r"(src_argb0), [dst_y] "+r"(dst_y)
1462         : [const1] "r"(const1), [const2] "r"(const2), [const5] "r"(const5)
1463         : "hi", "lo", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo",
1464           "$ac3hi");
1465   }
1466 }
1467 
ABGRToYRow_DSPR2(const uint8 * src_argb0,uint8 * dst_y,int width)1468 void ABGRToYRow_DSPR2(const uint8* src_argb0, uint8* dst_y, int width) {
1469   int x;
1470   int const1 = 0x00810042;
1471   int const2 = 0x00000019;
1472   int const5 = 0x40;
1473   for (x = 0; x < width; x += 4) {
1474     int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
1475     int tmp_t6, tmp_t7, tmp_t8;
1476     __asm__ __volatile__(
1477         ".set push                                                \n"
1478         ".set noreorder                                           \n"
1479         "lw                %[tmp_t1],   0(%[src_argb0])           \n"
1480         "lw                %[tmp_t2],   4(%[src_argb0])           \n"
1481         "lw                %[tmp_t3],   8(%[src_argb0])           \n"
1482         "lw                %[tmp_t4],   12(%[src_argb0])          \n"
1483         "preceu.ph.qbr     %[tmp_t5],   %[tmp_t1]                 \n"
1484         "preceu.ph.qbl     %[tmp_t1],   %[tmp_t1]                 \n"
1485         "preceu.ph.qbr     %[tmp_t6],   %[tmp_t2]                 \n"
1486         "preceu.ph.qbl     %[tmp_t2],   %[tmp_t2]                 \n"
1487         "preceu.ph.qbr     %[tmp_t7],   %[tmp_t3]                 \n"
1488         "preceu.ph.qbl     %[tmp_t3],   %[tmp_t3]                 \n"
1489         "preceu.ph.qbr     %[tmp_t8],   %[tmp_t4]                 \n"
1490         "preceu.ph.qbl     %[tmp_t4],   %[tmp_t4]                 \n"
1491         "mult              $ac0,        %[const5],     %[const5]  \n"
1492         "mult              $ac1,        %[const5],     %[const5]  \n"
1493         "mult              $ac2,        %[const5],     %[const5]  \n"
1494         "mult              $ac3,        %[const5],     %[const5]  \n"
1495         "dpa.w.ph          $ac0,        %[tmp_t5],     %[const1]  \n"
1496         "dpa.w.ph          $ac1,        %[tmp_t6],     %[const1]  \n"
1497         "dpa.w.ph          $ac2,        %[tmp_t7],     %[const1]  \n"
1498         "dpa.w.ph          $ac3,        %[tmp_t8],     %[const1]  \n"
1499         "dpa.w.ph          $ac0,        %[tmp_t1],     %[const2]  \n"
1500         "dpa.w.ph          $ac1,        %[tmp_t2],     %[const2]  \n"
1501         "dpa.w.ph          $ac2,        %[tmp_t3],     %[const2]  \n"
1502         "dpa.w.ph          $ac3,        %[tmp_t4],     %[const2]  \n"
1503         "extr_r.w          %[tmp_t1],   $ac0,          8          \n"
1504         "extr_r.w          %[tmp_t2],   $ac1,          8          \n"
1505         "extr_r.w          %[tmp_t3],   $ac2,          8          \n"
1506         "extr_r.w          %[tmp_t4],   $ac3,          8          \n"
1507         "addiu             %[src_argb0],%[src_argb0],  16         \n"
1508         "addiu             %[dst_y],    %[dst_y],      4          \n"
1509         "sb                %[tmp_t1],   -4(%[dst_y])              \n"
1510         "sb                %[tmp_t2],   -3(%[dst_y])              \n"
1511         "sb                %[tmp_t3],   -2(%[dst_y])              \n"
1512         "sb                %[tmp_t4],   -1(%[dst_y])              \n"
1513         ".set pop                                                 \n"
1514         : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
1515           [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
1516           [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
1517           [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
1518           [src_argb0] "+r"(src_argb0), [dst_y] "+r"(dst_y)
1519         : [const1] "r"(const1), [const2] "r"(const2), [const5] "r"(const5)
1520         : "hi", "lo", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo",
1521           "$ac3hi");
1522   }
1523 }
1524 
RGBAToUVRow_DSPR2(const uint8 * src_rgb0,int src_stride_rgb,uint8 * dst_u,uint8 * dst_v,int width)1525 void RGBAToUVRow_DSPR2(const uint8* src_rgb0,
1526                        int src_stride_rgb,
1527                        uint8* dst_u,
1528                        uint8* dst_v,
1529                        int width) {
1530   const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;
1531   int x;
1532   int const1 = 0xffb60070;
1533   int const2 = 0x0000ffda;
1534   int const3 = 0xffa2ffee;
1535   int const4 = 0x00000070;
1536   int const5 = 0x100;
1537 
1538   for (x = 0; x < width - 1; x += 2) {
1539     int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
1540     int tmp_t6, tmp_t7, tmp_t8;
1541     __asm__ __volatile__(
1542         ".set push                                                \n"
1543         ".set noreorder                                           \n"
1544         "ulw               %[tmp_t1],   0+1(%[src_rgb0])          \n"
1545         "ulw               %[tmp_t2],   4+1(%[src_rgb0])          \n"
1546         "ulw               %[tmp_t3],   0+1(%[src_rgb1])          \n"
1547         "ulw               %[tmp_t4],   4+1(%[src_rgb1])          \n"
1548         "preceu.ph.qbr     %[tmp_t5],   %[tmp_t1]                 \n"
1549         "preceu.ph.qbl     %[tmp_t1],   %[tmp_t1]                 \n"
1550         "preceu.ph.qbr     %[tmp_t6],   %[tmp_t2]                 \n"
1551         "preceu.ph.qbl     %[tmp_t2],   %[tmp_t2]                 \n"
1552         "preceu.ph.qbr     %[tmp_t7],   %[tmp_t3]                 \n"
1553         "preceu.ph.qbl     %[tmp_t3],   %[tmp_t3]                 \n"
1554         "preceu.ph.qbr     %[tmp_t8],   %[tmp_t4]                 \n"
1555         "preceu.ph.qbl     %[tmp_t4],   %[tmp_t4]                 \n"
1556         "addu.ph           %[tmp_t5],   %[tmp_t5],     %[tmp_t6]  \n"
1557         "addu.ph           %[tmp_t7],   %[tmp_t7],     %[tmp_t8]  \n"
1558         "addu.ph           %[tmp_t1],   %[tmp_t1],     %[tmp_t2]  \n"
1559         "addu.ph           %[tmp_t3],   %[tmp_t3],     %[tmp_t4]  \n"
1560         "addu.ph           %[tmp_t5],   %[tmp_t5],     %[tmp_t7]  \n"
1561         "addu.ph           %[tmp_t1],   %[tmp_t1],     %[tmp_t3]  \n"
1562         "shrl.ph           %[tmp_t5],   %[tmp_t5],     2          \n"
1563         "shrl.ph           %[tmp_t1],   %[tmp_t1],     2          \n"
1564         "mult              $ac0,        %[const5],     %[const5]  \n"
1565         "mult              $ac1,        %[const5],     %[const5]  \n"
1566         "dpaq_s.w.ph       $ac0,        %[tmp_t5],     %[const1]  \n"
1567         "dpaq_s.w.ph       $ac1,        %[tmp_t5],     %[const3]  \n"
1568         "dpaq_s.w.ph       $ac0,        %[tmp_t1],     %[const2]  \n"
1569         "dpaq_s.w.ph       $ac1,        %[tmp_t1],     %[const4]  \n"
1570         "extr_r.w          %[tmp_t7],   $ac0,          9          \n"
1571         "extr_r.w          %[tmp_t8],   $ac1,          9          \n"
1572         "addiu             %[src_rgb0], %[src_rgb0], 8            \n"
1573         "addiu             %[src_rgb1], %[src_rgb1], 8            \n"
1574         "addiu             %[dst_u],    %[dst_u],    1            \n"
1575         "addiu             %[dst_v],    %[dst_v],    1            \n"
1576         "sb                %[tmp_t7],   -1(%[dst_u])              \n"
1577         "sb                %[tmp_t8],   -1(%[dst_v])              \n"
1578         ".set pop                                                 \n"
1579         : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
1580           [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
1581           [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
1582           [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
1583           [src_rgb0] "+r"(src_rgb0), [src_rgb1] "+r"(src_rgb1),
1584           [dst_u] "+r"(dst_u), [dst_v] "+r"(dst_v)
1585         : [const1] "r"(const1), [const2] "r"(const2), [const3] "r"(const3),
1586           [const4] "r"(const4), [const5] "r"(const5)
1587         : "hi", "lo", "$ac1lo", "$ac1hi");
1588   }
1589 }
1590 
RGBAToYRow_DSPR2(const uint8 * src_argb0,uint8 * dst_y,int width)1591 void RGBAToYRow_DSPR2(const uint8* src_argb0, uint8* dst_y, int width) {
1592   int x;
1593   int const1 = 0x00420081;
1594   int const2 = 0x00190000;
1595   int const5 = 0x40;
1596   for (x = 0; x < width; x += 4) {
1597     int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
1598     int tmp_t6, tmp_t7, tmp_t8;
1599     __asm__ __volatile__(
1600         ".set push                                                \n"
1601         ".set noreorder                                           \n"
1602         "lw                %[tmp_t1],   0(%[src_argb0])           \n"
1603         "lw                %[tmp_t2],   4(%[src_argb0])           \n"
1604         "lw                %[tmp_t3],   8(%[src_argb0])           \n"
1605         "lw                %[tmp_t4],   12(%[src_argb0])          \n"
1606         "preceu.ph.qbl     %[tmp_t5],   %[tmp_t1]                 \n"
1607         "preceu.ph.qbr     %[tmp_t1],   %[tmp_t1]                 \n"
1608         "preceu.ph.qbl     %[tmp_t6],   %[tmp_t2]                 \n"
1609         "preceu.ph.qbr     %[tmp_t2],   %[tmp_t2]                 \n"
1610         "preceu.ph.qbl     %[tmp_t7],   %[tmp_t3]                 \n"
1611         "preceu.ph.qbr     %[tmp_t3],   %[tmp_t3]                 \n"
1612         "preceu.ph.qbl     %[tmp_t8],   %[tmp_t4]                 \n"
1613         "preceu.ph.qbr     %[tmp_t4],   %[tmp_t4]                 \n"
1614         "mult              $ac0,        %[const5],     %[const5]  \n"
1615         "mult              $ac1,        %[const5],     %[const5]  \n"
1616         "mult              $ac2,        %[const5],     %[const5]  \n"
1617         "mult              $ac3,        %[const5],     %[const5]  \n"
1618         "dpa.w.ph          $ac0,        %[tmp_t5],     %[const1]  \n"
1619         "dpa.w.ph          $ac1,        %[tmp_t6],     %[const1]  \n"
1620         "dpa.w.ph          $ac2,        %[tmp_t7],     %[const1]  \n"
1621         "dpa.w.ph          $ac3,        %[tmp_t8],     %[const1]  \n"
1622         "dpa.w.ph          $ac0,        %[tmp_t1],     %[const2]  \n"
1623         "dpa.w.ph          $ac1,        %[tmp_t2],     %[const2]  \n"
1624         "dpa.w.ph          $ac2,        %[tmp_t3],     %[const2]  \n"
1625         "dpa.w.ph          $ac3,        %[tmp_t4],     %[const2]  \n"
1626         "extr_r.w          %[tmp_t1],   $ac0,          8          \n"
1627         "extr_r.w          %[tmp_t2],   $ac1,          8          \n"
1628         "extr_r.w          %[tmp_t3],   $ac2,          8          \n"
1629         "extr_r.w          %[tmp_t4],   $ac3,          8          \n"
1630         "addiu             %[dst_y],    %[dst_y],      4          \n"
1631         "addiu             %[src_argb0],%[src_argb0],  16         \n"
1632         "sb                %[tmp_t1],   -4(%[dst_y])              \n"
1633         "sb                %[tmp_t2],   -3(%[dst_y])              \n"
1634         "sb                %[tmp_t3],   -2(%[dst_y])              \n"
1635         "sb                %[tmp_t4],   -1(%[dst_y])              \n"
1636         ".set pop                                                 \n"
1637         : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
1638           [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
1639           [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
1640           [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
1641           [src_argb0] "+r"(src_argb0), [dst_y] "+r"(dst_y)
1642         : [const1] "r"(const1), [const2] "r"(const2), [const5] "r"(const5)
1643         : "hi", "lo", "$ac1lo", "$ac1hi", "$ac2lo", "$ac2hi", "$ac3lo",
1644           "$ac3hi");
1645   }
1646 }
1647 
// Converts two adjacent rows of 4-byte pixels into one row of subsampled U
// bytes and one row of subsampled V bytes, using MIPS DSP inline assembly.
//
// Each loop iteration reads two pixels (8 bytes) from src_rgb0 and two pixels
// from the row that starts src_stride_rgb bytes later, averages that 2x2 block
// per byte lane, and writes one byte to dst_u and one byte to dst_v.
//
//   src_rgb0        first source row
//   src_stride_rgb  byte offset from src_rgb0 to the second source row
//   dst_u, dst_v    output rows; one byte of each is written per pixel pair
//   width           row width in pixels; pairs are processed while
//                   x < width - 1, so the last pixel of an odd width is not
//                   consumed here
//
// NOTE(review): the loads use lw, so both source rows presumably have to be
// 4-byte aligned -- confirm against the callers.
// NOTE(review): the B/G/R/A lane names used below assume little-endian ARGB
// (byte 0 = B, 1 = G, 2 = R, 3 = A) -- confirm.
ARGBToUVRow_DSPR2(const uint8 * src_rgb0,int src_stride_rgb,uint8 * dst_u,uint8 * dst_v,int width)1648 void ARGBToUVRow_DSPR2(const uint8* src_rgb0,
1649                        int src_stride_rgb,
1650                        uint8* dst_u,
1651                        uint8* dst_v,
1652                        int width) {
  // Second source row, directly addressed from the first one.
1653   const uint8* src_rgb1 = src_rgb0 + src_stride_rgb;
1654   int x;
  // Each constant packs two signed 16-bit coefficients (high : low halfword).
  // const1/const2 feed accumulator ac0 (stored to dst_u), const3/const4 feed
  // accumulator ac1 (stored to dst_v).
  //   const1 = { -74, 112 }  applied to the low-order byte pair (G, B)
  //   const2 = {   0, -38 }  applied to the high-order byte pair (A, R)
  //   const3 = { -94, -18 }  applied to the low-order byte pair (G, B)
  //   const4 = {   0, 112 }  applied to the high-order byte pair (A, R)
1655   int const1 = 0xffb60070;
1656   int const2 = 0x0000ffda;
1657   int const3 = 0xffa2ffee;
1658   int const4 = 0x00000070;
  // Squared by the mult instructions below to preload the accumulators.
1659   int const5 = 0x100;
1660
  // One iteration per horizontal pixel pair.
1661   for (x = 0; x < width - 1; x += 2) {
    // Scratch registers handed to the assembly block as early-clobber outputs.
1662     int tmp_t1, tmp_t2, tmp_t3, tmp_t4, tmp_t5;
1663     int tmp_t6, tmp_t7, tmp_t8;
1664     __asm__ __volatile__(
1665         ".set push                                                \n"
1666         ".set noreorder                                           \n"
        // Load pixels 0 and 1 of the first row (t1, t2) and of the second
        // row (t3, t4).
1667         "lw                %[tmp_t1],   0(%[src_rgb0])            \n"
1668         "lw                %[tmp_t2],   4(%[src_rgb0])            \n"
1669         "lw                %[tmp_t3],   0(%[src_rgb1])            \n"
1670         "lw                %[tmp_t4],   4(%[src_rgb1])            \n"
        // Widen bytes to 16-bit lanes: qbr expands the two low-order bytes of
        // each word into t5..t8, qbl expands the two high-order bytes in
        // place in t1..t4.
1671         "preceu.ph.qbr     %[tmp_t5],   %[tmp_t1]                 \n"
1672         "preceu.ph.qbl     %[tmp_t1],   %[tmp_t1]                 \n"
1673         "preceu.ph.qbr     %[tmp_t6],   %[tmp_t2]                 \n"
1674         "preceu.ph.qbl     %[tmp_t2],   %[tmp_t2]                 \n"
1675         "preceu.ph.qbr     %[tmp_t7],   %[tmp_t3]                 \n"
1676         "preceu.ph.qbl     %[tmp_t3],   %[tmp_t3]                 \n"
1677         "preceu.ph.qbr     %[tmp_t8],   %[tmp_t4]                 \n"
1678         "preceu.ph.qbl     %[tmp_t4],   %[tmp_t4]                 \n"
        // Sum the four pixels lane by lane: t5 ends up holding the low-order
        // byte pair sums, t1 the high-order byte pair sums.
1679         "addu.ph           %[tmp_t5],   %[tmp_t5],     %[tmp_t6]  \n"
1680         "addu.ph           %[tmp_t7],   %[tmp_t7],     %[tmp_t8]  \n"
1681         "addu.ph           %[tmp_t1],   %[tmp_t1],     %[tmp_t2]  \n"
1682         "addu.ph           %[tmp_t3],   %[tmp_t3],     %[tmp_t4]  \n"
1683         "addu.ph           %[tmp_t5],   %[tmp_t5],     %[tmp_t7]  \n"
1684         "addu.ph           %[tmp_t1],   %[tmp_t1],     %[tmp_t3]  \n"
        // Divide every 16-bit sum by 4 to get the 2x2 average.
1685         "shrl.ph           %[tmp_t5],   %[tmp_t5],     2          \n"
1686         "shrl.ph           %[tmp_t1],   %[tmp_t1],     2          \n"
        // Preload ac0 and ac1 with 0x100 * 0x100 = 0x10000, which contributes
        // 128 to the result after the right shift by 9 below.
1687         "mult              $ac0,        %[const5],     %[const5]  \n"
1688         "mult              $ac1,        %[const5],     %[const5]  \n"
        // Accumulate the coefficient * lane dot products: ac0 for the dst_u
        // byte, ac1 for the dst_v byte.
        // NOTE(review): dpaq_s.w.ph is a fractional (Q15) multiply that
        // doubles each product, which would explain the shift of 9 instead
        // of 8 -- confirm against the MIPS DSP ASE manual.
1689         "dpaq_s.w.ph       $ac0,        %[tmp_t5],     %[const1]  \n"
1690         "dpaq_s.w.ph       $ac1,        %[tmp_t5],     %[const3]  \n"
1691         "dpaq_s.w.ph       $ac0,        %[tmp_t1],     %[const2]  \n"
1692         "dpaq_s.w.ph       $ac1,        %[tmp_t1],     %[const4]  \n"
        // Pull the rounded results out of the accumulators (right shift 9).
1693         "extr_r.w          %[tmp_t7],   $ac0,          9          \n"
1694         "extr_r.w          %[tmp_t8],   $ac1,          9          \n"
        // Advance both source rows by two pixels and both outputs by one
        // byte, then store the low byte of each result at the position just
        // passed (offset -1).
1695         "addiu             %[src_rgb0], %[src_rgb0], 8            \n"
1696         "addiu             %[src_rgb1], %[src_rgb1], 8            \n"
1697         "addiu             %[dst_u],    %[dst_u],    1            \n"
1698         "addiu             %[dst_v],    %[dst_v],    1            \n"
1699         "sb                %[tmp_t7],   -1(%[dst_u])              \n"
1700         "sb                %[tmp_t8],   -1(%[dst_v])              \n"
1701         ".set pop                                                 \n"
        // Outputs: the eight scratch registers, plus the four pointers, which
        // are read and updated ("+r") so the C loop sees the advanced values.
1702         : [tmp_t1] "=&r"(tmp_t1), [tmp_t2] "=&r"(tmp_t2),
1703           [tmp_t3] "=&r"(tmp_t3), [tmp_t4] "=&r"(tmp_t4),
1704           [tmp_t5] "=&r"(tmp_t5), [tmp_t6] "=&r"(tmp_t6),
1705           [tmp_t7] "=&r"(tmp_t7), [tmp_t8] "=&r"(tmp_t8),
1706           [src_rgb0] "+r"(src_rgb0), [src_rgb1] "+r"(src_rgb1),
1707           [dst_u] "+r"(dst_u), [dst_v] "+r"(dst_v)
        // Inputs: the packed coefficients and the accumulator preload factor.
1708         : [const1] "r"(const1), [const2] "r"(const2), [const3] "r"(const3),
1709           [const4] "r"(const4), [const5] "r"(const5)
        // Clobbers: the two accumulators written above (ac0 is the hi/lo
        // pair, ac1 is named explicitly).
1710         : "hi", "lo", "$ac1lo", "$ac1hi");
1711   }
1712 }
1713 
1714 #endif  // __mips_dsp_rev >= 2
1715 
1716 #endif  // defined(__mips__)
1717 
1718 #ifdef __cplusplus
1719 }  // extern "C"
1720 }  // namespace libyuv
1721 #endif
1722