1 /*
2  *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS. All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "libyuv/row.h"
12 #include "libyuv/rotate_row.h"
13 
14 #include "libyuv/basic_types.h"
15 
16 #ifdef __cplusplus
17 namespace libyuv {
18 extern "C" {
19 #endif
20 
21 #if !defined(LIBYUV_DISABLE_MIPS) && \
22     defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \
23     (_MIPS_SIM == _MIPS_SIM_ABI32)
24 
TransposeWx8_DSPR2(const uint8 * src,int src_stride,uint8 * dst,int dst_stride,int width)25 void TransposeWx8_DSPR2(const uint8* src, int src_stride,
26                         uint8* dst, int dst_stride, int width) {
27    __asm__ __volatile__ (
28       ".set push                                         \n"
29       ".set noreorder                                    \n"
30       "sll              $t2, %[src_stride], 0x1          \n" // src_stride x 2
31       "sll              $t4, %[src_stride], 0x2          \n" // src_stride x 4
32       "sll              $t9, %[src_stride], 0x3          \n" // src_stride x 8
33       "addu             $t3, $t2, %[src_stride]          \n"
34       "addu             $t5, $t4, %[src_stride]          \n"
35       "addu             $t6, $t2, $t4                    \n"
36       "andi             $t0, %[dst], 0x3                 \n"
37       "andi             $t1, %[dst_stride], 0x3          \n"
38       "or               $t0, $t0, $t1                    \n"
39       "bnez             $t0, 11f                         \n"
40       " subu            $t7, $t9, %[src_stride]          \n"
41 //dst + dst_stride word aligned
42     "1:                                                  \n"
43       "lbu              $t0, 0(%[src])                   \n"
44       "lbux             $t1, %[src_stride](%[src])       \n"
45       "lbux             $t8, $t2(%[src])                 \n"
46       "lbux             $t9, $t3(%[src])                 \n"
47       "sll              $t1, $t1, 16                     \n"
48       "sll              $t9, $t9, 16                     \n"
49       "or               $t0, $t0, $t1                    \n"
50       "or               $t8, $t8, $t9                    \n"
51       "precr.qb.ph      $s0, $t8, $t0                    \n"
52       "lbux             $t0, $t4(%[src])                 \n"
53       "lbux             $t1, $t5(%[src])                 \n"
54       "lbux             $t8, $t6(%[src])                 \n"
55       "lbux             $t9, $t7(%[src])                 \n"
56       "sll              $t1, $t1, 16                     \n"
57       "sll              $t9, $t9, 16                     \n"
58       "or               $t0, $t0, $t1                    \n"
59       "or               $t8, $t8, $t9                    \n"
60       "precr.qb.ph      $s1, $t8, $t0                    \n"
61       "sw               $s0, 0(%[dst])                   \n"
62       "addiu            %[width], -1                     \n"
63       "addiu            %[src], 1                        \n"
64       "sw               $s1, 4(%[dst])                   \n"
65       "bnez             %[width], 1b                     \n"
66       " addu            %[dst], %[dst], %[dst_stride]    \n"
67       "b                2f                               \n"
68 //dst + dst_stride unaligned
69    "11:                                                  \n"
70       "lbu              $t0, 0(%[src])                   \n"
71       "lbux             $t1, %[src_stride](%[src])       \n"
72       "lbux             $t8, $t2(%[src])                 \n"
73       "lbux             $t9, $t3(%[src])                 \n"
74       "sll              $t1, $t1, 16                     \n"
75       "sll              $t9, $t9, 16                     \n"
76       "or               $t0, $t0, $t1                    \n"
77       "or               $t8, $t8, $t9                    \n"
78       "precr.qb.ph      $s0, $t8, $t0                    \n"
79       "lbux             $t0, $t4(%[src])                 \n"
80       "lbux             $t1, $t5(%[src])                 \n"
81       "lbux             $t8, $t6(%[src])                 \n"
82       "lbux             $t9, $t7(%[src])                 \n"
83       "sll              $t1, $t1, 16                     \n"
84       "sll              $t9, $t9, 16                     \n"
85       "or               $t0, $t0, $t1                    \n"
86       "or               $t8, $t8, $t9                    \n"
87       "precr.qb.ph      $s1, $t8, $t0                    \n"
88       "swr              $s0, 0(%[dst])                   \n"
89       "swl              $s0, 3(%[dst])                   \n"
90       "addiu            %[width], -1                     \n"
91       "addiu            %[src], 1                        \n"
92       "swr              $s1, 4(%[dst])                   \n"
93       "swl              $s1, 7(%[dst])                   \n"
94       "bnez             %[width], 11b                    \n"
95        "addu             %[dst], %[dst], %[dst_stride]   \n"
96     "2:                                                  \n"
97       ".set pop                                          \n"
98       :[src] "+r" (src),
99        [dst] "+r" (dst),
100        [width] "+r" (width)
101       :[src_stride] "r" (src_stride),
102        [dst_stride] "r" (dst_stride)
103       : "t0", "t1",  "t2", "t3", "t4", "t5",
104         "t6", "t7", "t8", "t9",
105         "s0", "s1"
106   );
107 }
108 
TransposeWx8_Fast_DSPR2(const uint8 * src,int src_stride,uint8 * dst,int dst_stride,int width)109 void TransposeWx8_Fast_DSPR2(const uint8* src, int src_stride,
110                              uint8* dst, int dst_stride, int width) {
111   __asm__ __volatile__ (
112       ".set noat                                         \n"
113       ".set push                                         \n"
114       ".set noreorder                                    \n"
115       "beqz             %[width], 2f                     \n"
116       " sll             $t2, %[src_stride], 0x1          \n"  // src_stride x 2
117       "sll              $t4, %[src_stride], 0x2          \n"  // src_stride x 4
118       "sll              $t9, %[src_stride], 0x3          \n"  // src_stride x 8
119       "addu             $t3, $t2, %[src_stride]          \n"
120       "addu             $t5, $t4, %[src_stride]          \n"
121       "addu             $t6, $t2, $t4                    \n"
122 
123       "srl              $AT, %[width], 0x2               \n"
124       "andi             $t0, %[dst], 0x3                 \n"
125       "andi             $t1, %[dst_stride], 0x3          \n"
126       "or               $t0, $t0, $t1                    \n"
127       "bnez             $t0, 11f                         \n"
128       " subu            $t7, $t9, %[src_stride]          \n"
129 //dst + dst_stride word aligned
130       "1:                                                \n"
131       "lw               $t0, 0(%[src])                   \n"
132       "lwx              $t1, %[src_stride](%[src])       \n"
133       "lwx              $t8, $t2(%[src])                 \n"
134       "lwx              $t9, $t3(%[src])                 \n"
135 
136 // t0 = | 30 | 20 | 10 | 00 |
137 // t1 = | 31 | 21 | 11 | 01 |
138 // t8 = | 32 | 22 | 12 | 02 |
139 // t9 = | 33 | 23 | 13 | 03 |
140 
141       "precr.qb.ph     $s0, $t1, $t0                     \n"
142       "precr.qb.ph     $s1, $t9, $t8                     \n"
143       "precrq.qb.ph    $s2, $t1, $t0                     \n"
144       "precrq.qb.ph    $s3, $t9, $t8                     \n"
145 
146   // s0 = | 21 | 01 | 20 | 00 |
147   // s1 = | 23 | 03 | 22 | 02 |
148   // s2 = | 31 | 11 | 30 | 10 |
149   // s3 = | 33 | 13 | 32 | 12 |
150 
151       "precr.qb.ph     $s4, $s1, $s0                     \n"
152       "precrq.qb.ph    $s5, $s1, $s0                     \n"
153       "precr.qb.ph     $s6, $s3, $s2                     \n"
154       "precrq.qb.ph    $s7, $s3, $s2                     \n"
155 
156   // s4 = | 03 | 02 | 01 | 00 |
157   // s5 = | 23 | 22 | 21 | 20 |
158   // s6 = | 13 | 12 | 11 | 10 |
159   // s7 = | 33 | 32 | 31 | 30 |
160 
161       "lwx              $t0, $t4(%[src])                 \n"
162       "lwx              $t1, $t5(%[src])                 \n"
163       "lwx              $t8, $t6(%[src])                 \n"
164       "lwx              $t9, $t7(%[src])                 \n"
165 
166 // t0 = | 34 | 24 | 14 | 04 |
167 // t1 = | 35 | 25 | 15 | 05 |
168 // t8 = | 36 | 26 | 16 | 06 |
169 // t9 = | 37 | 27 | 17 | 07 |
170 
171       "precr.qb.ph     $s0, $t1, $t0                     \n"
172       "precr.qb.ph     $s1, $t9, $t8                     \n"
173       "precrq.qb.ph    $s2, $t1, $t0                     \n"
174       "precrq.qb.ph    $s3, $t9, $t8                     \n"
175 
176   // s0 = | 25 | 05 | 24 | 04 |
177   // s1 = | 27 | 07 | 26 | 06 |
178   // s2 = | 35 | 15 | 34 | 14 |
179   // s3 = | 37 | 17 | 36 | 16 |
180 
181       "precr.qb.ph     $t0, $s1, $s0                     \n"
182       "precrq.qb.ph    $t1, $s1, $s0                     \n"
183       "precr.qb.ph     $t8, $s3, $s2                     \n"
184       "precrq.qb.ph    $t9, $s3, $s2                     \n"
185 
186   // t0 = | 07 | 06 | 05 | 04 |
187   // t1 = | 27 | 26 | 25 | 24 |
188   // t8 = | 17 | 16 | 15 | 14 |
189   // t9 = | 37 | 36 | 35 | 34 |
190 
191       "addu            $s0, %[dst], %[dst_stride]        \n"
192       "addu            $s1, $s0, %[dst_stride]           \n"
193       "addu            $s2, $s1, %[dst_stride]           \n"
194 
195       "sw              $s4, 0(%[dst])                    \n"
196       "sw              $t0, 4(%[dst])                    \n"
197       "sw              $s6, 0($s0)                       \n"
198       "sw              $t8, 4($s0)                       \n"
199       "sw              $s5, 0($s1)                       \n"
200       "sw              $t1, 4($s1)                       \n"
201       "sw              $s7, 0($s2)                       \n"
202       "sw              $t9, 4($s2)                       \n"
203 
204       "addiu            $AT, -1                          \n"
205       "addiu            %[src], 4                        \n"
206 
207       "bnez             $AT, 1b                          \n"
208       " addu            %[dst], $s2, %[dst_stride]       \n"
209       "b                2f                               \n"
210 //dst + dst_stride unaligned
211       "11:                                               \n"
212       "lw               $t0, 0(%[src])                   \n"
213       "lwx              $t1, %[src_stride](%[src])       \n"
214       "lwx              $t8, $t2(%[src])                 \n"
215       "lwx              $t9, $t3(%[src])                 \n"
216 
217 // t0 = | 30 | 20 | 10 | 00 |
218 // t1 = | 31 | 21 | 11 | 01 |
219 // t8 = | 32 | 22 | 12 | 02 |
220 // t9 = | 33 | 23 | 13 | 03 |
221 
222       "precr.qb.ph     $s0, $t1, $t0                     \n"
223       "precr.qb.ph     $s1, $t9, $t8                     \n"
224       "precrq.qb.ph    $s2, $t1, $t0                     \n"
225       "precrq.qb.ph    $s3, $t9, $t8                     \n"
226 
227   // s0 = | 21 | 01 | 20 | 00 |
228   // s1 = | 23 | 03 | 22 | 02 |
229   // s2 = | 31 | 11 | 30 | 10 |
230   // s3 = | 33 | 13 | 32 | 12 |
231 
232       "precr.qb.ph     $s4, $s1, $s0                     \n"
233       "precrq.qb.ph    $s5, $s1, $s0                     \n"
234       "precr.qb.ph     $s6, $s3, $s2                     \n"
235       "precrq.qb.ph    $s7, $s3, $s2                     \n"
236 
237   // s4 = | 03 | 02 | 01 | 00 |
238   // s5 = | 23 | 22 | 21 | 20 |
239   // s6 = | 13 | 12 | 11 | 10 |
240   // s7 = | 33 | 32 | 31 | 30 |
241 
242       "lwx              $t0, $t4(%[src])                 \n"
243       "lwx              $t1, $t5(%[src])                 \n"
244       "lwx              $t8, $t6(%[src])                 \n"
245       "lwx              $t9, $t7(%[src])                 \n"
246 
247 // t0 = | 34 | 24 | 14 | 04 |
248 // t1 = | 35 | 25 | 15 | 05 |
249 // t8 = | 36 | 26 | 16 | 06 |
250 // t9 = | 37 | 27 | 17 | 07 |
251 
252       "precr.qb.ph     $s0, $t1, $t0                     \n"
253       "precr.qb.ph     $s1, $t9, $t8                     \n"
254       "precrq.qb.ph    $s2, $t1, $t0                     \n"
255       "precrq.qb.ph    $s3, $t9, $t8                     \n"
256 
257   // s0 = | 25 | 05 | 24 | 04 |
258   // s1 = | 27 | 07 | 26 | 06 |
259   // s2 = | 35 | 15 | 34 | 14 |
260   // s3 = | 37 | 17 | 36 | 16 |
261 
262       "precr.qb.ph     $t0, $s1, $s0                     \n"
263       "precrq.qb.ph    $t1, $s1, $s0                     \n"
264       "precr.qb.ph     $t8, $s3, $s2                     \n"
265       "precrq.qb.ph    $t9, $s3, $s2                     \n"
266 
267   // t0 = | 07 | 06 | 05 | 04 |
268   // t1 = | 27 | 26 | 25 | 24 |
269   // t8 = | 17 | 16 | 15 | 14 |
270   // t9 = | 37 | 36 | 35 | 34 |
271 
272       "addu            $s0, %[dst], %[dst_stride]        \n"
273       "addu            $s1, $s0, %[dst_stride]           \n"
274       "addu            $s2, $s1, %[dst_stride]           \n"
275 
276       "swr              $s4, 0(%[dst])                   \n"
277       "swl              $s4, 3(%[dst])                   \n"
278       "swr              $t0, 4(%[dst])                   \n"
279       "swl              $t0, 7(%[dst])                   \n"
280       "swr              $s6, 0($s0)                      \n"
281       "swl              $s6, 3($s0)                      \n"
282       "swr              $t8, 4($s0)                      \n"
283       "swl              $t8, 7($s0)                      \n"
284       "swr              $s5, 0($s1)                      \n"
285       "swl              $s5, 3($s1)                      \n"
286       "swr              $t1, 4($s1)                      \n"
287       "swl              $t1, 7($s1)                      \n"
288       "swr              $s7, 0($s2)                      \n"
289       "swl              $s7, 3($s2)                      \n"
290       "swr              $t9, 4($s2)                      \n"
291       "swl              $t9, 7($s2)                      \n"
292 
293       "addiu            $AT, -1                          \n"
294       "addiu            %[src], 4                        \n"
295 
296       "bnez             $AT, 11b                         \n"
297       " addu            %[dst], $s2, %[dst_stride]       \n"
298       "2:                                                \n"
299       ".set pop                                          \n"
300       ".set at                                           \n"
301       :[src] "+r" (src),
302        [dst] "+r" (dst),
303        [width] "+r" (width)
304       :[src_stride] "r" (src_stride),
305        [dst_stride] "r" (dst_stride)
306       : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9",
307         "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7"
308   );
309 }
310 
TransposeUVWx8_DSPR2(const uint8 * src,int src_stride,uint8 * dst_a,int dst_stride_a,uint8 * dst_b,int dst_stride_b,int width)311 void TransposeUVWx8_DSPR2(const uint8* src, int src_stride,
312                           uint8* dst_a, int dst_stride_a,
313                           uint8* dst_b, int dst_stride_b,
314                           int width) {
315   __asm__ __volatile__ (
316       ".set push                                         \n"
317       ".set noreorder                                    \n"
318       "beqz            %[width], 2f                      \n"
319       " sll            $t2, %[src_stride], 0x1           \n" // src_stride x 2
320       "sll             $t4, %[src_stride], 0x2           \n" // src_stride x 4
321       "sll             $t9, %[src_stride], 0x3           \n" // src_stride x 8
322       "addu            $t3, $t2, %[src_stride]           \n"
323       "addu            $t5, $t4, %[src_stride]           \n"
324       "addu            $t6, $t2, $t4                     \n"
325       "subu            $t7, $t9, %[src_stride]           \n"
326       "srl             $t1, %[width], 1                  \n"
327 
328 // check word aligment for dst_a, dst_b, dst_stride_a and dst_stride_b
329       "andi            $t0, %[dst_a], 0x3                \n"
330       "andi            $t8, %[dst_b], 0x3                \n"
331       "or              $t0, $t0, $t8                     \n"
332       "andi            $t8, %[dst_stride_a], 0x3         \n"
333       "andi            $s5, %[dst_stride_b], 0x3         \n"
334       "or              $t8, $t8, $s5                     \n"
335       "or              $t0, $t0, $t8                     \n"
336       "bnez            $t0, 11f                          \n"
337       " nop                                              \n"
338 // dst + dst_stride word aligned (both, a & b dst addresses)
339     "1:                                                  \n"
340       "lw              $t0, 0(%[src])                    \n" // |B0|A0|b0|a0|
341       "lwx             $t8, %[src_stride](%[src])        \n" // |B1|A1|b1|a1|
342       "addu            $s5, %[dst_a], %[dst_stride_a]    \n"
343       "lwx             $t9, $t2(%[src])                  \n" // |B2|A2|b2|a2|
344       "lwx             $s0, $t3(%[src])                  \n" // |B3|A3|b3|a3|
345       "addu            $s6, %[dst_b], %[dst_stride_b]    \n"
346 
347       "precrq.ph.w     $s1, $t8, $t0                     \n" // |B1|A1|B0|A0|
348       "precrq.ph.w     $s2, $s0, $t9                     \n" // |B3|A3|B2|A2|
349       "precr.qb.ph     $s3, $s2, $s1                     \n" // |A3|A2|A1|A0|
350       "precrq.qb.ph    $s4, $s2, $s1                     \n" // |B3|B2|B1|B0|
351 
352       "sll             $t0, $t0, 16                      \n"
353       "packrl.ph       $s1, $t8, $t0                     \n" // |b1|a1|b0|a0|
354       "sll             $t9, $t9, 16                      \n"
355       "packrl.ph       $s2, $s0, $t9                     \n" // |b3|a3|b2|a2|
356 
357       "sw              $s3, 0($s5)                       \n"
358       "sw              $s4, 0($s6)                       \n"
359 
360       "precr.qb.ph     $s3, $s2, $s1                     \n" // |a3|a2|a1|a0|
361       "precrq.qb.ph    $s4, $s2, $s1                     \n" // |b3|b2|b1|b0|
362 
363       "lwx             $t0, $t4(%[src])                  \n" // |B4|A4|b4|a4|
364       "lwx             $t8, $t5(%[src])                  \n" // |B5|A5|b5|a5|
365       "lwx             $t9, $t6(%[src])                  \n" // |B6|A6|b6|a6|
366       "lwx             $s0, $t7(%[src])                  \n" // |B7|A7|b7|a7|
367       "sw              $s3, 0(%[dst_a])                  \n"
368       "sw              $s4, 0(%[dst_b])                  \n"
369 
370       "precrq.ph.w     $s1, $t8, $t0                     \n" // |B5|A5|B4|A4|
371       "precrq.ph.w     $s2, $s0, $t9                     \n" // |B6|A6|B7|A7|
372       "precr.qb.ph     $s3, $s2, $s1                     \n" // |A7|A6|A5|A4|
373       "precrq.qb.ph    $s4, $s2, $s1                     \n" // |B7|B6|B5|B4|
374 
375       "sll             $t0, $t0, 16                      \n"
376       "packrl.ph       $s1, $t8, $t0                     \n" // |b5|a5|b4|a4|
377       "sll             $t9, $t9, 16                      \n"
378       "packrl.ph       $s2, $s0, $t9                     \n" // |b7|a7|b6|a6|
379       "sw              $s3, 4($s5)                       \n"
380       "sw              $s4, 4($s6)                       \n"
381 
382       "precr.qb.ph     $s3, $s2, $s1                     \n" // |a7|a6|a5|a4|
383       "precrq.qb.ph    $s4, $s2, $s1                     \n" // |b7|b6|b5|b4|
384 
385       "addiu           %[src], 4                         \n"
386       "addiu           $t1, -1                           \n"
387       "sll             $t0, %[dst_stride_a], 1           \n"
388       "sll             $t8, %[dst_stride_b], 1           \n"
389       "sw              $s3, 4(%[dst_a])                  \n"
390       "sw              $s4, 4(%[dst_b])                  \n"
391       "addu            %[dst_a], %[dst_a], $t0           \n"
392       "bnez            $t1, 1b                           \n"
393       " addu           %[dst_b], %[dst_b], $t8           \n"
394       "b               2f                                \n"
395       " nop                                              \n"
396 
397 // dst_a or dst_b or dst_stride_a or dst_stride_b not word aligned
398    "11:                                                  \n"
399       "lw              $t0, 0(%[src])                    \n" // |B0|A0|b0|a0|
400       "lwx             $t8, %[src_stride](%[src])        \n" // |B1|A1|b1|a1|
401       "addu            $s5, %[dst_a], %[dst_stride_a]    \n"
402       "lwx             $t9, $t2(%[src])                  \n" // |B2|A2|b2|a2|
403       "lwx             $s0, $t3(%[src])                  \n" // |B3|A3|b3|a3|
404       "addu            $s6, %[dst_b], %[dst_stride_b]    \n"
405 
406       "precrq.ph.w     $s1, $t8, $t0                     \n" // |B1|A1|B0|A0|
407       "precrq.ph.w     $s2, $s0, $t9                     \n" // |B3|A3|B2|A2|
408       "precr.qb.ph     $s3, $s2, $s1                     \n" // |A3|A2|A1|A0|
409       "precrq.qb.ph    $s4, $s2, $s1                     \n" // |B3|B2|B1|B0|
410 
411       "sll             $t0, $t0, 16                      \n"
412       "packrl.ph       $s1, $t8, $t0                     \n" // |b1|a1|b0|a0|
413       "sll             $t9, $t9, 16                      \n"
414       "packrl.ph       $s2, $s0, $t9                     \n" // |b3|a3|b2|a2|
415 
416       "swr             $s3, 0($s5)                       \n"
417       "swl             $s3, 3($s5)                       \n"
418       "swr             $s4, 0($s6)                       \n"
419       "swl             $s4, 3($s6)                       \n"
420 
421       "precr.qb.ph     $s3, $s2, $s1                     \n" // |a3|a2|a1|a0|
422       "precrq.qb.ph    $s4, $s2, $s1                     \n" // |b3|b2|b1|b0|
423 
424       "lwx             $t0, $t4(%[src])                  \n" // |B4|A4|b4|a4|
425       "lwx             $t8, $t5(%[src])                  \n" // |B5|A5|b5|a5|
426       "lwx             $t9, $t6(%[src])                  \n" // |B6|A6|b6|a6|
427       "lwx             $s0, $t7(%[src])                  \n" // |B7|A7|b7|a7|
428       "swr             $s3, 0(%[dst_a])                  \n"
429       "swl             $s3, 3(%[dst_a])                  \n"
430       "swr             $s4, 0(%[dst_b])                  \n"
431       "swl             $s4, 3(%[dst_b])                  \n"
432 
433       "precrq.ph.w     $s1, $t8, $t0                     \n" // |B5|A5|B4|A4|
434       "precrq.ph.w     $s2, $s0, $t9                     \n" // |B6|A6|B7|A7|
435       "precr.qb.ph     $s3, $s2, $s1                     \n" // |A7|A6|A5|A4|
436       "precrq.qb.ph    $s4, $s2, $s1                     \n" // |B7|B6|B5|B4|
437 
438       "sll             $t0, $t0, 16                      \n"
439       "packrl.ph       $s1, $t8, $t0                     \n" // |b5|a5|b4|a4|
440       "sll             $t9, $t9, 16                      \n"
441       "packrl.ph       $s2, $s0, $t9                     \n" // |b7|a7|b6|a6|
442 
443       "swr             $s3, 4($s5)                       \n"
444       "swl             $s3, 7($s5)                       \n"
445       "swr             $s4, 4($s6)                       \n"
446       "swl             $s4, 7($s6)                       \n"
447 
448       "precr.qb.ph     $s3, $s2, $s1                     \n" // |a7|a6|a5|a4|
449       "precrq.qb.ph    $s4, $s2, $s1                     \n" // |b7|b6|b5|b4|
450 
451       "addiu           %[src], 4                         \n"
452       "addiu           $t1, -1                           \n"
453       "sll             $t0, %[dst_stride_a], 1           \n"
454       "sll             $t8, %[dst_stride_b], 1           \n"
455       "swr             $s3, 4(%[dst_a])                  \n"
456       "swl             $s3, 7(%[dst_a])                  \n"
457       "swr             $s4, 4(%[dst_b])                  \n"
458       "swl             $s4, 7(%[dst_b])                  \n"
459       "addu            %[dst_a], %[dst_a], $t0           \n"
460       "bnez            $t1, 11b                          \n"
461       " addu           %[dst_b], %[dst_b], $t8           \n"
462 
463       "2:                                                \n"
464       ".set pop                                          \n"
465       : [src] "+r" (src),
466         [dst_a] "+r" (dst_a),
467         [dst_b] "+r" (dst_b),
468         [width] "+r" (width),
469         [src_stride] "+r" (src_stride)
470       : [dst_stride_a] "r" (dst_stride_a),
471         [dst_stride_b] "r" (dst_stride_b)
472       : "t0", "t1",  "t2", "t3",  "t4", "t5",
473         "t6", "t7", "t8", "t9",
474         "s0", "s1", "s2", "s3",
475         "s4", "s5", "s6"
476   );
477 }
478 
479 #endif  // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
480 
481 #ifdef __cplusplus
482 }  // extern "C"
483 }  // namespace libyuv
484 #endif
485