/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10
#include "libyuv/row.h"
12
13 #ifdef __cplusplus
14 namespace libyuv {
15 extern "C" {
16 #endif
17
18 // This module is for GCC Neon
19 #if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
20
// Read 8 Y, 4 U and 4 V from 422.
// Y bytes go to d0; the 4 U bytes land in d2[0..3] and the 4 V bytes in
// d2[4..7] (one 32-bit lane each), ready for YUV422TORGB.
#define READYUV422                                                             \
    "vld1.u8    {d0}, [%0]!                    \n"                             \
    "vld1.u32   {d2[0]}, [%1]!                 \n"                             \
    "vld1.u32   {d2[1]}, [%2]!                 \n"
26
// Read 8 Y and 4 UV from NV12 (interleaved UVUV...).
// Deinterleaves the UV bytes so d2 holds UUUUVVVV like READYUV422 produces.
// Note: no trailing backslash on the last line — the macro must end here.
#define READNV12                                                               \
    "vld1.u8    {d0}, [%0]!                    \n"                             \
    "vld1.u8    {d2}, [%1]!                    \n"                             \
    "vmov.u8    d3, d2                         \n"/* split odd/even uv apart */\
    "vuzp.u8    d2, d3                         \n"                             \
    "vtrn.u32   d2, d3                         \n"
// Read 8 Y and 4 VU from NV21 (interleaved VUVU...).
// Same as READNV12 but the vuzp operand order is swapped so U ends up in the
// low half of d2 and V in the high half.
#define READNV21                                                               \
    "vld1.u8    {d0}, [%0]!                    \n"                             \
    "vld1.u8    {d2}, [%1]!                    \n"                             \
    "vmov.u8    d3, d2                         \n"/* split odd/even uv apart */\
    "vuzp.u8    d3, d2                         \n"                             \
    "vtrn.u32   d2, d3                         \n"
// Convert 8 YUV pixels (Y in d0, packed UUUUVVVV in d2, coefficients in
// d24-d26/q14/q15 as set up by each caller) to three 8-byte color channels
// in d20, d21 and d22, narrowed with rounding saturation (#6 fixed point).
#define YUV422TORGB                                                            \
    "veor.u8    d2, d26                        \n"/*subtract 128 from u and v*/\
    "vmull.s8   q8, d2, d24                    \n"/* u/v B/R component      */ \
    "vmull.s8   q9, d2, d25                    \n"/* u/v G component        */ \
    "vmov.u8    d1, #0                         \n"/* split odd/even y apart */ \
    "vtrn.u8    d0, d1                         \n"                             \
    "vsub.s16   q0, q0, q15                    \n"/* offset y               */ \
    "vmul.s16   q0, q0, q14                    \n"                             \
    "vadd.s16   d18, d19                       \n"                             \
    "vqadd.s16  d20, d0, d16                   \n"                             \
    "vqadd.s16  d21, d1, d16                   \n"                             \
    "vqadd.s16  d22, d0, d17                   \n"                             \
    "vqadd.s16  d23, d1, d17                   \n"                             \
    "vqadd.s16  d16, d0, d18                   \n"                             \
    "vqadd.s16  d17, d1, d18                   \n"                             \
    "vqrshrun.s16 d0, q10, #6                  \n"                             \
    "vqrshrun.s16 d1, q11, #6                  \n"                             \
    "vqrshrun.s16 d2, q8, #6                   \n"                             \
    "vmovl.u8   q10, d0                        \n"/* set up for reinterleave*/ \
    "vmovl.u8   q11, d1                        \n"                             \
    "vmovl.u8   q8, d2                         \n"                             \
    "vtrn.u8    d20, d21                       \n"                             \
    "vtrn.u8    d22, d23                       \n"                             \
    "vtrn.u8    d16, d17                       \n"                             \
    "vmov.u8    d21, d16                       \n"
68
// UV contribution coefficients, loaded into d24/d25 by every converter below.
// The guard must cover ALL functions that reference these tables — including
// the RGB24/RAW and NV12/NV21 rows, which were previously missing, causing a
// compile error when only those macros were defined.
#if defined(HAS_I422TOARGBROW_NEON) || defined(HAS_I422TOBGRAROW_NEON) || \
    defined(HAS_I422TOABGRROW_NEON) || defined(HAS_I422TORGBAROW_NEON) || \
    defined(HAS_I422TORGB24ROW_NEON) || defined(HAS_I422TORAWROW_NEON) || \
    defined(HAS_NV12TOARGBROW_NEON) || defined(HAS_NV21TOARGBROW_NEON)
static const vec8 kUVToRB  = { 127, 127, 127, 127, 102, 102, 102, 102,
                               0, 0, 0, 0, 0, 0, 0, 0 };
static const vec8 kUVToG = { -25, -25, -25, -25, -52, -52, -52, -52,
                             0, 0, 0, 0, 0, 0, 0, 0 };
#endif
76
#ifdef HAS_I422TOARGBROW_NEON
// Convert one row of I422 (planar Y plus half-width U and V) to ARGB.
// Processes 8 pixels per loop iteration; width is assumed to be a
// multiple of 8 (as with the other NEON row functions here).
void I422ToARGBRow_NEON(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
                        uint8* rgb_buf,
                        int width) {
  asm volatile (
    "vld1.u8    {d24}, [%5]                    \n"  // UV -> R/B coefficients
    "vld1.u8    {d25}, [%6]                    \n"  // UV -> G coefficients
    "vmov.u8    d26, #128                      \n"  // UV bias
    "vmov.u16   q14, #74                       \n"  // Y scale
    "vmov.u16   q15, #16                       \n"  // Y offset
    ".p2align  2                               \n"
  "1:                                          \n"
    READYUV422
    YUV422TORGB
    "subs       %4, %4, #8                     \n"
    "vmov.u8    d23, #255                      \n"  // opaque alpha
    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
    "bgt        1b                             \n"
    : "+r"(y_buf),    // %0
      "+r"(u_buf),    // %1
      "+r"(v_buf),    // %2
      "+r"(rgb_buf),  // %3
      "+r"(width)     // %4
    : "r"(&kUVToRB),  // %5
      "r"(&kUVToG)    // %6
    : "cc", "memory", "q0", "q1", "q2", "q3",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
#endif  // HAS_I422TOARGBROW_NEON
109
#ifdef HAS_I422TOBGRAROW_NEON
// Convert one row of I422 to BGRA: same pipeline as I422ToARGBRow_NEON but
// the B/R channels are swapped and alpha is stored first.
void I422ToBGRARow_NEON(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
                        uint8* rgb_buf,
                        int width) {
  asm volatile (
    "vld1.u8    {d24}, [%5]                    \n"  // UV -> R/B coefficients
    "vld1.u8    {d25}, [%6]                    \n"  // UV -> G coefficients
    "vmov.u8    d26, #128                      \n"
    "vmov.u16   q14, #74                       \n"
    "vmov.u16   q15, #16                       \n"
    ".p2align  2                               \n"
  "1:                                          \n"
    READYUV422
    YUV422TORGB
    "subs       %4, %4, #8                     \n"
    "vswp.u8    d20, d22                       \n"  // swap B and R
    "vmov.u8    d19, #255                      \n"  // opaque alpha, stored first
    "vst4.8     {d19, d20, d21, d22}, [%3]!    \n"
    "bgt        1b                             \n"
    : "+r"(y_buf),    // %0
      "+r"(u_buf),    // %1
      "+r"(v_buf),    // %2
      "+r"(rgb_buf),  // %3
      "+r"(width)     // %4
    : "r"(&kUVToRB),  // %5
      "r"(&kUVToG)    // %6
    : "cc", "memory", "q0", "q1", "q2", "q3",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
#endif  // HAS_I422TOBGRAROW_NEON
143
#ifdef HAS_I422TOABGRROW_NEON
// Convert one row of I422 to ABGR: B/R swapped relative to ARGB, alpha last.
void I422ToABGRRow_NEON(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
                        uint8* rgb_buf,
                        int width) {
  asm volatile (
    "vld1.u8    {d24}, [%5]                    \n"  // UV -> R/B coefficients
    "vld1.u8    {d25}, [%6]                    \n"  // UV -> G coefficients
    "vmov.u8    d26, #128                      \n"
    "vmov.u16   q14, #74                       \n"
    "vmov.u16   q15, #16                       \n"
    ".p2align  2                               \n"
  "1:                                          \n"
    READYUV422
    YUV422TORGB
    "subs       %4, %4, #8                     \n"
    "vswp.u8    d20, d22                       \n"  // swap B and R
    "vmov.u8    d23, #255                      \n"  // opaque alpha
    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
    "bgt        1b                             \n"
    : "+r"(y_buf),    // %0
      "+r"(u_buf),    // %1
      "+r"(v_buf),    // %2
      "+r"(rgb_buf),  // %3
      "+r"(width)     // %4
    : "r"(&kUVToRB),  // %5
      "r"(&kUVToG)    // %6
    : "cc", "memory", "q0", "q1", "q2", "q3",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
#endif  // HAS_I422TOABGRROW_NEON
177
#ifdef HAS_I422TORGBAROW_NEON
// Convert one row of I422 to RGBA: same channel order as ARGB output but
// alpha is stored first (d19) instead of last.
void I422ToRGBARow_NEON(const uint8* y_buf,
                        const uint8* u_buf,
                        const uint8* v_buf,
                        uint8* rgb_buf,
                        int width) {
  asm volatile (
    "vld1.u8    {d24}, [%5]                    \n"  // UV -> R/B coefficients
    "vld1.u8    {d25}, [%6]                    \n"  // UV -> G coefficients
    "vmov.u8    d26, #128                      \n"
    "vmov.u16   q14, #74                       \n"
    "vmov.u16   q15, #16                       \n"
    ".p2align  2                               \n"
  "1:                                          \n"
    READYUV422
    YUV422TORGB
    "subs       %4, %4, #8                     \n"
    "vmov.u8    d19, #255                      \n"  // opaque alpha, stored first
    "vst4.8     {d19, d20, d21, d22}, [%3]!    \n"
    "bgt        1b                             \n"
    : "+r"(y_buf),    // %0
      "+r"(u_buf),    // %1
      "+r"(v_buf),    // %2
      "+r"(rgb_buf),  // %3
      "+r"(width)     // %4
    : "r"(&kUVToRB),  // %5
      "r"(&kUVToG)    // %6
    : "cc", "memory", "q0", "q1", "q2", "q3",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
#endif  // HAS_I422TORGBAROW_NEON
210
#ifdef HAS_I422TORGB24ROW_NEON
// Convert one row of I422 to 24-bit RGB (no alpha channel; vst3 stores the
// three color planes interleaved).
void I422ToRGB24Row_NEON(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
                         uint8* rgb_buf,
                         int width) {
  asm volatile (
    "vld1.u8    {d24}, [%5]                    \n"  // UV -> R/B coefficients
    "vld1.u8    {d25}, [%6]                    \n"  // UV -> G coefficients
    "vmov.u8    d26, #128                      \n"
    "vmov.u16   q14, #74                       \n"
    "vmov.u16   q15, #16                       \n"
    ".p2align  2                               \n"
  "1:                                          \n"
    READYUV422
    YUV422TORGB
    "subs       %4, %4, #8                     \n"
    "vst3.8     {d20, d21, d22}, [%3]!         \n"
    "bgt        1b                             \n"
    : "+r"(y_buf),    // %0
      "+r"(u_buf),    // %1
      "+r"(v_buf),    // %2
      "+r"(rgb_buf),  // %3
      "+r"(width)     // %4
    : "r"(&kUVToRB),  // %5
      "r"(&kUVToG)    // %6
    : "cc", "memory", "q0", "q1", "q2", "q3",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
#endif  // HAS_I422TORGB24ROW_NEON
242
#ifdef HAS_I422TORAWROW_NEON
// Convert one row of I422 to RAW (24-bit, B/R swapped relative to RGB24).
void I422ToRAWRow_NEON(const uint8* y_buf,
                       const uint8* u_buf,
                       const uint8* v_buf,
                       uint8* rgb_buf,
                       int width) {
  asm volatile (
    "vld1.u8    {d24}, [%5]                    \n"  // UV -> R/B coefficients
    "vld1.u8    {d25}, [%6]                    \n"  // UV -> G coefficients
    "vmov.u8    d26, #128                      \n"
    "vmov.u16   q14, #74                       \n"
    "vmov.u16   q15, #16                       \n"
    ".p2align  2                               \n"
  "1:                                          \n"
    READYUV422
    YUV422TORGB
    "subs       %4, %4, #8                     \n"
    "vswp.u8    d20, d22                       \n"  // swap B and R
    "vst3.8     {d20, d21, d22}, [%3]!         \n"
    "bgt        1b                             \n"
    : "+r"(y_buf),    // %0
      "+r"(u_buf),    // %1
      "+r"(v_buf),    // %2
      "+r"(rgb_buf),  // %3
      "+r"(width)     // %4
    : "r"(&kUVToRB),  // %5
      "r"(&kUVToG)    // %6
    : "cc", "memory", "q0", "q1", "q2", "q3",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
#endif  // HAS_I422TORAWROW_NEON
275
#ifdef HAS_NV12TOARGBROW_NEON
// Convert one row of NV12 (planar Y plus interleaved UV) to ARGB.
// Note the operand indices shift down by one versus the I422 variants
// because there is a single uv_buf instead of separate u/v buffers.
void NV12ToARGBRow_NEON(const uint8* y_buf,
                        const uint8* uv_buf,
                        uint8* rgb_buf,
                        int width) {
  asm volatile (
    "vld1.u8    {d24}, [%4]                    \n"  // UV -> R/B coefficients
    "vld1.u8    {d25}, [%5]                    \n"  // UV -> G coefficients
    "vmov.u8    d26, #128                      \n"
    "vmov.u16   q14, #74                       \n"
    "vmov.u16   q15, #16                       \n"
    ".p2align  2                               \n"
  "1:                                          \n"
    READNV12
    YUV422TORGB
    "subs       %3, %3, #8                     \n"
    "vmov.u8    d23, #255                      \n"  // opaque alpha
    "vst4.8     {d20, d21, d22, d23}, [%2]!    \n"
    "bgt        1b                             \n"
    : "+r"(y_buf),    // %0
      "+r"(uv_buf),   // %1
      "+r"(rgb_buf),  // %2
      "+r"(width)     // %3
    : "r"(&kUVToRB),  // %4
      "r"(&kUVToG)    // %5
    : "cc", "memory", "q0", "q1", "q2", "q3",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
#endif  // HAS_NV12TOARGBROW_NEON
306
#ifdef HAS_NV21TOARGBROW_NEON
// Convert one row of NV21 (planar Y plus interleaved VU) to ARGB.
// Identical to NV12ToARGBRow_NEON except READNV21 undoes the VU ordering.
void NV21ToARGBRow_NEON(const uint8* y_buf,
                        const uint8* uv_buf,
                        uint8* rgb_buf,
                        int width) {
  asm volatile (
    "vld1.u8    {d24}, [%4]                    \n"  // UV -> R/B coefficients
    "vld1.u8    {d25}, [%5]                    \n"  // UV -> G coefficients
    "vmov.u8    d26, #128                      \n"
    "vmov.u16   q14, #74                       \n"
    "vmov.u16   q15, #16                       \n"
    ".p2align  2                               \n"
  "1:                                          \n"
    READNV21
    YUV422TORGB
    "subs       %3, %3, #8                     \n"
    "vmov.u8    d23, #255                      \n"  // opaque alpha
    "vst4.8     {d20, d21, d22, d23}, [%2]!    \n"
    "bgt        1b                             \n"
    : "+r"(y_buf),    // %0
      "+r"(uv_buf),   // %1
      "+r"(rgb_buf),  // %2
      "+r"(width)     // %3
    : "r"(&kUVToRB),  // %4
      "r"(&kUVToG)    // %5
    : "cc", "memory", "q0", "q1", "q2", "q3",
      "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
  );
}
#endif  // HAS_NV21TOARGBROW_NEON
337
#ifdef HAS_SPLITUV_NEON
// Reads 16 pairs of UV and writes even values to dst_u and odd to dst_v.
// Alignment requirement: 16 bytes for pointers, and multiple of 16 pixels.
void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    ".p2align  2                               \n"
  "1:                                          \n"
    "vld2.u8    {q0, q1}, [%0]!                \n"  // load 16 pairs of UV
    "subs       %3, %3, #16                    \n"  // 16 processed per loop
    "vst1.u8    {q0}, [%1]!                    \n"  // store U
    "vst1.u8    {q1}, [%2]!                    \n"  // store V
    "bgt        1b                             \n"
    : "+r"(src_uv),  // %0
      "+r"(dst_u),   // %1
      "+r"(dst_v),   // %2
      "+r"(width)    // %3  // Output registers
    :                       // Input registers
    : "memory", "cc", "q0", "q1"  // Clobber List
  );
}
#endif  // HAS_SPLITUV_NEON
359
#ifdef HAS_COPYROW_NEON
// Copy a row of bytes; count must be a multiple of 64.
void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
  asm volatile (
    ".p2align  2                               \n"
  "1:                                          \n"
    "vldm       %0!, {q0, q1, q2, q3}          \n"  // load 64 bytes
    "subs       %2, %2, #64                    \n"  // 64 processed per loop
    "vstm       %1!, {q0, q1, q2, q3}          \n"  // store 64 bytes
    "bgt        1b                             \n"
    : "+r"(src),   // %0
      "+r"(dst),   // %1
      "+r"(count)  // %2  // Output registers
    :                     // Input registers
    : "memory", "cc", "q0", "q1", "q2", "q3"  // Clobber List
  );
}
#endif  // HAS_COPYROW_NEON
378
#ifdef HAS_SETROW_NEON
// SetRow8 writes 'count' bytes using a 32 bit value repeated.
// count must be a multiple of 16.
void SetRow8_NEON(uint8* dst, uint32 v32, int count) {
  asm volatile (  // NOLINT
    "vdup.u32   q0, %2                         \n"  // duplicate 4 ints
  "1:                                          \n"
    "subs       %1, %1, #16                    \n"  // 16 bytes per loop
    "vst1.u32   {q0}, [%0]!                    \n"  // store
    "bgt        1b                             \n"
    : "+r"(dst),   // %0
      "+r"(count)  // %1
    : "r"(v32)     // %2
    : "q0", "memory", "cc");
}

// TODO(fbarchard): Make fully assembler
// SetRows32 fills 'width' 32-bit words per row with v32, for 'height' rows
// separated by dst_stride bytes.
void SetRows32_NEON(uint8* dst, uint32 v32, int width,
                    int dst_stride, int height) {
  for (int y = 0; y < height; ++y) {
    SetRow8_NEON(dst, v32, width << 2);  // width pixels * 4 bytes each
    dst += dst_stride;
  }
}
#endif  // HAS_SETROW_NEON
404
#ifdef HAS_MIRRORROW_NEON
// Reverse a row of bytes: dst[i] = src[width - 1 - i].
// Bulk work is done 16 bytes at a time, then 2 at a time, then a final
// single byte, so any width is handled.
void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
  asm volatile (
    // compute where to start writing destination
    "add        %1, %2                         \n"
    // work on segments that are multiples of 16
    "lsrs       r3, %2, #4                     \n"
    // the output is written in two block. 8 bytes followed
    // by another 8. reading is done sequentially, from left to
    // right. writing is done from right to left in block sizes
    // %1, the destination pointer is incremented after writing
    // the first of the two blocks. need to subtract that 8 off
    // along with 16 to get the next location.
    "mov        r3, #-24                       \n"
    "beq        2f                             \n"

    // back of destination by the size of the register that is
    // going to be mirrored
    "sub        %1, #16                        \n"
    // the loop needs to run on blocks of 16. what will be left
    // over is either a negative number, the residuals that need
    // to be done, or 0. If this isn't subtracted off here the
    // loop will run one extra time.
    "sub        %2, #16                        \n"

    // mirror the bytes in the 64 bit segments. unable to mirror
    // the bytes in the entire 128 bits in one go.
    // because of the inability to mirror the entire 128 bits
    // mirror the writing out of the two 64 bit segments.
    ".p2align  2                               \n"
  "1:                                          \n"
    "vld1.8     {q0}, [%0]!                    \n"  // src += 16
    "subs       %2, #16                        \n"
    "vrev64.8   q0, q0                         \n"
    "vst1.8     {d1}, [%1]!                    \n"
    "vst1.8     {d0}, [%1], r3                 \n"  // dst -= 16
    "bge        1b                             \n"

    // add 16 back to the counter. if the result is 0 there is no
    // residuals so jump past
    "adds       %2, #16                        \n"
    "beq        5f                             \n"
    "add        %1, #16                        \n"
  "2:                                          \n"
    "mov        r3, #-3                        \n"
    "sub        %1, #2                         \n"
    "subs       %2, #2                         \n"
    // check for 16*n+1 scenarios where segments_of_2 should not
    // be run, but there is something left over.
    "blt        4f                             \n"

    // do this in neon registers as per
    // http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/
  "3:                                          \n"
    "vld2.8     {d0[0], d1[0]}, [%0]!          \n"  // src += 2
    "subs       %2, #2                         \n"
    "vst1.8     {d1[0]}, [%1]!                 \n"
    "vst1.8     {d0[0]}, [%1], r3              \n"  // dst -= 2
    "bge        3b                             \n"

    "adds       %2, #2                         \n"
    "beq        5f                             \n"
  "4:                                          \n"
    "add        %1, #1                         \n"
    "vld1.8     {d0[0]}, [%0]                  \n"
    "vst1.8     {d0[0]}, [%1]                  \n"
  "5:                                          \n"
    : "+r"(src),   // %0
      "+r"(dst),   // %1
      "+r"(width)  // %2
    :
    : "memory", "cc", "r3", "q0"
  );
}
#endif  // HAS_MIRRORROW_NEON
480
#ifdef HAS_MIRRORROWUV_NEON
// Reverse a row of interleaved UV pairs into two separate mirrored planes:
// dst_a gets the mirrored U bytes, dst_b the mirrored V bytes.
// width is in output pixels (half the number of input bytes).
void MirrorRowUV_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width) {
  asm volatile (
    // compute where to start writing destination
    "add        %1, %3                         \n"  // dst_a + width
    "add        %2, %3                         \n"  // dst_b + width
    // work on input segments that are multiples of 16, but
    // width that has been passed is output segments, half
    // the size of input.
    "lsrs       r12, %3, #3                    \n"
    "beq        2f                             \n"
    // the output is written in to two blocks.
    "mov        r12, #-8                       \n"
    // back of destination by the size of the register that is
    // going to be mirrored
    "sub        %1, #8                         \n"
    "sub        %2, #8                         \n"
    // the loop needs to run on blocks of 8. what will be left
    // over is either a negative number, the residuals that need
    // to be done, or 0. if this isn't subtracted off here the
    // loop will run one extra time.
    "sub        %3, #8                         \n"

    // mirror the bytes in the 64 bit segments
    ".p2align  2                               \n"
  "1:                                          \n"
    "vld2.8     {d0, d1}, [%0]!                \n"  // src += 16
    "subs       %3, #8                         \n"
    "vrev64.8   q0, q0                         \n"
    "vst1.8     {d0}, [%1], r12                \n"  // dst_a -= 8
    "vst1.8     {d1}, [%2], r12                \n"  // dst_b -= 8
    "bge        1b                             \n"

    // add 8 back to the counter. if the result is 0 there is no
    // residuals so return
    "adds       %3, #8                         \n"
    "beq        4f                             \n"
    "add        %1, #8                         \n"
    "add        %2, #8                         \n"
  "2:                                          \n"
    "mov        r12, #-1                       \n"
    "sub        %1, #1                         \n"
    "sub        %2, #1                         \n"
  "3:                                          \n"
    "vld2.8     {d0[0], d1[0]}, [%0]!          \n"  // src += 2
    "subs       %3, %3, #1                     \n"
    "vst1.8     {d0[0]}, [%1], r12             \n"  // dst_a -= 1
    "vst1.8     {d1[0]}, [%2], r12             \n"  // dst_b -= 1
    "bgt        3b                             \n"
  "4:                                          \n"
    : "+r"(src),    // %0
      "+r"(dst_a),  // %1
      "+r"(dst_b),  // %2
      "+r"(width)   // %3
    :
    : "memory", "cc", "r12", "q0"
  );
}
#endif  // HAS_MIRRORROWUV_NEON
540
#ifdef HAS_BGRATOARGBROW_NEON
// Convert BGRA pixels to ARGB by swapping channels in place; 8 pixels per
// iteration, pix must be a multiple of 8.
void BGRAToARGBRow_NEON(const uint8* src_bgra, uint8* dst_argb, int pix) {
  asm volatile (
    ".p2align  2                               \n"
  "1:                                          \n"
    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of BGRA.
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
    "vswp.u8    d1, d2                         \n"  // swap G, R
    "vswp.u8    d0, d3                         \n"  // swap B, A
    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
    "bgt        1b                             \n"
    : "+r"(src_bgra),  // %0
      "+r"(dst_argb),  // %1
      "+r"(pix)        // %2
    :
    : "memory", "cc", "d0", "d1", "d2", "d3"  // Clobber List
  );
}
#endif  // HAS_BGRATOARGBROW_NEON
560
#ifdef HAS_ABGRTOARGBROW_NEON
// Convert ABGR pixels to ARGB by swapping the R and B channels; 8 pixels
// per iteration, pix must be a multiple of 8.
void ABGRToARGBRow_NEON(const uint8* src_abgr, uint8* dst_argb, int pix) {
  asm volatile (
    ".p2align  2                               \n"
  "1:                                          \n"
    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ABGR.
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
    "vswp.u8    d0, d2                         \n"  // swap R, B
    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
    "bgt        1b                             \n"
    : "+r"(src_abgr),  // %0
      "+r"(dst_argb),  // %1
      "+r"(pix)        // %2
    :
    : "memory", "cc", "d0", "d1", "d2", "d3"  // Clobber List
  );
}
#endif  // HAS_ABGRTOARGBROW_NEON
579
#ifdef HAS_RGBATOARGBROW_NEON
// Convert RGBA pixels to ARGB by rotating alpha from first to last channel;
// 8 pixels per iteration, pix must be a multiple of 8.
void RGBAToARGBRow_NEON(const uint8* src_rgba, uint8* dst_argb, int pix) {
  asm volatile (
    ".p2align  2                               \n"
  "1:                                          \n"
    "vld1.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of RGBA.
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
    "vmov.u8    d4, d0                         \n"  // move A after RGB
    "vst4.8     {d1, d2, d3, d4}, [%1]!        \n"  // store 8 pixels of ARGB.
    "bgt        1b                             \n"
    : "+r"(src_rgba),  // %0
      "+r"(dst_argb),  // %1
      "+r"(pix)        // %2
    :
    : "memory", "cc", "d0", "d1", "d2", "d3", "d4"  // Clobber List
  );
}
#endif  // HAS_RGBATOARGBROW_NEON
598
#ifdef HAS_RGB24TOARGBROW_NEON
// Convert 24-bit RGB pixels to ARGB by appending an opaque alpha byte;
// 8 pixels per iteration, pix must be a multiple of 8.
void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
  asm volatile (
    "vmov.u8    d4, #255                       \n"  // Alpha
    ".p2align  2                               \n"
  "1:                                          \n"
    "vld3.8     {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RGB24.
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
    "vst4.8     {d1, d2, d3, d4}, [%1]!        \n"  // store 8 pixels of ARGB.
    "bgt        1b                             \n"
    : "+r"(src_rgb24),  // %0
      "+r"(dst_argb),   // %1
      "+r"(pix)         // %2
    :
    : "memory", "cc", "d1", "d2", "d3", "d4"  // Clobber List
  );
}
#endif  // HAS_RGB24TOARGBROW_NEON
617
#ifdef HAS_RAWTOARGBROW_NEON
// Convert RAW (24-bit, B/R swapped) pixels to ARGB: swap R and B and append
// an opaque alpha byte; 8 pixels per iteration, pix must be a multiple of 8.
void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
  asm volatile (
    "vmov.u8    d4, #255                       \n"  // Alpha
    ".p2align  2                               \n"
  "1:                                          \n"
    "vld3.8     {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RAW.
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
    "vswp.u8    d1, d3                         \n"  // swap R, B
    "vst4.8     {d1, d2, d3, d4}, [%1]!        \n"  // store 8 pixels of ARGB.
    "bgt        1b                             \n"
    : "+r"(src_raw),   // %0
      "+r"(dst_argb),  // %1
      "+r"(pix)        // %2
    :
    : "memory", "cc", "d1", "d2", "d3", "d4"  // Clobber List
  );
}
#endif  // HAS_RAWTOARGBROW_NEON
637
#ifdef HAS_ARGBTORGBAROW_NEON
// Convert ARGB pixels to RGBA by rotating alpha from last to first channel;
// 8 pixels per iteration, pix must be a multiple of 8.
void ARGBToRGBARow_NEON(const uint8* src_argb, uint8* dst_rgba, int pix) {
  asm volatile (
    ".p2align  2                               \n"
  "1:                                          \n"
    "vld4.8     {d1, d2, d3, d4}, [%0]!        \n"  // load 8 pixels of ARGB.
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
    "vmov.u8    d0, d4                         \n"  // move A before RGB.
    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of RGBA.
    "bgt        1b                             \n"
    : "+r"(src_argb),  // %0
      "+r"(dst_rgba),  // %1
      "+r"(pix)        // %2
    :
    : "memory", "cc", "d0", "d1", "d2", "d3", "d4"  // Clobber List
  );
}
#endif  // HAS_ARGBTORGBAROW_NEON
656
#ifdef HAS_ARGBTORGB24ROW_NEON
// Convert ARGB pixels to 24-bit RGB by dropping the alpha channel;
// 8 pixels per iteration, pix must be a multiple of 8.
void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
  asm volatile (
    ".p2align  2                               \n"
  "1:                                          \n"
    "vld4.8     {d1, d2, d3, d4}, [%0]!        \n"  // load 8 pixels of ARGB.
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
    "vst3.8     {d1, d2, d3}, [%1]!            \n"  // store 8 pixels of RGB24.
    "bgt        1b                             \n"
    : "+r"(src_argb),   // %0
      "+r"(dst_rgb24),  // %1
      "+r"(pix)         // %2
    :
    : "memory", "cc", "d1", "d2", "d3", "d4"  // Clobber List
  );
}
#endif  // HAS_ARGBTORGB24ROW_NEON
674
#ifdef HAS_ARGBTORAWROW_NEON
// Convert ARGB pixels to RAW (24-bit, B/R swapped): drop alpha and swap R
// and B; 8 pixels per iteration, pix must be a multiple of 8.
void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
  asm volatile (
    ".p2align  2                               \n"
  "1:                                          \n"
    "vld4.8     {d1, d2, d3, d4}, [%0]!        \n"  // load 8 pixels of ARGB.
    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
    "vswp.u8    d1, d3                         \n"  // swap R, B
    "vst3.8     {d1, d2, d3}, [%1]!            \n"  // store 8 pixels of RAW.
    "bgt        1b                             \n"
    : "+r"(src_argb),  // %0
      "+r"(dst_raw),   // %1
      "+r"(pix)        // %2
    :
    : "memory", "cc", "d1", "d2", "d3", "d4"  // Clobber List
  );
}
#endif  // HAS_ARGBTORAWROW_NEON
693
#ifdef HAS_YUY2TOYROW_NEON
// Extract the Y plane from YUY2 (Y0 U Y1 V); 16 pixels per iteration,
// pix must be a multiple of 16.
void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
  asm volatile (
    ".p2align  2                               \n"
  "1:                                          \n"
    "vld2.u8    {q0, q1}, [%0]!                \n"  // load 16 pixels of YUY2.
    "subs       %2, %2, #16                    \n"  // 16 processed per loop.
    "vst1.u8    {q0}, [%1]!                    \n"  // store 16 pixels of Y.
    "bgt        1b                             \n"
    : "+r"(src_yuy2),  // %0
      "+r"(dst_y),     // %1
      "+r"(pix)        // %2
    :
    : "memory", "cc", "q0", "q1"  // Clobber List
  );
}
#endif  // HAS_YUY2TOYROW_NEON
711
#ifdef HAS_UYVYTOYROW_NEON
// Extract the Y plane from UYVY (U Y0 V Y1) — Y is in the odd bytes, hence
// q1 is stored; 16 pixels per iteration, pix must be a multiple of 16.
void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
  asm volatile (
    ".p2align  2                               \n"
  "1:                                          \n"
    "vld2.u8    {q0, q1}, [%0]!                \n"  // load 16 pixels of UYVY.
    "subs       %2, %2, #16                    \n"  // 16 processed per loop.
    "vst1.u8    {q1}, [%1]!                    \n"  // store 16 pixels of Y.
    "bgt        1b                             \n"
    : "+r"(src_uyvy),  // %0
      "+r"(dst_y),     // %1
      "+r"(pix)        // %2
    :
    : "memory", "cc", "q0", "q1"  // Clobber List
  );
}
#endif  // HAS_UYVYTOYROW_NEON
729
#ifdef HAS_YUY2TOYROW_NEON
// Extract 4:2:2 U and V planes from YUY2 without vertical subsampling;
// 16 pixels (8 UV pairs) per iteration, pix must be a multiple of 16.
void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
                         int pix) {
  asm volatile (
    ".p2align  2                               \n"
  "1:                                          \n"
    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of YUY2.
    "subs       %3, %3, #16                    \n"  // 16 pixels = 8 UVs.
    "vst1.u8    {d1}, [%1]!                    \n"  // store 8 U.
    "vst1.u8    {d3}, [%2]!                    \n"  // store 8 V.
    "bgt        1b                             \n"
    : "+r"(src_yuy2),  // %0
      "+r"(dst_u),     // %1
      "+r"(dst_v),     // %2
      "+r"(pix)        // %3
    :
    : "memory", "cc", "d0", "d1", "d2", "d3"  // Clobber List
  );
}
#endif  // HAS_YUY2TOYROW_NEON
750
#ifdef HAS_UYVYTOYROW_NEON
// Extract 4:2:2 U and V planes from UYVY without vertical subsampling;
// 16 pixels (8 UV pairs) per iteration, pix must be a multiple of 16.
void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
                         int pix) {
  asm volatile (
    ".p2align  2                               \n"
  "1:                                          \n"
    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of UYVY.
    "subs       %3, %3, #16                    \n"  // 16 pixels = 8 UVs.
    "vst1.u8    {d0}, [%1]!                    \n"  // store 8 U.
    "vst1.u8    {d2}, [%2]!                    \n"  // store 8 V.
    "bgt        1b                             \n"
    : "+r"(src_uyvy),  // %0
      "+r"(dst_u),     // %1
      "+r"(dst_v),     // %2
      "+r"(pix)        // %3
    :
    : "memory", "cc", "d0", "d1", "d2", "d3"  // Clobber List
  );
}
#endif  // HAS_UYVYTOYROW_NEON
770 #endif // HAS_UYVYTOYROW_NEON
771
#ifdef HAS_YUY2TOYROW_NEON
// Extract 4:2:0 U and V planes from YUY2, averaging two rows vertically;
// stride_yuy2 is the byte offset to the next row. 16 pixels per iteration,
// pix must be a multiple of 16.
void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
                      uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "adds       %1, %0, %1                     \n"  // stride + src_yuy2
    ".p2align  2                               \n"
  "1:                                          \n"
    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of YUY2.
    "subs       %4, %4, #16                    \n"  // 16 pixels = 8 UVs.
    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load next row YUY2.
    "vrhadd.u8  d1, d1, d5                     \n"  // average rows of U
    "vrhadd.u8  d3, d3, d7                     \n"  // average rows of V
    "vst1.u8    {d1}, [%2]!                    \n"  // store 8 U.
    "vst1.u8    {d3}, [%3]!                    \n"  // store 8 V.
    "bgt        1b                             \n"
    : "+r"(src_yuy2),     // %0
      "+r"(stride_yuy2),  // %1
      "+r"(dst_u),        // %2
      "+r"(dst_v),        // %3
      "+r"(pix)           // %4
    :
    : "memory", "cc", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"
  );
}
#endif  // HAS_YUY2TOYROW_NEON
797
#ifdef HAS_UYVYTOYROW_NEON
// Extract 4:2:0 U and V planes from UYVY, averaging two rows vertically;
// stride_uyvy is the byte offset to the next row. 16 pixels per iteration,
// pix must be a multiple of 16.
void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
                      uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    "adds       %1, %0, %1                     \n"  // stride + src_uyvy
    ".p2align  2                               \n"
  "1:                                          \n"
    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of UYVY.
    "subs       %4, %4, #16                    \n"  // 16 pixels = 8 UVs.
    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load next row UYVY.
    "vrhadd.u8  d0, d0, d4                     \n"  // average rows of U
    "vrhadd.u8  d2, d2, d6                     \n"  // average rows of V
    "vst1.u8    {d0}, [%2]!                    \n"  // store 8 U.
    "vst1.u8    {d2}, [%3]!                    \n"  // store 8 V.
    "bgt        1b                             \n"
    : "+r"(src_uyvy),     // %0
      "+r"(stride_uyvy),  // %1
      "+r"(dst_u),        // %2
      "+r"(dst_v),        // %3
      "+r"(pix)           // %4
    :
    : "memory", "cc", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"
  );
}
#endif  // HAS_UYVYTOYROW_NEON
823
824 #endif // __ARM_NEON__
825
826 #ifdef __cplusplus
827 } // extern "C"
828 } // namespace libyuv
829 #endif
830