1 /*
2 * Copyright 2014 The LibYuv Project Authors. All rights reserved.
3 *
4 * Use of this source code is governed by a BSD-style license
5 * that can be found in the LICENSE file in the root of the source
6 * tree. An additional intellectual property rights grant can be found
7 * in the file PATENTS. All contributing project authors may
8 * be found in the AUTHORS file in the root of the source tree.
9 */
10
11 #include "libyuv/row.h"
12
13 #ifdef __cplusplus
14 namespace libyuv {
15 extern "C" {
16 #endif
17
18 // This module is for GCC Neon armv8 64 bit.
19 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
20
// 4:2:2 fetch: 4 U bytes go to v1 bytes 0..3, 4 V bytes to v1 bytes 4..7 and
// 8 Y bytes to v0. Operands: %0 = Y pointer, %1 = U pointer, %2 = V pointer;
// each pointer is post-incremented by the number of bytes read.
#define READYUV422                                                    \
  MEMACCESS(1)                                                        \
  "ld1        {v1.s}[0], [%1], #4            \n" /* 4 U */            \
  MEMACCESS(2)                                                        \
  "ld1        {v1.s}[1], [%2], #4            \n" /* 4 V */            \
  MEMACCESS(0)                                                        \
  "ld1        {v0.8b}, [%0], #8              \n" /* 8 Y */
29
// 4:4:4 fetch: reads 8 U and 8 V, then reduces each adjacent pair to its
// rounded average so v1 ends up holding 4 U (bytes 0..3) and 4 V (bytes 4..7),
// and reads 8 Y into v0. Operands: %0 = Y, %1 = U, %2 = V pointers.
#define READYUV444                                                    \
  MEMACCESS(1)                                                        \
  "ld1        {v1.d}[0], [%1], #8            \n" /* 8 U */            \
  MEMACCESS(2)                                                        \
  "ld1        {v1.d}[1], [%2], #8            \n" /* 8 V */            \
  "uaddlp     v1.8h, v1.16b                  \n" /* pairwise sums */  \
  "rshrn      v1.8b, v1.8h, #1               \n" /* rounded halve */  \
  MEMACCESS(0)                                                        \
  "ld1        {v0.8b}, [%0], #8              \n" /* 8 Y */
40
// Grey fetch: fills the chroma register v1 with the constant 128 and reads
// 8 Y bytes into v0. Operand: %0 = Y pointer (post-incremented by 8).
#define READYUV400                                                    \
  "movi       v1.8b , #128                   \n" /* U = V = 128 */    \
  MEMACCESS(0)                                                        \
  "ld1        {v0.8b}, [%0], #8              \n" /* 8 Y */
46
// NV12 fetch: reads 8 Y into v0 and 8 interleaved chroma bytes into v2, then
// splits them so that v1 holds the 4 even bytes (U) in bytes 0..3 and the
// 4 odd bytes (V) in bytes 4..7. Operands: %0 = Y pointer, %1 = UV pointer.
// Scratch: v2, v3.
#define READNV12                                                      \
  MEMACCESS(0)                                                        \
  "ld1        {v0.8b}, [%0], #8              \n" /* 8 Y */            \
  MEMACCESS(1)                                                        \
  "ld1        {v2.8b}, [%1], #8              \n" /* U V U V ... */    \
  "uzp2       v3.8b, v2.8b, v2.8b            \n" /* odd bytes: V */   \
  "uzp1       v1.8b, v2.8b, v2.8b            \n" /* even bytes: U */  \
  "mov        v1.s[1], v3.s[0]               \n" /* v1 = U | V */
56
// NV21 fetch: reads 8 Y into v0 and 8 interleaved chroma bytes into v2, then
// splits them so that v1 holds the 4 odd bytes (U) in bytes 0..3 and the
// 4 even bytes (V) in bytes 4..7. Operands: %0 = Y pointer, %1 = VU pointer.
// Scratch: v2, v3.
#define READNV21                                                      \
  MEMACCESS(0)                                                        \
  "ld1        {v0.8b}, [%0], #8              \n" /* 8 Y */            \
  MEMACCESS(1)                                                        \
  "ld1        {v2.8b}, [%1], #8              \n" /* V U V U ... */    \
  "uzp2       v1.8b, v2.8b, v2.8b            \n" /* odd bytes: U */   \
  "uzp1       v3.8b, v2.8b, v2.8b            \n" /* even bytes: V */  \
  "mov        v1.s[1], v3.s[0]               \n" /* v1 = U | V */
66
// YUY2 fetch: a de-interleaving load of 16 bytes puts the 8 even bytes (Y) in
// v0 and the 8 odd bytes (alternating U, V) in v1; the chroma is then split so
// v1 holds 4 U in bytes 0..3 and 4 V in bytes 4..7. Operand: %0 = source
// pointer (post-incremented by 16). Scratch: v3.
#define READYUY2                                                      \
  MEMACCESS(0)                                                        \
  "ld2        {v0.8b, v1.8b}, [%0], #16      \n" /* v0 = Y, v1 = UV */ \
  "uzp2       v3.8b, v1.8b, v1.8b            \n" /* V, before v1 is rewritten */ \
  "uzp1       v1.8b, v1.8b, v1.8b            \n" /* U */              \
  "mov        v1.s[1], v3.s[0]               \n" /* v1 = U | V */
74
// UYVY fetch: a de-interleaving load of 16 bytes puts the 8 even bytes
// (alternating U, V) in v2 and the 8 odd bytes (Y) in v3. Y is moved to v0 and
// the chroma is split so v1 holds 4 U in bytes 0..3 and 4 V in bytes 4..7.
// Operand: %0 = source pointer (post-incremented by 16). Scratch: v2, v3.
#define READUYVY                                                      \
  MEMACCESS(0)                                                        \
  "ld2        {v2.8b, v3.8b}, [%0], #16      \n" /* v2 = UV, v3 = Y */ \
  "uzp1       v1.8b, v2.8b, v2.8b            \n" /* U */              \
  "mov        v0.8b, v3.8b                   \n" /* Y, before v3 is reused */ \
  "uzp2       v3.8b, v2.8b, v2.8b            \n" /* V */              \
  "mov        v1.s[1], v3.s[0]               \n" /* v1 = U | V */
83
// Loads the conversion constants used by YUVTORGB:
//   v24, v25, v26 = first three 16-bit entries of kUVBiasBGR, each replicated
//                   to all 8 lanes (added to the B, G and R sums respectively)
//   v31           = first 32-bit entry of kYToRgb, replicated to all 4 lanes
//   v27, v28      = even / odd 16-bit entries of the 32 bytes at kUVToRB
//   v29, v30      = even / odd 16-bit entries of the 32 bytes at kUVToG
// The bias values are fetched with one ld3r and no address writeback:
// %[kUVBiasBGR] is passed as an input-only "r" operand by every user of this
// macro, and an asm statement must not modify an input-only operand.
// Any asm statement using this macro must list v24..v31 as clobbered.
#define YUVTORGB_SETUP                                                \
  "ld3r       {v24.8h, v25.8h, v26.8h}, [%[kUVBiasBGR]] \n"           \
  "ld1r       {v31.4s}, [%[kYToRgb]]         \n"                      \
  "ld2        {v27.8h, v28.8h}, [%[kUVToRB]] \n"                      \
  "ld2        {v29.8h, v30.8h}, [%[kUVToG]]  \n"
91
// Converts 8 pixels from YUV to 8-bit R, G, B.
// In:  v0.8b = 8 Y; v1.8b = 4 U (bytes 0..3) then 4 V (bytes 4..7);
//      v24..v31 as loaded by YUVTORGB_SETUP.
// Out: vR.8b, vG.8b, vB.8b (register names are pasted into the asm text).
// Scratch: v0, v1, v2, v3, v5, v6, v7.
#define YUVTORGB(vR, vG, vB)                                          \
  /* Y: widen to 32 bit, multiply by v31, keep bits 16 and up. */     \
  "uxtl       v0.8h, v0.8b                   \n"                      \
  "uxtl2      v3.4s, v0.8h                   \n"                      \
  "uxtl       v0.4s, v0.4h                   \n"                      \
  "mul        v3.4s, v3.4s, v31.4s           \n"                      \
  "mul        v0.4s, v0.4s, v31.4s           \n"                      \
  "sqshrun    v0.4h, v0.4s, #16              \n"                      \
  "sqshrun2   v0.8h, v3.4s, #16              \n"                      \
  /* UV: (x << 8) + x duplicates every chroma byte, giving one */     \
  /* chroma sample per pixel: 8 U bytes then 8 V bytes in v1. */      \
  "shll       v2.8h, v1.8b, #8               \n"                      \
  "uaddw      v1.8h, v2.8h, v1.8b            \n"                      \
  "mov        v2.d[0], v1.d[1]               \n" /* V bytes */        \
  "uxtl       v2.8h, v2.8b                   \n" /* V, 16 bit */      \
  "uxtl       v1.8h, v1.8b                   \n" /* U, 16 bit */      \
  /* Chroma contributions. */                                         \
  "mul        v3.8h, v1.8h, v27.8h           \n" /* U term of B */    \
  "mul        v7.8h, v2.8h, v28.8h           \n" /* V term of R */    \
  "mul        v5.8h, v1.8h, v29.8h           \n" /* U term of G */    \
  "mul        v6.8h, v2.8h, v30.8h           \n" /* V term of G */    \
  "sqadd      v6.8h, v6.8h, v5.8h            \n" /* G chroma sum */   \
  /* bias + Y, all saturating. */                                     \
  "sqadd      " #vB ".8h, v24.8h, v0.8h      \n"                      \
  "sqadd      " #vG ".8h, v25.8h, v0.8h      \n"                      \
  "sqadd      " #vR ".8h, v26.8h, v0.8h      \n"                      \
  /* Apply chroma: added for B and R, subtracted for G. */            \
  "sqadd      " #vB ".8h, " #vB ".8h, v3.8h  \n"                      \
  "sqsub      " #vG ".8h, " #vG ".8h, v6.8h  \n"                      \
  "sqadd      " #vR ".8h, " #vR ".8h, v7.8h  \n"                      \
  /* Drop 6 fraction bits and saturate to unsigned 8 bit. */          \
  "sqshrun    " #vB ".8b, " #vB ".8h, #6     \n"                      \
  "sqshrun    " #vG ".8b, " #vG ".8h, #6     \n"                      \
  "sqshrun    " #vR ".8b, " #vR ".8h, #6     \n"
127
// Converts a row of planar 4:4:4 YUV to ARGB (bytes stored as B, G, R, A with
// A = 255). 8 pixels are produced per iteration; the loop body always runs at
// least once and repeats while the decremented width is still > 0.
//   src_y/src_u/src_v: source planes, advanced 8 bytes each per iteration.
//   dst_argb:          destination, advanced 32 bytes per iteration.
//   yuvconstants:      conversion constants read by YUVTORGB_SETUP.
void I444ToARGBRow_NEON(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
    YUVTORGB_SETUP
    "movi       v23.8b, #255                   \n" /* A */
  "1:                                          \n"
    READYUV444
    YUVTORGB(v22, v21, v20)
    "subs       %w4, %w4, #8                   \n"
    MEMACCESS(3)
    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
    "b.gt       1b                             \n"
    : "+r"(src_y),     // %0
      "+r"(src_u),     // %1
      "+r"(src_v),     // %2
      "+r"(dst_argb),  // %3
      "+r"(width)      // %4
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    // v31 holds the Y scale loaded by YUVTORGB_SETUP, so it is clobbered too.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
157
// Converts a row of planar 4:2:2 YUV to ARGB (bytes stored as B, G, R, A with
// A = 255). 8 pixels are produced per iteration from 8 Y, 4 U and 4 V; the
// loop body always runs at least once and repeats while the decremented width
// is still > 0.
void I422ToARGBRow_NEON(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
    YUVTORGB_SETUP
    "movi       v23.8b, #255                   \n" /* A */
  "1:                                          \n"
    READYUV422
    YUVTORGB(v22, v21, v20)
    "subs       %w4, %w4, #8                   \n"
    MEMACCESS(3)
    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
    "b.gt       1b                             \n"
    : "+r"(src_y),     // %0
      "+r"(src_u),     // %1
      "+r"(src_v),     // %2
      "+r"(dst_argb),  // %3
      "+r"(width)      // %4
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    // v31 holds the Y scale loaded by YUVTORGB_SETUP, so it is clobbered too.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
187
// Converts a row of planar 4:2:2 YUV plus a separate alpha plane to ARGB
// (bytes stored as B, G, R, A). 8 pixels per iteration; 8 alpha bytes are
// read from src_a each iteration and stored unchanged as the A channel. The
// loop body always runs at least once and repeats while the decremented width
// is still > 0.
void I422AlphaToARGBRow_NEON(const uint8* src_y,
                             const uint8* src_u,
                             const uint8* src_v,
                             const uint8* src_a,
                             uint8* dst_argb,
                             const struct YuvConstants* yuvconstants,
                             int width) {
  asm volatile (
    YUVTORGB_SETUP
  "1:                                          \n"
    READYUV422
    YUVTORGB(v22, v21, v20)
    MEMACCESS(3)
    "ld1        {v23.8b}, [%3], #8             \n" /* A */
    "subs       %w5, %w5, #8                   \n"
    MEMACCESS(4)
    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32 \n"
    "b.gt       1b                             \n"
    : "+r"(src_y),     // %0
      "+r"(src_u),     // %1
      "+r"(src_v),     // %2
      "+r"(src_a),     // %3
      "+r"(dst_argb),  // %4
      "+r"(width)      // %5
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    // v31 holds the Y scale loaded by YUVTORGB_SETUP, so it is clobbered too.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
220
// Converts a row of planar 4:2:2 YUV to RGBA (bytes stored as A, B, G, R with
// A = 255: v20 = A, v21 = B, v22 = G, v23 = R). 8 pixels per iteration; the
// loop body always runs at least once and repeats while the decremented width
// is still > 0.
void I422ToRGBARow_NEON(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_rgba,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
    YUVTORGB_SETUP
    "movi       v20.8b, #255                   \n" /* A */
  "1:                                          \n"
    READYUV422
    YUVTORGB(v23, v22, v21)
    "subs       %w4, %w4, #8                   \n"
    MEMACCESS(3)
    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
    "b.gt       1b                             \n"
    : "+r"(src_y),     // %0
      "+r"(src_u),     // %1
      "+r"(src_v),     // %2
      "+r"(dst_rgba),  // %3
      "+r"(width)      // %4
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    // v31 holds the Y scale loaded by YUVTORGB_SETUP, so it is clobbered too.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
250
// Converts a row of planar 4:2:2 YUV to 24-bit RGB24 (bytes stored as
// B, G, R). 8 pixels / 24 output bytes per iteration; the loop body always
// runs at least once and repeats while the decremented width is still > 0.
void I422ToRGB24Row_NEON(const uint8* src_y,
                         const uint8* src_u,
                         const uint8* src_v,
                         uint8* dst_rgb24,
                         const struct YuvConstants* yuvconstants,
                         int width) {
  asm volatile (
    YUVTORGB_SETUP
  "1:                                          \n"
    READYUV422
    YUVTORGB(v22, v21, v20)
    "subs       %w4, %w4, #8                   \n"
    MEMACCESS(3)
    "st3        {v20.8b,v21.8b,v22.8b}, [%3], #24 \n"
    "b.gt       1b                             \n"
    : "+r"(src_y),      // %0
      "+r"(src_u),      // %1
      "+r"(src_v),      // %2
      "+r"(dst_rgb24),  // %3
      "+r"(width)       // %4
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    // v31 holds the Y scale loaded by YUVTORGB_SETUP, so it is clobbered too.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
279
// Packs 8 pixels to RGB565 in v0.8h.
// In: v20.8b = B, v21.8b = G, v22.8b = R. v20 and v21 are overwritten.
// Each channel is moved to the top byte of a 16-bit lane; the two
// shift-right-and-insert steps then place G below the top 5 bits (R) and B
// below the top 11 bits, so their order must not be swapped.
#define ARGBTORGB565                                                  \
  "shll       v20.8h, v20.8b, #8             \n" /* B << 8 */         \
  "shll       v21.8h, v21.8b, #8             \n" /* G << 8 */         \
  "shll       v0.8h,  v22.8b, #8             \n" /* R << 8 */         \
  "sri        v0.8h,  v21.8h, #5             \n" /* R:G */            \
  "sri        v0.8h,  v20.8h, #11            \n" /* R:G:B */
286
// Converts a row of planar 4:2:2 YUV to RGB565. 8 pixels / 16 output bytes
// per iteration; the loop body always runs at least once and repeats while
// the decremented width is still > 0.
void I422ToRGB565Row_NEON(const uint8* src_y,
                          const uint8* src_u,
                          const uint8* src_v,
                          uint8* dst_rgb565,
                          const struct YuvConstants* yuvconstants,
                          int width) {
  asm volatile (
    YUVTORGB_SETUP
  "1:                                          \n"
    READYUV422
    YUVTORGB(v22, v21, v20)
    "subs       %w4, %w4, #8                   \n"
    ARGBTORGB565
    MEMACCESS(3)
    "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels RGB565.
    "b.gt       1b                             \n"
    : "+r"(src_y),       // %0
      "+r"(src_u),       // %1
      "+r"(src_v),       // %2
      "+r"(dst_rgb565),  // %3
      "+r"(width)        // %4
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    // v31 holds the Y scale loaded by YUVTORGB_SETUP, so it is clobbered too.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
316
// Packs 8 pixels to ARGB1555 in v0.8h.
// In: v20.8b = B, v21.8b = G, v22.8b = R, v23.8b = A. v20, v21 and v22 are
// overwritten; v23 is only read.
// Each channel is moved to the top byte of a 16-bit lane; the successive
// shift-right-and-insert steps keep 1 bit of A, then 5 bits each of R, G
// and B, so their order must not be changed.
#define ARGBTOARGB1555                                                \
  "shll       v20.8h, v20.8b, #8             \n" /* B << 8 */         \
  "shll       v21.8h, v21.8b, #8             \n" /* G << 8 */         \
  "shll       v22.8h, v22.8b, #8             \n" /* R << 8 */         \
  "shll       v0.8h,  v23.8b, #8             \n" /* A << 8 */         \
  "sri        v0.8h,  v22.8h, #1             \n" /* A:R */            \
  "sri        v0.8h,  v21.8h, #6             \n" /* A:R:G */          \
  "sri        v0.8h,  v20.8h, #11            \n" /* A:R:G:B */
325
// Converts a row of planar 4:2:2 YUV to ARGB1555 with the alpha bit set
// (v23 = 255 is loaded once; ARGBTOARGB1555 only reads it). 8 pixels / 16
// output bytes per iteration; the loop body always runs at least once and
// repeats while the decremented width is still > 0.
void I422ToARGB1555Row_NEON(const uint8* src_y,
                            const uint8* src_u,
                            const uint8* src_v,
                            uint8* dst_argb1555,
                            const struct YuvConstants* yuvconstants,
                            int width) {
  asm volatile (
    YUVTORGB_SETUP
    "movi       v23.8b, #255                   \n" /* A */
  "1:                                          \n"
    READYUV422
    YUVTORGB(v22, v21, v20)
    "subs       %w4, %w4, #8                   \n"
    ARGBTOARGB1555
    MEMACCESS(3)
    "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels ARGB1555.
    "b.gt       1b                             \n"
    : "+r"(src_y),         // %0
      "+r"(src_u),         // %1
      "+r"(src_v),         // %2
      "+r"(dst_argb1555),  // %3
      "+r"(width)          // %4
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    // v31 holds the Y scale loaded by YUVTORGB_SETUP, so it is clobbered too.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
356
// Packs 8 pixels to ARGB4444 in v0 (16 bytes).
// In: v20.8b = B, v21.8b = G, v22.8b = R, v23.8b = A, v4.8b = 0x0f in every
// byte. v20..v23 are overwritten; v1 is used as scratch.
// B and R are shifted down to the low nibble, G and A keep only their high
// nibble, and the resulting B|G and R|A bytes are interleaved.
#define ARGBTOARGB4444                                                \
  "ushr       v20.8b, v20.8b, #4             \n" /* 0000BBBB */       \
  "bic        v21.8b, v21.8b, v4.8b          \n" /* GGGG0000 */       \
  "orr        v0.8b,  v20.8b, v21.8b         \n" /* GGGGBBBB */       \
  "ushr       v22.8b, v22.8b, #4             \n" /* 0000RRRR */       \
  "bic        v23.8b, v23.8b, v4.8b          \n" /* AAAA0000 */       \
  "orr        v1.8b,  v22.8b, v23.8b         \n" /* AAAARRRR */       \
  "zip1       v0.16b, v0.16b, v1.16b         \n" /* interleave */
366
// Converts a row of planar 4:2:2 YUV to ARGB4444 with alpha 255. 8 pixels /
// 16 output bytes per iteration; the loop body always runs at least once and
// repeats while the decremented width is still > 0.
void I422ToARGB4444Row_NEON(const uint8* src_y,
                            const uint8* src_u,
                            const uint8* src_v,
                            uint8* dst_argb4444,
                            const struct YuvConstants* yuvconstants,
                            int width) {
  asm volatile (
    YUVTORGB_SETUP
    "movi       v4.16b, #0x0f                  \n"  // bits to clear with bic.
  "1:                                          \n"
    READYUV422
    YUVTORGB(v22, v21, v20)
    "subs       %w4, %w4, #8                   \n"
    // Alpha is reloaded every iteration because ARGBTOARGB4444 masks v23
    // in place.
    "movi       v23.8b, #255                   \n"
    ARGBTOARGB4444
    MEMACCESS(3)
    "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels ARGB4444.
    "b.gt       1b                             \n"
    : "+r"(src_y),         // %0
      "+r"(src_u),         // %1
      "+r"(src_v),         // %2
      "+r"(dst_argb4444),  // %3
      "+r"(width)          // %4
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    // v31 holds the Y scale loaded by YUVTORGB_SETUP, so it is clobbered too.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
398
// Converts a row of Y-only (grey) pixels to ARGB (bytes stored as B, G, R, A
// with A = 255) using the kYuvI601Constants table and a fixed chroma value of
// 128. 8 pixels per iteration; the loop body always runs at least once and
// repeats while the decremented width is still > 0.
void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) {
  asm volatile (
    YUVTORGB_SETUP
    "movi       v23.8b, #255                   \n" /* A */
  "1:                                          \n"
    READYUV400
    YUVTORGB(v22, v21, v20)
    "subs       %w2, %w2, #8                   \n"
    MEMACCESS(1)
    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
    "b.gt       1b                             \n"
    : "+r"(src_y),     // %0
      "+r"(dst_argb),  // %1
      "+r"(width)      // %2
    : [kUVToRB]"r"(&kYuvI601Constants.kUVToRB),
      [kUVToG]"r"(&kYuvI601Constants.kUVToG),
      [kUVBiasBGR]"r"(&kYuvI601Constants.kUVBiasBGR),
      [kYToRgb]"r"(&kYuvI601Constants.kYToRgb)
    // v31 holds the Y scale loaded by YUVTORGB_SETUP, so it is clobbered too.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
421
// Expands a row of grey bytes to ARGB without any scaling: every source byte
// is copied to the B, G and R channels and A is set to 255. 8 pixels per
// iteration; the loop body always runs at least once and repeats while the
// decremented width is still > 0.
// MEMACCESS() takes the positional operand number (src_y = 0, dst_argb = 1).
void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) {
  asm volatile (
    "movi       v23.8b, #255                           \n"  // alpha
  "1:                                                  \n"
    MEMACCESS(0)
    "ld1        {v20.8b}, [%[src_y]], #8               \n"  // 8 grey -> B
    "mov        v21.8b, v20.8b                         \n"  // G = grey
    "mov        v22.8b, v20.8b                         \n"  // R = grey
    "subs       %w[width], %w[width], #8               \n"
    MEMACCESS(1)
    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%[dst_argb]], #32 \n"
    "b.gt       1b                                     \n"
    : [src_y]"+r"(src_y),        // %0
      [dst_argb]"+r"(dst_argb),  // %1
      [width]"+r"(width)         // %2
    :
    : "cc", "memory", "v20", "v21", "v22", "v23"
  );
}
441
// Converts a row of NV12 (Y plane plus interleaved UV plane) to ARGB (bytes
// stored as B, G, R, A with A = 255). 8 pixels per iteration, consuming 8 Y
// bytes and 8 UV bytes; the loop body always runs at least once and repeats
// while the decremented width is still > 0.
void NV12ToARGBRow_NEON(const uint8* src_y,
                        const uint8* src_uv,
                        uint8* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
    YUVTORGB_SETUP
    "movi       v23.8b, #255                   \n" /* A */
  "1:                                          \n"
    READNV12
    YUVTORGB(v22, v21, v20)
    "subs       %w3, %w3, #8                   \n"
    MEMACCESS(2)
    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
    "b.gt       1b                             \n"
    : "+r"(src_y),     // %0
      "+r"(src_uv),    // %1
      "+r"(dst_argb),  // %2
      "+r"(width)      // %3
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    // v31 holds the Y scale loaded by YUVTORGB_SETUP, so it is clobbered too.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
469
// Converts a row of NV21 (Y plane plus interleaved VU plane) to ARGB (bytes
// stored as B, G, R, A with A = 255). 8 pixels per iteration, consuming 8 Y
// bytes and 8 VU bytes; the loop body always runs at least once and repeats
// while the decremented width is still > 0.
void NV21ToARGBRow_NEON(const uint8* src_y,
                        const uint8* src_vu,
                        uint8* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
    YUVTORGB_SETUP
    "movi       v23.8b, #255                   \n" /* A */
  "1:                                          \n"
    READNV21
    YUVTORGB(v22, v21, v20)
    "subs       %w3, %w3, #8                   \n"
    MEMACCESS(2)
    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
    "b.gt       1b                             \n"
    : "+r"(src_y),     // %0
      "+r"(src_vu),    // %1
      "+r"(dst_argb),  // %2
      "+r"(width)      // %3
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    // v31 holds the Y scale loaded by YUVTORGB_SETUP, so it is clobbered too.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
497
// Converts a row of NV12 (Y plane plus interleaved UV plane) to RGB565.
// 8 pixels / 16 output bytes per iteration; the loop body always runs at
// least once and repeats while the decremented width is still > 0.
void NV12ToRGB565Row_NEON(const uint8* src_y,
                          const uint8* src_uv,
                          uint8* dst_rgb565,
                          const struct YuvConstants* yuvconstants,
                          int width) {
  asm volatile (
    YUVTORGB_SETUP
  "1:                                          \n"
    READNV12
    YUVTORGB(v22, v21, v20)
    "subs       %w3, %w3, #8                   \n"
    ARGBTORGB565
    MEMACCESS(2)
    // Immediate written with '#' like every other post-index in this file.
    "st1        {v0.8h}, [%2], #16             \n"  // store 8 pixels RGB565.
    "b.gt       1b                             \n"
    : "+r"(src_y),       // %0
      "+r"(src_uv),      // %1
      "+r"(dst_rgb565),  // %2
      "+r"(width)        // %3
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    // v31 holds the Y scale loaded by YUVTORGB_SETUP, so it is clobbered too.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
525
// Converts a row of packed YUY2 to ARGB (bytes stored as B, G, R, A with
// A = 255). 8 pixels per iteration, consuming 16 source bytes; the loop body
// always runs at least once and repeats while the decremented width is
// still > 0.
void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
                        uint8* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
    YUVTORGB_SETUP
    "movi       v23.8b, #255                   \n" /* A */
  "1:                                          \n"
    READYUY2
    YUVTORGB(v22, v21, v20)
    "subs       %w2, %w2, #8                   \n"
    MEMACCESS(1)
    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
    "b.gt       1b                             \n"
    : "+r"(src_yuy2),  // %0
      "+r"(dst_argb),  // %1
      "+r"(width)      // %2
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    // v31 holds the Y scale loaded by YUVTORGB_SETUP, so it is clobbered too.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
551
// Converts a row of packed UYVY to ARGB (bytes stored as B, G, R, A with
// A = 255). 8 pixels per iteration, consuming 16 source bytes; the loop body
// always runs at least once and repeats while the decremented width is
// still > 0.
void UYVYToARGBRow_NEON(const uint8* src_uyvy,
                        uint8* dst_argb,
                        const struct YuvConstants* yuvconstants,
                        int width) {
  asm volatile (
    YUVTORGB_SETUP
    "movi       v23.8b, #255                   \n" /* A */
  "1:                                          \n"
    READUYVY
    YUVTORGB(v22, v21, v20)
    "subs       %w2, %w2, #8                   \n"
    MEMACCESS(1)
    // Immediate written with '#' like every other post-index in this file.
    "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
    "b.gt       1b                             \n"
    : "+r"(src_uyvy),  // %0
      "+r"(dst_argb),  // %1
      "+r"(width)      // %2
    : [kUVToRB]"r"(&yuvconstants->kUVToRB),
      [kUVToG]"r"(&yuvconstants->kUVToG),
      [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
      [kYToRgb]"r"(&yuvconstants->kYToRgb)
    // v31 holds the Y scale loaded by YUVTORGB_SETUP, so it is clobbered too.
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
      "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30",
      "v31"
  );
}
577
578 // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v.
SplitUVRow_NEON(const uint8 * src_uv,uint8 * dst_u,uint8 * dst_v,int width)579 void SplitUVRow_NEON(const uint8* src_uv,
580 uint8* dst_u,
581 uint8* dst_v,
582 int width) {
583 asm volatile (
584 "1: \n"
585 MEMACCESS(0)
586 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV
587 "subs %w3, %w3, #16 \n" // 16 processed per loop
588 MEMACCESS(1)
589 "st1 {v0.16b}, [%1], #16 \n" // store U
590 MEMACCESS(2)
591 "st1 {v1.16b}, [%2], #16 \n" // store V
592 "b.gt 1b \n"
593 : "+r"(src_uv), // %0
594 "+r"(dst_u), // %1
595 "+r"(dst_v), // %2
596 "+r"(width) // %3 // Output registers
597 : // Input registers
598 : "cc", "memory", "v0", "v1" // Clobber List
599 );
600 }
601
602 // Reads 16 U's and V's and writes out 16 pairs of UV.
MergeUVRow_NEON(const uint8 * src_u,const uint8 * src_v,uint8 * dst_uv,int width)603 void MergeUVRow_NEON(const uint8* src_u,
604 const uint8* src_v,
605 uint8* dst_uv,
606 int width) {
607 asm volatile (
608 "1: \n"
609 MEMACCESS(0)
610 "ld1 {v0.16b}, [%0], #16 \n" // load U
611 MEMACCESS(1)
612 "ld1 {v1.16b}, [%1], #16 \n" // load V
613 "subs %w3, %w3, #16 \n" // 16 processed per loop
614 MEMACCESS(2)
615 "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV
616 "b.gt 1b \n"
617 :
618 "+r"(src_u), // %0
619 "+r"(src_v), // %1
620 "+r"(dst_uv), // %2
621 "+r"(width) // %3 // Output registers
622 : // Input registers
623 : "cc", "memory", "v0", "v1" // Clobber List
624 );
625 }
626
627 // Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15.
CopyRow_NEON(const uint8 * src,uint8 * dst,int count)628 void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
629 asm volatile (
630 "1: \n"
631 MEMACCESS(0)
632 "ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 32
633 "subs %w2, %w2, #32 \n" // 32 processed per loop
634 MEMACCESS(1)
635 "st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 32
636 "b.gt 1b \n"
637 : "+r"(src), // %0
638 "+r"(dst), // %1
639 "+r"(count) // %2 // Output registers
640 : // Input registers
641 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
642 );
643 }
644
645 // SetRow writes 'count' bytes using an 8 bit value repeated.
SetRow_NEON(uint8 * dst,uint8 v8,int count)646 void SetRow_NEON(uint8* dst, uint8 v8, int count) {
647 asm volatile (
648 "dup v0.16b, %w2 \n" // duplicate 16 bytes
649 "1: \n"
650 "subs %w1, %w1, #16 \n" // 16 bytes per loop
651 MEMACCESS(0)
652 "st1 {v0.16b}, [%0], #16 \n" // store
653 "b.gt 1b \n"
654 : "+r"(dst), // %0
655 "+r"(count) // %1
656 : "r"(v8) // %2
657 : "cc", "memory", "v0"
658 );
659 }
660
// Fills dst with the 32-bit value v32, 4 values (16 bytes) per iteration;
// count is in units of 32-bit values. The loop body always runs at least once
// and repeats while count - 4 is still > 0.
// MEMACCESS() takes the positional operand number.
void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
  asm volatile (
    "dup        v0.4s, %w[v32]                         \n"  // splat the word
  "1:                                                  \n"
    "subs       %w[count], %w[count], #4               \n"
    MEMACCESS(0)
    "st1        {v0.16b}, [%[dst]], #16                \n"  // 4 words out
    "b.gt       1b                                     \n"
    : [dst]"+r"(dst),     // %0
      [count]"+r"(count)  // %1
    : [v32]"r"(v32)       // %2
    : "cc", "memory", "v0"
  );
}
675
// Writes the bytes of src to dst in reverse order, 16 bytes per iteration.
// src is first moved to the last 16 bytes of the row and then walks backwards
// (post-index by -16) while dst walks forwards. The loop body always runs at
// least once and repeats while width - 16 is still > 0.
// MEMACCESS() takes the positional operand number.
void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
  asm volatile (
    // src = src + width - 16
    "add        %[src], %[src], %w[width], sxtw        \n"
    "sub        %[src], %[src], #16                    \n"
  "1:                                                  \n"
    MEMACCESS(0)
    "ld1        {v0.16b}, [%[src]], %[step]            \n"  // src -= 16
    "subs       %w[width], %w[width], #16              \n"
    "rev64      v0.16b, v0.16b                         \n"  // reverse each half
    MEMACCESS(1)
    "st1        {v0.d}[1], [%[dst]], #8                \n"  // high half first
    MEMACCESS(1)
    "st1        {v0.d}[0], [%[dst]], #8                \n"  // then low half
    "b.gt       1b                                     \n"
    : [src]"+r"(src),             // %0
      [dst]"+r"(dst),             // %1
      [width]"+r"(width)          // %2
    : [step]"r"((ptrdiff_t)-16)   // %3
    : "cc", "memory", "v0"
  );
}
698
// Reverses a row of packed UV pairs while splitting it: every iteration reads
// 8 pairs from the end of the remaining source, and writes the 8 even bytes
// reversed to dst_u and the 8 odd bytes reversed to dst_v. src_uv is first
// moved to the last 16 bytes of the row (width counts pairs) and then walks
// backwards. The loop body always runs at least once and repeats while
// width - 8 is still > 0.
// MEMACCESS() takes the positional operand number.
void MirrorUVRow_NEON(const uint8* src_uv,
                      uint8* dst_u,
                      uint8* dst_v,
                      int width) {
  asm volatile (
    // src_uv = src_uv + width * 2 - 16
    "add        %[src_uv], %[src_uv], %w[width], sxtw #1 \n"
    "sub        %[src_uv], %[src_uv], #16              \n"
  "1:                                                  \n"
    MEMACCESS(0)
    "ld2        {v0.8b, v1.8b}, [%[src_uv]], %[step]   \n"  // src_uv -= 16
    "subs       %w[width], %w[width], #8               \n"
    "rev64      v0.8b, v0.8b                           \n"  // reverse 8 U
    "rev64      v1.8b, v1.8b                           \n"  // reverse 8 V
    MEMACCESS(1)
    "st1        {v0.8b}, [%[dst_u]], #8                \n"
    MEMACCESS(2)
    "st1        {v1.8b}, [%[dst_v]], #8                \n"
    "b.gt       1b                                     \n"
    : [src_uv]"+r"(src_uv),       // %0
      [dst_u]"+r"(dst_u),         // %1
      [dst_v]"+r"(dst_v),         // %2
      [width]"+r"(width)          // %3
    : [step]"r"((ptrdiff_t)-16)   // %4
    : "cc", "memory", "v0", "v1"
  );
}
726
// Writes the 32-bit pixels of src to dst in reverse order, 4 pixels (16
// bytes) per iteration; the bytes inside each pixel keep their order. src is
// first moved to the last 16 bytes of the row (width counts pixels) and then
// walks backwards. The loop body always runs at least once and repeats while
// width - 4 is still > 0.
// MEMACCESS() takes the positional operand number.
void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
  asm volatile (
    // src = src + width * 4 - 16
    "add        %[src], %[src], %w[width], sxtw #2     \n"
    "sub        %[src], %[src], #16                    \n"
  "1:                                                  \n"
    MEMACCESS(0)
    "ld1        {v0.16b}, [%[src]], %[step]            \n"  // src -= 16
    "subs       %w[width], %w[width], #4               \n"
    "rev64      v0.4s, v0.4s                           \n"  // swap pixel pairs
    MEMACCESS(1)
    "st1        {v0.d}[1], [%[dst]], #8                \n"  // high half first
    MEMACCESS(1)
    "st1        {v0.d}[0], [%[dst]], #8                \n"  // then low half
    "b.gt       1b                                     \n"
    : [src]"+r"(src),             // %0
      [dst]"+r"(dst),             // %1
      [width]"+r"(width)          // %2
    : [step]"r"((ptrdiff_t)-16)   // %3
    : "cc", "memory", "v0"
  );
}
749
// Expands a row of 3-byte pixels to 4-byte pixels by appending a fourth byte
// of 255 to every pixel; the three source bytes keep their order. 8 pixels
// (24 bytes in, 32 bytes out) per iteration; the loop body always runs at
// least once and repeats while width - 8 is still > 0.
// MEMACCESS() takes the positional operand number.
void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) {
  asm volatile (
    "movi       v4.8b, #255                            \n"  // alpha
  "1:                                                  \n"
    MEMACCESS(0)
    "ld3        {v1.8b,v2.8b,v3.8b}, [%[src_rgb24]], #24 \n"
    "subs       %w[width], %w[width], #8               \n"
    MEMACCESS(1)
    "st4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%[dst_argb]], #32 \n"
    "b.gt       1b                                     \n"
    : [src_rgb24]"+r"(src_rgb24),  // %0
      [dst_argb]"+r"(dst_argb),    // %1
      [width]"+r"(width)           // %2
    :
    : "cc", "memory", "v1", "v2", "v3", "v4"
  );
}
767
// Converts a row of 3-byte RAW pixels to ARGB: the three source bytes are
// written in reversed order and a fourth byte of 255 is appended. 8 pixels
// (24 bytes in, 32 bytes out) per iteration; the loop body always runs at
// least once and repeats while width - 8 is still > 0.
// MEMACCESS() takes the positional operand number.
void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) {
  asm volatile (
    "movi       v5.8b, #255                            \n"  // alpha
  "1:                                                  \n"
    MEMACCESS(0)
    "ld3        {v0.8b,v1.8b,v2.8b}, [%[src_raw]], #24 \n"  // r, g, b
    "subs       %w[width], %w[width], #8               \n"
    "mov        v3.8b, v1.8b                           \n"  // g
    "mov        v4.8b, v0.8b                           \n"  // r
    MEMACCESS(1)
    "st4        {v2.8b,v3.8b,v4.8b,v5.8b}, [%[dst_argb]], #32 \n"  // b g r a
    "b.gt       1b                                     \n"
    : [src_raw]"+r"(src_raw),    // %0
      [dst_argb]"+r"(dst_argb),  // %1
      [width]"+r"(width)         // %2
    :
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"
  );
}
787
// Converts a row of 3-byte RAW pixels to RGB24 by reversing the byte order
// inside every pixel. 8 pixels (24 bytes) per iteration; the loop body always
// runs at least once and repeats while width - 8 is still > 0.
// MEMACCESS() takes the positional operand number.
void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) {
  asm volatile (
  "1:                                                  \n"
    MEMACCESS(0)
    "ld3        {v0.8b,v1.8b,v2.8b}, [%[src_raw]], #24 \n"  // r, g, b
    "subs       %w[width], %w[width], #8               \n"
    "mov        v3.8b, v1.8b                           \n"  // g
    "mov        v4.8b, v0.8b                           \n"  // r
    MEMACCESS(1)
    "st3        {v2.8b,v3.8b,v4.8b}, [%[dst_rgb24]], #24 \n"  // b g r
    "b.gt       1b                                     \n"
    : [src_raw]"+r"(src_raw),      // %0
      [dst_rgb24]"+r"(dst_rgb24),  // %1
      [width]"+r"(width)           // %2
    :
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4"
  );
}
806
// Unpacks 8 RGB565 pixels to 8-bit channels.
// In:  v0.8h = 8 pixels (bits 15..11 R, 10..5 G, 4..0 B).
// Out: v0.8b = B, v1.8b = G, v2.8b = R. Scratch: v4, v6.
// Each channel is left-aligned in a byte and its top bits are copied into the
// vacated low bits, so a full-scale field expands to 255.
#define RGB565TOARGB                                                  \
  "shrn       v6.8b, v0.8h, #5               \n" /* xxGGGGGG */       \
  "xtn        v2.8b, v0.8h                   \n" /* xxxBBBBB */       \
  "ushr       v0.8h, v0.8h, #11              \n" /* 000RRRRR */       \
  "xtn2       v2.16b,v0.8h                   \n" /* v2 = B | R */     \
  "shl        v2.16b, v2.16b, #3             \n" /* B,R upper 5 */    \
  "ushr       v0.16b, v2.16b, #5             \n" /* B,R lower 3 */    \
  "orr        v0.16b, v0.16b, v2.16b         \n" /* B | R, 8 bit */   \
  "dup        v2.2D, v0.D[1]                 \n" /* R to v2 low */    \
  "shl        v6.8b, v6.8b, #2               \n" /* GGGGGG00 */       \
  "ushr       v4.8b, v6.8b, #6               \n" /* 000000GG */       \
  "orr        v1.8b, v4.8b, v6.8b            \n" /* G, 8 bit */
819
// Converts a row of RGB565 pixels to ARGB (bytes stored as B, G, R, A with
// A = 255). 8 pixels (16 bytes in, 32 bytes out) per iteration; the loop body
// always runs at least once and repeats while width - 8 is still > 0.
// MEMACCESS() takes the positional operand number.
void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) {
  asm volatile (
    "movi       v3.8b, #255                            \n"  // alpha
  "1:                                                  \n"
    MEMACCESS(0)
    "ld1        {v0.16b}, [%[src_rgb565]], #16         \n"  // 8 pixels in
    "subs       %w[width], %w[width], #8               \n"
    RGB565TOARGB
    MEMACCESS(1)
    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%[dst_argb]], #32 \n"
    "b.gt       1b                                     \n"
    : [src_rgb565]"+r"(src_rgb565),  // %0
      [dst_argb]"+r"(dst_argb),      // %1
      [width]"+r"(width)             // %2
    :
    : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6"
  );
}
838
// Unpacks 8 ARGB1555 pixels to 8-bit channels.
// In:  v0.8h = 8 pixels (bit 15 A, 14..10 R, 9..5 G, 4..0 B).
// Out: v0.8b = B, v1.8b = G, v2.8b = R, v3.8b = A (0 or 255).
// The 5-bit channels are left-aligned in a byte and their top 3 bits are
// copied into the low bits; alpha is the sign bit replicated by an
// arithmetic shift.
#define ARGB1555TOARGB                                                \
  "ushr       v2.8h, v0.8h, #10              \n" /* xxxRRRRR */       \
  "shl        v2.8h, v2.8h, #3               \n" /* RRRRR000 */       \
  "xtn        v3.8b, v2.8h                   \n" /* v3 low = R */     \
  "sshr       v2.8h, v0.8h, #15              \n" /* AAAAAAAA */       \
  "xtn2       v3.16b, v2.8h                  \n" /* v3 = R | A */     \
  "ushr       v1.16b, v3.16b, #5             \n" /* R,A lower 3 */    \
  "xtn        v2.8b, v0.8h                   \n" /* xxxBBBBB */       \
  "shrn2      v2.16b,v0.8h, #5               \n" /* v2 = B | G */     \
  "shl        v0.16b, v2.16b, #3             \n" /* B,G upper 5 */    \
  "ushr       v2.16b, v0.16b, #5             \n" /* B,G lower 3 */    \
  "orr        v0.16b, v0.16b, v2.16b         \n" /* B | G, 8 bit */   \
  "orr        v2.16b, v1.16b, v3.16b         \n" /* R | A, 8 bit */   \
  "dup        v3.2D, v2.D[1]                 \n" /* A to v3 low */    \
  "dup        v1.2D, v0.D[1]                 \n" /* G to v1 low */
858
// Same unpacking as ARGB1555TOARGB, but the alpha bit is ignored.
// In:  v0.8h = 8 pixels (bits 14..10 R, 9..5 G, 4..0 B).
// Out: v0.8b = B, v1.8b = G, v2.8b = R. Scratch: v3.
#define RGB555TOARGB                                                  \
  "ushr       v2.8h, v0.8h, #10              \n" /* xxxRRRRR */       \
  "shl        v2.8h, v2.8h, #3               \n" /* RRRRR000 */       \
  "xtn        v3.8b, v2.8h                   \n" /* v3 low = R */     \
  "ushr       v1.16b, v3.16b, #5             \n" /* R lower 3 */      \
  "xtn        v2.8b, v0.8h                   \n" /* xxxBBBBB */       \
  "shrn2      v2.16b,v0.8h, #5               \n" /* v2 = B | G */     \
  "shl        v0.16b, v2.16b, #3             \n" /* B,G upper 5 */    \
  "ushr       v2.16b, v0.16b, #5             \n" /* B,G lower 3 */    \
  "orr        v0.16b, v0.16b, v2.16b         \n" /* B | G, 8 bit */   \
  "orr        v2.16b, v1.16b, v3.16b         \n" /* R, 8 bit */       \
  "dup        v1.2D, v0.D[1]                 \n" /* G to v1 low */
875
// Converts a row of ARGB1555 pixels to ARGB (bytes stored as B, G, R, A).
// 8 pixels (16 bytes in, 32 bytes out) per iteration; the loop body always
// runs at least once and repeats while width - 8 is still > 0.
// Alpha is not preset before the loop: ARGB1555TOARGB rewrites v3 in full on
// every iteration, deriving it from the top bit of each source pixel.
void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555,
                            uint8* dst_argb,
                            int width) {
  asm volatile (
  "1:                                          \n"
    MEMACCESS(0)
    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB1555 pixels.
    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
    ARGB1555TOARGB
    MEMACCESS(1)
    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
    "b.gt       1b                             \n"
    : "+r"(src_argb1555),  // %0
      "+r"(dst_argb),      // %1
      "+r"(width)          // %2
    :
    : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
  );
}
896
// Unpacks 8 ARGB4444 pixels to 8-bit channels.
// In:  v0.8h = 8 pixels (bits 15..12 A, 11..8 R, 7..4 G, 3..0 B).
// Out: v0.8b = B, v1.8b = G, v2.8b = R, v3.8b = A.
// Every nibble is duplicated into both halves of its output byte.
#define ARGB4444TOARGB                                                \
  "shrn       v1.8b,  v0.8h, #8              \n" /* v1 low  = A:R */  \
  "xtn2       v1.16b, v0.8h                  \n" /* v1 high = G:B */  \
  /* Low nibbles: R (low half) and B (high half). */                  \
  "shl        v2.16b, v1.16b, #4             \n" /* xxxx0000 */       \
  "ushr       v0.16b, v2.16b, #4             \n" /* 0000xxxx */       \
  "orr        v2.16b, v0.16b, v2.16b         \n" /* v2 = R | B */     \
  /* High nibbles: A (low half) and G (high half). */                 \
  "ushr       v3.16b, v1.16b, #4             \n" /* 0000xxxx */       \
  "shl        v1.16b, v3.16b, #4             \n" /* xxxx0000 */       \
  "orr        v3.16b, v1.16b, v3.16b         \n" /* v3 = A | G */     \
  "dup        v0.2D, v2.D[1]                 \n" /* B to v0 low */    \
  "dup        v1.2D, v3.D[1]                 \n" /* G to v1 low */
908
// Converts a row of ARGB4444 pixels to ARGB (bytes stored as B, G, R, A).
// 8 pixels (16 bytes in, 32 bytes out) per iteration; the loop body always
// runs at least once and repeats while width - 8 is still > 0.
void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444,
                            uint8* dst_argb,
                            int width) {
  asm volatile (
  "1:                                          \n"
    MEMACCESS(0)
    "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB4444 pixels.
    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
    ARGB4444TOARGB
    MEMACCESS(1)
    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
    "b.gt       1b                             \n"
    : "+r"(src_argb4444),  // %0
      "+r"(dst_argb),      // %1
      "+r"(width)          // %2
    :
    // Only v0..v3 are written here and by ARGB4444TOARGB.
    : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
  );
}
928
// Converts a row of 4-byte ARGB pixels to 3-byte RGB24 by dropping the fourth
// byte of every pixel; the first three bytes keep their order. 8 pixels
// (32 bytes in, 24 bytes out) per iteration; the loop body always runs at
// least once and repeats while width - 8 is still > 0.
// MEMACCESS() takes the positional operand number.
void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) {
  asm volatile (
  "1:                                                  \n"
    MEMACCESS(0)
    "ld4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%[src_argb]], #32 \n"  // 8 ARGB in
    "subs       %w[width], %w[width], #8               \n"
    MEMACCESS(1)
    "st3        {v1.8b,v2.8b,v3.8b}, [%[dst_rgb24]], #24 \n"  // v4 is dropped
    "b.gt       1b                                     \n"
    : [src_argb]"+r"(src_argb),    // %0
      [dst_rgb24]"+r"(dst_rgb24),  // %1
      [width]"+r"(width)           // %2
    :
    : "cc", "memory", "v1", "v2", "v3", "v4"
  );
}
945
// Converts a row of 4-byte ARGB pixels (b, g, r, a in memory) to 3-byte RAW
// pixels (r, g, b in memory); alpha is dropped. 8 pixels (32 bytes in,
// 24 bytes out) per iteration; the loop body always runs at least once and
// repeats while width - 8 is still > 0.
// MEMACCESS() takes the positional operand number.
void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) {
  asm volatile (
  "1:                                                  \n"
    MEMACCESS(0)
    "ld4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%[src_argb]], #32 \n"  // b g r a
    "subs       %w[width], %w[width], #8               \n"
    "mov        v4.8b, v2.8b                           \n"  // g (replaces a)
    "mov        v5.8b, v1.8b                           \n"  // b
    MEMACCESS(1)
    "st3        {v3.8b,v4.8b,v5.8b}, [%[dst_raw]], #24 \n"  // r g b
    "b.gt       1b                                     \n"
    : [src_argb]"+r"(src_argb),  // %0
      [dst_raw]"+r"(dst_raw),    // %1
      [width]"+r"(width)         // %2
    :
    : "cc", "memory", "v1", "v2", "v3", "v4", "v5"
  );
}
964
// Extracts the Y samples from a row of YUY2.
// ld2 de-interleaves 32 source bytes into even-indexed bytes (v0) and
// odd-indexed bytes (v1); only v0 is stored, giving 16 Y bytes per iteration.
// NOTE(review): the count is only tested after the body, so 16 pixels are
// always processed per pass -- confirm callers pass a positive multiple of 16.
YUY2ToYRow_NEON(const uint8 * src_yuy2,uint8 * dst_y,int width)965 void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) {
966 asm volatile (
967 "1: \n"
968 MEMACCESS(0)
969 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
970 "subs %w2, %w2, #16 \n" // 16 processed per loop.
971 MEMACCESS(1)
972 "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
973 "b.gt 1b \n"
974 : "+r"(src_yuy2), // %0
975 "+r"(dst_y), // %1
976 "+r"(width) // %2
977 :
978 : "cc", "memory", "v0", "v1" // Clobber List
979 );
980 }
981
// Extracts the Y samples from a row of UYVY.
// ld2 de-interleaves 32 source bytes into even-indexed bytes (v0) and
// odd-indexed bytes (v1); only v1 is stored, giving 16 Y bytes per iteration.
// NOTE(review): the count is only tested after the body, so 16 pixels are
// always processed per pass -- confirm callers pass a positive multiple of 16.
UYVYToYRow_NEON(const uint8 * src_uyvy,uint8 * dst_y,int width)982 void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) {
983 asm volatile (
984 "1: \n"
985 MEMACCESS(0)
986 "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
987 "subs %w2, %w2, #16 \n" // 16 processed per loop.
988 MEMACCESS(1)
989 "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
990 "b.gt 1b \n"
991 : "+r"(src_uyvy), // %0
992 "+r"(dst_y), // %1
993 "+r"(width) // %2
994 :
995 : "cc", "memory", "v0", "v1" // Clobber List
996 );
997 }
998
// Extracts the U and V samples from one row of YUY2 without vertical
// averaging. ld4 splits 32 source bytes by position within each 4-byte group
// into v0..v3; byte 1 of each group (v1) goes to dst_u and byte 3 (v3) to
// dst_v, so 16 pixels yield 8 U and 8 V bytes per iteration.
// NOTE(review): the count is only tested after the body, so 16 pixels are
// always processed per pass -- confirm callers pass a positive multiple of 16.
YUY2ToUV422Row_NEON(const uint8 * src_yuy2,uint8 * dst_u,uint8 * dst_v,int width)999 void YUY2ToUV422Row_NEON(const uint8* src_yuy2,
1000 uint8* dst_u,
1001 uint8* dst_v,
1002 int width) {
1003 asm volatile (
1004 "1: \n"
1005 MEMACCESS(0)
1006 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 pixels
1007 "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
1008 MEMACCESS(1)
1009 "st1 {v1.8b}, [%1], #8 \n" // store 8 U.
1010 MEMACCESS(2)
1011 "st1 {v3.8b}, [%2], #8 \n" // store 8 V.
1012 "b.gt 1b \n"
1013 : "+r"(src_yuy2), // %0
1014 "+r"(dst_u), // %1
1015 "+r"(dst_v), // %2
1016 "+r"(width) // %3
1017 :
1018 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
1019 );
1020 }
1021
// Extracts the U and V samples from one row of UYVY without vertical
// averaging. ld4 splits 32 source bytes by position within each 4-byte group
// into v0..v3; byte 0 of each group (v0) goes to dst_u and byte 2 (v2) to
// dst_v, so 16 pixels yield 8 U and 8 V bytes per iteration.
// NOTE(review): the count is only tested after the body, so 16 pixels are
// always processed per pass -- confirm callers pass a positive multiple of 16.
UYVYToUV422Row_NEON(const uint8 * src_uyvy,uint8 * dst_u,uint8 * dst_v,int width)1022 void UYVYToUV422Row_NEON(const uint8* src_uyvy,
1023 uint8* dst_u,
1024 uint8* dst_v,
1025 int width) {
1026 asm volatile (
1027 "1: \n"
1028 MEMACCESS(0)
1029 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY pixels
1030 "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
1031 MEMACCESS(1)
1032 "st1 {v0.8b}, [%1], #8 \n" // store 8 U.
1033 MEMACCESS(2)
1034 "st1 {v2.8b}, [%2], #8 \n" // store 8 V.
1035 "b.gt 1b \n"
1036 : "+r"(src_uyvy), // %0
1037 "+r"(dst_u), // %1
1038 "+r"(dst_v), // %2
1039 "+r"(width) // %3
1040 :
1041 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
1042 );
1043 }
1044
// Extracts U and V from two vertically adjacent YUY2 rows and averages them.
// The second row starts at src_yuy2 + stride_yuy2. For each row ld4 splits 32
// bytes by position within each 4-byte group; byte 1 (U) and byte 3 (V) of
// both rows are combined with urhadd (rounding halving add) and 8 U plus 8 V
// bytes are written per iteration.
// NOTE(review): the count is only tested after the body, so 16 pixels are
// always processed per pass -- confirm callers pass a positive multiple of 16.
YUY2ToUVRow_NEON(const uint8 * src_yuy2,int stride_yuy2,uint8 * dst_u,uint8 * dst_v,int width)1045 void YUY2ToUVRow_NEON(const uint8* src_yuy2,
1046 int stride_yuy2,
1047 uint8* dst_u,
1048 uint8* dst_v,
1049 int width) {
1050 const uint8* src_yuy2b = src_yuy2 + stride_yuy2;
1051 asm volatile (
1052 "1: \n"
1053 MEMACCESS(0)
1054 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
1055 "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
1056 MEMACCESS(1)
1057 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
1058 "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U
1059 "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V
1060 MEMACCESS(2)
1061 "st1 {v1.8b}, [%2], #8 \n" // store 8 U.
1062 MEMACCESS(3)
1063 "st1 {v3.8b}, [%3], #8 \n" // store 8 V.
1064 "b.gt 1b \n"
1065 : "+r"(src_yuy2), // %0
1066 "+r"(src_yuy2b), // %1
1067 "+r"(dst_u), // %2
1068 "+r"(dst_v), // %3
1069 "+r"(width) // %4
1070 :
1071 : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
1072 "v5", "v6", "v7" // Clobber List
1073 );
1074 }
1075
// Extracts U and V from two vertically adjacent UYVY rows and averages them.
// The second row starts at src_uyvy + stride_uyvy. For each row ld4 splits 32
// bytes by position within each 4-byte group; byte 0 (U) and byte 2 (V) of
// both rows are combined with urhadd (rounding halving add) and 8 U plus 8 V
// bytes are written per iteration.
// NOTE(review): the count is only tested after the body, so 16 pixels are
// always processed per pass -- confirm callers pass a positive multiple of 16.
UYVYToUVRow_NEON(const uint8 * src_uyvy,int stride_uyvy,uint8 * dst_u,uint8 * dst_v,int width)1076 void UYVYToUVRow_NEON(const uint8* src_uyvy,
1077 int stride_uyvy,
1078 uint8* dst_u,
1079 uint8* dst_v,
1080 int width) {
1081 const uint8* src_uyvyb = src_uyvy + stride_uyvy;
1082 asm volatile (
1083 "1: \n"
1084 MEMACCESS(0)
1085 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels
1086 "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
1087 MEMACCESS(1)
1088 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row
1089 "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U
1090 "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V
1091 MEMACCESS(2)
1092 "st1 {v0.8b}, [%2], #8 \n" // store 8 U.
1093 MEMACCESS(3)
1094 "st1 {v2.8b}, [%3], #8 \n" // store 8 V.
1095 "b.gt 1b \n"
1096 : "+r"(src_uyvy), // %0
1097 "+r"(src_uyvyb), // %1
1098 "+r"(dst_u), // %2
1099 "+r"(dst_v), // %3
1100 "+r"(width) // %4
1101 :
1102 : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
1103 "v5", "v6", "v7" // Clobber List
1104 );
1105 }
1106
1107 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// Reorders the bytes of a row of 4-byte pixels using a 16-byte index table.
// The 16 bytes at shuffler are loaded once into v2; each iteration loads 16
// source bytes (4 pixels), permutes them with tbl using v2 as byte indices,
// and stores the 16 result bytes.
// NOTE(review): the count is only tested after the body, so 4 pixels are
// always processed per pass -- confirm callers pass a positive multiple of 4.
ARGBShuffleRow_NEON(const uint8 * src_argb,uint8 * dst_argb,const uint8 * shuffler,int width)1108 void ARGBShuffleRow_NEON(const uint8* src_argb,
1109 uint8* dst_argb,
1110 const uint8* shuffler,
1111 int width) {
1112 asm volatile (
1113 MEMACCESS(3)
1114 "ld1 {v2.16b}, [%3] \n" // shuffler
1115 "1: \n"
1116 MEMACCESS(0)
1117 "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels.
1118 "subs %w2, %w2, #4 \n" // 4 processed per loop
1119 "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
1120 MEMACCESS(1)
1121 "st1 {v1.16b}, [%1], #16 \n" // store 4.
1122 "b.gt 1b \n"
1123 : "+r"(src_argb), // %0
1124 "+r"(dst_argb), // %1
1125 "+r"(width) // %2
1126 : "r"(shuffler) // %3
1127 : "cc", "memory", "v0", "v1", "v2" // Clobber List
1128 );
1129 }
1130
// Packs separate Y, U and V rows into YUY2 (byte order Y0 U Y1 V).
// ld2 splits 16 Y bytes into even-indexed (v0) and odd-indexed (v1) samples;
// the odd samples are moved to v2 so that v1 can receive 8 U bytes and v3
// 8 V bytes. st4 then interleaves v0,v1,v2,v3, writing 32 bytes (16 pixels).
// NOTE(review): the count is only tested after the body, so 16 pixels are
// always processed per pass -- confirm callers pass a positive multiple of 16.
I422ToYUY2Row_NEON(const uint8 * src_y,const uint8 * src_u,const uint8 * src_v,uint8 * dst_yuy2,int width)1131 void I422ToYUY2Row_NEON(const uint8* src_y,
1132 const uint8* src_u,
1133 const uint8* src_v,
1134 uint8* dst_yuy2,
1135 int width) {
1136 asm volatile (
1137 "1: \n"
1138 MEMACCESS(0)
1139 "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys
1140 "orr v2.8b, v1.8b, v1.8b \n" // move odd Ys to v2
1141 MEMACCESS(1)
1142 "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us
1143 MEMACCESS(2)
1144 "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs
1145 "subs %w4, %w4, #16 \n" // 16 pixels
1146 MEMACCESS(3)
1147 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
1148 "b.gt 1b \n"
1149 : "+r"(src_y), // %0
1150 "+r"(src_u), // %1
1151 "+r"(src_v), // %2
1152 "+r"(dst_yuy2), // %3
1153 "+r"(width) // %4
1154 :
1155 : "cc", "memory", "v0", "v1", "v2", "v3"
1156 );
1157 }
1158
// Packs separate Y, U and V rows into UYVY (byte order U Y0 V Y1).
// ld2 splits 16 Y bytes into even-indexed (v1) and odd-indexed (v2) samples;
// the odd samples are moved to v3 so that v2 can receive 8 V bytes, and v0
// receives 8 U bytes. st4 then interleaves v0,v1,v2,v3, writing 32 bytes
// (16 pixels).
// NOTE(review): the count is only tested after the body, so 16 pixels are
// always processed per pass -- confirm callers pass a positive multiple of 16.
I422ToUYVYRow_NEON(const uint8 * src_y,const uint8 * src_u,const uint8 * src_v,uint8 * dst_uyvy,int width)1159 void I422ToUYVYRow_NEON(const uint8* src_y,
1160 const uint8* src_u,
1161 const uint8* src_v,
1162 uint8* dst_uyvy,
1163 int width) {
1164 asm volatile (
1165 "1: \n"
1166 MEMACCESS(0)
1167 "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys
1168 "orr v3.8b, v2.8b, v2.8b \n" // move odd Ys to v3
1169 MEMACCESS(1)
1170 "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
1171 MEMACCESS(2)
1172 "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
1173 "subs %w4, %w4, #16 \n" // 16 pixels
1174 MEMACCESS(3)
1175 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
1176 "b.gt 1b \n"
1177 : "+r"(src_y), // %0
1178 "+r"(src_u), // %1
1179 "+r"(src_v), // %2
1180 "+r"(dst_uyvy), // %3
1181 "+r"(width) // %4
1182 :
1183 : "cc", "memory", "v0", "v1", "v2", "v3"
1184 );
1185 }
1186
// Converts a row of ARGB pixels to 16-bit RGB565.
// Each iteration de-interleaves 8 pixels into v20..v23 with ld4, runs the
// ARGBTORGB565 macro (defined earlier in this file) and stores the 16 bytes
// left in v0.
// NOTE(review): the count is only tested after the body, so 8 pixels are
// always processed per pass -- confirm callers pass a positive multiple of 8.
ARGBToRGB565Row_NEON(const uint8 * src_argb,uint8 * dst_rgb565,int width)1187 void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) {
1188 asm volatile (
1189 "1: \n"
1190 MEMACCESS(0)
1191 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
1192 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1193 ARGBTORGB565
1194 MEMACCESS(1)
1195 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565.
1196 "b.gt 1b \n"
1197 : "+r"(src_argb), // %0
1198 "+r"(dst_rgb565), // %1
1199 "+r"(width) // %2
1200 :
1201 : "cc", "memory", "v0", "v20", "v21", "v22", "v23"
1202 );
1203 }
1204
// Converts a row of ARGB pixels to 16-bit RGB565, adding a dither value to
// each colour channel before packing.
//
// dither4 is broadcast to every 32-bit lane of v1, so the low 8 bytes of v1
// hold its four bytes twice; byte (i % 4) of dither4 is therefore added, with
// unsigned saturation, to the first three channel bytes of pixel i in each
// group of 8. The ARGBTORGB565 macro then packs v20..v22 into v0.
//
// src_argb and width are read-write ("+r") operands because the asm
// post-increments the pointer and decrements the count; GCC requires that
// input-only operands are left unmodified. The operand order matches
// ARGBToRGB565Row_NEON (src = %0, dst = %1, width = %2).
//
// NOTE(review): the count is only tested after the body, so 8 pixels are
// always processed per pass -- confirm callers pass a positive multiple of 8.
void ARGBToRGB565DitherRow_NEON(const uint8* src_argb,
                                uint8* dst_rgb,
                                const uint32 dither4,
                                int width) {
  asm volatile (
    "dup        v1.4s, %w3                     \n"  // dither4
  "1:                                          \n"
    MEMACCESS(0)
    "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
    "uqadd      v20.8b, v20.8b, v1.8b          \n"  // dither first channel
    "uqadd      v21.8b, v21.8b, v1.8b          \n"  // dither second channel
    "uqadd      v22.8b, v22.8b, v1.8b          \n"  // dither third channel
    ARGBTORGB565
    MEMACCESS(1)
    "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels RGB565.
    "b.gt       1b                             \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_rgb),   // %1
    "+r"(width)      // %2
  : "r"(dither4)     // %3
  : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23"
  );
}
1229
// Converts a row of ARGB pixels to 16-bit ARGB1555.
// Each iteration de-interleaves 8 pixels into v20..v23 with ld4, runs the
// ARGBTOARGB1555 macro (defined earlier in this file) and stores the 16 bytes
// left in v0.
// NOTE(review): the count is only tested after the body, so 8 pixels are
// always processed per pass -- confirm callers pass a positive multiple of 8.
ARGBToARGB1555Row_NEON(const uint8 * src_argb,uint8 * dst_argb1555,int width)1230 void ARGBToARGB1555Row_NEON(const uint8* src_argb,
1231 uint8* dst_argb1555,
1232 int width) {
1233 asm volatile (
1234 "1: \n"
1235 MEMACCESS(0)
1236 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
1237 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1238 ARGBTOARGB1555
1239 MEMACCESS(1)
1240 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB1555.
1241 "b.gt 1b \n"
1242 : "+r"(src_argb), // %0
1243 "+r"(dst_argb1555), // %1
1244 "+r"(width) // %2
1245 :
1246 : "cc", "memory", "v0", "v20", "v21", "v22", "v23"
1247 );
1248 }
1249
// Converts a row of ARGB pixels to 16-bit ARGB4444.
// v4 is filled with 0x0f in every byte once, before the loop; presumably the
// ARGBTOARGB4444 macro (defined earlier in this file) uses it as the low
// nibble mask -- confirm against the macro. Each iteration de-interleaves
// 8 pixels into v20..v23, runs the macro and stores the 16 bytes left in v0.
// NOTE(review): the count is only tested after the body, so 8 pixels are
// always processed per pass -- confirm callers pass a positive multiple of 8.
ARGBToARGB4444Row_NEON(const uint8 * src_argb,uint8 * dst_argb4444,int width)1250 void ARGBToARGB4444Row_NEON(const uint8* src_argb,
1251 uint8* dst_argb4444,
1252 int width) {
1253 asm volatile (
1254 "movi v4.16b, #0x0f \n" // bits to clear with vbic.
1255 "1: \n"
1256 MEMACCESS(0)
1257 "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels
1258 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1259 ARGBTOARGB4444
1260 MEMACCESS(1)
1261 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB4444.
1262 "b.gt 1b \n"
1263 : "+r"(src_argb), // %0
1264 "+r"(dst_argb4444), // %1
1265 "+r"(width) // %2
1266 :
1267 : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23"
1268 );
1269 }
1270
// Computes one Y (luma) byte per ARGB pixel:
//   Y = sat8(((13 * B + 65 * G + 33 * R + 64) >> 7) + 16)
// The weighted sum is accumulated in 16-bit lanes of v3 (the loaded alpha
// channel is discarded), narrowed with a rounding shift by 7 (sqrshrun) and
// then offset by 16 with a saturating add.
// NOTE(review): the count is only tested after the body, so 8 pixels are
// always processed per pass -- confirm callers pass a positive multiple of 8.
ARGBToYRow_NEON(const uint8 * src_argb,uint8 * dst_y,int width)1271 void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
1272 asm volatile (
1273 "movi v4.8b, #13 \n" // B * 0.1016 coefficient
1274 "movi v5.8b, #65 \n" // G * 0.5078 coefficient
1275 "movi v6.8b, #33 \n" // R * 0.2578 coefficient
1276 "movi v7.8b, #16 \n" // Add 16 constant
1277 "1: \n"
1278 MEMACCESS(0)
1279 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
1280 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1281 "umull v3.8h, v0.8b, v4.8b \n" // B
1282 "umlal v3.8h, v1.8b, v5.8b \n" // G
1283 "umlal v3.8h, v2.8b, v6.8b \n" // R
1284 "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
1285 "uqadd v0.8b, v0.8b, v7.8b \n" // + 16, saturating
1286 MEMACCESS(1)
1287 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
1288 "b.gt 1b \n"
1289 : "+r"(src_argb), // %0
1290 "+r"(dst_y), // %1
1291 "+r"(width) // %2
1292 :
1293 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
1294 );
1295 }
1296
// Copies the fourth byte (alpha) of every 4-byte ARGB pixel to dst_a.
// ld4 de-interleaves 64 source bytes (16 pixels) into v0..v3; only v3 is
// stored, giving 16 alpha bytes per iteration.
// NOTE(review): the count is only tested after the body, so 16 pixels are
// always processed per pass -- confirm callers pass a positive multiple of 16.
ARGBExtractAlphaRow_NEON(const uint8 * src_argb,uint8 * dst_a,int width)1297 void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) {
1298 asm volatile (
1299 "1: \n"
1300 MEMACCESS(0)
1301 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load row 16 pixels
1302 "subs %w2, %w2, #16 \n" // 16 processed per loop
1303 MEMACCESS(1)
1304 "st1 {v3.16b}, [%1], #16 \n" // store 16 A's.
1305 "b.gt 1b \n"
1306 : "+r"(src_argb), // %0
1307 "+r"(dst_a), // %1
1308 "+r"(width) // %2
1309 :
1310 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
1311 );
1312 }
1313
// Computes one Y (luma) byte per ARGB pixel with no offset:
//   Y = sat8((15 * B + 75 * G + 38 * R + 64) >> 7)
// The coefficients sum to 128, so the weighted sum fits in 15 bits. It is
// accumulated in 16-bit lanes of v3 (the loaded alpha channel is discarded)
// and narrowed with a rounding shift by 7 (sqrshrun).
// NOTE(review): the count is only tested after the body, so 8 pixels are
// always processed per pass -- confirm callers pass a positive multiple of 8.
ARGBToYJRow_NEON(const uint8 * src_argb,uint8 * dst_y,int width)1314 void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
1315 asm volatile (
1316 "movi v4.8b, #15 \n" // B * 0.11400 coefficient
1317 "movi v5.8b, #75 \n" // G * 0.58700 coefficient
1318 "movi v6.8b, #38 \n" // R * 0.29900 coefficient
1319 "1: \n"
1320 MEMACCESS(0)
1321 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
1322 "subs %w2, %w2, #8 \n" // 8 processed per loop.
1323 "umull v3.8h, v0.8b, v4.8b \n" // B
1324 "umlal v3.8h, v1.8b, v5.8b \n" // G
1325 "umlal v3.8h, v2.8b, v6.8b \n" // R
1326 "sqrshrun v0.8b, v3.8h, #7 \n" // 15 bit to 8 bit Y
1327 MEMACCESS(1)
1328 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
1329 "b.gt 1b \n"
1330 : "+r"(src_argb), // %0
1331 "+r"(dst_y), // %1
1332 "+r"(width) // %2
1333 :
1334 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
1335 );
1336 }
1337
1338 // 8x1 pixels.
// Computes one U and one V byte for every ARGB pixel (no subsampling):
//   U = (112 * B - 74 * G - 38 * R + 0x8080) >> 8
//   V = (112 * R - 94 * G - 18 * B + 0x8080) >> 8
// The sums are formed in unsigned 16-bit lanes; the multiply-subtract steps
// may wrap below zero, and adding the 0x8080 bias (v29 viewed as 8h lanes)
// brings the result back into range before the narrowing shift.
// NOTE(review): the count is only tested after the body, so 8 pixels are
// always processed per pass -- confirm callers pass a positive multiple of 8.
ARGBToUV444Row_NEON(const uint8 * src_argb,uint8 * dst_u,uint8 * dst_v,int width)1339 void ARGBToUV444Row_NEON(const uint8* src_argb,
1340 uint8* dst_u,
1341 uint8* dst_v,
1342 int width) {
1343 asm volatile (
1344 "movi v24.8b, #112 \n" // UB / VR 0.875 coefficient
1345 "movi v25.8b, #74 \n" // UG -0.5781 coefficient
1346 "movi v26.8b, #38 \n" // UR -0.2969 coefficient
1347 "movi v27.8b, #18 \n" // VB -0.1406 coefficient
1348 "movi v28.8b, #94 \n" // VG -0.7344 coefficient
1349 "movi v29.16b,#0x80 \n" // 128.5 (0x8080 in each 16-bit lane)
1350 "1: \n"
1351 MEMACCESS(0)
1352 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
1353 "subs %w3, %w3, #8 \n" // 8 processed per loop.
1354 "umull v4.8h, v0.8b, v24.8b \n" // B
1355 "umlsl v4.8h, v1.8b, v25.8b \n" // G
1356 "umlsl v4.8h, v2.8b, v26.8b \n" // R
1357 "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned
1358
1359 "umull v3.8h, v2.8b, v24.8b \n" // R
1360 "umlsl v3.8h, v1.8b, v28.8b \n" // G
1361 "umlsl v3.8h, v0.8b, v27.8b \n" // B
1362 "add v3.8h, v3.8h, v29.8h \n" // +128 -> unsigned
1363
1364 "uqshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit U
1365 "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
1366
1367 MEMACCESS(1)
1368 "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
1369 MEMACCESS(2)
1370 "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
1371 "b.gt 1b \n"
1372 : "+r"(src_argb), // %0
1373 "+r"(dst_u), // %1
1374 "+r"(dst_v), // %2
1375 "+r"(width) // %3
1376 :
1377 : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
1378 "v24", "v25", "v26", "v27", "v28", "v29"
1379 );
1380 }
1381
// Loads the constants consumed by the RGBTOUV macro into v20..v25.
// v20..v24 hold the halved U/V coefficients in 16-bit lanes and v25 holds the
// rounding bias 0x8080 in each 16-bit lane. The coefficients are halved
// because the callers in this file feed RGBTOUV with channel values that are
// twice the 2x2 average (sum of four samples, rounding-shifted right by 1).
// Must be emitted once before the loop of any asm block that uses RGBTOUV.
1382 #define RGBTOUV_SETUP_REG \
1383 "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \
1384 "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \
1385 "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \
1386 "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \
1387 "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \
1388 "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */
1389
1390 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
// QB, QG and QR name the vector registers (with arrangement, e.g. v0.8h)
// that hold eight 16-bit B, G and R values. Using the constants in v20..v25:
//   U = (QB * v20 - QG * v21 - QR * v22 + v25) >> 8  -> v0.8b
//   V = (QR * v20 - QG * v24 - QB * v23 + v25) >> 8  -> v1.8b
// v3 and v4 are used as scratch accumulators, so the inputs must not live in
// v3 or v4. v0 and v1 are written only by the final two instructions, so the
// inputs may be passed in v0..v2.
1391 #define RGBTOUV(QB, QG, QR) \
1392 "mul v3.8h, " #QB \
1393 ",v20.8h \n" /* B */ \
1394 "mul v4.8h, " #QR \
1395 ",v20.8h \n" /* R */ \
1396 "mls v3.8h, " #QG \
1397 ",v21.8h \n" /* G */ \
1398 "mls v4.8h, " #QG \
1399 ",v24.8h \n" /* G */ \
1400 "mls v3.8h, " #QR \
1401 ",v22.8h \n" /* R */ \
1402 "mls v4.8h, " #QB \
1403 ",v23.8h \n" /* B */ \
1404 "add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \
1405 "add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \
1406 "uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \
1407 "uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */
1408
1409 // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
1410 // TODO(fbarchard): consider ptrdiff_t for all strides.
1411
// Computes 2x2-subsampled U and V from two adjacent ARGB rows.
// The second row starts at src_argb + src_stride_argb. Per iteration, 16
// pixels of each row are de-interleaved; uaddlp sums horizontally adjacent
// byte pairs of the first row and uadalp accumulates the pair sums of the
// second row, giving the sum of each 2x2 block per channel. urshr #1 turns
// that into twice the average, which matches the halved coefficients set up
// by RGBTOUV_SETUP_REG. RGBTOUV then leaves 8 U bytes in v0 and 8 V in v1.
// NOTE(review): the count is only tested after the body, so 16 pixels are
// always processed per pass -- confirm callers pass a positive multiple of 16.
ARGBToUVRow_NEON(const uint8 * src_argb,int src_stride_argb,uint8 * dst_u,uint8 * dst_v,int width)1412 void ARGBToUVRow_NEON(const uint8* src_argb,
1413 int src_stride_argb,
1414 uint8* dst_u,
1415 uint8* dst_v,
1416 int width) {
1417 const uint8* src_argb_1 = src_argb + src_stride_argb;
1418 asm volatile (
1419 RGBTOUV_SETUP_REG
1420 "1: \n"
1421 MEMACCESS(0)
1422 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
1423 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
1424 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
1425 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
1426
1427 MEMACCESS(1)
1428 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
1429 "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
1430 "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
1431 "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
1432
1433 "urshr v0.8h, v0.8h, #1 \n" // 2x average
1434 "urshr v1.8h, v1.8h, #1 \n"
1435 "urshr v2.8h, v2.8h, #1 \n"
1436
1437 "subs %w4, %w4, #16 \n" // 16 pixels x 2 rows per loop.
1438 RGBTOUV(v0.8h, v1.8h, v2.8h)
1439 MEMACCESS(2)
1440 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
1441 MEMACCESS(3)
1442 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
1443 "b.gt 1b \n"
1444 : "+r"(src_argb), // %0
1445 "+r"(src_argb_1), // %1
1446 "+r"(dst_u), // %2
1447 "+r"(dst_v), // %3
1448 "+r"(width) // %4
1449 :
1450 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1451 "v20", "v21", "v22", "v23", "v24", "v25"
1452 );
1453 }
1454
1455 // TODO(fbarchard): Subsample match C code.
// Computes 2x2-subsampled U and V from two adjacent ARGB rows using its own
// coefficient set (63, 42, 21, 10, 53) instead of RGBTOUV_SETUP_REG; the
// constants are placed in the same registers (v20..v25) so the shared RGBTOUV
// macro can be reused. Presumably this is the full-range (JPEG) variant, as
// the "J" in the name suggests -- confirm.
// The second row starts at src_argb + src_stride_argb. uaddlp/uadalp sum each
// 2x2 block per channel and urshr #1 yields twice the average, matching the
// halved coefficients.
// NOTE(review): the count is only tested after the body, so 16 pixels are
// always processed per pass -- confirm callers pass a positive multiple of 16.
ARGBToUVJRow_NEON(const uint8 * src_argb,int src_stride_argb,uint8 * dst_u,uint8 * dst_v,int width)1456 void ARGBToUVJRow_NEON(const uint8* src_argb,
1457 int src_stride_argb,
1458 uint8* dst_u,
1459 uint8* dst_v,
1460 int width) {
1461 const uint8* src_argb_1 = src_argb + src_stride_argb;
1462 asm volatile (
1463 "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2
1464 "movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2
1465 "movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2
1466 "movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2
1467 "movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2
1468 "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
1469 "1: \n"
1470 MEMACCESS(0)
1471 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
1472 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
1473 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
1474 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
1475 MEMACCESS(1)
1476 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
1477 "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
1478 "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
1479 "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
1480
1481 "urshr v0.8h, v0.8h, #1 \n" // 2x average
1482 "urshr v1.8h, v1.8h, #1 \n"
1483 "urshr v2.8h, v2.8h, #1 \n"
1484
1485 "subs %w4, %w4, #16 \n" // 16 pixels x 2 rows per loop.
1486 RGBTOUV(v0.8h, v1.8h, v2.8h)
1487 MEMACCESS(2)
1488 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
1489 MEMACCESS(3)
1490 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
1491 "b.gt 1b \n"
1492 : "+r"(src_argb), // %0
1493 "+r"(src_argb_1), // %1
1494 "+r"(dst_u), // %2
1495 "+r"(dst_v), // %3
1496 "+r"(width) // %4
1497 :
1498 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1499 "v20", "v21", "v22", "v23", "v24", "v25"
1500 );
1501 }
1502
// Computes 2x2-subsampled U and V from two adjacent BGRA rows.
// Same scheme as ARGBToUVRow_NEON but with the channels taken from different
// ld4 lanes: B from the 4th byte (v3/v7), G from the 3rd (v2/v6) and R from
// the 2nd (v1/v5). The first-row sums are written in an order that never
// overwrites a lane before it has been read (v0 <- v3, v3 <- v2, v2 <- v1);
// the G sum therefore lives in v3 until urshr moves it to v1.
// NOTE(review): the count is only tested after the body, so 16 pixels are
// always processed per pass -- confirm callers pass a positive multiple of 16.
BGRAToUVRow_NEON(const uint8 * src_bgra,int src_stride_bgra,uint8 * dst_u,uint8 * dst_v,int width)1503 void BGRAToUVRow_NEON(const uint8* src_bgra,
1504 int src_stride_bgra,
1505 uint8* dst_u,
1506 uint8* dst_v,
1507 int width) {
1508 const uint8* src_bgra_1 = src_bgra + src_stride_bgra;
1509 asm volatile (
1510 RGBTOUV_SETUP_REG
1511 "1: \n"
1512 MEMACCESS(0)
1513 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
1514 "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts.
1515 "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
1516 "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts.
1517 MEMACCESS(1)
1518 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more
1519 "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts.
1520 "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
1521 "uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts.
1522
1523 "urshr v0.8h, v0.8h, #1 \n" // 2x average
1524 "urshr v1.8h, v3.8h, #1 \n"
1525 "urshr v2.8h, v2.8h, #1 \n"
1526
1527 "subs %w4, %w4, #16 \n" // 16 pixels x 2 rows per loop.
1528 RGBTOUV(v0.8h, v1.8h, v2.8h)
1529 MEMACCESS(2)
1530 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
1531 MEMACCESS(3)
1532 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
1533 "b.gt 1b \n"
1534 : "+r"(src_bgra), // %0
1535 "+r"(src_bgra_1), // %1
1536 "+r"(dst_u), // %2
1537 "+r"(dst_v), // %3
1538 "+r"(width) // %4
1539 :
1540 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1541 "v20", "v21", "v22", "v23", "v24", "v25"
1542 );
1543 }
1544
// Computes 2x2-subsampled U and V from two adjacent ABGR rows.
// Same scheme as ARGBToUVRow_NEON but with the channels taken from different
// ld4 lanes: B from the 3rd byte (v2/v6), G from the 2nd (v1/v5) and R from
// the 1st (v0/v4). The first-row sums are written in an order that never
// overwrites a lane before it has been read (v3 <- v2, v2 <- v1, v1 <- v0).
// After urshr the B, G, R values are in v0, v2, v1 respectively, hence the
// argument order passed to RGBTOUV.
// NOTE(review): the count is only tested after the body, so 16 pixels are
// always processed per pass -- confirm callers pass a positive multiple of 16.
ABGRToUVRow_NEON(const uint8 * src_abgr,int src_stride_abgr,uint8 * dst_u,uint8 * dst_v,int width)1545 void ABGRToUVRow_NEON(const uint8* src_abgr,
1546 int src_stride_abgr,
1547 uint8* dst_u,
1548 uint8* dst_v,
1549 int width) {
1550 const uint8* src_abgr_1 = src_abgr + src_stride_abgr;
1551 asm volatile (
1552 RGBTOUV_SETUP_REG
1553 "1: \n"
1554 MEMACCESS(0)
1555 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
1556 "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
1557 "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
1558 "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
1559 MEMACCESS(1)
1560 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
1561 "uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
1562 "uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
1563 "uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
1564
1565 "urshr v0.8h, v3.8h, #1 \n" // 2x average
1566 "urshr v2.8h, v2.8h, #1 \n"
1567 "urshr v1.8h, v1.8h, #1 \n"
1568
1569 "subs %w4, %w4, #16 \n" // 16 pixels x 2 rows per loop.
1570 RGBTOUV(v0.8h, v2.8h, v1.8h)
1571 MEMACCESS(2)
1572 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
1573 MEMACCESS(3)
1574 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
1575 "b.gt 1b \n"
1576 : "+r"(src_abgr), // %0
1577 "+r"(src_abgr_1), // %1
1578 "+r"(dst_u), // %2
1579 "+r"(dst_v), // %3
1580 "+r"(width) // %4
1581 :
1582 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1583 "v20", "v21", "v22", "v23", "v24", "v25"
1584 );
1585 }
1586
// Computes 2x2-subsampled U and V from two adjacent RGBA rows.
// Same scheme as ARGBToUVRow_NEON but with the channels taken from different
// ld4 lanes: B from the 2nd byte (v1/v5), G from the 3rd (v2/v6) and R from
// the 4th (v3/v7). The first-row sums are written in an order that never
// overwrites a lane before it has been read (v0 <- v1, v1 <- v2, v2 <- v3).
// NOTE(review): the count is only tested after the body, so 16 pixels are
// always processed per pass -- confirm callers pass a positive multiple of 16.
RGBAToUVRow_NEON(const uint8 * src_rgba,int src_stride_rgba,uint8 * dst_u,uint8 * dst_v,int width)1587 void RGBAToUVRow_NEON(const uint8* src_rgba,
1588 int src_stride_rgba,
1589 uint8* dst_u,
1590 uint8* dst_v,
1591 int width) {
1592 const uint8* src_rgba_1 = src_rgba + src_stride_rgba;
1593 asm volatile (
1594 RGBTOUV_SETUP_REG
1595 "1: \n"
1596 MEMACCESS(0)
1597 "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
1598 "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts.
1599 "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
1600 "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts.
1601 MEMACCESS(1)
1602 "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
1603 "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts.
1604 "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
1605 "uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts.
1606
1607 "urshr v0.8h, v0.8h, #1 \n" // 2x average
1608 "urshr v1.8h, v1.8h, #1 \n"
1609 "urshr v2.8h, v2.8h, #1 \n"
1610
1611 "subs %w4, %w4, #16 \n" // 16 pixels x 2 rows per loop.
1612 RGBTOUV(v0.8h, v1.8h, v2.8h)
1613 MEMACCESS(2)
1614 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
1615 MEMACCESS(3)
1616 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
1617 "b.gt 1b \n"
1618 : "+r"(src_rgba), // %0
1619 "+r"(src_rgba_1), // %1
1620 "+r"(dst_u), // %2
1621 "+r"(dst_v), // %3
1622 "+r"(width) // %4
1623 :
1624 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1625 "v20", "v21", "v22", "v23", "v24", "v25"
1626 );
1627 }
1628
// Computes 2x2-subsampled U and V from two adjacent 3-byte RGB24 rows.
// Same scheme as ARGBToUVRow_NEON, except that ld3 de-interleaves 48 bytes
// (16 pixels) per row into three lanes used as B, G, R in memory order.
// The second row starts at src_rgb24 + src_stride_rgb24.
// NOTE(review): the count is only tested after the body, so 16 pixels are
// always processed per pass -- confirm callers pass a positive multiple of 16.
RGB24ToUVRow_NEON(const uint8 * src_rgb24,int src_stride_rgb24,uint8 * dst_u,uint8 * dst_v,int width)1629 void RGB24ToUVRow_NEON(const uint8* src_rgb24,
1630 int src_stride_rgb24,
1631 uint8* dst_u,
1632 uint8* dst_v,
1633 int width) {
1634 const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24;
1635 asm volatile (
1636 RGBTOUV_SETUP_REG
1637 "1: \n"
1638 MEMACCESS(0)
1639 "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels.
1640 "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
1641 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
1642 "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
1643 MEMACCESS(1)
1644 "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more.
1645 "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
1646 "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
1647 "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
1648
1649 "urshr v0.8h, v0.8h, #1 \n" // 2x average
1650 "urshr v1.8h, v1.8h, #1 \n"
1651 "urshr v2.8h, v2.8h, #1 \n"
1652
1653 "subs %w4, %w4, #16 \n" // 16 pixels x 2 rows per loop.
1654 RGBTOUV(v0.8h, v1.8h, v2.8h)
1655 MEMACCESS(2)
1656 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
1657 MEMACCESS(3)
1658 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
1659 "b.gt 1b \n"
1660 : "+r"(src_rgb24), // %0
1661 "+r"(src_rgb24_1), // %1
1662 "+r"(dst_u), // %2
1663 "+r"(dst_v), // %3
1664 "+r"(width) // %4
1665 :
1666 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1667 "v20", "v21", "v22", "v23", "v24", "v25"
1668 );
1669 }
1670
// Computes 2x2-subsampled U and V from two adjacent 3-byte RAW rows.
// Same scheme as RGB24ToUVRow_NEON but with the first and third channels
// swapped: the first ld3 lane (v0/v4) is used as R and the third (v2/v6) as
// B, so RGBTOUV is invoked with (v2, v1, v0) as (B, G, R).
// The second row starts at src_raw + src_stride_raw.
// NOTE(review): the count is only tested after the body, so 16 pixels are
// always processed per pass -- confirm callers pass a positive multiple of 16.
RAWToUVRow_NEON(const uint8 * src_raw,int src_stride_raw,uint8 * dst_u,uint8 * dst_v,int width)1671 void RAWToUVRow_NEON(const uint8* src_raw,
1672 int src_stride_raw,
1673 uint8* dst_u,
1674 uint8* dst_v,
1675 int width) {
1676 const uint8* src_raw_1 = src_raw + src_stride_raw;
1677 asm volatile (
1678 RGBTOUV_SETUP_REG
1679 "1: \n"
1680 MEMACCESS(0)
1681 "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RAW pixels.
1682 "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
1683 "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
1684 "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
1685 MEMACCESS(1)
1686 "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more RAW pixels
1687 "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
1688 "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
1689 "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
1690
1691 "urshr v2.8h, v2.8h, #1 \n" // 2x average
1692 "urshr v1.8h, v1.8h, #1 \n"
1693 "urshr v0.8h, v0.8h, #1 \n"
1694
1695 "subs %w4, %w4, #16 \n" // 16 pixels x 2 rows per loop.
1696 RGBTOUV(v2.8h, v1.8h, v0.8h)
1697 MEMACCESS(2)
1698 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
1699 MEMACCESS(3)
1700 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
1701 "b.gt 1b \n"
1702 : "+r"(src_raw), // %0
1703 "+r"(src_raw_1), // %1
1704 "+r"(dst_u), // %2
1705 "+r"(dst_v), // %3
1706 "+r"(width) // %4
1707 :
1708 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1709 "v20", "v21", "v22", "v23", "v24", "v25"
1710 );
1711 }
1712
1713 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
// Computes 2x2-subsampled U and V from two adjacent RGB565 rows.
// The second row starts at src_rgb565 + src_stride_rgb565. Each row's 16
// pixels are handled as two groups of 8: every group is loaded into v0 and
// expanded by the RGB565TOARGB macro (defined earlier in this file), after
// which v0, v1, v2 are used as B, G, R. Pairwise sums of the first and second
// groups go to the low halves of v16/v18/v20 and v17/v19/v21 respectively,
// the second row is accumulated on top, and ins merges the two groups into
// full 8 x 16-bit vectors.
// The U/V arithmetic is written out inline rather than via RGBTOUV, and the
// constants live in v22..v27 because v20 and v21 are used as accumulators.
// NOTE(review): the count is only tested after the body, so 16 pixels are
// always processed per pass -- confirm callers pass a positive multiple of 16.
RGB565ToUVRow_NEON(const uint8 * src_rgb565,int src_stride_rgb565,uint8 * dst_u,uint8 * dst_v,int width)1714 void RGB565ToUVRow_NEON(const uint8* src_rgb565,
1715 int src_stride_rgb565,
1716 uint8* dst_u,
1717 uint8* dst_v,
1718 int width) {
1719 const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565;
1720 asm volatile (
1721 "movi v22.8h, #56, lsl #0 \n" // UB / VR coeff (0.875) / 2
1722 "movi v23.8h, #37, lsl #0 \n" // UG coeff (-0.5781) / 2
1723 "movi v24.8h, #19, lsl #0 \n" // UR coeff (-0.2969) / 2
1724 "movi v25.8h, #9 , lsl #0 \n" // VB coeff (-0.1406) / 2
1725 "movi v26.8h, #47, lsl #0 \n" // VG coeff (-0.7344) / 2
1726 "movi v27.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
1727 "1: \n"
1728 MEMACCESS(0)
1729 "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
1730 RGB565TOARGB
1731 "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
1732 "uaddlp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
1733 "uaddlp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
1734 MEMACCESS(0)
1735 "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels.
1736 RGB565TOARGB
1737 "uaddlp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
1738 "uaddlp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
1739 "uaddlp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
1740
1741 MEMACCESS(1)
1742 "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels.
1743 RGB565TOARGB
1744 "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
1745 "uadalp v18.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
1746 "uadalp v20.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
1747 MEMACCESS(1)
1748 "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels.
1749 RGB565TOARGB
1750 "uadalp v17.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
1751 "uadalp v19.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
1752 "uadalp v21.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
1753
1754 "ins v16.D[1], v17.D[0] \n" // merge both groups: B
1755 "ins v18.D[1], v19.D[0] \n" // merge both groups: G
1756 "ins v20.D[1], v21.D[0] \n" // merge both groups: R
1757
1758 "urshr v4.8h, v16.8h, #1 \n" // 2x average
1759 "urshr v5.8h, v18.8h, #1 \n"
1760 "urshr v6.8h, v20.8h, #1 \n"
1761
1762 "subs %w4, %w4, #16 \n" // 16 processed per loop.
1763 "mul v16.8h, v4.8h, v22.8h \n" // B
1764 "mls v16.8h, v5.8h, v23.8h \n" // G
1765 "mls v16.8h, v6.8h, v24.8h \n" // R
1766 "add v16.8h, v16.8h, v27.8h \n" // +128 -> unsigned
1767 "mul v17.8h, v6.8h, v22.8h \n" // R
1768 "mls v17.8h, v5.8h, v26.8h \n" // G
1769 "mls v17.8h, v4.8h, v25.8h \n" // B
1770 "add v17.8h, v17.8h, v27.8h \n" // +128 -> unsigned
1771 "uqshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit U
1772 "uqshrn v1.8b, v17.8h, #8 \n" // 16 bit to 8 bit V
1773 MEMACCESS(2)
1774 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
1775 MEMACCESS(3)
1776 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
1777 "b.gt 1b \n"
1778 : "+r"(src_rgb565), // %0
1779 "+r"(src_rgb565_1), // %1
1780 "+r"(dst_u), // %2
1781 "+r"(dst_v), // %3
1782 "+r"(width) // %4
1783 :
1784 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1785 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
1786 "v25", "v26", "v27"
1787 );
1788 }
1789
1790 // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
// Computes 2x2-subsampled U and V from two adjacent ARGB1555 rows.
// The second row starts at src_argb1555 + src_stride_argb1555. Each row's 16
// pixels are handled as two groups of 8: every group is loaded into v0 and
// expanded by the RGB555TOARGB macro (defined earlier in this file), after
// which v0, v1, v2 are used as B, G, R. Pairwise sums of the first and second
// groups go to the low halves of v16..v18 and v26..v28 respectively, the
// second row is accumulated on top, and ins merges the two groups into full
// 8 x 16-bit vectors.
// Constants come from RGBTOUV_SETUP_REG (v20..v25); the U/V arithmetic is
// written out inline with v2 and v3 as accumulators rather than via RGBTOUV.
// NOTE(review): the count is only tested after the body, so 16 pixels are
// always processed per pass -- confirm callers pass a positive multiple of 16.
ARGB1555ToUVRow_NEON(const uint8 * src_argb1555,int src_stride_argb1555,uint8 * dst_u,uint8 * dst_v,int width)1791 void ARGB1555ToUVRow_NEON(const uint8* src_argb1555,
1792 int src_stride_argb1555,
1793 uint8* dst_u,
1794 uint8* dst_v,
1795 int width) {
1796 const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555;
1797 asm volatile (
1798 RGBTOUV_SETUP_REG
1799 "1: \n"
1800 MEMACCESS(0)
1801 "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
1802 RGB555TOARGB
1803 "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
1804 "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
1805 "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
1806 MEMACCESS(0)
1807 "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels.
1808 RGB555TOARGB
1809 "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
1810 "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
1811 "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
1812
1813 MEMACCESS(1)
1814 "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels.
1815 RGB555TOARGB
1816 "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
1817 "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
1818 "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
1819 MEMACCESS(1)
1820 "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels.
1821 RGB555TOARGB
1822 "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
1823 "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
1824 "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
1825
1826 "ins v16.D[1], v26.D[0] \n" // merge both groups: B
1827 "ins v17.D[1], v27.D[0] \n" // merge both groups: G
1828 "ins v18.D[1], v28.D[0] \n" // merge both groups: R
1829
1830 "urshr v4.8h, v16.8h, #1 \n" // 2x average
1831 "urshr v5.8h, v17.8h, #1 \n"
1832 "urshr v6.8h, v18.8h, #1 \n"
1833
1834 "subs %w4, %w4, #16 \n" // 16 processed per loop.
1835 "mul v2.8h, v4.8h, v20.8h \n" // B
1836 "mls v2.8h, v5.8h, v21.8h \n" // G
1837 "mls v2.8h, v6.8h, v22.8h \n" // R
1838 "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned
1839 "mul v3.8h, v6.8h, v20.8h \n" // R
1840 "mls v3.8h, v5.8h, v24.8h \n" // G
1841 "mls v3.8h, v4.8h, v23.8h \n" // B
1842 "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
1843 "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U
1844 "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
1845 MEMACCESS(2)
1846 "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
1847 MEMACCESS(3)
1848 "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
1849 "b.gt 1b \n"
1850 : "+r"(src_argb1555), // %0
1851 "+r"(src_argb1555_1), // %1
1852 "+r"(dst_u), // %2
1853 "+r"(dst_v), // %3
1854 "+r"(width) // %4
1855 :
1856 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
1857 "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
1858 "v26", "v27", "v28"
1859 );
1860 }
1861
// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
//
// Produces one row of U and one row of V from two rows of ARGB4444: the row
// at src_argb4444 and the row src_stride_argb4444 bytes after it.
// Each loop pass reads 32 bytes (16 pixels) from each row, sums every 2x2
// block per channel, halves the sum with rounding, and stores 8 U bytes and
// 8 V bytes. The loop body runs before width is tested (subs ... b.gt), so at
// least one pass is always executed and width is consumed in steps of 16.
//
// RGBTOUV_SETUP_REG and ARGB4444TOARGB are macros defined elsewhere in this
// file. NOTE(review): v20-v25 are read below without being written in this
// function, so they are presumably loaded by RGBTOUV_SETUP_REG, and B, G, R
// are presumably left in v0, v1, v2 by ARGB4444TOARGB - confirm against the
// macro definitions.
void ARGB4444ToUVRow_NEON(const uint8* src_argb4444,
                          int src_stride_argb4444,
                          uint8* dst_u,
                          uint8* dst_v,
                          int width) {
  // Second source row, src_stride_argb4444 bytes after the first.
  const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444;
  asm volatile (
    RGBTOUV_SETUP_REG
  "1: \n"
    // First row, first 8 pixels: pairwise-add adjacent bytes of each channel.
    MEMACCESS(0)
    "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
    ARGB4444TOARGB
    "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
    "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
    "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
    // First row, next 8 pixels.
    MEMACCESS(0)
    "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels.
    ARGB4444TOARGB
    "uaddlp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
    "uaddlp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
    "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.

    // Second row: pairwise-add and accumulate onto the first-row sums.
    MEMACCESS(1)
    "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels.
    ARGB4444TOARGB
    "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
    "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
    "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts.
    MEMACCESS(1)
    "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels.
    ARGB4444TOARGB
    "uadalp v26.4h, v0.8b \n" // B 8 bytes -> 4 shorts.
    "uadalp v27.4h, v1.8b \n" // G 8 bytes -> 4 shorts.
    "uadalp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts.

    // Merge the two 4-lane halves into single 8-lane vectors per channel.
    "ins v16.D[1], v26.D[0] \n"
    "ins v17.D[1], v27.D[0] \n"
    "ins v18.D[1], v28.D[0] \n"

    // Each lane holds the sum of a 2x2 block; a rounding shift by 1 leaves
    // twice the block average.
    "urshr v4.8h, v16.8h, #1 \n" // 2x average
    "urshr v5.8h, v17.8h, #1 \n"
    "urshr v6.8h, v18.8h, #1 \n"

    "subs %w4, %w4, #16 \n" // 16 processed per loop.
    // U = B * v20 - G * v21 - R * v22 + v25
    "mul v2.8h, v4.8h, v20.8h \n" // B
    "mls v2.8h, v5.8h, v21.8h \n" // G
    "mls v2.8h, v6.8h, v22.8h \n" // R
    "add v2.8h, v2.8h, v25.8h \n" // +128 -> unsigned
    // V = R * v20 - G * v24 - B * v23 + v25
    "mul v3.8h, v6.8h, v20.8h \n" // R
    "mls v3.8h, v5.8h, v24.8h \n" // G
    "mls v3.8h, v4.8h, v23.8h \n" // B
    "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
    "uqshrn v0.8b, v2.8h, #8 \n" // 16 bit to 8 bit U
    "uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
    MEMACCESS(2)
    "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
    MEMACCESS(3)
    "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
    "b.gt 1b \n"
  : "+r"(src_argb4444), // %0
    "+r"(src_argb4444_1), // %1
    "+r"(dst_u), // %2
    "+r"(dst_v), // %3
    "+r"(width) // %4
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
    "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
    "v26", "v27", "v28"

  );
}
1934
// Converts a row of RGB565 pixels to a row of Y (luma) bytes.
// Each loop pass reads 16 bytes (8 pixels), expands them with the
// RGB565TOARGB macro (defined elsewhere in this file) and writes 8 Y bytes:
//   Y = ((13 * B + 65 * G + 33 * R) rounded >> 7) + 16, saturated to 8 bits.
// The loop body runs before width is tested (subs ... b.gt), so at least one
// pass is always executed and width is consumed in steps of 8.
// NOTE(review): assumes RGB565TOARGB leaves B, G, R in v0, v1, v2 and uses
// v4 / v6 as scratch (both are in the clobber list) - confirm against the
// macro definition.
void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) {
  asm volatile (
    "movi v24.8b, #13 \n" // B * 0.1016 coefficient
    "movi v25.8b, #65 \n" // G * 0.5078 coefficient
    "movi v26.8b, #33 \n" // R * 0.2578 coefficient
    "movi v27.8b, #16 \n" // Add 16 constant
  "1: \n"
    MEMACCESS(0)
    "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels.
    "subs %w2, %w2, #8 \n" // 8 processed per loop.
    RGB565TOARGB
    "umull v3.8h, v0.8b, v24.8b \n" // B
    "umlal v3.8h, v1.8b, v25.8b \n" // G
    "umlal v3.8h, v2.8b, v26.8b \n" // R
    "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
    "uqadd v0.8b, v0.8b, v27.8b \n" // + 16, saturating
    MEMACCESS(1)
    "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
    "b.gt 1b \n"
  : "+r"(src_rgb565), // %0
    "+r"(dst_y), // %1
    "+r"(width) // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6",
    "v24", "v25", "v26", "v27"
  );
}
1962
// Converts a row of ARGB1555 pixels to a row of Y (luma) bytes.
// Each loop pass reads 16 bytes (8 pixels), expands them with the
// ARGB1555TOARGB macro (defined elsewhere in this file) and writes 8 Y bytes:
//   Y = ((13 * B + 65 * G + 33 * R) rounded >> 7) + 16, saturated to 8 bits.
// The loop body runs before width is tested (subs ... b.gt), so at least one
// pass is always executed and width is consumed in steps of 8.
// NOTE(review): the coefficients live in v4-v7 across the macro expansion, so
// this relies on ARGB1555TOARGB not writing v4-v7 and on it leaving B, G, R in
// v0, v1, v2 - confirm against the macro definition.
void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) {
  asm volatile (
    "movi v4.8b, #13 \n" // B * 0.1016 coefficient
    "movi v5.8b, #65 \n" // G * 0.5078 coefficient
    "movi v6.8b, #33 \n" // R * 0.2578 coefficient
    "movi v7.8b, #16 \n" // Add 16 constant
  "1: \n"
    MEMACCESS(0)
    "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels.
    "subs %w2, %w2, #8 \n" // 8 processed per loop.
    ARGB1555TOARGB
    "umull v3.8h, v0.8b, v4.8b \n" // B
    "umlal v3.8h, v1.8b, v5.8b \n" // G
    "umlal v3.8h, v2.8b, v6.8b \n" // R
    "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
    "uqadd v0.8b, v0.8b, v7.8b \n" // + 16, saturating
    MEMACCESS(1)
    "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
    "b.gt 1b \n"
  : "+r"(src_argb1555), // %0
    "+r"(dst_y), // %1
    "+r"(width) // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
  );
}
1989
// Converts a row of ARGB4444 pixels to a row of Y (luma) bytes.
// Each loop pass reads 16 bytes (8 pixels), expands them with the
// ARGB4444TOARGB macro (defined elsewhere in this file) and writes 8 Y bytes:
//   Y = ((13 * B + 65 * G + 33 * R) rounded >> 7) + 16, saturated to 8 bits.
// The loop body runs before width is tested (subs ... b.gt), so at least one
// pass is always executed and width is consumed in steps of 8.
// NOTE(review): assumes ARGB4444TOARGB leaves B, G, R in v0, v1, v2 and
// touches only v0-v3 - confirm against the macro definition.
void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) {
  asm volatile (
    "movi v24.8b, #13 \n" // B * 0.1016 coefficient
    "movi v25.8b, #65 \n" // G * 0.5078 coefficient
    "movi v26.8b, #33 \n" // R * 0.2578 coefficient
    "movi v27.8b, #16 \n" // Add 16 constant
  "1: \n"
    MEMACCESS(0)
    "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels.
    "subs %w2, %w2, #8 \n" // 8 processed per loop.
    ARGB4444TOARGB
    "umull v3.8h, v0.8b, v24.8b \n" // B
    "umlal v3.8h, v1.8b, v25.8b \n" // G
    "umlal v3.8h, v2.8b, v26.8b \n" // R
    "sqrshrun v0.8b, v3.8h, #7 \n" // 16 bit to 8 bit Y
    "uqadd v0.8b, v0.8b, v27.8b \n" // + 16, saturating
    MEMACCESS(1)
    "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
    "b.gt 1b \n"
  : "+r"(src_argb4444), // %0
    "+r"(dst_y), // %1
    "+r"(width) // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27"
  );
}
2016
// Converts a row of BGRA pixels to a row of Y (luma) bytes, 8 pixels per pass.
// Within each 4-byte pixel, byte 1 is weighted as R, byte 2 as G and byte 3
// as B; byte 0 does not contribute.
//   Y = ((33 * R + 65 * G + 13 * B) rounded >> 7) + 16, saturated to 8 bits.
// width is tested after the loop body, so one pass always runs and the row is
// consumed in steps of 8 pixels.
void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) {
  asm volatile (
    "movi       v7.8b, #16                                 \n"  // luma offset
    "movi       v6.8b, #13                                 \n"  // B weight (0.1016)
    "movi       v5.8b, #65                                 \n"  // G weight (0.5078)
    "movi       v4.8b, #33                                 \n"  // R weight (0.2578)
  "1:                                                      \n"
    MEMACCESS(0)
    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%[src]], #32   \n"  // de-interleave 8 pixels
    "umull      v16.8h, v1.8b, v4.8b                       \n"  // 33 * R
    "umlal      v16.8h, v2.8b, v5.8b                       \n"  // + 65 * G
    "umlal      v16.8h, v3.8b, v6.8b                       \n"  // + 13 * B
    "sqrshrun   v0.8b, v16.8h, #7                          \n"  // round, >> 7, narrow
    "uqadd      v0.8b, v0.8b, v7.8b                        \n"  // + 16, saturating
    "subs       %w[count], %w[count], #8                   \n"  // 8 pixels consumed
    MEMACCESS(1)
    "st1        {v0.8b}, [%[dst]], #8                      \n"  // write 8 Y bytes
    "b.gt       1b                                         \n"
  : [src] "+r"(src_bgra),   // %0
    [dst] "+r"(dst_y),      // %1
    [count] "+r"(width)     // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
  );
}
2042
// Converts a row of ABGR pixels to a row of Y (luma) bytes, 8 pixels per pass.
// Within each 4-byte pixel, byte 0 is weighted as R, byte 1 as G and byte 2
// as B; byte 3 does not contribute.
//   Y = ((33 * R + 65 * G + 13 * B) rounded >> 7) + 16, saturated to 8 bits.
// width is tested after the loop body, so one pass always runs and the row is
// consumed in steps of 8 pixels.
void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) {
  asm volatile (
    "movi       v7.8b, #16                                 \n"  // luma offset
    "movi       v6.8b, #13                                 \n"  // B weight (0.1016)
    "movi       v5.8b, #65                                 \n"  // G weight (0.5078)
    "movi       v4.8b, #33                                 \n"  // R weight (0.2578)
  "1:                                                      \n"
    MEMACCESS(0)
    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%[src]], #32   \n"  // de-interleave 8 pixels
    "umull      v16.8h, v0.8b, v4.8b                       \n"  // 33 * R
    "umlal      v16.8h, v1.8b, v5.8b                       \n"  // + 65 * G
    "umlal      v16.8h, v2.8b, v6.8b                       \n"  // + 13 * B
    "sqrshrun   v0.8b, v16.8h, #7                          \n"  // round, >> 7, narrow
    "uqadd      v0.8b, v0.8b, v7.8b                        \n"  // + 16, saturating
    "subs       %w[count], %w[count], #8                   \n"  // 8 pixels consumed
    MEMACCESS(1)
    "st1        {v0.8b}, [%[dst]], #8                      \n"  // write 8 Y bytes
    "b.gt       1b                                         \n"
  : [src] "+r"(src_abgr),   // %0
    [dst] "+r"(dst_y),      // %1
    [count] "+r"(width)     // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
  );
}
2068
// Converts a row of RGBA pixels to a row of Y (luma) bytes, 8 pixels per pass.
// Within each 4-byte pixel, byte 1 is weighted as B, byte 2 as G and byte 3
// as R; byte 0 does not contribute.
//   Y = ((13 * B + 65 * G + 33 * R) rounded >> 7) + 16, saturated to 8 bits.
// width is tested after the loop body, so one pass always runs and the row is
// consumed in steps of 8 pixels.
void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) {
  asm volatile (
    "movi       v7.8b, #16                                 \n"  // luma offset
    "movi       v6.8b, #33                                 \n"  // R weight (0.2578)
    "movi       v5.8b, #65                                 \n"  // G weight (0.5078)
    "movi       v4.8b, #13                                 \n"  // B weight (0.1016)
  "1:                                                      \n"
    MEMACCESS(0)
    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%[src]], #32   \n"  // de-interleave 8 pixels
    "umull      v16.8h, v1.8b, v4.8b                       \n"  // 13 * B
    "umlal      v16.8h, v2.8b, v5.8b                       \n"  // + 65 * G
    "umlal      v16.8h, v3.8b, v6.8b                       \n"  // + 33 * R
    "sqrshrun   v0.8b, v16.8h, #7                          \n"  // round, >> 7, narrow
    "uqadd      v0.8b, v0.8b, v7.8b                        \n"  // + 16, saturating
    "subs       %w[count], %w[count], #8                   \n"  // 8 pixels consumed
    MEMACCESS(1)
    "st1        {v0.8b}, [%[dst]], #8                      \n"  // write 8 Y bytes
    "b.gt       1b                                         \n"
  : [src] "+r"(src_rgba),   // %0
    [dst] "+r"(dst_y),      // %1
    [count] "+r"(width)     // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
  );
}
2094
// Converts a row of RGB24 pixels to a row of Y (luma) bytes, 8 pixels per pass.
// Within each 3-byte pixel, byte 0 is weighted as B, byte 1 as G and byte 2
// as R.
//   Y = ((13 * B + 65 * G + 33 * R) rounded >> 7) + 16, saturated to 8 bits.
// width is tested after the loop body, so one pass always runs and the row is
// consumed in steps of 8 pixels (24 source bytes).
void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) {
  asm volatile (
    "movi       v7.8b, #16                                 \n"  // luma offset
    "movi       v6.8b, #33                                 \n"  // R weight (0.2578)
    "movi       v5.8b, #65                                 \n"  // G weight (0.5078)
    "movi       v4.8b, #13                                 \n"  // B weight (0.1016)
  "1:                                                      \n"
    MEMACCESS(0)
    "ld3        {v0.8b,v1.8b,v2.8b}, [%[src]], #24         \n"  // de-interleave 8 pixels
    "umull      v16.8h, v0.8b, v4.8b                       \n"  // 13 * B
    "umlal      v16.8h, v1.8b, v5.8b                       \n"  // + 65 * G
    "umlal      v16.8h, v2.8b, v6.8b                       \n"  // + 33 * R
    "sqrshrun   v0.8b, v16.8h, #7                          \n"  // round, >> 7, narrow
    "uqadd      v0.8b, v0.8b, v7.8b                        \n"  // + 16, saturating
    "subs       %w[count], %w[count], #8                   \n"  // 8 pixels consumed
    MEMACCESS(1)
    "st1        {v0.8b}, [%[dst]], #8                      \n"  // write 8 Y bytes
    "b.gt       1b                                         \n"
  : [src] "+r"(src_rgb24),  // %0
    [dst] "+r"(dst_y),      // %1
    [count] "+r"(width)     // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
  );
}
2120
// Converts a row of RAW pixels to a row of Y (luma) bytes, 8 pixels per pass.
// Within each 3-byte pixel, byte 0 is weighted as R, byte 1 as G and byte 2
// as B (the reverse of the RGB24 variant).
//   Y = ((33 * R + 65 * G + 13 * B) rounded >> 7) + 16, saturated to 8 bits.
// width is tested after the loop body, so one pass always runs and the row is
// consumed in steps of 8 pixels (24 source bytes).
void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) {
  asm volatile (
    "movi       v7.8b, #16                                 \n"  // luma offset
    "movi       v6.8b, #13                                 \n"  // B weight (0.1016)
    "movi       v5.8b, #65                                 \n"  // G weight (0.5078)
    "movi       v4.8b, #33                                 \n"  // R weight (0.2578)
  "1:                                                      \n"
    MEMACCESS(0)
    "ld3        {v0.8b,v1.8b,v2.8b}, [%[src]], #24         \n"  // de-interleave 8 pixels
    "umull      v16.8h, v0.8b, v4.8b                       \n"  // 33 * R
    "umlal      v16.8h, v1.8b, v5.8b                       \n"  // + 65 * G
    "umlal      v16.8h, v2.8b, v6.8b                       \n"  // + 13 * B
    "sqrshrun   v0.8b, v16.8h, #7                          \n"  // round, >> 7, narrow
    "uqadd      v0.8b, v0.8b, v7.8b                        \n"  // + 16, saturating
    "subs       %w[count], %w[count], #8                   \n"  // 8 pixels consumed
    MEMACCESS(1)
    "st1        {v0.8b}, [%[dst]], #8                      \n"  // write 8 Y bytes
    "b.gt       1b                                         \n"
  : [src] "+r"(src_raw),    // %0
    [dst] "+r"(dst_y),      // %1
    [count] "+r"(width)     // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
  );
}
2146
2147 // Bilinear filter 16x2 -> 16x1
InterpolateRow_NEON(uint8 * dst_ptr,const uint8 * src_ptr,ptrdiff_t src_stride,int dst_width,int source_y_fraction)2148 void InterpolateRow_NEON(uint8* dst_ptr,
2149 const uint8* src_ptr,
2150 ptrdiff_t src_stride,
2151 int dst_width,
2152 int source_y_fraction) {
2153 int y1_fraction = source_y_fraction;
2154 int y0_fraction = 256 - y1_fraction;
2155 const uint8* src_ptr1 = src_ptr + src_stride;
2156 asm volatile (
2157 "cmp %w4, #0 \n"
2158 "b.eq 100f \n"
2159 "cmp %w4, #128 \n"
2160 "b.eq 50f \n"
2161
2162 "dup v5.16b, %w4 \n"
2163 "dup v4.16b, %w5 \n"
2164 // General purpose row blend.
2165 "1: \n"
2166 MEMACCESS(1)
2167 "ld1 {v0.16b}, [%1], #16 \n"
2168 MEMACCESS(2)
2169 "ld1 {v1.16b}, [%2], #16 \n"
2170 "subs %w3, %w3, #16 \n"
2171 "umull v2.8h, v0.8b, v4.8b \n"
2172 "umull2 v3.8h, v0.16b, v4.16b \n"
2173 "umlal v2.8h, v1.8b, v5.8b \n"
2174 "umlal2 v3.8h, v1.16b, v5.16b \n"
2175 "rshrn v0.8b, v2.8h, #8 \n"
2176 "rshrn2 v0.16b, v3.8h, #8 \n"
2177 MEMACCESS(0)
2178 "st1 {v0.16b}, [%0], #16 \n"
2179 "b.gt 1b \n"
2180 "b 99f \n"
2181
2182 // Blend 50 / 50.
2183 "50: \n"
2184 MEMACCESS(1)
2185 "ld1 {v0.16b}, [%1], #16 \n"
2186 MEMACCESS(2)
2187 "ld1 {v1.16b}, [%2], #16 \n"
2188 "subs %w3, %w3, #16 \n"
2189 "urhadd v0.16b, v0.16b, v1.16b \n"
2190 MEMACCESS(0)
2191 "st1 {v0.16b}, [%0], #16 \n"
2192 "b.gt 50b \n"
2193 "b 99f \n"
2194
2195 // Blend 100 / 0 - Copy row unchanged.
2196 "100: \n"
2197 MEMACCESS(1)
2198 "ld1 {v0.16b}, [%1], #16 \n"
2199 "subs %w3, %w3, #16 \n"
2200 MEMACCESS(0)
2201 "st1 {v0.16b}, [%0], #16 \n"
2202 "b.gt 100b \n"
2203
2204 "99: \n"
2205 : "+r"(dst_ptr), // %0
2206 "+r"(src_ptr), // %1
2207 "+r"(src_ptr1), // %2
2208 "+r"(dst_width), // %3
2209 "+r"(y1_fraction), // %4
2210 "+r"(y0_fraction) // %5
2211 :
2212 : "cc", "memory", "v0", "v1", "v3", "v4", "v5"
2213 );
2214 }
2215
// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
//
// Alpha-blends src_argb0 over src_argb1 and writes the result to dst_argb.
// The alpha channel of src_argb0 (v3) scales the B, G, R channels of
// src_argb1; the scaled amount is subtracted from src_argb1 and the B, G, R
// channels of src_argb0 are added, all with unsigned saturation. The output
// alpha is always written as 255.
//
// width is handled exactly: full groups of 8 pixels go through the first
// loop, and any remaining 1..7 pixels go through the single-pixel loop. A
// width below 8 skips the first loop entirely.
void ARGBBlendRow_NEON(const uint8* src_argb0,
                       const uint8* src_argb1,
                       uint8* dst_argb,
                       int width) {
  asm volatile (
    "subs %w3, %w3, #8 \n" // fewer than 8 pixels: skip the 8-pixel loop
    "b.lt 89f \n"
    // Blend 8 pixels.
  "8: \n"
    MEMACCESS(0)
    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 pixels
    MEMACCESS(1)
    "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1 pixels
    "subs %w3, %w3, #8 \n" // 8 processed per loop.
    "umull v16.8h, v4.8b, v3.8b \n" // db * a
    "umull v17.8h, v5.8b, v3.8b \n" // dg * a
    "umull v18.8h, v6.8b, v3.8b \n" // dr * a
    "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
    "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
    "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
    "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
    "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
    "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
    "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
    "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
    "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
    "movi v3.8b, #255 \n" // a = 255
    MEMACCESS(2)
    "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
    "b.ge 8b \n"

  "89: \n"
    // width is now (remaining - 8); adding 7 leaves (remaining - 1), which is
    // negative exactly when no pixels are left.
    "adds %w3, %w3, #8-1 \n"
    "b.lt 99f \n"

    // Blend 1 pixels.
  "1: \n"
    MEMACCESS(0)
    "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0.
    MEMACCESS(1)
    "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1.
    "subs %w3, %w3, #1 \n" // 1 processed per loop.
    // Same arithmetic as the 8-pixel loop; only lane 0 is stored.
    "umull v16.8h, v4.8b, v3.8b \n" // db * a
    "umull v17.8h, v5.8b, v3.8b \n" // dg * a
    "umull v18.8h, v6.8b, v3.8b \n" // dr * a
    "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8
    "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8
    "uqrshrn v18.8b, v18.8h, #8 \n" // dr >>= 8
    "uqsub v4.8b, v4.8b, v16.8b \n" // db - (db * a / 256)
    "uqsub v5.8b, v5.8b, v17.8b \n" // dg - (dg * a / 256)
    "uqsub v6.8b, v6.8b, v18.8b \n" // dr - (dr * a / 256)
    "uqadd v0.8b, v0.8b, v4.8b \n" // + sb
    "uqadd v1.8b, v1.8b, v5.8b \n" // + sg
    "uqadd v2.8b, v2.8b, v6.8b \n" // + sr
    "movi v3.8b, #255 \n" // a = 255
    MEMACCESS(2)
    "st4 {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n" // store 1 pixel.
    "b.ge 1b \n"

  "99: \n"

  : "+r"(src_argb0), // %0
    "+r"(src_argb1), // %1
    "+r"(dst_argb), // %2
    "+r"(width) // %3
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    "v16", "v17", "v18"
  );
}
2287
2288 // Attenuate 8 pixels at a time.
ARGBAttenuateRow_NEON(const uint8 * src_argb,uint8 * dst_argb,int width)2289 void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
2290 asm volatile (
2291 // Attenuate 8 pixels.
2292 "1: \n"
2293 MEMACCESS(0)
2294 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels
2295 "subs %w2, %w2, #8 \n" // 8 processed per loop.
2296 "umull v4.8h, v0.8b, v3.8b \n" // b * a
2297 "umull v5.8h, v1.8b, v3.8b \n" // g * a
2298 "umull v6.8h, v2.8b, v3.8b \n" // r * a
2299 "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8
2300 "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8
2301 "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8
2302 MEMACCESS(1)
2303 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels
2304 "b.gt 1b \n"
2305 : "+r"(src_argb), // %0
2306 "+r"(dst_argb), // %1
2307 "+r"(width) // %2
2308 :
2309 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
2310 );
2311 }
2312
// Quantize 8 ARGB pixels (32 bytes).
// dst = (dst * scale >> 16) * interval_size + interval_offset;
//
// Operates in place on dst_argb: each pass loads 8 pixels from the current
// position, rewrites B, G and R, and stores them back to the same position
// before advancing. Alpha (v3) is stored back unchanged.
// scale, interval_size and interval_offset are duplicated into 16-bit lanes,
// so only their low 16 bits are used.
// width is tested after the loop body, so one pass always runs and the row is
// consumed in steps of 8 pixels.
void ARGBQuantizeRow_NEON(uint8* dst_argb,
                          int scale,
                          int interval_size,
                          int interval_offset,
                          int width) {
  asm volatile (
    "dup v4.8h, %w2 \n"
    // sqdmulh doubles the product before taking the high half, so the scale
    // is halved up front to get (x * scale) >> 16.
    "ushr v4.8h, v4.8h, #1 \n" // scale >>= 1
    "dup v5.8h, %w3 \n" // interval multiply.
    "dup v6.8h, %w4 \n" // interval add

    // 8 pixel loop.
  "1: \n"
    MEMACCESS(0)
    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 pixels of ARGB.
    "subs %w1, %w1, #8 \n" // 8 processed per loop.
    "uxtl v0.8h, v0.8b \n" // b (0 .. 255)
    "uxtl v1.8h, v1.8b \n"
    "uxtl v2.8h, v2.8b \n"
    "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale
    "sqdmulh v1.8h, v1.8h, v4.8h \n" // g
    "sqdmulh v2.8h, v2.8h, v4.8h \n" // r
    "mul v0.8h, v0.8h, v5.8h \n" // b * interval_size
    "mul v1.8h, v1.8h, v5.8h \n" // g
    "mul v2.8h, v2.8h, v5.8h \n" // r
    "add v0.8h, v0.8h, v6.8h \n" // b + interval_offset
    "add v1.8h, v1.8h, v6.8h \n" // g
    "add v2.8h, v2.8h, v6.8h \n" // r
    "uqxtn v0.8b, v0.8h \n" // saturate to 8 bits
    "uqxtn v1.8b, v1.8h \n"
    "uqxtn v2.8b, v2.8h \n"
    MEMACCESS(0)
    "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB pixels
    "b.gt 1b \n"
  : "+r"(dst_argb), // %0
    "+r"(width) // %1
  : "r"(scale), // %2
    "r"(interval_size), // %3
    "r"(interval_offset) // %4
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
  );
}
2357
// Shade 8 pixels at a time by specified value.
// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scalar register from 0 to 8.
// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
// (The note above is written in 32-bit NEON syntax; the code below uses the
// A64 form, sqrdmulh with a v0.h[n] element operand.)
//
// Each of the four bytes of value scales one channel. A byte b is widened to
// 16 bits by pairing it with itself (zip1), giving b * 0x0101, and then
// halved. Each channel is widened to 16 bits, multiplied by its scale with
// sqrdmulh (rounded, doubled, high half), and narrowed back to 8 bits with
// unsigned saturation. Unlike most rows in this file, alpha is scaled too.
// width is tested after the loop body, so one pass always runs and the row is
// consumed in steps of 8 pixels.
void ARGBShadeRow_NEON(const uint8* src_argb,
                       uint8* dst_argb,
                       int width,
                       uint32 value) {
  asm volatile (
    "dup v0.4s, %w3 \n" // duplicate scale value.
    "zip1 v0.8b, v0.8b, v0.8b \n" // v0.8b aarrggbb.
    "ushr v0.8h, v0.8h, #1 \n" // scale / 2.

    // 8 pixel loop.
  "1: \n"
    MEMACCESS(0)
    "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB pixels.
    "subs %w2, %w2, #8 \n" // 8 processed per loop.
    "uxtl v4.8h, v4.8b \n" // b (0 .. 255)
    "uxtl v5.8h, v5.8b \n"
    "uxtl v6.8h, v6.8b \n"
    "uxtl v7.8h, v7.8b \n"
    "sqrdmulh v4.8h, v4.8h, v0.h[0] \n" // b * scale * 2
    "sqrdmulh v5.8h, v5.8h, v0.h[1] \n" // g
    "sqrdmulh v6.8h, v6.8h, v0.h[2] \n" // r
    "sqrdmulh v7.8h, v7.8h, v0.h[3] \n" // a
    "uqxtn v4.8b, v4.8h \n" // saturate to 8 bits
    "uqxtn v5.8b, v5.8h \n"
    "uqxtn v6.8b, v6.8h \n"
    "uqxtn v7.8b, v7.8h \n"
    MEMACCESS(1)
    "st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB pixels
    "b.gt 1b \n"
  : "+r"(src_argb), // %0
    "+r"(dst_argb), // %1
    "+r"(width) // %2
  : "r"(value) // %3
  : "cc", "memory", "v0", "v4", "v5", "v6", "v7"
  );
}
2397
// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels
// Similar to ARGBToYJ but stores ARGB.
// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
//
// The gray value is written to the B, G and R channels of each output pixel;
// alpha (v3) is copied through unchanged.
// width is tested after the loop body, so one pass always runs and the row is
// consumed in steps of 8 pixels.
void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    "movi v24.8b, #15 \n" // B * 0.11400 coefficient
    "movi v25.8b, #75 \n" // G * 0.58700 coefficient
    "movi v26.8b, #38 \n" // R * 0.29900 coefficient
  "1: \n"
    MEMACCESS(0)
    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
    "subs %w2, %w2, #8 \n" // 8 processed per loop.
    "umull v4.8h, v0.8b, v24.8b \n" // B
    "umlal v4.8h, v1.8b, v25.8b \n" // G
    "umlal v4.8h, v2.8b, v26.8b \n" // R
    "sqrshrun v0.8b, v4.8h, #7 \n" // 15 bit to 8 bit B
    "orr v1.8b, v0.8b, v0.8b \n" // G
    "orr v2.8b, v0.8b, v0.8b \n" // R
    MEMACCESS(1)
    "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels.
    "b.gt 1b \n"
  : "+r"(src_argb), // %0
    "+r"(dst_argb), // %1
    "+r"(width) // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26"
  );
}
2426
// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
// b = (r * 35 + g * 68 + b * 17) >> 7
// g = (r * 45 + g * 88 + b * 22) >> 7
// r = (r * 50 + g * 98 + b * 24) >> 7
//
// Operates in place on dst_argb: each pass loads 8 pixels from the current
// position and stores the result back to the same position before advancing.
// Results are narrowed with unsigned saturation; alpha (v3) is stored back
// unchanged.
// width is tested after the loop body, so one pass always runs and the row is
// consumed in steps of 8 pixels.
void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
  asm volatile (
    "movi v20.8b, #17 \n" // BB coefficient
    "movi v21.8b, #68 \n" // BG coefficient
    "movi v22.8b, #35 \n" // BR coefficient
    "movi v24.8b, #22 \n" // GB coefficient
    "movi v25.8b, #88 \n" // GG coefficient
    "movi v26.8b, #45 \n" // GR coefficient
    "movi v28.8b, #24 \n" // RB coefficient
    "movi v29.8b, #98 \n" // RG coefficient
    "movi v30.8b, #50 \n" // RR coefficient
  "1: \n"
    MEMACCESS(0)
    "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels.
    "subs %w1, %w1, #8 \n" // 8 processed per loop.
    "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B
    "umlal v4.8h, v1.8b, v21.8b \n" // G
    "umlal v4.8h, v2.8b, v22.8b \n" // R
    "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G
    "umlal v5.8h, v1.8b, v25.8b \n" // G
    "umlal v5.8h, v2.8b, v26.8b \n" // R
    "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R
    "umlal v6.8h, v1.8b, v29.8b \n" // G
    "umlal v6.8h, v2.8b, v30.8b \n" // R
    "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B
    "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G
    "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R
    MEMACCESS(0)
    "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels.
    "b.gt 1b \n"
  : "+r"(dst_argb), // %0
    "+r"(width) // %1
  :
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    "v20", "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30"
  );
}
2469
// Transform 8 ARGB pixels (32 bytes) with color matrix.
// TODO(fbarchard): Was same as Sepia except matrix is provided. This function
// needs to saturate. Consider doing a non-saturating version.
//
// matrix_argb points to 16 signed 8-bit coefficients, read as four groups of
// four: coefficients 0-3 produce the output B, 4-7 the output G, 8-11 the
// output R and 12-15 the output A. Within a group the four coefficients
// multiply the input B, G, R and A in that order.
// Products are summed in 16 bits with signed saturation, then shifted right
// by 6 and narrowed to unsigned 8 bits with saturation.
// width is tested after the loop body, so one pass always runs and the row is
// consumed in steps of 8 pixels.
void ARGBColorMatrixRow_NEON(const uint8* src_argb,
                             uint8* dst_argb,
                             const int8* matrix_argb,
                             int width) {
  asm volatile (
    MEMACCESS(3)
    "ld1 {v2.16b}, [%3] \n" // load 16 matrix coefficients.
    "sxtl v0.8h, v2.8b \n" // B,G coefficients s16.
    "sxtl2 v1.8h, v2.16b \n" // R,A coefficients s16.

  "1: \n"
    MEMACCESS(0)
    "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 pixels.
    "subs %w2, %w2, #8 \n" // 8 processed per loop.
    "uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit
    "uxtl v17.8h, v17.8b \n" // g
    "uxtl v18.8h, v18.8b \n" // r
    "uxtl v19.8h, v19.8b \n" // a
    "mul v22.8h, v16.8h, v0.h[0] \n" // B = B * Matrix B
    "mul v23.8h, v16.8h, v0.h[4] \n" // G = B * Matrix G
    "mul v24.8h, v16.8h, v1.h[0] \n" // R = B * Matrix R
    "mul v25.8h, v16.8h, v1.h[4] \n" // A = B * Matrix A
    "mul v4.8h, v17.8h, v0.h[1] \n" // B += G * Matrix B
    "mul v5.8h, v17.8h, v0.h[5] \n" // G += G * Matrix G
    "mul v6.8h, v17.8h, v1.h[1] \n" // R += G * Matrix R
    "mul v7.8h, v17.8h, v1.h[5] \n" // A += G * Matrix A
    "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
    "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
    "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
    "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
    "mul v4.8h, v18.8h, v0.h[2] \n" // B += R * Matrix B
    "mul v5.8h, v18.8h, v0.h[6] \n" // G += R * Matrix G
    "mul v6.8h, v18.8h, v1.h[2] \n" // R += R * Matrix R
    "mul v7.8h, v18.8h, v1.h[6] \n" // A += R * Matrix A
    "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
    "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
    "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
    "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
    "mul v4.8h, v19.8h, v0.h[3] \n" // B += A * Matrix B
    "mul v5.8h, v19.8h, v0.h[7] \n" // G += A * Matrix G
    "mul v6.8h, v19.8h, v1.h[3] \n" // R += A * Matrix R
    "mul v7.8h, v19.8h, v1.h[7] \n" // A += A * Matrix A
    "sqadd v22.8h, v22.8h, v4.8h \n" // Accumulate B
    "sqadd v23.8h, v23.8h, v5.8h \n" // Accumulate G
    "sqadd v24.8h, v24.8h, v6.8h \n" // Accumulate R
    "sqadd v25.8h, v25.8h, v7.8h \n" // Accumulate A
    "sqshrun v16.8b, v22.8h, #6 \n" // 16 bit to 8 bit B
    "sqshrun v17.8b, v23.8h, #6 \n" // 16 bit to 8 bit G
    "sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R
    "sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A
    MEMACCESS(1)
    "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 pixels.
    "b.gt 1b \n"
  : "+r"(src_argb), // %0
    "+r"(dst_argb), // %1
    "+r"(width) // %2
  : "r"(matrix_argb) // %3
  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
    "v18", "v19", "v22", "v23", "v24", "v25"
  );
}
2534
2535 // TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
2536 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
ARGBMultiplyRow_NEON(const uint8 * src_argb0,const uint8 * src_argb1,uint8 * dst_argb,int width)2537 void ARGBMultiplyRow_NEON(const uint8* src_argb0,
2538 const uint8* src_argb1,
2539 uint8* dst_argb,
2540 int width) {
2541 asm volatile (
2542 // 8 pixel loop.
2543 "1: \n"
2544 MEMACCESS(0)
2545 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
2546 MEMACCESS(1)
2547 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels.
2548 "subs %w3, %w3, #8 \n" // 8 processed per loop.
2549 "umull v0.8h, v0.8b, v4.8b \n" // multiply B
2550 "umull v1.8h, v1.8b, v5.8b \n" // multiply G
2551 "umull v2.8h, v2.8b, v6.8b \n" // multiply R
2552 "umull v3.8h, v3.8b, v7.8b \n" // multiply A
2553 "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B
2554 "rshrn v1.8b, v1.8h, #8 \n" // 16 bit to 8 bit G
2555 "rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R
2556 "rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A
2557 MEMACCESS(2)
2558 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
2559 "b.gt 1b \n"
2560
2561 : "+r"(src_argb0), // %0
2562 "+r"(src_argb1), // %1
2563 "+r"(dst_argb), // %2
2564 "+r"(width) // %3
2565 :
2566 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
2567 );
2568 }
2569
2570 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
ARGBAddRow_NEON(const uint8 * src_argb0,const uint8 * src_argb1,uint8 * dst_argb,int width)2571 void ARGBAddRow_NEON(const uint8* src_argb0,
2572 const uint8* src_argb1,
2573 uint8* dst_argb,
2574 int width) {
2575 asm volatile (
2576 // 8 pixel loop.
2577 "1: \n"
2578 MEMACCESS(0)
2579 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
2580 MEMACCESS(1)
2581 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels.
2582 "subs %w3, %w3, #8 \n" // 8 processed per loop.
2583 "uqadd v0.8b, v0.8b, v4.8b \n"
2584 "uqadd v1.8b, v1.8b, v5.8b \n"
2585 "uqadd v2.8b, v2.8b, v6.8b \n"
2586 "uqadd v3.8b, v3.8b, v7.8b \n"
2587 MEMACCESS(2)
2588 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
2589 "b.gt 1b \n"
2590
2591 : "+r"(src_argb0), // %0
2592 "+r"(src_argb1), // %1
2593 "+r"(dst_argb), // %2
2594 "+r"(width) // %3
2595 :
2596 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
2597 );
2598 }
2599
2600 // Subtract 2 rows of ARGB pixels, 8 pixels at a time.
ARGBSubtractRow_NEON(const uint8 * src_argb0,const uint8 * src_argb1,uint8 * dst_argb,int width)2601 void ARGBSubtractRow_NEON(const uint8* src_argb0,
2602 const uint8* src_argb1,
2603 uint8* dst_argb,
2604 int width) {
2605 asm volatile (
2606 // 8 pixel loop.
2607 "1: \n"
2608 MEMACCESS(0)
2609 "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels.
2610 MEMACCESS(1)
2611 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels.
2612 "subs %w3, %w3, #8 \n" // 8 processed per loop.
2613 "uqsub v0.8b, v0.8b, v4.8b \n"
2614 "uqsub v1.8b, v1.8b, v5.8b \n"
2615 "uqsub v2.8b, v2.8b, v6.8b \n"
2616 "uqsub v3.8b, v3.8b, v7.8b \n"
2617 MEMACCESS(2)
2618 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
2619 "b.gt 1b \n"
2620
2621 : "+r"(src_argb0), // %0
2622 "+r"(src_argb1), // %1
2623 "+r"(dst_argb), // %2
2624 "+r"(width) // %3
2625 :
2626 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
2627 );
2628 }
2629
2630 // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
2631 // A = 255
2632 // R = Sobel
2633 // G = Sobel
2634 // B = Sobel
SobelRow_NEON(const uint8 * src_sobelx,const uint8 * src_sobely,uint8 * dst_argb,int width)2635 void SobelRow_NEON(const uint8* src_sobelx,
2636 const uint8* src_sobely,
2637 uint8* dst_argb,
2638 int width) {
2639 asm volatile (
2640 "movi v3.8b, #255 \n" // alpha
2641 // 8 pixel loop.
2642 "1: \n"
2643 MEMACCESS(0)
2644 "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx.
2645 MEMACCESS(1)
2646 "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely.
2647 "subs %w3, %w3, #8 \n" // 8 processed per loop.
2648 "uqadd v0.8b, v0.8b, v1.8b \n" // add
2649 "orr v1.8b, v0.8b, v0.8b \n"
2650 "orr v2.8b, v0.8b, v0.8b \n"
2651 MEMACCESS(2)
2652 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
2653 "b.gt 1b \n"
2654 : "+r"(src_sobelx), // %0
2655 "+r"(src_sobely), // %1
2656 "+r"(dst_argb), // %2
2657 "+r"(width) // %3
2658 :
2659 : "cc", "memory", "v0", "v1", "v2", "v3"
2660 );
2661 }
2662
2663 // Adds Sobel X and Sobel Y and stores Sobel into plane.
SobelToPlaneRow_NEON(const uint8 * src_sobelx,const uint8 * src_sobely,uint8 * dst_y,int width)2664 void SobelToPlaneRow_NEON(const uint8* src_sobelx,
2665 const uint8* src_sobely,
2666 uint8* dst_y,
2667 int width) {
2668 asm volatile (
2669 // 16 pixel loop.
2670 "1: \n"
2671 MEMACCESS(0)
2672 "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx.
2673 MEMACCESS(1)
2674 "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely.
2675 "subs %w3, %w3, #16 \n" // 16 processed per loop.
2676 "uqadd v0.16b, v0.16b, v1.16b \n" // add
2677 MEMACCESS(2)
2678 "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels.
2679 "b.gt 1b \n"
2680 : "+r"(src_sobelx), // %0
2681 "+r"(src_sobely), // %1
2682 "+r"(dst_y), // %2
2683 "+r"(width) // %3
2684 :
2685 : "cc", "memory", "v0", "v1"
2686 );
2687 }
2688
2689 // Mixes Sobel X, Sobel Y and Sobel into ARGB.
2690 // A = 255
2691 // R = Sobel X
2692 // G = Sobel
2693 // B = Sobel Y
SobelXYRow_NEON(const uint8 * src_sobelx,const uint8 * src_sobely,uint8 * dst_argb,int width)2694 void SobelXYRow_NEON(const uint8* src_sobelx,
2695 const uint8* src_sobely,
2696 uint8* dst_argb,
2697 int width) {
2698 asm volatile (
2699 "movi v3.8b, #255 \n" // alpha
2700 // 8 pixel loop.
2701 "1: \n"
2702 MEMACCESS(0)
2703 "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx.
2704 MEMACCESS(1)
2705 "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely.
2706 "subs %w3, %w3, #8 \n" // 8 processed per loop.
2707 "uqadd v1.8b, v0.8b, v2.8b \n" // add
2708 MEMACCESS(2)
2709 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels
2710 "b.gt 1b \n"
2711 : "+r"(src_sobelx), // %0
2712 "+r"(src_sobely), // %1
2713 "+r"(dst_argb), // %2
2714 "+r"(width) // %3
2715 :
2716 : "cc", "memory", "v0", "v1", "v2", "v3"
2717 );
2718 }
2719
2720 // SobelX as a matrix is
2721 // -1 0 1
2722 // -2 0 2
2723 // -1 0 1
SobelXRow_NEON(const uint8 * src_y0,const uint8 * src_y1,const uint8 * src_y2,uint8 * dst_sobelx,int width)2724 void SobelXRow_NEON(const uint8* src_y0,
2725 const uint8* src_y1,
2726 const uint8* src_y2,
2727 uint8* dst_sobelx,
2728 int width) {
2729 asm volatile (
2730 "1: \n"
2731 MEMACCESS(0)
2732 "ld1 {v0.8b}, [%0],%5 \n" // top
2733 MEMACCESS(0)
2734 "ld1 {v1.8b}, [%0],%6 \n"
2735 "usubl v0.8h, v0.8b, v1.8b \n"
2736 MEMACCESS(1)
2737 "ld1 {v2.8b}, [%1],%5 \n" // center * 2
2738 MEMACCESS(1)
2739 "ld1 {v3.8b}, [%1],%6 \n"
2740 "usubl v1.8h, v2.8b, v3.8b \n"
2741 "add v0.8h, v0.8h, v1.8h \n"
2742 "add v0.8h, v0.8h, v1.8h \n"
2743 MEMACCESS(2)
2744 "ld1 {v2.8b}, [%2],%5 \n" // bottom
2745 MEMACCESS(2)
2746 "ld1 {v3.8b}, [%2],%6 \n"
2747 "subs %w4, %w4, #8 \n" // 8 pixels
2748 "usubl v1.8h, v2.8b, v3.8b \n"
2749 "add v0.8h, v0.8h, v1.8h \n"
2750 "abs v0.8h, v0.8h \n"
2751 "uqxtn v0.8b, v0.8h \n"
2752 MEMACCESS(3)
2753 "st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx
2754 "b.gt 1b \n"
2755 : "+r"(src_y0), // %0
2756 "+r"(src_y1), // %1
2757 "+r"(src_y2), // %2
2758 "+r"(dst_sobelx), // %3
2759 "+r"(width) // %4
2760 : "r"(2LL), // %5
2761 "r"(6LL) // %6
2762 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
2763 );
2764 }
2765
2766 // SobelY as a matrix is
2767 // -1 -2 -1
2768 // 0 0 0
2769 // 1 2 1
SobelYRow_NEON(const uint8 * src_y0,const uint8 * src_y1,uint8 * dst_sobely,int width)2770 void SobelYRow_NEON(const uint8* src_y0,
2771 const uint8* src_y1,
2772 uint8* dst_sobely,
2773 int width) {
2774 asm volatile (
2775 "1: \n"
2776 MEMACCESS(0)
2777 "ld1 {v0.8b}, [%0],%4 \n" // left
2778 MEMACCESS(1)
2779 "ld1 {v1.8b}, [%1],%4 \n"
2780 "usubl v0.8h, v0.8b, v1.8b \n"
2781 MEMACCESS(0)
2782 "ld1 {v2.8b}, [%0],%4 \n" // center * 2
2783 MEMACCESS(1)
2784 "ld1 {v3.8b}, [%1],%4 \n"
2785 "usubl v1.8h, v2.8b, v3.8b \n"
2786 "add v0.8h, v0.8h, v1.8h \n"
2787 "add v0.8h, v0.8h, v1.8h \n"
2788 MEMACCESS(0)
2789 "ld1 {v2.8b}, [%0],%5 \n" // right
2790 MEMACCESS(1)
2791 "ld1 {v3.8b}, [%1],%5 \n"
2792 "subs %w3, %w3, #8 \n" // 8 pixels
2793 "usubl v1.8h, v2.8b, v3.8b \n"
2794 "add v0.8h, v0.8h, v1.8h \n"
2795 "abs v0.8h, v0.8h \n"
2796 "uqxtn v0.8b, v0.8h \n"
2797 MEMACCESS(2)
2798 "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely
2799 "b.gt 1b \n"
2800 : "+r"(src_y0), // %0
2801 "+r"(src_y1), // %1
2802 "+r"(dst_sobely), // %2
2803 "+r"(width) // %3
2804 : "r"(1LL), // %4
2805 "r"(6LL) // %5
2806 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
2807 );
2808 }
2809
2810 // Caveat - rounds float to half float whereas scaling version truncates.
HalfFloat1Row_NEON(const uint16 * src,uint16 * dst,float,int width)2811 void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) {
2812 asm volatile (
2813 "1: \n"
2814 MEMACCESS(0)
2815 "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
2816 "subs %w2, %w2, #8 \n" // 8 pixels per loop
2817 "uxtl v2.4s, v1.4h \n" // 8 int's
2818 "uxtl2 v3.4s, v1.8h \n"
2819 "scvtf v2.4s, v2.4s \n" // 8 floats
2820 "scvtf v3.4s, v3.4s \n"
2821 "fcvtn v1.4h, v2.4s \n" // 8 half floats
2822 "fcvtn2 v1.8h, v3.4s \n"
2823 MEMACCESS(1)
2824 "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
2825 "b.gt 1b \n"
2826 : "+r"(src), // %0
2827 "+r"(dst), // %1
2828 "+r"(width) // %2
2829 :
2830 : "cc", "memory", "v1", "v2", "v3"
2831 );
2832 }
2833
HalfFloatRow_NEON(const uint16 * src,uint16 * dst,float scale,int width)2834 void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) {
2835 asm volatile (
2836 "1: \n"
2837 MEMACCESS(0)
2838 "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts
2839 "subs %w2, %w2, #8 \n" // 8 pixels per loop
2840 "uxtl v2.4s, v1.4h \n" // 8 int's
2841 "uxtl2 v3.4s, v1.8h \n"
2842 "scvtf v2.4s, v2.4s \n" // 8 floats
2843 "scvtf v3.4s, v3.4s \n"
2844 "fmul v2.4s, v2.4s, %3.s[0] \n" // adjust exponent
2845 "fmul v3.4s, v3.4s, %3.s[0] \n"
2846 "uqshrn v1.4h, v2.4s, #13 \n" // isolate halffloat
2847 "uqshrn2 v1.8h, v3.4s, #13 \n"
2848 MEMACCESS(1)
2849 "st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
2850 "b.gt 1b \n"
2851 : "+r"(src), // %0
2852 "+r"(dst), // %1
2853 "+r"(width) // %2
2854 : "w"(scale * 1.9259299444e-34f) // %3
2855 : "cc", "memory", "v1", "v2", "v3"
2856 );
2857 }
2858
2859 #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
2860
2861 #ifdef __cplusplus
2862 } // extern "C"
2863 } // namespace libyuv
2864 #endif
2865