1 /*
2  *  Copyright 2014 The LibYuv Project Authors. All rights reserved.
3  *
4  *  Use of this source code is governed by a BSD-style license
5  *  that can be found in the LICENSE file in the root of the source
6  *  tree. An additional intellectual property rights grant can be found
7  *  in the file PATENTS. All contributing project authors may
8  *  be found in the AUTHORS file in the root of the source tree.
9  */
10 
11 #include "libyuv/row.h"
12 
13 #ifdef __cplusplus
14 namespace libyuv {
15 extern "C" {
16 #endif
17 
18 // This module is for GCC Neon armv8 64 bit.
19 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
20 
// Load one 8-pixel group of 4:2:2 input.
//   v0.8b      <- 8 Y bytes from [%0]
//   v1.s[0]    <- 4 U bytes from [%1]
//   v1.s[1]    <- 4 V bytes from [%2]
// Each pointer operand is post-incremented by the number of bytes read.
#define READYUV422                                                   \
  MEMACCESS(0)                                                       \
  "ld1        {v0.8b}, [%0], #8              \n" /* 8 Y */           \
  MEMACCESS(1)                                                       \
  "ld1        {v1.s}[0], [%1], #4            \n" /* 4 U */           \
  MEMACCESS(2)                                                       \
  "ld1        {v1.s}[1], [%2], #4            \n" /* 4 V */
29 
// Load one 8-pixel group of 4:4:4 input and reduce chroma to the 4 U + 4 V
// layout that YUVTORGB consumes.
//   v0.8b      <- 8 Y bytes from [%0]
//   v1.d[0]    <- 8 U bytes from [%1]
//   v1.d[1]    <- 8 V bytes from [%2]
// Adjacent chroma bytes are then summed pairwise (uaddlp) and halved with
// rounding (rshrn #1), leaving 4 averaged U and 4 averaged V bytes in v1.8b.
// Each pointer operand is post-incremented by 8.
#define READYUV444                                                   \
  MEMACCESS(0)                                                       \
  "ld1        {v0.8b}, [%0], #8              \n" /* 8 Y */           \
  MEMACCESS(1)                                                       \
  "ld1        {v1.d}[0], [%1], #8            \n" /* 8 U */           \
  MEMACCESS(2)                                                       \
  "ld1        {v1.d}[1], [%2], #8            \n" /* 8 V */           \
  "uaddlp     v1.8h, v1.16b                  \n" /* pair sums */     \
  "rshrn      v1.8b, v1.8h, #1               \n" /* rounded avg */
40 
// Load one 8-pixel group of luma-only input.
//   v0.8b <- 8 Y bytes from [%0] (pointer post-incremented by 8)
//   v1.8b <- 128 in every byte, i.e. 4 U and 4 V all set to 128
#define READYUV400                                                   \
  MEMACCESS(0)                                                       \
  "ld1        {v0.8b}, [%0], #8              \n" /* 8 Y */           \
  "movi       v1.8b , #128                   \n" /* U = V = 128 */
46 
// Load one 8-pixel group of NV12 input (Y plane plus interleaved U,V bytes).
//   v0.8b   <- 8 Y bytes from [%0]
//   v2.8b   <- 8 interleaved chroma bytes from [%1] (U at even offsets)
//   v1.s[0] <- the 4 even bytes of v2 (U)
//   v1.s[1] <- the 4 odd bytes of v2 (V), via scratch register v3
// Both pointer operands are post-incremented by 8.
#define READNV12                                                     \
  MEMACCESS(0)                                                       \
  "ld1        {v0.8b}, [%0], #8              \n" /* 8 Y */           \
  MEMACCESS(1)                                                       \
  "ld1        {v2.8b}, [%1], #8              \n" /* 4 UV pairs */    \
  "uzp1       v1.8b, v2.8b, v2.8b            \n" /* even -> U */     \
  "uzp2       v3.8b, v2.8b, v2.8b            \n" /* odd  -> V */     \
  "ins        v1.s[1], v3.s[0]               \n" /* v1 = U | V */
56 
// Load one 8-pixel group of NV21 input (Y plane plus interleaved V,U bytes).
//   v0.8b   <- 8 Y bytes from [%0]
//   v2.8b   <- 8 interleaved chroma bytes from [%1] (V at even offsets)
//   v1.s[0] <- the 4 odd bytes of v2 (U)
//   v1.s[1] <- the 4 even bytes of v2 (V), via scratch register v3
// Both pointer operands are post-incremented by 8.
#define READNV21                                                     \
  MEMACCESS(0)                                                       \
  "ld1        {v0.8b}, [%0], #8              \n" /* 8 Y */           \
  MEMACCESS(1)                                                       \
  "ld1        {v2.8b}, [%1], #8              \n" /* 4 VU pairs */    \
  "uzp1       v3.8b, v2.8b, v2.8b            \n" /* even -> V */     \
  "uzp2       v1.8b, v2.8b, v2.8b            \n" /* odd  -> U */     \
  "ins        v1.s[1], v3.s[0]               \n" /* v1 = U | V */
66 
// Load 8 packed YUY2 pixels (16 bytes: Y0 U Y1 V ...) from [%0].
//   ld2 de-interleaves: v0.8b <- even bytes (8 Y), v1.8b <- odd bytes (U,V,...)
//   v1.s[0] <- the 4 U bytes, v1.s[1] <- the 4 V bytes (v3 is scratch)
// The pointer operand is post-incremented by 16.
#define READYUY2                                                     \
  MEMACCESS(0)                                                       \
  "ld2        {v0.8b, v1.8b}, [%0], #16      \n" /* Y | UV */        \
  "uzp2       v3.8b, v1.8b, v1.8b            \n" /* odd  -> V */     \
  "uzp1       v1.8b, v1.8b, v1.8b            \n" /* even -> U */     \
  "ins        v1.s[1], v3.s[0]               \n" /* v1 = U | V */
74 
// Load 8 packed UYVY pixels (16 bytes: U Y0 V Y1 ...) from [%0].
//   ld2 de-interleaves: v2.8b <- even bytes (U,V,...), v3.8b <- odd bytes (8 Y)
//   v0.8b   <- copy of the 8 Y bytes
//   v1.s[0] <- the 4 U bytes, v1.s[1] <- the 4 V bytes (v3 reused as scratch)
// The pointer operand is post-incremented by 16.
#define READUYVY                                                     \
  MEMACCESS(0)                                                       \
  "ld2        {v2.8b, v3.8b}, [%0], #16      \n" /* UV | Y */        \
  "orr        v0.8b, v3.8b, v3.8b            \n" /* v0 = Y */        \
  "uzp1       v1.8b, v2.8b, v2.8b            \n" /* even -> U */     \
  "uzp2       v3.8b, v2.8b, v2.8b            \n" /* odd  -> V */     \
  "ins        v1.s[1], v3.s[0]               \n" /* v1 = U | V */
83 
// Load the conversion constants consumed by YUVTORGB.  The asm statement that
// uses this macro must provide the named operands kUVBiasBGR, kYToRgb,
// kUVToRB and kUVToG, and must list v24-v31 as clobbers.
//   v24, v25, v26 <- the first three 16-bit entries at kUVBiasBGR, each
//                    replicated to all 8 lanes
//   v31           <- the first 32-bit entry at kYToRgb, replicated to 4 lanes
//   v27, v28      <- even / odd 16-bit entries at kUVToRB
//   v29, v30      <- even / odd 16-bit entries at kUVToG
// A single ld3r is used for the bias values instead of three post-incrementing
// ld1r loads: it reads the same three halfwords but leaves the address
// register untouched.  kUVBiasBGR is bound as an input-only operand, and
// input-only operands must not be modified by the asm.
#define YUVTORGB_SETUP                                      \
  "ld3r       {v24.8h, v25.8h, v26.8h}, [%[kUVBiasBGR]] \n" \
  "ld1r       {v31.4s}, [%[kYToRgb]]         \n"            \
  "ld2        {v27.8h, v28.8h}, [%[kUVToRB]] \n"            \
  "ld2        {v29.8h, v30.8h}, [%[kUVToG]]  \n"
91 
// Convert 8 pixels from YUV to 8-bit R, G and B.
// Inputs:  v0.8b = 8 Y; v1.8b = 4 U (low 32 bits) and 4 V (high 32 bits);
//          v24-v31 = constants as loaded by YUVTORGB_SETUP.
// Outputs: vR.8b, vG.8b, vB.8b, where vR/vG/vB are register names chosen by
//          the user of the macro.
// Scratch: v0-v3 and v5-v7 are overwritten.
#define YUVTORGB(vR, vG, vB)                                                   \
  /* Y * v31 >> 16, computed in 32 bits and narrowed back to 8 x 16 bits. */   \
  "uxtl       v0.8h, v0.8b                   \n" /* Extract Y    */            \
  "shll       v2.8h, v1.8b, #8               \n" /* Replicate UV */            \
  "ushll2     v3.4s, v0.8h, #0               \n" /* Y, high 4    */            \
  "ushll      v0.4s, v0.4h, #0               \n" /* Y, low 4     */            \
  "mul        v3.4s, v3.4s, v31.4s           \n"                               \
  "mul        v0.4s, v0.4s, v31.4s           \n"                               \
  "sqshrun    v0.4h, v0.4s, #16              \n"                               \
  "sqshrun2   v0.8h, v3.4s, #16              \n" /* Y            */            \
  /* (UV << 8) + UV doubles every chroma byte: 8 U then 8 V in v1.16b. */      \
  "uaddw      v1.8h, v2.8h, v1.8b            \n" /* Replicate UV */            \
  "mov        v2.d[0], v1.d[1]               \n" /* Extract V    */            \
  "uxtl       v2.8h, v2.8b                   \n"                               \
  "uxtl       v1.8h, v1.8b                   \n" /* Extract U    */            \
  /* Chroma products: v3 = U*v27, v7 = V*v28, v6 = V*v30 + U*v29. */           \
  "mul        v3.8h, v1.8h, v27.8h           \n"                               \
  "mul        v5.8h, v1.8h, v29.8h           \n"                               \
  "mul        v6.8h, v2.8h, v30.8h           \n"                               \
  "mul        v7.8h, v2.8h, v28.8h           \n"                               \
  "sqadd      v6.8h, v6.8h, v5.8h            \n"                               \
  /* bias + Y for each channel. */                                             \
  "sqadd      " #vB ".8h, v24.8h, v0.8h      \n" /* B */                       \
  "sqadd      " #vG ".8h, v25.8h, v0.8h      \n" /* G */                       \
  "sqadd      " #vR ".8h, v26.8h, v0.8h      \n" /* R */                       \
  /* Apply the chroma terms (added for B and R, subtracted for G). */          \
  "sqadd      " #vB ".8h, " #vB ".8h, v3.8h  \n" /* B */                       \
  "sqsub      " #vG ".8h, " #vG ".8h, v6.8h  \n" /* G */                       \
  "sqadd      " #vR ".8h, " #vR ".8h, v7.8h  \n" /* R */                       \
  /* >> 6 with unsigned saturation down to 8 bits per channel. */              \
  "sqshrun    " #vB ".8b, " #vB ".8h, #6     \n" /* B */                       \
  "sqshrun    " #vG ".8b, " #vG ".8h, #6     \n" /* G */                       \
  "sqshrun    " #vR ".8b, " #vR ".8h, #6     \n" /* R */
127 
I444ToARGBRow_NEON(const uint8 * src_y,const uint8 * src_u,const uint8 * src_v,uint8 * dst_argb,const struct YuvConstants * yuvconstants,int width)128 void I444ToARGBRow_NEON(const uint8* src_y,
129                         const uint8* src_u,
130                         const uint8* src_v,
131                         uint8* dst_argb,
132                         const struct YuvConstants* yuvconstants,
133                         int width) {
134   asm volatile (
135     YUVTORGB_SETUP
136     "movi       v23.8b, #255                   \n" /* A */
137   "1:                                          \n"
138     READYUV444
139     YUVTORGB(v22, v21, v20)
140     "subs       %w4, %w4, #8                   \n"
141     MEMACCESS(3)
142     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
143     "b.gt       1b                             \n"
144     : "+r"(src_y),     // %0
145       "+r"(src_u),     // %1
146       "+r"(src_v),     // %2
147       "+r"(dst_argb),  // %3
148       "+r"(width)      // %4
149     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
150       [kUVToG]"r"(&yuvconstants->kUVToG),
151       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
152       [kYToRgb]"r"(&yuvconstants->kYToRgb)
153     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
154       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
155   );
156 }
157 
I422ToARGBRow_NEON(const uint8 * src_y,const uint8 * src_u,const uint8 * src_v,uint8 * dst_argb,const struct YuvConstants * yuvconstants,int width)158 void I422ToARGBRow_NEON(const uint8* src_y,
159                         const uint8* src_u,
160                         const uint8* src_v,
161                         uint8* dst_argb,
162                         const struct YuvConstants* yuvconstants,
163                         int width) {
164   asm volatile (
165     YUVTORGB_SETUP
166     "movi       v23.8b, #255                   \n" /* A */
167   "1:                                          \n"
168     READYUV422
169     YUVTORGB(v22, v21, v20)
170     "subs       %w4, %w4, #8                   \n"
171     MEMACCESS(3)
172     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"
173     "b.gt       1b                             \n"
174     : "+r"(src_y),     // %0
175       "+r"(src_u),     // %1
176       "+r"(src_v),     // %2
177       "+r"(dst_argb),  // %3
178       "+r"(width)      // %4
179     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
180       [kUVToG]"r"(&yuvconstants->kUVToG),
181       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
182       [kYToRgb]"r"(&yuvconstants->kYToRgb)
183     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
184       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
185   );
186 }
187 
I422AlphaToARGBRow_NEON(const uint8 * src_y,const uint8 * src_u,const uint8 * src_v,const uint8 * src_a,uint8 * dst_argb,const struct YuvConstants * yuvconstants,int width)188 void I422AlphaToARGBRow_NEON(const uint8* src_y,
189                              const uint8* src_u,
190                              const uint8* src_v,
191                              const uint8* src_a,
192                              uint8* dst_argb,
193                              const struct YuvConstants* yuvconstants,
194                              int width) {
195   asm volatile (
196     YUVTORGB_SETUP
197   "1:                                          \n"
198     READYUV422
199     YUVTORGB(v22, v21, v20)
200     MEMACCESS(3)
201     "ld1        {v23.8b}, [%3], #8             \n"
202     "subs       %w5, %w5, #8                   \n"
203     MEMACCESS(4)
204     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32     \n"
205     "b.gt       1b                             \n"
206     : "+r"(src_y),     // %0
207       "+r"(src_u),     // %1
208       "+r"(src_v),     // %2
209       "+r"(src_a),     // %3
210       "+r"(dst_argb),  // %4
211       "+r"(width)      // %5
212     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
213       [kUVToG]"r"(&yuvconstants->kUVToG),
214       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
215       [kYToRgb]"r"(&yuvconstants->kYToRgb)
216     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
217       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
218   );
219 }
220 
I422ToRGBARow_NEON(const uint8 * src_y,const uint8 * src_u,const uint8 * src_v,uint8 * dst_rgba,const struct YuvConstants * yuvconstants,int width)221 void I422ToRGBARow_NEON(const uint8* src_y,
222                         const uint8* src_u,
223                         const uint8* src_v,
224                         uint8* dst_rgba,
225                         const struct YuvConstants* yuvconstants,
226                         int width) {
227   asm volatile (
228     YUVTORGB_SETUP
229     "movi       v20.8b, #255                   \n" /* A */
230   "1:                                          \n"
231     READYUV422
232     YUVTORGB(v23, v22, v21)
233     "subs       %w4, %w4, #8                   \n"
234     MEMACCESS(3)
235     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32     \n"
236     "b.gt       1b                             \n"
237     : "+r"(src_y),     // %0
238       "+r"(src_u),     // %1
239       "+r"(src_v),     // %2
240       "+r"(dst_rgba),  // %3
241       "+r"(width)      // %4
242     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
243       [kUVToG]"r"(&yuvconstants->kUVToG),
244       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
245       [kYToRgb]"r"(&yuvconstants->kYToRgb)
246     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
247       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
248   );
249 }
250 
I422ToRGB24Row_NEON(const uint8 * src_y,const uint8 * src_u,const uint8 * src_v,uint8 * dst_rgb24,const struct YuvConstants * yuvconstants,int width)251 void I422ToRGB24Row_NEON(const uint8* src_y,
252                          const uint8* src_u,
253                          const uint8* src_v,
254                          uint8* dst_rgb24,
255                          const struct YuvConstants* yuvconstants,
256                          int width) {
257   asm volatile (
258     YUVTORGB_SETUP
259   "1:                                          \n"
260     READYUV422
261     YUVTORGB(v22, v21, v20)
262     "subs       %w4, %w4, #8                   \n"
263     MEMACCESS(3)
264     "st3        {v20.8b,v21.8b,v22.8b}, [%3], #24     \n"
265     "b.gt       1b                             \n"
266     : "+r"(src_y),     // %0
267       "+r"(src_u),     // %1
268       "+r"(src_v),     // %2
269       "+r"(dst_rgb24), // %3
270       "+r"(width)      // %4
271     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
272       [kUVToG]"r"(&yuvconstants->kUVToG),
273       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
274       [kYToRgb]"r"(&yuvconstants->kYToRgb)
275     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
276       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
277   );
278 }
279 
// Pack 8 pixels into 16-bit RGB565 values in v0.8h.
// Inputs: v22.8b = R, v21.8b = G, v20.8b = B.  v20 and v21 are overwritten.
// Each channel is first moved to the top byte of a 16-bit lane; the two
// shift-right-insert steps then place G below the top 5 bits of R and B below
// the top 11 bits, giving RRRRRGGG GGGBBBBB per lane.
#define ARGBTORGB565                                                  \
  "shll       v0.8h,  v22.8b, #8             \n" /* R << 8 */         \
  "shll       v21.8h, v21.8b, #8             \n" /* G << 8 */         \
  "shll       v20.8h, v20.8b, #8             \n" /* B << 8 */         \
  "sri        v0.8h,  v21.8h, #5             \n" /* R5 G..  */        \
  "sri        v0.8h,  v20.8h, #11            \n" /* R5 G6 B5 */
286 
I422ToRGB565Row_NEON(const uint8 * src_y,const uint8 * src_u,const uint8 * src_v,uint8 * dst_rgb565,const struct YuvConstants * yuvconstants,int width)287 void I422ToRGB565Row_NEON(const uint8* src_y,
288                           const uint8* src_u,
289                           const uint8* src_v,
290                           uint8* dst_rgb565,
291                           const struct YuvConstants* yuvconstants,
292                           int width) {
293   asm volatile (
294     YUVTORGB_SETUP
295   "1:                                          \n"
296     READYUV422
297     YUVTORGB(v22, v21, v20)
298     "subs       %w4, %w4, #8                   \n"
299     ARGBTORGB565
300     MEMACCESS(3)
301     "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels RGB565.
302     "b.gt       1b                             \n"
303     : "+r"(src_y),    // %0
304       "+r"(src_u),    // %1
305       "+r"(src_v),    // %2
306       "+r"(dst_rgb565),  // %3
307       "+r"(width)     // %4
308     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
309       [kUVToG]"r"(&yuvconstants->kUVToG),
310       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
311       [kYToRgb]"r"(&yuvconstants->kYToRgb)
312     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
313       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
314   );
315 }
316 
// Pack 8 pixels into 16-bit ARGB1555 values in v0.8h.
// Inputs: v23.8b = A, v22.8b = R, v21.8b = G, v20.8b = B.  v20-v22 are
// overwritten.
// Each channel is first moved to the top byte of a 16-bit lane; successive
// shift-right-insert steps then keep the top bit of A, 5 bits of R, 5 bits of
// G and 5 bits of B, giving ARRRRRGG GGGBBBBB per lane.
#define ARGBTOARGB1555                                                \
  "shll       v0.8h,  v23.8b, #8             \n" /* A << 8 */         \
  "shll       v22.8h, v22.8b, #8             \n" /* R << 8 */         \
  "shll       v21.8h, v21.8b, #8             \n" /* G << 8 */         \
  "shll       v20.8h, v20.8b, #8             \n" /* B << 8 */         \
  "sri        v0.8h,  v22.8h, #1             \n" /* A1 R.. */         \
  "sri        v0.8h,  v21.8h, #6             \n" /* A1 R5 G.. */      \
  "sri        v0.8h,  v20.8h, #11            \n" /* A1 R5 G5 B5 */
325 
I422ToARGB1555Row_NEON(const uint8 * src_y,const uint8 * src_u,const uint8 * src_v,uint8 * dst_argb1555,const struct YuvConstants * yuvconstants,int width)326 void I422ToARGB1555Row_NEON(const uint8* src_y,
327                             const uint8* src_u,
328                             const uint8* src_v,
329                             uint8* dst_argb1555,
330                             const struct YuvConstants* yuvconstants,
331                             int width) {
332   asm volatile (
333     YUVTORGB_SETUP
334     "movi       v23.8b, #255                   \n"
335   "1:                                          \n"
336     READYUV422
337     YUVTORGB(v22, v21, v20)
338     "subs       %w4, %w4, #8                   \n"
339     ARGBTOARGB1555
340     MEMACCESS(3)
341     "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels RGB565.
342     "b.gt       1b                             \n"
343     : "+r"(src_y),    // %0
344       "+r"(src_u),    // %1
345       "+r"(src_v),    // %2
346       "+r"(dst_argb1555),  // %3
347       "+r"(width)     // %4
348     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
349       [kUVToG]"r"(&yuvconstants->kUVToG),
350       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
351       [kYToRgb]"r"(&yuvconstants->kYToRgb)
352     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
353       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
354   );
355 }
356 
// Pack 8 pixels into 16-bit ARGB4444 values in v0.
// Inputs: v20.8b = B, v21.8b = G, v22.8b = R, v23.8b = A, v4.8b = 0x0f in
// every byte.  v20-v23 and v1 are overwritten.
// B and R are reduced to their high nibble moved into the low nibble; G and A
// keep their high nibble in place.  OR-ing gives one G|B byte and one A|R
// byte per pixel, and zip1 interleaves those bytes pixel by pixel.
#define ARGBTOARGB4444                                                \
  "ushr       v20.8b, v20.8b, #4             \n" /* B >> 4    */      \
  "bic        v21.8b, v21.8b, v4.8b          \n" /* G & 0xf0  */      \
  "ushr       v22.8b, v22.8b, #4             \n" /* R >> 4    */      \
  "bic        v23.8b, v23.8b, v4.8b          \n" /* A & 0xf0  */      \
  "orr        v0.8b,  v20.8b, v21.8b         \n" /* BG        */      \
  "orr        v1.8b,  v22.8b, v23.8b         \n" /* RA        */      \
  "zip1       v0.16b, v0.16b, v1.16b         \n" /* BGRA      */
366 
I422ToARGB4444Row_NEON(const uint8 * src_y,const uint8 * src_u,const uint8 * src_v,uint8 * dst_argb4444,const struct YuvConstants * yuvconstants,int width)367 void I422ToARGB4444Row_NEON(const uint8* src_y,
368                             const uint8* src_u,
369                             const uint8* src_v,
370                             uint8* dst_argb4444,
371                             const struct YuvConstants* yuvconstants,
372                             int width) {
373   asm volatile (
374     YUVTORGB_SETUP
375     "movi       v4.16b, #0x0f                  \n"  // bits to clear with vbic.
376   "1:                                          \n"
377     READYUV422
378     YUVTORGB(v22, v21, v20)
379     "subs       %w4, %w4, #8                   \n"
380     "movi       v23.8b, #255                   \n"
381     ARGBTOARGB4444
382     MEMACCESS(3)
383     "st1        {v0.8h}, [%3], #16             \n"  // store 8 pixels ARGB4444.
384     "b.gt       1b                             \n"
385     : "+r"(src_y),    // %0
386       "+r"(src_u),    // %1
387       "+r"(src_v),    // %2
388       "+r"(dst_argb4444),  // %3
389       "+r"(width)     // %4
390     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
391       [kUVToG]"r"(&yuvconstants->kUVToG),
392       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
393       [kYToRgb]"r"(&yuvconstants->kYToRgb)
394     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
395       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
396   );
397 }
398 
I400ToARGBRow_NEON(const uint8 * src_y,uint8 * dst_argb,int width)399 void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) {
400   asm volatile (
401     YUVTORGB_SETUP
402     "movi       v23.8b, #255                   \n"
403   "1:                                          \n"
404     READYUV400
405     YUVTORGB(v22, v21, v20)
406     "subs       %w2, %w2, #8                   \n"
407     MEMACCESS(1)
408     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32     \n"
409     "b.gt       1b                             \n"
410     : "+r"(src_y),     // %0
411       "+r"(dst_argb),  // %1
412       "+r"(width)      // %2
413     : [kUVToRB]"r"(&kYuvI601Constants.kUVToRB),
414       [kUVToG]"r"(&kYuvI601Constants.kUVToG),
415       [kUVBiasBGR]"r"(&kYuvI601Constants.kUVBiasBGR),
416       [kYToRgb]"r"(&kYuvI601Constants.kYToRgb)
417     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
418       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
419   );
420 }
421 
J400ToARGBRow_NEON(const uint8 * src_y,uint8 * dst_argb,int width)422 void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) {
423   asm volatile (
424     "movi       v23.8b, #255                   \n"
425   "1:                                          \n"
426     MEMACCESS(0)
427     "ld1        {v20.8b}, [%0], #8             \n"
428     "orr        v21.8b, v20.8b, v20.8b         \n"
429     "orr        v22.8b, v20.8b, v20.8b         \n"
430     "subs       %w2, %w2, #8                   \n"
431     MEMACCESS(1)
432     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32     \n"
433     "b.gt       1b                             \n"
434     : "+r"(src_y),     // %0
435       "+r"(dst_argb),  // %1
436       "+r"(width)      // %2
437     :
438     : "cc", "memory", "v20", "v21", "v22", "v23"
439   );
440 }
441 
NV12ToARGBRow_NEON(const uint8 * src_y,const uint8 * src_uv,uint8 * dst_argb,const struct YuvConstants * yuvconstants,int width)442 void NV12ToARGBRow_NEON(const uint8* src_y,
443                         const uint8* src_uv,
444                         uint8* dst_argb,
445                         const struct YuvConstants* yuvconstants,
446                         int width) {
447   asm volatile (
448     YUVTORGB_SETUP
449     "movi       v23.8b, #255                   \n"
450   "1:                                          \n"
451     READNV12
452     YUVTORGB(v22, v21, v20)
453     "subs       %w3, %w3, #8                   \n"
454     MEMACCESS(2)
455     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32     \n"
456     "b.gt       1b                             \n"
457     : "+r"(src_y),     // %0
458       "+r"(src_uv),    // %1
459       "+r"(dst_argb),  // %2
460       "+r"(width)      // %3
461     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
462       [kUVToG]"r"(&yuvconstants->kUVToG),
463       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
464       [kYToRgb]"r"(&yuvconstants->kYToRgb)
465     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
466       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
467   );
468 }
469 
NV21ToARGBRow_NEON(const uint8 * src_y,const uint8 * src_vu,uint8 * dst_argb,const struct YuvConstants * yuvconstants,int width)470 void NV21ToARGBRow_NEON(const uint8* src_y,
471                         const uint8* src_vu,
472                         uint8* dst_argb,
473                         const struct YuvConstants* yuvconstants,
474                         int width) {
475   asm volatile (
476     YUVTORGB_SETUP
477     "movi       v23.8b, #255                   \n"
478   "1:                                          \n"
479     READNV21
480     YUVTORGB(v22, v21, v20)
481     "subs       %w3, %w3, #8                   \n"
482     MEMACCESS(2)
483     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32     \n"
484     "b.gt       1b                             \n"
485     : "+r"(src_y),     // %0
486       "+r"(src_vu),    // %1
487       "+r"(dst_argb),  // %2
488       "+r"(width)      // %3
489     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
490       [kUVToG]"r"(&yuvconstants->kUVToG),
491       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
492       [kYToRgb]"r"(&yuvconstants->kYToRgb)
493     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
494       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
495   );
496 }
497 
NV12ToRGB565Row_NEON(const uint8 * src_y,const uint8 * src_uv,uint8 * dst_rgb565,const struct YuvConstants * yuvconstants,int width)498 void NV12ToRGB565Row_NEON(const uint8* src_y,
499                           const uint8* src_uv,
500                           uint8* dst_rgb565,
501                           const struct YuvConstants* yuvconstants,
502                           int width) {
503   asm volatile (
504     YUVTORGB_SETUP
505   "1:                                          \n"
506     READNV12
507     YUVTORGB(v22, v21, v20)
508     "subs       %w3, %w3, #8                   \n"
509     ARGBTORGB565
510     MEMACCESS(2)
511     "st1        {v0.8h}, [%2], 16              \n"  // store 8 pixels RGB565.
512     "b.gt       1b                             \n"
513     : "+r"(src_y),     // %0
514       "+r"(src_uv),    // %1
515       "+r"(dst_rgb565),  // %2
516       "+r"(width)      // %3
517     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
518       [kUVToG]"r"(&yuvconstants->kUVToG),
519       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
520       [kYToRgb]"r"(&yuvconstants->kYToRgb)
521     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
522       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
523   );
524 }
525 
YUY2ToARGBRow_NEON(const uint8 * src_yuy2,uint8 * dst_argb,const struct YuvConstants * yuvconstants,int width)526 void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
527                         uint8* dst_argb,
528                         const struct YuvConstants* yuvconstants,
529                         int width) {
530   asm volatile (
531     YUVTORGB_SETUP
532     "movi       v23.8b, #255                   \n"
533   "1:                                          \n"
534     READYUY2
535     YUVTORGB(v22, v21, v20)
536     "subs       %w2, %w2, #8                   \n"
537     MEMACCESS(1)
538     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32      \n"
539     "b.gt       1b                             \n"
540     : "+r"(src_yuy2),  // %0
541       "+r"(dst_argb),  // %1
542       "+r"(width)      // %2
543     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
544       [kUVToG]"r"(&yuvconstants->kUVToG),
545       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
546       [kYToRgb]"r"(&yuvconstants->kYToRgb)
547     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
548       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
549   );
550 }
551 
UYVYToARGBRow_NEON(const uint8 * src_uyvy,uint8 * dst_argb,const struct YuvConstants * yuvconstants,int width)552 void UYVYToARGBRow_NEON(const uint8* src_uyvy,
553                         uint8* dst_argb,
554                         const struct YuvConstants* yuvconstants,
555                         int width) {
556   asm volatile (
557     YUVTORGB_SETUP
558     "movi       v23.8b, #255                   \n"
559   "1:                                          \n"
560     READUYVY
561     YUVTORGB(v22, v21, v20)
562     "subs       %w2, %w2, #8                   \n"
563     MEMACCESS(1)
564     "st4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32      \n"
565     "b.gt       1b                             \n"
566     : "+r"(src_uyvy),  // %0
567       "+r"(dst_argb),  // %1
568       "+r"(width)      // %2
569     : [kUVToRB]"r"(&yuvconstants->kUVToRB),
570       [kUVToG]"r"(&yuvconstants->kUVToG),
571       [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
572       [kYToRgb]"r"(&yuvconstants->kYToRgb)
573     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
574       "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
575   );
576 }
577 
578 // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v.
SplitUVRow_NEON(const uint8 * src_uv,uint8 * dst_u,uint8 * dst_v,int width)579 void SplitUVRow_NEON(const uint8* src_uv,
580                      uint8* dst_u,
581                      uint8* dst_v,
582                      int width) {
583   asm volatile (
584   "1:                                          \n"
585     MEMACCESS(0)
586     "ld2        {v0.16b,v1.16b}, [%0], #32     \n"  // load 16 pairs of UV
587     "subs       %w3, %w3, #16                  \n"  // 16 processed per loop
588     MEMACCESS(1)
589     "st1        {v0.16b}, [%1], #16            \n"  // store U
590     MEMACCESS(2)
591     "st1        {v1.16b}, [%2], #16            \n"  // store V
592     "b.gt       1b                             \n"
593     : "+r"(src_uv),  // %0
594       "+r"(dst_u),   // %1
595       "+r"(dst_v),   // %2
596       "+r"(width)    // %3  // Output registers
597     :                       // Input registers
598     : "cc", "memory", "v0", "v1"  // Clobber List
599   );
600 }
601 
602 // Reads 16 U's and V's and writes out 16 pairs of UV.
MergeUVRow_NEON(const uint8 * src_u,const uint8 * src_v,uint8 * dst_uv,int width)603 void MergeUVRow_NEON(const uint8* src_u,
604                      const uint8* src_v,
605                      uint8* dst_uv,
606                      int width) {
607   asm volatile (
608   "1:                                          \n"
609     MEMACCESS(0)
610     "ld1        {v0.16b}, [%0], #16            \n"  // load U
611     MEMACCESS(1)
612     "ld1        {v1.16b}, [%1], #16            \n"  // load V
613     "subs       %w3, %w3, #16                  \n"  // 16 processed per loop
614     MEMACCESS(2)
615     "st2        {v0.16b,v1.16b}, [%2], #32     \n"  // store 16 pairs of UV
616     "b.gt       1b                             \n"
617     :
618       "+r"(src_u),   // %0
619       "+r"(src_v),   // %1
620       "+r"(dst_uv),  // %2
621       "+r"(width)    // %3  // Output registers
622     :                       // Input registers
623     : "cc", "memory", "v0", "v1"  // Clobber List
624   );
625 }
626 
627 // Copy multiple of 32.  vld4.8  allow unaligned and is fastest on a15.
CopyRow_NEON(const uint8 * src,uint8 * dst,int count)628 void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
629   asm volatile (
630   "1:                                          \n"
631     MEMACCESS(0)
632     "ld1        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32       \n"  // load 32
633     "subs       %w2, %w2, #32                  \n"  // 32 processed per loop
634     MEMACCESS(1)
635     "st1        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32       \n"  // store 32
636     "b.gt       1b                             \n"
637   : "+r"(src),   // %0
638     "+r"(dst),   // %1
639     "+r"(count)  // %2  // Output registers
640   :                     // Input registers
641   : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
642   );
643 }
644 
645 // SetRow writes 'count' bytes using an 8 bit value repeated.
SetRow_NEON(uint8 * dst,uint8 v8,int count)646 void SetRow_NEON(uint8* dst, uint8 v8, int count) {
647   asm volatile (
648     "dup        v0.16b, %w2                    \n"  // duplicate 16 bytes
649   "1:                                          \n"
650     "subs       %w1, %w1, #16                  \n"  // 16 bytes per loop
651     MEMACCESS(0)
652     "st1        {v0.16b}, [%0], #16            \n"  // store
653     "b.gt       1b                             \n"
654   : "+r"(dst),   // %0
655     "+r"(count)  // %1
656   : "r"(v8)      // %2
657   : "cc", "memory", "v0"
658   );
659 }
660 
ARGBSetRow_NEON(uint8 * dst,uint32 v32,int count)661 void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
662   asm volatile (
663     "dup        v0.4s, %w2                     \n"  // duplicate 4 ints
664   "1:                                          \n"
665     "subs       %w1, %w1, #4                   \n"  // 4 ints per loop
666     MEMACCESS(0)
667     "st1        {v0.16b}, [%0], #16            \n"  // store
668     "b.gt       1b                             \n"
669   : "+r"(dst),   // %0
670     "+r"(count)  // %1
671   : "r"(v32)     // %2
672   : "cc", "memory", "v0"
673   );
674 }
675 
MirrorRow_NEON(const uint8 * src,uint8 * dst,int width)676 void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
677   asm volatile (
678     // Start at end of source row.
679     "add        %0, %0, %w2, sxtw              \n"
680     "sub        %0, %0, #16                    \n"
681   "1:                                          \n"
682     MEMACCESS(0)
683     "ld1        {v0.16b}, [%0], %3             \n"  // src -= 16
684     "subs       %w2, %w2, #16                  \n"  // 16 pixels per loop.
685     "rev64      v0.16b, v0.16b                 \n"
686     MEMACCESS(1)
687     "st1        {v0.D}[1], [%1], #8            \n"  // dst += 16
688     MEMACCESS(1)
689     "st1        {v0.D}[0], [%1], #8            \n"
690     "b.gt       1b                             \n"
691   : "+r"(src),   // %0
692     "+r"(dst),   // %1
693     "+r"(width)  // %2
694   : "r"((ptrdiff_t)-16)    // %3
695   : "cc", "memory", "v0"
696   );
697 }
698 
MirrorUVRow_NEON(const uint8 * src_uv,uint8 * dst_u,uint8 * dst_v,int width)699 void MirrorUVRow_NEON(const uint8* src_uv,
700                       uint8* dst_u,
701                       uint8* dst_v,
702                       int width) {
703   asm volatile (
704     // Start at end of source row.
705     "add        %0, %0, %w3, sxtw #1           \n"
706     "sub        %0, %0, #16                    \n"
707   "1:                                          \n"
708     MEMACCESS(0)
709     "ld2        {v0.8b, v1.8b}, [%0], %4       \n"  // src -= 16
710     "subs       %w3, %w3, #8                   \n"  // 8 pixels per loop.
711     "rev64      v0.8b, v0.8b                   \n"
712     "rev64      v1.8b, v1.8b                   \n"
713     MEMACCESS(1)
714     "st1        {v0.8b}, [%1], #8              \n"  // dst += 8
715     MEMACCESS(2)
716     "st1        {v1.8b}, [%2], #8              \n"
717     "b.gt       1b                             \n"
718   : "+r"(src_uv),  // %0
719     "+r"(dst_u),   // %1
720     "+r"(dst_v),   // %2
721     "+r"(width)    // %3
722   : "r"((ptrdiff_t)-16)      // %4
723   : "cc", "memory", "v0", "v1"
724   );
725 }
726 
ARGBMirrorRow_NEON(const uint8 * src,uint8 * dst,int width)727 void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
728   asm volatile (
729   // Start at end of source row.
730     "add        %0, %0, %w2, sxtw #2           \n"
731     "sub        %0, %0, #16                    \n"
732   "1:                                          \n"
733     MEMACCESS(0)
734     "ld1        {v0.16b}, [%0], %3             \n"  // src -= 16
735     "subs       %w2, %w2, #4                   \n"  // 4 pixels per loop.
736     "rev64      v0.4s, v0.4s                   \n"
737     MEMACCESS(1)
738     "st1        {v0.D}[1], [%1], #8            \n"  // dst += 16
739     MEMACCESS(1)
740     "st1        {v0.D}[0], [%1], #8            \n"
741     "b.gt       1b                             \n"
742   : "+r"(src),   // %0
743     "+r"(dst),   // %1
744     "+r"(width)  // %2
745   : "r"((ptrdiff_t)-16)    // %3
746   : "cc", "memory", "v0"
747   );
748 }
749 
RGB24ToARGBRow_NEON(const uint8 * src_rgb24,uint8 * dst_argb,int width)750 void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) {
751   asm volatile (
752     "movi       v4.8b, #255                    \n"  // Alpha
753   "1:                                          \n"
754     MEMACCESS(0)
755     "ld3        {v1.8b,v2.8b,v3.8b}, [%0], #24 \n"  // load 8 pixels of RGB24.
756     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
757     MEMACCESS(1)
758     "st4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n"  // store 8 ARGB pixels
759     "b.gt       1b                             \n"
760   : "+r"(src_rgb24),  // %0
761     "+r"(dst_argb),   // %1
762     "+r"(width)       // %2
763   :
764   : "cc", "memory", "v1", "v2", "v3", "v4"  // Clobber List
765   );
766 }
767 
RAWToARGBRow_NEON(const uint8 * src_raw,uint8 * dst_argb,int width)768 void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) {
769   asm volatile (
770     "movi       v5.8b, #255                    \n"  // Alpha
771   "1:                                          \n"
772     MEMACCESS(0)
773     "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // read r g b
774     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
775     "orr        v3.8b, v1.8b, v1.8b            \n"  // move g
776     "orr        v4.8b, v0.8b, v0.8b            \n"  // move r
777     MEMACCESS(1)
778     "st4        {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n"  // store b g r a
779     "b.gt       1b                             \n"
780   : "+r"(src_raw),   // %0
781     "+r"(dst_argb),  // %1
782     "+r"(width)      // %2
783   :
784   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"  // Clobber List
785   );
786 }
787 
RAWToRGB24Row_NEON(const uint8 * src_raw,uint8 * dst_rgb24,int width)788 void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) {
789   asm volatile (
790   "1:                                          \n"
791     MEMACCESS(0)
792     "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // read r g b
793     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
794     "orr        v3.8b, v1.8b, v1.8b            \n"  // move g
795     "orr        v4.8b, v0.8b, v0.8b            \n"  // move r
796     MEMACCESS(1)
797     "st3        {v2.8b,v3.8b,v4.8b}, [%1], #24 \n"  // store b g r
798     "b.gt       1b                             \n"
799   : "+r"(src_raw),    // %0
800     "+r"(dst_rgb24),  // %1
801     "+r"(width)       // %2
802   :
803   : "cc", "memory", "v0", "v1", "v2", "v3", "v4"  // Clobber List
804   );
805 }
806 
// Expands 8 RGB565 pixels held in v0.8h to 8 bit channels.  Each field is
// shifted to the top of a byte and its own high bits are ORed into the vacated
// low bits.
// Input:  v0.8h.
// Output: v0 = B, v1 = G, v2 = R (low 8 bytes of each register).
// Scratch: v4, v6.
#define RGB565TOARGB                                                        \
  "shrn       v6.8b, v0.8h, #5               \n" /* G xxGGGGGG           */ \
  "shl        v6.8b, v6.8b, #2               \n" /* G GGGGGG00 upper 6   */ \
  "ushr       v4.8b, v6.8b, #6               \n" /* G 000000GG lower 2   */ \
  "orr        v1.8b, v4.8b, v6.8b            \n" /* G                    */ \
  "xtn        v2.8b, v0.8h                   \n" /* B xxxBBBBB           */ \
  "ushr       v0.8h, v0.8h, #11              \n" /* R 000RRRRR           */ \
  "xtn2       v2.16b,v0.8h                   \n" /* R in upper part      */ \
  "shl        v2.16b, v2.16b, #3             \n" /* R,B BBBBB000 upper 5 */ \
  "ushr       v0.16b, v2.16b, #5             \n" /* R,B 00000BBB lower 3 */ \
  "orr        v0.16b, v0.16b, v2.16b         \n" /* R,B                  */ \
  "dup        v2.2D, v0.D[1]                 \n" /* R                    */
819 
RGB565ToARGBRow_NEON(const uint8 * src_rgb565,uint8 * dst_argb,int width)820 void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) {
821   asm volatile (
822     "movi       v3.8b, #255                    \n"  // Alpha
823   "1:                                          \n"
824     MEMACCESS(0)
825     "ld1        {v0.16b}, [%0], #16            \n"  // load 8 RGB565 pixels.
826     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
827     RGB565TOARGB
828     MEMACCESS(1)
829     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
830     "b.gt       1b                             \n"
831   : "+r"(src_rgb565),  // %0
832     "+r"(dst_argb),    // %1
833     "+r"(width)          // %2
834   :
835   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6"  // Clobber List
836   );
837 }
838 
// Expands 8 ARGB1555 pixels held in v0.8h to 8 bit channels.  The 5 bit
// fields are shifted to the top of a byte with their high 3 bits ORed into the
// low bits; alpha comes from an arithmetic shift of bit 15, giving 0x00 or
// 0xFF.
// Input:  v0.8h.
// Output: v0 = B, v1 = G, v2 = R, v3 = A (low 8 bytes of each register).
// All four registers are fully overwritten; no other registers are used.
#define ARGB1555TOARGB                                                      \
  "ushr       v2.8h, v0.8h, #10              \n" /* R xxxRRRRR           */ \
  "shl        v2.8h, v2.8h, #3               \n" /* R RRRRR000 upper 5   */ \
  "xtn        v3.8b, v2.8h                   \n" /* RRRRR000 AAAAAAAA    */ \
                                                                            \
  "sshr       v2.8h, v0.8h, #15              \n" /* A AAAAAAAA           */ \
  "xtn2       v3.16b, v2.8h                  \n"                            \
                                                                            \
  "xtn        v2.8b, v0.8h                   \n" /* B xxxBBBBB           */ \
  "shrn2      v2.16b,v0.8h, #5               \n" /* G xxxGGGGG           */ \
                                                                            \
  "ushr       v1.16b, v3.16b, #5             \n" /* R,A 00000RRR lower 3 */ \
  "shl        v0.16b, v2.16b, #3             \n" /* B,G BBBBB000 upper 5 */ \
  "ushr       v2.16b, v0.16b, #5             \n" /* B,G 00000BBB lower 3 */ \
                                                                            \
  "orr        v0.16b, v0.16b, v2.16b         \n" /* B,G                  */ \
  "orr        v2.16b, v1.16b, v3.16b         \n" /* R,A                  */ \
  "dup        v1.2D, v0.D[1]                 \n"                            \
  "dup        v3.2D, v2.D[1]                 \n"
858 
// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
// Input:  v0.8h (8 pixels).
// Output: v0 = B, v1 = G, v2 = R (low 8 bytes of each register).
// v3 is overwritten as scratch, so it cannot carry a value across this macro.
#define RGB555TOARGB                                                        \
  "ushr       v2.8h, v0.8h, #10              \n" /* R xxxRRRRR           */ \
  "shl        v2.8h, v2.8h, #3               \n" /* R RRRRR000 upper 5   */ \
  "xtn        v3.8b, v2.8h                   \n" /* RRRRR000             */ \
                                                                            \
  "xtn        v2.8b, v0.8h                   \n" /* B xxxBBBBB           */ \
  "shrn2      v2.16b,v0.8h, #5               \n" /* G xxxGGGGG           */ \
                                                                            \
  "ushr       v1.16b, v3.16b, #5             \n" /* R   00000RRR lower 3 */ \
  "shl        v0.16b, v2.16b, #3             \n" /* B,G BBBBB000 upper 5 */ \
  "ushr       v2.16b, v0.16b, #5             \n" /* B,G 00000BBB lower 3 */ \
                                                                            \
  "orr        v0.16b, v0.16b, v2.16b         \n" /* B,G                  */ \
  "orr        v2.16b, v1.16b, v3.16b         \n" /* R                    */ \
  "dup        v1.2D, v0.D[1]                 \n" /* G */
875 
ARGB1555ToARGBRow_NEON(const uint8 * src_argb1555,uint8 * dst_argb,int width)876 void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555,
877                             uint8* dst_argb,
878                             int width) {
879   asm volatile (
880     "movi       v3.8b, #255                    \n"  // Alpha
881   "1:                                          \n"
882     MEMACCESS(0)
883     "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB1555 pixels.
884     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
885     ARGB1555TOARGB
886     MEMACCESS(1)
887     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
888     "b.gt       1b                             \n"
889   : "+r"(src_argb1555),  // %0
890     "+r"(dst_argb),    // %1
891     "+r"(width)          // %2
892   :
893   : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
894   );
895 }
896 
// Expands 8 ARGB4444 pixels held in v0.8h to 8 bit channels by duplicating
// each 4 bit field into both nibbles of a byte.
// Input:  v0.8h.
// Output: v0 = B, v1 = G, v2 = R, v3 = A (low 8 bytes of each register).
// No registers other than v0..v3 are used.
#define ARGB4444TOARGB                                                      \
  "shrn       v1.8b,  v0.8h, #8              \n" /* v1(l) AR             */ \
  "xtn2       v1.16b, v0.8h                  \n" /* v1(h) GB             */ \
  "shl        v2.16b, v1.16b, #4             \n" /* B,R BBBB0000         */ \
  "ushr       v3.16b, v1.16b, #4             \n" /* G,A 0000GGGG         */ \
  "ushr       v0.16b, v2.16b, #4             \n" /* B,R 0000BBBB         */ \
  "shl        v1.16b, v3.16b, #4             \n" /* G,A GGGG0000         */ \
  "orr        v2.16b, v0.16b, v2.16b         \n" /* B,R BBBBBBBB         */ \
  "orr        v3.16b, v1.16b, v3.16b         \n" /* G,A GGGGGGGG         */ \
  "dup        v0.2D, v2.D[1]                 \n"                            \
  "dup        v1.2D, v3.D[1]                 \n"
908 
ARGB4444ToARGBRow_NEON(const uint8 * src_argb4444,uint8 * dst_argb,int width)909 void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444,
910                             uint8* dst_argb,
911                             int width) {
912   asm volatile (
913   "1:                                          \n"
914     MEMACCESS(0)
915     "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB4444 pixels.
916     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
917     ARGB4444TOARGB
918     MEMACCESS(1)
919     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
920     "b.gt       1b                             \n"
921   : "+r"(src_argb4444),  // %0
922     "+r"(dst_argb),    // %1
923     "+r"(width)          // %2
924   :
925   : "cc", "memory", "v0", "v1", "v2", "v3", "v4"  // Clobber List
926   );
927 }
928 
ARGBToRGB24Row_NEON(const uint8 * src_argb,uint8 * dst_rgb24,int width)929 void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) {
930   asm volatile (
931   "1:                                          \n"
932     MEMACCESS(0)
933     "ld4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n"  // load 8 ARGB pixels
934     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
935     MEMACCESS(1)
936     "st3        {v1.8b,v2.8b,v3.8b}, [%1], #24 \n"  // store 8 pixels of RGB24.
937     "b.gt       1b                             \n"
938   : "+r"(src_argb),   // %0
939     "+r"(dst_rgb24),  // %1
940     "+r"(width)         // %2
941   :
942   : "cc", "memory", "v1", "v2", "v3", "v4"  // Clobber List
943   );
944 }
945 
ARGBToRAWRow_NEON(const uint8 * src_argb,uint8 * dst_raw,int width)946 void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) {
947   asm volatile (
948   "1:                                          \n"
949     MEMACCESS(0)
950     "ld4        {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n"  // load b g r a
951     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
952     "orr        v4.8b, v2.8b, v2.8b            \n"  // mov g
953     "orr        v5.8b, v1.8b, v1.8b            \n"  // mov b
954     MEMACCESS(1)
955     "st3        {v3.8b,v4.8b,v5.8b}, [%1], #24 \n"  // store r g b
956     "b.gt       1b                             \n"
957   : "+r"(src_argb),  // %0
958     "+r"(dst_raw),   // %1
959     "+r"(width)        // %2
960   :
961   : "cc", "memory", "v1", "v2", "v3", "v4", "v5"  // Clobber List
962   );
963 }
964 
YUY2ToYRow_NEON(const uint8 * src_yuy2,uint8 * dst_y,int width)965 void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) {
966   asm volatile (
967   "1:                                          \n"
968     MEMACCESS(0)
969     "ld2        {v0.16b,v1.16b}, [%0], #32     \n"  // load 16 pixels of YUY2.
970     "subs       %w2, %w2, #16                  \n"  // 16 processed per loop.
971     MEMACCESS(1)
972     "st1        {v0.16b}, [%1], #16            \n"  // store 16 pixels of Y.
973     "b.gt       1b                             \n"
974   : "+r"(src_yuy2),  // %0
975     "+r"(dst_y),     // %1
976     "+r"(width)        // %2
977   :
978   : "cc", "memory", "v0", "v1"  // Clobber List
979   );
980 }
981 
UYVYToYRow_NEON(const uint8 * src_uyvy,uint8 * dst_y,int width)982 void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) {
983   asm volatile (
984   "1:                                          \n"
985     MEMACCESS(0)
986     "ld2        {v0.16b,v1.16b}, [%0], #32     \n"  // load 16 pixels of UYVY.
987     "subs       %w2, %w2, #16                  \n"  // 16 processed per loop.
988     MEMACCESS(1)
989     "st1        {v1.16b}, [%1], #16            \n"  // store 16 pixels of Y.
990     "b.gt       1b                             \n"
991   : "+r"(src_uyvy),  // %0
992     "+r"(dst_y),     // %1
993     "+r"(width)        // %2
994   :
995   : "cc", "memory", "v0", "v1"  // Clobber List
996   );
997 }
998 
YUY2ToUV422Row_NEON(const uint8 * src_yuy2,uint8 * dst_u,uint8 * dst_v,int width)999 void YUY2ToUV422Row_NEON(const uint8* src_yuy2,
1000                          uint8* dst_u,
1001                          uint8* dst_v,
1002                          int width) {
1003   asm volatile (
1004   "1:                                          \n"
1005     MEMACCESS(0)
1006     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 YUY2 pixels
1007     "subs       %w3, %w3, #16                  \n"  // 16 pixels = 8 UVs.
1008     MEMACCESS(1)
1009     "st1        {v1.8b}, [%1], #8              \n"  // store 8 U.
1010     MEMACCESS(2)
1011     "st1        {v3.8b}, [%2], #8              \n"  // store 8 V.
1012     "b.gt       1b                             \n"
1013   : "+r"(src_yuy2),  // %0
1014     "+r"(dst_u),     // %1
1015     "+r"(dst_v),     // %2
1016     "+r"(width)        // %3
1017   :
1018   : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
1019   );
1020 }
1021 
UYVYToUV422Row_NEON(const uint8 * src_uyvy,uint8 * dst_u,uint8 * dst_v,int width)1022 void UYVYToUV422Row_NEON(const uint8* src_uyvy,
1023                          uint8* dst_u,
1024                          uint8* dst_v,
1025                          int width) {
1026   asm volatile (
1027   "1:                                          \n"
1028     MEMACCESS(0)
1029     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 UYVY pixels
1030     "subs       %w3, %w3, #16                  \n"  // 16 pixels = 8 UVs.
1031     MEMACCESS(1)
1032     "st1        {v0.8b}, [%1], #8              \n"  // store 8 U.
1033     MEMACCESS(2)
1034     "st1        {v2.8b}, [%2], #8              \n"  // store 8 V.
1035     "b.gt       1b                             \n"
1036   : "+r"(src_uyvy),  // %0
1037     "+r"(dst_u),     // %1
1038     "+r"(dst_v),     // %2
1039     "+r"(width)        // %3
1040   :
1041   : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
1042   );
1043 }
1044 
YUY2ToUVRow_NEON(const uint8 * src_yuy2,int stride_yuy2,uint8 * dst_u,uint8 * dst_v,int width)1045 void YUY2ToUVRow_NEON(const uint8* src_yuy2,
1046                       int stride_yuy2,
1047                       uint8* dst_u,
1048                       uint8* dst_v,
1049                       int width) {
1050   const uint8* src_yuy2b = src_yuy2 + stride_yuy2;
1051   asm volatile (
1052   "1:                                          \n"
1053     MEMACCESS(0)
1054     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 pixels
1055     "subs       %w4, %w4, #16                  \n"  // 16 pixels = 8 UVs.
1056     MEMACCESS(1)
1057     "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load next row
1058     "urhadd     v1.8b, v1.8b, v5.8b            \n"  // average rows of U
1059     "urhadd     v3.8b, v3.8b, v7.8b            \n"  // average rows of V
1060     MEMACCESS(2)
1061     "st1        {v1.8b}, [%2], #8              \n"  // store 8 U.
1062     MEMACCESS(3)
1063     "st1        {v3.8b}, [%3], #8              \n"  // store 8 V.
1064     "b.gt       1b                             \n"
1065   : "+r"(src_yuy2),     // %0
1066     "+r"(src_yuy2b),    // %1
1067     "+r"(dst_u),        // %2
1068     "+r"(dst_v),        // %3
1069     "+r"(width)           // %4
1070   :
1071   : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
1072     "v5", "v6", "v7"  // Clobber List
1073   );
1074 }
1075 
UYVYToUVRow_NEON(const uint8 * src_uyvy,int stride_uyvy,uint8 * dst_u,uint8 * dst_v,int width)1076 void UYVYToUVRow_NEON(const uint8* src_uyvy,
1077                       int stride_uyvy,
1078                       uint8* dst_u,
1079                       uint8* dst_v,
1080                       int width) {
1081   const uint8* src_uyvyb = src_uyvy + stride_uyvy;
1082   asm volatile (
1083   "1:                                          \n"
1084     MEMACCESS(0)
1085     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 16 pixels
1086     "subs       %w4, %w4, #16                  \n"  // 16 pixels = 8 UVs.
1087     MEMACCESS(1)
1088     "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load next row
1089     "urhadd     v0.8b, v0.8b, v4.8b            \n"  // average rows of U
1090     "urhadd     v2.8b, v2.8b, v6.8b            \n"  // average rows of V
1091     MEMACCESS(2)
1092     "st1        {v0.8b}, [%2], #8              \n"  // store 8 U.
1093     MEMACCESS(3)
1094     "st1        {v2.8b}, [%3], #8              \n"  // store 8 V.
1095     "b.gt       1b                             \n"
1096   : "+r"(src_uyvy),     // %0
1097     "+r"(src_uyvyb),    // %1
1098     "+r"(dst_u),        // %2
1099     "+r"(dst_v),        // %3
1100     "+r"(width)           // %4
1101   :
1102   : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
1103     "v5", "v6", "v7"  // Clobber List
1104   );
1105 }
1106 
1107 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
ARGBShuffleRow_NEON(const uint8 * src_argb,uint8 * dst_argb,const uint8 * shuffler,int width)1108 void ARGBShuffleRow_NEON(const uint8* src_argb,
1109                          uint8* dst_argb,
1110                          const uint8* shuffler,
1111                          int width) {
1112   asm volatile (
1113     MEMACCESS(3)
1114     "ld1        {v2.16b}, [%3]                 \n"  // shuffler
1115   "1:                                          \n"
1116     MEMACCESS(0)
1117     "ld1        {v0.16b}, [%0], #16            \n"  // load 4 pixels.
1118     "subs       %w2, %w2, #4                   \n"  // 4 processed per loop
1119     "tbl        v1.16b, {v0.16b}, v2.16b       \n"  // look up 4 pixels
1120     MEMACCESS(1)
1121     "st1        {v1.16b}, [%1], #16            \n"  // store 4.
1122     "b.gt       1b                             \n"
1123   : "+r"(src_argb),  // %0
1124     "+r"(dst_argb),  // %1
1125     "+r"(width)        // %2
1126   : "r"(shuffler)    // %3
1127   : "cc", "memory", "v0", "v1", "v2"  // Clobber List
1128   );
1129 }
1130 
I422ToYUY2Row_NEON(const uint8 * src_y,const uint8 * src_u,const uint8 * src_v,uint8 * dst_yuy2,int width)1131 void I422ToYUY2Row_NEON(const uint8* src_y,
1132                         const uint8* src_u,
1133                         const uint8* src_v,
1134                         uint8* dst_yuy2,
1135                         int width) {
1136   asm volatile (
1137   "1:                                          \n"
1138     MEMACCESS(0)
1139     "ld2        {v0.8b, v1.8b}, [%0], #16      \n"  // load 16 Ys
1140     "orr        v2.8b, v1.8b, v1.8b            \n"
1141     MEMACCESS(1)
1142     "ld1        {v1.8b}, [%1], #8              \n"  // load 8 Us
1143     MEMACCESS(2)
1144     "ld1        {v3.8b}, [%2], #8              \n"  // load 8 Vs
1145     "subs       %w4, %w4, #16                  \n"  // 16 pixels
1146     MEMACCESS(3)
1147     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n"  // Store 16 pixels.
1148     "b.gt       1b                             \n"
1149   : "+r"(src_y),     // %0
1150     "+r"(src_u),     // %1
1151     "+r"(src_v),     // %2
1152     "+r"(dst_yuy2),  // %3
1153     "+r"(width)      // %4
1154   :
1155   : "cc", "memory", "v0", "v1", "v2", "v3"
1156   );
1157 }
1158 
I422ToUYVYRow_NEON(const uint8 * src_y,const uint8 * src_u,const uint8 * src_v,uint8 * dst_uyvy,int width)1159 void I422ToUYVYRow_NEON(const uint8* src_y,
1160                         const uint8* src_u,
1161                         const uint8* src_v,
1162                         uint8* dst_uyvy,
1163                         int width) {
1164   asm volatile (
1165   "1:                                          \n"
1166     MEMACCESS(0)
1167     "ld2        {v1.8b,v2.8b}, [%0], #16       \n"  // load 16 Ys
1168     "orr        v3.8b, v2.8b, v2.8b            \n"
1169     MEMACCESS(1)
1170     "ld1        {v0.8b}, [%1], #8              \n"  // load 8 Us
1171     MEMACCESS(2)
1172     "ld1        {v2.8b}, [%2], #8              \n"  // load 8 Vs
1173     "subs       %w4, %w4, #16                  \n"  // 16 pixels
1174     MEMACCESS(3)
1175     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n"  // Store 16 pixels.
1176     "b.gt       1b                             \n"
1177   : "+r"(src_y),     // %0
1178     "+r"(src_u),     // %1
1179     "+r"(src_v),     // %2
1180     "+r"(dst_uyvy),  // %3
1181     "+r"(width)      // %4
1182   :
1183   : "cc", "memory", "v0", "v1", "v2", "v3"
1184   );
1185 }
1186 
ARGBToRGB565Row_NEON(const uint8 * src_argb,uint8 * dst_rgb565,int width)1187 void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) {
1188   asm volatile (
1189   "1:                                          \n"
1190     MEMACCESS(0)
1191     "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
1192     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
1193     ARGBTORGB565
1194     MEMACCESS(1)
1195     "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels RGB565.
1196     "b.gt       1b                             \n"
1197   : "+r"(src_argb),  // %0
1198     "+r"(dst_rgb565),  // %1
1199     "+r"(width)        // %2
1200   :
1201   : "cc", "memory", "v0", "v20", "v21", "v22", "v23"
1202   );
1203 }
1204 
ARGBToRGB565DitherRow_NEON(const uint8 * src_argb,uint8 * dst_rgb,const uint32 dither4,int width)1205 void ARGBToRGB565DitherRow_NEON(const uint8* src_argb,
1206                                 uint8* dst_rgb,
1207                                 const uint32 dither4,
1208                                 int width) {
1209   asm volatile (
1210     "dup        v1.4s, %w2                     \n"  // dither4
1211   "1:                                          \n"
1212     MEMACCESS(1)
1213     "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"  // load 8 pixels
1214     "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
1215     "uqadd      v20.8b, v20.8b, v1.8b          \n"
1216     "uqadd      v21.8b, v21.8b, v1.8b          \n"
1217     "uqadd      v22.8b, v22.8b, v1.8b          \n"
1218     ARGBTORGB565
1219     MEMACCESS(0)
1220     "st1        {v0.16b}, [%0], #16            \n"  // store 8 pixels RGB565.
1221     "b.gt       1b                             \n"
1222   : "+r"(dst_rgb)    // %0
1223   : "r"(src_argb),   // %1
1224     "r"(dither4),    // %2
1225     "r"(width)       // %3
1226   : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23"
1227   );
1228 }
1229 
ARGBToARGB1555Row_NEON(const uint8 * src_argb,uint8 * dst_argb1555,int width)1230 void ARGBToARGB1555Row_NEON(const uint8* src_argb,
1231                             uint8* dst_argb1555,
1232                             int width) {
1233   asm volatile (
1234   "1:                                          \n"
1235     MEMACCESS(0)
1236     "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
1237     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
1238     ARGBTOARGB1555
1239     MEMACCESS(1)
1240     "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels ARGB1555.
1241     "b.gt       1b                             \n"
1242   : "+r"(src_argb),  // %0
1243     "+r"(dst_argb1555),  // %1
1244     "+r"(width)        // %2
1245   :
1246   : "cc", "memory", "v0", "v20", "v21", "v22", "v23"
1247   );
1248 }
1249 
ARGBToARGB4444Row_NEON(const uint8 * src_argb,uint8 * dst_argb4444,int width)1250 void ARGBToARGB4444Row_NEON(const uint8* src_argb,
1251                             uint8* dst_argb4444,
1252                             int width) {
1253   asm volatile (
1254     "movi       v4.16b, #0x0f                  \n"  // bits to clear with vbic.
1255   "1:                                          \n"
1256     MEMACCESS(0)
1257     "ld4        {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n"  // load 8 pixels
1258     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
1259     ARGBTOARGB4444
1260     MEMACCESS(1)
1261     "st1        {v0.16b}, [%1], #16            \n"  // store 8 pixels ARGB4444.
1262     "b.gt       1b                             \n"
1263   : "+r"(src_argb),      // %0
1264     "+r"(dst_argb4444),  // %1
1265     "+r"(width)            // %2
1266   :
1267   : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23"
1268   );
1269 }
1270 
ARGBToYRow_NEON(const uint8 * src_argb,uint8 * dst_y,int width)1271 void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
1272   asm volatile (
1273     "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
1274     "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
1275     "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
1276     "movi       v7.8b, #16                     \n"  // Add 16 constant
1277   "1:                                          \n"
1278     MEMACCESS(0)
1279     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
1280     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
1281     "umull      v3.8h, v0.8b, v4.8b            \n"  // B
1282     "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
1283     "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
1284     "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
1285     "uqadd      v0.8b, v0.8b, v7.8b            \n"
1286     MEMACCESS(1)
1287     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
1288     "b.gt       1b                             \n"
1289   : "+r"(src_argb),  // %0
1290     "+r"(dst_y),     // %1
1291     "+r"(width)        // %2
1292   :
1293   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
1294   );
1295 }
1296 
ARGBExtractAlphaRow_NEON(const uint8 * src_argb,uint8 * dst_a,int width)1297 void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) {
1298   asm volatile (
1299   "1:                                          \n"
1300     MEMACCESS(0)
1301     "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load row 16 pixels
1302     "subs       %w2, %w2, #16                  \n"  // 16 processed per loop
1303     MEMACCESS(1)
1304     "st1        {v3.16b}, [%1], #16            \n"  // store 16 A's.
1305     "b.gt       1b                             \n"
1306   : "+r"(src_argb),   // %0
1307     "+r"(dst_a),      // %1
1308     "+r"(width)       // %2
1309   :
1310   : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
1311   );
1312 }
1313 
ARGBToYJRow_NEON(const uint8 * src_argb,uint8 * dst_y,int width)1314 void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
1315   asm volatile (
1316     "movi       v4.8b, #15                     \n"  // B * 0.11400 coefficient
1317     "movi       v5.8b, #75                     \n"  // G * 0.58700 coefficient
1318     "movi       v6.8b, #38                     \n"  // R * 0.29900 coefficient
1319   "1:                                          \n"
1320     MEMACCESS(0)
1321     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
1322     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
1323     "umull      v3.8h, v0.8b, v4.8b            \n"  // B
1324     "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
1325     "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
1326     "sqrshrun   v0.8b, v3.8h, #7               \n"  // 15 bit to 8 bit Y
1327     MEMACCESS(1)
1328     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
1329     "b.gt       1b                             \n"
1330   : "+r"(src_argb),  // %0
1331     "+r"(dst_y),     // %1
1332     "+r"(width)        // %2
1333   :
1334   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
1335   );
1336 }
1337 
1338 // 8x1 pixels.
ARGBToUV444Row_NEON(const uint8 * src_argb,uint8 * dst_u,uint8 * dst_v,int width)1339 void ARGBToUV444Row_NEON(const uint8* src_argb,
1340                          uint8* dst_u,
1341                          uint8* dst_v,
1342                          int width) {
1343   asm volatile (
1344     "movi       v24.8b, #112                   \n"  // UB / VR 0.875 coefficient
1345     "movi       v25.8b, #74                    \n"  // UG -0.5781 coefficient
1346     "movi       v26.8b, #38                    \n"  // UR -0.2969 coefficient
1347     "movi       v27.8b, #18                    \n"  // VB -0.1406 coefficient
1348     "movi       v28.8b, #94                    \n"  // VG -0.7344 coefficient
1349     "movi       v29.16b,#0x80                  \n"  // 128.5
1350   "1:                                          \n"
1351     MEMACCESS(0)
1352     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
1353     "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
1354     "umull      v4.8h, v0.8b, v24.8b           \n"  // B
1355     "umlsl      v4.8h, v1.8b, v25.8b           \n"  // G
1356     "umlsl      v4.8h, v2.8b, v26.8b           \n"  // R
1357     "add        v4.8h, v4.8h, v29.8h           \n"  // +128 -> unsigned
1358 
1359     "umull      v3.8h, v2.8b, v24.8b           \n"  // R
1360     "umlsl      v3.8h, v1.8b, v28.8b           \n"  // G
1361     "umlsl      v3.8h, v0.8b, v27.8b           \n"  // B
1362     "add        v3.8h, v3.8h, v29.8h           \n"  // +128 -> unsigned
1363 
1364     "uqshrn     v0.8b, v4.8h, #8               \n"  // 16 bit to 8 bit U
1365     "uqshrn     v1.8b, v3.8h, #8               \n"  // 16 bit to 8 bit V
1366 
1367     MEMACCESS(1)
1368     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels U.
1369     MEMACCESS(2)
1370     "st1        {v1.8b}, [%2], #8              \n"  // store 8 pixels V.
1371     "b.gt       1b                             \n"
1372   : "+r"(src_argb),  // %0
1373     "+r"(dst_u),     // %1
1374     "+r"(dst_v),     // %2
1375     "+r"(width)        // %3
1376   :
1377   : "cc", "memory", "v0", "v1", "v2", "v3", "v4",
1378     "v24", "v25", "v26", "v27", "v28", "v29"
1379   );
1380 }
1381 
// Loads the 16-bit lane constants used by the subsampled RGB -> UV kernels:
//   v20 = 56 (UB and VR), v21 = 37 (UG), v22 = 19 (UR),
//   v23 = 9 (VB), v24 = 47 (VG), v25 = 0x8080 (rounding bias).
// The weights are half of the 112/74/38/18/94 set used for unsubsampled
// conversion because the kernels multiply them by channel values that are twice
// the 2x2 average (see the "2x average" step in the callers).
// Expand this once, before the loop label, inside the asm statement; the caller
// must list v20-v25 as clobbered.
#define RGBTOUV_SETUP_REG                                                  \
  "movi       v20.8h, #56, lsl #0  \n" /* UB/VR coefficient (0.875) / 2 */ \
  "movi       v21.8h, #37, lsl #0  \n" /* UG coefficient (-0.5781) / 2  */ \
  "movi       v22.8h, #19, lsl #0  \n" /* UR coefficient (-0.2969) / 2  */ \
  "movi       v23.8h, #9,  lsl #0  \n" /* VB coefficient (-0.1406) / 2  */ \
  "movi       v24.8h, #47, lsl #0  \n" /* VG coefficient (-0.7344) / 2  */ \
  "movi       v25.16b, #0x80       \n" /* 128.5 (0x8080 in 16-bit)      */

// 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
// Emits the asm that turns eight 16-bit B, G and R lanes into 8 U bytes in
// v0.8b and 8 V bytes in v1.8b:
//   U = (v20 * B - v21 * G - v22 * R + v25) >> 8
//   V = (v20 * R - v24 * G - v23 * B + v25) >> 8
// QB, QG, QR name the registers (e.g. v0.8h) holding the B, G and R lanes.
// v3 and v4 are overwritten as accumulators, so the inputs must not live there;
// they may live in v0-v2 because every input is read before v0/v1 are written.
// Expects v20-v25 to hold the constants loaded before the loop.
#define RGBTOUV(QB, QG, QR)                                                 \
  "mul        v3.8h, " #QB                                                  \
  ",v20.8h          \n" /* B                    */                          \
  "mul        v4.8h, " #QR                                                  \
  ",v20.8h          \n" /* R                    */                          \
  "mls        v3.8h, " #QG                                                  \
  ",v21.8h          \n" /* G                    */                          \
  "mls        v4.8h, " #QG                                                  \
  ",v24.8h          \n" /* G                    */                          \
  "mls        v3.8h, " #QR                                                  \
  ",v22.8h          \n" /* R                    */                          \
  "mls        v4.8h, " #QB                                                  \
  ",v23.8h          \n"                          /* B                    */ \
  "add        v3.8h, v3.8h, v25.8h           \n" /* +128 -> unsigned     */ \
  "add        v4.8h, v4.8h, v25.8h           \n" /* +128 -> unsigned     */ \
  "uqshrn     v0.8b, v3.8h, #8               \n" /* 16 bit to 8 bit U    */ \
  "uqshrn     v1.8b, v4.8h, #8               \n" /* 16 bit to 8 bit V    */

// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
// TODO(fbarchard): Consider ptrdiff_t for all strides.

ARGBToUVRow_NEON(const uint8 * src_argb,int src_stride_argb,uint8 * dst_u,uint8 * dst_v,int width)1412 void ARGBToUVRow_NEON(const uint8* src_argb,
1413                       int src_stride_argb,
1414                       uint8* dst_u,
1415                       uint8* dst_v,
1416                       int width) {
1417   const uint8* src_argb_1 = src_argb + src_stride_argb;
1418   asm volatile (
1419     RGBTOUV_SETUP_REG
1420   "1:                                          \n"
1421     MEMACCESS(0)
1422     "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
1423     "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
1424     "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
1425     "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
1426 
1427     MEMACCESS(1)
1428     "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load next 16
1429     "uadalp     v0.8h, v4.16b                  \n"  // B 16 bytes -> 8 shorts.
1430     "uadalp     v1.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
1431     "uadalp     v2.8h, v6.16b                  \n"  // R 16 bytes -> 8 shorts.
1432 
1433     "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
1434     "urshr      v1.8h, v1.8h, #1               \n"
1435     "urshr      v2.8h, v2.8h, #1               \n"
1436 
1437     "subs       %w4, %w4, #16                  \n"  // 32 processed per loop.
1438     RGBTOUV(v0.8h, v1.8h, v2.8h)
1439     MEMACCESS(2)
1440     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
1441     MEMACCESS(3)
1442     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
1443     "b.gt       1b                             \n"
1444   : "+r"(src_argb),  // %0
1445     "+r"(src_argb_1),  // %1
1446     "+r"(dst_u),     // %2
1447     "+r"(dst_v),     // %3
1448     "+r"(width)        // %4
1449   :
1450   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1451     "v20", "v21", "v22", "v23", "v24", "v25"
1452   );
1453 }
1454 
1455 // TODO(fbarchard): Subsample match C code.
ARGBToUVJRow_NEON(const uint8 * src_argb,int src_stride_argb,uint8 * dst_u,uint8 * dst_v,int width)1456 void ARGBToUVJRow_NEON(const uint8* src_argb,
1457                        int src_stride_argb,
1458                        uint8* dst_u,
1459                        uint8* dst_v,
1460                        int width) {
1461   const uint8* src_argb_1 = src_argb + src_stride_argb;
1462   asm volatile (
1463     "movi       v20.8h, #63, lsl #0            \n"  // UB/VR coeff (0.500) / 2
1464     "movi       v21.8h, #42, lsl #0            \n"  // UG coeff (-0.33126) / 2
1465     "movi       v22.8h, #21, lsl #0            \n"  // UR coeff (-0.16874) / 2
1466     "movi       v23.8h, #10, lsl #0            \n"  // VB coeff (-0.08131) / 2
1467     "movi       v24.8h, #53, lsl #0            \n"  // VG coeff (-0.41869) / 2
1468     "movi       v25.16b, #0x80                 \n"  // 128.5 (0x8080 in 16-bit)
1469   "1:                                          \n"
1470     MEMACCESS(0)
1471     "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
1472     "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
1473     "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
1474     "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
1475     MEMACCESS(1)
1476     "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64  \n"  // load next 16
1477     "uadalp     v0.8h, v4.16b                  \n"  // B 16 bytes -> 8 shorts.
1478     "uadalp     v1.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
1479     "uadalp     v2.8h, v6.16b                  \n"  // R 16 bytes -> 8 shorts.
1480 
1481     "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
1482     "urshr      v1.8h, v1.8h, #1               \n"
1483     "urshr      v2.8h, v2.8h, #1               \n"
1484 
1485     "subs       %w4, %w4, #16                  \n"  // 32 processed per loop.
1486     RGBTOUV(v0.8h, v1.8h, v2.8h)
1487     MEMACCESS(2)
1488     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
1489     MEMACCESS(3)
1490     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
1491     "b.gt       1b                             \n"
1492   : "+r"(src_argb),  // %0
1493     "+r"(src_argb_1),  // %1
1494     "+r"(dst_u),     // %2
1495     "+r"(dst_v),     // %3
1496     "+r"(width)        // %4
1497   :
1498   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1499     "v20", "v21", "v22", "v23", "v24", "v25"
1500   );
1501 }
1502 
BGRAToUVRow_NEON(const uint8 * src_bgra,int src_stride_bgra,uint8 * dst_u,uint8 * dst_v,int width)1503 void BGRAToUVRow_NEON(const uint8* src_bgra,
1504                       int src_stride_bgra,
1505                       uint8* dst_u,
1506                       uint8* dst_v,
1507                       int width) {
1508   const uint8* src_bgra_1 = src_bgra + src_stride_bgra;
1509   asm volatile (
1510     RGBTOUV_SETUP_REG
1511   "1:                                          \n"
1512     MEMACCESS(0)
1513     "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
1514     "uaddlp     v0.8h, v3.16b                  \n"  // B 16 bytes -> 8 shorts.
1515     "uaddlp     v3.8h, v2.16b                  \n"  // G 16 bytes -> 8 shorts.
1516     "uaddlp     v2.8h, v1.16b                  \n"  // R 16 bytes -> 8 shorts.
1517     MEMACCESS(1)
1518     "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more
1519     "uadalp     v0.8h, v7.16b                  \n"  // B 16 bytes -> 8 shorts.
1520     "uadalp     v3.8h, v6.16b                  \n"  // G 16 bytes -> 8 shorts.
1521     "uadalp     v2.8h, v5.16b                  \n"  // R 16 bytes -> 8 shorts.
1522 
1523     "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
1524     "urshr      v1.8h, v3.8h, #1               \n"
1525     "urshr      v2.8h, v2.8h, #1               \n"
1526 
1527     "subs       %w4, %w4, #16                  \n"  // 32 processed per loop.
1528     RGBTOUV(v0.8h, v1.8h, v2.8h)
1529     MEMACCESS(2)
1530     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
1531     MEMACCESS(3)
1532     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
1533     "b.gt       1b                             \n"
1534   : "+r"(src_bgra),  // %0
1535     "+r"(src_bgra_1),  // %1
1536     "+r"(dst_u),     // %2
1537     "+r"(dst_v),     // %3
1538     "+r"(width)        // %4
1539   :
1540   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1541     "v20", "v21", "v22", "v23", "v24", "v25"
1542   );
1543 }
1544 
ABGRToUVRow_NEON(const uint8 * src_abgr,int src_stride_abgr,uint8 * dst_u,uint8 * dst_v,int width)1545 void ABGRToUVRow_NEON(const uint8* src_abgr,
1546                       int src_stride_abgr,
1547                       uint8* dst_u,
1548                       uint8* dst_v,
1549                       int width) {
1550   const uint8* src_abgr_1 = src_abgr + src_stride_abgr;
1551   asm volatile (
1552     RGBTOUV_SETUP_REG
1553   "1:                                          \n"
1554     MEMACCESS(0)
1555     "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
1556     "uaddlp     v3.8h, v2.16b                  \n"  // B 16 bytes -> 8 shorts.
1557     "uaddlp     v2.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
1558     "uaddlp     v1.8h, v0.16b                  \n"  // R 16 bytes -> 8 shorts.
1559     MEMACCESS(1)
1560     "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more.
1561     "uadalp     v3.8h, v6.16b                  \n"  // B 16 bytes -> 8 shorts.
1562     "uadalp     v2.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
1563     "uadalp     v1.8h, v4.16b                  \n"  // R 16 bytes -> 8 shorts.
1564 
1565     "urshr      v0.8h, v3.8h, #1               \n"  // 2x average
1566     "urshr      v2.8h, v2.8h, #1               \n"
1567     "urshr      v1.8h, v1.8h, #1               \n"
1568 
1569     "subs       %w4, %w4, #16                  \n"  // 32 processed per loop.
1570     RGBTOUV(v0.8h, v2.8h, v1.8h)
1571     MEMACCESS(2)
1572     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
1573     MEMACCESS(3)
1574     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
1575     "b.gt       1b                             \n"
1576   : "+r"(src_abgr),  // %0
1577     "+r"(src_abgr_1),  // %1
1578     "+r"(dst_u),     // %2
1579     "+r"(dst_v),     // %3
1580     "+r"(width)        // %4
1581   :
1582   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1583     "v20", "v21", "v22", "v23", "v24", "v25"
1584   );
1585 }
1586 
RGBAToUVRow_NEON(const uint8 * src_rgba,int src_stride_rgba,uint8 * dst_u,uint8 * dst_v,int width)1587 void RGBAToUVRow_NEON(const uint8* src_rgba,
1588                       int src_stride_rgba,
1589                       uint8* dst_u,
1590                       uint8* dst_v,
1591                       int width) {
1592   const uint8* src_rgba_1 = src_rgba + src_stride_rgba;
1593   asm volatile (
1594     RGBTOUV_SETUP_REG
1595   "1:                                          \n"
1596     MEMACCESS(0)
1597     "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 pixels.
1598     "uaddlp     v0.8h, v1.16b                  \n"  // B 16 bytes -> 8 shorts.
1599     "uaddlp     v1.8h, v2.16b                  \n"  // G 16 bytes -> 8 shorts.
1600     "uaddlp     v2.8h, v3.16b                  \n"  // R 16 bytes -> 8 shorts.
1601     MEMACCESS(1)
1602     "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load 16 more.
1603     "uadalp     v0.8h, v5.16b                  \n"  // B 16 bytes -> 8 shorts.
1604     "uadalp     v1.8h, v6.16b                  \n"  // G 16 bytes -> 8 shorts.
1605     "uadalp     v2.8h, v7.16b                  \n"  // R 16 bytes -> 8 shorts.
1606 
1607     "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
1608     "urshr      v1.8h, v1.8h, #1               \n"
1609     "urshr      v2.8h, v2.8h, #1               \n"
1610 
1611     "subs       %w4, %w4, #16                  \n"  // 32 processed per loop.
1612     RGBTOUV(v0.8h, v1.8h, v2.8h)
1613     MEMACCESS(2)
1614     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
1615     MEMACCESS(3)
1616     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
1617     "b.gt       1b                             \n"
1618   : "+r"(src_rgba),  // %0
1619     "+r"(src_rgba_1),  // %1
1620     "+r"(dst_u),     // %2
1621     "+r"(dst_v),     // %3
1622     "+r"(width)        // %4
1623   :
1624   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1625     "v20", "v21", "v22", "v23", "v24", "v25"
1626   );
1627 }
1628 
RGB24ToUVRow_NEON(const uint8 * src_rgb24,int src_stride_rgb24,uint8 * dst_u,uint8 * dst_v,int width)1629 void RGB24ToUVRow_NEON(const uint8* src_rgb24,
1630                        int src_stride_rgb24,
1631                        uint8* dst_u,
1632                        uint8* dst_v,
1633                        int width) {
1634   const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24;
1635   asm volatile (
1636     RGBTOUV_SETUP_REG
1637   "1:                                          \n"
1638     MEMACCESS(0)
1639     "ld3        {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 16 pixels.
1640     "uaddlp     v0.8h, v0.16b                  \n"  // B 16 bytes -> 8 shorts.
1641     "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
1642     "uaddlp     v2.8h, v2.16b                  \n"  // R 16 bytes -> 8 shorts.
1643     MEMACCESS(1)
1644     "ld3        {v4.16b,v5.16b,v6.16b}, [%1], #48 \n"  // load 16 more.
1645     "uadalp     v0.8h, v4.16b                  \n"  // B 16 bytes -> 8 shorts.
1646     "uadalp     v1.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
1647     "uadalp     v2.8h, v6.16b                  \n"  // R 16 bytes -> 8 shorts.
1648 
1649     "urshr      v0.8h, v0.8h, #1               \n"  // 2x average
1650     "urshr      v1.8h, v1.8h, #1               \n"
1651     "urshr      v2.8h, v2.8h, #1               \n"
1652 
1653     "subs       %w4, %w4, #16                  \n"  // 32 processed per loop.
1654     RGBTOUV(v0.8h, v1.8h, v2.8h)
1655     MEMACCESS(2)
1656     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
1657     MEMACCESS(3)
1658     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
1659     "b.gt       1b                             \n"
1660   : "+r"(src_rgb24),  // %0
1661     "+r"(src_rgb24_1),  // %1
1662     "+r"(dst_u),     // %2
1663     "+r"(dst_v),     // %3
1664     "+r"(width)        // %4
1665   :
1666   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1667     "v20", "v21", "v22", "v23", "v24", "v25"
1668   );
1669 }
1670 
RAWToUVRow_NEON(const uint8 * src_raw,int src_stride_raw,uint8 * dst_u,uint8 * dst_v,int width)1671 void RAWToUVRow_NEON(const uint8* src_raw,
1672                      int src_stride_raw,
1673                      uint8* dst_u,
1674                      uint8* dst_v,
1675                      int width) {
1676   const uint8* src_raw_1 = src_raw + src_stride_raw;
1677   asm volatile (
1678     RGBTOUV_SETUP_REG
1679   "1:                                          \n"
1680     MEMACCESS(0)
1681     "ld3        {v0.16b,v1.16b,v2.16b}, [%0], #48 \n"  // load 8 RAW pixels.
1682     "uaddlp     v2.8h, v2.16b                  \n"  // B 16 bytes -> 8 shorts.
1683     "uaddlp     v1.8h, v1.16b                  \n"  // G 16 bytes -> 8 shorts.
1684     "uaddlp     v0.8h, v0.16b                  \n"  // R 16 bytes -> 8 shorts.
1685     MEMACCESS(1)
1686     "ld3        {v4.16b,v5.16b,v6.16b}, [%1], #48 \n"  // load 8 more RAW pixels
1687     "uadalp     v2.8h, v6.16b                  \n"  // B 16 bytes -> 8 shorts.
1688     "uadalp     v1.8h, v5.16b                  \n"  // G 16 bytes -> 8 shorts.
1689     "uadalp     v0.8h, v4.16b                  \n"  // R 16 bytes -> 8 shorts.
1690 
1691     "urshr      v2.8h, v2.8h, #1               \n"  // 2x average
1692     "urshr      v1.8h, v1.8h, #1               \n"
1693     "urshr      v0.8h, v0.8h, #1               \n"
1694 
1695     "subs       %w4, %w4, #16                  \n"  // 32 processed per loop.
1696     RGBTOUV(v2.8h, v1.8h, v0.8h)
1697     MEMACCESS(2)
1698     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
1699     MEMACCESS(3)
1700     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
1701     "b.gt       1b                             \n"
1702   : "+r"(src_raw),  // %0
1703     "+r"(src_raw_1),  // %1
1704     "+r"(dst_u),     // %2
1705     "+r"(dst_v),     // %3
1706     "+r"(width)        // %4
1707   :
1708   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1709     "v20", "v21", "v22", "v23", "v24", "v25"
1710   );
1711 }
1712 
1713 // 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
RGB565ToUVRow_NEON(const uint8 * src_rgb565,int src_stride_rgb565,uint8 * dst_u,uint8 * dst_v,int width)1714 void RGB565ToUVRow_NEON(const uint8* src_rgb565,
1715                         int src_stride_rgb565,
1716                         uint8* dst_u,
1717                         uint8* dst_v,
1718                         int width) {
1719   const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565;
1720   asm volatile (
1721     "movi       v22.8h, #56, lsl #0            \n"  // UB / VR coeff (0.875) / 2
1722     "movi       v23.8h, #37, lsl #0            \n"  // UG coeff (-0.5781) / 2
1723     "movi       v24.8h, #19, lsl #0            \n"  // UR coeff (-0.2969) / 2
1724     "movi       v25.8h, #9 , lsl #0            \n"  // VB coeff (-0.1406) / 2
1725     "movi       v26.8h, #47, lsl #0            \n"  // VG coeff (-0.7344) / 2
1726     "movi       v27.16b, #0x80                 \n"  // 128.5 (0x8080 in 16-bit)
1727   "1:                                          \n"
1728     MEMACCESS(0)
1729     "ld1        {v0.16b}, [%0], #16            \n"  // load 8 RGB565 pixels.
1730     RGB565TOARGB
1731     "uaddlp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
1732     "uaddlp     v18.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
1733     "uaddlp     v20.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
1734     MEMACCESS(0)
1735     "ld1        {v0.16b}, [%0], #16            \n"  // next 8 RGB565 pixels.
1736     RGB565TOARGB
1737     "uaddlp     v17.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
1738     "uaddlp     v19.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
1739     "uaddlp     v21.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
1740 
1741     MEMACCESS(1)
1742     "ld1        {v0.16b}, [%1], #16            \n"  // load 8 RGB565 pixels.
1743     RGB565TOARGB
1744     "uadalp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
1745     "uadalp     v18.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
1746     "uadalp     v20.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
1747     MEMACCESS(1)
1748     "ld1        {v0.16b}, [%1], #16            \n"  // next 8 RGB565 pixels.
1749     RGB565TOARGB
1750     "uadalp     v17.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
1751     "uadalp     v19.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
1752     "uadalp     v21.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
1753 
1754     "ins        v16.D[1], v17.D[0]             \n"
1755     "ins        v18.D[1], v19.D[0]             \n"
1756     "ins        v20.D[1], v21.D[0]             \n"
1757 
1758     "urshr      v4.8h, v16.8h, #1              \n"  // 2x average
1759     "urshr      v5.8h, v18.8h, #1              \n"
1760     "urshr      v6.8h, v20.8h, #1              \n"
1761 
1762     "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
1763     "mul        v16.8h, v4.8h, v22.8h          \n"  // B
1764     "mls        v16.8h, v5.8h, v23.8h          \n"  // G
1765     "mls        v16.8h, v6.8h, v24.8h          \n"  // R
1766     "add        v16.8h, v16.8h, v27.8h         \n"  // +128 -> unsigned
1767     "mul        v17.8h, v6.8h, v22.8h          \n"  // R
1768     "mls        v17.8h, v5.8h, v26.8h          \n"  // G
1769     "mls        v17.8h, v4.8h, v25.8h          \n"  // B
1770     "add        v17.8h, v17.8h, v27.8h         \n"  // +128 -> unsigned
1771     "uqshrn     v0.8b, v16.8h, #8              \n"  // 16 bit to 8 bit U
1772     "uqshrn     v1.8b, v17.8h, #8              \n"  // 16 bit to 8 bit V
1773     MEMACCESS(2)
1774     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
1775     MEMACCESS(3)
1776     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
1777     "b.gt       1b                             \n"
1778   : "+r"(src_rgb565),  // %0
1779     "+r"(src_rgb565_1),  // %1
1780     "+r"(dst_u),     // %2
1781     "+r"(dst_v),     // %3
1782     "+r"(width)        // %4
1783   :
1784   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
1785     "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
1786     "v25", "v26", "v27"
1787   );
1788 }
1789 
1790 // 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
ARGB1555ToUVRow_NEON(const uint8 * src_argb1555,int src_stride_argb1555,uint8 * dst_u,uint8 * dst_v,int width)1791 void ARGB1555ToUVRow_NEON(const uint8* src_argb1555,
1792                           int src_stride_argb1555,
1793                           uint8* dst_u,
1794                           uint8* dst_v,
1795                           int width) {
1796   const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555;
1797   asm volatile (
1798     RGBTOUV_SETUP_REG
1799   "1:                                          \n"
1800     MEMACCESS(0)
1801     "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB1555 pixels.
1802     RGB555TOARGB
1803     "uaddlp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
1804     "uaddlp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
1805     "uaddlp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
1806     MEMACCESS(0)
1807     "ld1        {v0.16b}, [%0], #16            \n"  // next 8 ARGB1555 pixels.
1808     RGB555TOARGB
1809     "uaddlp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
1810     "uaddlp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
1811     "uaddlp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
1812 
1813     MEMACCESS(1)
1814     "ld1        {v0.16b}, [%1], #16            \n"  // load 8 ARGB1555 pixels.
1815     RGB555TOARGB
1816     "uadalp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
1817     "uadalp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
1818     "uadalp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
1819     MEMACCESS(1)
1820     "ld1        {v0.16b}, [%1], #16            \n"  // next 8 ARGB1555 pixels.
1821     RGB555TOARGB
1822     "uadalp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
1823     "uadalp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
1824     "uadalp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
1825 
1826     "ins        v16.D[1], v26.D[0]             \n"
1827     "ins        v17.D[1], v27.D[0]             \n"
1828     "ins        v18.D[1], v28.D[0]             \n"
1829 
1830     "urshr      v4.8h, v16.8h, #1              \n"  // 2x average
1831     "urshr      v5.8h, v17.8h, #1              \n"
1832     "urshr      v6.8h, v18.8h, #1              \n"
1833 
1834     "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
1835     "mul        v2.8h, v4.8h, v20.8h           \n"  // B
1836     "mls        v2.8h, v5.8h, v21.8h           \n"  // G
1837     "mls        v2.8h, v6.8h, v22.8h           \n"  // R
1838     "add        v2.8h, v2.8h, v25.8h           \n"  // +128 -> unsigned
1839     "mul        v3.8h, v6.8h, v20.8h           \n"  // R
1840     "mls        v3.8h, v5.8h, v24.8h           \n"  // G
1841     "mls        v3.8h, v4.8h, v23.8h           \n"  // B
1842     "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned
1843     "uqshrn     v0.8b, v2.8h, #8               \n"  // 16 bit to 8 bit U
1844     "uqshrn     v1.8b, v3.8h, #8               \n"  // 16 bit to 8 bit V
1845     MEMACCESS(2)
1846     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
1847     MEMACCESS(3)
1848     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
1849     "b.gt       1b                             \n"
1850   : "+r"(src_argb1555),  // %0
1851     "+r"(src_argb1555_1),  // %1
1852     "+r"(dst_u),     // %2
1853     "+r"(dst_v),     // %3
1854     "+r"(width)        // %4
1855   :
1856   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
1857     "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
1858     "v26", "v27", "v28"
1859   );
1860 }
1861 
1862 // 16x2 pixels -> 8x1.  width is number of argb pixels. e.g. 16.
ARGB4444ToUVRow_NEON(const uint8 * src_argb4444,int src_stride_argb4444,uint8 * dst_u,uint8 * dst_v,int width)1863 void ARGB4444ToUVRow_NEON(const uint8* src_argb4444,
1864                           int src_stride_argb4444,
1865                           uint8* dst_u,
1866                           uint8* dst_v,
1867                           int width) {
1868   const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444;
1869   asm volatile (
1870     RGBTOUV_SETUP_REG
1871   "1:                                          \n"
1872     MEMACCESS(0)
1873     "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB4444 pixels.
1874     ARGB4444TOARGB
1875     "uaddlp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
1876     "uaddlp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
1877     "uaddlp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
1878     MEMACCESS(0)
1879     "ld1        {v0.16b}, [%0], #16            \n"  // next 8 ARGB4444 pixels.
1880     ARGB4444TOARGB
1881     "uaddlp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
1882     "uaddlp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
1883     "uaddlp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
1884 
1885     MEMACCESS(1)
1886     "ld1        {v0.16b}, [%1], #16            \n"  // load 8 ARGB4444 pixels.
1887     ARGB4444TOARGB
1888     "uadalp     v16.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
1889     "uadalp     v17.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
1890     "uadalp     v18.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
1891     MEMACCESS(1)
1892     "ld1        {v0.16b}, [%1], #16            \n"  // next 8 ARGB4444 pixels.
1893     ARGB4444TOARGB
1894     "uadalp     v26.4h, v0.8b                  \n"  // B 8 bytes -> 4 shorts.
1895     "uadalp     v27.4h, v1.8b                  \n"  // G 8 bytes -> 4 shorts.
1896     "uadalp     v28.4h, v2.8b                  \n"  // R 8 bytes -> 4 shorts.
1897 
1898     "ins        v16.D[1], v26.D[0]             \n"
1899     "ins        v17.D[1], v27.D[0]             \n"
1900     "ins        v18.D[1], v28.D[0]             \n"
1901 
1902     "urshr      v4.8h, v16.8h, #1              \n"  // 2x average
1903     "urshr      v5.8h, v17.8h, #1              \n"
1904     "urshr      v6.8h, v18.8h, #1              \n"
1905 
1906     "subs       %w4, %w4, #16                  \n"  // 16 processed per loop.
1907     "mul        v2.8h, v4.8h, v20.8h           \n"  // B
1908     "mls        v2.8h, v5.8h, v21.8h           \n"  // G
1909     "mls        v2.8h, v6.8h, v22.8h           \n"  // R
1910     "add        v2.8h, v2.8h, v25.8h           \n"  // +128 -> unsigned
1911     "mul        v3.8h, v6.8h, v20.8h           \n"  // R
1912     "mls        v3.8h, v5.8h, v24.8h           \n"  // G
1913     "mls        v3.8h, v4.8h, v23.8h           \n"  // B
1914     "add        v3.8h, v3.8h, v25.8h           \n"  // +128 -> unsigned
1915     "uqshrn     v0.8b, v2.8h, #8               \n"  // 16 bit to 8 bit U
1916     "uqshrn     v1.8b, v3.8h, #8               \n"  // 16 bit to 8 bit V
1917     MEMACCESS(2)
1918     "st1        {v0.8b}, [%2], #8              \n"  // store 8 pixels U.
1919     MEMACCESS(3)
1920     "st1        {v1.8b}, [%3], #8              \n"  // store 8 pixels V.
1921     "b.gt       1b                             \n"
1922   : "+r"(src_argb4444),  // %0
1923     "+r"(src_argb4444_1),  // %1
1924     "+r"(dst_u),     // %2
1925     "+r"(dst_v),     // %3
1926     "+r"(width)        // %4
1927   :
1928   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
1929     "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
1930     "v26", "v27", "v28"
1931 
1932   );
1933 }
1934 
RGB565ToYRow_NEON(const uint8 * src_rgb565,uint8 * dst_y,int width)1935 void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) {
1936   asm volatile (
1937     "movi       v24.8b, #13                    \n"  // B * 0.1016 coefficient
1938     "movi       v25.8b, #65                    \n"  // G * 0.5078 coefficient
1939     "movi       v26.8b, #33                    \n"  // R * 0.2578 coefficient
1940     "movi       v27.8b, #16                    \n"  // Add 16 constant
1941   "1:                                          \n"
1942     MEMACCESS(0)
1943     "ld1        {v0.16b}, [%0], #16            \n"  // load 8 RGB565 pixels.
1944     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
1945     RGB565TOARGB
1946     "umull      v3.8h, v0.8b, v24.8b           \n"  // B
1947     "umlal      v3.8h, v1.8b, v25.8b           \n"  // G
1948     "umlal      v3.8h, v2.8b, v26.8b           \n"  // R
1949     "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
1950     "uqadd      v0.8b, v0.8b, v27.8b           \n"
1951     MEMACCESS(1)
1952     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
1953     "b.gt       1b                             \n"
1954   : "+r"(src_rgb565),  // %0
1955     "+r"(dst_y),       // %1
1956     "+r"(width)          // %2
1957   :
1958   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6",
1959     "v24", "v25", "v26", "v27"
1960   );
1961 }
1962 
ARGB1555ToYRow_NEON(const uint8 * src_argb1555,uint8 * dst_y,int width)1963 void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) {
1964   asm volatile (
1965     "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
1966     "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
1967     "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
1968     "movi       v7.8b, #16                     \n"  // Add 16 constant
1969   "1:                                          \n"
1970     MEMACCESS(0)
1971     "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB1555 pixels.
1972     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
1973     ARGB1555TOARGB
1974     "umull      v3.8h, v0.8b, v4.8b            \n"  // B
1975     "umlal      v3.8h, v1.8b, v5.8b            \n"  // G
1976     "umlal      v3.8h, v2.8b, v6.8b            \n"  // R
1977     "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
1978     "uqadd      v0.8b, v0.8b, v7.8b            \n"
1979     MEMACCESS(1)
1980     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
1981     "b.gt       1b                             \n"
1982   : "+r"(src_argb1555),  // %0
1983     "+r"(dst_y),         // %1
1984     "+r"(width)            // %2
1985   :
1986   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
1987   );
1988 }
1989 
ARGB4444ToYRow_NEON(const uint8 * src_argb4444,uint8 * dst_y,int width)1990 void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) {
1991   asm volatile (
1992     "movi       v24.8b, #13                    \n"  // B * 0.1016 coefficient
1993     "movi       v25.8b, #65                    \n"  // G * 0.5078 coefficient
1994     "movi       v26.8b, #33                    \n"  // R * 0.2578 coefficient
1995     "movi       v27.8b, #16                    \n"  // Add 16 constant
1996   "1:                                          \n"
1997     MEMACCESS(0)
1998     "ld1        {v0.16b}, [%0], #16            \n"  // load 8 ARGB4444 pixels.
1999     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
2000     ARGB4444TOARGB
2001     "umull      v3.8h, v0.8b, v24.8b           \n"  // B
2002     "umlal      v3.8h, v1.8b, v25.8b           \n"  // G
2003     "umlal      v3.8h, v2.8b, v26.8b           \n"  // R
2004     "sqrshrun   v0.8b, v3.8h, #7               \n"  // 16 bit to 8 bit Y
2005     "uqadd      v0.8b, v0.8b, v27.8b           \n"
2006     MEMACCESS(1)
2007     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
2008     "b.gt       1b                             \n"
2009   : "+r"(src_argb4444),  // %0
2010     "+r"(dst_y),         // %1
2011     "+r"(width)            // %2
2012   :
2013   : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27"
2014   );
2015 }
2016 
BGRAToYRow_NEON(const uint8 * src_bgra,uint8 * dst_y,int width)2017 void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) {
2018   asm volatile (
2019     "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient
2020     "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
2021     "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient
2022     "movi       v7.8b, #16                     \n"  // Add 16 constant
2023   "1:                                          \n"
2024     MEMACCESS(0)
2025     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
2026     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
2027     "umull      v16.8h, v1.8b, v4.8b           \n"  // R
2028     "umlal      v16.8h, v2.8b, v5.8b           \n"  // G
2029     "umlal      v16.8h, v3.8b, v6.8b           \n"  // B
2030     "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
2031     "uqadd      v0.8b, v0.8b, v7.8b            \n"
2032     MEMACCESS(1)
2033     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
2034     "b.gt       1b                             \n"
2035   : "+r"(src_bgra),  // %0
2036     "+r"(dst_y),     // %1
2037     "+r"(width)        // %2
2038   :
2039   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
2040   );
2041 }
2042 
ABGRToYRow_NEON(const uint8 * src_abgr,uint8 * dst_y,int width)2043 void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) {
2044   asm volatile (
2045     "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient
2046     "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
2047     "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient
2048     "movi       v7.8b, #16                     \n"  // Add 16 constant
2049   "1:                                          \n"
2050     MEMACCESS(0)
2051     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
2052     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
2053     "umull      v16.8h, v0.8b, v4.8b           \n"  // R
2054     "umlal      v16.8h, v1.8b, v5.8b           \n"  // G
2055     "umlal      v16.8h, v2.8b, v6.8b           \n"  // B
2056     "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
2057     "uqadd      v0.8b, v0.8b, v7.8b            \n"
2058     MEMACCESS(1)
2059     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
2060     "b.gt       1b                             \n"
2061   : "+r"(src_abgr),  // %0
2062     "+r"(dst_y),     // %1
2063     "+r"(width)        // %2
2064   :
2065   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
2066   );
2067 }
2068 
RGBAToYRow_NEON(const uint8 * src_rgba,uint8 * dst_y,int width)2069 void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) {
2070   asm volatile (
2071     "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
2072     "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
2073     "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
2074     "movi       v7.8b, #16                     \n"  // Add 16 constant
2075   "1:                                          \n"
2076     MEMACCESS(0)
2077     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 pixels.
2078     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
2079     "umull      v16.8h, v1.8b, v4.8b           \n"  // B
2080     "umlal      v16.8h, v2.8b, v5.8b           \n"  // G
2081     "umlal      v16.8h, v3.8b, v6.8b           \n"  // R
2082     "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
2083     "uqadd      v0.8b, v0.8b, v7.8b            \n"
2084     MEMACCESS(1)
2085     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
2086     "b.gt       1b                             \n"
2087   : "+r"(src_rgba),  // %0
2088     "+r"(dst_y),     // %1
2089     "+r"(width)        // %2
2090   :
2091   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
2092   );
2093 }
2094 
RGB24ToYRow_NEON(const uint8 * src_rgb24,uint8 * dst_y,int width)2095 void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) {
2096   asm volatile (
2097     "movi       v4.8b, #13                     \n"  // B * 0.1016 coefficient
2098     "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
2099     "movi       v6.8b, #33                     \n"  // R * 0.2578 coefficient
2100     "movi       v7.8b, #16                     \n"  // Add 16 constant
2101   "1:                                          \n"
2102     MEMACCESS(0)
2103     "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels.
2104     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
2105     "umull      v16.8h, v0.8b, v4.8b           \n"  // B
2106     "umlal      v16.8h, v1.8b, v5.8b           \n"  // G
2107     "umlal      v16.8h, v2.8b, v6.8b           \n"  // R
2108     "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
2109     "uqadd      v0.8b, v0.8b, v7.8b            \n"
2110     MEMACCESS(1)
2111     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
2112     "b.gt       1b                             \n"
2113   : "+r"(src_rgb24),  // %0
2114     "+r"(dst_y),      // %1
2115     "+r"(width)         // %2
2116   :
2117   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
2118   );
2119 }
2120 
RAWToYRow_NEON(const uint8 * src_raw,uint8 * dst_y,int width)2121 void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) {
2122   asm volatile (
2123     "movi       v4.8b, #33                     \n"  // R * 0.2578 coefficient
2124     "movi       v5.8b, #65                     \n"  // G * 0.5078 coefficient
2125     "movi       v6.8b, #13                     \n"  // B * 0.1016 coefficient
2126     "movi       v7.8b, #16                     \n"  // Add 16 constant
2127   "1:                                          \n"
2128     MEMACCESS(0)
2129     "ld3        {v0.8b,v1.8b,v2.8b}, [%0], #24 \n"  // load 8 pixels.
2130     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
2131     "umull      v16.8h, v0.8b, v4.8b           \n"  // B
2132     "umlal      v16.8h, v1.8b, v5.8b           \n"  // G
2133     "umlal      v16.8h, v2.8b, v6.8b           \n"  // R
2134     "sqrshrun   v0.8b, v16.8h, #7              \n"  // 16 bit to 8 bit Y
2135     "uqadd      v0.8b, v0.8b, v7.8b            \n"
2136     MEMACCESS(1)
2137     "st1        {v0.8b}, [%1], #8              \n"  // store 8 pixels Y.
2138     "b.gt       1b                             \n"
2139   : "+r"(src_raw),  // %0
2140     "+r"(dst_y),    // %1
2141     "+r"(width)       // %2
2142   :
2143   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
2144   );
2145 }
2146 
2147 // Bilinear filter 16x2 -> 16x1
InterpolateRow_NEON(uint8 * dst_ptr,const uint8 * src_ptr,ptrdiff_t src_stride,int dst_width,int source_y_fraction)2148 void InterpolateRow_NEON(uint8* dst_ptr,
2149                          const uint8* src_ptr,
2150                          ptrdiff_t src_stride,
2151                          int dst_width,
2152                          int source_y_fraction) {
2153   int y1_fraction = source_y_fraction;
2154   int y0_fraction = 256 - y1_fraction;
2155   const uint8* src_ptr1 = src_ptr + src_stride;
2156   asm volatile (
2157     "cmp        %w4, #0                        \n"
2158     "b.eq       100f                           \n"
2159     "cmp        %w4, #128                      \n"
2160     "b.eq       50f                            \n"
2161 
2162     "dup        v5.16b, %w4                    \n"
2163     "dup        v4.16b, %w5                    \n"
2164     // General purpose row blend.
2165   "1:                                          \n"
2166     MEMACCESS(1)
2167     "ld1        {v0.16b}, [%1], #16            \n"
2168     MEMACCESS(2)
2169     "ld1        {v1.16b}, [%2], #16            \n"
2170     "subs       %w3, %w3, #16                  \n"
2171     "umull      v2.8h, v0.8b,  v4.8b           \n"
2172     "umull2     v3.8h, v0.16b, v4.16b          \n"
2173     "umlal      v2.8h, v1.8b,  v5.8b           \n"
2174     "umlal2     v3.8h, v1.16b, v5.16b          \n"
2175     "rshrn      v0.8b,  v2.8h, #8              \n"
2176     "rshrn2     v0.16b, v3.8h, #8              \n"
2177     MEMACCESS(0)
2178     "st1        {v0.16b}, [%0], #16            \n"
2179     "b.gt       1b                             \n"
2180     "b          99f                            \n"
2181 
2182     // Blend 50 / 50.
2183   "50:                                         \n"
2184     MEMACCESS(1)
2185     "ld1        {v0.16b}, [%1], #16            \n"
2186     MEMACCESS(2)
2187     "ld1        {v1.16b}, [%2], #16            \n"
2188     "subs       %w3, %w3, #16                  \n"
2189     "urhadd     v0.16b, v0.16b, v1.16b         \n"
2190     MEMACCESS(0)
2191     "st1        {v0.16b}, [%0], #16            \n"
2192     "b.gt       50b                            \n"
2193     "b          99f                            \n"
2194 
2195     // Blend 100 / 0 - Copy row unchanged.
2196   "100:                                        \n"
2197     MEMACCESS(1)
2198     "ld1        {v0.16b}, [%1], #16            \n"
2199     "subs       %w3, %w3, #16                  \n"
2200     MEMACCESS(0)
2201     "st1        {v0.16b}, [%0], #16            \n"
2202     "b.gt       100b                           \n"
2203 
2204   "99:                                         \n"
2205   : "+r"(dst_ptr),          // %0
2206     "+r"(src_ptr),          // %1
2207     "+r"(src_ptr1),         // %2
2208     "+r"(dst_width),        // %3
2209     "+r"(y1_fraction),      // %4
2210     "+r"(y0_fraction)       // %5
2211   :
2212   : "cc", "memory", "v0", "v1", "v3", "v4", "v5"
2213   );
2214 }
2215 
2216 // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
ARGBBlendRow_NEON(const uint8 * src_argb0,const uint8 * src_argb1,uint8 * dst_argb,int width)2217 void ARGBBlendRow_NEON(const uint8* src_argb0,
2218                        const uint8* src_argb1,
2219                        uint8* dst_argb,
2220                        int width) {
2221   asm volatile (
2222     "subs       %w3, %w3, #8                   \n"
2223     "b.lt       89f                            \n"
2224     // Blend 8 pixels.
2225   "8:                                          \n"
2226     MEMACCESS(0)
2227     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB0 pixels
2228     MEMACCESS(1)
2229     "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 ARGB1 pixels
2230     "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
2231     "umull      v16.8h, v4.8b, v3.8b           \n"  // db * a
2232     "umull      v17.8h, v5.8b, v3.8b           \n"  // dg * a
2233     "umull      v18.8h, v6.8b, v3.8b           \n"  // dr * a
2234     "uqrshrn    v16.8b, v16.8h, #8             \n"  // db >>= 8
2235     "uqrshrn    v17.8b, v17.8h, #8             \n"  // dg >>= 8
2236     "uqrshrn    v18.8b, v18.8h, #8             \n"  // dr >>= 8
2237     "uqsub      v4.8b, v4.8b, v16.8b           \n"  // db - (db * a / 256)
2238     "uqsub      v5.8b, v5.8b, v17.8b           \n"  // dg - (dg * a / 256)
2239     "uqsub      v6.8b, v6.8b, v18.8b           \n"  // dr - (dr * a / 256)
2240     "uqadd      v0.8b, v0.8b, v4.8b            \n"  // + sb
2241     "uqadd      v1.8b, v1.8b, v5.8b            \n"  // + sg
2242     "uqadd      v2.8b, v2.8b, v6.8b            \n"  // + sr
2243     "movi       v3.8b, #255                    \n"  // a = 255
2244     MEMACCESS(2)
2245     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
2246     "b.ge       8b                             \n"
2247 
2248   "89:                                         \n"
2249     "adds       %w3, %w3, #8-1                 \n"
2250     "b.lt       99f                            \n"
2251 
2252     // Blend 1 pixels.
2253   "1:                                          \n"
2254     MEMACCESS(0)
2255     "ld4        {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n"  // load 1 pixel ARGB0.
2256     MEMACCESS(1)
2257     "ld4        {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n"  // load 1 pixel ARGB1.
2258     "subs       %w3, %w3, #1                   \n"  // 1 processed per loop.
2259     "umull      v16.8h, v4.8b, v3.8b           \n"  // db * a
2260     "umull      v17.8h, v5.8b, v3.8b           \n"  // dg * a
2261     "umull      v18.8h, v6.8b, v3.8b           \n"  // dr * a
2262     "uqrshrn    v16.8b, v16.8h, #8             \n"  // db >>= 8
2263     "uqrshrn    v17.8b, v17.8h, #8             \n"  // dg >>= 8
2264     "uqrshrn    v18.8b, v18.8h, #8             \n"  // dr >>= 8
2265     "uqsub      v4.8b, v4.8b, v16.8b           \n"  // db - (db * a / 256)
2266     "uqsub      v5.8b, v5.8b, v17.8b           \n"  // dg - (dg * a / 256)
2267     "uqsub      v6.8b, v6.8b, v18.8b           \n"  // dr - (dr * a / 256)
2268     "uqadd      v0.8b, v0.8b, v4.8b            \n"  // + sb
2269     "uqadd      v1.8b, v1.8b, v5.8b            \n"  // + sg
2270     "uqadd      v2.8b, v2.8b, v6.8b            \n"  // + sr
2271     "movi       v3.8b, #255                    \n"  // a = 255
2272     MEMACCESS(2)
2273     "st4        {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n"  // store 1 pixel.
2274     "b.ge       1b                             \n"
2275 
2276   "99:                                         \n"
2277 
2278   : "+r"(src_argb0),    // %0
2279     "+r"(src_argb1),    // %1
2280     "+r"(dst_argb),     // %2
2281     "+r"(width)         // %3
2282   :
2283   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
2284     "v16", "v17", "v18"
2285   );
2286 }
2287 
2288 // Attenuate 8 pixels at a time.
ARGBAttenuateRow_NEON(const uint8 * src_argb,uint8 * dst_argb,int width)2289 void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
2290   asm volatile (
2291     // Attenuate 8 pixels.
2292   "1:                                          \n"
2293     MEMACCESS(0)
2294     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels
2295     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
2296     "umull      v4.8h, v0.8b, v3.8b            \n"  // b * a
2297     "umull      v5.8h, v1.8b, v3.8b            \n"  // g * a
2298     "umull      v6.8h, v2.8b, v3.8b            \n"  // r * a
2299     "uqrshrn    v0.8b, v4.8h, #8               \n"  // b >>= 8
2300     "uqrshrn    v1.8b, v5.8h, #8               \n"  // g >>= 8
2301     "uqrshrn    v2.8b, v6.8h, #8               \n"  // r >>= 8
2302     MEMACCESS(1)
2303     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 ARGB pixels
2304     "b.gt       1b                             \n"
2305   : "+r"(src_argb),   // %0
2306     "+r"(dst_argb),   // %1
2307     "+r"(width)       // %2
2308   :
2309   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
2310   );
2311 }
2312 
2313 // Quantize 8 ARGB pixels (32 bytes).
2314 // dst = (dst * scale >> 16) * interval_size + interval_offset;
ARGBQuantizeRow_NEON(uint8 * dst_argb,int scale,int interval_size,int interval_offset,int width)2315 void ARGBQuantizeRow_NEON(uint8* dst_argb,
2316                           int scale,
2317                           int interval_size,
2318                           int interval_offset,
2319                           int width) {
2320   asm volatile (
2321     "dup        v4.8h, %w2                     \n"
2322     "ushr       v4.8h, v4.8h, #1               \n"  // scale >>= 1
2323     "dup        v5.8h, %w3                     \n"  // interval multiply.
2324     "dup        v6.8h, %w4                     \n"  // interval add
2325 
2326     // 8 pixel loop.
2327   "1:                                          \n"
2328     MEMACCESS(0)
2329     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0]  \n"  // load 8 pixels of ARGB.
2330     "subs       %w1, %w1, #8                   \n"  // 8 processed per loop.
2331     "uxtl       v0.8h, v0.8b                   \n"  // b (0 .. 255)
2332     "uxtl       v1.8h, v1.8b                   \n"
2333     "uxtl       v2.8h, v2.8b                   \n"
2334     "sqdmulh    v0.8h, v0.8h, v4.8h            \n"  // b * scale
2335     "sqdmulh    v1.8h, v1.8h, v4.8h            \n"  // g
2336     "sqdmulh    v2.8h, v2.8h, v4.8h            \n"  // r
2337     "mul        v0.8h, v0.8h, v5.8h            \n"  // b * interval_size
2338     "mul        v1.8h, v1.8h, v5.8h            \n"  // g
2339     "mul        v2.8h, v2.8h, v5.8h            \n"  // r
2340     "add        v0.8h, v0.8h, v6.8h            \n"  // b + interval_offset
2341     "add        v1.8h, v1.8h, v6.8h            \n"  // g
2342     "add        v2.8h, v2.8h, v6.8h            \n"  // r
2343     "uqxtn      v0.8b, v0.8h                   \n"
2344     "uqxtn      v1.8b, v1.8h                   \n"
2345     "uqxtn      v2.8b, v2.8h                   \n"
2346     MEMACCESS(0)
2347     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // store 8 ARGB pixels
2348     "b.gt       1b                             \n"
2349   : "+r"(dst_argb),       // %0
2350     "+r"(width)           // %1
2351   : "r"(scale),           // %2
2352     "r"(interval_size),   // %3
2353     "r"(interval_offset)  // %4
2354   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
2355   );
2356 }
2357 
2358 // Shade 8 pixels at a time by specified value.
2359 // NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8.
2360 // Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
ARGBShadeRow_NEON(const uint8 * src_argb,uint8 * dst_argb,int width,uint32 value)2361 void ARGBShadeRow_NEON(const uint8* src_argb,
2362                        uint8* dst_argb,
2363                        int width,
2364                        uint32 value) {
2365   asm volatile (
2366     "dup        v0.4s, %w3                     \n"  // duplicate scale value.
2367     "zip1       v0.8b, v0.8b, v0.8b            \n"  // v0.8b aarrggbb.
2368     "ushr       v0.8h, v0.8h, #1               \n"  // scale / 2.
2369 
2370     // 8 pixel loop.
2371   "1:                                          \n"
2372     MEMACCESS(0)
2373     "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
2374     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
2375     "uxtl       v4.8h, v4.8b                   \n"  // b (0 .. 255)
2376     "uxtl       v5.8h, v5.8b                   \n"
2377     "uxtl       v6.8h, v6.8b                   \n"
2378     "uxtl       v7.8h, v7.8b                   \n"
2379     "sqrdmulh   v4.8h, v4.8h, v0.h[0]          \n"  // b * scale * 2
2380     "sqrdmulh   v5.8h, v5.8h, v0.h[1]          \n"  // g
2381     "sqrdmulh   v6.8h, v6.8h, v0.h[2]          \n"  // r
2382     "sqrdmulh   v7.8h, v7.8h, v0.h[3]          \n"  // a
2383     "uqxtn      v4.8b, v4.8h                   \n"
2384     "uqxtn      v5.8b, v5.8h                   \n"
2385     "uqxtn      v6.8b, v6.8h                   \n"
2386     "uqxtn      v7.8b, v7.8h                   \n"
2387     MEMACCESS(1)
2388     "st4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // store 8 ARGB pixels
2389     "b.gt       1b                             \n"
2390   : "+r"(src_argb),       // %0
2391     "+r"(dst_argb),       // %1
2392     "+r"(width)           // %2
2393   : "r"(value)            // %3
2394   : "cc", "memory", "v0", "v4", "v5", "v6", "v7"
2395   );
2396 }
2397 
2398 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
2399 // Similar to ARGBToYJ but stores ARGB.
2400 // C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
ARGBGrayRow_NEON(const uint8 * src_argb,uint8 * dst_argb,int width)2401 void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
2402   asm volatile (
2403     "movi       v24.8b, #15                    \n"  // B * 0.11400 coefficient
2404     "movi       v25.8b, #75                    \n"  // G * 0.58700 coefficient
2405     "movi       v26.8b, #38                    \n"  // R * 0.29900 coefficient
2406   "1:                                          \n"
2407     MEMACCESS(0)
2408     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
2409     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
2410     "umull      v4.8h, v0.8b, v24.8b           \n"  // B
2411     "umlal      v4.8h, v1.8b, v25.8b           \n"  // G
2412     "umlal      v4.8h, v2.8b, v26.8b           \n"  // R
2413     "sqrshrun   v0.8b, v4.8h, #7               \n"  // 15 bit to 8 bit B
2414     "orr        v1.8b, v0.8b, v0.8b            \n"  // G
2415     "orr        v2.8b, v0.8b, v0.8b            \n"  // R
2416     MEMACCESS(1)
2417     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"  // store 8 pixels.
2418     "b.gt       1b                             \n"
2419   : "+r"(src_argb),  // %0
2420     "+r"(dst_argb),  // %1
2421     "+r"(width)      // %2
2422   :
2423   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26"
2424   );
2425 }
2426 
2427 // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
2428 //    b = (r * 35 + g * 68 + b * 17) >> 7
2429 //    g = (r * 45 + g * 88 + b * 22) >> 7
2430 //    r = (r * 50 + g * 98 + b * 24) >> 7
2431 
ARGBSepiaRow_NEON(uint8 * dst_argb,int width)2432 void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
2433   asm volatile (
2434     "movi       v20.8b, #17                    \n"  // BB coefficient
2435     "movi       v21.8b, #68                    \n"  // BG coefficient
2436     "movi       v22.8b, #35                    \n"  // BR coefficient
2437     "movi       v24.8b, #22                    \n"  // GB coefficient
2438     "movi       v25.8b, #88                    \n"  // GG coefficient
2439     "movi       v26.8b, #45                    \n"  // GR coefficient
2440     "movi       v28.8b, #24                    \n"  // BB coefficient
2441     "movi       v29.8b, #98                    \n"  // BG coefficient
2442     "movi       v30.8b, #50                    \n"  // BR coefficient
2443   "1:                                          \n"
2444     MEMACCESS(0)
2445     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n"  // load 8 ARGB pixels.
2446     "subs       %w1, %w1, #8                   \n"  // 8 processed per loop.
2447     "umull      v4.8h, v0.8b, v20.8b           \n"  // B to Sepia B
2448     "umlal      v4.8h, v1.8b, v21.8b           \n"  // G
2449     "umlal      v4.8h, v2.8b, v22.8b           \n"  // R
2450     "umull      v5.8h, v0.8b, v24.8b           \n"  // B to Sepia G
2451     "umlal      v5.8h, v1.8b, v25.8b           \n"  // G
2452     "umlal      v5.8h, v2.8b, v26.8b           \n"  // R
2453     "umull      v6.8h, v0.8b, v28.8b           \n"  // B to Sepia R
2454     "umlal      v6.8h, v1.8b, v29.8b           \n"  // G
2455     "umlal      v6.8h, v2.8b, v30.8b           \n"  // R
2456     "uqshrn     v0.8b, v4.8h, #7               \n"  // 16 bit to 8 bit B
2457     "uqshrn     v1.8b, v5.8h, #7               \n"  // 16 bit to 8 bit G
2458     "uqshrn     v2.8b, v6.8h, #7               \n"  // 16 bit to 8 bit R
2459     MEMACCESS(0)
2460     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // store 8 pixels.
2461     "b.gt       1b                             \n"
2462   : "+r"(dst_argb),  // %0
2463     "+r"(width)      // %1
2464   :
2465   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
2466     "v20", "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30"
2467   );
2468 }
2469 
2470 // Tranform 8 ARGB pixels (32 bytes) with color matrix.
2471 // TODO(fbarchard): Was same as Sepia except matrix is provided.  This function
2472 // needs to saturate.  Consider doing a non-saturating version.
ARGBColorMatrixRow_NEON(const uint8 * src_argb,uint8 * dst_argb,const int8 * matrix_argb,int width)2473 void ARGBColorMatrixRow_NEON(const uint8* src_argb,
2474                              uint8* dst_argb,
2475                              const int8* matrix_argb,
2476                              int width) {
2477   asm volatile (
2478     MEMACCESS(3)
2479     "ld1        {v2.16b}, [%3]                 \n"  // load 3 ARGB vectors.
2480     "sxtl       v0.8h, v2.8b                   \n"  // B,G coefficients s16.
2481     "sxtl2      v1.8h, v2.16b                  \n"  // R,A coefficients s16.
2482 
2483   "1:                                          \n"
2484     MEMACCESS(0)
2485     "ld4        {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n"  // load 8 pixels.
2486     "subs       %w2, %w2, #8                   \n"  // 8 processed per loop.
2487     "uxtl       v16.8h, v16.8b                 \n"  // b (0 .. 255) 16 bit
2488     "uxtl       v17.8h, v17.8b                 \n"  // g
2489     "uxtl       v18.8h, v18.8b                 \n"  // r
2490     "uxtl       v19.8h, v19.8b                 \n"  // a
2491     "mul        v22.8h, v16.8h, v0.h[0]        \n"  // B = B * Matrix B
2492     "mul        v23.8h, v16.8h, v0.h[4]        \n"  // G = B * Matrix G
2493     "mul        v24.8h, v16.8h, v1.h[0]        \n"  // R = B * Matrix R
2494     "mul        v25.8h, v16.8h, v1.h[4]        \n"  // A = B * Matrix A
2495     "mul        v4.8h, v17.8h, v0.h[1]         \n"  // B += G * Matrix B
2496     "mul        v5.8h, v17.8h, v0.h[5]         \n"  // G += G * Matrix G
2497     "mul        v6.8h, v17.8h, v1.h[1]         \n"  // R += G * Matrix R
2498     "mul        v7.8h, v17.8h, v1.h[5]         \n"  // A += G * Matrix A
2499     "sqadd      v22.8h, v22.8h, v4.8h          \n"  // Accumulate B
2500     "sqadd      v23.8h, v23.8h, v5.8h          \n"  // Accumulate G
2501     "sqadd      v24.8h, v24.8h, v6.8h          \n"  // Accumulate R
2502     "sqadd      v25.8h, v25.8h, v7.8h          \n"  // Accumulate A
2503     "mul        v4.8h, v18.8h, v0.h[2]         \n"  // B += R * Matrix B
2504     "mul        v5.8h, v18.8h, v0.h[6]         \n"  // G += R * Matrix G
2505     "mul        v6.8h, v18.8h, v1.h[2]         \n"  // R += R * Matrix R
2506     "mul        v7.8h, v18.8h, v1.h[6]         \n"  // A += R * Matrix A
2507     "sqadd      v22.8h, v22.8h, v4.8h          \n"  // Accumulate B
2508     "sqadd      v23.8h, v23.8h, v5.8h          \n"  // Accumulate G
2509     "sqadd      v24.8h, v24.8h, v6.8h          \n"  // Accumulate R
2510     "sqadd      v25.8h, v25.8h, v7.8h          \n"  // Accumulate A
2511     "mul        v4.8h, v19.8h, v0.h[3]         \n"  // B += A * Matrix B
2512     "mul        v5.8h, v19.8h, v0.h[7]         \n"  // G += A * Matrix G
2513     "mul        v6.8h, v19.8h, v1.h[3]         \n"  // R += A * Matrix R
2514     "mul        v7.8h, v19.8h, v1.h[7]         \n"  // A += A * Matrix A
2515     "sqadd      v22.8h, v22.8h, v4.8h          \n"  // Accumulate B
2516     "sqadd      v23.8h, v23.8h, v5.8h          \n"  // Accumulate G
2517     "sqadd      v24.8h, v24.8h, v6.8h          \n"  // Accumulate R
2518     "sqadd      v25.8h, v25.8h, v7.8h          \n"  // Accumulate A
2519     "sqshrun    v16.8b, v22.8h, #6             \n"  // 16 bit to 8 bit B
2520     "sqshrun    v17.8b, v23.8h, #6             \n"  // 16 bit to 8 bit G
2521     "sqshrun    v18.8b, v24.8h, #6             \n"  // 16 bit to 8 bit R
2522     "sqshrun    v19.8b, v25.8h, #6             \n"  // 16 bit to 8 bit A
2523     MEMACCESS(1)
2524     "st4        {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n"  // store 8 pixels.
2525     "b.gt       1b                             \n"
2526   : "+r"(src_argb),   // %0
2527     "+r"(dst_argb),   // %1
2528     "+r"(width)       // %2
2529   : "r"(matrix_argb)  // %3
2530   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17",
2531     "v18", "v19", "v22", "v23", "v24", "v25"
2532   );
2533 }
2534 
2535 // TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
2536 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
ARGBMultiplyRow_NEON(const uint8 * src_argb0,const uint8 * src_argb1,uint8 * dst_argb,int width)2537 void ARGBMultiplyRow_NEON(const uint8* src_argb0,
2538                           const uint8* src_argb1,
2539                           uint8* dst_argb,
2540                           int width) {
2541   asm volatile (
2542     // 8 pixel loop.
2543   "1:                                          \n"
2544     MEMACCESS(0)
2545     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
2546     MEMACCESS(1)
2547     "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more pixels.
2548     "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
2549     "umull      v0.8h, v0.8b, v4.8b            \n"  // multiply B
2550     "umull      v1.8h, v1.8b, v5.8b            \n"  // multiply G
2551     "umull      v2.8h, v2.8b, v6.8b            \n"  // multiply R
2552     "umull      v3.8h, v3.8b, v7.8b            \n"  // multiply A
2553     "rshrn      v0.8b, v0.8h, #8               \n"  // 16 bit to 8 bit B
2554     "rshrn      v1.8b, v1.8h, #8               \n"  // 16 bit to 8 bit G
2555     "rshrn      v2.8b, v2.8h, #8               \n"  // 16 bit to 8 bit R
2556     "rshrn      v3.8b, v3.8h, #8               \n"  // 16 bit to 8 bit A
2557     MEMACCESS(2)
2558     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
2559     "b.gt       1b                             \n"
2560 
2561   : "+r"(src_argb0),  // %0
2562     "+r"(src_argb1),  // %1
2563     "+r"(dst_argb),   // %2
2564     "+r"(width)       // %3
2565   :
2566   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
2567   );
2568 }
2569 
2570 // Add 2 rows of ARGB pixels together, 8 pixels at a time.
ARGBAddRow_NEON(const uint8 * src_argb0,const uint8 * src_argb1,uint8 * dst_argb,int width)2571 void ARGBAddRow_NEON(const uint8* src_argb0,
2572                      const uint8* src_argb1,
2573                      uint8* dst_argb,
2574                      int width) {
2575   asm volatile (
2576     // 8 pixel loop.
2577   "1:                                          \n"
2578     MEMACCESS(0)
2579     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
2580     MEMACCESS(1)
2581     "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more pixels.
2582     "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
2583     "uqadd      v0.8b, v0.8b, v4.8b            \n"
2584     "uqadd      v1.8b, v1.8b, v5.8b            \n"
2585     "uqadd      v2.8b, v2.8b, v6.8b            \n"
2586     "uqadd      v3.8b, v3.8b, v7.8b            \n"
2587     MEMACCESS(2)
2588     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
2589     "b.gt       1b                             \n"
2590 
2591   : "+r"(src_argb0),  // %0
2592     "+r"(src_argb1),  // %1
2593     "+r"(dst_argb),   // %2
2594     "+r"(width)       // %3
2595   :
2596   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
2597   );
2598 }
2599 
2600 // Subtract 2 rows of ARGB pixels, 8 pixels at a time.
ARGBSubtractRow_NEON(const uint8 * src_argb0,const uint8 * src_argb1,uint8 * dst_argb,int width)2601 void ARGBSubtractRow_NEON(const uint8* src_argb0,
2602                           const uint8* src_argb1,
2603                           uint8* dst_argb,
2604                           int width) {
2605   asm volatile (
2606     // 8 pixel loop.
2607   "1:                                          \n"
2608     MEMACCESS(0)
2609     "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load 8 ARGB pixels.
2610     MEMACCESS(1)
2611     "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n"  // load 8 more pixels.
2612     "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
2613     "uqsub      v0.8b, v0.8b, v4.8b            \n"
2614     "uqsub      v1.8b, v1.8b, v5.8b            \n"
2615     "uqsub      v2.8b, v2.8b, v6.8b            \n"
2616     "uqsub      v3.8b, v3.8b, v7.8b            \n"
2617     MEMACCESS(2)
2618     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
2619     "b.gt       1b                             \n"
2620 
2621   : "+r"(src_argb0),  // %0
2622     "+r"(src_argb1),  // %1
2623     "+r"(dst_argb),   // %2
2624     "+r"(width)       // %3
2625   :
2626   : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
2627   );
2628 }
2629 
2630 // Adds Sobel X and Sobel Y and stores Sobel into ARGB.
2631 // A = 255
2632 // R = Sobel
2633 // G = Sobel
2634 // B = Sobel
SobelRow_NEON(const uint8 * src_sobelx,const uint8 * src_sobely,uint8 * dst_argb,int width)2635 void SobelRow_NEON(const uint8* src_sobelx,
2636                    const uint8* src_sobely,
2637                    uint8* dst_argb,
2638                    int width) {
2639   asm volatile (
2640     "movi       v3.8b, #255                    \n"  // alpha
2641     // 8 pixel loop.
2642   "1:                                          \n"
2643     MEMACCESS(0)
2644     "ld1        {v0.8b}, [%0], #8              \n"  // load 8 sobelx.
2645     MEMACCESS(1)
2646     "ld1        {v1.8b}, [%1], #8              \n"  // load 8 sobely.
2647     "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
2648     "uqadd      v0.8b, v0.8b, v1.8b            \n"  // add
2649     "orr        v1.8b, v0.8b, v0.8b            \n"
2650     "orr        v2.8b, v0.8b, v0.8b            \n"
2651     MEMACCESS(2)
2652     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
2653     "b.gt       1b                             \n"
2654   : "+r"(src_sobelx),  // %0
2655     "+r"(src_sobely),  // %1
2656     "+r"(dst_argb),    // %2
2657     "+r"(width)        // %3
2658   :
2659   : "cc", "memory", "v0", "v1", "v2", "v3"
2660   );
2661 }
2662 
2663 // Adds Sobel X and Sobel Y and stores Sobel into plane.
SobelToPlaneRow_NEON(const uint8 * src_sobelx,const uint8 * src_sobely,uint8 * dst_y,int width)2664 void SobelToPlaneRow_NEON(const uint8* src_sobelx,
2665                           const uint8* src_sobely,
2666                           uint8* dst_y,
2667                           int width) {
2668   asm volatile (
2669     // 16 pixel loop.
2670   "1:                                          \n"
2671     MEMACCESS(0)
2672     "ld1        {v0.16b}, [%0], #16            \n"  // load 16 sobelx.
2673     MEMACCESS(1)
2674     "ld1        {v1.16b}, [%1], #16            \n"  // load 16 sobely.
2675     "subs       %w3, %w3, #16                  \n"  // 16 processed per loop.
2676     "uqadd      v0.16b, v0.16b, v1.16b         \n"  // add
2677     MEMACCESS(2)
2678     "st1        {v0.16b}, [%2], #16            \n"  // store 16 pixels.
2679     "b.gt       1b                             \n"
2680   : "+r"(src_sobelx),  // %0
2681     "+r"(src_sobely),  // %1
2682     "+r"(dst_y),       // %2
2683     "+r"(width)        // %3
2684   :
2685   : "cc", "memory", "v0", "v1"
2686   );
2687 }
2688 
2689 // Mixes Sobel X, Sobel Y and Sobel into ARGB.
2690 // A = 255
2691 // R = Sobel X
2692 // G = Sobel
2693 // B = Sobel Y
SobelXYRow_NEON(const uint8 * src_sobelx,const uint8 * src_sobely,uint8 * dst_argb,int width)2694 void SobelXYRow_NEON(const uint8* src_sobelx,
2695                      const uint8* src_sobely,
2696                      uint8* dst_argb,
2697                      int width) {
2698   asm volatile (
2699     "movi       v3.8b, #255                    \n"  // alpha
2700     // 8 pixel loop.
2701   "1:                                          \n"
2702     MEMACCESS(0)
2703     "ld1        {v2.8b}, [%0], #8              \n"  // load 8 sobelx.
2704     MEMACCESS(1)
2705     "ld1        {v0.8b}, [%1], #8              \n"  // load 8 sobely.
2706     "subs       %w3, %w3, #8                   \n"  // 8 processed per loop.
2707     "uqadd      v1.8b, v0.8b, v2.8b            \n"  // add
2708     MEMACCESS(2)
2709     "st4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"  // store 8 ARGB pixels
2710     "b.gt       1b                             \n"
2711   : "+r"(src_sobelx),  // %0
2712     "+r"(src_sobely),  // %1
2713     "+r"(dst_argb),    // %2
2714     "+r"(width)        // %3
2715   :
2716   : "cc", "memory", "v0", "v1", "v2", "v3"
2717   );
2718 }
2719 
2720 // SobelX as a matrix is
2721 // -1  0  1
2722 // -2  0  2
2723 // -1  0  1
SobelXRow_NEON(const uint8 * src_y0,const uint8 * src_y1,const uint8 * src_y2,uint8 * dst_sobelx,int width)2724 void SobelXRow_NEON(const uint8* src_y0,
2725                     const uint8* src_y1,
2726                     const uint8* src_y2,
2727                     uint8* dst_sobelx,
2728                     int width) {
2729   asm volatile (
2730   "1:                                          \n"
2731     MEMACCESS(0)
2732     "ld1        {v0.8b}, [%0],%5               \n"  // top
2733     MEMACCESS(0)
2734     "ld1        {v1.8b}, [%0],%6               \n"
2735     "usubl      v0.8h, v0.8b, v1.8b            \n"
2736     MEMACCESS(1)
2737     "ld1        {v2.8b}, [%1],%5               \n"  // center * 2
2738     MEMACCESS(1)
2739     "ld1        {v3.8b}, [%1],%6               \n"
2740     "usubl      v1.8h, v2.8b, v3.8b            \n"
2741     "add        v0.8h, v0.8h, v1.8h            \n"
2742     "add        v0.8h, v0.8h, v1.8h            \n"
2743     MEMACCESS(2)
2744     "ld1        {v2.8b}, [%2],%5               \n"  // bottom
2745     MEMACCESS(2)
2746     "ld1        {v3.8b}, [%2],%6               \n"
2747     "subs       %w4, %w4, #8                   \n"  // 8 pixels
2748     "usubl      v1.8h, v2.8b, v3.8b            \n"
2749     "add        v0.8h, v0.8h, v1.8h            \n"
2750     "abs        v0.8h, v0.8h                   \n"
2751     "uqxtn      v0.8b, v0.8h                   \n"
2752     MEMACCESS(3)
2753     "st1        {v0.8b}, [%3], #8              \n"  // store 8 sobelx
2754     "b.gt       1b                             \n"
2755   : "+r"(src_y0),      // %0
2756     "+r"(src_y1),      // %1
2757     "+r"(src_y2),      // %2
2758     "+r"(dst_sobelx),  // %3
2759     "+r"(width)        // %4
2760   : "r"(2LL),          // %5
2761     "r"(6LL)           // %6
2762   : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
2763   );
2764 }
2765 
2766 // SobelY as a matrix is
2767 // -1 -2 -1
2768 //  0  0  0
2769 //  1  2  1
SobelYRow_NEON(const uint8 * src_y0,const uint8 * src_y1,uint8 * dst_sobely,int width)2770 void SobelYRow_NEON(const uint8* src_y0,
2771                     const uint8* src_y1,
2772                     uint8* dst_sobely,
2773                     int width) {
2774   asm volatile (
2775   "1:                                          \n"
2776     MEMACCESS(0)
2777     "ld1        {v0.8b}, [%0],%4               \n"  // left
2778     MEMACCESS(1)
2779     "ld1        {v1.8b}, [%1],%4               \n"
2780     "usubl      v0.8h, v0.8b, v1.8b            \n"
2781     MEMACCESS(0)
2782     "ld1        {v2.8b}, [%0],%4               \n"  // center * 2
2783     MEMACCESS(1)
2784     "ld1        {v3.8b}, [%1],%4               \n"
2785     "usubl      v1.8h, v2.8b, v3.8b            \n"
2786     "add        v0.8h, v0.8h, v1.8h            \n"
2787     "add        v0.8h, v0.8h, v1.8h            \n"
2788     MEMACCESS(0)
2789     "ld1        {v2.8b}, [%0],%5               \n"  // right
2790     MEMACCESS(1)
2791     "ld1        {v3.8b}, [%1],%5               \n"
2792     "subs       %w3, %w3, #8                   \n"  // 8 pixels
2793     "usubl      v1.8h, v2.8b, v3.8b            \n"
2794     "add        v0.8h, v0.8h, v1.8h            \n"
2795     "abs        v0.8h, v0.8h                   \n"
2796     "uqxtn      v0.8b, v0.8h                   \n"
2797     MEMACCESS(2)
2798     "st1        {v0.8b}, [%2], #8              \n"  // store 8 sobely
2799     "b.gt       1b                             \n"
2800   : "+r"(src_y0),      // %0
2801     "+r"(src_y1),      // %1
2802     "+r"(dst_sobely),  // %2
2803     "+r"(width)        // %3
2804   : "r"(1LL),          // %4
2805     "r"(6LL)           // %5
2806   : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
2807   );
2808 }
2809 
2810 // Caveat - rounds float to half float whereas scaling version truncates.
HalfFloat1Row_NEON(const uint16 * src,uint16 * dst,float,int width)2811 void HalfFloat1Row_NEON(const uint16* src, uint16* dst, float, int width) {
2812   asm volatile (
2813   "1:                                          \n"
2814     MEMACCESS(0)
2815     "ld1        {v1.16b}, [%0], #16            \n"  // load 8 shorts
2816     "subs       %w2, %w2, #8                   \n"  // 8 pixels per loop
2817     "uxtl       v2.4s, v1.4h                   \n"  // 8 int's
2818     "uxtl2      v3.4s, v1.8h                   \n"
2819     "scvtf      v2.4s, v2.4s                   \n"  // 8 floats
2820     "scvtf      v3.4s, v3.4s                   \n"
2821     "fcvtn      v1.4h, v2.4s                   \n"  // 8 half floats
2822     "fcvtn2     v1.8h, v3.4s                   \n"
2823    MEMACCESS(1)
2824     "st1        {v1.16b}, [%1], #16            \n"  // store 8 shorts
2825     "b.gt       1b                             \n"
2826   : "+r"(src),    // %0
2827     "+r"(dst),    // %1
2828     "+r"(width)   // %2
2829   :
2830   : "cc", "memory", "v1", "v2", "v3"
2831   );
2832 }
2833 
HalfFloatRow_NEON(const uint16 * src,uint16 * dst,float scale,int width)2834 void HalfFloatRow_NEON(const uint16* src, uint16* dst, float scale, int width) {
2835   asm volatile (
2836   "1:                                          \n"
2837     MEMACCESS(0)
2838     "ld1        {v1.16b}, [%0], #16            \n"  // load 8 shorts
2839     "subs       %w2, %w2, #8                   \n"  // 8 pixels per loop
2840     "uxtl       v2.4s, v1.4h                   \n"  // 8 int's
2841     "uxtl2      v3.4s, v1.8h                   \n"
2842     "scvtf      v2.4s, v2.4s                   \n"  // 8 floats
2843     "scvtf      v3.4s, v3.4s                   \n"
2844     "fmul       v2.4s, v2.4s, %3.s[0]          \n"  // adjust exponent
2845     "fmul       v3.4s, v3.4s, %3.s[0]          \n"
2846     "uqshrn     v1.4h, v2.4s, #13              \n"  // isolate halffloat
2847     "uqshrn2    v1.8h, v3.4s, #13              \n"
2848    MEMACCESS(1)
2849     "st1        {v1.16b}, [%1], #16            \n"  // store 8 shorts
2850     "b.gt       1b                             \n"
2851   : "+r"(src),    // %0
2852     "+r"(dst),    // %1
2853     "+r"(width)   // %2
2854   : "w"(scale * 1.9259299444e-34f)    // %3
2855   : "cc", "memory", "v1", "v2", "v3"
2856   );
2857 }
2858 
2859 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
2860 
2861 #ifdef __cplusplus
2862 }  // extern "C"
2863 }  // namespace libyuv
2864 #endif
2865