1 /*
2  * Copyright 2012 The Android Open Source Project
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 #include "SkBlitRow_opts_arm_neon.h"
9 
10 #include "SkBlitMask.h"
11 #include "SkBlitRow.h"
12 #include "SkColorData.h"
13 #include "SkDither.h"
14 #include "SkMathPriv.h"
15 #include "SkUtils.h"
16 
17 #include "SkColor_opts_neon.h"
18 #include <arm_neon.h>
19 
/* NEON implementation of S32_Blend_BlitRow32().
 * The portable reference version is in src/core/SkBlitRow_D32.cpp.
 *
 * Blends `count` pixels from src into dst using one constant alpha. Each
 * 8-bit channel of the result is
 *     (src * src_scale + dst * dst_scale) >> 8
 * with src_scale = SkAlpha255To256(alpha) and dst_scale = 256 - src_scale.
 * Does nothing when count <= 0.
 */
void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                              const SkPMColor* SK_RESTRICT src,
                              int count, U8CPU alpha) {
    SkASSERT(alpha <= 255);

    if (count <= 0) {
        return;
    }

    const uint16_t src_scale = SkAlpha255To256(alpha);
    const uint16_t dst_scale = 256 - src_scale;

    // Both scale vectors are loop-invariant. The source scale is kept in
    // 16-bit lanes; the destination scale is used as an 8-bit operand of the
    // widening multiply.
    const uint16x8_t src_scale_x8 = vdupq_n_u16(src_scale);
    const uint8x8_t dst_scale_x8 = vdup_n_u8(dst_scale);

    /* Bulk path: two pixels (eight channel bytes) per iteration.
     *
     * Prefetching src+32 / dst+32 here was measured as a big win for count
     * values > 64 on an A9 (Pandaboard), but it hurt by 10% for count = 4 and
     * also hurt a little (<5%) on an A15, so no prefetch is issued.
     */
    for (; count >= 2; count -= 2, src += 2, dst += 2) {
        const uint8x8_t src_px = vreinterpret_u8_u32(vld1_u32(src));
        const uint8x8_t dst_px = vreinterpret_u8_u32(vld1_u32(dst));

        // Widen the source to 16 bits, then scale it.
        const uint16x8_t src_scaled = vmulq_u16(vmovl_u8(src_px), src_scale_x8);

        // Widening multiply scales the destination in one step.
        const uint16x8_t dst_scaled = vmull_u8(dst_px, dst_scale_x8);

        // Sum both contributions and narrow back to 8 bits with >> 8.
        const uint8x8_t blended = vshrn_n_u16(vaddq_u16(dst_scaled, src_scaled), 8);

        vst1_u32(dst, vreinterpret_u32_u8(blended));
    }

    // Tail: one pixel left. It is processed in lane 0 only; lane 1 is zero on
    // load and is never stored.
    if (count == 1) {
        const uint32x2_t zero = vdup_n_u32(0);
        const uint8x8_t src_px = vreinterpret_u8_u32(vld1_lane_u32(src, zero, 0));
        const uint8x8_t dst_px = vreinterpret_u8_u32(vld1_lane_u32(dst, zero, 0));

        const uint16x8_t src_scaled = vmulq_u16(vmovl_u8(src_px), src_scale_x8);
        const uint16x8_t dst_scaled = vmull_u8(dst_px, dst_scale_x8);
        const uint8x8_t blended = vshrn_n_u16(vaddq_u16(dst_scaled, src_scaled), 8);

        vst1_lane_u32(dst, vreinterpret_u32_u8(blended), 0);
    }
}
88 
89 #ifdef SK_CPU_ARM32
/* NEON blend of src over dst using both a constant alpha and each source
 * pixel's own alpha (byte 3 of the pixel). Only built for SK_CPU_ARM32.
 *
 * For every pixel:
 *     src_scale = alpha256 = SkAlpha255To256(alpha)
 *     dst_scale = SkAlphaMulInv256(src_alpha, alpha256)
 *     result    = (src * src_scale + dst * dst_scale) >> 8   (per channel)
 *
 * Requires alpha < 255 (asserted); the vector dst_scale computation below
 * depends on alpha256 <= 255. Does nothing when count <= 0.
 */
void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                         const SkPMColor* SK_RESTRICT src,
                         int count, U8CPU alpha) {

    SkASSERT(255 > alpha);

    if (count <= 0) {
        return;
    }

    const unsigned alpha256 = SkAlpha255To256(alpha);

    // Peel off one pixel when count is odd so the vector loop only ever sees
    // whole pairs.
    if (count & 1) {
        const uint32x2_t zero = vdup_n_u32(0);
        const uint8x8_t src_px = vreinterpret_u8_u32(vld1_lane_u32(src, zero, 0));
        const uint8x8_t dst_px = vreinterpret_u8_u32(vld1_lane_u32(dst, zero, 0));

        // The destination scale is derived from the source pixel's alpha byte.
        const unsigned src_alpha = vget_lane_u8(src_px, 3);
        const unsigned dst_scale = SkAlphaMulInv256(src_alpha, alpha256);

        // Widen both pixels to 16 bits and apply their scalar scales.
        const uint16x8_t src_scaled = vmulq_n_u16(vmovl_u8(src_px), alpha256);
        const uint16x8_t dst_scaled = vmulq_n_u16(vmovl_u8(dst_px), dst_scale);

        // Sum and narrow back to 8 bits with >> 8; only lane 0 is stored.
        const uint8x8_t blended = vshrn_n_u16(vaddq_u16(dst_scaled, src_scaled), 8);
        vst1_lane_u32(dst, vreinterpret_u32_u8(blended), 0);

        ++dst;
        ++src;
        --count;
    }

    // Table-lookup indices that replicate each pixel's alpha byte (byte 3 of
    // pixel 0, byte 7 of pixel 1) across that pixel's four channel lanes.
    static const uint8_t alpha_lane_indices[] = {3,3,3,3,7,7,7,7};
    const uint8x8_t alpha_lanes = vld1_u8(alpha_lane_indices);

    // Loop-invariant vectors.
    const uint16x8_t src_scale_x8 = vdupq_n_u16(alpha256);
    const uint16x8_t approx_base = vdupq_n_u16(0xFF00);
    const uint16x8_t ones = vdupq_n_u16(1);

    // count is even (possibly zero) here; handle two pixels per iteration.
    for (; count > 0; count -= 2, src += 2, dst += 2) {
        __builtin_prefetch(src+32);
        __builtin_prefetch(dst+32);

        const uint8x8_t src_px = vreinterpret_u8_u32(vld1_u32(src));
        const uint8x8_t dst_px = vreinterpret_u8_u32(vld1_u32(dst));

        // Per-lane source alpha, widened to 16 bits.
        const uint16x8_t src_alpha_x8 = vmovl_u8(vtbl1_u8(src_px, alpha_lanes));

        // Vector form of SkAlphaMulInv256(src_alpha, alpha256).
        // A 16-bit lane would overflow if 0xFFFF were used here, so an
        // approximation with 0xFF00 that is off by 1 is computed, and 1 is
        // added back afterwards to get the correct value.
        // This is valid if alpha256 <= 255.
        uint16x8_t dst_scale_x8 = vmlsq_u16(approx_base, src_alpha_x8, src_scale_x8);
        dst_scale_x8 = vsraq_n_u16(dst_scale_x8, dst_scale_x8, 8);
        dst_scale_x8 = vsraq_n_u16(ones, dst_scale_x8, 8);

        // Widen and scale both pixels.
        const uint16x8_t src_scaled = vmulq_u16(vmovl_u8(src_px), src_scale_x8);
        const uint16x8_t dst_scaled = vmulq_u16(vmovl_u8(dst_px), dst_scale_x8);

        // Sum and narrow back to 8 bits with >> 8.
        const uint8x8_t blended = vshrn_n_u16(vaddq_u16(dst_scaled, src_scaled), 8);
        vst1_u32(dst, vreinterpret_u32_u8(blended));
    }
}
186 
187 ///////////////////////////////////////////////////////////////////////////////
188 
189 #endif // #ifdef SK_CPU_ARM32
190 
191 ///////////////////////////////////////////////////////////////////////////////
192 
// Table of NEON 32-bit row-blit procs exported by this file, one slot per
// blend mode named in the per-entry comments. A nullptr slot means this file
// supplies no NEON routine for that mode.
// NOTE(review): the entry order presumably has to match how SkBlitRow indexes
// its Proc32 tables, and callers presumably fall back to another
// implementation for nullptr slots — confirm before reordering or adding
// entries.
const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
    nullptr,   // S32_Opaque: no NEON routine in this file
    S32_Blend_BlitRow32_neon,        // S32_Blend
    nullptr,  // Ported to SkOpts
#ifdef SK_CPU_ARM32
    S32A_Blend_BlitRow32_neon        // S32A_Blend
#else
    nullptr                          // S32A_Blend: routine is only compiled for SK_CPU_ARM32
#endif
};
203