1 /*
2  * Copyright 2015 Google Inc.
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 #include "SkBlitMask.h"
9 #include "SkColor_opts_neon.h"
10 
SkBlitLCD16OpaqueRow_neon(SkPMColor dst[],const uint16_t src[],SkColor color,int width,SkPMColor opaqueDst)11 void SkBlitLCD16OpaqueRow_neon(SkPMColor dst[], const uint16_t src[],
12                                         SkColor color, int width,
13                                         SkPMColor opaqueDst) {
14     int colR = SkColorGetR(color);
15     int colG = SkColorGetG(color);
16     int colB = SkColorGetB(color);
17 
18     uint8x8_t vcolR, vcolG, vcolB;
19     uint8x8_t vopqDstA, vopqDstR, vopqDstG, vopqDstB;
20 
21     if (width >= 8) {
22         vcolR = vdup_n_u8(colR);
23         vcolG = vdup_n_u8(colG);
24         vcolB = vdup_n_u8(colB);
25         vopqDstA = vdup_n_u8(SkGetPackedA32(opaqueDst));
26         vopqDstR = vdup_n_u8(SkGetPackedR32(opaqueDst));
27         vopqDstG = vdup_n_u8(SkGetPackedG32(opaqueDst));
28         vopqDstB = vdup_n_u8(SkGetPackedB32(opaqueDst));
29     }
30 
31     while (width >= 8) {
32         uint8x8x4_t vdst;
33         uint16x8_t vmask;
34         uint16x8_t vmaskR, vmaskG, vmaskB;
35         uint8x8_t vsel_trans, vsel_opq;
36 
37         vdst = vld4_u8((uint8_t*)dst);
38         vmask = vld1q_u16(src);
39 
40         // Prepare compare masks
41         vsel_trans = vmovn_u16(vceqq_u16(vmask, vdupq_n_u16(0)));
42         vsel_opq = vmovn_u16(vceqq_u16(vmask, vdupq_n_u16(0xFFFF)));
43 
44         // Get all the color masks on 5 bits
45         vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT);
46         vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS),
47                              SK_B16_BITS + SK_R16_BITS + 1);
48         vmaskB = vmask & vdupq_n_u16(SK_B16_MASK);
49 
50         // Upscale to 0..32
51         vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4);
52         vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4);
53         vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4);
54 
55         vdst.val[NEON_A] = vbsl_u8(vsel_trans, vdst.val[NEON_A], vdup_n_u8(0xFF));
56         vdst.val[NEON_A] = vbsl_u8(vsel_opq, vopqDstA, vdst.val[NEON_A]);
57 
58         vdst.val[NEON_R] = SkBlend32_neon8(vcolR, vdst.val[NEON_R], vmaskR);
59         vdst.val[NEON_G] = SkBlend32_neon8(vcolG, vdst.val[NEON_G], vmaskG);
60         vdst.val[NEON_B] = SkBlend32_neon8(vcolB, vdst.val[NEON_B], vmaskB);
61 
62         vdst.val[NEON_R] = vbsl_u8(vsel_opq, vopqDstR, vdst.val[NEON_R]);
63         vdst.val[NEON_G] = vbsl_u8(vsel_opq, vopqDstG, vdst.val[NEON_G]);
64         vdst.val[NEON_B] = vbsl_u8(vsel_opq, vopqDstB, vdst.val[NEON_B]);
65 
66         vst4_u8((uint8_t*)dst, vdst);
67 
68         dst += 8;
69         src += 8;
70         width -= 8;
71     }
72 
73     // Leftovers
74     for (int i = 0; i < width; i++) {
75         dst[i] = SkBlendLCD16Opaque(colR, colG, colB, dst[i], src[i],
76                                     opaqueDst);
77     }
78 }
79 
SkBlitLCD16Row_neon(SkPMColor dst[],const uint16_t src[],SkColor color,int width,SkPMColor)80 void SkBlitLCD16Row_neon(SkPMColor dst[], const uint16_t src[],
81                                    SkColor color, int width, SkPMColor) {
82     int colA = SkColorGetA(color);
83     int colR = SkColorGetR(color);
84     int colG = SkColorGetG(color);
85     int colB = SkColorGetB(color);
86 
87     colA = SkAlpha255To256(colA);
88 
89     uint8x8_t vcolR, vcolG, vcolB;
90     uint16x8_t vcolA;
91 
92     if (width >= 8) {
93         vcolA = vdupq_n_u16(colA);
94         vcolR = vdup_n_u8(colR);
95         vcolG = vdup_n_u8(colG);
96         vcolB = vdup_n_u8(colB);
97     }
98 
99     while (width >= 8) {
100         uint8x8x4_t vdst;
101         uint16x8_t vmask;
102         uint16x8_t vmaskR, vmaskG, vmaskB;
103 
104         vdst = vld4_u8((uint8_t*)dst);
105         vmask = vld1q_u16(src);
106 
107         // Get all the color masks on 5 bits
108         vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT);
109         vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS),
110                              SK_B16_BITS + SK_R16_BITS + 1);
111         vmaskB = vmask & vdupq_n_u16(SK_B16_MASK);
112 
113         // Upscale to 0..32
114         vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4);
115         vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4);
116         vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4);
117 
118         vmaskR = vshrq_n_u16(vmaskR * vcolA, 8);
119         vmaskG = vshrq_n_u16(vmaskG * vcolA, 8);
120         vmaskB = vshrq_n_u16(vmaskB * vcolA, 8);
121 
122         vdst.val[NEON_A] = vdup_n_u8(0xFF);
123         vdst.val[NEON_R] = SkBlend32_neon8(vcolR, vdst.val[NEON_R], vmaskR);
124         vdst.val[NEON_G] = SkBlend32_neon8(vcolG, vdst.val[NEON_G], vmaskG);
125         vdst.val[NEON_B] = SkBlend32_neon8(vcolB, vdst.val[NEON_B], vmaskB);
126 
127         vst4_u8((uint8_t*)dst, vdst);
128 
129         dst += 8;
130         src += 8;
131         width -= 8;
132     }
133 
134     for (int i = 0; i < width; i++) {
135         dst[i] = SkBlendLCD16(colA, colR, colG, colB, dst[i], src[i]);
136     }
137 }
138 
139 #define LOAD_LANE_16(reg, n) \
140     reg = vld1q_lane_u16(device, reg, n); \
141     device = (uint16_t*)((char*)device + deviceRB);
142 
143 #define STORE_LANE_16(reg, n) \
144     vst1_lane_u16(dst, reg, n); \
145     dst = (uint16_t*)((char*)dst + deviceRB);
146 
SkRGB16BlitterBlitV_neon(uint16_t * device,int height,size_t deviceRB,unsigned scale,uint32_t src32)147 void SkRGB16BlitterBlitV_neon(uint16_t* device,
148                               int height,
149                               size_t deviceRB,
150                               unsigned scale,
151                               uint32_t src32) {
152     if (height >= 8)
153     {
154         uint16_t* dst = device;
155 
156         // prepare constants
157         uint16x8_t vdev = vdupq_n_u16(0);
158         uint16x8_t vmaskq_g16 = vdupq_n_u16(SK_G16_MASK_IN_PLACE);
159         uint16x8_t vmaskq_ng16 = vdupq_n_u16(~SK_G16_MASK_IN_PLACE);
160         uint32x4_t vsrc32 = vdupq_n_u32(src32);
161         uint32x4_t vscale5 = vdupq_n_u32((uint32_t)scale);
162 
163         while (height >= 8){
164             LOAD_LANE_16(vdev, 0)
165             LOAD_LANE_16(vdev, 1)
166             LOAD_LANE_16(vdev, 2)
167             LOAD_LANE_16(vdev, 3)
168             LOAD_LANE_16(vdev, 4)
169             LOAD_LANE_16(vdev, 5)
170             LOAD_LANE_16(vdev, 6)
171             LOAD_LANE_16(vdev, 7)
172 
173             // Expand_rgb_16
174             uint16x8x2_t vdst = vzipq_u16((vdev & vmaskq_ng16), (vdev & vmaskq_g16));
175             uint32x4_t vdst32_lo = vmulq_u32(vreinterpretq_u32_u16(vdst.val[0]), vscale5);
176             uint32x4_t vdst32_hi = vmulq_u32(vreinterpretq_u32_u16(vdst.val[1]), vscale5);
177 
178             // Compact_rgb_16
179             vdst32_lo = vaddq_u32(vdst32_lo, vsrc32);
180             vdst32_hi = vaddq_u32(vdst32_hi, vsrc32);
181             vdst32_lo = vshrq_n_u32(vdst32_lo, 5);
182             vdst32_hi = vshrq_n_u32(vdst32_hi, 5);
183 
184             uint16x4_t vtmp_lo = vmovn_u32(vdst32_lo) & vget_low_u16(vmaskq_ng16);
185             uint16x4_t vtmp_hi = vshrn_n_u32(vdst32_lo, 16) & vget_low_u16(vmaskq_g16);
186             uint16x4_t vdst16_lo = vorr_u16(vtmp_lo, vtmp_hi);
187             vtmp_lo = vmovn_u32(vdst32_hi) & vget_low_u16(vmaskq_ng16);
188             vtmp_hi = vshrn_n_u32(vdst32_hi, 16) & vget_low_u16(vmaskq_g16);
189             uint16x4_t vdst16_hi = vorr_u16(vtmp_lo, vtmp_hi);
190 
191             STORE_LANE_16(vdst16_lo, 0)
192             STORE_LANE_16(vdst16_lo, 1)
193             STORE_LANE_16(vdst16_lo, 2)
194             STORE_LANE_16(vdst16_lo, 3)
195             STORE_LANE_16(vdst16_hi, 0)
196             STORE_LANE_16(vdst16_hi, 1)
197             STORE_LANE_16(vdst16_hi, 2)
198             STORE_LANE_16(vdst16_hi, 3)
199             height -= 8;
200         }
201     }
202     while (height != 0){
203         uint32_t dst32 = SkExpand_rgb_16(*device) * scale;
204         *device = SkCompact_rgb_16((src32 + dst32) >> 5);
205         device = (uint16_t*)((char*)device + deviceRB);
206         height--;
207     }
208 }
209 
210 #undef LOAD_LANE_16
211 #undef STORE_LANE_16
212