1 /*
2  * Copyright 2015 Google Inc.
3  *
4  * Use of this source code is governed by a BSD-style license that can be
5  * found in the LICENSE file.
6  */
7 
8 #include "SkBlitMask.h"
9 #include "SkColor_opts_neon.h"
10 
SkBlitLCD16OpaqueRow_neon(SkPMColor dst[],const uint16_t src[],SkColor color,int width,SkPMColor opaqueDst)11 void SkBlitLCD16OpaqueRow_neon(SkPMColor dst[], const uint16_t src[],
12                                         SkColor color, int width,
13                                         SkPMColor opaqueDst) {
14     int colR = SkColorGetR(color);
15     int colG = SkColorGetG(color);
16     int colB = SkColorGetB(color);
17 
18     uint8x8_t vcolR = vdup_n_u8(colR);
19     uint8x8_t vcolG = vdup_n_u8(colG);
20     uint8x8_t vcolB = vdup_n_u8(colB);
21     uint8x8_t vopqDstA = vdup_n_u8(SkGetPackedA32(opaqueDst));
22     uint8x8_t vopqDstR = vdup_n_u8(SkGetPackedR32(opaqueDst));
23     uint8x8_t vopqDstG = vdup_n_u8(SkGetPackedG32(opaqueDst));
24     uint8x8_t vopqDstB = vdup_n_u8(SkGetPackedB32(opaqueDst));
25 
26     while (width >= 8) {
27         uint8x8x4_t vdst;
28         uint16x8_t vmask;
29         uint16x8_t vmaskR, vmaskG, vmaskB;
30         uint8x8_t vsel_trans, vsel_opq;
31 
32         vdst = vld4_u8((uint8_t*)dst);
33         vmask = vld1q_u16(src);
34 
35         // Prepare compare masks
36         vsel_trans = vmovn_u16(vceqq_u16(vmask, vdupq_n_u16(0)));
37         vsel_opq = vmovn_u16(vceqq_u16(vmask, vdupq_n_u16(0xFFFF)));
38 
39         // Get all the color masks on 5 bits
40         vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT);
41         vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS),
42                              SK_B16_BITS + SK_R16_BITS + 1);
43         vmaskB = vmask & vdupq_n_u16(SK_B16_MASK);
44 
45         // Upscale to 0..32
46         vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4);
47         vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4);
48         vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4);
49 
50         vdst.val[NEON_A] = vbsl_u8(vsel_trans, vdst.val[NEON_A], vdup_n_u8(0xFF));
51         vdst.val[NEON_A] = vbsl_u8(vsel_opq, vopqDstA, vdst.val[NEON_A]);
52 
53         vdst.val[NEON_R] = SkBlend32_neon8(vcolR, vdst.val[NEON_R], vmaskR);
54         vdst.val[NEON_G] = SkBlend32_neon8(vcolG, vdst.val[NEON_G], vmaskG);
55         vdst.val[NEON_B] = SkBlend32_neon8(vcolB, vdst.val[NEON_B], vmaskB);
56 
57         vdst.val[NEON_R] = vbsl_u8(vsel_opq, vopqDstR, vdst.val[NEON_R]);
58         vdst.val[NEON_G] = vbsl_u8(vsel_opq, vopqDstG, vdst.val[NEON_G]);
59         vdst.val[NEON_B] = vbsl_u8(vsel_opq, vopqDstB, vdst.val[NEON_B]);
60 
61         vst4_u8((uint8_t*)dst, vdst);
62 
63         dst += 8;
64         src += 8;
65         width -= 8;
66     }
67 
68     // Leftovers
69     for (int i = 0; i < width; i++) {
70         dst[i] = SkBlendLCD16Opaque(colR, colG, colB, dst[i], src[i],
71                                     opaqueDst);
72     }
73 }
74 
SkBlitLCD16Row_neon(SkPMColor dst[],const uint16_t src[],SkColor color,int width,SkPMColor)75 void SkBlitLCD16Row_neon(SkPMColor dst[], const uint16_t src[],
76                                    SkColor color, int width, SkPMColor) {
77     int colA = SkColorGetA(color);
78     int colR = SkColorGetR(color);
79     int colG = SkColorGetG(color);
80     int colB = SkColorGetB(color);
81 
82     colA = SkAlpha255To256(colA);
83 
84     uint16x8_t vcolA = vdupq_n_u16(colA);
85     uint8x8_t vcolR = vdup_n_u8(colR);
86     uint8x8_t vcolG = vdup_n_u8(colG);
87     uint8x8_t vcolB = vdup_n_u8(colB);
88 
89     while (width >= 8) {
90         uint8x8x4_t vdst;
91         uint16x8_t vmask;
92         uint16x8_t vmaskR, vmaskG, vmaskB;
93 
94         vdst = vld4_u8((uint8_t*)dst);
95         vmask = vld1q_u16(src);
96 
97         // Get all the color masks on 5 bits
98         vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT);
99         vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS),
100                              SK_B16_BITS + SK_R16_BITS + 1);
101         vmaskB = vmask & vdupq_n_u16(SK_B16_MASK);
102 
103         // Upscale to 0..32
104         vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4);
105         vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4);
106         vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4);
107 
108         vmaskR = vshrq_n_u16(vmaskR * vcolA, 8);
109         vmaskG = vshrq_n_u16(vmaskG * vcolA, 8);
110         vmaskB = vshrq_n_u16(vmaskB * vcolA, 8);
111 
112         vdst.val[NEON_A] = vdup_n_u8(0xFF);
113         vdst.val[NEON_R] = SkBlend32_neon8(vcolR, vdst.val[NEON_R], vmaskR);
114         vdst.val[NEON_G] = SkBlend32_neon8(vcolG, vdst.val[NEON_G], vmaskG);
115         vdst.val[NEON_B] = SkBlend32_neon8(vcolB, vdst.val[NEON_B], vmaskB);
116 
117         vst4_u8((uint8_t*)dst, vdst);
118 
119         dst += 8;
120         src += 8;
121         width -= 8;
122     }
123 
124     for (int i = 0; i < width; i++) {
125         dst[i] = SkBlendLCD16(colA, colR, colG, colB, dst[i], src[i]);
126     }
127 }
128 
129 #define LOAD_LANE_16(reg, n) \
130     reg = vld1q_lane_u16(device, reg, n); \
131     device = (uint16_t*)((char*)device + deviceRB);
132 
133 #define STORE_LANE_16(reg, n) \
134     vst1_lane_u16(dst, reg, n); \
135     dst = (uint16_t*)((char*)dst + deviceRB);
136 
SkRGB16BlitterBlitV_neon(uint16_t * device,int height,size_t deviceRB,unsigned scale,uint32_t src32)137 void SkRGB16BlitterBlitV_neon(uint16_t* device,
138                               int height,
139                               size_t deviceRB,
140                               unsigned scale,
141                               uint32_t src32) {
142     if (height >= 8)
143     {
144         uint16_t* dst = device;
145 
146         // prepare constants
147         uint16x8_t vdev = vdupq_n_u16(0);
148         uint16x8_t vmaskq_g16 = vdupq_n_u16(SK_G16_MASK_IN_PLACE);
149         uint16x8_t vmaskq_ng16 = vdupq_n_u16(~SK_G16_MASK_IN_PLACE);
150         uint32x4_t vsrc32 = vdupq_n_u32(src32);
151         uint32x4_t vscale5 = vdupq_n_u32((uint32_t)scale);
152 
153         while (height >= 8){
154             LOAD_LANE_16(vdev, 0)
155             LOAD_LANE_16(vdev, 1)
156             LOAD_LANE_16(vdev, 2)
157             LOAD_LANE_16(vdev, 3)
158             LOAD_LANE_16(vdev, 4)
159             LOAD_LANE_16(vdev, 5)
160             LOAD_LANE_16(vdev, 6)
161             LOAD_LANE_16(vdev, 7)
162 
163             // Expand_rgb_16
164             uint16x8x2_t vdst = vzipq_u16((vdev & vmaskq_ng16), (vdev & vmaskq_g16));
165             uint32x4_t vdst32_lo = vmulq_u32(vreinterpretq_u32_u16(vdst.val[0]), vscale5);
166             uint32x4_t vdst32_hi = vmulq_u32(vreinterpretq_u32_u16(vdst.val[1]), vscale5);
167 
168             // Compact_rgb_16
169             vdst32_lo = vaddq_u32(vdst32_lo, vsrc32);
170             vdst32_hi = vaddq_u32(vdst32_hi, vsrc32);
171             vdst32_lo = vshrq_n_u32(vdst32_lo, 5);
172             vdst32_hi = vshrq_n_u32(vdst32_hi, 5);
173 
174             uint16x4_t vtmp_lo = vmovn_u32(vdst32_lo) & vget_low_u16(vmaskq_ng16);
175             uint16x4_t vtmp_hi = vshrn_n_u32(vdst32_lo, 16) & vget_low_u16(vmaskq_g16);
176             uint16x4_t vdst16_lo = vorr_u16(vtmp_lo, vtmp_hi);
177             vtmp_lo = vmovn_u32(vdst32_hi) & vget_low_u16(vmaskq_ng16);
178             vtmp_hi = vshrn_n_u32(vdst32_hi, 16) & vget_low_u16(vmaskq_g16);
179             uint16x4_t vdst16_hi = vorr_u16(vtmp_lo, vtmp_hi);
180 
181             STORE_LANE_16(vdst16_lo, 0)
182             STORE_LANE_16(vdst16_lo, 1)
183             STORE_LANE_16(vdst16_lo, 2)
184             STORE_LANE_16(vdst16_lo, 3)
185             STORE_LANE_16(vdst16_hi, 0)
186             STORE_LANE_16(vdst16_hi, 1)
187             STORE_LANE_16(vdst16_hi, 2)
188             STORE_LANE_16(vdst16_hi, 3)
189             height -= 8;
190         }
191     }
192     while (height != 0){
193         uint32_t dst32 = SkExpand_rgb_16(*device) * scale;
194         *device = SkCompact_rgb_16((src32 + dst32) >> 5);
195         device = (uint16_t*)((char*)device + deviceRB);
196         height--;
197     }
198 }
199 
200 #undef LOAD_LANE_16
201 #undef STORE_LANE_16
202