1 /*
2 * Copyright 2015 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8 #include "SkBlitMask.h"
9 #include "SkColor_opts_neon.h"
10
SkBlitLCD16OpaqueRow_neon(SkPMColor dst[],const uint16_t src[],SkColor color,int width,SkPMColor opaqueDst)11 void SkBlitLCD16OpaqueRow_neon(SkPMColor dst[], const uint16_t src[],
12 SkColor color, int width,
13 SkPMColor opaqueDst) {
14 int colR = SkColorGetR(color);
15 int colG = SkColorGetG(color);
16 int colB = SkColorGetB(color);
17
18 uint8x8_t vcolR = vdup_n_u8(colR);
19 uint8x8_t vcolG = vdup_n_u8(colG);
20 uint8x8_t vcolB = vdup_n_u8(colB);
21 uint8x8_t vopqDstA = vdup_n_u8(SkGetPackedA32(opaqueDst));
22 uint8x8_t vopqDstR = vdup_n_u8(SkGetPackedR32(opaqueDst));
23 uint8x8_t vopqDstG = vdup_n_u8(SkGetPackedG32(opaqueDst));
24 uint8x8_t vopqDstB = vdup_n_u8(SkGetPackedB32(opaqueDst));
25
26 while (width >= 8) {
27 uint8x8x4_t vdst;
28 uint16x8_t vmask;
29 uint16x8_t vmaskR, vmaskG, vmaskB;
30 uint8x8_t vsel_trans, vsel_opq;
31
32 vdst = vld4_u8((uint8_t*)dst);
33 vmask = vld1q_u16(src);
34
35 // Prepare compare masks
36 vsel_trans = vmovn_u16(vceqq_u16(vmask, vdupq_n_u16(0)));
37 vsel_opq = vmovn_u16(vceqq_u16(vmask, vdupq_n_u16(0xFFFF)));
38
39 // Get all the color masks on 5 bits
40 vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT);
41 vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS),
42 SK_B16_BITS + SK_R16_BITS + 1);
43 vmaskB = vmask & vdupq_n_u16(SK_B16_MASK);
44
45 // Upscale to 0..32
46 vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4);
47 vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4);
48 vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4);
49
50 vdst.val[NEON_A] = vbsl_u8(vsel_trans, vdst.val[NEON_A], vdup_n_u8(0xFF));
51 vdst.val[NEON_A] = vbsl_u8(vsel_opq, vopqDstA, vdst.val[NEON_A]);
52
53 vdst.val[NEON_R] = SkBlend32_neon8(vcolR, vdst.val[NEON_R], vmaskR);
54 vdst.val[NEON_G] = SkBlend32_neon8(vcolG, vdst.val[NEON_G], vmaskG);
55 vdst.val[NEON_B] = SkBlend32_neon8(vcolB, vdst.val[NEON_B], vmaskB);
56
57 vdst.val[NEON_R] = vbsl_u8(vsel_opq, vopqDstR, vdst.val[NEON_R]);
58 vdst.val[NEON_G] = vbsl_u8(vsel_opq, vopqDstG, vdst.val[NEON_G]);
59 vdst.val[NEON_B] = vbsl_u8(vsel_opq, vopqDstB, vdst.val[NEON_B]);
60
61 vst4_u8((uint8_t*)dst, vdst);
62
63 dst += 8;
64 src += 8;
65 width -= 8;
66 }
67
68 // Leftovers
69 for (int i = 0; i < width; i++) {
70 dst[i] = SkBlendLCD16Opaque(colR, colG, colB, dst[i], src[i],
71 opaqueDst);
72 }
73 }
74
SkBlitLCD16Row_neon(SkPMColor dst[],const uint16_t src[],SkColor color,int width,SkPMColor)75 void SkBlitLCD16Row_neon(SkPMColor dst[], const uint16_t src[],
76 SkColor color, int width, SkPMColor) {
77 int colA = SkColorGetA(color);
78 int colR = SkColorGetR(color);
79 int colG = SkColorGetG(color);
80 int colB = SkColorGetB(color);
81
82 colA = SkAlpha255To256(colA);
83
84 uint16x8_t vcolA = vdupq_n_u16(colA);
85 uint8x8_t vcolR = vdup_n_u8(colR);
86 uint8x8_t vcolG = vdup_n_u8(colG);
87 uint8x8_t vcolB = vdup_n_u8(colB);
88
89 while (width >= 8) {
90 uint8x8x4_t vdst;
91 uint16x8_t vmask;
92 uint16x8_t vmaskR, vmaskG, vmaskB;
93
94 vdst = vld4_u8((uint8_t*)dst);
95 vmask = vld1q_u16(src);
96
97 // Get all the color masks on 5 bits
98 vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT);
99 vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS),
100 SK_B16_BITS + SK_R16_BITS + 1);
101 vmaskB = vmask & vdupq_n_u16(SK_B16_MASK);
102
103 // Upscale to 0..32
104 vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4);
105 vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4);
106 vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4);
107
108 vmaskR = vshrq_n_u16(vmaskR * vcolA, 8);
109 vmaskG = vshrq_n_u16(vmaskG * vcolA, 8);
110 vmaskB = vshrq_n_u16(vmaskB * vcolA, 8);
111
112 vdst.val[NEON_A] = vdup_n_u8(0xFF);
113 vdst.val[NEON_R] = SkBlend32_neon8(vcolR, vdst.val[NEON_R], vmaskR);
114 vdst.val[NEON_G] = SkBlend32_neon8(vcolG, vdst.val[NEON_G], vmaskG);
115 vdst.val[NEON_B] = SkBlend32_neon8(vcolB, vdst.val[NEON_B], vmaskB);
116
117 vst4_u8((uint8_t*)dst, vdst);
118
119 dst += 8;
120 src += 8;
121 width -= 8;
122 }
123
124 for (int i = 0; i < width; i++) {
125 dst[i] = SkBlendLCD16(colA, colR, colG, colB, dst[i], src[i]);
126 }
127 }
128
129 #define LOAD_LANE_16(reg, n) \
130 reg = vld1q_lane_u16(device, reg, n); \
131 device = (uint16_t*)((char*)device + deviceRB);
132
133 #define STORE_LANE_16(reg, n) \
134 vst1_lane_u16(dst, reg, n); \
135 dst = (uint16_t*)((char*)dst + deviceRB);
136
SkRGB16BlitterBlitV_neon(uint16_t * device,int height,size_t deviceRB,unsigned scale,uint32_t src32)137 void SkRGB16BlitterBlitV_neon(uint16_t* device,
138 int height,
139 size_t deviceRB,
140 unsigned scale,
141 uint32_t src32) {
142 if (height >= 8)
143 {
144 uint16_t* dst = device;
145
146 // prepare constants
147 uint16x8_t vdev = vdupq_n_u16(0);
148 uint16x8_t vmaskq_g16 = vdupq_n_u16(SK_G16_MASK_IN_PLACE);
149 uint16x8_t vmaskq_ng16 = vdupq_n_u16(~SK_G16_MASK_IN_PLACE);
150 uint32x4_t vsrc32 = vdupq_n_u32(src32);
151 uint32x4_t vscale5 = vdupq_n_u32((uint32_t)scale);
152
153 while (height >= 8){
154 LOAD_LANE_16(vdev, 0)
155 LOAD_LANE_16(vdev, 1)
156 LOAD_LANE_16(vdev, 2)
157 LOAD_LANE_16(vdev, 3)
158 LOAD_LANE_16(vdev, 4)
159 LOAD_LANE_16(vdev, 5)
160 LOAD_LANE_16(vdev, 6)
161 LOAD_LANE_16(vdev, 7)
162
163 // Expand_rgb_16
164 uint16x8x2_t vdst = vzipq_u16((vdev & vmaskq_ng16), (vdev & vmaskq_g16));
165 uint32x4_t vdst32_lo = vmulq_u32(vreinterpretq_u32_u16(vdst.val[0]), vscale5);
166 uint32x4_t vdst32_hi = vmulq_u32(vreinterpretq_u32_u16(vdst.val[1]), vscale5);
167
168 // Compact_rgb_16
169 vdst32_lo = vaddq_u32(vdst32_lo, vsrc32);
170 vdst32_hi = vaddq_u32(vdst32_hi, vsrc32);
171 vdst32_lo = vshrq_n_u32(vdst32_lo, 5);
172 vdst32_hi = vshrq_n_u32(vdst32_hi, 5);
173
174 uint16x4_t vtmp_lo = vmovn_u32(vdst32_lo) & vget_low_u16(vmaskq_ng16);
175 uint16x4_t vtmp_hi = vshrn_n_u32(vdst32_lo, 16) & vget_low_u16(vmaskq_g16);
176 uint16x4_t vdst16_lo = vorr_u16(vtmp_lo, vtmp_hi);
177 vtmp_lo = vmovn_u32(vdst32_hi) & vget_low_u16(vmaskq_ng16);
178 vtmp_hi = vshrn_n_u32(vdst32_hi, 16) & vget_low_u16(vmaskq_g16);
179 uint16x4_t vdst16_hi = vorr_u16(vtmp_lo, vtmp_hi);
180
181 STORE_LANE_16(vdst16_lo, 0)
182 STORE_LANE_16(vdst16_lo, 1)
183 STORE_LANE_16(vdst16_lo, 2)
184 STORE_LANE_16(vdst16_lo, 3)
185 STORE_LANE_16(vdst16_hi, 0)
186 STORE_LANE_16(vdst16_hi, 1)
187 STORE_LANE_16(vdst16_hi, 2)
188 STORE_LANE_16(vdst16_hi, 3)
189 height -= 8;
190 }
191 }
192 while (height != 0){
193 uint32_t dst32 = SkExpand_rgb_16(*device) * scale;
194 *device = SkCompact_rgb_16((src32 + dst32) >> 5);
195 device = (uint16_t*)((char*)device + deviceRB);
196 height--;
197 }
198 }
199
200 #undef LOAD_LANE_16
201 #undef STORE_LANE_16
202