1 /*
2 * jdcolext-neon.c - colorspace conversion (Arm Neon)
3 *
4 * Copyright (C) 2020, Arm Limited. All Rights Reserved.
5 * Copyright (C) 2020, D. R. Commander. All Rights Reserved.
6 *
7 * This software is provided 'as-is', without any express or implied
8 * warranty. In no event will the authors be held liable for any damages
9 * arising from the use of this software.
10 *
11 * Permission is granted to anyone to use this software for any purpose,
12 * including commercial applications, and to alter it and redistribute it
13 * freely, subject to the following restrictions:
14 *
15 * 1. The origin of this software must not be misrepresented; you must not
16 * claim that you wrote the original software. If you use this software
17 * in a product, an acknowledgment in the product documentation would be
18 * appreciated but is not required.
19 * 2. Altered source versions must be plainly marked as such, and must not be
20 * misrepresented as being the original software.
21 * 3. This notice may not be removed or altered from any source distribution.
22 */
23
24 /* This file is included by jdcolor-neon.c. */
25
26
27 /* YCbCr -> RGB conversion is defined by the following equations:
28 * R = Y + 1.40200 * (Cr - 128)
29 * G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
30 * B = Y + 1.77200 * (Cb - 128)
31 *
32 * Scaled integer constants are used to avoid floating-point arithmetic:
33 * 0.3441467 = 11277 * 2^-15
34 * 0.7141418 = 23401 * 2^-15
35 * 1.4020386 = 22971 * 2^-14
36 * 1.7720337 = 29033 * 2^-14
37 * These constants are defined in jdcolor-neon.c.
38 *
39 * To ensure correct results, rounding is used when descaling.
40 */
41
42 /* Notes on safe memory access for YCbCr -> RGB conversion routines:
43 *
44 * Input memory buffers can be safely overread up to the next multiple of
45 * ALIGN_SIZE bytes, since they are always allocated by alloc_sarray() in
46 * jmemmgr.c.
47 *
48 * The output buffer cannot safely be written beyond output_width, since
49 * output_buf points to a possibly unpadded row in the decompressed image
50 * buffer allocated by the calling program.
51 */
52
/*
 * Convert rows of planar Y/Cb/Cr samples to packed RGB-family pixels (Neon).
 *
 *   output_width  number of pixels to produce in each row
 *   input_buf     input_buf[0], input_buf[1], and input_buf[2] are the row
 *                 arrays holding Y, Cb, and Cr samples respectively
 *   input_row     index of the first row to read in each component's array;
 *                 it is advanced by one for every converted row
 *   output_buf    array of destination row pointers, consumed one per row
 *   num_rows      number of rows to convert (nothing is done if <= 0)
 *
 * The output pixel layout is selected at compile time:
 *   RGB_PIXELSIZE == 4  four bytes per pixel at offsets RGB_RED, RGB_GREEN,
 *                       RGB_BLUE, and RGB_ALPHA (alpha is always 0xFF)
 *   RGB_PIXELSIZE == 3  three bytes per pixel at offsets RGB_RED, RGB_GREEN,
 *                       and RGB_BLUE
 *   otherwise           one 16-bit 5:6:5 value per pixel (this path stores
 *                       uint16_t values, so RGB_PIXELSIZE is presumably 2
 *                       here -- confirm against the including file)
 *
 * Each row is processed as a 16-pixel main loop, then at most one 8-pixel
 * block, then a tail of 1..7 pixels.  The tail still loads 8 input samples
 * per component (the input rows are padded, see the notes on safe memory
 * access above) but stores only the valid pixels lane by lane, so nothing is
 * written beyond output_width pixels.
 */
void jsimd_ycc_rgb_convert_neon(JDIMENSION output_width, JSAMPIMAGE input_buf,
                                JDIMENSION input_row, JSAMPARRAY output_buf,
                                int num_rows)
{
  JSAMPROW outptr;
  /* Pointers to Y, Cb, and Cr data */
  JSAMPROW inptr0, inptr1, inptr2;

  /* Scaled conversion coefficients.  Lane 0 multiplies Cb and lane 1
   * multiplies Cr in the G computation; lane 2 is the R coefficient and
   * lane 3 the B coefficient.  Since lane 0 is accumulated with a plain
   * multiply while lane 1 is multiply-subtracted, lane 0 presumably holds the
   * negated 0.34414 coefficient -- confirm against jdcolor-neon.c.
   */
  const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
  const int16x8_t neg_128 = vdupq_n_s16(-128);

  while (--num_rows >= 0) {
    inptr0 = input_buf[0][input_row];
    inptr1 = input_buf[1][input_row];
    inptr2 = input_buf[2][input_row];
    input_row++;
    outptr = *output_buf++;
    /* Signed so that the block sizes can simply be subtracted from it. */
    int cols_remaining = (int)output_width;
    for (; cols_remaining >= 16; cols_remaining -= 16) {
      uint8x16_t y = vld1q_u8(inptr0);
      uint8x16_t cb = vld1q_u8(inptr1);
      uint8x16_t cr = vld1q_u8(inptr2);
      /* Subtract 128 from Cb and Cr.  The widening add is performed on the
       * unsigned view of -128; modulo 2^16 this equals the signed result.
       */
      int16x8_t cr_128_l =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
                                       vget_low_u8(cr)));
      int16x8_t cr_128_h =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
                                       vget_high_u8(cr)));
      int16x8_t cb_128_l =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
                                       vget_low_u8(cb)));
      int16x8_t cb_128_h =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
                                       vget_high_u8(cb)));
      /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
      int32x4_t g_sub_y_ll = vmull_lane_s16(vget_low_s16(cb_128_l), consts, 0);
      int32x4_t g_sub_y_lh = vmull_lane_s16(vget_high_s16(cb_128_l),
                                            consts, 0);
      int32x4_t g_sub_y_hl = vmull_lane_s16(vget_low_s16(cb_128_h), consts, 0);
      int32x4_t g_sub_y_hh = vmull_lane_s16(vget_high_s16(cb_128_h),
                                            consts, 0);
      g_sub_y_ll = vmlsl_lane_s16(g_sub_y_ll, vget_low_s16(cr_128_l),
                                  consts, 1);
      g_sub_y_lh = vmlsl_lane_s16(g_sub_y_lh, vget_high_s16(cr_128_l),
                                  consts, 1);
      g_sub_y_hl = vmlsl_lane_s16(g_sub_y_hl, vget_low_s16(cr_128_h),
                                  consts, 1);
      g_sub_y_hh = vmlsl_lane_s16(g_sub_y_hh, vget_high_s16(cr_128_h),
                                  consts, 1);
      /* Descale G components: shift right 15, round, and narrow to 16-bit. */
      int16x8_t g_sub_y_l = vcombine_s16(vrshrn_n_s32(g_sub_y_ll, 15),
                                         vrshrn_n_s32(g_sub_y_lh, 15));
      int16x8_t g_sub_y_h = vcombine_s16(vrshrn_n_s32(g_sub_y_hl, 15),
                                         vrshrn_n_s32(g_sub_y_hh, 15));
      /* Compute R-Y: 1.40200 * (Cr - 128)
       * The rounding doubling multiply-high yields (2 * a * b) >> 16; with
       * the operand pre-doubled this is a rounded (x * c) >> 14, matching the
       * 2^-14 scaling of the R and B coefficients.
       */
      int16x8_t r_sub_y_l = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128_l, 1),
                                               consts, 2);
      int16x8_t r_sub_y_h = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128_h, 1),
                                               consts, 2);
      /* Compute B-Y: 1.77200 * (Cb - 128) */
      int16x8_t b_sub_y_l = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128_l, 1),
                                               consts, 3);
      int16x8_t b_sub_y_h = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128_h, 1),
                                               consts, 3);
      /* Add Y. */
      int16x8_t r_l =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y_l),
                                       vget_low_u8(y)));
      int16x8_t r_h =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y_h),
                                       vget_high_u8(y)));
      int16x8_t b_l =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y_l),
                                       vget_low_u8(y)));
      int16x8_t b_h =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y_h),
                                       vget_high_u8(y)));
      int16x8_t g_l =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y_l),
                                       vget_low_u8(y)));
      int16x8_t g_h =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y_h),
                                       vget_high_u8(y)));

#if RGB_PIXELSIZE == 4
      uint8x16x4_t rgba;
      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
      rgba.val[RGB_RED] = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h));
      rgba.val[RGB_GREEN] = vcombine_u8(vqmovun_s16(g_l), vqmovun_s16(g_h));
      rgba.val[RGB_BLUE] = vcombine_u8(vqmovun_s16(b_l), vqmovun_s16(b_h));
      /* Set alpha channel to opaque (0xFF). */
      rgba.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
      /* Store RGBA pixel data to memory. */
      vst4q_u8(outptr, rgba);
#elif RGB_PIXELSIZE == 3
      uint8x16x3_t rgb;
      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
      rgb.val[RGB_RED] = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h));
      rgb.val[RGB_GREEN] = vcombine_u8(vqmovun_s16(g_l), vqmovun_s16(g_h));
      rgb.val[RGB_BLUE] = vcombine_u8(vqmovun_s16(b_l), vqmovun_s16(b_h));
      /* Store RGB pixel data to memory. */
      vst3q_u8(outptr, rgb);
#else
      /* Pack R, G, and B values in ratio 5:6:5.
       * The saturating shift clamps each component to [0-255] and places it
       * in the top byte; the shift-right-and-insert operations then keep the
       * top 5 bits of R, the top 6 bits of G, and the top 5 bits of B.
       */
      uint16x8_t rgb565_l = vqshluq_n_s16(r_l, 8);
      rgb565_l = vsriq_n_u16(rgb565_l, vqshluq_n_s16(g_l, 8), 5);
      rgb565_l = vsriq_n_u16(rgb565_l, vqshluq_n_s16(b_l, 8), 11);
      uint16x8_t rgb565_h = vqshluq_n_s16(r_h, 8);
      rgb565_h = vsriq_n_u16(rgb565_h, vqshluq_n_s16(g_h, 8), 5);
      rgb565_h = vsriq_n_u16(rgb565_h, vqshluq_n_s16(b_h, 8), 11);
      /* Store RGB pixel data to memory. */
      vst1q_u16((uint16_t *)outptr, rgb565_l);
      vst1q_u16(((uint16_t *)outptr) + 8, rgb565_h);
#endif

      /* Increment pointers. */
      inptr0 += 16;
      inptr1 += 16;
      inptr2 += 16;
      outptr += (RGB_PIXELSIZE * 16);
    }

    /* At most one full 8-pixel block can remain after the 16-pixel loop. */
    if (cols_remaining >= 8) {
      uint8x8_t y = vld1_u8(inptr0);
      uint8x8_t cb = vld1_u8(inptr1);
      uint8x8_t cr = vld1_u8(inptr2);
      /* Subtract 128 from Cb and Cr. */
      int16x8_t cr_128 =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
      int16x8_t cb_128 =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
      /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
      int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
      int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
      g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
      g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
      /* Descale G components: shift right 15, round, and narrow to 16-bit. */
      int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
                                       vrshrn_n_s32(g_sub_y_h, 15));
      /* Compute R-Y: 1.40200 * (Cr - 128) */
      int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1),
                                             consts, 2);
      /* Compute B-Y: 1.77200 * (Cb - 128) */
      int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1),
                                             consts, 3);
      /* Add Y. */
      int16x8_t r =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y));
      int16x8_t b =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y));
      int16x8_t g =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y));

#if RGB_PIXELSIZE == 4
      uint8x8x4_t rgba;
      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
      rgba.val[RGB_RED] = vqmovun_s16(r);
      rgba.val[RGB_GREEN] = vqmovun_s16(g);
      rgba.val[RGB_BLUE] = vqmovun_s16(b);
      /* Set alpha channel to opaque (0xFF). */
      rgba.val[RGB_ALPHA] = vdup_n_u8(0xFF);
      /* Store RGBA pixel data to memory. */
      vst4_u8(outptr, rgba);
#elif RGB_PIXELSIZE == 3
      uint8x8x3_t rgb;
      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
      rgb.val[RGB_RED] = vqmovun_s16(r);
      rgb.val[RGB_GREEN] = vqmovun_s16(g);
      rgb.val[RGB_BLUE] = vqmovun_s16(b);
      /* Store RGB pixel data to memory. */
      vst3_u8(outptr, rgb);
#else
      /* Pack R, G, and B values in ratio 5:6:5. */
      uint16x8_t rgb565 = vqshluq_n_s16(r, 8);
      rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(g, 8), 5);
      rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(b, 8), 11);
      /* Store RGB pixel data to memory. */
      vst1q_u16((uint16_t *)outptr, rgb565);
#endif

      /* Increment pointers. */
      inptr0 += 8;
      inptr1 += 8;
      inptr2 += 8;
      outptr += (RGB_PIXELSIZE * 8);
      cols_remaining -= 8;
    }

    /* Handle the tail elements (1 to 7 pixels).  A full 8 samples are loaded
     * from each input row, which relies on the input padding described in the
     * notes on safe memory access; only the valid lanes are stored.
     */
    if (cols_remaining > 0) {
      uint8x8_t y = vld1_u8(inptr0);
      uint8x8_t cb = vld1_u8(inptr1);
      uint8x8_t cr = vld1_u8(inptr2);
      /* Subtract 128 from Cb and Cr. */
      int16x8_t cr_128 =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
      int16x8_t cb_128 =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
      /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
      int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
      int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
      g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
      g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
      /* Descale G components: shift right 15, round, and narrow to 16-bit. */
      int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
                                       vrshrn_n_s32(g_sub_y_h, 15));
      /* Compute R-Y: 1.40200 * (Cr - 128) */
      int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1),
                                             consts, 2);
      /* Compute B-Y: 1.77200 * (Cb - 128) */
      int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1),
                                             consts, 3);
      /* Add Y. */
      int16x8_t r =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y));
      int16x8_t b =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y));
      int16x8_t g =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y));

#if RGB_PIXELSIZE == 4
      uint8x8x4_t rgba;
      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
      rgba.val[RGB_RED] = vqmovun_s16(r);
      rgba.val[RGB_GREEN] = vqmovun_s16(g);
      rgba.val[RGB_BLUE] = vqmovun_s16(b);
      /* Set alpha channel to opaque (0xFF). */
      rgba.val[RGB_ALPHA] = vdup_n_u8(0xFF);
      /* Store RGBA pixel data to memory.  Each case deliberately falls
       * through so that entering at case N stores pixels N-1 down to 0.
       */
      switch (cols_remaining) {
      case 7:
        vst4_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgba, 6);
        /* FALLTHROUGH */
      case 6:
        vst4_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgba, 5);
        /* FALLTHROUGH */
      case 5:
        vst4_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgba, 4);
        /* FALLTHROUGH */
      case 4:
        vst4_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgba, 3);
        /* FALLTHROUGH */
      case 3:
        vst4_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgba, 2);
        /* FALLTHROUGH */
      case 2:
        vst4_lane_u8(outptr + RGB_PIXELSIZE, rgba, 1);
        /* FALLTHROUGH */
      case 1:
        vst4_lane_u8(outptr, rgba, 0);
        /* FALLTHROUGH */
      default:
        break;
      }
#elif RGB_PIXELSIZE == 3
      uint8x8x3_t rgb;
      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
      rgb.val[RGB_RED] = vqmovun_s16(r);
      rgb.val[RGB_GREEN] = vqmovun_s16(g);
      rgb.val[RGB_BLUE] = vqmovun_s16(b);
      /* Store RGB pixel data to memory.  Each case deliberately falls
       * through so that entering at case N stores pixels N-1 down to 0.
       */
      switch (cols_remaining) {
      case 7:
        vst3_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgb, 6);
        /* FALLTHROUGH */
      case 6:
        vst3_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgb, 5);
        /* FALLTHROUGH */
      case 5:
        vst3_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgb, 4);
        /* FALLTHROUGH */
      case 4:
        vst3_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgb, 3);
        /* FALLTHROUGH */
      case 3:
        vst3_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgb, 2);
        /* FALLTHROUGH */
      case 2:
        vst3_lane_u8(outptr + RGB_PIXELSIZE, rgb, 1);
        /* FALLTHROUGH */
      case 1:
        vst3_lane_u8(outptr, rgb, 0);
        /* FALLTHROUGH */
      default:
        break;
      }
#else
      /* Pack R, G, and B values in ratio 5:6:5. */
      uint16x8_t rgb565 = vqshluq_n_s16(r, 8);
      rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(g, 8), 5);
      rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(b, 8), 11);
      /* Store RGB565 pixel data to memory.  Each case deliberately falls
       * through so that entering at case N stores pixels N-1 down to 0.
       */
      switch (cols_remaining) {
      case 7:
        vst1q_lane_u16((uint16_t *)(outptr + 6 * RGB_PIXELSIZE), rgb565, 6);
        /* FALLTHROUGH */
      case 6:
        vst1q_lane_u16((uint16_t *)(outptr + 5 * RGB_PIXELSIZE), rgb565, 5);
        /* FALLTHROUGH */
      case 5:
        vst1q_lane_u16((uint16_t *)(outptr + 4 * RGB_PIXELSIZE), rgb565, 4);
        /* FALLTHROUGH */
      case 4:
        vst1q_lane_u16((uint16_t *)(outptr + 3 * RGB_PIXELSIZE), rgb565, 3);
        /* FALLTHROUGH */
      case 3:
        vst1q_lane_u16((uint16_t *)(outptr + 2 * RGB_PIXELSIZE), rgb565, 2);
        /* FALLTHROUGH */
      case 2:
        vst1q_lane_u16((uint16_t *)(outptr + RGB_PIXELSIZE), rgb565, 1);
        /* FALLTHROUGH */
      case 1:
        vst1q_lane_u16((uint16_t *)outptr, rgb565, 0);
        /* FALLTHROUGH */
      default:
        break;
      }
#endif
    }
  }
}
354