/*
 * jdcolext-neon.c - colorspace conversion (Arm Neon)
 *
 * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
 * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

/* This file is included by jdcolor-neon.c. */


/* YCbCr -> RGB conversion is defined by the following equations:
 *    R = Y                        + 1.40200 * (Cr - 128)
 *    G = Y - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128)
 *    B = Y + 1.77200 * (Cb - 128)
 *
 * Scaled integer constants are used to avoid floating-point arithmetic:
 *    0.3441467 = 11277 * 2^-15
 *    0.7141418 = 23401 * 2^-15
 *    1.4020386 = 22971 * 2^-14
 *    1.7720337 = 29033 * 2^-14
 * These constants are defined in jdcolor-neon.c.
 *
 * To ensure correct results, rounding is used when descaling.
 */
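
/* Worked example (illustrative arithmetic only, not part of the algorithm):
 * for Cr = 228, i.e. Cr - 128 = 100, the exact term is
 *    1.40200 * 100 = 140.2
 * while the scaled-integer form computes
 *    (22971 * 100 + 2^13) >> 14 = (2297100 + 8192) >> 14 = 140,
 * which matches the exact value rounded to the nearest integer.  The 2^13
 * term implements round-to-nearest before the arithmetic shift right by 14.
 */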

/* Notes on safe memory access for YCbCr -> RGB conversion routines:
 *
 * Input memory buffers can be safely overread up to the next multiple of
 * ALIGN_SIZE bytes, since they are always allocated by alloc_sarray() in
 * jmemmgr.c.
 *
 * The output buffer cannot safely be written beyond output_width, since
 * output_buf points to a possibly unpadded row in the decompressed image
 * buffer allocated by the calling program.
 */
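
/* For example (illustrative only): if output_width is 13, the 8-pixel block
 * below loads and stores 8 full pixels, and the tail code then loads another
 * full 8-byte vector from each input row (an overread of 3 bytes, which is
 * safe because of the alloc_sarray() padding described above) but stores
 * only the 5 remaining pixels, one lane at a time, so that no byte beyond
 * output_width is written.
 */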

void jsimd_ycc_rgb_convert_neon(JDIMENSION output_width, JSAMPIMAGE input_buf,
                                JDIMENSION input_row, JSAMPARRAY output_buf,
                                int num_rows)
{
  JSAMPROW outptr;
  /* Pointers to Y, Cb, and Cr data */
  JSAMPROW inptr0, inptr1, inptr2;

  const int16x4_t consts = vld1_s16(jsimd_ycc_rgb_convert_neon_consts);
  const int16x8_t neg_128 = vdupq_n_s16(-128);

  while (--num_rows >= 0) {
    inptr0 = input_buf[0][input_row];
    inptr1 = input_buf[1][input_row];
    inptr2 = input_buf[2][input_row];
    input_row++;
    outptr = *output_buf++;
    int cols_remaining = output_width;
    for (; cols_remaining >= 16; cols_remaining -= 16) {
      uint8x16_t y  = vld1q_u8(inptr0);
      uint8x16_t cb = vld1q_u8(inptr1);
      uint8x16_t cr = vld1q_u8(inptr2);
      /* Subtract 128 from Cb and Cr. */
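      /* (The subtraction is done as a widening add of the unsigned Cb/Cr
       * bytes to a vector of int16_t -128, reinterpreted as uint16_t; in
       * two's complement this yields the signed value Cb - 128 or Cr - 128
       * in each 16-bit lane without a separate sign-extension step.)
       */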
      int16x8_t cr_128_l =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
                                       vget_low_u8(cr)));
      int16x8_t cr_128_h =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
                                       vget_high_u8(cr)));
      int16x8_t cb_128_l =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
                                       vget_low_u8(cb)));
      int16x8_t cb_128_h =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128),
                                       vget_high_u8(cb)));
      /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
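      /* (The G term uses a widening multiply followed by a widening
       * multiply-subtract, so the sum of the two Q15 products is kept at
       * 32-bit precision until the rounding narrow below.  Lane 0 of consts
       * carries the Cb coefficient already negated in jdcolor-neon.c, so the
       * multiply by lane 0 yields -0.34414 * (Cb - 128) directly.)
       */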
      int32x4_t g_sub_y_ll = vmull_lane_s16(vget_low_s16(cb_128_l), consts, 0);
      int32x4_t g_sub_y_lh = vmull_lane_s16(vget_high_s16(cb_128_l),
                                            consts, 0);
      int32x4_t g_sub_y_hl = vmull_lane_s16(vget_low_s16(cb_128_h), consts, 0);
      int32x4_t g_sub_y_hh = vmull_lane_s16(vget_high_s16(cb_128_h),
                                            consts, 0);
      g_sub_y_ll = vmlsl_lane_s16(g_sub_y_ll, vget_low_s16(cr_128_l),
                                  consts, 1);
      g_sub_y_lh = vmlsl_lane_s16(g_sub_y_lh, vget_high_s16(cr_128_l),
                                  consts, 1);
      g_sub_y_hl = vmlsl_lane_s16(g_sub_y_hl, vget_low_s16(cr_128_h),
                                  consts, 1);
      g_sub_y_hh = vmlsl_lane_s16(g_sub_y_hh, vget_high_s16(cr_128_h),
                                  consts, 1);
      /* Descale G components: shift right 15, round, and narrow to 16-bit. */
      int16x8_t g_sub_y_l = vcombine_s16(vrshrn_n_s32(g_sub_y_ll, 15),
                                         vrshrn_n_s32(g_sub_y_lh, 15));
      int16x8_t g_sub_y_h = vcombine_s16(vrshrn_n_s32(g_sub_y_hl, 15),
                                         vrshrn_n_s32(g_sub_y_hh, 15));
      /* Compute R-Y: 1.40200 * (Cr - 128) */
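      /* (Here, and for B-Y below, vqrdmulhq_lane_s16 computes
       * (2 * a * b + 2^15) >> 16 with saturation.  Doubling the Cr/Cb term
       * first with vshlq_n_s16(x, 1) makes the net operation
       * (x * const + 2^13) >> 14, i.e. a rounding multiply by a Q14
       * constant, which is needed because 1.40200 and 1.77200 do not fit
       * in Q15.)
       */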
      int16x8_t r_sub_y_l = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128_l, 1),
                                               consts, 2);
      int16x8_t r_sub_y_h = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128_h, 1),
                                               consts, 2);
      /* Compute B-Y: 1.77200 * (Cb - 128) */
      int16x8_t b_sub_y_l = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128_l, 1),
                                               consts, 3);
      int16x8_t b_sub_y_h = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128_h, 1),
                                               consts, 3);
      /* Add Y. */
      int16x8_t r_l =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y_l),
                                       vget_low_u8(y)));
      int16x8_t r_h =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y_h),
                                       vget_high_u8(y)));
      int16x8_t b_l =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y_l),
                                       vget_low_u8(y)));
      int16x8_t b_h =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y_h),
                                       vget_high_u8(y)));
      int16x8_t g_l =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y_l),
                                       vget_low_u8(y)));
      int16x8_t g_h =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y_h),
                                       vget_high_u8(y)));

#if RGB_PIXELSIZE == 4
      uint8x16x4_t rgba;
      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
      rgba.val[RGB_RED] = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h));
      rgba.val[RGB_GREEN] = vcombine_u8(vqmovun_s16(g_l), vqmovun_s16(g_h));
      rgba.val[RGB_BLUE] = vcombine_u8(vqmovun_s16(b_l), vqmovun_s16(b_h));
      /* Set alpha channel to opaque (0xFF). */
      rgba.val[RGB_ALPHA] = vdupq_n_u8(0xFF);
      /* Store RGBA pixel data to memory. */
      vst4q_u8(outptr, rgba);
#elif RGB_PIXELSIZE == 3
      uint8x16x3_t rgb;
      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
      rgb.val[RGB_RED] = vcombine_u8(vqmovun_s16(r_l), vqmovun_s16(r_h));
      rgb.val[RGB_GREEN] = vcombine_u8(vqmovun_s16(g_l), vqmovun_s16(g_h));
      rgb.val[RGB_BLUE] = vcombine_u8(vqmovun_s16(b_l), vqmovun_s16(b_h));
      /* Store RGB pixel data to memory. */
      vst3q_u8(outptr, rgb);
#else
      /* Pack R, G, and B values in ratio 5:6:5. */
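      /* (vqshluq_n_s16(x, 8) clamps each component to [0, 255] and places it
       * in the upper byte of a 16-bit lane; the vsriq_n_u16 shift-right-
       * insert operations then keep the top 5 bits of R at bits 15:11,
       * insert the top 6 bits of G at bits 10:5, and insert the top 5 bits
       * of B at bits 4:0.)
       */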
      uint16x8_t rgb565_l = vqshluq_n_s16(r_l, 8);
      rgb565_l = vsriq_n_u16(rgb565_l, vqshluq_n_s16(g_l, 8), 5);
      rgb565_l = vsriq_n_u16(rgb565_l, vqshluq_n_s16(b_l, 8), 11);
      uint16x8_t rgb565_h = vqshluq_n_s16(r_h, 8);
      rgb565_h = vsriq_n_u16(rgb565_h, vqshluq_n_s16(g_h, 8), 5);
      rgb565_h = vsriq_n_u16(rgb565_h, vqshluq_n_s16(b_h, 8), 11);
      /* Store RGB pixel data to memory. */
      vst1q_u16((uint16_t *)outptr, rgb565_l);
      vst1q_u16(((uint16_t *)outptr) + 8, rgb565_h);
#endif

      /* Increment pointers. */
      inptr0 += 16;
      inptr1 += 16;
      inptr2 += 16;
      outptr += (RGB_PIXELSIZE * 16);
    }

    if (cols_remaining >= 8) {
      uint8x8_t y  = vld1_u8(inptr0);
      uint8x8_t cb = vld1_u8(inptr1);
      uint8x8_t cr = vld1_u8(inptr2);
      /* Subtract 128 from Cb and Cr. */
      int16x8_t cr_128 =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
      int16x8_t cb_128 =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
      /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
      int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
      int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
      g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
      g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
      /* Descale G components: shift right 15, round, and narrow to 16-bit. */
      int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
                                       vrshrn_n_s32(g_sub_y_h, 15));
      /* Compute R-Y: 1.40200 * (Cr - 128) */
      int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1),
                                             consts, 2);
      /* Compute B-Y: 1.77200 * (Cb - 128) */
      int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1),
                                             consts, 3);
      /* Add Y. */
      int16x8_t r =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y));
      int16x8_t b =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y));
      int16x8_t g =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y));

#if RGB_PIXELSIZE == 4
      uint8x8x4_t rgba;
      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
      rgba.val[RGB_RED] = vqmovun_s16(r);
      rgba.val[RGB_GREEN] = vqmovun_s16(g);
      rgba.val[RGB_BLUE] = vqmovun_s16(b);
      /* Set alpha channel to opaque (0xFF). */
      rgba.val[RGB_ALPHA] = vdup_n_u8(0xFF);
      /* Store RGBA pixel data to memory. */
      vst4_u8(outptr, rgba);
#elif RGB_PIXELSIZE == 3
      uint8x8x3_t rgb;
      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
      rgb.val[RGB_RED] = vqmovun_s16(r);
      rgb.val[RGB_GREEN] = vqmovun_s16(g);
      rgb.val[RGB_BLUE] = vqmovun_s16(b);
      /* Store RGB pixel data to memory. */
      vst3_u8(outptr, rgb);
#else
      /* Pack R, G, and B values in ratio 5:6:5. */
      uint16x8_t rgb565 = vqshluq_n_s16(r, 8);
      rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(g, 8), 5);
      rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(b, 8), 11);
      /* Store RGB pixel data to memory. */
      vst1q_u16((uint16_t *)outptr, rgb565);
#endif

      /* Increment pointers. */
      inptr0 += 8;
      inptr1 += 8;
      inptr2 += 8;
      outptr += (RGB_PIXELSIZE * 8);
      cols_remaining -= 8;
    }

    /* Handle the tail elements. */
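    /* (The tail loads full 8-byte vectors from the input rows, which may
     * overread by up to 7 bytes per row; this is safe per the notes at the
     * top of this file.  On the store side, pixels are written one lane at
     * a time, and the switch cases below intentionally fall through so that
     * exactly cols_remaining pixels are stored, highest lane first.)
     */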
    if (cols_remaining > 0) {
      uint8x8_t y  = vld1_u8(inptr0);
      uint8x8_t cb = vld1_u8(inptr1);
      uint8x8_t cr = vld1_u8(inptr2);
      /* Subtract 128 from Cb and Cr. */
      int16x8_t cr_128 =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cr));
      int16x8_t cb_128 =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(neg_128), cb));
      /* Compute G-Y: - 0.34414 * (Cb - 128) - 0.71414 * (Cr - 128) */
      int32x4_t g_sub_y_l = vmull_lane_s16(vget_low_s16(cb_128), consts, 0);
      int32x4_t g_sub_y_h = vmull_lane_s16(vget_high_s16(cb_128), consts, 0);
      g_sub_y_l = vmlsl_lane_s16(g_sub_y_l, vget_low_s16(cr_128), consts, 1);
      g_sub_y_h = vmlsl_lane_s16(g_sub_y_h, vget_high_s16(cr_128), consts, 1);
      /* Descale G components: shift right 15, round, and narrow to 16-bit. */
      int16x8_t g_sub_y = vcombine_s16(vrshrn_n_s32(g_sub_y_l, 15),
                                       vrshrn_n_s32(g_sub_y_h, 15));
      /* Compute R-Y: 1.40200 * (Cr - 128) */
      int16x8_t r_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cr_128, 1),
                                             consts, 2);
      /* Compute B-Y: 1.77200 * (Cb - 128) */
      int16x8_t b_sub_y = vqrdmulhq_lane_s16(vshlq_n_s16(cb_128, 1),
                                             consts, 3);
      /* Add Y. */
      int16x8_t r =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(r_sub_y), y));
      int16x8_t b =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(b_sub_y), y));
      int16x8_t g =
        vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(g_sub_y), y));

#if RGB_PIXELSIZE == 4
      uint8x8x4_t rgba;
      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
      rgba.val[RGB_RED] = vqmovun_s16(r);
      rgba.val[RGB_GREEN] = vqmovun_s16(g);
      rgba.val[RGB_BLUE] = vqmovun_s16(b);
      /* Set alpha channel to opaque (0xFF). */
      rgba.val[RGB_ALPHA] = vdup_n_u8(0xFF);
      /* Store RGBA pixel data to memory. */
      switch (cols_remaining) {
      case 7:
        vst4_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgba, 6);
      case 6:
        vst4_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgba, 5);
      case 5:
        vst4_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgba, 4);
      case 4:
        vst4_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgba, 3);
      case 3:
        vst4_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgba, 2);
      case 2:
        vst4_lane_u8(outptr + RGB_PIXELSIZE, rgba, 1);
      case 1:
        vst4_lane_u8(outptr, rgba, 0);
      default:
        break;
      }
#elif RGB_PIXELSIZE == 3
      uint8x8x3_t rgb;
      /* Convert each component to unsigned and narrow, clamping to [0-255]. */
      rgb.val[RGB_RED] = vqmovun_s16(r);
      rgb.val[RGB_GREEN] = vqmovun_s16(g);
      rgb.val[RGB_BLUE] = vqmovun_s16(b);
      /* Store RGB pixel data to memory. */
      switch (cols_remaining) {
      case 7:
        vst3_lane_u8(outptr + 6 * RGB_PIXELSIZE, rgb, 6);
      case 6:
        vst3_lane_u8(outptr + 5 * RGB_PIXELSIZE, rgb, 5);
      case 5:
        vst3_lane_u8(outptr + 4 * RGB_PIXELSIZE, rgb, 4);
      case 4:
        vst3_lane_u8(outptr + 3 * RGB_PIXELSIZE, rgb, 3);
      case 3:
        vst3_lane_u8(outptr + 2 * RGB_PIXELSIZE, rgb, 2);
      case 2:
        vst3_lane_u8(outptr + RGB_PIXELSIZE, rgb, 1);
      case 1:
        vst3_lane_u8(outptr, rgb, 0);
      default:
        break;
      }
#else
      /* Pack R, G, and B values in ratio 5:6:5. */
      uint16x8_t rgb565 = vqshluq_n_s16(r, 8);
      rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(g, 8), 5);
      rgb565 = vsriq_n_u16(rgb565, vqshluq_n_s16(b, 8), 11);
      /* Store RGB565 pixel data to memory. */
      switch (cols_remaining) {
      case 7:
        vst1q_lane_u16((uint16_t *)(outptr + 6 * RGB_PIXELSIZE), rgb565, 6);
      case 6:
        vst1q_lane_u16((uint16_t *)(outptr + 5 * RGB_PIXELSIZE), rgb565, 5);
      case 5:
        vst1q_lane_u16((uint16_t *)(outptr + 4 * RGB_PIXELSIZE), rgb565, 4);
      case 4:
        vst1q_lane_u16((uint16_t *)(outptr + 3 * RGB_PIXELSIZE), rgb565, 3);
      case 3:
        vst1q_lane_u16((uint16_t *)(outptr + 2 * RGB_PIXELSIZE), rgb565, 2);
      case 2:
        vst1q_lane_u16((uint16_t *)(outptr + RGB_PIXELSIZE), rgb565, 1);
      case 1:
        vst1q_lane_u16((uint16_t *)outptr, rgb565, 0);
      default:
        break;
      }
#endif
    }
  }
}