1 /*
2 * jfdctint-neon.c - accurate integer FDCT (Arm Neon)
3 *
4 * Copyright (C) 2020, Arm Limited. All Rights Reserved.
5 * Copyright (C) 2020, D. R. Commander. All Rights Reserved.
6 *
7 * This software is provided 'as-is', without any express or implied
8 * warranty. In no event will the authors be held liable for any damages
9 * arising from the use of this software.
10 *
11 * Permission is granted to anyone to use this software for any purpose,
12 * including commercial applications, and to alter it and redistribute it
13 * freely, subject to the following restrictions:
14 *
15 * 1. The origin of this software must not be misrepresented; you must not
16 * claim that you wrote the original software. If you use this software
17 * in a product, an acknowledgment in the product documentation would be
18 * appreciated but is not required.
19 * 2. Altered source versions must be plainly marked as such, and must not be
20 * misrepresented as being the original software.
21 * 3. This notice may not be removed or altered from any source distribution.
22 */
23
24 #define JPEG_INTERNALS
25 #include "../../jinclude.h"
26 #include "../../jpeglib.h"
27 #include "../../jsimd.h"
28 #include "../../jdct.h"
29 #include "../../jsimddct.h"
30 #include "../jsimd.h"
31 #include "align.h"
32 #include "neon-compat.h"
33
34 #include <arm_neon.h>
35
36
37 /* jsimd_fdct_islow_neon() performs a slower but more accurate forward DCT
38 * (Discrete Cosine Transform) on one block of samples. It uses the same
39 * calculations and produces exactly the same output as IJG's original
40 * jpeg_fdct_islow() function, which can be found in jfdctint.c.
41 *
42 * Scaled integer constants are used to avoid floating-point arithmetic:
43 * 0.298631336 = 2446 * 2^-13
44 * 0.390180644 = 3196 * 2^-13
45 * 0.541196100 = 4433 * 2^-13
46 * 0.765366865 = 6270 * 2^-13
47 * 0.899976223 = 7373 * 2^-13
48 * 1.175875602 = 9633 * 2^-13
49 * 1.501321110 = 12299 * 2^-13
50 * 1.847759065 = 15137 * 2^-13
51 * 1.961570560 = 16069 * 2^-13
52 * 2.053119869 = 16819 * 2^-13
53 * 2.562915447 = 20995 * 2^-13
54 * 3.072711026 = 25172 * 2^-13
55 *
56 * See jfdctint.c for further details of the DCT algorithm. Where possible,
57 * the variable names and comments here in jsimd_fdct_islow_neon() match up
58 * with those in jpeg_fdct_islow().
59 */
60
61 #define CONST_BITS 13
62 #define PASS1_BITS 2
63
64 #define DESCALE_P1 (CONST_BITS - PASS1_BITS)
65 #define DESCALE_P2 (CONST_BITS + PASS1_BITS)
66
67 #define F_0_298 2446
68 #define F_0_390 3196
69 #define F_0_541 4433
70 #define F_0_765 6270
71 #define F_0_899 7373
72 #define F_1_175 9633
73 #define F_1_501 12299
74 #define F_1_847 15137
75 #define F_1_961 16069
76 #define F_2_053 16819
77 #define F_2_562 20995
78 #define F_3_072 25172
79
80
81 ALIGN(16) static const int16_t jsimd_fdct_islow_neon_consts[] = {
82 F_0_298, -F_0_390, F_0_541, F_0_765,
83 -F_0_899, F_1_175, F_1_501, -F_1_847,
84 -F_1_961, F_2_053, -F_2_562, F_3_072
85 };
86
jsimd_fdct_islow_neon(DCTELEM * data)87 void jsimd_fdct_islow_neon(DCTELEM *data)
88 {
89 /* Load DCT constants. */
90 #ifdef HAVE_VLD1_S16_X3
91 const int16x4x3_t consts = vld1_s16_x3(jsimd_fdct_islow_neon_consts);
92 #else
93 /* GCC does not currently support the intrinsic vld1_<type>_x3(). */
94 const int16x4_t consts1 = vld1_s16(jsimd_fdct_islow_neon_consts);
95 const int16x4_t consts2 = vld1_s16(jsimd_fdct_islow_neon_consts + 4);
96 const int16x4_t consts3 = vld1_s16(jsimd_fdct_islow_neon_consts + 8);
97 const int16x4x3_t consts = { { consts1, consts2, consts3 } };
98 #endif
99
100 /* Load an 8x8 block of samples into Neon registers. De-interleaving loads
101 * are used, followed by vuzp to transpose the block such that we have a
102 * column of samples per vector - allowing all rows to be processed at once.
103 */
104 int16x8x4_t s_rows_0123 = vld4q_s16(data);
105 int16x8x4_t s_rows_4567 = vld4q_s16(data + 4 * DCTSIZE);
106
107 int16x8x2_t cols_04 = vuzpq_s16(s_rows_0123.val[0], s_rows_4567.val[0]);
108 int16x8x2_t cols_15 = vuzpq_s16(s_rows_0123.val[1], s_rows_4567.val[1]);
109 int16x8x2_t cols_26 = vuzpq_s16(s_rows_0123.val[2], s_rows_4567.val[2]);
110 int16x8x2_t cols_37 = vuzpq_s16(s_rows_0123.val[3], s_rows_4567.val[3]);
111
112 int16x8_t col0 = cols_04.val[0];
113 int16x8_t col1 = cols_15.val[0];
114 int16x8_t col2 = cols_26.val[0];
115 int16x8_t col3 = cols_37.val[0];
116 int16x8_t col4 = cols_04.val[1];
117 int16x8_t col5 = cols_15.val[1];
118 int16x8_t col6 = cols_26.val[1];
119 int16x8_t col7 = cols_37.val[1];
120
121 /* Pass 1: process rows. */
122
123 int16x8_t tmp0 = vaddq_s16(col0, col7);
124 int16x8_t tmp7 = vsubq_s16(col0, col7);
125 int16x8_t tmp1 = vaddq_s16(col1, col6);
126 int16x8_t tmp6 = vsubq_s16(col1, col6);
127 int16x8_t tmp2 = vaddq_s16(col2, col5);
128 int16x8_t tmp5 = vsubq_s16(col2, col5);
129 int16x8_t tmp3 = vaddq_s16(col3, col4);
130 int16x8_t tmp4 = vsubq_s16(col3, col4);
131
132 /* Even part */
133 int16x8_t tmp10 = vaddq_s16(tmp0, tmp3);
134 int16x8_t tmp13 = vsubq_s16(tmp0, tmp3);
135 int16x8_t tmp11 = vaddq_s16(tmp1, tmp2);
136 int16x8_t tmp12 = vsubq_s16(tmp1, tmp2);
137
138 col0 = vshlq_n_s16(vaddq_s16(tmp10, tmp11), PASS1_BITS);
139 col4 = vshlq_n_s16(vsubq_s16(tmp10, tmp11), PASS1_BITS);
140
141 int16x8_t tmp12_add_tmp13 = vaddq_s16(tmp12, tmp13);
142 int32x4_t z1_l =
143 vmull_lane_s16(vget_low_s16(tmp12_add_tmp13), consts.val[0], 2);
144 int32x4_t z1_h =
145 vmull_lane_s16(vget_high_s16(tmp12_add_tmp13), consts.val[0], 2);
146
147 int32x4_t col2_scaled_l =
148 vmlal_lane_s16(z1_l, vget_low_s16(tmp13), consts.val[0], 3);
149 int32x4_t col2_scaled_h =
150 vmlal_lane_s16(z1_h, vget_high_s16(tmp13), consts.val[0], 3);
151 col2 = vcombine_s16(vrshrn_n_s32(col2_scaled_l, DESCALE_P1),
152 vrshrn_n_s32(col2_scaled_h, DESCALE_P1));
153
154 int32x4_t col6_scaled_l =
155 vmlal_lane_s16(z1_l, vget_low_s16(tmp12), consts.val[1], 3);
156 int32x4_t col6_scaled_h =
157 vmlal_lane_s16(z1_h, vget_high_s16(tmp12), consts.val[1], 3);
158 col6 = vcombine_s16(vrshrn_n_s32(col6_scaled_l, DESCALE_P1),
159 vrshrn_n_s32(col6_scaled_h, DESCALE_P1));
160
161 /* Odd part */
162 int16x8_t z1 = vaddq_s16(tmp4, tmp7);
163 int16x8_t z2 = vaddq_s16(tmp5, tmp6);
164 int16x8_t z3 = vaddq_s16(tmp4, tmp6);
165 int16x8_t z4 = vaddq_s16(tmp5, tmp7);
166 /* sqrt(2) * c3 */
167 int32x4_t z5_l = vmull_lane_s16(vget_low_s16(z3), consts.val[1], 1);
168 int32x4_t z5_h = vmull_lane_s16(vget_high_s16(z3), consts.val[1], 1);
169 z5_l = vmlal_lane_s16(z5_l, vget_low_s16(z4), consts.val[1], 1);
170 z5_h = vmlal_lane_s16(z5_h, vget_high_s16(z4), consts.val[1], 1);
171
172 /* sqrt(2) * (-c1+c3+c5-c7) */
173 int32x4_t tmp4_l = vmull_lane_s16(vget_low_s16(tmp4), consts.val[0], 0);
174 int32x4_t tmp4_h = vmull_lane_s16(vget_high_s16(tmp4), consts.val[0], 0);
175 /* sqrt(2) * ( c1+c3-c5+c7) */
176 int32x4_t tmp5_l = vmull_lane_s16(vget_low_s16(tmp5), consts.val[2], 1);
177 int32x4_t tmp5_h = vmull_lane_s16(vget_high_s16(tmp5), consts.val[2], 1);
178 /* sqrt(2) * ( c1+c3+c5-c7) */
179 int32x4_t tmp6_l = vmull_lane_s16(vget_low_s16(tmp6), consts.val[2], 3);
180 int32x4_t tmp6_h = vmull_lane_s16(vget_high_s16(tmp6), consts.val[2], 3);
181 /* sqrt(2) * ( c1+c3-c5-c7) */
182 int32x4_t tmp7_l = vmull_lane_s16(vget_low_s16(tmp7), consts.val[1], 2);
183 int32x4_t tmp7_h = vmull_lane_s16(vget_high_s16(tmp7), consts.val[1], 2);
184
185 /* sqrt(2) * (c7-c3) */
186 z1_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 0);
187 z1_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 0);
188 /* sqrt(2) * (-c1-c3) */
189 int32x4_t z2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[2], 2);
190 int32x4_t z2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[2], 2);
191 /* sqrt(2) * (-c3-c5) */
192 int32x4_t z3_l = vmull_lane_s16(vget_low_s16(z3), consts.val[2], 0);
193 int32x4_t z3_h = vmull_lane_s16(vget_high_s16(z3), consts.val[2], 0);
194 /* sqrt(2) * (c5-c3) */
195 int32x4_t z4_l = vmull_lane_s16(vget_low_s16(z4), consts.val[0], 1);
196 int32x4_t z4_h = vmull_lane_s16(vget_high_s16(z4), consts.val[0], 1);
197
198 z3_l = vaddq_s32(z3_l, z5_l);
199 z3_h = vaddq_s32(z3_h, z5_h);
200 z4_l = vaddq_s32(z4_l, z5_l);
201 z4_h = vaddq_s32(z4_h, z5_h);
202
203 tmp4_l = vaddq_s32(tmp4_l, z1_l);
204 tmp4_h = vaddq_s32(tmp4_h, z1_h);
205 tmp4_l = vaddq_s32(tmp4_l, z3_l);
206 tmp4_h = vaddq_s32(tmp4_h, z3_h);
207 col7 = vcombine_s16(vrshrn_n_s32(tmp4_l, DESCALE_P1),
208 vrshrn_n_s32(tmp4_h, DESCALE_P1));
209
210 tmp5_l = vaddq_s32(tmp5_l, z2_l);
211 tmp5_h = vaddq_s32(tmp5_h, z2_h);
212 tmp5_l = vaddq_s32(tmp5_l, z4_l);
213 tmp5_h = vaddq_s32(tmp5_h, z4_h);
214 col5 = vcombine_s16(vrshrn_n_s32(tmp5_l, DESCALE_P1),
215 vrshrn_n_s32(tmp5_h, DESCALE_P1));
216
217 tmp6_l = vaddq_s32(tmp6_l, z2_l);
218 tmp6_h = vaddq_s32(tmp6_h, z2_h);
219 tmp6_l = vaddq_s32(tmp6_l, z3_l);
220 tmp6_h = vaddq_s32(tmp6_h, z3_h);
221 col3 = vcombine_s16(vrshrn_n_s32(tmp6_l, DESCALE_P1),
222 vrshrn_n_s32(tmp6_h, DESCALE_P1));
223
224 tmp7_l = vaddq_s32(tmp7_l, z1_l);
225 tmp7_h = vaddq_s32(tmp7_h, z1_h);
226 tmp7_l = vaddq_s32(tmp7_l, z4_l);
227 tmp7_h = vaddq_s32(tmp7_h, z4_h);
228 col1 = vcombine_s16(vrshrn_n_s32(tmp7_l, DESCALE_P1),
229 vrshrn_n_s32(tmp7_h, DESCALE_P1));
230
231 /* Transpose to work on columns in pass 2. */
232 int16x8x2_t cols_01 = vtrnq_s16(col0, col1);
233 int16x8x2_t cols_23 = vtrnq_s16(col2, col3);
234 int16x8x2_t cols_45 = vtrnq_s16(col4, col5);
235 int16x8x2_t cols_67 = vtrnq_s16(col6, col7);
236
237 int32x4x2_t cols_0145_l = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[0]),
238 vreinterpretq_s32_s16(cols_45.val[0]));
239 int32x4x2_t cols_0145_h = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[1]),
240 vreinterpretq_s32_s16(cols_45.val[1]));
241 int32x4x2_t cols_2367_l = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[0]),
242 vreinterpretq_s32_s16(cols_67.val[0]));
243 int32x4x2_t cols_2367_h = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[1]),
244 vreinterpretq_s32_s16(cols_67.val[1]));
245
246 int32x4x2_t rows_04 = vzipq_s32(cols_0145_l.val[0], cols_2367_l.val[0]);
247 int32x4x2_t rows_15 = vzipq_s32(cols_0145_h.val[0], cols_2367_h.val[0]);
248 int32x4x2_t rows_26 = vzipq_s32(cols_0145_l.val[1], cols_2367_l.val[1]);
249 int32x4x2_t rows_37 = vzipq_s32(cols_0145_h.val[1], cols_2367_h.val[1]);
250
251 int16x8_t row0 = vreinterpretq_s16_s32(rows_04.val[0]);
252 int16x8_t row1 = vreinterpretq_s16_s32(rows_15.val[0]);
253 int16x8_t row2 = vreinterpretq_s16_s32(rows_26.val[0]);
254 int16x8_t row3 = vreinterpretq_s16_s32(rows_37.val[0]);
255 int16x8_t row4 = vreinterpretq_s16_s32(rows_04.val[1]);
256 int16x8_t row5 = vreinterpretq_s16_s32(rows_15.val[1]);
257 int16x8_t row6 = vreinterpretq_s16_s32(rows_26.val[1]);
258 int16x8_t row7 = vreinterpretq_s16_s32(rows_37.val[1]);
259
260 /* Pass 2: process columns. */
261
262 tmp0 = vaddq_s16(row0, row7);
263 tmp7 = vsubq_s16(row0, row7);
264 tmp1 = vaddq_s16(row1, row6);
265 tmp6 = vsubq_s16(row1, row6);
266 tmp2 = vaddq_s16(row2, row5);
267 tmp5 = vsubq_s16(row2, row5);
268 tmp3 = vaddq_s16(row3, row4);
269 tmp4 = vsubq_s16(row3, row4);
270
271 /* Even part */
272 tmp10 = vaddq_s16(tmp0, tmp3);
273 tmp13 = vsubq_s16(tmp0, tmp3);
274 tmp11 = vaddq_s16(tmp1, tmp2);
275 tmp12 = vsubq_s16(tmp1, tmp2);
276
277 row0 = vrshrq_n_s16(vaddq_s16(tmp10, tmp11), PASS1_BITS);
278 row4 = vrshrq_n_s16(vsubq_s16(tmp10, tmp11), PASS1_BITS);
279
280 tmp12_add_tmp13 = vaddq_s16(tmp12, tmp13);
281 z1_l = vmull_lane_s16(vget_low_s16(tmp12_add_tmp13), consts.val[0], 2);
282 z1_h = vmull_lane_s16(vget_high_s16(tmp12_add_tmp13), consts.val[0], 2);
283
284 int32x4_t row2_scaled_l =
285 vmlal_lane_s16(z1_l, vget_low_s16(tmp13), consts.val[0], 3);
286 int32x4_t row2_scaled_h =
287 vmlal_lane_s16(z1_h, vget_high_s16(tmp13), consts.val[0], 3);
288 row2 = vcombine_s16(vrshrn_n_s32(row2_scaled_l, DESCALE_P2),
289 vrshrn_n_s32(row2_scaled_h, DESCALE_P2));
290
291 int32x4_t row6_scaled_l =
292 vmlal_lane_s16(z1_l, vget_low_s16(tmp12), consts.val[1], 3);
293 int32x4_t row6_scaled_h =
294 vmlal_lane_s16(z1_h, vget_high_s16(tmp12), consts.val[1], 3);
295 row6 = vcombine_s16(vrshrn_n_s32(row6_scaled_l, DESCALE_P2),
296 vrshrn_n_s32(row6_scaled_h, DESCALE_P2));
297
298 /* Odd part */
299 z1 = vaddq_s16(tmp4, tmp7);
300 z2 = vaddq_s16(tmp5, tmp6);
301 z3 = vaddq_s16(tmp4, tmp6);
302 z4 = vaddq_s16(tmp5, tmp7);
303 /* sqrt(2) * c3 */
304 z5_l = vmull_lane_s16(vget_low_s16(z3), consts.val[1], 1);
305 z5_h = vmull_lane_s16(vget_high_s16(z3), consts.val[1], 1);
306 z5_l = vmlal_lane_s16(z5_l, vget_low_s16(z4), consts.val[1], 1);
307 z5_h = vmlal_lane_s16(z5_h, vget_high_s16(z4), consts.val[1], 1);
308
309 /* sqrt(2) * (-c1+c3+c5-c7) */
310 tmp4_l = vmull_lane_s16(vget_low_s16(tmp4), consts.val[0], 0);
311 tmp4_h = vmull_lane_s16(vget_high_s16(tmp4), consts.val[0], 0);
312 /* sqrt(2) * ( c1+c3-c5+c7) */
313 tmp5_l = vmull_lane_s16(vget_low_s16(tmp5), consts.val[2], 1);
314 tmp5_h = vmull_lane_s16(vget_high_s16(tmp5), consts.val[2], 1);
315 /* sqrt(2) * ( c1+c3+c5-c7) */
316 tmp6_l = vmull_lane_s16(vget_low_s16(tmp6), consts.val[2], 3);
317 tmp6_h = vmull_lane_s16(vget_high_s16(tmp6), consts.val[2], 3);
318 /* sqrt(2) * ( c1+c3-c5-c7) */
319 tmp7_l = vmull_lane_s16(vget_low_s16(tmp7), consts.val[1], 2);
320 tmp7_h = vmull_lane_s16(vget_high_s16(tmp7), consts.val[1], 2);
321
322 /* sqrt(2) * (c7-c3) */
323 z1_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 0);
324 z1_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 0);
325 /* sqrt(2) * (-c1-c3) */
326 z2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[2], 2);
327 z2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[2], 2);
328 /* sqrt(2) * (-c3-c5) */
329 z3_l = vmull_lane_s16(vget_low_s16(z3), consts.val[2], 0);
330 z3_h = vmull_lane_s16(vget_high_s16(z3), consts.val[2], 0);
331 /* sqrt(2) * (c5-c3) */
332 z4_l = vmull_lane_s16(vget_low_s16(z4), consts.val[0], 1);
333 z4_h = vmull_lane_s16(vget_high_s16(z4), consts.val[0], 1);
334
335 z3_l = vaddq_s32(z3_l, z5_l);
336 z3_h = vaddq_s32(z3_h, z5_h);
337 z4_l = vaddq_s32(z4_l, z5_l);
338 z4_h = vaddq_s32(z4_h, z5_h);
339
340 tmp4_l = vaddq_s32(tmp4_l, z1_l);
341 tmp4_h = vaddq_s32(tmp4_h, z1_h);
342 tmp4_l = vaddq_s32(tmp4_l, z3_l);
343 tmp4_h = vaddq_s32(tmp4_h, z3_h);
344 row7 = vcombine_s16(vrshrn_n_s32(tmp4_l, DESCALE_P2),
345 vrshrn_n_s32(tmp4_h, DESCALE_P2));
346
347 tmp5_l = vaddq_s32(tmp5_l, z2_l);
348 tmp5_h = vaddq_s32(tmp5_h, z2_h);
349 tmp5_l = vaddq_s32(tmp5_l, z4_l);
350 tmp5_h = vaddq_s32(tmp5_h, z4_h);
351 row5 = vcombine_s16(vrshrn_n_s32(tmp5_l, DESCALE_P2),
352 vrshrn_n_s32(tmp5_h, DESCALE_P2));
353
354 tmp6_l = vaddq_s32(tmp6_l, z2_l);
355 tmp6_h = vaddq_s32(tmp6_h, z2_h);
356 tmp6_l = vaddq_s32(tmp6_l, z3_l);
357 tmp6_h = vaddq_s32(tmp6_h, z3_h);
358 row3 = vcombine_s16(vrshrn_n_s32(tmp6_l, DESCALE_P2),
359 vrshrn_n_s32(tmp6_h, DESCALE_P2));
360
361 tmp7_l = vaddq_s32(tmp7_l, z1_l);
362 tmp7_h = vaddq_s32(tmp7_h, z1_h);
363 tmp7_l = vaddq_s32(tmp7_l, z4_l);
364 tmp7_h = vaddq_s32(tmp7_h, z4_h);
365 row1 = vcombine_s16(vrshrn_n_s32(tmp7_l, DESCALE_P2),
366 vrshrn_n_s32(tmp7_h, DESCALE_P2));
367
368 vst1q_s16(data + 0 * DCTSIZE, row0);
369 vst1q_s16(data + 1 * DCTSIZE, row1);
370 vst1q_s16(data + 2 * DCTSIZE, row2);
371 vst1q_s16(data + 3 * DCTSIZE, row3);
372 vst1q_s16(data + 4 * DCTSIZE, row4);
373 vst1q_s16(data + 5 * DCTSIZE, row5);
374 vst1q_s16(data + 6 * DCTSIZE, row6);
375 vst1q_s16(data + 7 * DCTSIZE, row7);
376 }
377