/*
 * AltiVec optimizations for libjpeg-turbo
 *
 * Copyright (C) 2014-2015, D. R. Commander.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */
22 
/* SLOW INTEGER INVERSE DCT */
24 
25 #include "jsimd_altivec.h"
26 
27 
/*
 * Fixed-point multipliers.  Each F_x_yyy is the constant named in its
 * trailing comment, scaled by 2^CONST_BITS (8192) and rounded to the nearest
 * integer, so that it fits in a signed 16-bit vector element.
 */
#define F_0_298  2446   /* FIX(0.298631336) */
#define F_0_390  3196   /* FIX(0.390180644) */
#define F_0_541  4433   /* FIX(0.541196100) */
#define F_0_765  6270   /* FIX(0.765366865) */
#define F_0_899  7373   /* FIX(0.899976223) */
#define F_1_175  9633   /* FIX(1.175875602) */
#define F_1_501  12299  /* FIX(1.501321110) */
#define F_1_847  15137  /* FIX(1.847759065) */
#define F_1_961  16069  /* FIX(1.961570560) */
#define F_2_053  16819  /* FIX(2.053119869) */
#define F_2_562  20995  /* FIX(2.562915447) */
#define F_3_072  25172  /* FIX(3.072711026) */

/*
 * CONST_BITS  - number of fractional bits carried by the F_* multipliers.
 * PASS1_BITS  - extra fractional bits left in the pass 1 results.
 * DESCALE_P1  - right shift applied at the end of pass 1; it removes
 *               CONST_BITS but leaves PASS1_BITS in place.
 * DESCALE_P2  - right shift applied at the end of pass 2; it removes
 *               CONST_BITS, the PASS1_BITS left over from pass 1, and a
 *               further 3 bits (a division by 8).
 */
#define CONST_BITS  13
#define PASS1_BITS  2
#define DESCALE_P1  (CONST_BITS - PASS1_BITS)
#define DESCALE_P2  (CONST_BITS + PASS1_BITS + 3)
45 
46 
/*
 * DO_IDCT(in, PASS): one 1-D pass of the accurate integer IDCT over the eight
 * 16-bit vectors in##0 .. in##7, leaving the eight results in out0 .. out7.
 *
 * in   - name prefix of the input vectors (col or row).  The prefix is also
 *        pasted into the scratch names in##26l/h, in##71l/h and in##53l/h.
 * PASS - 1 or 2.  Selects pd_descale_p##PASS (the rounding term) and
 *        descale_p##PASS (the final arithmetic right-shift count).
 *
 * The macro is not hygienic.  It reads and writes locals of the expanding
 * function by name (tmp*, z*, out*, the pw_* and pd_* constants, const_bits),
 * so all of them must be declared at the point of use.
 *
 * Naming: values are widened to 32 bits as two vectors per quantity.  The
 * "l" vector is built with vec_mergeh/vec_unpackh and the "h" vector with
 * vec_mergel/vec_unpackl; vec_pack(xl, xh) recombines them at the end.
 *
 * Each vec_msums on a merged pair computes a*c0 + b*c1 (+ accumulator) per
 * 32-bit lane, which is how the two-term products in the comments below are
 * evaluated in a single instruction.
 */
#define DO_IDCT(in, PASS) { \
  /* Even part \
   * \
   * (Original) \
   * z1 = (z2 + z3) * 0.541196100; \
   * tmp2 = z1 + z3 * -1.847759065; \
   * tmp3 = z1 + z2 * 0.765366865; \
   * \
   * (This implementation; z2 = in2, z3 = in6) \
   * tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065); \
   * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100; \
   */ \
  \
  in##26l = vec_mergeh(in##2, in##6); \
  in##26h = vec_mergel(in##2, in##6); \
  \
  tmp3l = vec_msums(in##26l, pw_f130_f054, pd_zero); \
  tmp3h = vec_msums(in##26h, pw_f130_f054, pd_zero); \
  tmp2l = vec_msums(in##26l, pw_f054_mf130, pd_zero); \
  tmp2h = vec_msums(in##26h, pw_f054_mf130, pd_zero); \
  \
  tmp0 = vec_add(in##0, in##4); \
  tmp1 = vec_sub(in##0, in##4); \
  \
  /* Widen to 32 bits, scale by 2^CONST_BITS, and fold in the rounding term \
   * here so that every output derived from tmp10..tmp13 carries it once. \
   */ \
  tmp0l = vec_unpackh(tmp0); \
  tmp0h = vec_unpackl(tmp0); \
  tmp0l = vec_sl(tmp0l, const_bits); \
  tmp0h = vec_sl(tmp0h, const_bits); \
  tmp0l = vec_add(tmp0l, pd_descale_p##PASS); \
  tmp0h = vec_add(tmp0h, pd_descale_p##PASS); \
  \
  tmp10l = vec_add(tmp0l, tmp3l); \
  tmp10h = vec_add(tmp0h, tmp3h); \
  tmp13l = vec_sub(tmp0l, tmp3l); \
  tmp13h = vec_sub(tmp0h, tmp3h); \
  \
  tmp1l = vec_unpackh(tmp1); \
  tmp1h = vec_unpackl(tmp1); \
  tmp1l = vec_sl(tmp1l, const_bits); \
  tmp1h = vec_sl(tmp1h, const_bits); \
  tmp1l = vec_add(tmp1l, pd_descale_p##PASS); \
  tmp1h = vec_add(tmp1h, pd_descale_p##PASS); \
  \
  tmp11l = vec_add(tmp1l, tmp2l); \
  tmp11h = vec_add(tmp1h, tmp2h); \
  tmp12l = vec_sub(tmp1l, tmp2l); \
  tmp12h = vec_sub(tmp1h, tmp2h); \
  \
  /* Odd part */ \
  \
  z3 = vec_add(in##3, in##7); \
  z4 = vec_add(in##1, in##5); \
  \
  /* (Original) \
   * z5 = (z3 + z4) * 1.175875602; \
   * z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644; \
   * z3 += z5;  z4 += z5; \
   * \
   * (This implementation; the right-hand sides use the z3/z4 sums above) \
   * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602; \
   * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644); \
   */ \
  \
  z34l = vec_mergeh(z3, z4); \
  z34h = vec_mergel(z3, z4); \
  \
  z3l = vec_msums(z34l, pw_mf078_f117, pd_zero); \
  z3h = vec_msums(z34h, pw_mf078_f117, pd_zero); \
  z4l = vec_msums(z34l, pw_f117_f078, pd_zero); \
  z4h = vec_msums(z34h, pw_f117_f078, pd_zero); \
  \
  /* (Original) \
   * z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2; \
   * tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869; \
   * tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110; \
   * z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447; \
   * tmp0 += z1 + z3;  tmp1 += z2 + z4; \
   * tmp2 += z2 + z3;  tmp3 += z1 + z4; \
   * \
   * (This implementation; on the right-hand sides tmp0 = in7, tmp1 = in5, \
   * tmp2 = in3, tmp3 = in1, i.e. the unmodified inputs) \
   * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223; \
   * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447; \
   * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447); \
   * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223); \
   * tmp0 += z3;  tmp1 += z4; \
   * tmp2 += z3;  tmp3 += z4; \
   * \
   * The "+= z3" / "+= z4" terms are supplied as the vec_msums accumulator. \
   */ \
  \
  in##71l = vec_mergeh(in##7, in##1); \
  in##71h = vec_mergel(in##7, in##1); \
  \
  tmp0l = vec_msums(in##71l, pw_mf060_mf089, z3l); \
  tmp0h = vec_msums(in##71h, pw_mf060_mf089, z3h); \
  tmp3l = vec_msums(in##71l, pw_mf089_f060, z4l); \
  tmp3h = vec_msums(in##71h, pw_mf089_f060, z4h); \
  \
  in##53l = vec_mergeh(in##5, in##3); \
  in##53h = vec_mergel(in##5, in##3); \
  \
  tmp1l = vec_msums(in##53l, pw_mf050_mf256, z4l); \
  tmp1h = vec_msums(in##53h, pw_mf050_mf256, z4h); \
  tmp2l = vec_msums(in##53l, pw_mf256_f050, z3l); \
  tmp2h = vec_msums(in##53h, pw_mf256_f050, z3h); \
  \
  /* Final output stage: butterfly the even and odd parts, shift out the \
   * fractional bits, and narrow each l/h pair back to one 16-bit vector. \
   */ \
  \
  out0l = vec_add(tmp10l, tmp3l); \
  out0h = vec_add(tmp10h, tmp3h); \
  out7l = vec_sub(tmp10l, tmp3l); \
  out7h = vec_sub(tmp10h, tmp3h); \
  \
  out0l = vec_sra(out0l, descale_p##PASS); \
  out0h = vec_sra(out0h, descale_p##PASS); \
  out7l = vec_sra(out7l, descale_p##PASS); \
  out7h = vec_sra(out7h, descale_p##PASS); \
  \
  out0 = vec_pack(out0l, out0h); \
  out7 = vec_pack(out7l, out7h); \
  \
  out1l = vec_add(tmp11l, tmp2l); \
  out1h = vec_add(tmp11h, tmp2h); \
  out6l = vec_sub(tmp11l, tmp2l); \
  out6h = vec_sub(tmp11h, tmp2h); \
  \
  out1l = vec_sra(out1l, descale_p##PASS); \
  out1h = vec_sra(out1h, descale_p##PASS); \
  out6l = vec_sra(out6l, descale_p##PASS); \
  out6h = vec_sra(out6h, descale_p##PASS); \
  \
  out1 = vec_pack(out1l, out1h); \
  out6 = vec_pack(out6l, out6h); \
  \
  out2l = vec_add(tmp12l, tmp1l); \
  out2h = vec_add(tmp12h, tmp1h); \
  out5l = vec_sub(tmp12l, tmp1l); \
  out5h = vec_sub(tmp12h, tmp1h); \
  \
  out2l = vec_sra(out2l, descale_p##PASS); \
  out2h = vec_sra(out2h, descale_p##PASS); \
  out5l = vec_sra(out5l, descale_p##PASS); \
  out5h = vec_sra(out5h, descale_p##PASS); \
  \
  out2 = vec_pack(out2l, out2h); \
  out5 = vec_pack(out5l, out5h); \
  \
  out3l = vec_add(tmp13l, tmp0l); \
  out3h = vec_add(tmp13h, tmp0h); \
  out4l = vec_sub(tmp13l, tmp0l); \
  out4h = vec_sub(tmp13h, tmp0h); \
  \
  out3l = vec_sra(out3l, descale_p##PASS); \
  out3h = vec_sra(out3h, descale_p##PASS); \
  out4l = vec_sra(out4l, descale_p##PASS); \
  out4h = vec_sra(out4h, descale_p##PASS); \
  \
  out3 = vec_pack(out3l, out3h); \
  out4 = vec_pack(out4l, out4h); \
}
205 
206 
/*
 * Accurate ("slow") integer inverse DCT of one 8x8 coefficient block, with
 * dequantization, written to eight output rows.
 *
 * dct_table_  - dequantization multipliers, read as 64 shorts in eight
 *               16-byte vec_ld loads (offsets 0..112)
 * coef_block  - 64 coefficients, read in eight 16-byte vec_ld loads
 *               (offsets 0..112)
 * output_buf  - row pointers; output_buf[0] .. output_buf[7] each receive
 *               eight output bytes
 * output_col  - offset added to every row pointer before storing
 *
 * NOTE(review): vec_ld and vec_ste ignore the low address bits, so the three
 * buffers are presumably aligned by the caller (16 bytes for the inputs, 8
 * bytes for each output position) -- confirm against the callers.
 *
 * The transform is separable: pass 1 runs DO_IDCT down the columns, the
 * result is transposed, pass 2 runs DO_IDCT again, and a second transpose
 * restores row order before the samples are narrowed and stored.
 */
void jsimd_idct_islow_altivec(void *dct_table_, JCOEFPTR coef_block,
                              JSAMPARRAY output_buf, JDIMENSION output_col)
{
  short *dct_table = (short *)dct_table_;
  int *outptr;

  /* Working set referenced by name from DO_IDCT and TRANSPOSE. */
  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
    col0, col1, col2, col3, col4, col5, col6, col7,
    quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7,
    tmp0, tmp1, tmp2, tmp3, z3, z4,
    z34l, z34h, col71l, col71h, col26l, col26h, col53l, col53h,
    row71l, row71h, row26l, row26h, row53l, row53h,
    out0, out1, out2, out3, out4, out5, out6, out7;
  __vector int tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h,
    tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h,
    z3l, z3h, z4l, z4h,
    out0l, out0h, out1l, out1h, out2l, out2h, out3l, out3h, out4l, out4h,
    out5l, out5h, out6l, out6h, out7l, out7h;
  __vector signed char outb;

  /* Constants */
  /* Each pw_* vector repeats one (c0, c1) multiplier pair four times so that
   * a single vec_msums on a merged input pair yields a*c0 + b*c1 per lane.
   */
  __vector short pw_zero = { __8X(0) },
    pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) },
    pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) },
    pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) },
    pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) },
    pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) },
    pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) },
    pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) },
    pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) };
  __vector unsigned short pass1_bits = { __8X(PASS1_BITS) };
  /* pd_descale_pN is half of 2^DESCALE_PN: the round-to-nearest term added
   * before the corresponding right shift.
   */
  __vector int pd_zero = { __4X(0) },
    pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) },
    pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) };
  __vector unsigned int descale_p1 = { __4X(DESCALE_P1) },
    descale_p2 = { __4X(DESCALE_P2) },
    const_bits = { __4X(CONST_BITS) };
  __vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) };

  /* Pass 1: process columns */

  col0 = vec_ld(0, coef_block);
  col1 = vec_ld(16, coef_block);
  col2 = vec_ld(32, coef_block);
  col3 = vec_ld(48, coef_block);
  col4 = vec_ld(64, coef_block);
  col5 = vec_ld(80, coef_block);
  col6 = vec_ld(96, coef_block);
  col7 = vec_ld(112, coef_block);

  /* OR together every coefficient vector except the first; the result is
   * all-zero exactly when each column has no AC terms.
   */
  tmp1 = vec_or(col1, col2);
  tmp2 = vec_or(col3, col4);
  tmp1 = vec_or(tmp1, tmp2);
  tmp3 = vec_or(col5, col6);
  tmp3 = vec_or(tmp3, col7);
  tmp1 = vec_or(tmp1, tmp3);

  /* The first vector is dequantized on both paths. */
  quant0 = vec_ld(0, dct_table);
  col0 = vec_mladd(col0, quant0, pw_zero);

  if (vec_all_eq(tmp1, pw_zero)) {
    /* AC terms all zero */

    /* Shortcut: each pass 1 output is just the dequantized DC term scaled by
     * 2^PASS1_BITS.  Splatting element k of col0 across rowk stands in for
     * the DO_IDCT + TRANSPOSE performed by the general path.
     */
    col0 = vec_sl(col0, pass1_bits);

    row0 = vec_splat(col0, 0);
    row1 = vec_splat(col0, 1);
    row2 = vec_splat(col0, 2);
    row3 = vec_splat(col0, 3);
    row4 = vec_splat(col0, 4);
    row5 = vec_splat(col0, 5);
    row6 = vec_splat(col0, 6);
    row7 = vec_splat(col0, 7);

  } else {

    quant1 = vec_ld(16, dct_table);
    quant2 = vec_ld(32, dct_table);
    quant3 = vec_ld(48, dct_table);
    quant4 = vec_ld(64, dct_table);
    quant5 = vec_ld(80, dct_table);
    quant6 = vec_ld(96, dct_table);
    quant7 = vec_ld(112, dct_table);

    /* Dequantize: element-wise coefficient * multiplier (+ 0). */
    col1 = vec_mladd(col1, quant1, pw_zero);
    col2 = vec_mladd(col2, quant2, pw_zero);
    col3 = vec_mladd(col3, quant3, pw_zero);
    col4 = vec_mladd(col4, quant4, pw_zero);
    col5 = vec_mladd(col5, quant5, pw_zero);
    col6 = vec_mladd(col6, quant6, pw_zero);
    col7 = vec_mladd(col7, quant7, pw_zero);

    DO_IDCT(col, 1);

    /* TRANSPOSE is defined outside this file; it is used here to move
     * out0..out7 into row0..row7 for the second pass.
     */
    TRANSPOSE(out, row);
  }

  /* Pass 2: process rows */

  DO_IDCT(row, 2);

  TRANSPOSE(out, col);

  /* Store stage, repeated for each of the eight output rows:
   *  - vec_packs(colN, colN) narrows to signed bytes with saturation and,
   *    because both operands are the same vector, leaves the same eight
   *    samples in both halves of outb;
   *  - adding pb_centerjsamp re-centers the samples around CENTERJSAMPLE;
   *  - two 4-byte vec_ste stores at offsets 0 and 4 write the eight bytes.
   */

  outb = vec_packs(col0, col0);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[0] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col1, col1);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[1] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col2, col2);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[2] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col3, col3);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[3] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col4, col4);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[4] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col5, col5);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[5] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col6, col6);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[6] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);

  outb = vec_packs(col7, col7);
  outb = vec_add(outb, pb_centerjsamp);
  outptr = (int *)(output_buf[7] + output_col);
  vec_ste((__vector int)outb, 0, outptr);
  vec_ste((__vector int)outb, 4, outptr);
}
358