/*
 * AltiVec optimizations for libjpeg-turbo
 *
 * Copyright (C) 2014-2015, D. R. Commander.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */
22 
/* SLOW INTEGER INVERSE DCT */
24 
#include "jsimd_altivec.h"
26 
27 
/*
 * Fixed-point multipliers for the slow integer IDCT.  Each value is the real
 * constant named in its comment, scaled by 2^CONST_BITS (8192) and rounded to
 * the nearest integer.
 */
#define F_0_298 2446   /* FIX(0.298631336) */
#define F_0_390 3196   /* FIX(0.390180644) */
#define F_0_541 4433   /* FIX(0.541196100) */
#define F_0_765 6270   /* FIX(0.765366865) */
#define F_0_899 7373   /* FIX(0.899976223) */
#define F_1_175 9633   /* FIX(1.175875602) */
#define F_1_501 12299  /* FIX(1.501321110) */
#define F_1_847 15137  /* FIX(1.847759065) */
#define F_1_961 16069  /* FIX(1.961570560) */
#define F_2_053 16819  /* FIX(2.053119869) */
#define F_2_562 20995  /* FIX(2.562915447) */
#define F_3_072 25172  /* FIX(3.072711026) */

/* Number of fractional bits carried by the F_* multipliers. */
#define CONST_BITS 13
/* Extra scaling (in bits) left in the pass 1 results and removed in pass 2. */
#define PASS1_BITS 2
/* Right-shift counts applied to the 32-bit sums at the end of each pass. */
#define DESCALE_P1 (CONST_BITS - PASS1_BITS)
#define DESCALE_P2 (CONST_BITS + PASS1_BITS + 3)
45 
46 
/*
 * DO_IDCT(in, PASS): one 8-point inverse DCT pass, applied lane-wise to eight
 * vectors of 16-bit values.
 *
 *   in   - token prefix of the eight input vectors in##0 .. in##7.  The
 *          scratch vectors in##26l/h, in##71l/h and in##53l/h must also be
 *          declared in the expanding scope.
 *   PASS - 1 or 2; selects pd_descale_p##PASS (rounding term) and
 *          descale_p##PASS (arithmetic right-shift count).
 *
 * Results are left in out0 .. out7.  The macro also reads and writes the
 * tmp*, z* and out*l/h temporaries and reads the pw_* / pd_* / const_bits
 * constant vectors, all of which must be declared by the enclosing function.
 *
 * Products are accumulated as 32-bit lanes, split into an "l" and an "h"
 * half per 16-bit input vector.  The rounding term is added once to tmp0 and
 * tmp1, so every output sum already contains it before the final shift.
 * vec_pack then narrows each pair of 32-bit halves back to 16 bits.
 */
#define DO_IDCT(in, PASS)  \
{  \
  /* Even part  \
   *  \
   * (Original)  \
   * z1 = (z2 + z3) * 0.541196100;  \
   * tmp2 = z1 + z3 * -1.847759065;  \
   * tmp3 = z1 + z2 * 0.765366865;  \
   *  \
   * (This implementation)  \
   * tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);  \
   * tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;  \
   */  \
  \
  in##26l = vec_mergeh(in##2, in##6);  \
  in##26h = vec_mergel(in##2, in##6);  \
  \
  tmp3l = vec_msums(in##26l, pw_f130_f054, pd_zero);  \
  tmp3h = vec_msums(in##26h, pw_f130_f054, pd_zero);  \
  tmp2l = vec_msums(in##26l, pw_f054_mf130, pd_zero);  \
  tmp2h = vec_msums(in##26h, pw_f054_mf130, pd_zero);  \
  \
  tmp0 = vec_add(in##0, in##4);  \
  tmp1 = vec_sub(in##0, in##4);  \
  \
  tmp0l = vec_unpackh(tmp0);  \
  tmp0h = vec_unpackl(tmp0);  \
  tmp0l = vec_sl(tmp0l, const_bits);  \
  tmp0h = vec_sl(tmp0h, const_bits);  \
  tmp0l = vec_add(tmp0l, pd_descale_p##PASS);  \
  tmp0h = vec_add(tmp0h, pd_descale_p##PASS);  \
  \
  tmp10l = vec_add(tmp0l, tmp3l);  \
  tmp10h = vec_add(tmp0h, tmp3h);  \
  tmp13l = vec_sub(tmp0l, tmp3l);  \
  tmp13h = vec_sub(tmp0h, tmp3h);  \
  \
  tmp1l = vec_unpackh(tmp1);  \
  tmp1h = vec_unpackl(tmp1);  \
  tmp1l = vec_sl(tmp1l, const_bits);  \
  tmp1h = vec_sl(tmp1h, const_bits);  \
  tmp1l = vec_add(tmp1l, pd_descale_p##PASS);  \
  tmp1h = vec_add(tmp1h, pd_descale_p##PASS);  \
  \
  tmp11l = vec_add(tmp1l, tmp2l);  \
  tmp11h = vec_add(tmp1h, tmp2h);  \
  tmp12l = vec_sub(tmp1l, tmp2l);  \
  tmp12h = vec_sub(tmp1h, tmp2h);  \
  \
  /* Odd part */  \
  \
  z3 = vec_add(in##3, in##7);  \
  z4 = vec_add(in##1, in##5);  \
  \
  /* (Original)  \
   * z5 = (z3 + z4) * 1.175875602;  \
   * z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;  \
   * z3 += z5;  z4 += z5;  \
   *  \
   * (This implementation)  \
   * z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;  \
   * z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);  \
   */  \
  \
  z34l = vec_mergeh(z3, z4);  \
  z34h = vec_mergel(z3, z4);  \
  \
  z3l = vec_msums(z34l, pw_mf078_f117, pd_zero);  \
  z3h = vec_msums(z34h, pw_mf078_f117, pd_zero);  \
  z4l = vec_msums(z34l, pw_f117_f078, pd_zero);  \
  z4h = vec_msums(z34h, pw_f117_f078, pd_zero);  \
  \
  /* (Original)  \
   * z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;  \
   * tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;  \
   * tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;  \
   * z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;  \
   * tmp0 += z1 + z3;  tmp1 += z2 + z4;  \
   * tmp2 += z2 + z3;  tmp3 += z1 + z4;  \
   *  \
   * (This implementation)  \
   * tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;  \
   * tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;  \
   * tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);  \
   * tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);  \
   * tmp0 += z3;  tmp1 += z4;  \
   * tmp2 += z3;  tmp3 += z4;  \
   */  \
  \
  in##71l = vec_mergeh(in##7, in##1);  \
  in##71h = vec_mergel(in##7, in##1);  \
  \
  tmp0l = vec_msums(in##71l, pw_mf060_mf089, z3l);  \
  tmp0h = vec_msums(in##71h, pw_mf060_mf089, z3h);  \
  tmp3l = vec_msums(in##71l, pw_mf089_f060, z4l);  \
  tmp3h = vec_msums(in##71h, pw_mf089_f060, z4h);  \
  \
  in##53l = vec_mergeh(in##5, in##3);  \
  in##53h = vec_mergel(in##5, in##3);  \
  \
  tmp1l = vec_msums(in##53l, pw_mf050_mf256, z4l);  \
  tmp1h = vec_msums(in##53h, pw_mf050_mf256, z4h);  \
  tmp2l = vec_msums(in##53l, pw_mf256_f050, z3l);  \
  tmp2h = vec_msums(in##53h, pw_mf256_f050, z3h);  \
  \
  /* Final output stage */  \
  \
  out0l = vec_add(tmp10l, tmp3l);  \
  out0h = vec_add(tmp10h, tmp3h);  \
  out7l = vec_sub(tmp10l, tmp3l);  \
  out7h = vec_sub(tmp10h, tmp3h);  \
  \
  out0l = vec_sra(out0l, descale_p##PASS);  \
  out0h = vec_sra(out0h, descale_p##PASS);  \
  out7l = vec_sra(out7l, descale_p##PASS);  \
  out7h = vec_sra(out7h, descale_p##PASS);  \
  \
  out0 = vec_pack(out0l, out0h);  \
  out7 = vec_pack(out7l, out7h);  \
  \
  out1l = vec_add(tmp11l, tmp2l);  \
  out1h = vec_add(tmp11h, tmp2h);  \
  out6l = vec_sub(tmp11l, tmp2l);  \
  out6h = vec_sub(tmp11h, tmp2h);  \
  \
  out1l = vec_sra(out1l, descale_p##PASS);  \
  out1h = vec_sra(out1h, descale_p##PASS);  \
  out6l = vec_sra(out6l, descale_p##PASS);  \
  out6h = vec_sra(out6h, descale_p##PASS);  \
  \
  out1 = vec_pack(out1l, out1h);  \
  out6 = vec_pack(out6l, out6h);  \
  \
  out2l = vec_add(tmp12l, tmp1l);  \
  out2h = vec_add(tmp12h, tmp1h);  \
  out5l = vec_sub(tmp12l, tmp1l);  \
  out5h = vec_sub(tmp12h, tmp1h);  \
  \
  out2l = vec_sra(out2l, descale_p##PASS);  \
  out2h = vec_sra(out2h, descale_p##PASS);  \
  out5l = vec_sra(out5l, descale_p##PASS);  \
  out5h = vec_sra(out5h, descale_p##PASS);  \
  \
  out2 = vec_pack(out2l, out2h);  \
  out5 = vec_pack(out5l, out5h);  \
  \
  out3l = vec_add(tmp13l, tmp0l);  \
  out3h = vec_add(tmp13h, tmp0h);  \
  out4l = vec_sub(tmp13l, tmp0l);  \
  out4h = vec_sub(tmp13h, tmp0h);  \
  \
  out3l = vec_sra(out3l, descale_p##PASS);  \
  out3h = vec_sra(out3h, descale_p##PASS);  \
  out4l = vec_sra(out4l, descale_p##PASS);  \
  out4h = vec_sra(out4h, descale_p##PASS);  \
  \
  out3 = vec_pack(out3l, out3h);  \
  out4 = vec_pack(out4l, out4h);  \
}
206 
207 
208 void
jsimd_idct_islow_altivec(void * dct_table_,JCOEFPTR coef_block,JSAMPARRAY output_buf,JDIMENSION output_col)209 jsimd_idct_islow_altivec (void *dct_table_, JCOEFPTR coef_block,
210                           JSAMPARRAY output_buf, JDIMENSION output_col)
211 {
212   short *dct_table = (short *)dct_table_;
213   int *outptr;
214 
215   __vector short row0, row1, row2, row3, row4, row5, row6, row7,
216     col0, col1, col2, col3, col4, col5, col6, col7,
217     quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7,
218     tmp0, tmp1, tmp2, tmp3, z3, z4,
219     z34l, z34h, col71l, col71h, col26l, col26h, col53l, col53h,
220     row71l, row71h, row26l, row26h, row53l, row53h,
221     out0, out1, out2, out3, out4, out5, out6, out7;
222   __vector int tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h,
223     tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h,
224     z3l, z3h, z4l, z4h,
225     out0l, out0h, out1l, out1h, out2l, out2h, out3l, out3h, out4l, out4h,
226     out5l, out5h, out6l, out6h, out7l, out7h;
227   __vector signed char outb;
228 
229   /* Constants */
230   __vector short pw_zero = { __8X(0) },
231     pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) },
232     pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) },
233     pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) },
234     pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) },
235     pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) },
236     pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) },
237     pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) },
238     pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) };
239   __vector unsigned short pass1_bits = { __8X(PASS1_BITS) };
240   __vector int pd_zero = { __4X(0) },
241     pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) },
242     pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) };
243   __vector unsigned int descale_p1 = { __4X(DESCALE_P1) },
244     descale_p2 = { __4X(DESCALE_P2) },
245     const_bits = { __4X(CONST_BITS) };
246   __vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) };
247 
248   /* Pass 1: process columns */
249 
250   col0 = vec_ld(0, coef_block);
251   col1 = vec_ld(16, coef_block);
252   col2 = vec_ld(32, coef_block);
253   col3 = vec_ld(48, coef_block);
254   col4 = vec_ld(64, coef_block);
255   col5 = vec_ld(80, coef_block);
256   col6 = vec_ld(96, coef_block);
257   col7 = vec_ld(112, coef_block);
258 
259   tmp1 = vec_or(col1, col2);
260   tmp2 = vec_or(col3, col4);
261   tmp1 = vec_or(tmp1, tmp2);
262   tmp3 = vec_or(col5, col6);
263   tmp3 = vec_or(tmp3, col7);
264   tmp1 = vec_or(tmp1, tmp3);
265 
266   quant0 = vec_ld(0, dct_table);
267   col0 = vec_mladd(col0, quant0, pw_zero);
268 
269   if (vec_all_eq(tmp1, pw_zero)) {
270     /* AC terms all zero */
271 
272     col0 = vec_sl(col0, pass1_bits);
273 
274     row0 = vec_splat(col0, 0);
275     row1 = vec_splat(col0, 1);
276     row2 = vec_splat(col0, 2);
277     row3 = vec_splat(col0, 3);
278     row4 = vec_splat(col0, 4);
279     row5 = vec_splat(col0, 5);
280     row6 = vec_splat(col0, 6);
281     row7 = vec_splat(col0, 7);
282 
283   } else {
284 
285     quant1 = vec_ld(16, dct_table);
286     quant2 = vec_ld(32, dct_table);
287     quant3 = vec_ld(48, dct_table);
288     quant4 = vec_ld(64, dct_table);
289     quant5 = vec_ld(80, dct_table);
290     quant6 = vec_ld(96, dct_table);
291     quant7 = vec_ld(112, dct_table);
292 
293     col1 = vec_mladd(col1, quant1, pw_zero);
294     col2 = vec_mladd(col2, quant2, pw_zero);
295     col3 = vec_mladd(col3, quant3, pw_zero);
296     col4 = vec_mladd(col4, quant4, pw_zero);
297     col5 = vec_mladd(col5, quant5, pw_zero);
298     col6 = vec_mladd(col6, quant6, pw_zero);
299     col7 = vec_mladd(col7, quant7, pw_zero);
300 
301     DO_IDCT(col, 1);
302 
303     TRANSPOSE(out, row);
304   }
305 
306   /* Pass 2: process rows */
307 
308   DO_IDCT(row, 2);
309 
310   TRANSPOSE(out, col);
311 
312   outb = vec_packs(col0, col0);
313   outb = vec_add(outb, pb_centerjsamp);
314   outptr = (int *)(output_buf[0] + output_col);
315   vec_ste((__vector int)outb, 0, outptr);
316   vec_ste((__vector int)outb, 4, outptr);
317 
318   outb = vec_packs(col1, col1);
319   outb = vec_add(outb, pb_centerjsamp);
320   outptr = (int *)(output_buf[1] + output_col);
321   vec_ste((__vector int)outb, 0, outptr);
322   vec_ste((__vector int)outb, 4, outptr);
323 
324   outb = vec_packs(col2, col2);
325   outb = vec_add(outb, pb_centerjsamp);
326   outptr = (int *)(output_buf[2] + output_col);
327   vec_ste((__vector int)outb, 0, outptr);
328   vec_ste((__vector int)outb, 4, outptr);
329 
330   outb = vec_packs(col3, col3);
331   outb = vec_add(outb, pb_centerjsamp);
332   outptr = (int *)(output_buf[3] + output_col);
333   vec_ste((__vector int)outb, 0, outptr);
334   vec_ste((__vector int)outb, 4, outptr);
335 
336   outb = vec_packs(col4, col4);
337   outb = vec_add(outb, pb_centerjsamp);
338   outptr = (int *)(output_buf[4] + output_col);
339   vec_ste((__vector int)outb, 0, outptr);
340   vec_ste((__vector int)outb, 4, outptr);
341 
342   outb = vec_packs(col5, col5);
343   outb = vec_add(outb, pb_centerjsamp);
344   outptr = (int *)(output_buf[5] + output_col);
345   vec_ste((__vector int)outb, 0, outptr);
346   vec_ste((__vector int)outb, 4, outptr);
347 
348   outb = vec_packs(col6, col6);
349   outb = vec_add(outb, pb_centerjsamp);
350   outptr = (int *)(output_buf[6] + output_col);
351   vec_ste((__vector int)outb, 0, outptr);
352   vec_ste((__vector int)outb, 4, outptr);
353 
354   outb = vec_packs(col7, col7);
355   outb = vec_add(outb, pb_centerjsamp);
356   outptr = (int *)(output_buf[7] + output_col);
357   vec_ste((__vector int)outb, 0, outptr);
358   vec_ste((__vector int)outb, 4, outptr);
359 }
360