1 /*
2  * AltiVec optimizations for libjpeg-turbo
3  *
4  * Copyright (C) 2014-2015, D. R. Commander.  All Rights Reserved.
5  *
6  * This software is provided 'as-is', without any express or implied
7  * warranty.  In no event will the authors be held liable for any damages
8  * arising from the use of this software.
9  *
10  * Permission is granted to anyone to use this software for any purpose,
11  * including commercial applications, and to alter it and redistribute it
12  * freely, subject to the following restrictions:
13  *
14  * 1. The origin of this software must not be misrepresented; you must not
15  *    claim that you wrote the original software. If you use this software
16  *    in a product, an acknowledgment in the product documentation would be
17  *    appreciated but is not required.
18  * 2. Altered source versions must be plainly marked as such, and must not be
19  *    misrepresented as being the original software.
20  * 3. This notice may not be removed or altered from any source distribution.
21  */
22 
23 /* FAST INTEGER INVERSE DCT
24  *
25  * This is similar to the SSE2 implementation, except that we left-shift the
26  * constants by 1 less bit (the -1 in CONST_SHIFT.)  This is because
27  * vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of:
28  *   the elements in arg3 + the most significant 17 bits of
29  *     (the elements in arg1 * the elements in arg2).
30  */
31 
32 #include "jsimd_altivec.h"
33 
34 
/*
 * Fixed-point scaling parameters.
 *
 * CONST_SHIFT is the number of bits by which each multiplier constant is
 * left-shifted before it is handed to vec_madds(); together with the
 * PRE_MULTIPLY_SCALE_BITS left shift applied to the other operand, this adds
 * up to 16 - 1 = 15 bits (CONST_BITS + CONST_SHIFT + PRE_MULTIPLY_SCALE_BITS).
 */
#define CONST_BITS  8
#define PASS1_BITS  2
#define PRE_MULTIPLY_SCALE_BITS  2
#define CONST_SHIFT  (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1)

/*
 * Multiplier constants, FIX(x) = x scaled by 2^CONST_BITS and rounded to the
 * nearest integer.
 */
#define F_1_082  277                            /* FIX(1.082392200) */
#define F_1_414  362                            /* FIX(1.414213562) */
#define F_1_847  473                            /* FIX(1.847759065) */
#define F_2_613  669                            /* FIX(2.613125930) */
#define F_1_613  (F_2_613 - (1 << CONST_BITS))  /* FIX(2.613125930) - FIX(1) */
45 
46 
/*
 * One 8-point, one-dimensional pass of the fast integer inverse DCT.
 *
 * `in` is a name prefix: the macro reads in0 .. in7 and writes out0 .. out7.
 * It also uses (and clobbers) tmp0-tmp7, tmp10-tmp13, z5, z10, z10s, z11,
 * z12s and z13, and reads pw_zero, pw_F1414, pw_F1847, pw_F1082, pw_MF1613
 * and pre_multiply_scale_bits; all of these must be in scope at the point of
 * expansion.
 *
 * Every value that is multiplied by a constant is first shifted left by
 * pre_multiply_scale_bits and then passed through vec_madds().
 */
#define DO_IDCT(in) { \
  /* ---- Even half: inputs 0, 2, 4, 6 ---- */ \
  tmp10 = vec_add(in##0, in##4); \
  tmp11 = vec_sub(in##0, in##4); \
  tmp13 = vec_add(in##2, in##6); \
  \
  /* tmp12 = (in2 - in6) * 1.414213562 - tmp13 */ \
  tmp12 = vec_sl(vec_sub(in##2, in##6), pre_multiply_scale_bits); \
  tmp12 = vec_sub(vec_madds(tmp12, pw_F1414, pw_zero), tmp13); \
  \
  tmp0 = vec_add(tmp10, tmp13); \
  tmp1 = vec_add(tmp11, tmp12); \
  tmp2 = vec_sub(tmp11, tmp12); \
  tmp3 = vec_sub(tmp10, tmp13); \
  \
  /* ---- Odd half: inputs 1, 3, 5, 7 ---- */ \
  z10 = vec_sub(in##5, in##3); \
  z11 = vec_add(in##1, in##7); \
  z13 = vec_add(in##5, in##3); \
  z10s = vec_sl(z10, pre_multiply_scale_bits); \
  z12s = vec_sl(vec_sub(in##1, in##7), pre_multiply_scale_bits); \
  \
  tmp7 = vec_add(z11, z13); \
  \
  /* tmp11 = (z11 - z13) * 1.414213562 */ \
  tmp11 = vec_sl(vec_sub(z11, z13), pre_multiply_scale_bits); \
  tmp11 = vec_madds(tmp11, pw_F1414, pw_zero); \
  \
  /* z5 = (z10 + z12) * 1.847759065 */ \
  z5 = vec_madds(vec_add(z10s, z12s), pw_F1847, pw_zero); \
  \
  /* tmp10 = z12 * 1.082392200 - z5 */ \
  tmp10 = vec_sub(vec_madds(z12s, pw_F1082, pw_zero), z5); \
  \
  /* The reference formula is tmp12 = -2.613125930 * z10 + z5.  To avoid \
   * overflow, the multiplier is split as (-1.613125930 - 1), giving \
   *   tmp12 = -1.613125930 * z10 - z10 + z5; \
   * the multiply-add supplies the first and last terms and the unscaled \
   * z10 is subtracted afterwards. \
   */ \
  tmp12 = vec_sub(vec_madds(z10s, pw_MF1613, z5), z10); \
  \
  tmp6 = vec_sub(tmp12, tmp7); \
  tmp5 = vec_sub(tmp11, tmp6); \
  tmp4 = vec_add(tmp10, tmp5); \
  \
  /* ---- Final butterflies, written as mirrored output pairs ---- */ \
  out0 = vec_add(tmp0, tmp7); \
  out7 = vec_sub(tmp0, tmp7); \
  out1 = vec_add(tmp1, tmp6); \
  out6 = vec_sub(tmp1, tmp6); \
  out2 = vec_add(tmp2, tmp5); \
  out5 = vec_sub(tmp2, tmp5); \
  out4 = vec_add(tmp3, tmp4); \
  out3 = vec_sub(tmp3, tmp4); \
}
110 
111 
/*
 * Helper for jsimd_idct_ifast_altivec(): pack result vector col<n> and store
 * it as the eight samples of output row n.
 *
 * vec_packs(col<n>, col<n>) narrows the eight 16-bit values to signed 8-bit
 * with saturation, so the same eight bytes appear in both halves of outb.
 * pb_centerjsamp is then added to every byte, and the two vec_ste() calls
 * write the 32-bit words at byte offsets 0 and 4 from
 * output_buf[n] + output_col.
 *
 * NOTE(review): vec_ste() stores whichever vector element corresponds to the
 * target address, so this presumably relies on the destination being 8-byte
 * aligned -- confirm against the callers.
 */
#define IDCT_IFAST_STORE_ROW(n) { \
  outb = vec_packs(col##n, col##n); \
  outb = vec_add(outb, pb_centerjsamp); \
  outptr = (int *)(output_buf[n] + output_col); \
  vec_ste((__vector int)outb, 0, outptr); \
  vec_ste((__vector int)outb, 4, outptr); \
}

/*
 * Dequantize one 8x8 coefficient block, apply the fast integer inverse DCT to
 * it and write the resulting 8x8 samples.
 *
 * dct_table_  multiplier table, read as 16-bit values in eight 16-byte
 *             vectors (byte offsets 0 .. 112)
 * coef_block  coefficients, read as eight 16-byte vectors (byte offsets
 *             0 .. 112)
 * output_buf  array of output row pointers; rows 0 .. 7 are written
 * output_col  offset added to each row pointer; eight bytes are stored there
 *
 * Both input tables are fetched with vec_ld(), which expects 16-byte-aligned
 * addresses.
 */
void jsimd_idct_ifast_altivec(void *dct_table_, JCOEFPTR coef_block,
                              JSAMPARRAY output_buf, JDIMENSION output_col)
{
  short *dct_table = (short *)dct_table_;
  int *outptr;

  __vector short row0, row1, row2, row3, row4, row5, row6, row7,
    col0, col1, col2, col3, col4, col5, col6, col7,
    quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7,
    tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
    z5, z10, z10s, z11, z12s, z13,
    out0, out1, out2, out3, out4, out5, out6, out7;
  __vector signed char outb;

  /* Constants */
  __vector short pw_zero = { __8X(0) },
    pw_F1414 = { __8X(F_1_414 << CONST_SHIFT) },
    pw_F1847 = { __8X(F_1_847 << CONST_SHIFT) },
    /* Negate after shifting: left-shifting a negative value is undefined
     * behavior in C.  The resulting constant is unchanged.
     */
    pw_MF1613 = { __8X(-(F_1_613 << CONST_SHIFT)) },
    pw_F1082 = { __8X(F_1_082 << CONST_SHIFT) };
  __vector unsigned short
    pre_multiply_scale_bits = { __8X(PRE_MULTIPLY_SCALE_BITS) },
    pass1_bits3 = { __8X(PASS1_BITS + 3) };
  __vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) };

  /* Pass 1: process columns */

  col0 = vec_ld(0, coef_block);
  col1 = vec_ld(16, coef_block);
  col2 = vec_ld(32, coef_block);
  col3 = vec_ld(48, coef_block);
  col4 = vec_ld(64, coef_block);
  col5 = vec_ld(80, coef_block);
  col6 = vec_ld(96, coef_block);
  col7 = vec_ld(112, coef_block);

  /* OR together vectors 1-7 of the raw coefficients; the result is all zero
   * only if every one of those coefficients is zero.
   */
  tmp1 = vec_or(vec_or(col1, col2), vec_or(col3, col4));
  tmp3 = vec_or(vec_or(col5, col6), col7);
  tmp1 = vec_or(tmp1, tmp3);

  /* Vector 0 is needed on both paths, so dequantize it unconditionally. */
  quant0 = vec_ld(0, dct_table);
  col0 = vec_mladd(col0, quant0, pw_zero);

  if (vec_all_eq(tmp1, pw_zero)) {
    /* AC terms all zero: skip the column pass and broadcast each element of
     * the dequantized vector 0 across its own row vector.
     */

    row0 = vec_splat(col0, 0);
    row1 = vec_splat(col0, 1);
    row2 = vec_splat(col0, 2);
    row3 = vec_splat(col0, 3);
    row4 = vec_splat(col0, 4);
    row5 = vec_splat(col0, 5);
    row6 = vec_splat(col0, 6);
    row7 = vec_splat(col0, 7);

  } else {

    /* Dequantize the remaining seven vectors (16-bit modular multiply). */
    quant1 = vec_ld(16, dct_table);
    quant2 = vec_ld(32, dct_table);
    quant3 = vec_ld(48, dct_table);
    quant4 = vec_ld(64, dct_table);
    quant5 = vec_ld(80, dct_table);
    quant6 = vec_ld(96, dct_table);
    quant7 = vec_ld(112, dct_table);

    col1 = vec_mladd(col1, quant1, pw_zero);
    col2 = vec_mladd(col2, quant2, pw_zero);
    col3 = vec_mladd(col3, quant3, pw_zero);
    col4 = vec_mladd(col4, quant4, pw_zero);
    col5 = vec_mladd(col5, quant5, pw_zero);
    col6 = vec_mladd(col6, quant6, pw_zero);
    col7 = vec_mladd(col7, quant7, pw_zero);

    DO_IDCT(col);

    TRANSPOSE(out, row);
  }

  /* Pass 2: process rows */

  DO_IDCT(row);

  /* Final descale: arithmetic right shift by PASS1_BITS + 3. */
  out0 = vec_sra(out0, pass1_bits3);
  out1 = vec_sra(out1, pass1_bits3);
  out2 = vec_sra(out2, pass1_bits3);
  out3 = vec_sra(out3, pass1_bits3);
  out4 = vec_sra(out4, pass1_bits3);
  out5 = vec_sra(out5, pass1_bits3);
  out6 = vec_sra(out6, pass1_bits3);
  out7 = vec_sra(out7, pass1_bits3);

  TRANSPOSE(out, col);

  /* Saturate to 8 bits, add the sample offset and store the eight rows. */
  IDCT_IFAST_STORE_ROW(0);
  IDCT_IFAST_STORE_ROW(1);
  IDCT_IFAST_STORE_ROW(2);
  IDCT_IFAST_STORE_ROW(3);
  IDCT_IFAST_STORE_ROW(4);
  IDCT_IFAST_STORE_ROW(5);
  IDCT_IFAST_STORE_ROW(6);
  IDCT_IFAST_STORE_ROW(7);
}

#undef IDCT_IFAST_STORE_ROW
256