1 /*
2  * AltiVec optimizations for libjpeg-turbo
3  *
4  * Copyright (C) 2014-2015, D. R. Commander.  All Rights Reserved.
5  *
6  * This software is provided 'as-is', without any express or implied
7  * warranty.  In no event will the authors be held liable for any damages
8  * arising from the use of this software.
9  *
10  * Permission is granted to anyone to use this software for any purpose,
11  * including commercial applications, and to alter it and redistribute it
12  * freely, subject to the following restrictions:
13  *
14  * 1. The origin of this software must not be misrepresented; you must not
15  *    claim that you wrote the original software. If you use this software
16  *    in a product, an acknowledgment in the product documentation would be
17  *    appreciated but is not required.
18  * 2. Altered source versions must be plainly marked as such, and must not be
19  *    misrepresented as being the original software.
20  * 3. This notice may not be removed or altered from any source distribution.
21  */
22 
23 /* FAST INTEGER INVERSE DCT
24  *
25  * This is similar to the SSE2 implementation, except that we left-shift the
26  * constants by 1 less bit (the -1 in CONST_SHIFT.)  This is because
27  * vec_madds(arg1, arg2, arg3) generates the 16-bit saturated sum of:
28  *   the elements in arg3 + the most significant 17 bits of
29  *     (the elements in arg1 * the elements in arg2).
30  */
31 
32 #include "jsimd_altivec.h"
33 
34 
/*
 * Fixed-point parameters.
 *
 * CONST_BITS is the number of fractional bits in the F_* multipliers below:
 * each one is its real-valued factor times 2^CONST_BITS, rounded to the
 * nearest integer (e.g. 1.414213562 * 256 ~= 362).
 *
 * PASS1_BITS feeds the final descale shift (PASS1_BITS + 3) applied to the
 * results of the second pass.
 *
 * PRE_MULTIPLY_SCALE_BITS is the left shift applied to a value before it is
 * multiplied by one of the F_* constants with vec_madds().
 */
#define CONST_BITS 8
#define PASS1_BITS 2
#define PRE_MULTIPLY_SCALE_BITS 2

#define F_1_082 277              /* FIX(1.082392200) */
#define F_1_414 362              /* FIX(1.414213562) */
#define F_1_847 473              /* FIX(1.847759065) */
#define F_2_613 669              /* FIX(2.613125930) */
#define F_1_613 (F_2_613 - (1 << CONST_BITS))  /* FIX(2.613125930) - FIX(1) */

/*
 * Left shift applied to every F_* constant when the constant vectors are
 * built.  Together with the pre-multiply shift of the other operand it lines
 * the product up with the bits that vec_madds() keeps; the trailing -1 is
 * there because vec_madds() keeps the most significant 17 bits of each
 * product rather than 16.
 */
#define CONST_SHIFT (16 - PRE_MULTIPLY_SCALE_BITS - CONST_BITS - 1)
45 
46 
/*
 * DO_IDCT(in): one 1-D pass of the fast integer inverse DCT.
 *
 * Reads the eight inputs in##0 .. in##7 (token-pasted, so DO_IDCT(col) reads
 * col0 .. col7) and writes the eight results to out0 .. out7.
 *
 * The macro is not self-contained.  The expansion site must have these names
 * in scope:
 *   - scratch:   tmp0 .. tmp7, tmp10 .. tmp13, z5, z10, z10s, z11, z12s, z13
 *   - outputs:   out0 .. out7
 *   - constants: pw_zero, pw_F1414, pw_F1847, pw_F1082, pw_MF1613,
 *                pre_multiply_scale_bits
 *
 * Every value that is multiplied by a constant is first shifted left by
 * pre_multiply_scale_bits; the "s" suffix (z10s, z12s) marks such pre-shifted
 * copies.  z10 is kept unshifted as well because it is subtracted directly
 * when tmp12 is formed.
 *
 * The body is wrapped in do { ... } while (0) so that "DO_IDCT(x);" is a
 * single statement and can be used safely in an unbraced if/else.
 */
#define DO_IDCT(in)  \
do {  \
  /* Even part */  \
  \
  tmp10 = vec_add(in##0, in##4);  \
  tmp11 = vec_sub(in##0, in##4);  \
  tmp13 = vec_add(in##2, in##6);  \
  \
  tmp12 = vec_sub(in##2, in##6);  \
  tmp12 = vec_sl(tmp12, pre_multiply_scale_bits);  \
  tmp12 = vec_madds(tmp12, pw_F1414, pw_zero);  \
  tmp12 = vec_sub(tmp12, tmp13);  \
  \
  tmp0 = vec_add(tmp10, tmp13);  \
  tmp3 = vec_sub(tmp10, tmp13);  \
  tmp1 = vec_add(tmp11, tmp12);  \
  tmp2 = vec_sub(tmp11, tmp12);  \
  \
  /* Odd part */  \
  \
  z13 = vec_add(in##5, in##3);  \
  z10 = vec_sub(in##5, in##3);  \
  z10s = vec_sl(z10, pre_multiply_scale_bits);  \
  z11 = vec_add(in##1, in##7);  \
  z12s = vec_sub(in##1, in##7);  \
  z12s = vec_sl(z12s, pre_multiply_scale_bits);  \
  \
  tmp11 = vec_sub(z11, z13);  \
  tmp11 = vec_sl(tmp11, pre_multiply_scale_bits);  \
  tmp11 = vec_madds(tmp11, pw_F1414, pw_zero);  \
  \
  tmp7 = vec_add(z11, z13);  \
  \
  /* To avoid overflow...  \
   *  \
   * (Original)  \
   * tmp12 = -2.613125930 * z10 + z5;  \
   *  \
   * (This implementation)  \
   * tmp12 = (-1.613125930 - 1) * z10 + z5;  \
   *       = -1.613125930 * z10 - z10 + z5;  \
   */  \
  \
  z5 = vec_add(z10s, z12s);  \
  z5 = vec_madds(z5, pw_F1847, pw_zero);  \
  \
  tmp10 = vec_madds(z12s, pw_F1082, pw_zero);  \
  tmp10 = vec_sub(tmp10, z5);  \
  tmp12 = vec_madds(z10s, pw_MF1613, z5);  \
  tmp12 = vec_sub(tmp12, z10);  \
  \
  tmp6 = vec_sub(tmp12, tmp7);  \
  tmp5 = vec_sub(tmp11, tmp6);  \
  tmp4 = vec_add(tmp10, tmp5);  \
  \
  out0 = vec_add(tmp0, tmp7);  \
  out1 = vec_add(tmp1, tmp6);  \
  out2 = vec_add(tmp2, tmp5);  \
  out3 = vec_sub(tmp3, tmp4);  \
  out4 = vec_add(tmp3, tmp4);  \
  out5 = vec_sub(tmp2, tmp5);  \
  out6 = vec_sub(tmp1, tmp6);  \
  out7 = vec_sub(tmp0, tmp7);  \
} while (0)
111 
112 
113 void
jsimd_idct_ifast_altivec(void * dct_table_,JCOEFPTR coef_block,JSAMPARRAY output_buf,JDIMENSION output_col)114 jsimd_idct_ifast_altivec (void *dct_table_, JCOEFPTR coef_block,
115                           JSAMPARRAY output_buf, JDIMENSION output_col)
116 {
117   short *dct_table = (short *)dct_table_;
118   int *outptr;
119 
120   __vector short row0, row1, row2, row3, row4, row5, row6, row7,
121     col0, col1, col2, col3, col4, col5, col6, col7,
122     quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7,
123     tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp10, tmp11, tmp12, tmp13,
124     z5, z10, z10s, z11, z12s, z13,
125     out0, out1, out2, out3, out4, out5, out6, out7;
126   __vector signed char outb;
127 
128   /* Constants */
129   __vector short pw_zero = { __8X(0) },
130     pw_F1414 = { __8X(F_1_414 << CONST_SHIFT) },
131     pw_F1847 = { __8X(F_1_847 << CONST_SHIFT) },
132     pw_MF1613 = { __8X(-F_1_613 << CONST_SHIFT) },
133     pw_F1082 = { __8X(F_1_082 << CONST_SHIFT) };
134   __vector unsigned short
135     pre_multiply_scale_bits = { __8X(PRE_MULTIPLY_SCALE_BITS) },
136     pass1_bits3 = { __8X(PASS1_BITS + 3) };
137   __vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) };
138 
139   /* Pass 1: process columns */
140 
141   col0 = vec_ld(0, coef_block);
142   col1 = vec_ld(16, coef_block);
143   col2 = vec_ld(32, coef_block);
144   col3 = vec_ld(48, coef_block);
145   col4 = vec_ld(64, coef_block);
146   col5 = vec_ld(80, coef_block);
147   col6 = vec_ld(96, coef_block);
148   col7 = vec_ld(112, coef_block);
149 
150   tmp1 = vec_or(col1, col2);
151   tmp2 = vec_or(col3, col4);
152   tmp1 = vec_or(tmp1, tmp2);
153   tmp3 = vec_or(col5, col6);
154   tmp3 = vec_or(tmp3, col7);
155   tmp1 = vec_or(tmp1, tmp3);
156 
157   quant0 = vec_ld(0, dct_table);
158   col0 = vec_mladd(col0, quant0, pw_zero);
159 
160   if (vec_all_eq(tmp1, pw_zero)) {
161     /* AC terms all zero */
162 
163     row0 = vec_splat(col0, 0);
164     row1 = vec_splat(col0, 1);
165     row2 = vec_splat(col0, 2);
166     row3 = vec_splat(col0, 3);
167     row4 = vec_splat(col0, 4);
168     row5 = vec_splat(col0, 5);
169     row6 = vec_splat(col0, 6);
170     row7 = vec_splat(col0, 7);
171 
172   } else {
173 
174     quant1 = vec_ld(16, dct_table);
175     quant2 = vec_ld(32, dct_table);
176     quant3 = vec_ld(48, dct_table);
177     quant4 = vec_ld(64, dct_table);
178     quant5 = vec_ld(80, dct_table);
179     quant6 = vec_ld(96, dct_table);
180     quant7 = vec_ld(112, dct_table);
181 
182     col1 = vec_mladd(col1, quant1, pw_zero);
183     col2 = vec_mladd(col2, quant2, pw_zero);
184     col3 = vec_mladd(col3, quant3, pw_zero);
185     col4 = vec_mladd(col4, quant4, pw_zero);
186     col5 = vec_mladd(col5, quant5, pw_zero);
187     col6 = vec_mladd(col6, quant6, pw_zero);
188     col7 = vec_mladd(col7, quant7, pw_zero);
189 
190     DO_IDCT(col);
191 
192     TRANSPOSE(out, row);
193   }
194 
195   /* Pass 2: process rows */
196 
197   DO_IDCT(row);
198 
199   out0 = vec_sra(out0, pass1_bits3);
200   out1 = vec_sra(out1, pass1_bits3);
201   out2 = vec_sra(out2, pass1_bits3);
202   out3 = vec_sra(out3, pass1_bits3);
203   out4 = vec_sra(out4, pass1_bits3);
204   out5 = vec_sra(out5, pass1_bits3);
205   out6 = vec_sra(out6, pass1_bits3);
206   out7 = vec_sra(out7, pass1_bits3);
207 
208   TRANSPOSE(out, col);
209 
210   outb = vec_packs(col0, col0);
211   outb = vec_add(outb, pb_centerjsamp);
212   outptr = (int *)(output_buf[0] + output_col);
213   vec_ste((__vector int)outb, 0, outptr);
214   vec_ste((__vector int)outb, 4, outptr);
215 
216   outb = vec_packs(col1, col1);
217   outb = vec_add(outb, pb_centerjsamp);
218   outptr = (int *)(output_buf[1] + output_col);
219   vec_ste((__vector int)outb, 0, outptr);
220   vec_ste((__vector int)outb, 4, outptr);
221 
222   outb = vec_packs(col2, col2);
223   outb = vec_add(outb, pb_centerjsamp);
224   outptr = (int *)(output_buf[2] + output_col);
225   vec_ste((__vector int)outb, 0, outptr);
226   vec_ste((__vector int)outb, 4, outptr);
227 
228   outb = vec_packs(col3, col3);
229   outb = vec_add(outb, pb_centerjsamp);
230   outptr = (int *)(output_buf[3] + output_col);
231   vec_ste((__vector int)outb, 0, outptr);
232   vec_ste((__vector int)outb, 4, outptr);
233 
234   outb = vec_packs(col4, col4);
235   outb = vec_add(outb, pb_centerjsamp);
236   outptr = (int *)(output_buf[4] + output_col);
237   vec_ste((__vector int)outb, 0, outptr);
238   vec_ste((__vector int)outb, 4, outptr);
239 
240   outb = vec_packs(col5, col5);
241   outb = vec_add(outb, pb_centerjsamp);
242   outptr = (int *)(output_buf[5] + output_col);
243   vec_ste((__vector int)outb, 0, outptr);
244   vec_ste((__vector int)outb, 4, outptr);
245 
246   outb = vec_packs(col6, col6);
247   outb = vec_add(outb, pb_centerjsamp);
248   outptr = (int *)(output_buf[6] + output_col);
249   vec_ste((__vector int)outb, 0, outptr);
250   vec_ste((__vector int)outb, 4, outptr);
251 
252   outb = vec_packs(col7, col7);
253   outb = vec_add(outb, pb_centerjsamp);
254   outptr = (int *)(output_buf[7] + output_col);
255   vec_ste((__vector int)outb, 0, outptr);
256   vec_ste((__vector int)outb, 4, outptr);
257 }
258