1 /*
2  * AltiVec optimizations for libjpeg-turbo
3  *
4  * Copyright (C) 2014-2015, D. R. Commander.  All Rights Reserved.
5  *
6  * This software is provided 'as-is', without any express or implied
7  * warranty.  In no event will the authors be held liable for any damages
8  * arising from the use of this software.
9  *
10  * Permission is granted to anyone to use this software for any purpose,
11  * including commercial applications, and to alter it and redistribute it
12  * freely, subject to the following restrictions:
13  *
14  * 1. The origin of this software must not be misrepresented; you must not
15  *    claim that you wrote the original software. If you use this software
16  *    in a product, an acknowledgment in the product documentation would be
17  *    appreciated but is not required.
18  * 2. Altered source versions must be plainly marked as such, and must not be
19  *    misrepresented as being the original software.
20  * 3. This notice may not be removed or altered from any source distribution.
21  */
22 
23 /* INTEGER QUANTIZATION AND SAMPLE CONVERSION */
24 
25 #include "jsimd_altivec.h"
26 
27 
28 /* NOTE: The address will either be aligned or offset by 8 bytes, so we can
29  * always get the data we want by using a single vector load (although we may
30  * have to permute the result.)
31  */
/* LOAD_ROW(row): fetch one vector of samples from row `row` of the block.
 *
 * `row` must be a literal row number, because it is token-pasted to form the
 * destination variable name (in0, in1, ...).  The macro relies on these names
 * being in scope at the point of use:
 *   sample_data, start_col  - the row pointer array and starting column
 *   elemptr                 - scratch pointer, overwritten on every use
 *   in<row>                 - destination vector
 *
 * The body is wrapped in do { ... } while (0) so that "LOAD_ROW(n);" is a
 * single statement and can be used safely as the body of an if/else.
 */
#if __BIG_ENDIAN__

#define LOAD_ROW(row)  do { \
  elemptr = sample_data[row] + start_col; \
  in##row = vec_ld(0, elemptr); \
  /* Address not 16-byte aligned: rotate the wanted bytes to the front. */ \
  if ((size_t)elemptr & 15) { \
    in##row = vec_perm(in##row, in##row, vec_lvsl(0, elemptr)); \
  } \
} while (0)

#else

/* This variant issues a single vec_vsx_ld() and performs no permute. */
#define LOAD_ROW(row)  do { \
  elemptr = sample_data[row] + start_col; \
  in##row = vec_vsx_ld(0, elemptr); \
} while (0)

#endif
49 
50 
/*
 * Convert eight rows of samples into centered 16-bit values.
 *
 * For each of rows 0-7 of sample_data, one vector is loaded starting at
 * column start_col (see LOAD_ROW), widened with VEC_UNPACKHU(), reduced by
 * CENTERJSAMPLE, and written to workspace.
 *
 *   sample_data  array of row pointers; rows 0..7 are read
 *   start_col    column at which each row load begins
 *   workspace    output; eight vectors are stored at byte offsets 0, 16,
 *                ..., 112.  NOTE(review): vec_st() is the aligned store, so
 *                workspace is presumably 16-byte aligned -- confirm at the
 *                call site.
 */
void jsimd_convsamp_altivec(JSAMPARRAY sample_data, JDIMENSION start_col,
                            DCTELEM *workspace)
{
  JSAMPROW elemptr;             /* scratch pointer written by LOAD_ROW() */

  __vector unsigned char in0, in1, in2, in3, in4, in5, in6, in7;
  __vector short samp0, samp1, samp2, samp3, samp4, samp5, samp6, samp7;

  /* Constants */
  __vector short pw_centerjsamp = { __8X(CENTERJSAMPLE) };
  /* Not referenced directly below; presumably consumed by the expansion of
   * VEC_UNPACKHU() -- confirm in jsimd_altivec.h before removing.
   */
  __vector unsigned char pb_zero = { __16X(0) };

  /* Read all eight input rows before anything is written to workspace. */
  LOAD_ROW(0);
  LOAD_ROW(1);
  LOAD_ROW(2);
  LOAD_ROW(3);
  LOAD_ROW(4);
  LOAD_ROW(5);
  LOAD_ROW(6);
  LOAD_ROW(7);

  /* Widen to 16-bit lanes and remove the sample bias in one step per row. */
  samp0 = vec_sub((__vector short)VEC_UNPACKHU(in0), pw_centerjsamp);
  samp1 = vec_sub((__vector short)VEC_UNPACKHU(in1), pw_centerjsamp);
  samp2 = vec_sub((__vector short)VEC_UNPACKHU(in2), pw_centerjsamp);
  samp3 = vec_sub((__vector short)VEC_UNPACKHU(in3), pw_centerjsamp);
  samp4 = vec_sub((__vector short)VEC_UNPACKHU(in4), pw_centerjsamp);
  samp5 = vec_sub((__vector short)VEC_UNPACKHU(in5), pw_centerjsamp);
  samp6 = vec_sub((__vector short)VEC_UNPACKHU(in6), pw_centerjsamp);
  samp7 = vec_sub((__vector short)VEC_UNPACKHU(in7), pw_centerjsamp);

  /* One 16-byte vector per output row. */
  vec_st(samp0, 0 * 16, workspace);
  vec_st(samp1, 1 * 16, workspace);
  vec_st(samp2, 2 * 16, workspace);
  vec_st(samp3, 3 * 16, workspace);
  vec_st(samp4, 4 * 16, workspace);
  vec_st(samp5, 5 * 16, workspace);
  vec_st(samp6, 6 * 16, workspace);
  vec_st(samp7, 7 * 16, workspace);
}
99 
100 
#define WORD_BIT  16

/* There is no AltiVec 16-bit unsigned multiply instruction, hence this.
 * We basically need an unsigned equivalent of vec_madds().
 *
 * MULTIPLY(vs0, vs1, out): treat vs0 and vs1 as vectors of unsigned shorts,
 * form the 32-bit products of the even lanes (vec_mule) and of the odd lanes
 * (vec_mulo), then use vec_perm() to pick two bytes out of every product and
 * interleave the even/odd results into `out`.
 *
 * The macro relies on these names being in scope at the point of use:
 *   tmpe, tmpo         - __vector unsigned int scratch, overwritten
 *   shift_pack_index   - permute control selecting the bytes that are kept
 *
 * vs0 and vs1 are each expanded twice, so they should be free of side
 * effects.  Arguments are parenthesized so that the casts apply to the whole
 * argument expression, and the body is wrapped in do { ... } while (0) so
 * that "MULTIPLY(a, b, c);" is a single statement (safe inside if/else).
 */

#define MULTIPLY(vs0, vs1, out)  do { \
  tmpe = vec_mule((__vector unsigned short)(vs0), \
                  (__vector unsigned short)(vs1)); \
  tmpo = vec_mulo((__vector unsigned short)(vs0), \
                  (__vector unsigned short)(vs1)); \
  (out) = (__vector short)vec_perm((__vector unsigned short)(tmpe), \
                                   (__vector unsigned short)(tmpo), \
                                   shift_pack_index); \
} while (0)
115 
/*
 * Quantize eight rows of coefficients.
 *
 * Each element x read from workspace is processed as follows:
 *   1. split into a sign mask and a magnitude |x|
 *   2. add the matching entry of the correction table
 *   3. MULTIPLY() by the matching reciprocal, then by the matching scale
 *      (each MULTIPLY keeps the two product bytes chosen by
 *      shift_pack_index)
 *   4. re-apply the original sign
 * and the result is stored to coef_block.
 *
 *   coef_block  output; eight vectors stored at byte offsets 0..112
 *   divisors    tables read at byte offset 0 (reciprocals), DCTSIZE2 * 2
 *               (corrections) and DCTSIZE2 * 4 (scales), eight vectors each
 *   workspace   input; eight vectors read at byte offsets 0..112
 *
 * NOTE(review): vec_ld()/vec_st() are the aligned forms, so all three
 * pointers are presumably 16-byte aligned -- confirm at the call site.
 */
void jsimd_quantize_altivec(JCOEFPTR coef_block, DCTELEM *divisors,
                            DCTELEM *workspace)
{
  __vector short row0, row1, row2, row3, row4, row5, row6, row7;
  __vector short sign0, sign1, sign2, sign3, sign4, sign5, sign6, sign7;
  __vector short recip0, recip1, recip2, recip3,
    recip4, recip5, recip6, recip7;
  __vector short scale0, scale1, scale2, scale3,
    scale4, scale5, scale6, scale7;
  __vector unsigned int tmpe, tmpo;     /* scratch used by MULTIPLY() */

  /* Constants */
  __vector unsigned short pw_sign_shift = { __8X(WORD_BIT - 1) };
  /* Byte selection used by MULTIPLY(); differs with byte order. */
#if __BIG_ENDIAN__
  __vector unsigned char shift_pack_index =
    {  0,  1, 16, 17,  4,  5, 20, 21,  8,  9, 24, 25, 12, 13, 28, 29 };
#else
  __vector unsigned char shift_pack_index =
    {  2,  3, 18, 19,  6,  7, 22, 23, 10, 11, 26, 27, 14, 15, 30, 31 };
#endif

  /* Fetch the eight input rows. */
  row0 = vec_ld(0 * 16, workspace);
  row1 = vec_ld(1 * 16, workspace);
  row2 = vec_ld(2 * 16, workspace);
  row3 = vec_ld(3 * 16, workspace);
  row4 = vec_ld(4 * 16, workspace);
  row5 = vec_ld(5 * 16, workspace);
  row6 = vec_ld(6 * 16, workspace);
  row7 = vec_ld(7 * 16, workspace);

  /* Arithmetic shift by WORD_BIT - 1 replicates the sign bit across each
   * element: all ones for negative values, zero otherwise.
   */
  sign0 = vec_sra(row0, pw_sign_shift);
  sign1 = vec_sra(row1, pw_sign_shift);
  sign2 = vec_sra(row2, pw_sign_shift);
  sign3 = vec_sra(row3, pw_sign_shift);
  sign4 = vec_sra(row4, pw_sign_shift);
  sign5 = vec_sra(row5, pw_sign_shift);
  sign6 = vec_sra(row6, pw_sign_shift);
  sign7 = vec_sra(row7, pw_sign_shift);

  /* Branch-less absolute value, (x ^ sign) - sign, immediately followed by
   * the addition of the correction term.
   */
  row0 = vec_add(vec_sub(vec_xor(row0, sign0), sign0),
                 vec_ld(DCTSIZE2 * 2 + 0 * 16, divisors));
  row1 = vec_add(vec_sub(vec_xor(row1, sign1), sign1),
                 vec_ld(DCTSIZE2 * 2 + 1 * 16, divisors));
  row2 = vec_add(vec_sub(vec_xor(row2, sign2), sign2),
                 vec_ld(DCTSIZE2 * 2 + 2 * 16, divisors));
  row3 = vec_add(vec_sub(vec_xor(row3, sign3), sign3),
                 vec_ld(DCTSIZE2 * 2 + 3 * 16, divisors));
  row4 = vec_add(vec_sub(vec_xor(row4, sign4), sign4),
                 vec_ld(DCTSIZE2 * 2 + 4 * 16, divisors));
  row5 = vec_add(vec_sub(vec_xor(row5, sign5), sign5),
                 vec_ld(DCTSIZE2 * 2 + 5 * 16, divisors));
  row6 = vec_add(vec_sub(vec_xor(row6, sign6), sign6),
                 vec_ld(DCTSIZE2 * 2 + 6 * 16, divisors));
  row7 = vec_add(vec_sub(vec_xor(row7, sign7), sign7),
                 vec_ld(DCTSIZE2 * 2 + 7 * 16, divisors));

  /* First multiply: reciprocals.  The operands are loaded into named
   * vectors because MULTIPLY() expands each operand twice.
   */
  recip0 = vec_ld(0 * 16, divisors);
  recip1 = vec_ld(1 * 16, divisors);
  recip2 = vec_ld(2 * 16, divisors);
  recip3 = vec_ld(3 * 16, divisors);
  recip4 = vec_ld(4 * 16, divisors);
  recip5 = vec_ld(5 * 16, divisors);
  recip6 = vec_ld(6 * 16, divisors);
  recip7 = vec_ld(7 * 16, divisors);

  MULTIPLY(row0, recip0, row0);
  MULTIPLY(row1, recip1, row1);
  MULTIPLY(row2, recip2, row2);
  MULTIPLY(row3, recip3, row3);
  MULTIPLY(row4, recip4, row4);
  MULTIPLY(row5, recip5, row5);
  MULTIPLY(row6, recip6, row6);
  MULTIPLY(row7, recip7, row7);

  /* Second multiply: scales. */
  scale0 = vec_ld(DCTSIZE2 * 4 + 0 * 16, divisors);
  scale1 = vec_ld(DCTSIZE2 * 4 + 1 * 16, divisors);
  scale2 = vec_ld(DCTSIZE2 * 4 + 2 * 16, divisors);
  scale3 = vec_ld(DCTSIZE2 * 4 + 3 * 16, divisors);
  scale4 = vec_ld(DCTSIZE2 * 4 + 4 * 16, divisors);
  scale5 = vec_ld(DCTSIZE2 * 4 + 5 * 16, divisors);
  scale6 = vec_ld(DCTSIZE2 * 4 + 6 * 16, divisors);
  scale7 = vec_ld(DCTSIZE2 * 4 + 7 * 16, divisors);

  MULTIPLY(row0, scale0, row0);
  MULTIPLY(row1, scale1, row1);
  MULTIPLY(row2, scale2, row2);
  MULTIPLY(row3, scale3, row3);
  MULTIPLY(row4, scale4, row4);
  MULTIPLY(row5, scale5, row5);
  MULTIPLY(row6, scale6, row6);
  MULTIPLY(row7, scale7, row7);

  /* Put the original sign back: (x ^ sign) - sign negates exactly those
   * elements whose mask is all ones.
   */
  row0 = vec_sub(vec_xor(row0, sign0), sign0);
  row1 = vec_sub(vec_xor(row1, sign1), sign1);
  row2 = vec_sub(vec_xor(row2, sign2), sign2);
  row3 = vec_sub(vec_xor(row3, sign3), sign3);
  row4 = vec_sub(vec_xor(row4, sign4), sign4);
  row5 = vec_sub(vec_xor(row5, sign5), sign5);
  row6 = vec_sub(vec_xor(row6, sign6), sign6);
  row7 = vec_sub(vec_xor(row7, sign7), sign7);

  /* Write the quantized rows. */
  vec_st(row0, 0 * 16, coef_block);
  vec_st(row1, 1 * 16, coef_block);
  vec_st(row2, 2 * 16, coef_block);
  vec_st(row3, 3 * 16, coef_block);
  vec_st(row4, 4 * 16, coef_block);
  vec_st(row5, 5 * 16, coef_block);
  vec_st(row6, 6 * 16, coef_block);
  vec_st(row7, 7 * 16, coef_block);
}
251