1 /*
2  * AltiVec optimizations for libjpeg-turbo
3  *
4  * Copyright (C) 2014-2015, D. R. Commander.  All Rights Reserved.
5  *
6  * This software is provided 'as-is', without any express or implied
7  * warranty.  In no event will the authors be held liable for any damages
8  * arising from the use of this software.
9  *
10  * Permission is granted to anyone to use this software for any purpose,
11  * including commercial applications, and to alter it and redistribute it
12  * freely, subject to the following restrictions:
13  *
14  * 1. The origin of this software must not be misrepresented; you must not
15  *    claim that you wrote the original software. If you use this software
16  *    in a product, an acknowledgment in the product documentation would be
17  *    appreciated but is not required.
18  * 2. Altered source versions must be plainly marked as such, and must not be
19  *    misrepresented as being the original software.
20  * 3. This notice may not be removed or altered from any source distribution.
21  */
22 
23 /* INTEGER QUANTIZATION AND SAMPLE CONVERSION */
24 
25 #include "jsimd_altivec.h"
26 
27 
/* LOAD_ROW(row) loads one row of input samples into the vector in<row>.
 *
 * The macro relies on names supplied by the enclosing scope: sample_data,
 * start_col, a scratch pointer elemptr (overwritten on every use), and a
 * vector variable in<row>.  Because `row` is token-pasted onto "in", it must
 * be a literal token (0, 1, ...), not an expression.
 *
 * NOTE: The address will either be aligned or offset by 8 bytes, so we can
 * always get the data we want by using a single vector load (although we may
 * have to permute the result.)
 *
 * The body is wrapped in do { } while (0) so that "LOAD_ROW(n);" expands to
 * exactly one statement and stays correct inside an unbraced if/else.
 */
#if __BIG_ENDIAN__

/* vec_ld() ignores the low four bits of the address, so for a misaligned
 * elemptr the loaded vector is rotated left by the misalignment (the permute
 * control comes from vec_lvsl()) to bring the requested bytes to the front.
 */
#define LOAD_ROW(row)  do {  \
  elemptr = sample_data[row] + start_col;  \
  in##row = vec_ld(0, elemptr);  \
  if ((size_t)elemptr & 15) {  \
    in##row = vec_perm(in##row, in##row, vec_lvsl(0, elemptr));  \
  }  \
} while (0)

#else

/* vec_vsx_ld() is used without any alignment fix-up. */
#define LOAD_ROW(row)  do {  \
  elemptr = sample_data[row] + start_col;  \
  in##row = vec_vsx_ld(0, elemptr);  \
} while (0)

#endif
49 
50 
51 void
jsimd_convsamp_altivec(JSAMPARRAY sample_data,JDIMENSION start_col,DCTELEM * workspace)52 jsimd_convsamp_altivec (JSAMPARRAY sample_data, JDIMENSION start_col,
53                         DCTELEM *workspace)
54 {
55   JSAMPROW elemptr;
56 
57   __vector unsigned char in0, in1, in2, in3, in4, in5, in6, in7;
58   __vector short out0, out1, out2, out3, out4, out5, out6, out7;
59 
60   /* Constants */
61   __vector short pw_centerjsamp = { __8X(CENTERJSAMPLE) };
62   __vector unsigned char pb_zero = { __16X(0) };
63 
64   LOAD_ROW(0);
65   LOAD_ROW(1);
66   LOAD_ROW(2);
67   LOAD_ROW(3);
68   LOAD_ROW(4);
69   LOAD_ROW(5);
70   LOAD_ROW(6);
71   LOAD_ROW(7);
72 
73   out0 = (__vector short)VEC_UNPACKHU(in0);
74   out1 = (__vector short)VEC_UNPACKHU(in1);
75   out2 = (__vector short)VEC_UNPACKHU(in2);
76   out3 = (__vector short)VEC_UNPACKHU(in3);
77   out4 = (__vector short)VEC_UNPACKHU(in4);
78   out5 = (__vector short)VEC_UNPACKHU(in5);
79   out6 = (__vector short)VEC_UNPACKHU(in6);
80   out7 = (__vector short)VEC_UNPACKHU(in7);
81 
82   out0 = vec_sub(out0, pw_centerjsamp);
83   out1 = vec_sub(out1, pw_centerjsamp);
84   out2 = vec_sub(out2, pw_centerjsamp);
85   out3 = vec_sub(out3, pw_centerjsamp);
86   out4 = vec_sub(out4, pw_centerjsamp);
87   out5 = vec_sub(out5, pw_centerjsamp);
88   out6 = vec_sub(out6, pw_centerjsamp);
89   out7 = vec_sub(out7, pw_centerjsamp);
90 
91   vec_st(out0, 0, workspace);
92   vec_st(out1, 16, workspace);
93   vec_st(out2, 32, workspace);
94   vec_st(out3, 48, workspace);
95   vec_st(out4, 64, workspace);
96   vec_st(out5, 80, workspace);
97   vec_st(out6, 96, workspace);
98   vec_st(out7, 112, workspace);
99 }
100 
101 
/* Width in bits of the 16-bit lanes processed below. */
#define WORD_BIT 16

/* There is no AltiVec 16-bit unsigned multiply instruction, hence this.
 * We basically need an unsigned equivalent of vec_madds().
 *
 * MULTIPLY(vs0, vs1, out) treats vs0 and vs1 as vectors of unsigned 16-bit
 * lanes.  vec_mule() and vec_mulo() produce the full 32-bit products of the
 * even and odd lanes respectively, and vec_perm() with shift_pack_index then
 * gathers one 16-bit half of every product back into a single vector, which
 * is assigned to out.  With the shift_pack_index tables defined in
 * jsimd_quantize_altivec(), the half that is kept is the upper 16 bits.
 *
 * The enclosing scope must provide tmpe and tmpo (32-bit unsigned vectors used
 * as scratch) and shift_pack_index.  vs0 and vs1 are each evaluated twice, so
 * they must not have side effects.
 *
 * The parameters are parenthesized so that the casts apply to the whole
 * argument expression, and the body is wrapped in do { } while (0) so that
 * "MULTIPLY(a, b, c);" is a single statement.
 */
#define MULTIPLY(vs0, vs1, out)  do {  \
  tmpe = vec_mule((__vector unsigned short)(vs0),  \
                  (__vector unsigned short)(vs1));  \
  tmpo = vec_mulo((__vector unsigned short)(vs0),  \
                  (__vector unsigned short)(vs1));  \
  (out) = (__vector short)vec_perm((__vector unsigned short)tmpe,  \
                                   (__vector unsigned short)tmpo,  \
                                   shift_pack_index);  \
} while (0)
116 
117 void
jsimd_quantize_altivec(JCOEFPTR coef_block,DCTELEM * divisors,DCTELEM * workspace)118 jsimd_quantize_altivec (JCOEFPTR coef_block, DCTELEM *divisors,
119                         DCTELEM *workspace)
120 {
121   __vector short row0, row1, row2, row3, row4, row5, row6, row7,
122     row0s, row1s, row2s, row3s, row4s, row5s, row6s, row7s,
123     corr0, corr1, corr2, corr3, corr4, corr5, corr6, corr7,
124     recip0, recip1, recip2, recip3, recip4, recip5, recip6, recip7,
125     scale0, scale1, scale2, scale3, scale4, scale5, scale6, scale7;
126   __vector unsigned int tmpe, tmpo;
127 
128   /* Constants */
129   __vector unsigned short pw_word_bit_m1 = { __8X(WORD_BIT - 1) };
130 #if __BIG_ENDIAN__
131   __vector unsigned char shift_pack_index =
132     {0,1,16,17,4,5,20,21,8,9,24,25,12,13,28,29};
133 #else
134   __vector unsigned char shift_pack_index =
135     {2,3,18,19,6,7,22,23,10,11,26,27,14,15,30,31};
136 #endif
137 
138   row0 = vec_ld(0, workspace);
139   row1 = vec_ld(16, workspace);
140   row2 = vec_ld(32, workspace);
141   row3 = vec_ld(48, workspace);
142   row4 = vec_ld(64, workspace);
143   row5 = vec_ld(80, workspace);
144   row6 = vec_ld(96, workspace);
145   row7 = vec_ld(112, workspace);
146 
147   /* Branch-less absolute value */
148   row0s = vec_sra(row0, pw_word_bit_m1);
149   row1s = vec_sra(row1, pw_word_bit_m1);
150   row2s = vec_sra(row2, pw_word_bit_m1);
151   row3s = vec_sra(row3, pw_word_bit_m1);
152   row4s = vec_sra(row4, pw_word_bit_m1);
153   row5s = vec_sra(row5, pw_word_bit_m1);
154   row6s = vec_sra(row6, pw_word_bit_m1);
155   row7s = vec_sra(row7, pw_word_bit_m1);
156   row0 = vec_xor(row0, row0s);
157   row1 = vec_xor(row1, row1s);
158   row2 = vec_xor(row2, row2s);
159   row3 = vec_xor(row3, row3s);
160   row4 = vec_xor(row4, row4s);
161   row5 = vec_xor(row5, row5s);
162   row6 = vec_xor(row6, row6s);
163   row7 = vec_xor(row7, row7s);
164   row0 = vec_sub(row0, row0s);
165   row1 = vec_sub(row1, row1s);
166   row2 = vec_sub(row2, row2s);
167   row3 = vec_sub(row3, row3s);
168   row4 = vec_sub(row4, row4s);
169   row5 = vec_sub(row5, row5s);
170   row6 = vec_sub(row6, row6s);
171   row7 = vec_sub(row7, row7s);
172 
173   corr0 = vec_ld(DCTSIZE2 * 2, divisors);
174   corr1 = vec_ld(DCTSIZE2 * 2 + 16, divisors);
175   corr2 = vec_ld(DCTSIZE2 * 2 + 32, divisors);
176   corr3 = vec_ld(DCTSIZE2 * 2 + 48, divisors);
177   corr4 = vec_ld(DCTSIZE2 * 2 + 64, divisors);
178   corr5 = vec_ld(DCTSIZE2 * 2 + 80, divisors);
179   corr6 = vec_ld(DCTSIZE2 * 2 + 96, divisors);
180   corr7 = vec_ld(DCTSIZE2 * 2 + 112, divisors);
181 
182   row0 = vec_add(row0, corr0);
183   row1 = vec_add(row1, corr1);
184   row2 = vec_add(row2, corr2);
185   row3 = vec_add(row3, corr3);
186   row4 = vec_add(row4, corr4);
187   row5 = vec_add(row5, corr5);
188   row6 = vec_add(row6, corr6);
189   row7 = vec_add(row7, corr7);
190 
191   recip0 = vec_ld(0, divisors);
192   recip1 = vec_ld(16, divisors);
193   recip2 = vec_ld(32, divisors);
194   recip3 = vec_ld(48, divisors);
195   recip4 = vec_ld(64, divisors);
196   recip5 = vec_ld(80, divisors);
197   recip6 = vec_ld(96, divisors);
198   recip7 = vec_ld(112, divisors);
199 
200   MULTIPLY(row0, recip0, row0);
201   MULTIPLY(row1, recip1, row1);
202   MULTIPLY(row2, recip2, row2);
203   MULTIPLY(row3, recip3, row3);
204   MULTIPLY(row4, recip4, row4);
205   MULTIPLY(row5, recip5, row5);
206   MULTIPLY(row6, recip6, row6);
207   MULTIPLY(row7, recip7, row7);
208 
209   scale0 = vec_ld(DCTSIZE2 * 4, divisors);
210   scale1 = vec_ld(DCTSIZE2 * 4 + 16, divisors);
211   scale2 = vec_ld(DCTSIZE2 * 4 + 32, divisors);
212   scale3 = vec_ld(DCTSIZE2 * 4 + 48, divisors);
213   scale4 = vec_ld(DCTSIZE2 * 4 + 64, divisors);
214   scale5 = vec_ld(DCTSIZE2 * 4 + 80, divisors);
215   scale6 = vec_ld(DCTSIZE2 * 4 + 96, divisors);
216   scale7 = vec_ld(DCTSIZE2 * 4 + 112, divisors);
217 
218   MULTIPLY(row0, scale0, row0);
219   MULTIPLY(row1, scale1, row1);
220   MULTIPLY(row2, scale2, row2);
221   MULTIPLY(row3, scale3, row3);
222   MULTIPLY(row4, scale4, row4);
223   MULTIPLY(row5, scale5, row5);
224   MULTIPLY(row6, scale6, row6);
225   MULTIPLY(row7, scale7, row7);
226 
227   row0 = vec_xor(row0, row0s);
228   row1 = vec_xor(row1, row1s);
229   row2 = vec_xor(row2, row2s);
230   row3 = vec_xor(row3, row3s);
231   row4 = vec_xor(row4, row4s);
232   row5 = vec_xor(row5, row5s);
233   row6 = vec_xor(row6, row6s);
234   row7 = vec_xor(row7, row7s);
235   row0 = vec_sub(row0, row0s);
236   row1 = vec_sub(row1, row1s);
237   row2 = vec_sub(row2, row2s);
238   row3 = vec_sub(row3, row3s);
239   row4 = vec_sub(row4, row4s);
240   row5 = vec_sub(row5, row5s);
241   row6 = vec_sub(row6, row6s);
242   row7 = vec_sub(row7, row7s);
243 
244   vec_st(row0, 0, coef_block);
245   vec_st(row1, 16, coef_block);
246   vec_st(row2, 32, coef_block);
247   vec_st(row3, 48, coef_block);
248   vec_st(row4, 64, coef_block);
249   vec_st(row5, 80, coef_block);
250   vec_st(row6, 96, coef_block);
251   vec_st(row7, 112, coef_block);
252 }
253