1 /* libFLAC - Free Lossless Audio Codec library
2  * Copyright (C) 2000-2009  Josh Coalson
3  * Copyright (C) 2011-2016  Xiph.Org Foundation
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  *
9  * - Redistributions of source code must retain the above copyright
10  * notice, this list of conditions and the following disclaimer.
11  *
12  * - Redistributions in binary form must reproduce the above copyright
13  * notice, this list of conditions and the following disclaimer in the
14  * documentation and/or other materials provided with the distribution.
15  *
16  * - Neither the name of the Xiph.org Foundation nor the names of its
17  * contributors may be used to endorse or promote products derived from
18  * this software without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23  * A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
24  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
25  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
26  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31  */
32 
33 #ifdef HAVE_CONFIG_H
34 #  include <config.h>
35 #endif
36 
37 #ifndef FLAC__INTEGER_ONLY_LIBRARY
38 #ifndef FLAC__NO_ASM
39 #if defined(FLAC__CPU_PPC64) && defined(FLAC__USE_VSX)
40 
41 #include "private/cpu.h"
42 #include "private/lpc.h"
43 #include "FLAC/assert.h"
44 #include "FLAC/format.h"
45 
46 #include <altivec.h>
47 
48 #ifdef FLAC__HAS_TARGET_POWER8
49 __attribute__((target("cpu=power8")))
FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_16(const FLAC__real data[],uint32_t data_len,uint32_t lag,FLAC__real autoc[])50 void FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_16(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[])
51 {
52 	long i;
53 	long limit = (long)data_len - 16;
54 	const FLAC__real *base;
55 	vector float sum0 = { 0.0f, 0.0f, 0.0f, 0.0f};
56 	vector float sum1 = { 0.0f, 0.0f, 0.0f, 0.0f};
57 	vector float sum2 = { 0.0f, 0.0f, 0.0f, 0.0f};
58 	vector float sum3 = { 0.0f, 0.0f, 0.0f, 0.0f};
59 	vector float sum10 = { 0.0f, 0.0f, 0.0f, 0.0f};
60 	vector float sum11 = { 0.0f, 0.0f, 0.0f, 0.0f};
61 	vector float sum12 = { 0.0f, 0.0f, 0.0f, 0.0f};
62 	vector float sum13 = { 0.0f, 0.0f, 0.0f, 0.0f};
63 	vector float sum20 = { 0.0f, 0.0f, 0.0f, 0.0f};
64 	vector float sum21 = { 0.0f, 0.0f, 0.0f, 0.0f};
65 	vector float sum22 = { 0.0f, 0.0f, 0.0f, 0.0f};
66 	vector float sum23 = { 0.0f, 0.0f, 0.0f, 0.0f};
67 	vector float sum30 = { 0.0f, 0.0f, 0.0f, 0.0f};
68 	vector float sum31 = { 0.0f, 0.0f, 0.0f, 0.0f};
69 	vector float sum32 = { 0.0f, 0.0f, 0.0f, 0.0f};
70 	vector float sum33 = { 0.0f, 0.0f, 0.0f, 0.0f};
71 	vector float d0, d1, d2, d3, d4;
72 #if WORDS_BIGENDIAN
73 	vector unsigned int vsel1 = { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF };
74 	vector unsigned int vsel2 = { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF };
75 	vector unsigned int vsel3 = { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF };
76 	vector unsigned int vperm1 = { 0x04050607, 0x08090A0B, 0x0C0D0E0F, 0x10111213 };
77 	vector unsigned int vperm2 = { 0x08090A0B, 0x0C0D0E0F, 0x10111213, 0x14151617 };
78 	vector unsigned int vperm3 = { 0x0C0D0E0F, 0x10111213, 0x14151617, 0x18191A1B };
79 #else
80 	vector unsigned int vsel1 = { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 };
81 	vector unsigned int vsel2 = { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 };
82 	vector unsigned int vsel3 = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 };
83 	vector unsigned int vperm1 = { 0x07060504, 0x0B0A0908, 0x0F0E0D0C, 0x13121110 };
84 	vector unsigned int vperm2 = { 0x0B0A0908, 0x0F0E0D0C, 0x13121110, 0x17161514 };
85 	vector unsigned int vperm3 = { 0x0F0E0D0C, 0x13121110, 0x17161514, 0x1B1A1918 };
86 #endif
87 
88 	(void) lag;
89 	FLAC__ASSERT(lag <= 16);
90 	FLAC__ASSERT(lag <= data_len);
91 
92 	base = data;
93 
94 	d0 = vec_vsx_ld(0, base);
95 	d1 = vec_vsx_ld(16, base);
96 	d2 = vec_vsx_ld(32, base);
97 	d3 = vec_vsx_ld(48, base);
98 
99 	base += 16;
100 
101 	for (i = 0; i <= (limit-4); i += 4) {
102 		vector float d, d0_orig = d0;
103 
104 		d4 = vec_vsx_ld(0, base);
105 		base += 4;
106 
107 		d = vec_splat(d0_orig, 0);
108 		sum0 += d0 * d;
109 		sum1 += d1 * d;
110 		sum2 += d2 * d;
111 		sum3 += d3 * d;
112 
113 		d = vec_splat(d0_orig, 1);
114 		d0 = vec_sel(d0_orig, d4, vsel1);
115 		sum10 += d0 * d;
116 		sum11 += d1 * d;
117 		sum12 += d2 * d;
118 		sum13 += d3 * d;
119 
120 		d = vec_splat(d0_orig, 2);
121 		d0 = vec_sel(d0_orig, d4, vsel2);
122 		sum20 += d0 * d;
123 		sum21 += d1 * d;
124 		sum22 += d2 * d;
125 		sum23 += d3 * d;
126 
127 		d = vec_splat(d0_orig, 3);
128 		d0 = vec_sel(d0_orig, d4, vsel3);
129 		sum30 += d0 * d;
130 		sum31 += d1 * d;
131 		sum32 += d2 * d;
132 		sum33 += d3 * d;
133 
134 		d0 = d1;
135 		d1 = d2;
136 		d2 = d3;
137 		d3 = d4;
138 	}
139 
140 	sum0 += vec_perm(sum10, sum11, (vector unsigned char)vperm1);
141 	sum1 += vec_perm(sum11, sum12, (vector unsigned char)vperm1);
142 	sum2 += vec_perm(sum12, sum13, (vector unsigned char)vperm1);
143 	sum3 += vec_perm(sum13, sum10, (vector unsigned char)vperm1);
144 
145 	sum0 += vec_perm(sum20, sum21, (vector unsigned char)vperm2);
146 	sum1 += vec_perm(sum21, sum22, (vector unsigned char)vperm2);
147 	sum2 += vec_perm(sum22, sum23, (vector unsigned char)vperm2);
148 	sum3 += vec_perm(sum23, sum20, (vector unsigned char)vperm2);
149 
150 	sum0 += vec_perm(sum30, sum31, (vector unsigned char)vperm3);
151 	sum1 += vec_perm(sum31, sum32, (vector unsigned char)vperm3);
152 	sum2 += vec_perm(sum32, sum33, (vector unsigned char)vperm3);
153 	sum3 += vec_perm(sum33, sum30, (vector unsigned char)vperm3);
154 
155 	for (; i <= limit; i++) {
156 		vector float d;
157 
158 		d0 = vec_vsx_ld(0, data+i);
159 		d1 = vec_vsx_ld(16, data+i);
160 		d2 = vec_vsx_ld(32, data+i);
161 		d3 = vec_vsx_ld(48, data+i);
162 
163 		d = vec_splat(d0, 0);
164 		sum0 += d0 * d;
165 		sum1 += d1 * d;
166 		sum2 += d2 * d;
167 		sum3 += d3 * d;
168 	}
169 
170 	vec_vsx_st(sum0, 0, autoc);
171 	vec_vsx_st(sum1, 16, autoc);
172 	vec_vsx_st(sum2, 32, autoc);
173 	vec_vsx_st(sum3, 48, autoc);
174 
175 	for (; i < (long)data_len; i++) {
176 		uint32_t coeff;
177 
178 		FLAC__real d = data[i];
179 		for (coeff = 0; coeff < data_len - i; coeff++)
180 			autoc[coeff] += d * data[i+coeff];
181 	}
182 }
183 
184 __attribute__((target("cpu=power8")))
FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_12(const FLAC__real data[],uint32_t data_len,uint32_t lag,FLAC__real autoc[])185 void FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_12(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[])
186 {
187 	long i;
188 	long limit = (long)data_len - 12;
189 	const FLAC__real *base;
190 	vector float sum0 = { 0.0f, 0.0f, 0.0f, 0.0f};
191 	vector float sum1 = { 0.0f, 0.0f, 0.0f, 0.0f};
192 	vector float sum2 = { 0.0f, 0.0f, 0.0f, 0.0f};
193 	vector float sum10 = { 0.0f, 0.0f, 0.0f, 0.0f};
194 	vector float sum11 = { 0.0f, 0.0f, 0.0f, 0.0f};
195 	vector float sum12 = { 0.0f, 0.0f, 0.0f, 0.0f};
196 	vector float sum20 = { 0.0f, 0.0f, 0.0f, 0.0f};
197 	vector float sum21 = { 0.0f, 0.0f, 0.0f, 0.0f};
198 	vector float sum22 = { 0.0f, 0.0f, 0.0f, 0.0f};
199 	vector float sum30 = { 0.0f, 0.0f, 0.0f, 0.0f};
200 	vector float sum31 = { 0.0f, 0.0f, 0.0f, 0.0f};
201 	vector float sum32 = { 0.0f, 0.0f, 0.0f, 0.0f};
202 	vector float d0, d1, d2, d3;
203 #if WORDS_BIGENDIAN
204 	vector unsigned int vsel1 = { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF };
205 	vector unsigned int vsel2 = { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF };
206 	vector unsigned int vsel3 = { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF };
207 	vector unsigned int vperm1 = { 0x04050607, 0x08090A0B, 0x0C0D0E0F, 0x10111213 };
208 	vector unsigned int vperm2 = { 0x08090A0B, 0x0C0D0E0F, 0x10111213, 0x14151617 };
209 	vector unsigned int vperm3 = { 0x0C0D0E0F, 0x10111213, 0x14151617, 0x18191A1B };
210 #else
211 	vector unsigned int vsel1 = { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 };
212 	vector unsigned int vsel2 = { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 };
213 	vector unsigned int vsel3 = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 };
214 	vector unsigned int vperm1 = { 0x07060504, 0x0B0A0908, 0x0F0E0D0C, 0x13121110 };
215 	vector unsigned int vperm2 = { 0x0B0A0908, 0x0F0E0D0C, 0x13121110, 0x17161514 };
216 	vector unsigned int vperm3 = { 0x0F0E0D0C, 0x13121110, 0x17161514, 0x1B1A1918 };
217 #endif
218 
219 	(void) lag;
220 	FLAC__ASSERT(lag <= 12);
221 	FLAC__ASSERT(lag <= data_len);
222 
223 	base = data;
224 
225 	d0 = vec_vsx_ld(0, base);
226 	d1 = vec_vsx_ld(16, base);
227 	d2 = vec_vsx_ld(32, base);
228 
229 	base += 12;
230 
231 	for (i = 0; i <= (limit-3); i += 4) {
232 		vector float d, d0_orig = d0;
233 
234 		d3 = vec_vsx_ld(0, base);
235 		base += 4;
236 
237 		d = vec_splat(d0_orig, 0);
238 		sum0 += d0 * d;
239 		sum1 += d1 * d;
240 		sum2 += d2 * d;
241 
242 		d = vec_splat(d0_orig, 1);
243 		d0 = vec_sel(d0_orig, d3, vsel1);
244 		sum10 += d0 * d;
245 		sum11 += d1 * d;
246 		sum12 += d2 * d;
247 
248 		d = vec_splat(d0_orig, 2);
249 		d0 = vec_sel(d0_orig, d3, vsel2);
250 		sum20 += d0 * d;
251 		sum21 += d1 * d;
252 		sum22 += d2 * d;
253 
254 		d = vec_splat(d0_orig, 3);
255 		d0 = vec_sel(d0_orig, d3, vsel3);
256 		sum30 += d0 * d;
257 		sum31 += d1 * d;
258 		sum32 += d2 * d;
259 
260 		d0 = d1;
261 		d1 = d2;
262 		d2 = d3;
263 	}
264 
265 	sum0 += vec_perm(sum10, sum11, (vector unsigned char)vperm1);
266 	sum1 += vec_perm(sum11, sum12, (vector unsigned char)vperm1);
267 	sum2 += vec_perm(sum12, sum10, (vector unsigned char)vperm1);
268 
269 	sum0 += vec_perm(sum20, sum21, (vector unsigned char)vperm2);
270 	sum1 += vec_perm(sum21, sum22, (vector unsigned char)vperm2);
271 	sum2 += vec_perm(sum22, sum20, (vector unsigned char)vperm2);
272 
273 	sum0 += vec_perm(sum30, sum31, (vector unsigned char)vperm3);
274 	sum1 += vec_perm(sum31, sum32, (vector unsigned char)vperm3);
275 	sum2 += vec_perm(sum32, sum30, (vector unsigned char)vperm3);
276 
277 	for (; i <= limit; i++) {
278 		vector float d;
279 
280 		d0 = vec_vsx_ld(0, data+i);
281 		d1 = vec_vsx_ld(16, data+i);
282 		d2 = vec_vsx_ld(32, data+i);
283 
284 		d = vec_splat(d0, 0);
285 		sum0 += d0 * d;
286 		sum1 += d1 * d;
287 		sum2 += d2 * d;
288 	}
289 
290 	vec_vsx_st(sum0, 0, autoc);
291 	vec_vsx_st(sum1, 16, autoc);
292 	vec_vsx_st(sum2, 32, autoc);
293 
294 	for (; i < (long)data_len; i++) {
295 		uint32_t coeff;
296 
297 		FLAC__real d = data[i];
298 		for (coeff = 0; coeff < data_len - i; coeff++)
299 			autoc[coeff] += d * data[i+coeff];
300 	}
301 }
302 
303 __attribute__((target("cpu=power8")))
FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_8(const FLAC__real data[],uint32_t data_len,uint32_t lag,FLAC__real autoc[])304 void FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_8(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[])
305 {
306 	long i;
307 	long limit = (long)data_len - 8;
308 	const FLAC__real *base;
309 	vector float sum0 = { 0.0f, 0.0f, 0.0f, 0.0f};
310 	vector float sum1 = { 0.0f, 0.0f, 0.0f, 0.0f};
311 	vector float sum10 = { 0.0f, 0.0f, 0.0f, 0.0f};
312 	vector float sum11 = { 0.0f, 0.0f, 0.0f, 0.0f};
313 	vector float sum20 = { 0.0f, 0.0f, 0.0f, 0.0f};
314 	vector float sum21 = { 0.0f, 0.0f, 0.0f, 0.0f};
315 	vector float sum30 = { 0.0f, 0.0f, 0.0f, 0.0f};
316 	vector float sum31 = { 0.0f, 0.0f, 0.0f, 0.0f};
317 	vector float d0, d1, d2;
318 #if WORDS_BIGENDIAN
319 	vector unsigned int vsel1 = { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF };
320 	vector unsigned int vsel2 = { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF };
321 	vector unsigned int vsel3 = { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF };
322 	vector unsigned int vperm1 = { 0x04050607, 0x08090A0B, 0x0C0D0E0F, 0x10111213 };
323 	vector unsigned int vperm2 = { 0x08090A0B, 0x0C0D0E0F, 0x10111213, 0x14151617 };
324 	vector unsigned int vperm3 = { 0x0C0D0E0F, 0x10111213, 0x14151617, 0x18191A1B };
325 #else
326 	vector unsigned int vsel1 = { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 };
327 	vector unsigned int vsel2 = { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 };
328 	vector unsigned int vsel3 = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 };
329 	vector unsigned int vperm1 = { 0x07060504, 0x0B0A0908, 0x0F0E0D0C, 0x13121110 };
330 	vector unsigned int vperm2 = { 0x0B0A0908, 0x0F0E0D0C, 0x13121110, 0x17161514 };
331 	vector unsigned int vperm3 = { 0x0F0E0D0C, 0x13121110, 0x17161514, 0x1B1A1918 };
332 #endif
333 
334 	(void) lag;
335 	FLAC__ASSERT(lag <= 8);
336 	FLAC__ASSERT(lag <= data_len);
337 
338 	base = data;
339 
340 	d0 = vec_vsx_ld(0, base);
341 	d1 = vec_vsx_ld(16, base);
342 
343 	base += 8;
344 
345 	for (i = 0; i <= (limit-2); i += 4) {
346 		vector float d, d0_orig = d0;
347 
348 		d2 = vec_vsx_ld(0, base);
349 		base += 4;
350 
351 		d = vec_splat(d0_orig, 0);
352 		sum0 += d0 * d;
353 		sum1 += d1 * d;
354 
355 		d = vec_splat(d0_orig, 1);
356 		d0 = vec_sel(d0_orig, d2, vsel1);
357 		sum10 += d0 * d;
358 		sum11 += d1 * d;
359 
360 		d = vec_splat(d0_orig, 2);
361 		d0 = vec_sel(d0_orig, d2, vsel2);
362 		sum20 += d0 * d;
363 		sum21 += d1 * d;
364 
365 		d = vec_splat(d0_orig, 3);
366 		d0 = vec_sel(d0_orig, d2, vsel3);
367 		sum30 += d0 * d;
368 		sum31 += d1 * d;
369 
370 		d0 = d1;
371 		d1 = d2;
372 	}
373 
374 	sum0 += vec_perm(sum10, sum11, (vector unsigned char)vperm1);
375 	sum1 += vec_perm(sum11, sum10, (vector unsigned char)vperm1);
376 
377 	sum0 += vec_perm(sum20, sum21, (vector unsigned char)vperm2);
378 	sum1 += vec_perm(sum21, sum20, (vector unsigned char)vperm2);
379 
380 	sum0 += vec_perm(sum30, sum31, (vector unsigned char)vperm3);
381 	sum1 += vec_perm(sum31, sum30, (vector unsigned char)vperm3);
382 
383 	for (; i <= limit; i++) {
384 		vector float d;
385 
386 		d0 = vec_vsx_ld(0, data+i);
387 		d1 = vec_vsx_ld(16, data+i);
388 
389 		d = vec_splat(d0, 0);
390 		sum0 += d0 * d;
391 		sum1 += d1 * d;
392 	}
393 
394 	vec_vsx_st(sum0, 0, autoc);
395 	vec_vsx_st(sum1, 16, autoc);
396 
397 	for (; i < (long)data_len; i++) {
398 		uint32_t coeff;
399 
400 		FLAC__real d = data[i];
401 		for (coeff = 0; coeff < data_len - i; coeff++)
402 			autoc[coeff] += d * data[i+coeff];
403 	}
404 }
405 
406 __attribute__((target("cpu=power8")))
FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_4(const FLAC__real data[],uint32_t data_len,uint32_t lag,FLAC__real autoc[])407 void FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_4(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[])
408 {
409 	long i;
410 	long limit = (long)data_len - 4;
411 	const FLAC__real *base;
412 	vector float sum0 = { 0.0f, 0.0f, 0.0f, 0.0f};
413 	vector float sum10 = { 0.0f, 0.0f, 0.0f, 0.0f};
414 	vector float sum20 = { 0.0f, 0.0f, 0.0f, 0.0f};
415 	vector float sum30 = { 0.0f, 0.0f, 0.0f, 0.0f};
416 	vector float d0, d1;
417 #if WORDS_BIGENDIAN
418 	vector unsigned int vsel1 = { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF };
419 	vector unsigned int vsel2 = { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF };
420 	vector unsigned int vsel3 = { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF };
421 	vector unsigned int vperm1 = { 0x04050607, 0x08090A0B, 0x0C0D0E0F, 0x10111213 };
422 	vector unsigned int vperm2 = { 0x08090A0B, 0x0C0D0E0F, 0x10111213, 0x14151617 };
423 	vector unsigned int vperm3 = { 0x0C0D0E0F, 0x10111213, 0x14151617, 0x18191A1B };
424 #else
425 	vector unsigned int vsel1 = { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 };
426 	vector unsigned int vsel2 = { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 };
427 	vector unsigned int vsel3 = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 };
428 	vector unsigned int vperm1 = { 0x07060504, 0x0B0A0908, 0x0F0E0D0C, 0x13121110 };
429 	vector unsigned int vperm2 = { 0x0B0A0908, 0x0F0E0D0C, 0x13121110, 0x17161514 };
430 	vector unsigned int vperm3 = { 0x0F0E0D0C, 0x13121110, 0x17161514, 0x1B1A1918 };
431 #endif
432 
433 	(void) lag;
434 	FLAC__ASSERT(lag <= 4);
435 	FLAC__ASSERT(lag <= data_len);
436 
437 	base = data;
438 
439 	d0 = vec_vsx_ld(0, base);
440 
441 	base += 4;
442 
443 	for (i = 0; i <= (limit-1); i += 4) {
444 		vector float d, d0_orig = d0;
445 
446 		d1 = vec_vsx_ld(0, base);
447 		base += 4;
448 
449 		d = vec_splat(d0_orig, 0);
450 		sum0 += d0 * d;
451 
452 		d = vec_splat(d0_orig, 1);
453 		d0 = vec_sel(d0_orig, d1, vsel1);
454 		sum10 += d0 * d;
455 
456 		d = vec_splat(d0_orig, 2);
457 		d0 = vec_sel(d0_orig, d1, vsel2);
458 		sum20 += d0 * d;
459 
460 		d = vec_splat(d0_orig, 3);
461 		d0 = vec_sel(d0_orig, d1, vsel3);
462 		sum30 += d0 * d;
463 
464 		d0 = d1;
465 	}
466 
467 	sum0 += vec_perm(sum10, sum10, (vector unsigned char)vperm1);
468 
469 	sum0 += vec_perm(sum20, sum20, (vector unsigned char)vperm2);
470 
471 	sum0 += vec_perm(sum30, sum30, (vector unsigned char)vperm3);
472 
473 	for (; i <= limit; i++) {
474 		vector float d;
475 
476 		d0 = vec_vsx_ld(0, data+i);
477 
478 		d = vec_splat(d0, 0);
479 		sum0 += d0 * d;
480 	}
481 
482 	vec_vsx_st(sum0, 0, autoc);
483 
484 	for (; i < (long)data_len; i++) {
485 		uint32_t coeff;
486 
487 		FLAC__real d = data[i];
488 		for (coeff = 0; coeff < data_len - i; coeff++)
489 			autoc[coeff] += d * data[i+coeff];
490 	}
491 }
492 #endif /* FLAC__HAS_TARGET_POWER8 */
493 
494 #ifdef FLAC__HAS_TARGET_POWER9
495 __attribute__((target("cpu=power9")))
FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_16(const FLAC__real data[],uint32_t data_len,uint32_t lag,FLAC__real autoc[])496 void FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_16(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[])
497 {
498 	long i;
499 	long limit = (long)data_len - 16;
500 	const FLAC__real *base;
501 	vector float sum0 = { 0.0f, 0.0f, 0.0f, 0.0f};
502 	vector float sum1 = { 0.0f, 0.0f, 0.0f, 0.0f};
503 	vector float sum2 = { 0.0f, 0.0f, 0.0f, 0.0f};
504 	vector float sum3 = { 0.0f, 0.0f, 0.0f, 0.0f};
505 	vector float sum10 = { 0.0f, 0.0f, 0.0f, 0.0f};
506 	vector float sum11 = { 0.0f, 0.0f, 0.0f, 0.0f};
507 	vector float sum12 = { 0.0f, 0.0f, 0.0f, 0.0f};
508 	vector float sum13 = { 0.0f, 0.0f, 0.0f, 0.0f};
509 	vector float sum20 = { 0.0f, 0.0f, 0.0f, 0.0f};
510 	vector float sum21 = { 0.0f, 0.0f, 0.0f, 0.0f};
511 	vector float sum22 = { 0.0f, 0.0f, 0.0f, 0.0f};
512 	vector float sum23 = { 0.0f, 0.0f, 0.0f, 0.0f};
513 	vector float sum30 = { 0.0f, 0.0f, 0.0f, 0.0f};
514 	vector float sum31 = { 0.0f, 0.0f, 0.0f, 0.0f};
515 	vector float sum32 = { 0.0f, 0.0f, 0.0f, 0.0f};
516 	vector float sum33 = { 0.0f, 0.0f, 0.0f, 0.0f};
517 	vector float d0, d1, d2, d3, d4;
518 #if WORDS_BIGENDIAN
519 	vector unsigned int vsel1 = { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF };
520 	vector unsigned int vsel2 = { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF };
521 	vector unsigned int vsel3 = { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF };
522 	vector unsigned int vperm1 = { 0x04050607, 0x08090A0B, 0x0C0D0E0F, 0x10111213 };
523 	vector unsigned int vperm2 = { 0x08090A0B, 0x0C0D0E0F, 0x10111213, 0x14151617 };
524 	vector unsigned int vperm3 = { 0x0C0D0E0F, 0x10111213, 0x14151617, 0x18191A1B };
525 #else
526 	vector unsigned int vsel1 = { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 };
527 	vector unsigned int vsel2 = { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 };
528 	vector unsigned int vsel3 = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 };
529 	vector unsigned int vperm1 = { 0x07060504, 0x0B0A0908, 0x0F0E0D0C, 0x13121110 };
530 	vector unsigned int vperm2 = { 0x0B0A0908, 0x0F0E0D0C, 0x13121110, 0x17161514 };
531 	vector unsigned int vperm3 = { 0x0F0E0D0C, 0x13121110, 0x17161514, 0x1B1A1918 };
532 #endif
533 
534 	(void) lag;
535 	FLAC__ASSERT(lag <= 16);
536 	FLAC__ASSERT(lag <= data_len);
537 
538 	base = data;
539 
540 	d0 = vec_vsx_ld(0, base);
541 	d1 = vec_vsx_ld(16, base);
542 	d2 = vec_vsx_ld(32, base);
543 	d3 = vec_vsx_ld(48, base);
544 
545 	base += 16;
546 
547 	for (i = 0; i <= (limit-4); i += 4) {
548 		vector float d, d0_orig = d0;
549 
550 		d4 = vec_vsx_ld(0, base);
551 		base += 4;
552 
553 		d = vec_splat(d0_orig, 0);
554 		sum0 += d0 * d;
555 		sum1 += d1 * d;
556 		sum2 += d2 * d;
557 		sum3 += d3 * d;
558 
559 		d = vec_splat(d0_orig, 1);
560 		d0 = vec_sel(d0_orig, d4, vsel1);
561 		sum10 += d0 * d;
562 		sum11 += d1 * d;
563 		sum12 += d2 * d;
564 		sum13 += d3 * d;
565 
566 		d = vec_splat(d0_orig, 2);
567 		d0 = vec_sel(d0_orig, d4, vsel2);
568 		sum20 += d0 * d;
569 		sum21 += d1 * d;
570 		sum22 += d2 * d;
571 		sum23 += d3 * d;
572 
573 		d = vec_splat(d0_orig, 3);
574 		d0 = vec_sel(d0_orig, d4, vsel3);
575 		sum30 += d0 * d;
576 		sum31 += d1 * d;
577 		sum32 += d2 * d;
578 		sum33 += d3 * d;
579 
580 		d0 = d1;
581 		d1 = d2;
582 		d2 = d3;
583 		d3 = d4;
584 	}
585 
586 	sum0 += vec_perm(sum10, sum11, (vector unsigned char)vperm1);
587 	sum1 += vec_perm(sum11, sum12, (vector unsigned char)vperm1);
588 	sum2 += vec_perm(sum12, sum13, (vector unsigned char)vperm1);
589 	sum3 += vec_perm(sum13, sum10, (vector unsigned char)vperm1);
590 
591 	sum0 += vec_perm(sum20, sum21, (vector unsigned char)vperm2);
592 	sum1 += vec_perm(sum21, sum22, (vector unsigned char)vperm2);
593 	sum2 += vec_perm(sum22, sum23, (vector unsigned char)vperm2);
594 	sum3 += vec_perm(sum23, sum20, (vector unsigned char)vperm2);
595 
596 	sum0 += vec_perm(sum30, sum31, (vector unsigned char)vperm3);
597 	sum1 += vec_perm(sum31, sum32, (vector unsigned char)vperm3);
598 	sum2 += vec_perm(sum32, sum33, (vector unsigned char)vperm3);
599 	sum3 += vec_perm(sum33, sum30, (vector unsigned char)vperm3);
600 
601 	for (; i <= limit; i++) {
602 		vector float d;
603 
604 		d0 = vec_vsx_ld(0, data+i);
605 		d1 = vec_vsx_ld(16, data+i);
606 		d2 = vec_vsx_ld(32, data+i);
607 		d3 = vec_vsx_ld(48, data+i);
608 
609 		d = vec_splat(d0, 0);
610 		sum0 += d0 * d;
611 		sum1 += d1 * d;
612 		sum2 += d2 * d;
613 		sum3 += d3 * d;
614 	}
615 
616 	vec_vsx_st(sum0, 0, autoc);
617 	vec_vsx_st(sum1, 16, autoc);
618 	vec_vsx_st(sum2, 32, autoc);
619 	vec_vsx_st(sum3, 48, autoc);
620 
621 	for (; i < (long)data_len; i++) {
622 		uint32_t coeff;
623 
624 		FLAC__real d = data[i];
625 		for (coeff = 0; coeff < data_len - i; coeff++)
626 			autoc[coeff] += d * data[i+coeff];
627 	}
628 }
629 
630 __attribute__((target("cpu=power9")))
FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_12(const FLAC__real data[],uint32_t data_len,uint32_t lag,FLAC__real autoc[])631 void FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_12(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[])
632 {
633 	long i;
634 	long limit = (long)data_len - 12;
635 	const FLAC__real *base;
636 	vector float sum0 = { 0.0f, 0.0f, 0.0f, 0.0f};
637 	vector float sum1 = { 0.0f, 0.0f, 0.0f, 0.0f};
638 	vector float sum2 = { 0.0f, 0.0f, 0.0f, 0.0f};
639 	vector float sum10 = { 0.0f, 0.0f, 0.0f, 0.0f};
640 	vector float sum11 = { 0.0f, 0.0f, 0.0f, 0.0f};
641 	vector float sum12 = { 0.0f, 0.0f, 0.0f, 0.0f};
642 	vector float sum20 = { 0.0f, 0.0f, 0.0f, 0.0f};
643 	vector float sum21 = { 0.0f, 0.0f, 0.0f, 0.0f};
644 	vector float sum22 = { 0.0f, 0.0f, 0.0f, 0.0f};
645 	vector float sum30 = { 0.0f, 0.0f, 0.0f, 0.0f};
646 	vector float sum31 = { 0.0f, 0.0f, 0.0f, 0.0f};
647 	vector float sum32 = { 0.0f, 0.0f, 0.0f, 0.0f};
648 	vector float d0, d1, d2, d3;
649 #if WORDS_BIGENDIAN
650 	vector unsigned int vsel1 = { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF };
651 	vector unsigned int vsel2 = { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF };
652 	vector unsigned int vsel3 = { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF };
653 	vector unsigned int vperm1 = { 0x04050607, 0x08090A0B, 0x0C0D0E0F, 0x10111213 };
654 	vector unsigned int vperm2 = { 0x08090A0B, 0x0C0D0E0F, 0x10111213, 0x14151617 };
655 	vector unsigned int vperm3 = { 0x0C0D0E0F, 0x10111213, 0x14151617, 0x18191A1B };
656 #else
657 	vector unsigned int vsel1 = { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 };
658 	vector unsigned int vsel2 = { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 };
659 	vector unsigned int vsel3 = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 };
660 	vector unsigned int vperm1 = { 0x07060504, 0x0B0A0908, 0x0F0E0D0C, 0x13121110 };
661 	vector unsigned int vperm2 = { 0x0B0A0908, 0x0F0E0D0C, 0x13121110, 0x17161514 };
662 	vector unsigned int vperm3 = { 0x0F0E0D0C, 0x13121110, 0x17161514, 0x1B1A1918 };
663 #endif
664 
665 	(void) lag;
666 	FLAC__ASSERT(lag <= 12);
667 	FLAC__ASSERT(lag <= data_len);
668 
669 	base = data;
670 
671 	d0 = vec_vsx_ld(0, base);
672 	d1 = vec_vsx_ld(16, base);
673 	d2 = vec_vsx_ld(32, base);
674 
675 	base += 12;
676 
677 	for (i = 0; i <= (limit-3); i += 4) {
678 		vector float d, d0_orig = d0;
679 
680 		d3 = vec_vsx_ld(0, base);
681 		base += 4;
682 
683 		d = vec_splat(d0_orig, 0);
684 		sum0 += d0 * d;
685 		sum1 += d1 * d;
686 		sum2 += d2 * d;
687 
688 		d = vec_splat(d0_orig, 1);
689 		d0 = vec_sel(d0_orig, d3, vsel1);
690 		sum10 += d0 * d;
691 		sum11 += d1 * d;
692 		sum12 += d2 * d;
693 
694 		d = vec_splat(d0_orig, 2);
695 		d0 = vec_sel(d0_orig, d3, vsel2);
696 		sum20 += d0 * d;
697 		sum21 += d1 * d;
698 		sum22 += d2 * d;
699 
700 		d = vec_splat(d0_orig, 3);
701 		d0 = vec_sel(d0_orig, d3, vsel3);
702 		sum30 += d0 * d;
703 		sum31 += d1 * d;
704 		sum32 += d2 * d;
705 
706 		d0 = d1;
707 		d1 = d2;
708 		d2 = d3;
709 	}
710 
711 	sum0 += vec_perm(sum10, sum11, (vector unsigned char)vperm1);
712 	sum1 += vec_perm(sum11, sum12, (vector unsigned char)vperm1);
713 	sum2 += vec_perm(sum12, sum10, (vector unsigned char)vperm1);
714 
715 	sum0 += vec_perm(sum20, sum21, (vector unsigned char)vperm2);
716 	sum1 += vec_perm(sum21, sum22, (vector unsigned char)vperm2);
717 	sum2 += vec_perm(sum22, sum20, (vector unsigned char)vperm2);
718 
719 	sum0 += vec_perm(sum30, sum31, (vector unsigned char)vperm3);
720 	sum1 += vec_perm(sum31, sum32, (vector unsigned char)vperm3);
721 	sum2 += vec_perm(sum32, sum30, (vector unsigned char)vperm3);
722 
723 	for (; i <= limit; i++) {
724 		vector float d;
725 
726 		d0 = vec_vsx_ld(0, data+i);
727 		d1 = vec_vsx_ld(16, data+i);
728 		d2 = vec_vsx_ld(32, data+i);
729 
730 		d = vec_splat(d0, 0);
731 		sum0 += d0 * d;
732 		sum1 += d1 * d;
733 		sum2 += d2 * d;
734 	}
735 
736 	vec_vsx_st(sum0, 0, autoc);
737 	vec_vsx_st(sum1, 16, autoc);
738 	vec_vsx_st(sum2, 32, autoc);
739 
740 	for (; i < (long)data_len; i++) {
741 		uint32_t coeff;
742 
743 		FLAC__real d = data[i];
744 		for (coeff = 0; coeff < data_len - i; coeff++)
745 			autoc[coeff] += d * data[i+coeff];
746 	}
747 }
748 
749 __attribute__((target("cpu=power9")))
FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_8(const FLAC__real data[],uint32_t data_len,uint32_t lag,FLAC__real autoc[])750 void FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_8(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[])
751 {
752 	long i;
753 	long limit = (long)data_len - 8;
754 	const FLAC__real *base;
755 	vector float sum0 = { 0.0f, 0.0f, 0.0f, 0.0f};
756 	vector float sum1 = { 0.0f, 0.0f, 0.0f, 0.0f};
757 	vector float sum10 = { 0.0f, 0.0f, 0.0f, 0.0f};
758 	vector float sum11 = { 0.0f, 0.0f, 0.0f, 0.0f};
759 	vector float sum20 = { 0.0f, 0.0f, 0.0f, 0.0f};
760 	vector float sum21 = { 0.0f, 0.0f, 0.0f, 0.0f};
761 	vector float sum30 = { 0.0f, 0.0f, 0.0f, 0.0f};
762 	vector float sum31 = { 0.0f, 0.0f, 0.0f, 0.0f};
763 	vector float d0, d1, d2;
764 #if WORDS_BIGENDIAN
765 	vector unsigned int vsel1 = { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF };
766 	vector unsigned int vsel2 = { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF };
767 	vector unsigned int vsel3 = { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF };
768 	vector unsigned int vperm1 = { 0x04050607, 0x08090A0B, 0x0C0D0E0F, 0x10111213 };
769 	vector unsigned int vperm2 = { 0x08090A0B, 0x0C0D0E0F, 0x10111213, 0x14151617 };
770 	vector unsigned int vperm3 = { 0x0C0D0E0F, 0x10111213, 0x14151617, 0x18191A1B };
771 #else
772 	vector unsigned int vsel1 = { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 };
773 	vector unsigned int vsel2 = { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 };
774 	vector unsigned int vsel3 = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 };
775 	vector unsigned int vperm1 = { 0x07060504, 0x0B0A0908, 0x0F0E0D0C, 0x13121110 };
776 	vector unsigned int vperm2 = { 0x0B0A0908, 0x0F0E0D0C, 0x13121110, 0x17161514 };
777 	vector unsigned int vperm3 = { 0x0F0E0D0C, 0x13121110, 0x17161514, 0x1B1A1918 };
778 #endif
779 
780 	(void) lag;
781 	FLAC__ASSERT(lag <= 8);
782 	FLAC__ASSERT(lag <= data_len);
783 
784 	base = data;
785 
786 	d0 = vec_vsx_ld(0, base);
787 	d1 = vec_vsx_ld(16, base);
788 
789 	base += 8;
790 
791 	for (i = 0; i <= (limit-2); i += 4) {
792 		vector float d, d0_orig = d0;
793 
794 		d2 = vec_vsx_ld(0, base);
795 		base += 4;
796 
797 		d = vec_splat(d0_orig, 0);
798 		sum0 += d0 * d;
799 		sum1 += d1 * d;
800 
801 		d = vec_splat(d0_orig, 1);
802 		d0 = vec_sel(d0_orig, d2, vsel1);
803 		sum10 += d0 * d;
804 		sum11 += d1 * d;
805 
806 		d = vec_splat(d0_orig, 2);
807 		d0 = vec_sel(d0_orig, d2, vsel2);
808 		sum20 += d0 * d;
809 		sum21 += d1 * d;
810 
811 		d = vec_splat(d0_orig, 3);
812 		d0 = vec_sel(d0_orig, d2, vsel3);
813 		sum30 += d0 * d;
814 		sum31 += d1 * d;
815 
816 		d0 = d1;
817 		d1 = d2;
818 	}
819 
820 	sum0 += vec_perm(sum10, sum11, (vector unsigned char)vperm1);
821 	sum1 += vec_perm(sum11, sum10, (vector unsigned char)vperm1);
822 
823 	sum0 += vec_perm(sum20, sum21, (vector unsigned char)vperm2);
824 	sum1 += vec_perm(sum21, sum20, (vector unsigned char)vperm2);
825 
826 	sum0 += vec_perm(sum30, sum31, (vector unsigned char)vperm3);
827 	sum1 += vec_perm(sum31, sum30, (vector unsigned char)vperm3);
828 
829 	for (; i <= limit; i++) {
830 		vector float d;
831 
832 		d0 = vec_vsx_ld(0, data+i);
833 		d1 = vec_vsx_ld(16, data+i);
834 
835 		d = vec_splat(d0, 0);
836 		sum0 += d0 * d;
837 		sum1 += d1 * d;
838 	}
839 
840 	vec_vsx_st(sum0, 0, autoc);
841 	vec_vsx_st(sum1, 16, autoc);
842 
843 	for (; i < (long)data_len; i++) {
844 		uint32_t coeff;
845 
846 		FLAC__real d = data[i];
847 		for (coeff = 0; coeff < data_len - i; coeff++)
848 			autoc[coeff] += d * data[i+coeff];
849 	}
850 }
851 
852 __attribute__((target("cpu=power9")))
FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_4(const FLAC__real data[],uint32_t data_len,uint32_t lag,FLAC__real autoc[])853 void FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_4(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[])
854 {
855 	long i;
856 	long limit = (long)data_len - 4;
857 	const FLAC__real *base;
858 	vector float sum0 = { 0.0f, 0.0f, 0.0f, 0.0f};
859 	vector float sum10 = { 0.0f, 0.0f, 0.0f, 0.0f};
860 	vector float sum20 = { 0.0f, 0.0f, 0.0f, 0.0f};
861 	vector float sum30 = { 0.0f, 0.0f, 0.0f, 0.0f};
862 	vector float d0, d1;
863 #if WORDS_BIGENDIAN
864 	vector unsigned int vsel1 = { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF };
865 	vector unsigned int vsel2 = { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF };
866 	vector unsigned int vsel3 = { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF };
867 	vector unsigned int vperm1 = { 0x04050607, 0x08090A0B, 0x0C0D0E0F, 0x10111213 };
868 	vector unsigned int vperm2 = { 0x08090A0B, 0x0C0D0E0F, 0x10111213, 0x14151617 };
869 	vector unsigned int vperm3 = { 0x0C0D0E0F, 0x10111213, 0x14151617, 0x18191A1B };
870 #else
871 	vector unsigned int vsel1 = { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 };
872 	vector unsigned int vsel2 = { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 };
873 	vector unsigned int vsel3 = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 };
874 	vector unsigned int vperm1 = { 0x07060504, 0x0B0A0908, 0x0F0E0D0C, 0x13121110 };
875 	vector unsigned int vperm2 = { 0x0B0A0908, 0x0F0E0D0C, 0x13121110, 0x17161514 };
876 	vector unsigned int vperm3 = { 0x0F0E0D0C, 0x13121110, 0x17161514, 0x1B1A1918 };
877 #endif
878 
879 	(void) lag;
880 	FLAC__ASSERT(lag <= 4);
881 	FLAC__ASSERT(lag <= data_len);
882 
883 	base = data;
884 
885 	d0 = vec_vsx_ld(0, base);
886 
887 	base += 4;
888 
889 	for (i = 0; i <= (limit-1); i += 4) {
890 		vector float d, d0_orig = d0;
891 
892 		d1 = vec_vsx_ld(0, base);
893 		base += 4;
894 
895 		d = vec_splat(d0_orig, 0);
896 		sum0 += d0 * d;
897 
898 		d = vec_splat(d0_orig, 1);
899 		d0 = vec_sel(d0_orig, d1, vsel1);
900 		sum10 += d0 * d;
901 
902 		d = vec_splat(d0_orig, 2);
903 		d0 = vec_sel(d0_orig, d1, vsel2);
904 		sum20 += d0 * d;
905 
906 		d = vec_splat(d0_orig, 3);
907 		d0 = vec_sel(d0_orig, d1, vsel3);
908 		sum30 += d0 * d;
909 
910 		d0 = d1;
911 	}
912 
913 	sum0 += vec_perm(sum10, sum10, (vector unsigned char)vperm1);
914 
915 	sum0 += vec_perm(sum20, sum20, (vector unsigned char)vperm2);
916 
917 	sum0 += vec_perm(sum30, sum30, (vector unsigned char)vperm3);
918 
919 	for (; i <= limit; i++) {
920 		vector float d;
921 
922 		d0 = vec_vsx_ld(0, data+i);
923 
924 		d = vec_splat(d0, 0);
925 		sum0 += d0 * d;
926 	}
927 
928 	vec_vsx_st(sum0, 0, autoc);
929 
930 	for (; i < (long)data_len; i++) {
931 		uint32_t coeff;
932 
933 		FLAC__real d = data[i];
934 		for (coeff = 0; coeff < data_len - i; coeff++)
935 			autoc[coeff] += d * data[i+coeff];
936 	}
937 }
938 #endif /* FLAC__HAS_TARGET_POWER9 */
939 
940 #endif /* FLAC__CPU_PPC64 && FLAC__USE_VSX */
941 #endif /* FLAC__NO_ASM */
942 #endif /* FLAC__INTEGER_ONLY_LIBRARY */
943