1 /* libFLAC - Free Lossless Audio Codec library
2 * Copyright (C) 2000-2009 Josh Coalson
3 * Copyright (C) 2011-2016 Xiph.Org Foundation
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 *
9 * - Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 *
12 * - Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * - Neither the name of the Xiph.org Foundation nor the names of its
17 * contributors may be used to endorse or promote products derived from
18 * this software without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
24 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
25 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
26 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 */
32
33 #ifdef HAVE_CONFIG_H
34 # include <config.h>
35 #endif
36
37 #ifndef FLAC__INTEGER_ONLY_LIBRARY
38 #ifndef FLAC__NO_ASM
39 #if defined(FLAC__CPU_PPC64) && defined(FLAC__USE_VSX)
40
41 #include "private/cpu.h"
42 #include "private/lpc.h"
43 #include "FLAC/assert.h"
44 #include "FLAC/format.h"
45
46 #include <altivec.h>
47
48 #ifdef FLAC__HAS_TARGET_POWER8
49 __attribute__((target("cpu=power8")))
FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_16(const FLAC__real data[],uint32_t data_len,uint32_t lag,FLAC__real autoc[])50 void FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_16(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[])
51 {
52 long i;
53 long limit = (long)data_len - 16;
54 const FLAC__real *base;
55 vector float sum0 = { 0.0f, 0.0f, 0.0f, 0.0f};
56 vector float sum1 = { 0.0f, 0.0f, 0.0f, 0.0f};
57 vector float sum2 = { 0.0f, 0.0f, 0.0f, 0.0f};
58 vector float sum3 = { 0.0f, 0.0f, 0.0f, 0.0f};
59 vector float sum10 = { 0.0f, 0.0f, 0.0f, 0.0f};
60 vector float sum11 = { 0.0f, 0.0f, 0.0f, 0.0f};
61 vector float sum12 = { 0.0f, 0.0f, 0.0f, 0.0f};
62 vector float sum13 = { 0.0f, 0.0f, 0.0f, 0.0f};
63 vector float sum20 = { 0.0f, 0.0f, 0.0f, 0.0f};
64 vector float sum21 = { 0.0f, 0.0f, 0.0f, 0.0f};
65 vector float sum22 = { 0.0f, 0.0f, 0.0f, 0.0f};
66 vector float sum23 = { 0.0f, 0.0f, 0.0f, 0.0f};
67 vector float sum30 = { 0.0f, 0.0f, 0.0f, 0.0f};
68 vector float sum31 = { 0.0f, 0.0f, 0.0f, 0.0f};
69 vector float sum32 = { 0.0f, 0.0f, 0.0f, 0.0f};
70 vector float sum33 = { 0.0f, 0.0f, 0.0f, 0.0f};
71 vector float d0, d1, d2, d3, d4;
72 #if WORDS_BIGENDIAN
73 vector unsigned int vsel1 = { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF };
74 vector unsigned int vsel2 = { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF };
75 vector unsigned int vsel3 = { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF };
76 vector unsigned int vperm1 = { 0x04050607, 0x08090A0B, 0x0C0D0E0F, 0x10111213 };
77 vector unsigned int vperm2 = { 0x08090A0B, 0x0C0D0E0F, 0x10111213, 0x14151617 };
78 vector unsigned int vperm3 = { 0x0C0D0E0F, 0x10111213, 0x14151617, 0x18191A1B };
79 #else
80 vector unsigned int vsel1 = { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 };
81 vector unsigned int vsel2 = { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 };
82 vector unsigned int vsel3 = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 };
83 vector unsigned int vperm1 = { 0x07060504, 0x0B0A0908, 0x0F0E0D0C, 0x13121110 };
84 vector unsigned int vperm2 = { 0x0B0A0908, 0x0F0E0D0C, 0x13121110, 0x17161514 };
85 vector unsigned int vperm3 = { 0x0F0E0D0C, 0x13121110, 0x17161514, 0x1B1A1918 };
86 #endif
87
88 (void) lag;
89 FLAC__ASSERT(lag <= 16);
90 FLAC__ASSERT(lag <= data_len);
91
92 base = data;
93
94 d0 = vec_vsx_ld(0, base);
95 d1 = vec_vsx_ld(16, base);
96 d2 = vec_vsx_ld(32, base);
97 d3 = vec_vsx_ld(48, base);
98
99 base += 16;
100
101 for (i = 0; i <= (limit-4); i += 4) {
102 vector float d, d0_orig = d0;
103
104 d4 = vec_vsx_ld(0, base);
105 base += 4;
106
107 d = vec_splat(d0_orig, 0);
108 sum0 += d0 * d;
109 sum1 += d1 * d;
110 sum2 += d2 * d;
111 sum3 += d3 * d;
112
113 d = vec_splat(d0_orig, 1);
114 d0 = vec_sel(d0_orig, d4, vsel1);
115 sum10 += d0 * d;
116 sum11 += d1 * d;
117 sum12 += d2 * d;
118 sum13 += d3 * d;
119
120 d = vec_splat(d0_orig, 2);
121 d0 = vec_sel(d0_orig, d4, vsel2);
122 sum20 += d0 * d;
123 sum21 += d1 * d;
124 sum22 += d2 * d;
125 sum23 += d3 * d;
126
127 d = vec_splat(d0_orig, 3);
128 d0 = vec_sel(d0_orig, d4, vsel3);
129 sum30 += d0 * d;
130 sum31 += d1 * d;
131 sum32 += d2 * d;
132 sum33 += d3 * d;
133
134 d0 = d1;
135 d1 = d2;
136 d2 = d3;
137 d3 = d4;
138 }
139
140 sum0 += vec_perm(sum10, sum11, (vector unsigned char)vperm1);
141 sum1 += vec_perm(sum11, sum12, (vector unsigned char)vperm1);
142 sum2 += vec_perm(sum12, sum13, (vector unsigned char)vperm1);
143 sum3 += vec_perm(sum13, sum10, (vector unsigned char)vperm1);
144
145 sum0 += vec_perm(sum20, sum21, (vector unsigned char)vperm2);
146 sum1 += vec_perm(sum21, sum22, (vector unsigned char)vperm2);
147 sum2 += vec_perm(sum22, sum23, (vector unsigned char)vperm2);
148 sum3 += vec_perm(sum23, sum20, (vector unsigned char)vperm2);
149
150 sum0 += vec_perm(sum30, sum31, (vector unsigned char)vperm3);
151 sum1 += vec_perm(sum31, sum32, (vector unsigned char)vperm3);
152 sum2 += vec_perm(sum32, sum33, (vector unsigned char)vperm3);
153 sum3 += vec_perm(sum33, sum30, (vector unsigned char)vperm3);
154
155 for (; i <= limit; i++) {
156 vector float d;
157
158 d0 = vec_vsx_ld(0, data+i);
159 d1 = vec_vsx_ld(16, data+i);
160 d2 = vec_vsx_ld(32, data+i);
161 d3 = vec_vsx_ld(48, data+i);
162
163 d = vec_splat(d0, 0);
164 sum0 += d0 * d;
165 sum1 += d1 * d;
166 sum2 += d2 * d;
167 sum3 += d3 * d;
168 }
169
170 vec_vsx_st(sum0, 0, autoc);
171 vec_vsx_st(sum1, 16, autoc);
172 vec_vsx_st(sum2, 32, autoc);
173 vec_vsx_st(sum3, 48, autoc);
174
175 for (; i < (long)data_len; i++) {
176 uint32_t coeff;
177
178 FLAC__real d = data[i];
179 for (coeff = 0; coeff < data_len - i; coeff++)
180 autoc[coeff] += d * data[i+coeff];
181 }
182 }
183
184 __attribute__((target("cpu=power8")))
FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_12(const FLAC__real data[],uint32_t data_len,uint32_t lag,FLAC__real autoc[])185 void FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_12(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[])
186 {
187 long i;
188 long limit = (long)data_len - 12;
189 const FLAC__real *base;
190 vector float sum0 = { 0.0f, 0.0f, 0.0f, 0.0f};
191 vector float sum1 = { 0.0f, 0.0f, 0.0f, 0.0f};
192 vector float sum2 = { 0.0f, 0.0f, 0.0f, 0.0f};
193 vector float sum10 = { 0.0f, 0.0f, 0.0f, 0.0f};
194 vector float sum11 = { 0.0f, 0.0f, 0.0f, 0.0f};
195 vector float sum12 = { 0.0f, 0.0f, 0.0f, 0.0f};
196 vector float sum20 = { 0.0f, 0.0f, 0.0f, 0.0f};
197 vector float sum21 = { 0.0f, 0.0f, 0.0f, 0.0f};
198 vector float sum22 = { 0.0f, 0.0f, 0.0f, 0.0f};
199 vector float sum30 = { 0.0f, 0.0f, 0.0f, 0.0f};
200 vector float sum31 = { 0.0f, 0.0f, 0.0f, 0.0f};
201 vector float sum32 = { 0.0f, 0.0f, 0.0f, 0.0f};
202 vector float d0, d1, d2, d3;
203 #if WORDS_BIGENDIAN
204 vector unsigned int vsel1 = { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF };
205 vector unsigned int vsel2 = { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF };
206 vector unsigned int vsel3 = { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF };
207 vector unsigned int vperm1 = { 0x04050607, 0x08090A0B, 0x0C0D0E0F, 0x10111213 };
208 vector unsigned int vperm2 = { 0x08090A0B, 0x0C0D0E0F, 0x10111213, 0x14151617 };
209 vector unsigned int vperm3 = { 0x0C0D0E0F, 0x10111213, 0x14151617, 0x18191A1B };
210 #else
211 vector unsigned int vsel1 = { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 };
212 vector unsigned int vsel2 = { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 };
213 vector unsigned int vsel3 = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 };
214 vector unsigned int vperm1 = { 0x07060504, 0x0B0A0908, 0x0F0E0D0C, 0x13121110 };
215 vector unsigned int vperm2 = { 0x0B0A0908, 0x0F0E0D0C, 0x13121110, 0x17161514 };
216 vector unsigned int vperm3 = { 0x0F0E0D0C, 0x13121110, 0x17161514, 0x1B1A1918 };
217 #endif
218
219 (void) lag;
220 FLAC__ASSERT(lag <= 12);
221 FLAC__ASSERT(lag <= data_len);
222
223 base = data;
224
225 d0 = vec_vsx_ld(0, base);
226 d1 = vec_vsx_ld(16, base);
227 d2 = vec_vsx_ld(32, base);
228
229 base += 12;
230
231 for (i = 0; i <= (limit-3); i += 4) {
232 vector float d, d0_orig = d0;
233
234 d3 = vec_vsx_ld(0, base);
235 base += 4;
236
237 d = vec_splat(d0_orig, 0);
238 sum0 += d0 * d;
239 sum1 += d1 * d;
240 sum2 += d2 * d;
241
242 d = vec_splat(d0_orig, 1);
243 d0 = vec_sel(d0_orig, d3, vsel1);
244 sum10 += d0 * d;
245 sum11 += d1 * d;
246 sum12 += d2 * d;
247
248 d = vec_splat(d0_orig, 2);
249 d0 = vec_sel(d0_orig, d3, vsel2);
250 sum20 += d0 * d;
251 sum21 += d1 * d;
252 sum22 += d2 * d;
253
254 d = vec_splat(d0_orig, 3);
255 d0 = vec_sel(d0_orig, d3, vsel3);
256 sum30 += d0 * d;
257 sum31 += d1 * d;
258 sum32 += d2 * d;
259
260 d0 = d1;
261 d1 = d2;
262 d2 = d3;
263 }
264
265 sum0 += vec_perm(sum10, sum11, (vector unsigned char)vperm1);
266 sum1 += vec_perm(sum11, sum12, (vector unsigned char)vperm1);
267 sum2 += vec_perm(sum12, sum10, (vector unsigned char)vperm1);
268
269 sum0 += vec_perm(sum20, sum21, (vector unsigned char)vperm2);
270 sum1 += vec_perm(sum21, sum22, (vector unsigned char)vperm2);
271 sum2 += vec_perm(sum22, sum20, (vector unsigned char)vperm2);
272
273 sum0 += vec_perm(sum30, sum31, (vector unsigned char)vperm3);
274 sum1 += vec_perm(sum31, sum32, (vector unsigned char)vperm3);
275 sum2 += vec_perm(sum32, sum30, (vector unsigned char)vperm3);
276
277 for (; i <= limit; i++) {
278 vector float d;
279
280 d0 = vec_vsx_ld(0, data+i);
281 d1 = vec_vsx_ld(16, data+i);
282 d2 = vec_vsx_ld(32, data+i);
283
284 d = vec_splat(d0, 0);
285 sum0 += d0 * d;
286 sum1 += d1 * d;
287 sum2 += d2 * d;
288 }
289
290 vec_vsx_st(sum0, 0, autoc);
291 vec_vsx_st(sum1, 16, autoc);
292 vec_vsx_st(sum2, 32, autoc);
293
294 for (; i < (long)data_len; i++) {
295 uint32_t coeff;
296
297 FLAC__real d = data[i];
298 for (coeff = 0; coeff < data_len - i; coeff++)
299 autoc[coeff] += d * data[i+coeff];
300 }
301 }
302
303 __attribute__((target("cpu=power8")))
FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_8(const FLAC__real data[],uint32_t data_len,uint32_t lag,FLAC__real autoc[])304 void FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_8(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[])
305 {
306 long i;
307 long limit = (long)data_len - 8;
308 const FLAC__real *base;
309 vector float sum0 = { 0.0f, 0.0f, 0.0f, 0.0f};
310 vector float sum1 = { 0.0f, 0.0f, 0.0f, 0.0f};
311 vector float sum10 = { 0.0f, 0.0f, 0.0f, 0.0f};
312 vector float sum11 = { 0.0f, 0.0f, 0.0f, 0.0f};
313 vector float sum20 = { 0.0f, 0.0f, 0.0f, 0.0f};
314 vector float sum21 = { 0.0f, 0.0f, 0.0f, 0.0f};
315 vector float sum30 = { 0.0f, 0.0f, 0.0f, 0.0f};
316 vector float sum31 = { 0.0f, 0.0f, 0.0f, 0.0f};
317 vector float d0, d1, d2;
318 #if WORDS_BIGENDIAN
319 vector unsigned int vsel1 = { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF };
320 vector unsigned int vsel2 = { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF };
321 vector unsigned int vsel3 = { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF };
322 vector unsigned int vperm1 = { 0x04050607, 0x08090A0B, 0x0C0D0E0F, 0x10111213 };
323 vector unsigned int vperm2 = { 0x08090A0B, 0x0C0D0E0F, 0x10111213, 0x14151617 };
324 vector unsigned int vperm3 = { 0x0C0D0E0F, 0x10111213, 0x14151617, 0x18191A1B };
325 #else
326 vector unsigned int vsel1 = { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 };
327 vector unsigned int vsel2 = { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 };
328 vector unsigned int vsel3 = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 };
329 vector unsigned int vperm1 = { 0x07060504, 0x0B0A0908, 0x0F0E0D0C, 0x13121110 };
330 vector unsigned int vperm2 = { 0x0B0A0908, 0x0F0E0D0C, 0x13121110, 0x17161514 };
331 vector unsigned int vperm3 = { 0x0F0E0D0C, 0x13121110, 0x17161514, 0x1B1A1918 };
332 #endif
333
334 (void) lag;
335 FLAC__ASSERT(lag <= 8);
336 FLAC__ASSERT(lag <= data_len);
337
338 base = data;
339
340 d0 = vec_vsx_ld(0, base);
341 d1 = vec_vsx_ld(16, base);
342
343 base += 8;
344
345 for (i = 0; i <= (limit-2); i += 4) {
346 vector float d, d0_orig = d0;
347
348 d2 = vec_vsx_ld(0, base);
349 base += 4;
350
351 d = vec_splat(d0_orig, 0);
352 sum0 += d0 * d;
353 sum1 += d1 * d;
354
355 d = vec_splat(d0_orig, 1);
356 d0 = vec_sel(d0_orig, d2, vsel1);
357 sum10 += d0 * d;
358 sum11 += d1 * d;
359
360 d = vec_splat(d0_orig, 2);
361 d0 = vec_sel(d0_orig, d2, vsel2);
362 sum20 += d0 * d;
363 sum21 += d1 * d;
364
365 d = vec_splat(d0_orig, 3);
366 d0 = vec_sel(d0_orig, d2, vsel3);
367 sum30 += d0 * d;
368 sum31 += d1 * d;
369
370 d0 = d1;
371 d1 = d2;
372 }
373
374 sum0 += vec_perm(sum10, sum11, (vector unsigned char)vperm1);
375 sum1 += vec_perm(sum11, sum10, (vector unsigned char)vperm1);
376
377 sum0 += vec_perm(sum20, sum21, (vector unsigned char)vperm2);
378 sum1 += vec_perm(sum21, sum20, (vector unsigned char)vperm2);
379
380 sum0 += vec_perm(sum30, sum31, (vector unsigned char)vperm3);
381 sum1 += vec_perm(sum31, sum30, (vector unsigned char)vperm3);
382
383 for (; i <= limit; i++) {
384 vector float d;
385
386 d0 = vec_vsx_ld(0, data+i);
387 d1 = vec_vsx_ld(16, data+i);
388
389 d = vec_splat(d0, 0);
390 sum0 += d0 * d;
391 sum1 += d1 * d;
392 }
393
394 vec_vsx_st(sum0, 0, autoc);
395 vec_vsx_st(sum1, 16, autoc);
396
397 for (; i < (long)data_len; i++) {
398 uint32_t coeff;
399
400 FLAC__real d = data[i];
401 for (coeff = 0; coeff < data_len - i; coeff++)
402 autoc[coeff] += d * data[i+coeff];
403 }
404 }
405
406 __attribute__((target("cpu=power8")))
FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_4(const FLAC__real data[],uint32_t data_len,uint32_t lag,FLAC__real autoc[])407 void FLAC__lpc_compute_autocorrelation_intrin_power8_vsx_lag_4(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[])
408 {
409 long i;
410 long limit = (long)data_len - 4;
411 const FLAC__real *base;
412 vector float sum0 = { 0.0f, 0.0f, 0.0f, 0.0f};
413 vector float sum10 = { 0.0f, 0.0f, 0.0f, 0.0f};
414 vector float sum20 = { 0.0f, 0.0f, 0.0f, 0.0f};
415 vector float sum30 = { 0.0f, 0.0f, 0.0f, 0.0f};
416 vector float d0, d1;
417 #if WORDS_BIGENDIAN
418 vector unsigned int vsel1 = { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF };
419 vector unsigned int vsel2 = { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF };
420 vector unsigned int vsel3 = { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF };
421 vector unsigned int vperm1 = { 0x04050607, 0x08090A0B, 0x0C0D0E0F, 0x10111213 };
422 vector unsigned int vperm2 = { 0x08090A0B, 0x0C0D0E0F, 0x10111213, 0x14151617 };
423 vector unsigned int vperm3 = { 0x0C0D0E0F, 0x10111213, 0x14151617, 0x18191A1B };
424 #else
425 vector unsigned int vsel1 = { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 };
426 vector unsigned int vsel2 = { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 };
427 vector unsigned int vsel3 = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 };
428 vector unsigned int vperm1 = { 0x07060504, 0x0B0A0908, 0x0F0E0D0C, 0x13121110 };
429 vector unsigned int vperm2 = { 0x0B0A0908, 0x0F0E0D0C, 0x13121110, 0x17161514 };
430 vector unsigned int vperm3 = { 0x0F0E0D0C, 0x13121110, 0x17161514, 0x1B1A1918 };
431 #endif
432
433 (void) lag;
434 FLAC__ASSERT(lag <= 4);
435 FLAC__ASSERT(lag <= data_len);
436
437 base = data;
438
439 d0 = vec_vsx_ld(0, base);
440
441 base += 4;
442
443 for (i = 0; i <= (limit-1); i += 4) {
444 vector float d, d0_orig = d0;
445
446 d1 = vec_vsx_ld(0, base);
447 base += 4;
448
449 d = vec_splat(d0_orig, 0);
450 sum0 += d0 * d;
451
452 d = vec_splat(d0_orig, 1);
453 d0 = vec_sel(d0_orig, d1, vsel1);
454 sum10 += d0 * d;
455
456 d = vec_splat(d0_orig, 2);
457 d0 = vec_sel(d0_orig, d1, vsel2);
458 sum20 += d0 * d;
459
460 d = vec_splat(d0_orig, 3);
461 d0 = vec_sel(d0_orig, d1, vsel3);
462 sum30 += d0 * d;
463
464 d0 = d1;
465 }
466
467 sum0 += vec_perm(sum10, sum10, (vector unsigned char)vperm1);
468
469 sum0 += vec_perm(sum20, sum20, (vector unsigned char)vperm2);
470
471 sum0 += vec_perm(sum30, sum30, (vector unsigned char)vperm3);
472
473 for (; i <= limit; i++) {
474 vector float d;
475
476 d0 = vec_vsx_ld(0, data+i);
477
478 d = vec_splat(d0, 0);
479 sum0 += d0 * d;
480 }
481
482 vec_vsx_st(sum0, 0, autoc);
483
484 for (; i < (long)data_len; i++) {
485 uint32_t coeff;
486
487 FLAC__real d = data[i];
488 for (coeff = 0; coeff < data_len - i; coeff++)
489 autoc[coeff] += d * data[i+coeff];
490 }
491 }
492 #endif /* FLAC__HAS_TARGET_POWER8 */
493
494 #ifdef FLAC__HAS_TARGET_POWER9
495 __attribute__((target("cpu=power9")))
FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_16(const FLAC__real data[],uint32_t data_len,uint32_t lag,FLAC__real autoc[])496 void FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_16(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[])
497 {
498 long i;
499 long limit = (long)data_len - 16;
500 const FLAC__real *base;
501 vector float sum0 = { 0.0f, 0.0f, 0.0f, 0.0f};
502 vector float sum1 = { 0.0f, 0.0f, 0.0f, 0.0f};
503 vector float sum2 = { 0.0f, 0.0f, 0.0f, 0.0f};
504 vector float sum3 = { 0.0f, 0.0f, 0.0f, 0.0f};
505 vector float sum10 = { 0.0f, 0.0f, 0.0f, 0.0f};
506 vector float sum11 = { 0.0f, 0.0f, 0.0f, 0.0f};
507 vector float sum12 = { 0.0f, 0.0f, 0.0f, 0.0f};
508 vector float sum13 = { 0.0f, 0.0f, 0.0f, 0.0f};
509 vector float sum20 = { 0.0f, 0.0f, 0.0f, 0.0f};
510 vector float sum21 = { 0.0f, 0.0f, 0.0f, 0.0f};
511 vector float sum22 = { 0.0f, 0.0f, 0.0f, 0.0f};
512 vector float sum23 = { 0.0f, 0.0f, 0.0f, 0.0f};
513 vector float sum30 = { 0.0f, 0.0f, 0.0f, 0.0f};
514 vector float sum31 = { 0.0f, 0.0f, 0.0f, 0.0f};
515 vector float sum32 = { 0.0f, 0.0f, 0.0f, 0.0f};
516 vector float sum33 = { 0.0f, 0.0f, 0.0f, 0.0f};
517 vector float d0, d1, d2, d3, d4;
518 #if WORDS_BIGENDIAN
519 vector unsigned int vsel1 = { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF };
520 vector unsigned int vsel2 = { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF };
521 vector unsigned int vsel3 = { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF };
522 vector unsigned int vperm1 = { 0x04050607, 0x08090A0B, 0x0C0D0E0F, 0x10111213 };
523 vector unsigned int vperm2 = { 0x08090A0B, 0x0C0D0E0F, 0x10111213, 0x14151617 };
524 vector unsigned int vperm3 = { 0x0C0D0E0F, 0x10111213, 0x14151617, 0x18191A1B };
525 #else
526 vector unsigned int vsel1 = { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 };
527 vector unsigned int vsel2 = { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 };
528 vector unsigned int vsel3 = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 };
529 vector unsigned int vperm1 = { 0x07060504, 0x0B0A0908, 0x0F0E0D0C, 0x13121110 };
530 vector unsigned int vperm2 = { 0x0B0A0908, 0x0F0E0D0C, 0x13121110, 0x17161514 };
531 vector unsigned int vperm3 = { 0x0F0E0D0C, 0x13121110, 0x17161514, 0x1B1A1918 };
532 #endif
533
534 (void) lag;
535 FLAC__ASSERT(lag <= 16);
536 FLAC__ASSERT(lag <= data_len);
537
538 base = data;
539
540 d0 = vec_vsx_ld(0, base);
541 d1 = vec_vsx_ld(16, base);
542 d2 = vec_vsx_ld(32, base);
543 d3 = vec_vsx_ld(48, base);
544
545 base += 16;
546
547 for (i = 0; i <= (limit-4); i += 4) {
548 vector float d, d0_orig = d0;
549
550 d4 = vec_vsx_ld(0, base);
551 base += 4;
552
553 d = vec_splat(d0_orig, 0);
554 sum0 += d0 * d;
555 sum1 += d1 * d;
556 sum2 += d2 * d;
557 sum3 += d3 * d;
558
559 d = vec_splat(d0_orig, 1);
560 d0 = vec_sel(d0_orig, d4, vsel1);
561 sum10 += d0 * d;
562 sum11 += d1 * d;
563 sum12 += d2 * d;
564 sum13 += d3 * d;
565
566 d = vec_splat(d0_orig, 2);
567 d0 = vec_sel(d0_orig, d4, vsel2);
568 sum20 += d0 * d;
569 sum21 += d1 * d;
570 sum22 += d2 * d;
571 sum23 += d3 * d;
572
573 d = vec_splat(d0_orig, 3);
574 d0 = vec_sel(d0_orig, d4, vsel3);
575 sum30 += d0 * d;
576 sum31 += d1 * d;
577 sum32 += d2 * d;
578 sum33 += d3 * d;
579
580 d0 = d1;
581 d1 = d2;
582 d2 = d3;
583 d3 = d4;
584 }
585
586 sum0 += vec_perm(sum10, sum11, (vector unsigned char)vperm1);
587 sum1 += vec_perm(sum11, sum12, (vector unsigned char)vperm1);
588 sum2 += vec_perm(sum12, sum13, (vector unsigned char)vperm1);
589 sum3 += vec_perm(sum13, sum10, (vector unsigned char)vperm1);
590
591 sum0 += vec_perm(sum20, sum21, (vector unsigned char)vperm2);
592 sum1 += vec_perm(sum21, sum22, (vector unsigned char)vperm2);
593 sum2 += vec_perm(sum22, sum23, (vector unsigned char)vperm2);
594 sum3 += vec_perm(sum23, sum20, (vector unsigned char)vperm2);
595
596 sum0 += vec_perm(sum30, sum31, (vector unsigned char)vperm3);
597 sum1 += vec_perm(sum31, sum32, (vector unsigned char)vperm3);
598 sum2 += vec_perm(sum32, sum33, (vector unsigned char)vperm3);
599 sum3 += vec_perm(sum33, sum30, (vector unsigned char)vperm3);
600
601 for (; i <= limit; i++) {
602 vector float d;
603
604 d0 = vec_vsx_ld(0, data+i);
605 d1 = vec_vsx_ld(16, data+i);
606 d2 = vec_vsx_ld(32, data+i);
607 d3 = vec_vsx_ld(48, data+i);
608
609 d = vec_splat(d0, 0);
610 sum0 += d0 * d;
611 sum1 += d1 * d;
612 sum2 += d2 * d;
613 sum3 += d3 * d;
614 }
615
616 vec_vsx_st(sum0, 0, autoc);
617 vec_vsx_st(sum1, 16, autoc);
618 vec_vsx_st(sum2, 32, autoc);
619 vec_vsx_st(sum3, 48, autoc);
620
621 for (; i < (long)data_len; i++) {
622 uint32_t coeff;
623
624 FLAC__real d = data[i];
625 for (coeff = 0; coeff < data_len - i; coeff++)
626 autoc[coeff] += d * data[i+coeff];
627 }
628 }
629
630 __attribute__((target("cpu=power9")))
FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_12(const FLAC__real data[],uint32_t data_len,uint32_t lag,FLAC__real autoc[])631 void FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_12(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[])
632 {
633 long i;
634 long limit = (long)data_len - 12;
635 const FLAC__real *base;
636 vector float sum0 = { 0.0f, 0.0f, 0.0f, 0.0f};
637 vector float sum1 = { 0.0f, 0.0f, 0.0f, 0.0f};
638 vector float sum2 = { 0.0f, 0.0f, 0.0f, 0.0f};
639 vector float sum10 = { 0.0f, 0.0f, 0.0f, 0.0f};
640 vector float sum11 = { 0.0f, 0.0f, 0.0f, 0.0f};
641 vector float sum12 = { 0.0f, 0.0f, 0.0f, 0.0f};
642 vector float sum20 = { 0.0f, 0.0f, 0.0f, 0.0f};
643 vector float sum21 = { 0.0f, 0.0f, 0.0f, 0.0f};
644 vector float sum22 = { 0.0f, 0.0f, 0.0f, 0.0f};
645 vector float sum30 = { 0.0f, 0.0f, 0.0f, 0.0f};
646 vector float sum31 = { 0.0f, 0.0f, 0.0f, 0.0f};
647 vector float sum32 = { 0.0f, 0.0f, 0.0f, 0.0f};
648 vector float d0, d1, d2, d3;
649 #if WORDS_BIGENDIAN
650 vector unsigned int vsel1 = { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF };
651 vector unsigned int vsel2 = { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF };
652 vector unsigned int vsel3 = { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF };
653 vector unsigned int vperm1 = { 0x04050607, 0x08090A0B, 0x0C0D0E0F, 0x10111213 };
654 vector unsigned int vperm2 = { 0x08090A0B, 0x0C0D0E0F, 0x10111213, 0x14151617 };
655 vector unsigned int vperm3 = { 0x0C0D0E0F, 0x10111213, 0x14151617, 0x18191A1B };
656 #else
657 vector unsigned int vsel1 = { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 };
658 vector unsigned int vsel2 = { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 };
659 vector unsigned int vsel3 = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 };
660 vector unsigned int vperm1 = { 0x07060504, 0x0B0A0908, 0x0F0E0D0C, 0x13121110 };
661 vector unsigned int vperm2 = { 0x0B0A0908, 0x0F0E0D0C, 0x13121110, 0x17161514 };
662 vector unsigned int vperm3 = { 0x0F0E0D0C, 0x13121110, 0x17161514, 0x1B1A1918 };
663 #endif
664
665 (void) lag;
666 FLAC__ASSERT(lag <= 12);
667 FLAC__ASSERT(lag <= data_len);
668
669 base = data;
670
671 d0 = vec_vsx_ld(0, base);
672 d1 = vec_vsx_ld(16, base);
673 d2 = vec_vsx_ld(32, base);
674
675 base += 12;
676
677 for (i = 0; i <= (limit-3); i += 4) {
678 vector float d, d0_orig = d0;
679
680 d3 = vec_vsx_ld(0, base);
681 base += 4;
682
683 d = vec_splat(d0_orig, 0);
684 sum0 += d0 * d;
685 sum1 += d1 * d;
686 sum2 += d2 * d;
687
688 d = vec_splat(d0_orig, 1);
689 d0 = vec_sel(d0_orig, d3, vsel1);
690 sum10 += d0 * d;
691 sum11 += d1 * d;
692 sum12 += d2 * d;
693
694 d = vec_splat(d0_orig, 2);
695 d0 = vec_sel(d0_orig, d3, vsel2);
696 sum20 += d0 * d;
697 sum21 += d1 * d;
698 sum22 += d2 * d;
699
700 d = vec_splat(d0_orig, 3);
701 d0 = vec_sel(d0_orig, d3, vsel3);
702 sum30 += d0 * d;
703 sum31 += d1 * d;
704 sum32 += d2 * d;
705
706 d0 = d1;
707 d1 = d2;
708 d2 = d3;
709 }
710
711 sum0 += vec_perm(sum10, sum11, (vector unsigned char)vperm1);
712 sum1 += vec_perm(sum11, sum12, (vector unsigned char)vperm1);
713 sum2 += vec_perm(sum12, sum10, (vector unsigned char)vperm1);
714
715 sum0 += vec_perm(sum20, sum21, (vector unsigned char)vperm2);
716 sum1 += vec_perm(sum21, sum22, (vector unsigned char)vperm2);
717 sum2 += vec_perm(sum22, sum20, (vector unsigned char)vperm2);
718
719 sum0 += vec_perm(sum30, sum31, (vector unsigned char)vperm3);
720 sum1 += vec_perm(sum31, sum32, (vector unsigned char)vperm3);
721 sum2 += vec_perm(sum32, sum30, (vector unsigned char)vperm3);
722
723 for (; i <= limit; i++) {
724 vector float d;
725
726 d0 = vec_vsx_ld(0, data+i);
727 d1 = vec_vsx_ld(16, data+i);
728 d2 = vec_vsx_ld(32, data+i);
729
730 d = vec_splat(d0, 0);
731 sum0 += d0 * d;
732 sum1 += d1 * d;
733 sum2 += d2 * d;
734 }
735
736 vec_vsx_st(sum0, 0, autoc);
737 vec_vsx_st(sum1, 16, autoc);
738 vec_vsx_st(sum2, 32, autoc);
739
740 for (; i < (long)data_len; i++) {
741 uint32_t coeff;
742
743 FLAC__real d = data[i];
744 for (coeff = 0; coeff < data_len - i; coeff++)
745 autoc[coeff] += d * data[i+coeff];
746 }
747 }
748
749 __attribute__((target("cpu=power9")))
FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_8(const FLAC__real data[],uint32_t data_len,uint32_t lag,FLAC__real autoc[])750 void FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_8(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[])
751 {
752 long i;
753 long limit = (long)data_len - 8;
754 const FLAC__real *base;
755 vector float sum0 = { 0.0f, 0.0f, 0.0f, 0.0f};
756 vector float sum1 = { 0.0f, 0.0f, 0.0f, 0.0f};
757 vector float sum10 = { 0.0f, 0.0f, 0.0f, 0.0f};
758 vector float sum11 = { 0.0f, 0.0f, 0.0f, 0.0f};
759 vector float sum20 = { 0.0f, 0.0f, 0.0f, 0.0f};
760 vector float sum21 = { 0.0f, 0.0f, 0.0f, 0.0f};
761 vector float sum30 = { 0.0f, 0.0f, 0.0f, 0.0f};
762 vector float sum31 = { 0.0f, 0.0f, 0.0f, 0.0f};
763 vector float d0, d1, d2;
764 #if WORDS_BIGENDIAN
765 vector unsigned int vsel1 = { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF };
766 vector unsigned int vsel2 = { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF };
767 vector unsigned int vsel3 = { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF };
768 vector unsigned int vperm1 = { 0x04050607, 0x08090A0B, 0x0C0D0E0F, 0x10111213 };
769 vector unsigned int vperm2 = { 0x08090A0B, 0x0C0D0E0F, 0x10111213, 0x14151617 };
770 vector unsigned int vperm3 = { 0x0C0D0E0F, 0x10111213, 0x14151617, 0x18191A1B };
771 #else
772 vector unsigned int vsel1 = { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 };
773 vector unsigned int vsel2 = { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 };
774 vector unsigned int vsel3 = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 };
775 vector unsigned int vperm1 = { 0x07060504, 0x0B0A0908, 0x0F0E0D0C, 0x13121110 };
776 vector unsigned int vperm2 = { 0x0B0A0908, 0x0F0E0D0C, 0x13121110, 0x17161514 };
777 vector unsigned int vperm3 = { 0x0F0E0D0C, 0x13121110, 0x17161514, 0x1B1A1918 };
778 #endif
779
780 (void) lag;
781 FLAC__ASSERT(lag <= 8);
782 FLAC__ASSERT(lag <= data_len);
783
784 base = data;
785
786 d0 = vec_vsx_ld(0, base);
787 d1 = vec_vsx_ld(16, base);
788
789 base += 8;
790
791 for (i = 0; i <= (limit-2); i += 4) {
792 vector float d, d0_orig = d0;
793
794 d2 = vec_vsx_ld(0, base);
795 base += 4;
796
797 d = vec_splat(d0_orig, 0);
798 sum0 += d0 * d;
799 sum1 += d1 * d;
800
801 d = vec_splat(d0_orig, 1);
802 d0 = vec_sel(d0_orig, d2, vsel1);
803 sum10 += d0 * d;
804 sum11 += d1 * d;
805
806 d = vec_splat(d0_orig, 2);
807 d0 = vec_sel(d0_orig, d2, vsel2);
808 sum20 += d0 * d;
809 sum21 += d1 * d;
810
811 d = vec_splat(d0_orig, 3);
812 d0 = vec_sel(d0_orig, d2, vsel3);
813 sum30 += d0 * d;
814 sum31 += d1 * d;
815
816 d0 = d1;
817 d1 = d2;
818 }
819
820 sum0 += vec_perm(sum10, sum11, (vector unsigned char)vperm1);
821 sum1 += vec_perm(sum11, sum10, (vector unsigned char)vperm1);
822
823 sum0 += vec_perm(sum20, sum21, (vector unsigned char)vperm2);
824 sum1 += vec_perm(sum21, sum20, (vector unsigned char)vperm2);
825
826 sum0 += vec_perm(sum30, sum31, (vector unsigned char)vperm3);
827 sum1 += vec_perm(sum31, sum30, (vector unsigned char)vperm3);
828
829 for (; i <= limit; i++) {
830 vector float d;
831
832 d0 = vec_vsx_ld(0, data+i);
833 d1 = vec_vsx_ld(16, data+i);
834
835 d = vec_splat(d0, 0);
836 sum0 += d0 * d;
837 sum1 += d1 * d;
838 }
839
840 vec_vsx_st(sum0, 0, autoc);
841 vec_vsx_st(sum1, 16, autoc);
842
843 for (; i < (long)data_len; i++) {
844 uint32_t coeff;
845
846 FLAC__real d = data[i];
847 for (coeff = 0; coeff < data_len - i; coeff++)
848 autoc[coeff] += d * data[i+coeff];
849 }
850 }
851
852 __attribute__((target("cpu=power9")))
FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_4(const FLAC__real data[],uint32_t data_len,uint32_t lag,FLAC__real autoc[])853 void FLAC__lpc_compute_autocorrelation_intrin_power9_vsx_lag_4(const FLAC__real data[], uint32_t data_len, uint32_t lag, FLAC__real autoc[])
854 {
855 long i;
856 long limit = (long)data_len - 4;
857 const FLAC__real *base;
858 vector float sum0 = { 0.0f, 0.0f, 0.0f, 0.0f};
859 vector float sum10 = { 0.0f, 0.0f, 0.0f, 0.0f};
860 vector float sum20 = { 0.0f, 0.0f, 0.0f, 0.0f};
861 vector float sum30 = { 0.0f, 0.0f, 0.0f, 0.0f};
862 vector float d0, d1;
863 #if WORDS_BIGENDIAN
864 vector unsigned int vsel1 = { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF };
865 vector unsigned int vsel2 = { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF };
866 vector unsigned int vsel3 = { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF };
867 vector unsigned int vperm1 = { 0x04050607, 0x08090A0B, 0x0C0D0E0F, 0x10111213 };
868 vector unsigned int vperm2 = { 0x08090A0B, 0x0C0D0E0F, 0x10111213, 0x14151617 };
869 vector unsigned int vperm3 = { 0x0C0D0E0F, 0x10111213, 0x14151617, 0x18191A1B };
870 #else
871 vector unsigned int vsel1 = { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 };
872 vector unsigned int vsel2 = { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 };
873 vector unsigned int vsel3 = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 };
874 vector unsigned int vperm1 = { 0x07060504, 0x0B0A0908, 0x0F0E0D0C, 0x13121110 };
875 vector unsigned int vperm2 = { 0x0B0A0908, 0x0F0E0D0C, 0x13121110, 0x17161514 };
876 vector unsigned int vperm3 = { 0x0F0E0D0C, 0x13121110, 0x17161514, 0x1B1A1918 };
877 #endif
878
879 (void) lag;
880 FLAC__ASSERT(lag <= 4);
881 FLAC__ASSERT(lag <= data_len);
882
883 base = data;
884
885 d0 = vec_vsx_ld(0, base);
886
887 base += 4;
888
889 for (i = 0; i <= (limit-1); i += 4) {
890 vector float d, d0_orig = d0;
891
892 d1 = vec_vsx_ld(0, base);
893 base += 4;
894
895 d = vec_splat(d0_orig, 0);
896 sum0 += d0 * d;
897
898 d = vec_splat(d0_orig, 1);
899 d0 = vec_sel(d0_orig, d1, vsel1);
900 sum10 += d0 * d;
901
902 d = vec_splat(d0_orig, 2);
903 d0 = vec_sel(d0_orig, d1, vsel2);
904 sum20 += d0 * d;
905
906 d = vec_splat(d0_orig, 3);
907 d0 = vec_sel(d0_orig, d1, vsel3);
908 sum30 += d0 * d;
909
910 d0 = d1;
911 }
912
913 sum0 += vec_perm(sum10, sum10, (vector unsigned char)vperm1);
914
915 sum0 += vec_perm(sum20, sum20, (vector unsigned char)vperm2);
916
917 sum0 += vec_perm(sum30, sum30, (vector unsigned char)vperm3);
918
919 for (; i <= limit; i++) {
920 vector float d;
921
922 d0 = vec_vsx_ld(0, data+i);
923
924 d = vec_splat(d0, 0);
925 sum0 += d0 * d;
926 }
927
928 vec_vsx_st(sum0, 0, autoc);
929
930 for (; i < (long)data_len; i++) {
931 uint32_t coeff;
932
933 FLAC__real d = data[i];
934 for (coeff = 0; coeff < data_len - i; coeff++)
935 autoc[coeff] += d * data[i+coeff];
936 }
937 }
938 #endif /* FLAC__HAS_TARGET_POWER9 */
939
940 #endif /* FLAC__CPU_PPC64 && FLAC__USE_VSX */
941 #endif /* FLAC__NO_ASM */
942 #endif /* FLAC__INTEGER_ONLY_LIBRARY */
943