/*
 * Copyright (C) 2012 Intel Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 * notice, this list of conditions and the following disclaimer in the
 * documentation and/or other materials provided with the distribution.
 * 3. Neither the name of Apple Computer, Inc. ("Apple") nor the names of
 * its contributors may be used to endorse or promote products derived
 * from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
28
29 #include "config.h"
30
31 #if ENABLE(WEB_AUDIO)
32
33 #include "platform/audio/DirectConvolver.h"
34
35 #if OS(MACOSX)
36 #include <Accelerate/Accelerate.h>
37 #endif
38
39 #include "platform/audio/VectorMath.h"
40 #include "wtf/CPU.h"
41
42 #if (CPU(X86) || CPU(X86_64)) && !(OS(MACOSX) || USE(WEBAUDIO_IPP))
43 #include <emmintrin.h>
44 #endif
45
46 namespace blink {
47
48 using namespace VectorMath;
49
DirectConvolver(size_t inputBlockSize)50 DirectConvolver::DirectConvolver(size_t inputBlockSize)
51 : m_inputBlockSize(inputBlockSize)
52 #if USE(WEBAUDIO_IPP)
53 , m_overlayBuffer(inputBlockSize)
54 #endif // USE(WEBAUDIO_IPP)
55 , m_buffer(inputBlockSize * 2)
56 {
57 }
58
process(AudioFloatArray * convolutionKernel,const float * sourceP,float * destP,size_t framesToProcess)59 void DirectConvolver::process(AudioFloatArray* convolutionKernel, const float* sourceP, float* destP, size_t framesToProcess)
60 {
61 ASSERT(framesToProcess == m_inputBlockSize);
62 if (framesToProcess != m_inputBlockSize)
63 return;
64
65 // Only support kernelSize <= m_inputBlockSize
66 size_t kernelSize = convolutionKernel->size();
67 ASSERT(kernelSize <= m_inputBlockSize);
68 if (kernelSize > m_inputBlockSize)
69 return;
70
71 float* kernelP = convolutionKernel->data();
72
73 // Sanity check
74 bool isCopyGood = kernelP && sourceP && destP && m_buffer.data();
75 ASSERT(isCopyGood);
76 if (!isCopyGood)
77 return;
78
79 #if USE(WEBAUDIO_IPP)
80 float* outputBuffer = m_buffer.data();
81 float* overlayBuffer = m_overlayBuffer.data();
82 bool isCopyGood2 = overlayBuffer && m_overlayBuffer.size() >= kernelSize && m_buffer.size() == m_inputBlockSize * 2;
83 ASSERT(isCopyGood2);
84 if (!isCopyGood2)
85 return;
86
87 ippsConv_32f(static_cast<const Ipp32f*>(sourceP), framesToProcess, static_cast<Ipp32f*>(kernelP), kernelSize, static_cast<Ipp32f*>(outputBuffer));
88
89 vadd(outputBuffer, 1, overlayBuffer, 1, destP, 1, framesToProcess);
90 memcpy(overlayBuffer, outputBuffer + m_inputBlockSize, sizeof(float) * kernelSize);
91 #else
92 float* inputP = m_buffer.data() + m_inputBlockSize;
93
94 // Copy samples to 2nd half of input buffer.
95 memcpy(inputP, sourceP, sizeof(float) * framesToProcess);
96
97 #if OS(MACOSX)
98 #if CPU(X86)
99 conv(inputP - kernelSize + 1, 1, kernelP + kernelSize - 1, -1, destP, 1, framesToProcess, kernelSize);
100 #else
101 vDSP_conv(inputP - kernelSize + 1, 1, kernelP + kernelSize - 1, -1, destP, 1, framesToProcess, kernelSize);
102 #endif // CPU(X86)
103 #else
104 size_t i = 0;
105 #if CPU(X86) || CPU(X86_64)
106 // Convolution using SSE2. Currently only do this if both |kernelSize| and |framesToProcess|
107 // are multiples of 4. If not, use the straightforward loop below.
108
109 if ((kernelSize % 4 == 0) && (framesToProcess % 4 == 0)) {
110 // AudioFloatArray's are always aligned on at least a 16-byte boundary.
111 AudioFloatArray kernelBuffer(4 * kernelSize);
112 __m128* kernelReversed = reinterpret_cast<__m128*>(kernelBuffer.data());
113
114 // Reverse the kernel and repeat each value across a vector
115 for (i = 0; i < kernelSize; ++i) {
116 kernelReversed[i] = _mm_set1_ps(kernelP[kernelSize - i - 1]);
117 }
118
119 float* inputStartP = inputP - kernelSize + 1;
120
121 // Do convolution with 4 inputs at a time.
122 for (i = 0; i < framesToProcess; i += 4) {
123 __m128 convolutionSum;
124
125 convolutionSum = _mm_setzero_ps();
126
127 // |kernelSize| is a multiple of 4 so we can unroll the loop by 4, manually.
128 for (size_t k = 0; k < kernelSize; k += 4) {
129 size_t dataOffset = i + k;
130
131 for (size_t m = 0; m < 4; ++m) {
132 __m128 sourceBlock;
133 __m128 product;
134
135 sourceBlock = _mm_loadu_ps(inputStartP + dataOffset + m);
136 product = _mm_mul_ps(kernelReversed[k + m], sourceBlock);
137 convolutionSum = _mm_add_ps(convolutionSum, product);
138 }
139 }
140 _mm_storeu_ps(destP + i, convolutionSum);
141 }
142 } else {
143 #endif
144
145 // FIXME: The macro can be further optimized to avoid pipeline stalls. One possibility is to maintain 4 separate sums and change the macro to CONVOLVE_FOUR_SAMPLES.
146 #define CONVOLVE_ONE_SAMPLE \
147 do { \
148 sum += inputP[i - j] * kernelP[j]; \
149 j++; \
150 } while (0)
151
152 while (i < framesToProcess) {
153 size_t j = 0;
154 float sum = 0;
155
156 // FIXME: SSE optimization may be applied here.
157 if (kernelSize == 32) {
158 CONVOLVE_ONE_SAMPLE; // 1
159 CONVOLVE_ONE_SAMPLE; // 2
160 CONVOLVE_ONE_SAMPLE; // 3
161 CONVOLVE_ONE_SAMPLE; // 4
162 CONVOLVE_ONE_SAMPLE; // 5
163 CONVOLVE_ONE_SAMPLE; // 6
164 CONVOLVE_ONE_SAMPLE; // 7
165 CONVOLVE_ONE_SAMPLE; // 8
166 CONVOLVE_ONE_SAMPLE; // 9
167 CONVOLVE_ONE_SAMPLE; // 10
168
169 CONVOLVE_ONE_SAMPLE; // 11
170 CONVOLVE_ONE_SAMPLE; // 12
171 CONVOLVE_ONE_SAMPLE; // 13
172 CONVOLVE_ONE_SAMPLE; // 14
173 CONVOLVE_ONE_SAMPLE; // 15
174 CONVOLVE_ONE_SAMPLE; // 16
175 CONVOLVE_ONE_SAMPLE; // 17
176 CONVOLVE_ONE_SAMPLE; // 18
177 CONVOLVE_ONE_SAMPLE; // 19
178 CONVOLVE_ONE_SAMPLE; // 20
179
180 CONVOLVE_ONE_SAMPLE; // 21
181 CONVOLVE_ONE_SAMPLE; // 22
182 CONVOLVE_ONE_SAMPLE; // 23
183 CONVOLVE_ONE_SAMPLE; // 24
184 CONVOLVE_ONE_SAMPLE; // 25
185 CONVOLVE_ONE_SAMPLE; // 26
186 CONVOLVE_ONE_SAMPLE; // 27
187 CONVOLVE_ONE_SAMPLE; // 28
188 CONVOLVE_ONE_SAMPLE; // 29
189 CONVOLVE_ONE_SAMPLE; // 30
190
191 CONVOLVE_ONE_SAMPLE; // 31
192 CONVOLVE_ONE_SAMPLE; // 32
193
194 } else if (kernelSize == 64) {
195 CONVOLVE_ONE_SAMPLE; // 1
196 CONVOLVE_ONE_SAMPLE; // 2
197 CONVOLVE_ONE_SAMPLE; // 3
198 CONVOLVE_ONE_SAMPLE; // 4
199 CONVOLVE_ONE_SAMPLE; // 5
200 CONVOLVE_ONE_SAMPLE; // 6
201 CONVOLVE_ONE_SAMPLE; // 7
202 CONVOLVE_ONE_SAMPLE; // 8
203 CONVOLVE_ONE_SAMPLE; // 9
204 CONVOLVE_ONE_SAMPLE; // 10
205
206 CONVOLVE_ONE_SAMPLE; // 11
207 CONVOLVE_ONE_SAMPLE; // 12
208 CONVOLVE_ONE_SAMPLE; // 13
209 CONVOLVE_ONE_SAMPLE; // 14
210 CONVOLVE_ONE_SAMPLE; // 15
211 CONVOLVE_ONE_SAMPLE; // 16
212 CONVOLVE_ONE_SAMPLE; // 17
213 CONVOLVE_ONE_SAMPLE; // 18
214 CONVOLVE_ONE_SAMPLE; // 19
215 CONVOLVE_ONE_SAMPLE; // 20
216
217 CONVOLVE_ONE_SAMPLE; // 21
218 CONVOLVE_ONE_SAMPLE; // 22
219 CONVOLVE_ONE_SAMPLE; // 23
220 CONVOLVE_ONE_SAMPLE; // 24
221 CONVOLVE_ONE_SAMPLE; // 25
222 CONVOLVE_ONE_SAMPLE; // 26
223 CONVOLVE_ONE_SAMPLE; // 27
224 CONVOLVE_ONE_SAMPLE; // 28
225 CONVOLVE_ONE_SAMPLE; // 29
226 CONVOLVE_ONE_SAMPLE; // 30
227
228 CONVOLVE_ONE_SAMPLE; // 31
229 CONVOLVE_ONE_SAMPLE; // 32
230 CONVOLVE_ONE_SAMPLE; // 33
231 CONVOLVE_ONE_SAMPLE; // 34
232 CONVOLVE_ONE_SAMPLE; // 35
233 CONVOLVE_ONE_SAMPLE; // 36
234 CONVOLVE_ONE_SAMPLE; // 37
235 CONVOLVE_ONE_SAMPLE; // 38
236 CONVOLVE_ONE_SAMPLE; // 39
237 CONVOLVE_ONE_SAMPLE; // 40
238
239 CONVOLVE_ONE_SAMPLE; // 41
240 CONVOLVE_ONE_SAMPLE; // 42
241 CONVOLVE_ONE_SAMPLE; // 43
242 CONVOLVE_ONE_SAMPLE; // 44
243 CONVOLVE_ONE_SAMPLE; // 45
244 CONVOLVE_ONE_SAMPLE; // 46
245 CONVOLVE_ONE_SAMPLE; // 47
246 CONVOLVE_ONE_SAMPLE; // 48
247 CONVOLVE_ONE_SAMPLE; // 49
248 CONVOLVE_ONE_SAMPLE; // 50
249
250 CONVOLVE_ONE_SAMPLE; // 51
251 CONVOLVE_ONE_SAMPLE; // 52
252 CONVOLVE_ONE_SAMPLE; // 53
253 CONVOLVE_ONE_SAMPLE; // 54
254 CONVOLVE_ONE_SAMPLE; // 55
255 CONVOLVE_ONE_SAMPLE; // 56
256 CONVOLVE_ONE_SAMPLE; // 57
257 CONVOLVE_ONE_SAMPLE; // 58
258 CONVOLVE_ONE_SAMPLE; // 59
259 CONVOLVE_ONE_SAMPLE; // 60
260
261 CONVOLVE_ONE_SAMPLE; // 61
262 CONVOLVE_ONE_SAMPLE; // 62
263 CONVOLVE_ONE_SAMPLE; // 63
264 CONVOLVE_ONE_SAMPLE; // 64
265
266 } else if (kernelSize == 128) {
267 CONVOLVE_ONE_SAMPLE; // 1
268 CONVOLVE_ONE_SAMPLE; // 2
269 CONVOLVE_ONE_SAMPLE; // 3
270 CONVOLVE_ONE_SAMPLE; // 4
271 CONVOLVE_ONE_SAMPLE; // 5
272 CONVOLVE_ONE_SAMPLE; // 6
273 CONVOLVE_ONE_SAMPLE; // 7
274 CONVOLVE_ONE_SAMPLE; // 8
275 CONVOLVE_ONE_SAMPLE; // 9
276 CONVOLVE_ONE_SAMPLE; // 10
277
278 CONVOLVE_ONE_SAMPLE; // 11
279 CONVOLVE_ONE_SAMPLE; // 12
280 CONVOLVE_ONE_SAMPLE; // 13
281 CONVOLVE_ONE_SAMPLE; // 14
282 CONVOLVE_ONE_SAMPLE; // 15
283 CONVOLVE_ONE_SAMPLE; // 16
284 CONVOLVE_ONE_SAMPLE; // 17
285 CONVOLVE_ONE_SAMPLE; // 18
286 CONVOLVE_ONE_SAMPLE; // 19
287 CONVOLVE_ONE_SAMPLE; // 20
288
289 CONVOLVE_ONE_SAMPLE; // 21
290 CONVOLVE_ONE_SAMPLE; // 22
291 CONVOLVE_ONE_SAMPLE; // 23
292 CONVOLVE_ONE_SAMPLE; // 24
293 CONVOLVE_ONE_SAMPLE; // 25
294 CONVOLVE_ONE_SAMPLE; // 26
295 CONVOLVE_ONE_SAMPLE; // 27
296 CONVOLVE_ONE_SAMPLE; // 28
297 CONVOLVE_ONE_SAMPLE; // 29
298 CONVOLVE_ONE_SAMPLE; // 30
299
300 CONVOLVE_ONE_SAMPLE; // 31
301 CONVOLVE_ONE_SAMPLE; // 32
302 CONVOLVE_ONE_SAMPLE; // 33
303 CONVOLVE_ONE_SAMPLE; // 34
304 CONVOLVE_ONE_SAMPLE; // 35
305 CONVOLVE_ONE_SAMPLE; // 36
306 CONVOLVE_ONE_SAMPLE; // 37
307 CONVOLVE_ONE_SAMPLE; // 38
308 CONVOLVE_ONE_SAMPLE; // 39
309 CONVOLVE_ONE_SAMPLE; // 40
310
311 CONVOLVE_ONE_SAMPLE; // 41
312 CONVOLVE_ONE_SAMPLE; // 42
313 CONVOLVE_ONE_SAMPLE; // 43
314 CONVOLVE_ONE_SAMPLE; // 44
315 CONVOLVE_ONE_SAMPLE; // 45
316 CONVOLVE_ONE_SAMPLE; // 46
317 CONVOLVE_ONE_SAMPLE; // 47
318 CONVOLVE_ONE_SAMPLE; // 48
319 CONVOLVE_ONE_SAMPLE; // 49
320 CONVOLVE_ONE_SAMPLE; // 50
321
322 CONVOLVE_ONE_SAMPLE; // 51
323 CONVOLVE_ONE_SAMPLE; // 52
324 CONVOLVE_ONE_SAMPLE; // 53
325 CONVOLVE_ONE_SAMPLE; // 54
326 CONVOLVE_ONE_SAMPLE; // 55
327 CONVOLVE_ONE_SAMPLE; // 56
328 CONVOLVE_ONE_SAMPLE; // 57
329 CONVOLVE_ONE_SAMPLE; // 58
330 CONVOLVE_ONE_SAMPLE; // 59
331 CONVOLVE_ONE_SAMPLE; // 60
332
333 CONVOLVE_ONE_SAMPLE; // 61
334 CONVOLVE_ONE_SAMPLE; // 62
335 CONVOLVE_ONE_SAMPLE; // 63
336 CONVOLVE_ONE_SAMPLE; // 64
337 CONVOLVE_ONE_SAMPLE; // 65
338 CONVOLVE_ONE_SAMPLE; // 66
339 CONVOLVE_ONE_SAMPLE; // 67
340 CONVOLVE_ONE_SAMPLE; // 68
341 CONVOLVE_ONE_SAMPLE; // 69
342 CONVOLVE_ONE_SAMPLE; // 70
343
344 CONVOLVE_ONE_SAMPLE; // 71
345 CONVOLVE_ONE_SAMPLE; // 72
346 CONVOLVE_ONE_SAMPLE; // 73
347 CONVOLVE_ONE_SAMPLE; // 74
348 CONVOLVE_ONE_SAMPLE; // 75
349 CONVOLVE_ONE_SAMPLE; // 76
350 CONVOLVE_ONE_SAMPLE; // 77
351 CONVOLVE_ONE_SAMPLE; // 78
352 CONVOLVE_ONE_SAMPLE; // 79
353 CONVOLVE_ONE_SAMPLE; // 80
354
355 CONVOLVE_ONE_SAMPLE; // 81
356 CONVOLVE_ONE_SAMPLE; // 82
357 CONVOLVE_ONE_SAMPLE; // 83
358 CONVOLVE_ONE_SAMPLE; // 84
359 CONVOLVE_ONE_SAMPLE; // 85
360 CONVOLVE_ONE_SAMPLE; // 86
361 CONVOLVE_ONE_SAMPLE; // 87
362 CONVOLVE_ONE_SAMPLE; // 88
363 CONVOLVE_ONE_SAMPLE; // 89
364 CONVOLVE_ONE_SAMPLE; // 90
365
366 CONVOLVE_ONE_SAMPLE; // 91
367 CONVOLVE_ONE_SAMPLE; // 92
368 CONVOLVE_ONE_SAMPLE; // 93
369 CONVOLVE_ONE_SAMPLE; // 94
370 CONVOLVE_ONE_SAMPLE; // 95
371 CONVOLVE_ONE_SAMPLE; // 96
372 CONVOLVE_ONE_SAMPLE; // 97
373 CONVOLVE_ONE_SAMPLE; // 98
374 CONVOLVE_ONE_SAMPLE; // 99
375 CONVOLVE_ONE_SAMPLE; // 100
376
377 CONVOLVE_ONE_SAMPLE; // 101
378 CONVOLVE_ONE_SAMPLE; // 102
379 CONVOLVE_ONE_SAMPLE; // 103
380 CONVOLVE_ONE_SAMPLE; // 104
381 CONVOLVE_ONE_SAMPLE; // 105
382 CONVOLVE_ONE_SAMPLE; // 106
383 CONVOLVE_ONE_SAMPLE; // 107
384 CONVOLVE_ONE_SAMPLE; // 108
385 CONVOLVE_ONE_SAMPLE; // 109
386 CONVOLVE_ONE_SAMPLE; // 110
387
388 CONVOLVE_ONE_SAMPLE; // 111
389 CONVOLVE_ONE_SAMPLE; // 112
390 CONVOLVE_ONE_SAMPLE; // 113
391 CONVOLVE_ONE_SAMPLE; // 114
392 CONVOLVE_ONE_SAMPLE; // 115
393 CONVOLVE_ONE_SAMPLE; // 116
394 CONVOLVE_ONE_SAMPLE; // 117
395 CONVOLVE_ONE_SAMPLE; // 118
396 CONVOLVE_ONE_SAMPLE; // 119
397 CONVOLVE_ONE_SAMPLE; // 120
398
399 CONVOLVE_ONE_SAMPLE; // 121
400 CONVOLVE_ONE_SAMPLE; // 122
401 CONVOLVE_ONE_SAMPLE; // 123
402 CONVOLVE_ONE_SAMPLE; // 124
403 CONVOLVE_ONE_SAMPLE; // 125
404 CONVOLVE_ONE_SAMPLE; // 126
405 CONVOLVE_ONE_SAMPLE; // 127
406 CONVOLVE_ONE_SAMPLE; // 128
407 } else {
408 while (j < kernelSize) {
409 // Non-optimized using actual while loop.
410 CONVOLVE_ONE_SAMPLE;
411 }
412 }
413 destP[i++] = sum;
414 }
415 #if CPU(X86) || CPU(X86_64)
416 }
417 #endif
418 #endif // OS(MACOSX)
419
420 // Copy 2nd half of input buffer to 1st half.
421 memcpy(m_buffer.data(), inputP, sizeof(float) * framesToProcess);
422 #endif
423 }
424
reset()425 void DirectConvolver::reset()
426 {
427 m_buffer.zero();
428 #if USE(WEBAUDIO_IPP)
429 m_overlayBuffer.zero();
430 #endif // USE(WEBAUDIO_IPP)
431 }
432
433 } // namespace blink
434
435 #endif // ENABLE(WEB_AUDIO)
436