1/*
2 * Copyright (C) 2012 Intel Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 *
8 * 1.  Redistributions of source code must retain the above copyright
9 *     notice, this list of conditions and the following disclaimer.
10 * 2.  Redistributions in binary form must reproduce the above copyright
11 *     notice, this list of conditions and the following disclaimer in the
12 *     documentation and/or other materials provided with the distribution.
13 * 3.  Neither the name of Apple Computer, Inc. ("Apple") nor the names of
14 *     its contributors may be used to endorse or promote products derived
15 *     from this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
18 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20 * DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
21 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29#include "config.h"
30
31#if ENABLE(WEB_AUDIO)
32
33#include "platform/audio/DirectConvolver.h"
34
35#if OS(MACOSX)
36#include <Accelerate/Accelerate.h>
37#endif
38
39#include "platform/audio/VectorMath.h"
40#include "wtf/CPU.h"
41
42#if (CPU(X86) || CPU(X86_64)) && !(OS(MACOSX) || USE(WEBAUDIO_IPP))
43#include <emmintrin.h>
44#endif
45
46namespace blink {
47
48using namespace VectorMath;
49
50DirectConvolver::DirectConvolver(size_t inputBlockSize)
51    : m_inputBlockSize(inputBlockSize)
52#if USE(WEBAUDIO_IPP)
53    , m_overlayBuffer(inputBlockSize)
54#endif // USE(WEBAUDIO_IPP)
55    , m_buffer(inputBlockSize * 2)
56{
57}
58
59void DirectConvolver::process(AudioFloatArray* convolutionKernel, const float* sourceP, float* destP, size_t framesToProcess)
60{
61    ASSERT(framesToProcess == m_inputBlockSize);
62    if (framesToProcess != m_inputBlockSize)
63        return;
64
65    // Only support kernelSize <= m_inputBlockSize
66    size_t kernelSize = convolutionKernel->size();
67    ASSERT(kernelSize <= m_inputBlockSize);
68    if (kernelSize > m_inputBlockSize)
69        return;
70
71    float* kernelP = convolutionKernel->data();
72
73    // Sanity check
74    bool isCopyGood = kernelP && sourceP && destP && m_buffer.data();
75    ASSERT(isCopyGood);
76    if (!isCopyGood)
77        return;
78
79#if USE(WEBAUDIO_IPP)
80    float* outputBuffer = m_buffer.data();
81    float* overlayBuffer = m_overlayBuffer.data();
82    bool isCopyGood2 = overlayBuffer && m_overlayBuffer.size() >= kernelSize && m_buffer.size() == m_inputBlockSize * 2;
83    ASSERT(isCopyGood2);
84    if (!isCopyGood2)
85        return;
86
87    ippsConv_32f(static_cast<const Ipp32f*>(sourceP), framesToProcess, static_cast<Ipp32f*>(kernelP), kernelSize, static_cast<Ipp32f*>(outputBuffer));
88
89    vadd(outputBuffer, 1, overlayBuffer, 1, destP, 1, framesToProcess);
90    memcpy(overlayBuffer, outputBuffer + m_inputBlockSize, sizeof(float) * kernelSize);
91#else
92    float* inputP = m_buffer.data() + m_inputBlockSize;
93
94    // Copy samples to 2nd half of input buffer.
95    memcpy(inputP, sourceP, sizeof(float) * framesToProcess);
96
97#if OS(MACOSX)
98#if CPU(X86)
99    conv(inputP - kernelSize + 1, 1, kernelP + kernelSize - 1, -1, destP, 1, framesToProcess, kernelSize);
100#else
101    vDSP_conv(inputP - kernelSize + 1, 1, kernelP + kernelSize - 1, -1, destP, 1, framesToProcess, kernelSize);
102#endif // CPU(X86)
103#else
104    size_t i = 0;
105#if CPU(X86) || CPU(X86_64)
106    // Convolution using SSE2. Currently only do this if both |kernelSize| and |framesToProcess|
107    // are multiples of 4. If not, use the straightforward loop below.
108
109    if ((kernelSize % 4 == 0) && (framesToProcess % 4 == 0)) {
110        // AudioFloatArray's are always aligned on at least a 16-byte boundary.
111        AudioFloatArray kernelBuffer(4 * kernelSize);
112        __m128* kernelReversed = reinterpret_cast<__m128*>(kernelBuffer.data());
113
114        // Reverse the kernel and repeat each value across a vector
115        for (i = 0; i < kernelSize; ++i) {
116            kernelReversed[i] = _mm_set1_ps(kernelP[kernelSize - i - 1]);
117        }
118
119        float* inputStartP = inputP - kernelSize + 1;
120
121        // Do convolution with 4 inputs at a time.
122        for (i = 0; i < framesToProcess; i += 4) {
123            __m128 convolutionSum;
124
125            convolutionSum = _mm_setzero_ps();
126
127            // |kernelSize| is a multiple of 4 so we can unroll the loop by 4, manually.
128            for (size_t k = 0; k < kernelSize; k += 4) {
129                size_t dataOffset = i + k;
130
131                for (size_t m = 0; m < 4; ++m) {
132                    __m128 sourceBlock;
133                    __m128 product;
134
135                    sourceBlock = _mm_loadu_ps(inputStartP + dataOffset + m);
136                    product = _mm_mul_ps(kernelReversed[k + m], sourceBlock);
137                    convolutionSum = _mm_add_ps(convolutionSum, product);
138                }
139            }
140            _mm_storeu_ps(destP + i, convolutionSum);
141        }
142    } else {
143#endif
144
145    // FIXME: The macro can be further optimized to avoid pipeline stalls. One possibility is to maintain 4 separate sums and change the macro to CONVOLVE_FOUR_SAMPLES.
146#define CONVOLVE_ONE_SAMPLE                 \
147    do {                                    \
148        sum += inputP[i - j] * kernelP[j];  \
149        j++;                                \
150    } while (0)
151
152    while (i < framesToProcess) {
153        size_t j = 0;
154        float sum = 0;
155
156        // FIXME: SSE optimization may be applied here.
157        if (kernelSize == 32) {
158            CONVOLVE_ONE_SAMPLE; // 1
159            CONVOLVE_ONE_SAMPLE; // 2
160            CONVOLVE_ONE_SAMPLE; // 3
161            CONVOLVE_ONE_SAMPLE; // 4
162            CONVOLVE_ONE_SAMPLE; // 5
163            CONVOLVE_ONE_SAMPLE; // 6
164            CONVOLVE_ONE_SAMPLE; // 7
165            CONVOLVE_ONE_SAMPLE; // 8
166            CONVOLVE_ONE_SAMPLE; // 9
167            CONVOLVE_ONE_SAMPLE; // 10
168
169            CONVOLVE_ONE_SAMPLE; // 11
170            CONVOLVE_ONE_SAMPLE; // 12
171            CONVOLVE_ONE_SAMPLE; // 13
172            CONVOLVE_ONE_SAMPLE; // 14
173            CONVOLVE_ONE_SAMPLE; // 15
174            CONVOLVE_ONE_SAMPLE; // 16
175            CONVOLVE_ONE_SAMPLE; // 17
176            CONVOLVE_ONE_SAMPLE; // 18
177            CONVOLVE_ONE_SAMPLE; // 19
178            CONVOLVE_ONE_SAMPLE; // 20
179
180            CONVOLVE_ONE_SAMPLE; // 21
181            CONVOLVE_ONE_SAMPLE; // 22
182            CONVOLVE_ONE_SAMPLE; // 23
183            CONVOLVE_ONE_SAMPLE; // 24
184            CONVOLVE_ONE_SAMPLE; // 25
185            CONVOLVE_ONE_SAMPLE; // 26
186            CONVOLVE_ONE_SAMPLE; // 27
187            CONVOLVE_ONE_SAMPLE; // 28
188            CONVOLVE_ONE_SAMPLE; // 29
189            CONVOLVE_ONE_SAMPLE; // 30
190
191            CONVOLVE_ONE_SAMPLE; // 31
192            CONVOLVE_ONE_SAMPLE; // 32
193
194        } else if (kernelSize == 64) {
195            CONVOLVE_ONE_SAMPLE; // 1
196            CONVOLVE_ONE_SAMPLE; // 2
197            CONVOLVE_ONE_SAMPLE; // 3
198            CONVOLVE_ONE_SAMPLE; // 4
199            CONVOLVE_ONE_SAMPLE; // 5
200            CONVOLVE_ONE_SAMPLE; // 6
201            CONVOLVE_ONE_SAMPLE; // 7
202            CONVOLVE_ONE_SAMPLE; // 8
203            CONVOLVE_ONE_SAMPLE; // 9
204            CONVOLVE_ONE_SAMPLE; // 10
205
206            CONVOLVE_ONE_SAMPLE; // 11
207            CONVOLVE_ONE_SAMPLE; // 12
208            CONVOLVE_ONE_SAMPLE; // 13
209            CONVOLVE_ONE_SAMPLE; // 14
210            CONVOLVE_ONE_SAMPLE; // 15
211            CONVOLVE_ONE_SAMPLE; // 16
212            CONVOLVE_ONE_SAMPLE; // 17
213            CONVOLVE_ONE_SAMPLE; // 18
214            CONVOLVE_ONE_SAMPLE; // 19
215            CONVOLVE_ONE_SAMPLE; // 20
216
217            CONVOLVE_ONE_SAMPLE; // 21
218            CONVOLVE_ONE_SAMPLE; // 22
219            CONVOLVE_ONE_SAMPLE; // 23
220            CONVOLVE_ONE_SAMPLE; // 24
221            CONVOLVE_ONE_SAMPLE; // 25
222            CONVOLVE_ONE_SAMPLE; // 26
223            CONVOLVE_ONE_SAMPLE; // 27
224            CONVOLVE_ONE_SAMPLE; // 28
225            CONVOLVE_ONE_SAMPLE; // 29
226            CONVOLVE_ONE_SAMPLE; // 30
227
228            CONVOLVE_ONE_SAMPLE; // 31
229            CONVOLVE_ONE_SAMPLE; // 32
230            CONVOLVE_ONE_SAMPLE; // 33
231            CONVOLVE_ONE_SAMPLE; // 34
232            CONVOLVE_ONE_SAMPLE; // 35
233            CONVOLVE_ONE_SAMPLE; // 36
234            CONVOLVE_ONE_SAMPLE; // 37
235            CONVOLVE_ONE_SAMPLE; // 38
236            CONVOLVE_ONE_SAMPLE; // 39
237            CONVOLVE_ONE_SAMPLE; // 40
238
239            CONVOLVE_ONE_SAMPLE; // 41
240            CONVOLVE_ONE_SAMPLE; // 42
241            CONVOLVE_ONE_SAMPLE; // 43
242            CONVOLVE_ONE_SAMPLE; // 44
243            CONVOLVE_ONE_SAMPLE; // 45
244            CONVOLVE_ONE_SAMPLE; // 46
245            CONVOLVE_ONE_SAMPLE; // 47
246            CONVOLVE_ONE_SAMPLE; // 48
247            CONVOLVE_ONE_SAMPLE; // 49
248            CONVOLVE_ONE_SAMPLE; // 50
249
250            CONVOLVE_ONE_SAMPLE; // 51
251            CONVOLVE_ONE_SAMPLE; // 52
252            CONVOLVE_ONE_SAMPLE; // 53
253            CONVOLVE_ONE_SAMPLE; // 54
254            CONVOLVE_ONE_SAMPLE; // 55
255            CONVOLVE_ONE_SAMPLE; // 56
256            CONVOLVE_ONE_SAMPLE; // 57
257            CONVOLVE_ONE_SAMPLE; // 58
258            CONVOLVE_ONE_SAMPLE; // 59
259            CONVOLVE_ONE_SAMPLE; // 60
260
261            CONVOLVE_ONE_SAMPLE; // 61
262            CONVOLVE_ONE_SAMPLE; // 62
263            CONVOLVE_ONE_SAMPLE; // 63
264            CONVOLVE_ONE_SAMPLE; // 64
265
266        } else if (kernelSize == 128) {
267            CONVOLVE_ONE_SAMPLE; // 1
268            CONVOLVE_ONE_SAMPLE; // 2
269            CONVOLVE_ONE_SAMPLE; // 3
270            CONVOLVE_ONE_SAMPLE; // 4
271            CONVOLVE_ONE_SAMPLE; // 5
272            CONVOLVE_ONE_SAMPLE; // 6
273            CONVOLVE_ONE_SAMPLE; // 7
274            CONVOLVE_ONE_SAMPLE; // 8
275            CONVOLVE_ONE_SAMPLE; // 9
276            CONVOLVE_ONE_SAMPLE; // 10
277
278            CONVOLVE_ONE_SAMPLE; // 11
279            CONVOLVE_ONE_SAMPLE; // 12
280            CONVOLVE_ONE_SAMPLE; // 13
281            CONVOLVE_ONE_SAMPLE; // 14
282            CONVOLVE_ONE_SAMPLE; // 15
283            CONVOLVE_ONE_SAMPLE; // 16
284            CONVOLVE_ONE_SAMPLE; // 17
285            CONVOLVE_ONE_SAMPLE; // 18
286            CONVOLVE_ONE_SAMPLE; // 19
287            CONVOLVE_ONE_SAMPLE; // 20
288
289            CONVOLVE_ONE_SAMPLE; // 21
290            CONVOLVE_ONE_SAMPLE; // 22
291            CONVOLVE_ONE_SAMPLE; // 23
292            CONVOLVE_ONE_SAMPLE; // 24
293            CONVOLVE_ONE_SAMPLE; // 25
294            CONVOLVE_ONE_SAMPLE; // 26
295            CONVOLVE_ONE_SAMPLE; // 27
296            CONVOLVE_ONE_SAMPLE; // 28
297            CONVOLVE_ONE_SAMPLE; // 29
298            CONVOLVE_ONE_SAMPLE; // 30
299
300            CONVOLVE_ONE_SAMPLE; // 31
301            CONVOLVE_ONE_SAMPLE; // 32
302            CONVOLVE_ONE_SAMPLE; // 33
303            CONVOLVE_ONE_SAMPLE; // 34
304            CONVOLVE_ONE_SAMPLE; // 35
305            CONVOLVE_ONE_SAMPLE; // 36
306            CONVOLVE_ONE_SAMPLE; // 37
307            CONVOLVE_ONE_SAMPLE; // 38
308            CONVOLVE_ONE_SAMPLE; // 39
309            CONVOLVE_ONE_SAMPLE; // 40
310
311            CONVOLVE_ONE_SAMPLE; // 41
312            CONVOLVE_ONE_SAMPLE; // 42
313            CONVOLVE_ONE_SAMPLE; // 43
314            CONVOLVE_ONE_SAMPLE; // 44
315            CONVOLVE_ONE_SAMPLE; // 45
316            CONVOLVE_ONE_SAMPLE; // 46
317            CONVOLVE_ONE_SAMPLE; // 47
318            CONVOLVE_ONE_SAMPLE; // 48
319            CONVOLVE_ONE_SAMPLE; // 49
320            CONVOLVE_ONE_SAMPLE; // 50
321
322            CONVOLVE_ONE_SAMPLE; // 51
323            CONVOLVE_ONE_SAMPLE; // 52
324            CONVOLVE_ONE_SAMPLE; // 53
325            CONVOLVE_ONE_SAMPLE; // 54
326            CONVOLVE_ONE_SAMPLE; // 55
327            CONVOLVE_ONE_SAMPLE; // 56
328            CONVOLVE_ONE_SAMPLE; // 57
329            CONVOLVE_ONE_SAMPLE; // 58
330            CONVOLVE_ONE_SAMPLE; // 59
331            CONVOLVE_ONE_SAMPLE; // 60
332
333            CONVOLVE_ONE_SAMPLE; // 61
334            CONVOLVE_ONE_SAMPLE; // 62
335            CONVOLVE_ONE_SAMPLE; // 63
336            CONVOLVE_ONE_SAMPLE; // 64
337            CONVOLVE_ONE_SAMPLE; // 65
338            CONVOLVE_ONE_SAMPLE; // 66
339            CONVOLVE_ONE_SAMPLE; // 67
340            CONVOLVE_ONE_SAMPLE; // 68
341            CONVOLVE_ONE_SAMPLE; // 69
342            CONVOLVE_ONE_SAMPLE; // 70
343
344            CONVOLVE_ONE_SAMPLE; // 71
345            CONVOLVE_ONE_SAMPLE; // 72
346            CONVOLVE_ONE_SAMPLE; // 73
347            CONVOLVE_ONE_SAMPLE; // 74
348            CONVOLVE_ONE_SAMPLE; // 75
349            CONVOLVE_ONE_SAMPLE; // 76
350            CONVOLVE_ONE_SAMPLE; // 77
351            CONVOLVE_ONE_SAMPLE; // 78
352            CONVOLVE_ONE_SAMPLE; // 79
353            CONVOLVE_ONE_SAMPLE; // 80
354
355            CONVOLVE_ONE_SAMPLE; // 81
356            CONVOLVE_ONE_SAMPLE; // 82
357            CONVOLVE_ONE_SAMPLE; // 83
358            CONVOLVE_ONE_SAMPLE; // 84
359            CONVOLVE_ONE_SAMPLE; // 85
360            CONVOLVE_ONE_SAMPLE; // 86
361            CONVOLVE_ONE_SAMPLE; // 87
362            CONVOLVE_ONE_SAMPLE; // 88
363            CONVOLVE_ONE_SAMPLE; // 89
364            CONVOLVE_ONE_SAMPLE; // 90
365
366            CONVOLVE_ONE_SAMPLE; // 91
367            CONVOLVE_ONE_SAMPLE; // 92
368            CONVOLVE_ONE_SAMPLE; // 93
369            CONVOLVE_ONE_SAMPLE; // 94
370            CONVOLVE_ONE_SAMPLE; // 95
371            CONVOLVE_ONE_SAMPLE; // 96
372            CONVOLVE_ONE_SAMPLE; // 97
373            CONVOLVE_ONE_SAMPLE; // 98
374            CONVOLVE_ONE_SAMPLE; // 99
375            CONVOLVE_ONE_SAMPLE; // 100
376
377            CONVOLVE_ONE_SAMPLE; // 101
378            CONVOLVE_ONE_SAMPLE; // 102
379            CONVOLVE_ONE_SAMPLE; // 103
380            CONVOLVE_ONE_SAMPLE; // 104
381            CONVOLVE_ONE_SAMPLE; // 105
382            CONVOLVE_ONE_SAMPLE; // 106
383            CONVOLVE_ONE_SAMPLE; // 107
384            CONVOLVE_ONE_SAMPLE; // 108
385            CONVOLVE_ONE_SAMPLE; // 109
386            CONVOLVE_ONE_SAMPLE; // 110
387
388            CONVOLVE_ONE_SAMPLE; // 111
389            CONVOLVE_ONE_SAMPLE; // 112
390            CONVOLVE_ONE_SAMPLE; // 113
391            CONVOLVE_ONE_SAMPLE; // 114
392            CONVOLVE_ONE_SAMPLE; // 115
393            CONVOLVE_ONE_SAMPLE; // 116
394            CONVOLVE_ONE_SAMPLE; // 117
395            CONVOLVE_ONE_SAMPLE; // 118
396            CONVOLVE_ONE_SAMPLE; // 119
397            CONVOLVE_ONE_SAMPLE; // 120
398
399            CONVOLVE_ONE_SAMPLE; // 121
400            CONVOLVE_ONE_SAMPLE; // 122
401            CONVOLVE_ONE_SAMPLE; // 123
402            CONVOLVE_ONE_SAMPLE; // 124
403            CONVOLVE_ONE_SAMPLE; // 125
404            CONVOLVE_ONE_SAMPLE; // 126
405            CONVOLVE_ONE_SAMPLE; // 127
406            CONVOLVE_ONE_SAMPLE; // 128
407        } else {
408            while (j < kernelSize) {
409                // Non-optimized using actual while loop.
410                CONVOLVE_ONE_SAMPLE;
411            }
412        }
413        destP[i++] = sum;
414    }
415#if CPU(X86) || CPU(X86_64)
416    }
417#endif
418#endif // OS(MACOSX)
419
420    // Copy 2nd half of input buffer to 1st half.
421    memcpy(m_buffer.data(), inputP, sizeof(float) * framesToProcess);
422#endif
423}
424
425void DirectConvolver::reset()
426{
427    m_buffer.zero();
428#if USE(WEBAUDIO_IPP)
429    m_overlayBuffer.zero();
430#endif // USE(WEBAUDIO_IPP)
431}
432
433} // namespace blink
434
435#endif // ENABLE(WEB_AUDIO)
436