1/*
2 * Copyright (C) 2011 Google Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 *
8 * 1.  Redistributions of source code must retain the above copyright
9 *     notice, this list of conditions and the following disclaimer.
10 * 2.  Redistributions in binary form must reproduce the above copyright
11 *     notice, this list of conditions and the following disclaimer in the
12 *     documentation and/or other materials provided with the distribution.
13 * 3.  Neither the name of Apple Computer, Inc. ("Apple") nor the names of
14 *     its contributors may be used to endorse or promote products derived
15 *     from this software without specific prior written permission.
16 *
17 * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY
18 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20 * DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY
21 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
22 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
23 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
24 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28
29#include "config.h"
30
31#if ENABLE(WEB_AUDIO)
32
33#include "platform/audio/SincResampler.h"
34
35#include "platform/audio/AudioBus.h"
36#include "wtf/CPU.h"
37#include "wtf/MathExtras.h"
38
39#if CPU(X86) || CPU(X86_64)
40#include <emmintrin.h>
41#endif
42
43// Input buffer layout, dividing the total buffer into regions (r0 - r5):
44//
45// |----------------|----------------------------------------------------------------|----------------|
46//
47//                                              blockSize + kernelSize / 2
48//                   <-------------------------------------------------------------------------------->
49//                                                  r0
50//
51//   kernelSize / 2   kernelSize / 2                                 kernelSize / 2     kernelSize / 2
52// <---------------> <--------------->                              <---------------> <--------------->
53//         r1                r2                                             r3                r4
54//
55//                                              blockSize
56//                                     <-------------------------------------------------------------->
57//                                                  r5
58
59// The Algorithm:
60//
61// 1) Consume input frames into r0 (r1 is zero-initialized).
62// 2) Position kernel centered at start of r0 (r2) and generate output frames until kernel is centered at start of r4.
63//    or we've finished generating all the output frames.
64// 3) Copy r3 to r1 and r4 to r2.
65// 4) Consume input frames into r5 (zero-pad if we run out of input).
66// 5) Goto (2) until all of input is consumed.
67//
68// note: we're glossing over how the sub-sample handling works with m_virtualSourceIndex, etc.
69
70namespace blink {
71
72SincResampler::SincResampler(double scaleFactor, unsigned kernelSize, unsigned numberOfKernelOffsets)
73    : m_scaleFactor(scaleFactor)
74    , m_kernelSize(kernelSize)
75    , m_numberOfKernelOffsets(numberOfKernelOffsets)
76    , m_kernelStorage(m_kernelSize * (m_numberOfKernelOffsets + 1))
77    , m_virtualSourceIndex(0)
78    , m_blockSize(512)
79    , m_inputBuffer(m_blockSize + m_kernelSize) // See input buffer layout above.
80    , m_source(0)
81    , m_sourceFramesAvailable(0)
82    , m_sourceProvider(0)
83    , m_isBufferPrimed(false)
84{
85    initializeKernel();
86}
87
88void SincResampler::initializeKernel()
89{
90    // Blackman window parameters.
91    double alpha = 0.16;
92    double a0 = 0.5 * (1.0 - alpha);
93    double a1 = 0.5;
94    double a2 = 0.5 * alpha;
95
96    // sincScaleFactor is basically the normalized cutoff frequency of the low-pass filter.
97    double sincScaleFactor = m_scaleFactor > 1.0 ? 1.0 / m_scaleFactor : 1.0;
98
99    // The sinc function is an idealized brick-wall filter, but since we're windowing it the
100    // transition from pass to stop does not happen right away. So we should adjust the
101    // lowpass filter cutoff slightly downward to avoid some aliasing at the very high-end.
102    // FIXME: this value is empirical and to be more exact should vary depending on m_kernelSize.
103    sincScaleFactor *= 0.9;
104
105    int n = m_kernelSize;
106    int halfSize = n / 2;
107
108    // Generates a set of windowed sinc() kernels.
109    // We generate a range of sub-sample offsets from 0.0 to 1.0.
110    for (unsigned offsetIndex = 0; offsetIndex <= m_numberOfKernelOffsets; ++offsetIndex) {
111        double subsampleOffset = static_cast<double>(offsetIndex) / m_numberOfKernelOffsets;
112
113        for (int i = 0; i < n; ++i) {
114            // Compute the sinc() with offset.
115            double s = sincScaleFactor * piDouble * (i - halfSize - subsampleOffset);
116            double sinc = !s ? 1.0 : std::sin(s) / s;
117            sinc *= sincScaleFactor;
118
119            // Compute Blackman window, matching the offset of the sinc().
120            double x = (i - subsampleOffset) / n;
121            double window = a0 - a1 * std::cos(twoPiDouble * x) + a2 * std::cos(twoPiDouble * 2.0 * x);
122
123            // Window the sinc() function and store at the correct offset.
124            m_kernelStorage[i + offsetIndex * m_kernelSize] = sinc * window;
125        }
126    }
127}
128
129void SincResampler::consumeSource(float* buffer, unsigned numberOfSourceFrames)
130{
131    ASSERT(m_sourceProvider);
132    if (!m_sourceProvider)
133        return;
134
135    // Wrap the provided buffer by an AudioBus for use by the source provider.
136    RefPtr<AudioBus> bus = AudioBus::create(1, numberOfSourceFrames, false);
137
138    // FIXME: Find a way to make the following const-correct:
139    bus->setChannelMemory(0, buffer, numberOfSourceFrames);
140
141    m_sourceProvider->provideInput(bus.get(), numberOfSourceFrames);
142}
143
144namespace {
145
146// BufferSourceProvider is an AudioSourceProvider wrapping an in-memory buffer.
147
148class BufferSourceProvider FINAL : public AudioSourceProvider {
149public:
150    BufferSourceProvider(const float* source, size_t numberOfSourceFrames)
151        : m_source(source)
152        , m_sourceFramesAvailable(numberOfSourceFrames)
153    {
154    }
155
156    // Consumes samples from the in-memory buffer.
157    virtual void provideInput(AudioBus* bus, size_t framesToProcess) OVERRIDE
158    {
159        ASSERT(m_source && bus);
160        if (!m_source || !bus)
161            return;
162
163        float* buffer = bus->channel(0)->mutableData();
164
165        // Clamp to number of frames available and zero-pad.
166        size_t framesToCopy = std::min(m_sourceFramesAvailable, framesToProcess);
167        memcpy(buffer, m_source, sizeof(float) * framesToCopy);
168
169        // Zero-pad if necessary.
170        if (framesToCopy < framesToProcess)
171            memset(buffer + framesToCopy, 0, sizeof(float) * (framesToProcess - framesToCopy));
172
173        m_sourceFramesAvailable -= framesToCopy;
174        m_source += framesToCopy;
175    }
176
177private:
178    const float* m_source;
179    size_t m_sourceFramesAvailable;
180};
181
182} // namespace
183
184void SincResampler::process(const float* source, float* destination, unsigned numberOfSourceFrames)
185{
186    // Resample an in-memory buffer using an AudioSourceProvider.
187    BufferSourceProvider sourceProvider(source, numberOfSourceFrames);
188
189    unsigned numberOfDestinationFrames = static_cast<unsigned>(numberOfSourceFrames / m_scaleFactor);
190    unsigned remaining = numberOfDestinationFrames;
191
192    while (remaining) {
193        unsigned framesThisTime = std::min(remaining, m_blockSize);
194        process(&sourceProvider, destination, framesThisTime);
195
196        destination += framesThisTime;
197        remaining -= framesThisTime;
198    }
199}
200
201void SincResampler::process(AudioSourceProvider* sourceProvider, float* destination, size_t framesToProcess)
202{
203    bool isGood = sourceProvider && m_blockSize > m_kernelSize && m_inputBuffer.size() >= m_blockSize + m_kernelSize && !(m_kernelSize % 2);
204    ASSERT(isGood);
205    if (!isGood)
206        return;
207
208    m_sourceProvider = sourceProvider;
209
210    unsigned numberOfDestinationFrames = framesToProcess;
211
212    // Setup various region pointers in the buffer (see diagram above).
213    float* r0 = m_inputBuffer.data() + m_kernelSize / 2;
214    float* r1 = m_inputBuffer.data();
215    float* r2 = r0;
216    float* r3 = r0 + m_blockSize - m_kernelSize / 2;
217    float* r4 = r0 + m_blockSize;
218    float* r5 = r0 + m_kernelSize / 2;
219
220    // Step (1)
221    // Prime the input buffer at the start of the input stream.
222    if (!m_isBufferPrimed) {
223        consumeSource(r0, m_blockSize + m_kernelSize / 2);
224        m_isBufferPrimed = true;
225    }
226
227    // Step (2)
228
229    while (numberOfDestinationFrames) {
230        while (m_virtualSourceIndex < m_blockSize) {
231            // m_virtualSourceIndex lies in between two kernel offsets so figure out what they are.
232            int sourceIndexI = static_cast<int>(m_virtualSourceIndex);
233            double subsampleRemainder = m_virtualSourceIndex - sourceIndexI;
234
235            double virtualOffsetIndex = subsampleRemainder * m_numberOfKernelOffsets;
236            int offsetIndex = static_cast<int>(virtualOffsetIndex);
237
238            float* k1 = m_kernelStorage.data() + offsetIndex * m_kernelSize;
239            float* k2 = k1 + m_kernelSize;
240
241            // Initialize input pointer based on quantized m_virtualSourceIndex.
242            float* inputP = r1 + sourceIndexI;
243
244            // We'll compute "convolutions" for the two kernels which straddle m_virtualSourceIndex
245            float sum1 = 0;
246            float sum2 = 0;
247
248            // Figure out how much to weight each kernel's "convolution".
249            double kernelInterpolationFactor = virtualOffsetIndex - offsetIndex;
250
251            // Generate a single output sample.
252            int n = m_kernelSize;
253
254#define CONVOLVE_ONE_SAMPLE      \
255            input = *inputP++;   \
256            sum1 += input * *k1; \
257            sum2 += input * *k2; \
258            ++k1;                \
259            ++k2;
260
261            {
262                float input;
263
264#if CPU(X86) || CPU(X86_64)
265                // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed seperately.
266                while ((reinterpret_cast<uintptr_t>(inputP) & 0x0F) && n) {
267                    CONVOLVE_ONE_SAMPLE
268                    n--;
269                }
270
271                // Now the inputP is aligned and start to apply SSE.
272                float* endP = inputP + n - n % 4;
273                __m128 mInput;
274                __m128 mK1;
275                __m128 mK2;
276                __m128 mul1;
277                __m128 mul2;
278
279                __m128 sums1 = _mm_setzero_ps();
280                __m128 sums2 = _mm_setzero_ps();
281                bool k1Aligned = !(reinterpret_cast<uintptr_t>(k1) & 0x0F);
282                bool k2Aligned = !(reinterpret_cast<uintptr_t>(k2) & 0x0F);
283
284#define LOAD_DATA(l1, l2)                        \
285                mInput = _mm_load_ps(inputP);    \
286                mK1 = _mm_##l1##_ps(k1);         \
287                mK2 = _mm_##l2##_ps(k2);
288
289#define CONVOLVE_4_SAMPLES                       \
290                mul1 = _mm_mul_ps(mInput, mK1);  \
291                mul2 = _mm_mul_ps(mInput, mK2);  \
292                sums1 = _mm_add_ps(sums1, mul1); \
293                sums2 = _mm_add_ps(sums2, mul2); \
294                inputP += 4;                     \
295                k1 += 4;                         \
296                k2 += 4;
297
298                if (k1Aligned && k2Aligned) { // both aligned
299                    while (inputP < endP) {
300                        LOAD_DATA(load, load)
301                        CONVOLVE_4_SAMPLES
302                    }
303                } else if (!k1Aligned && k2Aligned) { // only k2 aligned
304                    while (inputP < endP) {
305                        LOAD_DATA(loadu, load)
306                        CONVOLVE_4_SAMPLES
307                    }
308                } else if (k1Aligned && !k2Aligned) { // only k1 aligned
309                    while (inputP < endP) {
310                        LOAD_DATA(load, loadu)
311                        CONVOLVE_4_SAMPLES
312                    }
313                } else { // both non-aligned
314                    while (inputP < endP) {
315                        LOAD_DATA(loadu, loadu)
316                        CONVOLVE_4_SAMPLES
317                    }
318                }
319
320                // Summarize the SSE results to sum1 and sum2.
321                float* groupSumP = reinterpret_cast<float*>(&sums1);
322                sum1 += groupSumP[0] + groupSumP[1] + groupSumP[2] + groupSumP[3];
323                groupSumP = reinterpret_cast<float*>(&sums2);
324                sum2 += groupSumP[0] + groupSumP[1] + groupSumP[2] + groupSumP[3];
325
326                n %= 4;
327                while (n) {
328                    CONVOLVE_ONE_SAMPLE
329                    n--;
330                }
331#else
332                // FIXME: add ARM NEON optimizations for the following. The scalar code-path can probably also be optimized better.
333
334                // Optimize size 32 and size 64 kernels by unrolling the while loop.
335                // A 20 - 30% speed improvement was measured in some cases by using this approach.
336
337                if (n == 32) {
338                    CONVOLVE_ONE_SAMPLE // 1
339                    CONVOLVE_ONE_SAMPLE // 2
340                    CONVOLVE_ONE_SAMPLE // 3
341                    CONVOLVE_ONE_SAMPLE // 4
342                    CONVOLVE_ONE_SAMPLE // 5
343                    CONVOLVE_ONE_SAMPLE // 6
344                    CONVOLVE_ONE_SAMPLE // 7
345                    CONVOLVE_ONE_SAMPLE // 8
346                    CONVOLVE_ONE_SAMPLE // 9
347                    CONVOLVE_ONE_SAMPLE // 10
348                    CONVOLVE_ONE_SAMPLE // 11
349                    CONVOLVE_ONE_SAMPLE // 12
350                    CONVOLVE_ONE_SAMPLE // 13
351                    CONVOLVE_ONE_SAMPLE // 14
352                    CONVOLVE_ONE_SAMPLE // 15
353                    CONVOLVE_ONE_SAMPLE // 16
354                    CONVOLVE_ONE_SAMPLE // 17
355                    CONVOLVE_ONE_SAMPLE // 18
356                    CONVOLVE_ONE_SAMPLE // 19
357                    CONVOLVE_ONE_SAMPLE // 20
358                    CONVOLVE_ONE_SAMPLE // 21
359                    CONVOLVE_ONE_SAMPLE // 22
360                    CONVOLVE_ONE_SAMPLE // 23
361                    CONVOLVE_ONE_SAMPLE // 24
362                    CONVOLVE_ONE_SAMPLE // 25
363                    CONVOLVE_ONE_SAMPLE // 26
364                    CONVOLVE_ONE_SAMPLE // 27
365                    CONVOLVE_ONE_SAMPLE // 28
366                    CONVOLVE_ONE_SAMPLE // 29
367                    CONVOLVE_ONE_SAMPLE // 30
368                    CONVOLVE_ONE_SAMPLE // 31
369                    CONVOLVE_ONE_SAMPLE // 32
370                } else if (n == 64) {
371                    CONVOLVE_ONE_SAMPLE // 1
372                    CONVOLVE_ONE_SAMPLE // 2
373                    CONVOLVE_ONE_SAMPLE // 3
374                    CONVOLVE_ONE_SAMPLE // 4
375                    CONVOLVE_ONE_SAMPLE // 5
376                    CONVOLVE_ONE_SAMPLE // 6
377                    CONVOLVE_ONE_SAMPLE // 7
378                    CONVOLVE_ONE_SAMPLE // 8
379                    CONVOLVE_ONE_SAMPLE // 9
380                    CONVOLVE_ONE_SAMPLE // 10
381                    CONVOLVE_ONE_SAMPLE // 11
382                    CONVOLVE_ONE_SAMPLE // 12
383                    CONVOLVE_ONE_SAMPLE // 13
384                    CONVOLVE_ONE_SAMPLE // 14
385                    CONVOLVE_ONE_SAMPLE // 15
386                    CONVOLVE_ONE_SAMPLE // 16
387                    CONVOLVE_ONE_SAMPLE // 17
388                    CONVOLVE_ONE_SAMPLE // 18
389                    CONVOLVE_ONE_SAMPLE // 19
390                    CONVOLVE_ONE_SAMPLE // 20
391                    CONVOLVE_ONE_SAMPLE // 21
392                    CONVOLVE_ONE_SAMPLE // 22
393                    CONVOLVE_ONE_SAMPLE // 23
394                    CONVOLVE_ONE_SAMPLE // 24
395                    CONVOLVE_ONE_SAMPLE // 25
396                    CONVOLVE_ONE_SAMPLE // 26
397                    CONVOLVE_ONE_SAMPLE // 27
398                    CONVOLVE_ONE_SAMPLE // 28
399                    CONVOLVE_ONE_SAMPLE // 29
400                    CONVOLVE_ONE_SAMPLE // 30
401                    CONVOLVE_ONE_SAMPLE // 31
402                    CONVOLVE_ONE_SAMPLE // 32
403                    CONVOLVE_ONE_SAMPLE // 33
404                    CONVOLVE_ONE_SAMPLE // 34
405                    CONVOLVE_ONE_SAMPLE // 35
406                    CONVOLVE_ONE_SAMPLE // 36
407                    CONVOLVE_ONE_SAMPLE // 37
408                    CONVOLVE_ONE_SAMPLE // 38
409                    CONVOLVE_ONE_SAMPLE // 39
410                    CONVOLVE_ONE_SAMPLE // 40
411                    CONVOLVE_ONE_SAMPLE // 41
412                    CONVOLVE_ONE_SAMPLE // 42
413                    CONVOLVE_ONE_SAMPLE // 43
414                    CONVOLVE_ONE_SAMPLE // 44
415                    CONVOLVE_ONE_SAMPLE // 45
416                    CONVOLVE_ONE_SAMPLE // 46
417                    CONVOLVE_ONE_SAMPLE // 47
418                    CONVOLVE_ONE_SAMPLE // 48
419                    CONVOLVE_ONE_SAMPLE // 49
420                    CONVOLVE_ONE_SAMPLE // 50
421                    CONVOLVE_ONE_SAMPLE // 51
422                    CONVOLVE_ONE_SAMPLE // 52
423                    CONVOLVE_ONE_SAMPLE // 53
424                    CONVOLVE_ONE_SAMPLE // 54
425                    CONVOLVE_ONE_SAMPLE // 55
426                    CONVOLVE_ONE_SAMPLE // 56
427                    CONVOLVE_ONE_SAMPLE // 57
428                    CONVOLVE_ONE_SAMPLE // 58
429                    CONVOLVE_ONE_SAMPLE // 59
430                    CONVOLVE_ONE_SAMPLE // 60
431                    CONVOLVE_ONE_SAMPLE // 61
432                    CONVOLVE_ONE_SAMPLE // 62
433                    CONVOLVE_ONE_SAMPLE // 63
434                    CONVOLVE_ONE_SAMPLE // 64
435                } else {
436                    while (n--) {
437                        // Non-optimized using actual while loop.
438                        CONVOLVE_ONE_SAMPLE
439                    }
440                }
441#endif
442            }
443
444            // Linearly interpolate the two "convolutions".
445            double result = (1.0 - kernelInterpolationFactor) * sum1 + kernelInterpolationFactor * sum2;
446
447            *destination++ = result;
448
449            // Advance the virtual index.
450            m_virtualSourceIndex += m_scaleFactor;
451
452            --numberOfDestinationFrames;
453            if (!numberOfDestinationFrames)
454                return;
455        }
456
457        // Wrap back around to the start.
458        m_virtualSourceIndex -= m_blockSize;
459
460        // Step (3) Copy r3 to r1 and r4 to r2.
461        // This wraps the last input frames back to the start of the buffer.
462        memcpy(r1, r3, sizeof(float) * (m_kernelSize / 2));
463        memcpy(r2, r4, sizeof(float) * (m_kernelSize / 2));
464
465        // Step (4)
466        // Refresh the buffer with more input.
467        consumeSource(r5, m_blockSize);
468    }
469}
470
471} // namespace blink
472
473#endif // ENABLE(WEB_AUDIO)
474