1/* 2 * Copyright (C) 2011 Google Inc. All rights reserved. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 3. Neither the name of Apple Computer, Inc. ("Apple") nor the names of 14 * its contributors may be used to endorse or promote products derived 15 * from this software without specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY APPLE AND ITS CONTRIBUTORS "AS IS" AND ANY 18 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 19 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 * DISCLAIMED. IN NO EVENT SHALL APPLE OR ITS CONTRIBUTORS BE LIABLE FOR ANY 21 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 22 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 23 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 24 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 26 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 */ 28 29#include "config.h" 30 31#if ENABLE(WEB_AUDIO) 32 33#include "platform/audio/SincResampler.h" 34 35#include "platform/audio/AudioBus.h" 36#include "wtf/CPU.h" 37#include "wtf/MathExtras.h" 38 39#if CPU(X86) || CPU(X86_64) 40#include <emmintrin.h> 41#endif 42 43// Input buffer layout, dividing the total buffer into regions (r0 - r5): 44// 45// |----------------|----------------------------------------------------------------|----------------| 46// 47// blockSize + kernelSize / 2 48// <--------------------------------------------------------------------------------> 49// r0 50// 51// kernelSize / 2 kernelSize / 2 kernelSize / 2 kernelSize / 2 52// <---------------> <---------------> <---------------> <---------------> 53// r1 r2 r3 r4 54// 55// blockSize 56// <--------------------------------------------------------------> 57// r5 58 59// The Algorithm: 60// 61// 1) Consume input frames into r0 (r1 is zero-initialized). 62// 2) Position kernel centered at start of r0 (r2) and generate output frames until kernel is centered at start of r4. 63// or we've finished generating all the output frames. 64// 3) Copy r3 to r1 and r4 to r2. 65// 4) Consume input frames into r5 (zero-pad if we run out of input). 66// 5) Goto (2) until all of input is consumed. 67// 68// note: we're glossing over how the sub-sample handling works with m_virtualSourceIndex, etc. 69 70namespace blink { 71 72SincResampler::SincResampler(double scaleFactor, unsigned kernelSize, unsigned numberOfKernelOffsets) 73 : m_scaleFactor(scaleFactor) 74 , m_kernelSize(kernelSize) 75 , m_numberOfKernelOffsets(numberOfKernelOffsets) 76 , m_kernelStorage(m_kernelSize * (m_numberOfKernelOffsets + 1)) 77 , m_virtualSourceIndex(0) 78 , m_blockSize(512) 79 , m_inputBuffer(m_blockSize + m_kernelSize) // See input buffer layout above. 80 , m_source(0) 81 , m_sourceFramesAvailable(0) 82 , m_sourceProvider(0) 83 , m_isBufferPrimed(false) 84{ 85 initializeKernel(); 86} 87 88void SincResampler::initializeKernel() 89{ 90 // Blackman window parameters. 91 double alpha = 0.16; 92 double a0 = 0.5 * (1.0 - alpha); 93 double a1 = 0.5; 94 double a2 = 0.5 * alpha; 95 96 // sincScaleFactor is basically the normalized cutoff frequency of the low-pass filter. 97 double sincScaleFactor = m_scaleFactor > 1.0 ? 1.0 / m_scaleFactor : 1.0; 98 99 // The sinc function is an idealized brick-wall filter, but since we're windowing it the 100 // transition from pass to stop does not happen right away. So we should adjust the 101 // lowpass filter cutoff slightly downward to avoid some aliasing at the very high-end. 102 // FIXME: this value is empirical and to be more exact should vary depending on m_kernelSize. 103 sincScaleFactor *= 0.9; 104 105 int n = m_kernelSize; 106 int halfSize = n / 2; 107 108 // Generates a set of windowed sinc() kernels. 109 // We generate a range of sub-sample offsets from 0.0 to 1.0. 110 for (unsigned offsetIndex = 0; offsetIndex <= m_numberOfKernelOffsets; ++offsetIndex) { 111 double subsampleOffset = static_cast<double>(offsetIndex) / m_numberOfKernelOffsets; 112 113 for (int i = 0; i < n; ++i) { 114 // Compute the sinc() with offset. 115 double s = sincScaleFactor * piDouble * (i - halfSize - subsampleOffset); 116 double sinc = !s ? 1.0 : std::sin(s) / s; 117 sinc *= sincScaleFactor; 118 119 // Compute Blackman window, matching the offset of the sinc(). 120 double x = (i - subsampleOffset) / n; 121 double window = a0 - a1 * std::cos(twoPiDouble * x) + a2 * std::cos(twoPiDouble * 2.0 * x); 122 123 // Window the sinc() function and store at the correct offset. 124 m_kernelStorage[i + offsetIndex * m_kernelSize] = sinc * window; 125 } 126 } 127} 128 129void SincResampler::consumeSource(float* buffer, unsigned numberOfSourceFrames) 130{ 131 ASSERT(m_sourceProvider); 132 if (!m_sourceProvider) 133 return; 134 135 // Wrap the provided buffer by an AudioBus for use by the source provider. 136 RefPtr<AudioBus> bus = AudioBus::create(1, numberOfSourceFrames, false); 137 138 // FIXME: Find a way to make the following const-correct: 139 bus->setChannelMemory(0, buffer, numberOfSourceFrames); 140 141 m_sourceProvider->provideInput(bus.get(), numberOfSourceFrames); 142} 143 144namespace { 145 146// BufferSourceProvider is an AudioSourceProvider wrapping an in-memory buffer. 147 148class BufferSourceProvider FINAL : public AudioSourceProvider { 149public: 150 BufferSourceProvider(const float* source, size_t numberOfSourceFrames) 151 : m_source(source) 152 , m_sourceFramesAvailable(numberOfSourceFrames) 153 { 154 } 155 156 // Consumes samples from the in-memory buffer. 157 virtual void provideInput(AudioBus* bus, size_t framesToProcess) OVERRIDE 158 { 159 ASSERT(m_source && bus); 160 if (!m_source || !bus) 161 return; 162 163 float* buffer = bus->channel(0)->mutableData(); 164 165 // Clamp to number of frames available and zero-pad. 166 size_t framesToCopy = std::min(m_sourceFramesAvailable, framesToProcess); 167 memcpy(buffer, m_source, sizeof(float) * framesToCopy); 168 169 // Zero-pad if necessary. 170 if (framesToCopy < framesToProcess) 171 memset(buffer + framesToCopy, 0, sizeof(float) * (framesToProcess - framesToCopy)); 172 173 m_sourceFramesAvailable -= framesToCopy; 174 m_source += framesToCopy; 175 } 176 177private: 178 const float* m_source; 179 size_t m_sourceFramesAvailable; 180}; 181 182} // namespace 183 184void SincResampler::process(const float* source, float* destination, unsigned numberOfSourceFrames) 185{ 186 // Resample an in-memory buffer using an AudioSourceProvider. 187 BufferSourceProvider sourceProvider(source, numberOfSourceFrames); 188 189 unsigned numberOfDestinationFrames = static_cast<unsigned>(numberOfSourceFrames / m_scaleFactor); 190 unsigned remaining = numberOfDestinationFrames; 191 192 while (remaining) { 193 unsigned framesThisTime = std::min(remaining, m_blockSize); 194 process(&sourceProvider, destination, framesThisTime); 195 196 destination += framesThisTime; 197 remaining -= framesThisTime; 198 } 199} 200 201void SincResampler::process(AudioSourceProvider* sourceProvider, float* destination, size_t framesToProcess) 202{ 203 bool isGood = sourceProvider && m_blockSize > m_kernelSize && m_inputBuffer.size() >= m_blockSize + m_kernelSize && !(m_kernelSize % 2); 204 ASSERT(isGood); 205 if (!isGood) 206 return; 207 208 m_sourceProvider = sourceProvider; 209 210 unsigned numberOfDestinationFrames = framesToProcess; 211 212 // Setup various region pointers in the buffer (see diagram above). 213 float* r0 = m_inputBuffer.data() + m_kernelSize / 2; 214 float* r1 = m_inputBuffer.data(); 215 float* r2 = r0; 216 float* r3 = r0 + m_blockSize - m_kernelSize / 2; 217 float* r4 = r0 + m_blockSize; 218 float* r5 = r0 + m_kernelSize / 2; 219 220 // Step (1) 221 // Prime the input buffer at the start of the input stream. 222 if (!m_isBufferPrimed) { 223 consumeSource(r0, m_blockSize + m_kernelSize / 2); 224 m_isBufferPrimed = true; 225 } 226 227 // Step (2) 228 229 while (numberOfDestinationFrames) { 230 while (m_virtualSourceIndex < m_blockSize) { 231 // m_virtualSourceIndex lies in between two kernel offsets so figure out what they are. 232 int sourceIndexI = static_cast<int>(m_virtualSourceIndex); 233 double subsampleRemainder = m_virtualSourceIndex - sourceIndexI; 234 235 double virtualOffsetIndex = subsampleRemainder * m_numberOfKernelOffsets; 236 int offsetIndex = static_cast<int>(virtualOffsetIndex); 237 238 float* k1 = m_kernelStorage.data() + offsetIndex * m_kernelSize; 239 float* k2 = k1 + m_kernelSize; 240 241 // Initialize input pointer based on quantized m_virtualSourceIndex. 242 float* inputP = r1 + sourceIndexI; 243 244 // We'll compute "convolutions" for the two kernels which straddle m_virtualSourceIndex 245 float sum1 = 0; 246 float sum2 = 0; 247 248 // Figure out how much to weight each kernel's "convolution". 249 double kernelInterpolationFactor = virtualOffsetIndex - offsetIndex; 250 251 // Generate a single output sample. 252 int n = m_kernelSize; 253 254#define CONVOLVE_ONE_SAMPLE \ 255 input = *inputP++; \ 256 sum1 += input * *k1; \ 257 sum2 += input * *k2; \ 258 ++k1; \ 259 ++k2; 260 261 { 262 float input; 263 264#if CPU(X86) || CPU(X86_64) 265 // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed seperately. 266 while ((reinterpret_cast<uintptr_t>(inputP) & 0x0F) && n) { 267 CONVOLVE_ONE_SAMPLE 268 n--; 269 } 270 271 // Now the inputP is aligned and start to apply SSE. 272 float* endP = inputP + n - n % 4; 273 __m128 mInput; 274 __m128 mK1; 275 __m128 mK2; 276 __m128 mul1; 277 __m128 mul2; 278 279 __m128 sums1 = _mm_setzero_ps(); 280 __m128 sums2 = _mm_setzero_ps(); 281 bool k1Aligned = !(reinterpret_cast<uintptr_t>(k1) & 0x0F); 282 bool k2Aligned = !(reinterpret_cast<uintptr_t>(k2) & 0x0F); 283 284#define LOAD_DATA(l1, l2) \ 285 mInput = _mm_load_ps(inputP); \ 286 mK1 = _mm_##l1##_ps(k1); \ 287 mK2 = _mm_##l2##_ps(k2); 288 289#define CONVOLVE_4_SAMPLES \ 290 mul1 = _mm_mul_ps(mInput, mK1); \ 291 mul2 = _mm_mul_ps(mInput, mK2); \ 292 sums1 = _mm_add_ps(sums1, mul1); \ 293 sums2 = _mm_add_ps(sums2, mul2); \ 294 inputP += 4; \ 295 k1 += 4; \ 296 k2 += 4; 297 298 if (k1Aligned && k2Aligned) { // both aligned 299 while (inputP < endP) { 300 LOAD_DATA(load, load) 301 CONVOLVE_4_SAMPLES 302 } 303 } else if (!k1Aligned && k2Aligned) { // only k2 aligned 304 while (inputP < endP) { 305 LOAD_DATA(loadu, load) 306 CONVOLVE_4_SAMPLES 307 } 308 } else if (k1Aligned && !k2Aligned) { // only k1 aligned 309 while (inputP < endP) { 310 LOAD_DATA(load, loadu) 311 CONVOLVE_4_SAMPLES 312 } 313 } else { // both non-aligned 314 while (inputP < endP) { 315 LOAD_DATA(loadu, loadu) 316 CONVOLVE_4_SAMPLES 317 } 318 } 319 320 // Summarize the SSE results to sum1 and sum2. 321 float* groupSumP = reinterpret_cast<float*>(&sums1); 322 sum1 += groupSumP[0] + groupSumP[1] + groupSumP[2] + groupSumP[3]; 323 groupSumP = reinterpret_cast<float*>(&sums2); 324 sum2 += groupSumP[0] + groupSumP[1] + groupSumP[2] + groupSumP[3]; 325 326 n %= 4; 327 while (n) { 328 CONVOLVE_ONE_SAMPLE 329 n--; 330 } 331#else 332 // FIXME: add ARM NEON optimizations for the following. The scalar code-path can probably also be optimized better. 333 334 // Optimize size 32 and size 64 kernels by unrolling the while loop. 335 // A 20 - 30% speed improvement was measured in some cases by using this approach. 336 337 if (n == 32) { 338 CONVOLVE_ONE_SAMPLE // 1 339 CONVOLVE_ONE_SAMPLE // 2 340 CONVOLVE_ONE_SAMPLE // 3 341 CONVOLVE_ONE_SAMPLE // 4 342 CONVOLVE_ONE_SAMPLE // 5 343 CONVOLVE_ONE_SAMPLE // 6 344 CONVOLVE_ONE_SAMPLE // 7 345 CONVOLVE_ONE_SAMPLE // 8 346 CONVOLVE_ONE_SAMPLE // 9 347 CONVOLVE_ONE_SAMPLE // 10 348 CONVOLVE_ONE_SAMPLE // 11 349 CONVOLVE_ONE_SAMPLE // 12 350 CONVOLVE_ONE_SAMPLE // 13 351 CONVOLVE_ONE_SAMPLE // 14 352 CONVOLVE_ONE_SAMPLE // 15 353 CONVOLVE_ONE_SAMPLE // 16 354 CONVOLVE_ONE_SAMPLE // 17 355 CONVOLVE_ONE_SAMPLE // 18 356 CONVOLVE_ONE_SAMPLE // 19 357 CONVOLVE_ONE_SAMPLE // 20 358 CONVOLVE_ONE_SAMPLE // 21 359 CONVOLVE_ONE_SAMPLE // 22 360 CONVOLVE_ONE_SAMPLE // 23 361 CONVOLVE_ONE_SAMPLE // 24 362 CONVOLVE_ONE_SAMPLE // 25 363 CONVOLVE_ONE_SAMPLE // 26 364 CONVOLVE_ONE_SAMPLE // 27 365 CONVOLVE_ONE_SAMPLE // 28 366 CONVOLVE_ONE_SAMPLE // 29 367 CONVOLVE_ONE_SAMPLE // 30 368 CONVOLVE_ONE_SAMPLE // 31 369 CONVOLVE_ONE_SAMPLE // 32 370 } else if (n == 64) { 371 CONVOLVE_ONE_SAMPLE // 1 372 CONVOLVE_ONE_SAMPLE // 2 373 CONVOLVE_ONE_SAMPLE // 3 374 CONVOLVE_ONE_SAMPLE // 4 375 CONVOLVE_ONE_SAMPLE // 5 376 CONVOLVE_ONE_SAMPLE // 6 377 CONVOLVE_ONE_SAMPLE // 7 378 CONVOLVE_ONE_SAMPLE // 8 379 CONVOLVE_ONE_SAMPLE // 9 380 CONVOLVE_ONE_SAMPLE // 10 381 CONVOLVE_ONE_SAMPLE // 11 382 CONVOLVE_ONE_SAMPLE // 12 383 CONVOLVE_ONE_SAMPLE // 13 384 CONVOLVE_ONE_SAMPLE // 14 385 CONVOLVE_ONE_SAMPLE // 15 386 CONVOLVE_ONE_SAMPLE // 16 387 CONVOLVE_ONE_SAMPLE // 17 388 CONVOLVE_ONE_SAMPLE // 18 389 CONVOLVE_ONE_SAMPLE // 19 390 CONVOLVE_ONE_SAMPLE // 20 391 CONVOLVE_ONE_SAMPLE // 21 392 CONVOLVE_ONE_SAMPLE // 22 393 CONVOLVE_ONE_SAMPLE // 23 394 CONVOLVE_ONE_SAMPLE // 24 395 CONVOLVE_ONE_SAMPLE // 25 396 CONVOLVE_ONE_SAMPLE // 26 397 CONVOLVE_ONE_SAMPLE // 27 398 CONVOLVE_ONE_SAMPLE // 28 399 CONVOLVE_ONE_SAMPLE // 29 400 CONVOLVE_ONE_SAMPLE // 30 401 CONVOLVE_ONE_SAMPLE // 31 402 CONVOLVE_ONE_SAMPLE // 32 403 CONVOLVE_ONE_SAMPLE // 33 404 CONVOLVE_ONE_SAMPLE // 34 405 CONVOLVE_ONE_SAMPLE // 35 406 CONVOLVE_ONE_SAMPLE // 36 407 CONVOLVE_ONE_SAMPLE // 37 408 CONVOLVE_ONE_SAMPLE // 38 409 CONVOLVE_ONE_SAMPLE // 39 410 CONVOLVE_ONE_SAMPLE // 40 411 CONVOLVE_ONE_SAMPLE // 41 412 CONVOLVE_ONE_SAMPLE // 42 413 CONVOLVE_ONE_SAMPLE // 43 414 CONVOLVE_ONE_SAMPLE // 44 415 CONVOLVE_ONE_SAMPLE // 45 416 CONVOLVE_ONE_SAMPLE // 46 417 CONVOLVE_ONE_SAMPLE // 47 418 CONVOLVE_ONE_SAMPLE // 48 419 CONVOLVE_ONE_SAMPLE // 49 420 CONVOLVE_ONE_SAMPLE // 50 421 CONVOLVE_ONE_SAMPLE // 51 422 CONVOLVE_ONE_SAMPLE // 52 423 CONVOLVE_ONE_SAMPLE // 53 424 CONVOLVE_ONE_SAMPLE // 54 425 CONVOLVE_ONE_SAMPLE // 55 426 CONVOLVE_ONE_SAMPLE // 56 427 CONVOLVE_ONE_SAMPLE // 57 428 CONVOLVE_ONE_SAMPLE // 58 429 CONVOLVE_ONE_SAMPLE // 59 430 CONVOLVE_ONE_SAMPLE // 60 431 CONVOLVE_ONE_SAMPLE // 61 432 CONVOLVE_ONE_SAMPLE // 62 433 CONVOLVE_ONE_SAMPLE // 63 434 CONVOLVE_ONE_SAMPLE // 64 435 } else { 436 while (n--) { 437 // Non-optimized using actual while loop. 438 CONVOLVE_ONE_SAMPLE 439 } 440 } 441#endif 442 } 443 444 // Linearly interpolate the two "convolutions". 445 double result = (1.0 - kernelInterpolationFactor) * sum1 + kernelInterpolationFactor * sum2; 446 447 *destination++ = result; 448 449 // Advance the virtual index. 450 m_virtualSourceIndex += m_scaleFactor; 451 452 --numberOfDestinationFrames; 453 if (!numberOfDestinationFrames) 454 return; 455 } 456 457 // Wrap back around to the start. 458 m_virtualSourceIndex -= m_blockSize; 459 460 // Step (3) Copy r3 to r1 and r4 to r2. 461 // This wraps the last input frames back to the start of the buffer. 462 memcpy(r1, r3, sizeof(float) * (m_kernelSize / 2)); 463 memcpy(r2, r4, sizeof(float) * (m_kernelSize / 2)); 464 465 // Step (4) 466 // Refresh the buffer with more input. 467 consumeSource(r5, m_blockSize); 468 } 469} 470 471} // namespace blink 472 473#endif // ENABLE(WEB_AUDIO) 474