/*
 * Copyright (C) 2010, Google Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1.  Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2.  Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
245c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
255c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#include "config.h"
265c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
275c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#if ENABLE(WEB_AUDIO)
285c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
291e202183a5dc46166763171984b285173f8585e5Torne (Richard Coles)#include "platform/audio/VectorMath.h"
30f5e4ad553afbc08dd2e729bb77e937a9a94d5827Torne (Richard Coles)#include "wtf/Assertions.h"
318abfc5808a4e34d6e03867af8bc440dee641886fTorne (Richard Coles)#include "wtf/CPU.h"
32c0e19a689c8ac22cdc96b291a8d33a5d3b0b34a4Torne (Richard Coles)#include <stdint.h>
33f5e4ad553afbc08dd2e729bb77e937a9a94d5827Torne (Richard Coles)
348abfc5808a4e34d6e03867af8bc440dee641886fTorne (Richard Coles)#if OS(MACOSX)
355c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#include <Accelerate/Accelerate.h>
365c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#endif
375c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
38f6b7aed3f7ce69aca0d7a032d144cbd088b04393Torne (Richard Coles)#if CPU(X86) || CPU(X86_64)
395c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#include <emmintrin.h>
405c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#endif
415c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
425c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#if HAVE(ARM_NEON_INTRINSICS)
435c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#include <arm_neon.h>
445c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#endif
455c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
465c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#include <math.h>
4753e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles)#include <algorithm>
485c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
49c1847b1379d12d0e05df27436bf19a9b1bf12deaTorne (Richard Coles)namespace blink {
505c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
515c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)namespace VectorMath {
525c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
538abfc5808a4e34d6e03867af8bc440dee641886fTorne (Richard Coles)#if OS(MACOSX)
// On the Mac we use the highly optimized versions in Accelerate.framework.
// In 32-bit mode (__ppc__ or __i386__), <Accelerate/Accelerate.h> includes <vecLib/vDSP_translate.h>, which defines macros with the same names as
// our namespaced functions, so we must handle that case differently. Other architectures (64-bit, ARM, etc.) do not include this header file.
575c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
585c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)void vsmul(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
595c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles){
608abfc5808a4e34d6e03867af8bc440dee641886fTorne (Richard Coles)#if CPU(X86)
615c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    ::vsmul(sourceP, sourceStride, scale, destP, destStride, framesToProcess);
625c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#else
635c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    vDSP_vsmul(sourceP, sourceStride, scale, destP, destStride, framesToProcess);
645c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#endif
655c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)}
665c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
675c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)void vadd(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
685c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles){
698abfc5808a4e34d6e03867af8bc440dee641886fTorne (Richard Coles)#if CPU(X86)
705c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    ::vadd(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess);
715c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#else
725c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    vDSP_vadd(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess);
735c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#endif
745c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)}
755c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
765c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)void vmul(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
775c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles){
788abfc5808a4e34d6e03867af8bc440dee641886fTorne (Richard Coles)#if CPU(X86)
795c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    ::vmul(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess);
805c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#else
815c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    vDSP_vmul(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess);
825c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#endif
835c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)}
845c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
855c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)void zvmul(const float* real1P, const float* imag1P, const float* real2P, const float* imag2P, float* realDestP, float* imagDestP, size_t framesToProcess)
865c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles){
875c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    DSPSplitComplex sc1;
885c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    DSPSplitComplex sc2;
895c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    DSPSplitComplex dest;
905c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    sc1.realp = const_cast<float*>(real1P);
915c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    sc1.imagp = const_cast<float*>(imag1P);
925c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    sc2.realp = const_cast<float*>(real2P);
935c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    sc2.imagp = const_cast<float*>(imag2P);
945c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    dest.realp = realDestP;
955c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    dest.imagp = imagDestP;
968abfc5808a4e34d6e03867af8bc440dee641886fTorne (Richard Coles)#if CPU(X86)
975c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    ::zvmul(&sc1, 1, &sc2, 1, &dest, 1, framesToProcess, 1);
985c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#else
995c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    vDSP_zvmul(&sc1, 1, &sc2, 1, &dest, 1, framesToProcess, 1);
1005c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#endif
1015c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)}
1025c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
1035c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)void vsma(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
1045c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles){
1055c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    vDSP_vsma(sourceP, sourceStride, scale, destP, destStride, destP, destStride, framesToProcess);
1065c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)}
1075c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
1085c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)void vmaxmgv(const float* sourceP, int sourceStride, float* maxP, size_t framesToProcess)
1095c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles){
1105c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    vDSP_maxmgv(sourceP, sourceStride, maxP, framesToProcess);
1115c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)}
1125c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
1135c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)void vsvesq(const float* sourceP, int sourceStride, float* sumP, size_t framesToProcess)
1145c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles){
1155c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    vDSP_svesq(const_cast<float*>(sourceP), sourceStride, sumP, framesToProcess);
1165c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)}
1175c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
1185c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)void vclip(const float* sourceP, int sourceStride, const float* lowThresholdP, const float* highThresholdP, float* destP, int destStride, size_t framesToProcess)
1195c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles){
1205c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    vDSP_vclip(const_cast<float*>(sourceP), sourceStride, const_cast<float*>(lowThresholdP), const_cast<float*>(highThresholdP), destP, destStride, framesToProcess);
1215c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)}
1225c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#else
1235c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
1245c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)void vsma(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
1255c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles){
1265c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    int n = framesToProcess;
1275c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
128f6b7aed3f7ce69aca0d7a032d144cbd088b04393Torne (Richard Coles)#if CPU(X86) || CPU(X86_64)
1295c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    if ((sourceStride == 1) && (destStride == 1)) {
1305c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        float k = *scale;
1315c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
1325c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
1335c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) {
1345c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            *destP += k * *sourceP;
1355c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            sourceP++;
1365c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            destP++;
1375c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            n--;
1385c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        }
1395c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
1405c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        // Now the sourceP is aligned, use SSE.
1415c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        int tailFrames = n % 4;
1425c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        const float* endP = destP + n - tailFrames;
1435c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
1445c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        __m128 pSource;
1455c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        __m128 dest;
1465c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        __m128 temp;
1475c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        __m128 mScale = _mm_set_ps1(k);
1485c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
1495c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        bool destAligned = !(reinterpret_cast<uintptr_t>(destP) & 0x0F);
1505c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
1515c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#define SSE2_MULT_ADD(loadInstr, storeInstr)        \
1525c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            while (destP < endP)                    \
1535c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            {                                       \
1545c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                pSource = _mm_load_ps(sourceP);     \
1555c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                temp = _mm_mul_ps(pSource, mScale); \
1565c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                dest = _mm_##loadInstr##_ps(destP); \
1575c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                dest = _mm_add_ps(dest, temp);      \
1585c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                _mm_##storeInstr##_ps(destP, dest); \
1595c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                sourceP += 4;                       \
1605c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                destP += 4;                         \
1615c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            }
1625c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
16302772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch        if (destAligned)
1645c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            SSE2_MULT_ADD(load, store)
16502772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch        else
1665c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            SSE2_MULT_ADD(loadu, storeu)
1675c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
1685c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        n = tailFrames;
1695c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    }
1705c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#elif HAVE(ARM_NEON_INTRINSICS)
1715c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    if ((sourceStride == 1) && (destStride == 1)) {
1725c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        int tailFrames = n % 4;
1735c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        const float* endP = destP + n - tailFrames;
1745c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
1755c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        float32x4_t k = vdupq_n_f32(*scale);
1765c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        while (destP < endP) {
1775c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            float32x4_t source = vld1q_f32(sourceP);
1785c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            float32x4_t dest = vld1q_f32(destP);
1795c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
1805c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            dest = vmlaq_f32(dest, source, k);
1815c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            vst1q_f32(destP, dest);
1825c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
1835c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            sourceP += 4;
1845c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            destP += 4;
1855c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        }
1865c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        n = tailFrames;
1875c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    }
1885c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#endif
1895c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    while (n) {
1905c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        *destP += *sourceP * *scale;
1915c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        sourceP += sourceStride;
1925c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        destP += destStride;
1935c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        n--;
1945c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    }
1955c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)}
1965c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
1975c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)void vsmul(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
1985c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles){
1995c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    int n = framesToProcess;
2005c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
201f6b7aed3f7ce69aca0d7a032d144cbd088b04393Torne (Richard Coles)#if CPU(X86) || CPU(X86_64)
2025c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    if ((sourceStride == 1) && (destStride == 1)) {
2035c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        float k = *scale;
2045c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
2055c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
2065c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        while ((reinterpret_cast<size_t>(sourceP) & 0x0F) && n) {
2075c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            *destP = k * *sourceP;
2085c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            sourceP++;
2095c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            destP++;
2105c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            n--;
2115c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        }
2125c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
2135c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        // Now the sourceP address is aligned and start to apply SSE.
2145c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        int group = n / 4;
2155c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        __m128 mScale = _mm_set_ps1(k);
2165c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        __m128* pSource;
2175c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        __m128* pDest;
2185c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        __m128 dest;
2195c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
2205c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
2215c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        if (reinterpret_cast<size_t>(destP) & 0x0F) {
2225c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            while (group--) {
2235c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                pSource = reinterpret_cast<__m128*>(const_cast<float*>(sourceP));
2245c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                dest = _mm_mul_ps(*pSource, mScale);
2255c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                _mm_storeu_ps(destP, dest);
2265c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
2275c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                sourceP += 4;
2285c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                destP += 4;
2295c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            }
2305c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        } else {
2315c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            while (group--) {
2325c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                pSource = reinterpret_cast<__m128*>(const_cast<float*>(sourceP));
2335c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                pDest = reinterpret_cast<__m128*>(destP);
2345c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                *pDest = _mm_mul_ps(*pSource, mScale);
2355c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
2365c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                sourceP += 4;
2375c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                destP += 4;
2385c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            }
2395c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        }
2405c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
2415c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        // Non-SSE handling for remaining frames which is less than 4.
2425c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        n %= 4;
2435c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        while (n) {
2445c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            *destP = k * *sourceP;
2455c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            sourceP++;
2465c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            destP++;
2475c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            n--;
2485c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        }
2495c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    } else { // If strides are not 1, rollback to normal algorithm.
2505c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#elif HAVE(ARM_NEON_INTRINSICS)
2515c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    if ((sourceStride == 1) && (destStride == 1)) {
2525c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        float k = *scale;
2535c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        int tailFrames = n % 4;
2545c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        const float* endP = destP + n - tailFrames;
2555c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
2565c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        while (destP < endP) {
2575c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            float32x4_t source = vld1q_f32(sourceP);
2585c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            vst1q_f32(destP, vmulq_n_f32(source, k));
2595c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
2605c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            sourceP += 4;
2615c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            destP += 4;
2625c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        }
2635c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        n = tailFrames;
2645c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    }
2655c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#endif
2665c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    float k = *scale;
2675c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    while (n--) {
2685c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        *destP = k * *sourceP;
2695c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        sourceP += sourceStride;
2705c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        destP += destStride;
2715c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    }
272f6b7aed3f7ce69aca0d7a032d144cbd088b04393Torne (Richard Coles)#if CPU(X86) || CPU(X86_64)
2735c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    }
2745c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#endif
2755c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)}
2765c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
2775c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)void vadd(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
2785c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles){
2795c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    int n = framesToProcess;
2805c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
281f6b7aed3f7ce69aca0d7a032d144cbd088b04393Torne (Richard Coles)#if CPU(X86) || CPU(X86_64)
2825c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    if ((sourceStride1 ==1) && (sourceStride2 == 1) && (destStride == 1)) {
2835c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
2845c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        while ((reinterpret_cast<size_t>(source1P) & 0x0F) && n) {
2855c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            *destP = *source1P + *source2P;
2865c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            source1P++;
2875c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            source2P++;
2885c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            destP++;
2895c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            n--;
2905c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        }
2915c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
2925c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        // Now the source1P address is aligned and start to apply SSE.
2935c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        int group = n / 4;
2945c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        __m128* pSource1;
2955c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        __m128* pSource2;
2965c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        __m128* pDest;
2975c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        __m128 source2;
2985c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        __m128 dest;
2995c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
3005c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        bool source2Aligned = !(reinterpret_cast<size_t>(source2P) & 0x0F);
3015c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        bool destAligned = !(reinterpret_cast<size_t>(destP) & 0x0F);
3025c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
3035c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        if (source2Aligned && destAligned) { // all aligned
3045c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            while (group--) {
3055c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
3065c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                pSource2 = reinterpret_cast<__m128*>(const_cast<float*>(source2P));
3075c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                pDest = reinterpret_cast<__m128*>(destP);
3085c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                *pDest = _mm_add_ps(*pSource1, *pSource2);
3095c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
3105c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                source1P += 4;
3115c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                source2P += 4;
3125c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                destP += 4;
3135c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            }
3145c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
31502772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch        } else if (source2Aligned && !destAligned) { // source2 aligned but dest not aligned
3165c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            while (group--) {
3175c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
3185c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                pSource2 = reinterpret_cast<__m128*>(const_cast<float*>(source2P));
3195c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                dest = _mm_add_ps(*pSource1, *pSource2);
3205c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                _mm_storeu_ps(destP, dest);
3215c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
3225c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                source1P += 4;
3235c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                source2P += 4;
3245c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                destP += 4;
3255c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            }
3265c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
32702772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch        } else if (!source2Aligned && destAligned) { // source2 not aligned but dest aligned
3285c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            while (group--) {
3295c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
3305c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                source2 = _mm_loadu_ps(source2P);
3315c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                pDest = reinterpret_cast<__m128*>(destP);
3325c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                *pDest = _mm_add_ps(*pSource1, source2);
3335c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
3345c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                source1P += 4;
3355c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                source2P += 4;
3365c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                destP += 4;
3375c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            }
33802772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch        } else if (!source2Aligned && !destAligned) { // both source2 and dest not aligned
3395c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            while (group--) {
3405c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
3415c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                source2 = _mm_loadu_ps(source2P);
3425c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                dest = _mm_add_ps(*pSource1, source2);
3435c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                _mm_storeu_ps(destP, dest);
3445c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
3455c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                source1P += 4;
3465c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                source2P += 4;
3475c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                destP += 4;
3485c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            }
3495c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        }
3505c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
3515c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        // Non-SSE handling for remaining frames which is less than 4.
3525c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        n %= 4;
3535c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        while (n) {
3545c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            *destP = *source1P + *source2P;
3555c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            source1P++;
3565c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            source2P++;
3575c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            destP++;
3585c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            n--;
3595c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        }
3605c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    } else { // if strides are not 1, rollback to normal algorithm
3615c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#elif HAVE(ARM_NEON_INTRINSICS)
3625c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    if ((sourceStride1 ==1) && (sourceStride2 == 1) && (destStride == 1)) {
3635c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        int tailFrames = n % 4;
3645c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        const float* endP = destP + n - tailFrames;
3655c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
3665c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        while (destP < endP) {
3675c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            float32x4_t source1 = vld1q_f32(source1P);
3685c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            float32x4_t source2 = vld1q_f32(source2P);
3695c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            vst1q_f32(destP, vaddq_f32(source1, source2));
3705c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
3715c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            source1P += 4;
3725c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            source2P += 4;
3735c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            destP += 4;
3745c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        }
3755c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        n = tailFrames;
3765c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    }
3775c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#endif
3785c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    while (n--) {
3795c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        *destP = *source1P + *source2P;
3805c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        source1P += sourceStride1;
3815c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        source2P += sourceStride2;
3825c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        destP += destStride;
3835c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    }
384f6b7aed3f7ce69aca0d7a032d144cbd088b04393Torne (Richard Coles)#if CPU(X86) || CPU(X86_64)
3855c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    }
3865c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#endif
3875c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)}
3885c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
3895c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)void vmul(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
3905c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles){
3915c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
3925c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    int n = framesToProcess;
3935c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
394f6b7aed3f7ce69aca0d7a032d144cbd088b04393Torne (Richard Coles)#if CPU(X86) || CPU(X86_64)
3955c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
3965c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        // If the source1P address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
3975c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        while ((reinterpret_cast<uintptr_t>(source1P) & 0x0F) && n) {
3985c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            *destP = *source1P * *source2P;
3995c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            source1P++;
4005c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            source2P++;
4015c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            destP++;
4025c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            n--;
4035c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        }
4045c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
4055c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        // Now the source1P address aligned and start to apply SSE.
4065c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        int tailFrames = n % 4;
4075c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        const float* endP = destP + n - tailFrames;
4085c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        __m128 pSource1;
4095c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        __m128 pSource2;
4105c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        __m128 dest;
4115c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
4125c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        bool source2Aligned = !(reinterpret_cast<uintptr_t>(source2P) & 0x0F);
4135c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        bool destAligned = !(reinterpret_cast<uintptr_t>(destP) & 0x0F);
4145c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
4155c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#define SSE2_MULT(loadInstr, storeInstr)                   \
4165c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            while (destP < endP)                           \
4175c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            {                                              \
4185c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                pSource1 = _mm_load_ps(source1P);          \
4195c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                pSource2 = _mm_##loadInstr##_ps(source2P); \
4205c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                dest = _mm_mul_ps(pSource1, pSource2);     \
4215c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                _mm_##storeInstr##_ps(destP, dest);        \
4225c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                source1P += 4;                             \
4235c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                source2P += 4;                             \
4245c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)                destP += 4;                                \
4255c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            }
4265c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
4275c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        if (source2Aligned && destAligned) // Both aligned.
4285c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            SSE2_MULT(load, store)
4295c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        else if (source2Aligned && !destAligned) // Source2 is aligned but dest not.
4305c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            SSE2_MULT(load, storeu)
4315c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        else if (!source2Aligned && destAligned) // Dest is aligned but source2 not.
4325c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            SSE2_MULT(loadu, store)
4335c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        else // Neither aligned.
4345c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            SSE2_MULT(loadu, storeu)
4355c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
4365c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        n = tailFrames;
4375c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    }
4385c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#elif HAVE(ARM_NEON_INTRINSICS)
4395c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    if ((sourceStride1 ==1) && (sourceStride2 == 1) && (destStride == 1)) {
4405c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        int tailFrames = n % 4;
4415c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        const float* endP = destP + n - tailFrames;
4425c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
4435c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        while (destP < endP) {
4445c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            float32x4_t source1 = vld1q_f32(source1P);
4455c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            float32x4_t source2 = vld1q_f32(source2P);
4465c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            vst1q_f32(destP, vmulq_f32(source1, source2));
4475c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
4485c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            source1P += 4;
4495c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            source2P += 4;
4505c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            destP += 4;
4515c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        }
4525c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        n = tailFrames;
4535c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    }
4545c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#endif
4555c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    while (n) {
4565c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        *destP = *source1P * *source2P;
4575c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        source1P += sourceStride1;
4585c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        source2P += sourceStride2;
4595c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        destP += destStride;
4605c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        n--;
4615c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    }
4625c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)}
4635c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
4645c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)void zvmul(const float* real1P, const float* imag1P, const float* real2P, const float* imag2P, float* realDestP, float* imagDestP, size_t framesToProcess)
4655c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles){
4665c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    unsigned i = 0;
467f6b7aed3f7ce69aca0d7a032d144cbd088b04393Torne (Richard Coles)#if CPU(X86) || CPU(X86_64)
46802772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch    // Only use the SSE optimization in the very common case that all addresses are 16-byte aligned.
4695c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    // Otherwise, fall through to the scalar code below.
4705c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    if (!(reinterpret_cast<uintptr_t>(real1P) & 0x0F)
4715c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        && !(reinterpret_cast<uintptr_t>(imag1P) & 0x0F)
4725c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        && !(reinterpret_cast<uintptr_t>(real2P) & 0x0F)
4735c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        && !(reinterpret_cast<uintptr_t>(imag2P) & 0x0F)
4745c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        && !(reinterpret_cast<uintptr_t>(realDestP) & 0x0F)
4755c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        && !(reinterpret_cast<uintptr_t>(imagDestP) & 0x0F)) {
47602772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch
4775c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        unsigned endSize = framesToProcess - framesToProcess % 4;
4785c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        while (i < endSize) {
4795c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            __m128 real1 = _mm_load_ps(real1P + i);
4805c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            __m128 real2 = _mm_load_ps(real2P + i);
4815c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            __m128 imag1 = _mm_load_ps(imag1P + i);
4825c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            __m128 imag2 = _mm_load_ps(imag2P + i);
4835c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            __m128 real = _mm_mul_ps(real1, real2);
4845c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            real = _mm_sub_ps(real, _mm_mul_ps(imag1, imag2));
4855c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            __m128 imag = _mm_mul_ps(real1, imag2);
4865c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            imag = _mm_add_ps(imag, _mm_mul_ps(imag1, real2));
4875c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            _mm_store_ps(realDestP + i, real);
4885c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            _mm_store_ps(imagDestP + i, imag);
4895c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            i += 4;
4905c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        }
4915c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    }
4925c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#elif HAVE(ARM_NEON_INTRINSICS)
4935c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        unsigned endSize = framesToProcess - framesToProcess % 4;
4945c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        while (i < endSize) {
4955c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            float32x4_t real1 = vld1q_f32(real1P + i);
4965c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            float32x4_t real2 = vld1q_f32(real2P + i);
4975c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            float32x4_t imag1 = vld1q_f32(imag1P + i);
4985c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            float32x4_t imag2 = vld1q_f32(imag2P + i);
4995c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
5005c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            float32x4_t realResult = vmlsq_f32(vmulq_f32(real1, real2), imag1, imag2);
5015c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            float32x4_t imagResult = vmlaq_f32(vmulq_f32(real1, imag2), imag1, real2);
5025c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
5035c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            vst1q_f32(realDestP + i, realResult);
5045c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            vst1q_f32(imagDestP + i, imagResult);
5055c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
5065c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            i += 4;
5075c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        }
5085c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#endif
5095c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    for (; i < framesToProcess; ++i) {
5105c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        // Read and compute result before storing them, in case the
5115c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        // destination is the same as one of the sources.
5125c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        float realResult = real1P[i] * real2P[i] - imag1P[i] * imag2P[i];
5135c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        float imagResult = real1P[i] * imag2P[i] + imag1P[i] * real2P[i];
5145c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
5155c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        realDestP[i] = realResult;
5165c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        imagDestP[i] = imagResult;
5175c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    }
5185c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)}
5195c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
5205c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)void vsvesq(const float* sourceP, int sourceStride, float* sumP, size_t framesToProcess)
5215c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles){
5225c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    int n = framesToProcess;
5235c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    float sum = 0;
5245c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
525f6b7aed3f7ce69aca0d7a032d144cbd088b04393Torne (Richard Coles)#if CPU(X86) || CPU(X86_64)
52602772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch    if (sourceStride == 1) {
52702772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch        // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
52802772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch        while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) {
52902772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch            float sample = *sourceP;
53002772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch            sum += sample * sample;
53102772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch            sourceP++;
53202772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch            n--;
53302772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch        }
53402772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch
5355c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        // Now the sourceP is aligned, use SSE.
53602772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch        int tailFrames = n % 4;
53702772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch        const float* endP = sourceP + n - tailFrames;
53802772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch        __m128 source;
53902772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch        __m128 mSum = _mm_setzero_ps();
54002772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch
54102772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch        while (sourceP < endP) {
54202772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch            source = _mm_load_ps(sourceP);
54302772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch            source = _mm_mul_ps(source, source);
54402772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch            mSum = _mm_add_ps(mSum, source);
54502772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch            sourceP += 4;
54602772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch        }
54702772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch
54802772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch        // Summarize the SSE results.
54902772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch        const float* groupSumP = reinterpret_cast<float*>(&mSum);
55002772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch        sum += groupSumP[0] + groupSumP[1] + groupSumP[2] + groupSumP[3];
55102772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch
55202772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch        n = tailFrames;
55302772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch    }
5545c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#elif HAVE(ARM_NEON_INTRINSICS)
5555c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    if (sourceStride == 1) {
5565c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        int tailFrames = n % 4;
5575c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        const float* endP = sourceP + n - tailFrames;
5585c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
5595c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        float32x4_t fourSum = vdupq_n_f32(0);
5605c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        while (sourceP < endP) {
5615c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            float32x4_t source = vld1q_f32(sourceP);
5625c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            fourSum = vmlaq_f32(fourSum, source, source);
5635c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            sourceP += 4;
5645c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        }
5655c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        float32x2_t twoSum = vadd_f32(vget_low_f32(fourSum), vget_high_f32(fourSum));
5665c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
5675c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        float groupSum[2];
5685c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        vst1_f32(groupSum, twoSum);
5695c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        sum += groupSum[0] + groupSum[1];
5705c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
5715c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        n = tailFrames;
5725c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    }
5735c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#endif
5745c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
5755c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    while (n--) {
5765c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        float sample = *sourceP;
5775c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        sum += sample * sample;
5785c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        sourceP += sourceStride;
5795c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    }
5805c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
5815c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    ASSERT(sumP);
5825c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    *sumP = sum;
5835c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)}
5845c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
5855c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)void vmaxmgv(const float* sourceP, int sourceStride, float* maxP, size_t framesToProcess)
5865c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles){
5875c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    int n = framesToProcess;
5885c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    float max = 0;
5895c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
590f6b7aed3f7ce69aca0d7a032d144cbd088b04393Torne (Richard Coles)#if CPU(X86) || CPU(X86_64)
5915c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    if (sourceStride == 1) {
5925c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
5935c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) {
5945c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            max = std::max(max, fabsf(*sourceP));
5955c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            sourceP++;
5965c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            n--;
5975c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        }
5985c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
5995c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        // Now the sourceP is aligned, use SSE.
6005c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        int tailFrames = n % 4;
6015c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        const float* endP = sourceP + n - tailFrames;
6025c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        __m128 source;
6035c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        __m128 mMax = _mm_setzero_ps();
6045c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        int mask = 0x7FFFFFFF;
6055c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        __m128 mMask = _mm_set1_ps(*reinterpret_cast<float*>(&mask));
6065c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
6075c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        while (sourceP < endP) {
6085c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            source = _mm_load_ps(sourceP);
6095c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            // Calculate the absolute value by anding source with mask, the sign bit is set to 0.
6105c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            source = _mm_and_ps(source, mMask);
6115c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            mMax = _mm_max_ps(mMax, source);
6125c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            sourceP += 4;
6135c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        }
6145c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
6155c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        // Get max from the SSE results.
6165c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        const float* groupMaxP = reinterpret_cast<float*>(&mMax);
6175c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        max = std::max(max, groupMaxP[0]);
6185c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        max = std::max(max, groupMaxP[1]);
6195c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        max = std::max(max, groupMaxP[2]);
6205c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        max = std::max(max, groupMaxP[3]);
6215c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
6225c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        n = tailFrames;
6235c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    }
6245c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#elif HAVE(ARM_NEON_INTRINSICS)
6255c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    if (sourceStride == 1) {
6265c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        int tailFrames = n % 4;
6275c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        const float* endP = sourceP + n - tailFrames;
6285c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
6295c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        float32x4_t fourMax = vdupq_n_f32(0);
6305c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        while (sourceP < endP) {
6315c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            float32x4_t source = vld1q_f32(sourceP);
6325c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            fourMax = vmaxq_f32(fourMax, vabsq_f32(source));
6335c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)            sourceP += 4;
6345c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        }
6355c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        float32x2_t twoMax = vmax_f32(vget_low_f32(fourMax), vget_high_f32(fourMax));
6365c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
6375c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        float groupMax[2];
6385c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        vst1_f32(groupMax, twoMax);
6395c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        max = std::max(groupMax[0], groupMax[1]);
6405c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
6415c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        n = tailFrames;
6425c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    }
6435c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#endif
6445c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
6455c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    while (n--) {
6465c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        max = std::max(max, fabsf(*sourceP));
6475c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)        sourceP += sourceStride;
6485c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    }
6495c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
6505c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    ASSERT(maxP);
6515c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)    *maxP = max;
6525c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)}
6535c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
// Clips every source sample to the range bounded by *lowThresholdP and *highThresholdP
// and stores the result in the destination:
//
//     dest[i * destStride] = max(min(source[i * sourceStride], high), low)
//
// sourceP, sourceStride:  input samples and the element step between consecutive samples.
// lowThresholdP, highThresholdP:  pointers to the scalar bounds; each is read once, up front.
// destP, destStride:  output samples and the element step between consecutive samples.
// framesToProcess:  number of samples to clip.
//
// The low bound is applied last, so it takes precedence if lowThreshold > highThreshold.
void vclip(const float* sourceP, int sourceStride, const float* lowThresholdP, const float* highThresholdP, float* destP, int destStride, size_t framesToProcess)
{
    // Keep the frame count in size_t: narrowing it to int would truncate (or turn
    // negative) for counts above INT_MAX and process the wrong number of frames.
    size_t n = framesToProcess;
    const float lowThreshold = *lowThresholdP;
    const float highThreshold = *highThresholdP;

    // FIXME: Optimize for SSE2.
#if HAVE(ARM_NEON_INTRINSICS)
    if ((sourceStride == 1) && (destStride == 1)) {
        // Contiguous data: clip four floats per iteration. The remaining 0-3 frames
        // are left for the scalar loop below.
        const size_t tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        const float32x4_t low = vdupq_n_f32(lowThreshold);
        const float32x4_t high = vdupq_n_f32(highThreshold);
        while (destP < endP) {
            float32x4_t source = vld1q_f32(sourceP);
            vst1q_f32(destP, vmaxq_f32(vminq_f32(source, high), low));
            sourceP += 4;
            destP += 4;
        }
        n = tailFrames;
    }
#endif
    // Scalar path: handles strided data and any frames left over from the vector path.
    for (; n > 0; --n) {
        *destP = std::max(std::min(*sourceP, highThreshold), lowThreshold);
        sourceP += sourceStride;
        destP += destStride;
    }
}
6835c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
6848abfc5808a4e34d6e03867af8bc440dee641886fTorne (Richard Coles)#endif // OS(MACOSX)
6855c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
6865c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)} // namespace VectorMath
6875c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
688c1847b1379d12d0e05df27436bf19a9b1bf12deaTorne (Richard Coles)} // namespace blink
6895c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)
6905c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#endif // ENABLE(WEB_AUDIO)
691