15c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)/* 25c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) * Copyright (C) 2010, Google Inc. All rights reserved. 35c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) * 45c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) * Redistribution and use in source and binary forms, with or without 55c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) * modification, are permitted provided that the following conditions 65c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) * are met: 75c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) * 1. Redistributions of source code must retain the above copyright 85c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) * notice, this list of conditions and the following disclaimer. 95c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) * 2. Redistributions in binary form must reproduce the above copyright 105c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) * notice, this list of conditions and the following disclaimer in the 115c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) * documentation and/or other materials provided with the distribution. 125c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) * 135c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' AND ANY 145c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 155c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 165c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) * DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS BE LIABLE FOR ANY 175c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 185c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 195c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 205c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 215c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 225c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 235c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) */ 245c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 255c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#include "config.h" 265c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 275c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#if ENABLE(WEB_AUDIO) 285c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 291e202183a5dc46166763171984b285173f8585e5Torne (Richard Coles)#include "platform/audio/VectorMath.h" 30f5e4ad553afbc08dd2e729bb77e937a9a94d5827Torne (Richard Coles)#include "wtf/Assertions.h" 318abfc5808a4e34d6e03867af8bc440dee641886fTorne (Richard Coles)#include "wtf/CPU.h" 32c0e19a689c8ac22cdc96b291a8d33a5d3b0b34a4Torne (Richard Coles)#include <stdint.h> 33f5e4ad553afbc08dd2e729bb77e937a9a94d5827Torne (Richard Coles) 348abfc5808a4e34d6e03867af8bc440dee641886fTorne (Richard Coles)#if OS(MACOSX) 355c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#include <Accelerate/Accelerate.h> 365c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#endif 375c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 385c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#ifdef __SSE2__ 395c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#include <emmintrin.h> 405c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#endif 415c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 425c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#if HAVE(ARM_NEON_INTRINSICS) 435c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#include <arm_neon.h> 445c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#endif 455c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 465c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#include <math.h> 4753e740f4a82e17f3ae59772501622dc354e42336Torne (Richard Coles)#include <algorithm> 485c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 495c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)namespace WebCore { 505c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 515c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)namespace VectorMath { 525c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 538abfc5808a4e34d6e03867af8bc440dee641886fTorne (Richard Coles)#if OS(MACOSX) 545c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)// On the Mac we use the highly optimized versions in Accelerate.framework 555c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)// In 32-bit mode (__ppc__ or __i386__) <Accelerate/Accelerate.h> includes <vecLib/vDSP_translate.h> which defines macros of the same name as 565c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)// our namespaced function names, so we must handle this case differently. Other architectures (64bit, ARM, etc.) do not include this header file. 575c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 585c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)void vsmul(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess) 595c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles){ 608abfc5808a4e34d6e03867af8bc440dee641886fTorne (Richard Coles)#if CPU(X86) 615c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) ::vsmul(sourceP, sourceStride, scale, destP, destStride, framesToProcess); 625c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#else 635c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) vDSP_vsmul(sourceP, sourceStride, scale, destP, destStride, framesToProcess); 645c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#endif 655c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)} 665c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 675c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)void vadd(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess) 685c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles){ 698abfc5808a4e34d6e03867af8bc440dee641886fTorne (Richard Coles)#if CPU(X86) 705c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) ::vadd(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess); 715c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#else 725c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) vDSP_vadd(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess); 735c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#endif 745c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)} 755c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 765c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)void vmul(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess) 775c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles){ 788abfc5808a4e34d6e03867af8bc440dee641886fTorne (Richard Coles)#if CPU(X86) 795c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) ::vmul(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess); 805c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#else 815c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) vDSP_vmul(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess); 825c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#endif 835c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)} 845c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 855c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)void zvmul(const float* real1P, const float* imag1P, const float* real2P, const float* imag2P, float* realDestP, float* imagDestP, size_t framesToProcess) 865c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles){ 875c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) DSPSplitComplex sc1; 885c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) DSPSplitComplex sc2; 895c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) DSPSplitComplex dest; 905c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) sc1.realp = const_cast<float*>(real1P); 915c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) sc1.imagp = const_cast<float*>(imag1P); 925c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) sc2.realp = const_cast<float*>(real2P); 935c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) sc2.imagp = const_cast<float*>(imag2P); 945c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) dest.realp = realDestP; 955c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) dest.imagp = imagDestP; 968abfc5808a4e34d6e03867af8bc440dee641886fTorne (Richard Coles)#if CPU(X86) 975c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) ::zvmul(&sc1, 1, &sc2, 1, &dest, 1, framesToProcess, 1); 985c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#else 995c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) vDSP_zvmul(&sc1, 1, &sc2, 1, &dest, 1, framesToProcess, 1); 1005c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#endif 1015c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)} 1025c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 1035c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)void vsma(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess) 1045c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles){ 1055c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) vDSP_vsma(sourceP, sourceStride, scale, destP, destStride, destP, destStride, framesToProcess); 1065c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)} 1075c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 1085c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)void vmaxmgv(const float* sourceP, int sourceStride, float* maxP, size_t framesToProcess) 1095c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles){ 1105c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) vDSP_maxmgv(sourceP, sourceStride, maxP, framesToProcess); 1115c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)} 1125c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 1135c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)void vsvesq(const float* sourceP, int sourceStride, float* sumP, size_t framesToProcess) 1145c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles){ 1155c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) vDSP_svesq(const_cast<float*>(sourceP), sourceStride, sumP, framesToProcess); 1165c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)} 1175c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 1185c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)void vclip(const float* sourceP, int sourceStride, const float* lowThresholdP, const float* highThresholdP, float* destP, int destStride, size_t framesToProcess) 1195c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles){ 1205c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) vDSP_vclip(const_cast<float*>(sourceP), sourceStride, const_cast<float*>(lowThresholdP), const_cast<float*>(highThresholdP), destP, destStride, framesToProcess); 1215c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)} 1225c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#else 1235c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 1245c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)void vsma(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess) 1255c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles){ 1265c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) int n = framesToProcess; 1275c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 1285c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#ifdef __SSE2__ 1295c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) if ((sourceStride == 1) && (destStride == 1)) { 1305c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) float k = *scale; 1315c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 1325c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately. 1335c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) { 1345c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) *destP += k * *sourceP; 1355c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) sourceP++; 1365c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) destP++; 1375c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) n--; 1385c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 1395c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 1405c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // Now the sourceP is aligned, use SSE. 1415c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) int tailFrames = n % 4; 1425c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) const float* endP = destP + n - tailFrames; 1435c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 1445c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) __m128 pSource; 1455c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) __m128 dest; 1465c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) __m128 temp; 1475c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) __m128 mScale = _mm_set_ps1(k); 1485c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 1495c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) bool destAligned = !(reinterpret_cast<uintptr_t>(destP) & 0x0F); 1505c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 1515c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#define SSE2_MULT_ADD(loadInstr, storeInstr) \ 1525c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) while (destP < endP) \ 1535c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) { \ 1545c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) pSource = _mm_load_ps(sourceP); \ 1555c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) temp = _mm_mul_ps(pSource, mScale); \ 1565c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) dest = _mm_##loadInstr##_ps(destP); \ 1575c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) dest = _mm_add_ps(dest, temp); \ 1585c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) _mm_##storeInstr##_ps(destP, dest); \ 1595c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) sourceP += 4; \ 1605c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) destP += 4; \ 1615c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 1625c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 16302772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch if (destAligned) 1645c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) SSE2_MULT_ADD(load, store) 16502772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch else 1665c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) SSE2_MULT_ADD(loadu, storeu) 1675c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 1685c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) n = tailFrames; 1695c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 1705c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#elif HAVE(ARM_NEON_INTRINSICS) 1715c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) if ((sourceStride == 1) && (destStride == 1)) { 1725c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) int tailFrames = n % 4; 1735c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) const float* endP = destP + n - tailFrames; 1745c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 1755c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) float32x4_t k = vdupq_n_f32(*scale); 1765c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) while (destP < endP) { 1775c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) float32x4_t source = vld1q_f32(sourceP); 1785c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) float32x4_t dest = vld1q_f32(destP); 1795c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 1805c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) dest = vmlaq_f32(dest, source, k); 1815c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) vst1q_f32(destP, dest); 1825c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 1835c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) sourceP += 4; 1845c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) destP += 4; 1855c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 1865c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) n = tailFrames; 1875c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 1885c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#endif 1895c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) while (n) { 1905c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) *destP += *sourceP * *scale; 1915c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) sourceP += sourceStride; 1925c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) destP += destStride; 1935c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) n--; 1945c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 1955c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)} 1965c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 1975c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)void vsmul(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess) 1985c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles){ 1995c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) int n = framesToProcess; 2005c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 2015c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#ifdef __SSE2__ 2025c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) if ((sourceStride == 1) && (destStride == 1)) { 2035c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) float k = *scale; 2045c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 2055c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately. 2065c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) while ((reinterpret_cast<size_t>(sourceP) & 0x0F) && n) { 2075c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) *destP = k * *sourceP; 2085c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) sourceP++; 2095c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) destP++; 2105c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) n--; 2115c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 2125c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 2135c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // Now the sourceP address is aligned and start to apply SSE. 2145c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) int group = n / 4; 2155c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) __m128 mScale = _mm_set_ps1(k); 2165c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) __m128* pSource; 2175c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) __m128* pDest; 2185c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) __m128 dest; 2195c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 2205c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 2215c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) if (reinterpret_cast<size_t>(destP) & 0x0F) { 2225c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) while (group--) { 2235c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) pSource = reinterpret_cast<__m128*>(const_cast<float*>(sourceP)); 2245c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) dest = _mm_mul_ps(*pSource, mScale); 2255c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) _mm_storeu_ps(destP, dest); 2265c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 2275c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) sourceP += 4; 2285c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) destP += 4; 2295c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 2305c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } else { 2315c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) while (group--) { 2325c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) pSource = reinterpret_cast<__m128*>(const_cast<float*>(sourceP)); 2335c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) pDest = reinterpret_cast<__m128*>(destP); 2345c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) *pDest = _mm_mul_ps(*pSource, mScale); 2355c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 2365c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) sourceP += 4; 2375c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) destP += 4; 2385c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 2395c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 2405c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 2415c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // Non-SSE handling for remaining frames which is less than 4. 2425c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) n %= 4; 2435c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) while (n) { 2445c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) *destP = k * *sourceP; 2455c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) sourceP++; 2465c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) destP++; 2475c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) n--; 2485c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 2495c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } else { // If strides are not 1, rollback to normal algorithm. 2505c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#elif HAVE(ARM_NEON_INTRINSICS) 2515c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) if ((sourceStride == 1) && (destStride == 1)) { 2525c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) float k = *scale; 2535c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) int tailFrames = n % 4; 2545c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) const float* endP = destP + n - tailFrames; 2555c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 2565c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) while (destP < endP) { 2575c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) float32x4_t source = vld1q_f32(sourceP); 2585c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) vst1q_f32(destP, vmulq_n_f32(source, k)); 2595c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 2605c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) sourceP += 4; 2615c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) destP += 4; 2625c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 2635c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) n = tailFrames; 2645c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 2655c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#endif 2665c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) float k = *scale; 2675c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) while (n--) { 2685c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) *destP = k * *sourceP; 2695c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) sourceP += sourceStride; 2705c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) destP += destStride; 2715c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 2725c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#ifdef __SSE2__ 2735c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 2745c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#endif 2755c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)} 2765c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 2775c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)void vadd(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess) 2785c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles){ 2795c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) int n = framesToProcess; 2805c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 2815c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#ifdef __SSE2__ 2825c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) if ((sourceStride1 ==1) && (sourceStride2 == 1) && (destStride == 1)) { 2835c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately. 2845c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) while ((reinterpret_cast<size_t>(source1P) & 0x0F) && n) { 2855c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) *destP = *source1P + *source2P; 2865c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) source1P++; 2875c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) source2P++; 2885c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) destP++; 2895c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) n--; 2905c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 2915c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 2925c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // Now the source1P address is aligned and start to apply SSE. 2935c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) int group = n / 4; 2945c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) __m128* pSource1; 2955c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) __m128* pSource2; 2965c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) __m128* pDest; 2975c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) __m128 source2; 2985c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) __m128 dest; 2995c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 3005c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) bool source2Aligned = !(reinterpret_cast<size_t>(source2P) & 0x0F); 3015c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) bool destAligned = !(reinterpret_cast<size_t>(destP) & 0x0F); 3025c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 3035c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) if (source2Aligned && destAligned) { // all aligned 3045c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) while (group--) { 3055c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P)); 3065c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) pSource2 = reinterpret_cast<__m128*>(const_cast<float*>(source2P)); 3075c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) pDest = reinterpret_cast<__m128*>(destP); 3085c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) *pDest = _mm_add_ps(*pSource1, *pSource2); 3095c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 3105c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) source1P += 4; 3115c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) source2P += 4; 3125c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) destP += 4; 3135c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 3145c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 31502772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch } else if (source2Aligned && !destAligned) { // source2 aligned but dest not aligned 3165c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) while (group--) { 3175c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P)); 3185c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) pSource2 = reinterpret_cast<__m128*>(const_cast<float*>(source2P)); 3195c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) dest = _mm_add_ps(*pSource1, *pSource2); 3205c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) _mm_storeu_ps(destP, dest); 3215c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 3225c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) source1P += 4; 3235c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) source2P += 4; 3245c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) destP += 4; 3255c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 3265c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 32702772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch } else if (!source2Aligned && destAligned) { // source2 not aligned but dest aligned 3285c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) while (group--) { 3295c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P)); 3305c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) source2 = _mm_loadu_ps(source2P); 3315c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) pDest = reinterpret_cast<__m128*>(destP); 3325c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) *pDest = _mm_add_ps(*pSource1, source2); 3335c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 3345c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) source1P += 4; 3355c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) source2P += 4; 3365c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) destP += 4; 3375c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 33802772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch } else if (!source2Aligned && !destAligned) { // both source2 and dest not aligned 3395c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) while (group--) { 3405c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P)); 3415c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) source2 = _mm_loadu_ps(source2P); 3425c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) dest = _mm_add_ps(*pSource1, source2); 3435c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) _mm_storeu_ps(destP, dest); 3445c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 3455c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) source1P += 4; 3465c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) source2P += 4; 3475c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) destP += 4; 3485c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 3495c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 3505c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 3515c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // Non-SSE handling for remaining frames which is less than 4. 3525c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) n %= 4; 3535c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) while (n) { 3545c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) *destP = *source1P + *source2P; 3555c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) source1P++; 3565c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) source2P++; 3575c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) destP++; 3585c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) n--; 3595c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 3605c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } else { // if strides are not 1, rollback to normal algorithm 3615c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#elif HAVE(ARM_NEON_INTRINSICS) 3625c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) if ((sourceStride1 ==1) && (sourceStride2 == 1) && (destStride == 1)) { 3635c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) int tailFrames = n % 4; 3645c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) const float* endP = destP + n - tailFrames; 3655c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 3665c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) while (destP < endP) { 3675c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) float32x4_t source1 = vld1q_f32(source1P); 3685c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) float32x4_t source2 = vld1q_f32(source2P); 3695c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) vst1q_f32(destP, vaddq_f32(source1, source2)); 3705c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 3715c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) source1P += 4; 3725c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) source2P += 4; 3735c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) destP += 4; 3745c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 3755c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) n = tailFrames; 3765c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 3775c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#endif 3785c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) while (n--) { 3795c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) *destP = *source1P + *source2P; 3805c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) source1P += sourceStride1; 3815c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) source2P += sourceStride2; 3825c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) destP += destStride; 3835c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 3845c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#ifdef __SSE2__ 3855c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 3865c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#endif 3875c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)} 3885c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 3895c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)void vmul(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess) 3905c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles){ 3915c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 3925c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) int n = framesToProcess; 3935c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 3945c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#ifdef __SSE2__ 3955c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) { 3965c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // If the source1P address is not 16-byte aligned, the first several frames (at most three) should be processed separately. 3975c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) while ((reinterpret_cast<uintptr_t>(source1P) & 0x0F) && n) { 3985c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) *destP = *source1P * *source2P; 3995c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) source1P++; 4005c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) source2P++; 4015c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) destP++; 4025c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) n--; 4035c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 4045c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 4055c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // Now the source1P address aligned and start to apply SSE. 4065c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) int tailFrames = n % 4; 4075c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) const float* endP = destP + n - tailFrames; 4085c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) __m128 pSource1; 4095c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) __m128 pSource2; 4105c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) __m128 dest; 4115c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 4125c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) bool source2Aligned = !(reinterpret_cast<uintptr_t>(source2P) & 0x0F); 4135c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) bool destAligned = !(reinterpret_cast<uintptr_t>(destP) & 0x0F); 4145c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 4155c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#define SSE2_MULT(loadInstr, storeInstr) \ 4165c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) while (destP < endP) \ 4175c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) { \ 4185c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) pSource1 = _mm_load_ps(source1P); \ 4195c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) pSource2 = _mm_##loadInstr##_ps(source2P); \ 4205c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) dest = _mm_mul_ps(pSource1, pSource2); \ 4215c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) _mm_##storeInstr##_ps(destP, dest); \ 4225c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) source1P += 4; \ 4235c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) source2P += 4; \ 4245c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) destP += 4; \ 4255c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 4265c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 4275c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) if (source2Aligned && destAligned) // Both aligned. 4285c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) SSE2_MULT(load, store) 4295c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) else if (source2Aligned && !destAligned) // Source2 is aligned but dest not. 4305c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) SSE2_MULT(load, storeu) 4315c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) else if (!source2Aligned && destAligned) // Dest is aligned but source2 not. 4325c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) SSE2_MULT(loadu, store) 4335c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) else // Neither aligned. 4345c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) SSE2_MULT(loadu, storeu) 4355c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 4365c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) n = tailFrames; 4375c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 4385c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#elif HAVE(ARM_NEON_INTRINSICS) 4395c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) if ((sourceStride1 ==1) && (sourceStride2 == 1) && (destStride == 1)) { 4405c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) int tailFrames = n % 4; 4415c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) const float* endP = destP + n - tailFrames; 4425c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 4435c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) while (destP < endP) { 4445c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) float32x4_t source1 = vld1q_f32(source1P); 4455c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) float32x4_t source2 = vld1q_f32(source2P); 4465c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) vst1q_f32(destP, vmulq_f32(source1, source2)); 4475c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 4485c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) source1P += 4; 4495c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) source2P += 4; 4505c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) destP += 4; 4515c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 4525c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) n = tailFrames; 4535c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 4545c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#endif 4555c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) while (n) { 4565c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) *destP = *source1P * *source2P; 4575c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) source1P += sourceStride1; 4585c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) source2P += sourceStride2; 4595c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) destP += destStride; 4605c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) n--; 4615c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 4625c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)} 4635c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 4645c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)void zvmul(const float* real1P, const float* imag1P, const float* real2P, const float* imag2P, float* realDestP, float* imagDestP, size_t framesToProcess) 4655c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles){ 4665c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) unsigned i = 0; 4675c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#ifdef __SSE2__ 46802772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch // Only use the SSE optimization in the very common case that all addresses are 16-byte aligned. 4695c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // Otherwise, fall through to the scalar code below. 4705c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) if (!(reinterpret_cast<uintptr_t>(real1P) & 0x0F) 4715c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) && !(reinterpret_cast<uintptr_t>(imag1P) & 0x0F) 4725c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) && !(reinterpret_cast<uintptr_t>(real2P) & 0x0F) 4735c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) && !(reinterpret_cast<uintptr_t>(imag2P) & 0x0F) 4745c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) && !(reinterpret_cast<uintptr_t>(realDestP) & 0x0F) 4755c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) && !(reinterpret_cast<uintptr_t>(imagDestP) & 0x0F)) { 47602772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch 4775c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) unsigned endSize = framesToProcess - framesToProcess % 4; 4785c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) while (i < endSize) { 4795c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) __m128 real1 = _mm_load_ps(real1P + i); 4805c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) __m128 real2 = _mm_load_ps(real2P + i); 4815c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) __m128 imag1 = _mm_load_ps(imag1P + i); 4825c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) __m128 imag2 = _mm_load_ps(imag2P + i); 4835c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) __m128 real = _mm_mul_ps(real1, real2); 4845c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) real = _mm_sub_ps(real, _mm_mul_ps(imag1, imag2)); 4855c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) __m128 imag = _mm_mul_ps(real1, imag2); 4865c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) imag = _mm_add_ps(imag, _mm_mul_ps(imag1, real2)); 4875c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) _mm_store_ps(realDestP + i, real); 4885c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) _mm_store_ps(imagDestP + i, imag); 4895c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) i += 4; 4905c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 4915c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 4925c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#elif HAVE(ARM_NEON_INTRINSICS) 4935c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) unsigned endSize = framesToProcess - framesToProcess % 4; 4945c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) while (i < endSize) { 4955c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) float32x4_t real1 = vld1q_f32(real1P + i); 4965c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) float32x4_t real2 = vld1q_f32(real2P + i); 4975c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) float32x4_t imag1 = vld1q_f32(imag1P + i); 4985c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) float32x4_t imag2 = vld1q_f32(imag2P + i); 4995c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 5005c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) float32x4_t realResult = vmlsq_f32(vmulq_f32(real1, real2), imag1, imag2); 5015c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) float32x4_t imagResult = vmlaq_f32(vmulq_f32(real1, imag2), imag1, real2); 5025c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 5035c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) vst1q_f32(realDestP + i, realResult); 5045c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) vst1q_f32(imagDestP + i, imagResult); 5055c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 5065c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) i += 4; 5075c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 5085c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#endif 5095c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) for (; i < framesToProcess; ++i) { 5105c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // Read and compute result before storing them, in case the 5115c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // destination is the same as one of the sources. 5125c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) float realResult = real1P[i] * real2P[i] - imag1P[i] * imag2P[i]; 5135c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) float imagResult = real1P[i] * imag2P[i] + imag1P[i] * real2P[i]; 5145c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 5155c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) realDestP[i] = realResult; 5165c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) imagDestP[i] = imagResult; 5175c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 5185c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)} 5195c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 5205c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)void vsvesq(const float* sourceP, int sourceStride, float* sumP, size_t framesToProcess) 5215c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles){ 5225c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) int n = framesToProcess; 5235c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) float sum = 0; 5245c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 52502772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch#ifdef __SSE2__ 52602772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch if (sourceStride == 1) { 52702772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately. 52802772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) { 52902772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch float sample = *sourceP; 53002772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch sum += sample * sample; 53102772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch sourceP++; 53202772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch n--; 53302772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch } 53402772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch 5355c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // Now the sourceP is aligned, use SSE. 53602772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch int tailFrames = n % 4; 53702772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch const float* endP = sourceP + n - tailFrames; 53802772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch __m128 source; 53902772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch __m128 mSum = _mm_setzero_ps(); 54002772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch 54102772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch while (sourceP < endP) { 54202772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch source = _mm_load_ps(sourceP); 54302772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch source = _mm_mul_ps(source, source); 54402772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch mSum = _mm_add_ps(mSum, source); 54502772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch sourceP += 4; 54602772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch } 54702772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch 54802772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch // Summarize the SSE results. 54902772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch const float* groupSumP = reinterpret_cast<float*>(&mSum); 55002772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch sum += groupSumP[0] + groupSumP[1] + groupSumP[2] + groupSumP[3]; 55102772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch 55202772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch n = tailFrames; 55302772c6a72f1ee0b226341a4f4439970c29fc861Ben Murdoch } 5545c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#elif HAVE(ARM_NEON_INTRINSICS) 5555c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) if (sourceStride == 1) { 5565c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) int tailFrames = n % 4; 5575c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) const float* endP = sourceP + n - tailFrames; 5585c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 5595c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) float32x4_t fourSum = vdupq_n_f32(0); 5605c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) while (sourceP < endP) { 5615c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) float32x4_t source = vld1q_f32(sourceP); 5625c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) fourSum = vmlaq_f32(fourSum, source, source); 5635c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) sourceP += 4; 5645c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 5655c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) float32x2_t twoSum = vadd_f32(vget_low_f32(fourSum), vget_high_f32(fourSum)); 5665c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 5675c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) float groupSum[2]; 5685c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) vst1_f32(groupSum, twoSum); 5695c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) sum += groupSum[0] + groupSum[1]; 5705c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 5715c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) n = tailFrames; 5725c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 5735c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#endif 5745c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 5755c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) while (n--) { 5765c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) float sample = *sourceP; 5775c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) sum += sample * sample; 5785c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) sourceP += sourceStride; 5795c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 5805c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 5815c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) ASSERT(sumP); 5825c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) *sumP = sum; 5835c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)} 5845c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 5855c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)void vmaxmgv(const float* sourceP, int sourceStride, float* maxP, size_t framesToProcess) 5865c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles){ 5875c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) int n = framesToProcess; 5885c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) float max = 0; 5895c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 5905c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#ifdef __SSE2__ 5915c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) if (sourceStride == 1) { 5925c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately. 5935c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) { 5945c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) max = std::max(max, fabsf(*sourceP)); 5955c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) sourceP++; 5965c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) n--; 5975c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 5985c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 5995c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // Now the sourceP is aligned, use SSE. 6005c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) int tailFrames = n % 4; 6015c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) const float* endP = sourceP + n - tailFrames; 6025c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) __m128 source; 6035c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) __m128 mMax = _mm_setzero_ps(); 6045c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) int mask = 0x7FFFFFFF; 6055c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) __m128 mMask = _mm_set1_ps(*reinterpret_cast<float*>(&mask)); 6065c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 6075c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) while (sourceP < endP) { 6085c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) source = _mm_load_ps(sourceP); 6095c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // Calculate the absolute value by anding source with mask, the sign bit is set to 0. 6105c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) source = _mm_and_ps(source, mMask); 6115c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) mMax = _mm_max_ps(mMax, source); 6125c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) sourceP += 4; 6135c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 6145c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 6155c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // Get max from the SSE results. 6165c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) const float* groupMaxP = reinterpret_cast<float*>(&mMax); 6175c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) max = std::max(max, groupMaxP[0]); 6185c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) max = std::max(max, groupMaxP[1]); 6195c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) max = std::max(max, groupMaxP[2]); 6205c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) max = std::max(max, groupMaxP[3]); 6215c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 6225c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) n = tailFrames; 6235c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 6245c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#elif HAVE(ARM_NEON_INTRINSICS) 6255c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) if (sourceStride == 1) { 6265c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) int tailFrames = n % 4; 6275c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) const float* endP = sourceP + n - tailFrames; 6285c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 6295c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) float32x4_t fourMax = vdupq_n_f32(0); 6305c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) while (sourceP < endP) { 6315c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) float32x4_t source = vld1q_f32(sourceP); 6325c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) fourMax = vmaxq_f32(fourMax, vabsq_f32(source)); 6335c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) sourceP += 4; 6345c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 6355c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) float32x2_t twoMax = vmax_f32(vget_low_f32(fourMax), vget_high_f32(fourMax)); 6365c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 6375c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) float groupMax[2]; 6385c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) vst1_f32(groupMax, twoMax); 6395c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) max = std::max(groupMax[0], groupMax[1]); 6405c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 6415c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) n = tailFrames; 6425c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 6435c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#endif 6445c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 6455c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) while (n--) { 6465c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) max = std::max(max, fabsf(*sourceP)); 6475c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) sourceP += sourceStride; 6485c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 6495c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 6505c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) ASSERT(maxP); 6515c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) *maxP = max; 6525c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)} 6535c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 6545c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)void vclip(const float* sourceP, int sourceStride, const float* lowThresholdP, const float* highThresholdP, float* destP, int destStride, size_t framesToProcess) 6555c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles){ 6565c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) int n = framesToProcess; 6575c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) float lowThreshold = *lowThresholdP; 6585c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) float highThreshold = *highThresholdP; 6595c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 6605c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) // FIXME: Optimize for SSE2. 6615c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#if HAVE(ARM_NEON_INTRINSICS) 6625c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) if ((sourceStride == 1) && (destStride == 1)) { 6635c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) int tailFrames = n % 4; 6645c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) const float* endP = destP + n - tailFrames; 6655c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 6665c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) float32x4_t low = vdupq_n_f32(lowThreshold); 6675c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) float32x4_t high = vdupq_n_f32(highThreshold); 6685c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) while (destP < endP) { 6695c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) float32x4_t source = vld1q_f32(sourceP); 6705c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) vst1q_f32(destP, vmaxq_f32(vminq_f32(source, high), low)); 6715c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) sourceP += 4; 6725c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) destP += 4; 6735c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 6745c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) n = tailFrames; 6755c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 6765c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#endif 6775c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) while (n--) { 6785c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) *destP = std::max(std::min(*sourceP, highThreshold), lowThreshold); 6795c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) sourceP += sourceStride; 6805c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) destP += destStride; 6815c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) } 6825c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)} 6835c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 6848abfc5808a4e34d6e03867af8bc440dee641886fTorne (Richard Coles)#endif // OS(MACOSX) 6855c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 6865c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)} // namespace VectorMath 6875c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 6885c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)} // namespace WebCore 6895c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles) 6905c87bf8b86a7c82ef50fb7a89697d8e02e2553beTorne (Richard Coles)#endif // ENABLE(WEB_AUDIO) 691