/*
 * Copyright (C) 2010, Google Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1.  Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2.  Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE INC. AND ITS CONTRIBUTORS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR ITS CONTRIBUTORS BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "config.h"

#if ENABLE(WEB_AUDIO)

#include "platform/audio/VectorMath.h"
#include "wtf/Assertions.h"
#include "wtf/CPU.h"
#include <stdint.h>

#if OS(MACOSX)
#include <Accelerate/Accelerate.h>
#endif

#if CPU(X86) || CPU(X86_64)
#include <emmintrin.h>
#endif

#if HAVE(ARM_NEON_INTRINSICS)
#include <arm_neon.h>
#endif

#include <math.h>
#include <algorithm>

namespace blink {

namespace VectorMath {

#if OS(MACOSX)
// On the Mac we use the highly optimized versions in Accelerate.framework.
// In 32-bit mode (__ppc__ or __i386__), <Accelerate/Accelerate.h> includes <vecLib/vDSP_translate.h>, which defines macros with the same names as
// our namespaced functions, so we must handle this case differently. Other architectures (64-bit, ARM, etc.) do not include this header file.

void vsmul(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
{
#if CPU(X86)
    ::vsmul(sourceP, sourceStride, scale, destP, destStride, framesToProcess);
#else
    vDSP_vsmul(sourceP, sourceStride, scale, destP, destStride, framesToProcess);
#endif
}

void vadd(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
{
#if CPU(X86)
    ::vadd(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess);
#else
    vDSP_vadd(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess);
#endif
}

void vmul(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
{
#if CPU(X86)
    ::vmul(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess);
#else
    vDSP_vmul(source1P, sourceStride1, source2P, sourceStride2, destP, destStride, framesToProcess);
#endif
}

void zvmul(const float* real1P, const float* imag1P, const float* real2P, const float* imag2P, float* realDestP, float* imagDestP, size_t framesToProcess)
{
    DSPSplitComplex sc1;
    DSPSplitComplex sc2;
    DSPSplitComplex dest;
    sc1.realp = const_cast<float*>(real1P);
    sc1.imagp = const_cast<float*>(imag1P);
    sc2.realp = const_cast<float*>(real2P);
    sc2.imagp = const_cast<float*>(imag2P);
    dest.realp = realDestP;
    dest.imagp = imagDestP;
#if CPU(X86)
    ::zvmul(&sc1, 1, &sc2, 1, &dest, 1, framesToProcess, 1);
#else
    vDSP_zvmul(&sc1, 1, &sc2, 1, &dest, 1, framesToProcess, 1);
#endif
}

void vsma(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
{
    vDSP_vsma(sourceP, sourceStride, scale, destP, destStride, destP, destStride, framesToProcess);
}

void vmaxmgv(const float* sourceP, int sourceStride, float* maxP, size_t framesToProcess)
{
    vDSP_maxmgv(sourceP, sourceStride, maxP, framesToProcess);
}

void vsvesq(const float* sourceP, int sourceStride, float* sumP, size_t framesToProcess)
{
    vDSP_svesq(const_cast<float*>(sourceP), sourceStride, sumP, framesToProcess);
}

void vclip(const float* sourceP, int sourceStride, const float* lowThresholdP, const float* highThresholdP, float* destP, int destStride, size_t framesToProcess)
{
    vDSP_vclip(const_cast<float*>(sourceP), sourceStride, const_cast<float*>(lowThresholdP), const_cast<float*>(highThresholdP), destP, destStride, framesToProcess);
}
#else

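// vsma(): vector scalar multiply-add, i.e. destP[k] += *scale * sourceP[k]
// for each of the framesToProcess frames.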
void vsma(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
{
    int n = framesToProcess;

#if CPU(X86) || CPU(X86_64)
    if ((sourceStride == 1) && (destStride == 1)) {
        float k = *scale;

        // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) {
            *destP += k * *sourceP;
            sourceP++;
            destP++;
            n--;
        }

        // Now that sourceP is aligned, use SSE.
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        __m128 pSource;
        __m128 dest;
        __m128 temp;
        __m128 mScale = _mm_set_ps1(k);

        bool destAligned = !(reinterpret_cast<uintptr_t>(destP) & 0x0F);

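        // SSE2_MULT_ADD processes four frames per iteration, computing
        // destP[0..3] += k * sourceP[0..3]. sourceP is 16-byte aligned here;
        // loadInstr/storeInstr pick aligned or unaligned accesses for destP.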
#define SSE2_MULT_ADD(loadInstr, storeInstr)        \
            while (destP < endP)                    \
            {                                       \
                pSource = _mm_load_ps(sourceP);     \
                temp = _mm_mul_ps(pSource, mScale); \
                dest = _mm_##loadInstr##_ps(destP); \
                dest = _mm_add_ps(dest, temp);      \
                _mm_##storeInstr##_ps(destP, dest); \
                sourceP += 4;                       \
                destP += 4;                         \
            }

        if (destAligned)
            SSE2_MULT_ADD(load, store)
        else
            SSE2_MULT_ADD(loadu, storeu)

        n = tailFrames;
    }
#elif HAVE(ARM_NEON_INTRINSICS)
    if ((sourceStride == 1) && (destStride == 1)) {
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        float32x4_t k = vdupq_n_f32(*scale);
        while (destP < endP) {
            float32x4_t source = vld1q_f32(sourceP);
            float32x4_t dest = vld1q_f32(destP);

            dest = vmlaq_f32(dest, source, k);
            vst1q_f32(destP, dest);

            sourceP += 4;
            destP += 4;
        }
        n = tailFrames;
    }
#endif
    while (n) {
        *destP += *sourceP * *scale;
        sourceP += sourceStride;
        destP += destStride;
        n--;
    }
}

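// vsmul(): vector scalar multiply, i.e. destP[k] = *scale * sourceP[k].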
void vsmul(const float* sourceP, int sourceStride, const float* scale, float* destP, int destStride, size_t framesToProcess)
{
    int n = framesToProcess;

#if CPU(X86) || CPU(X86_64)
    if ((sourceStride == 1) && (destStride == 1)) {
        float k = *scale;

        // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) {
            *destP = k * *sourceP;
            sourceP++;
            destP++;
            n--;
        }

        // Now that sourceP is aligned, use SSE.
        int group = n / 4;
        __m128 mScale = _mm_set_ps1(k);
        __m128* pSource;
        __m128* pDest;
        __m128 dest;

        if (reinterpret_cast<uintptr_t>(destP) & 0x0F) {
            while (group--) {
                pSource = reinterpret_cast<__m128*>(const_cast<float*>(sourceP));
                dest = _mm_mul_ps(*pSource, mScale);
                _mm_storeu_ps(destP, dest);

                sourceP += 4;
                destP += 4;
            }
        } else {
            while (group--) {
                pSource = reinterpret_cast<__m128*>(const_cast<float*>(sourceP));
                pDest = reinterpret_cast<__m128*>(destP);
                *pDest = _mm_mul_ps(*pSource, mScale);

                sourceP += 4;
                destP += 4;
            }
        }

        // Handle the remaining frames (fewer than four) without SSE.
        n %= 4;
        while (n) {
            *destP = k * *sourceP;
            sourceP++;
            destP++;
            n--;
        }
    } else { // If the strides are not 1, fall back to the scalar algorithm.
#elif HAVE(ARM_NEON_INTRINSICS)
    if ((sourceStride == 1) && (destStride == 1)) {
        float k = *scale;
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        while (destP < endP) {
            float32x4_t source = vld1q_f32(sourceP);
            vst1q_f32(destP, vmulq_n_f32(source, k));

            sourceP += 4;
            destP += 4;
        }
        n = tailFrames;
    }
#endif
    float k = *scale;
    while (n--) {
        *destP = k * *sourceP;
        sourceP += sourceStride;
        destP += destStride;
    }
#if CPU(X86) || CPU(X86_64)
    }
#endif
}

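// vadd(): element-wise addition, i.e. destP[k] = source1P[k] + source2P[k].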
void vadd(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
{
    int n = framesToProcess;

#if CPU(X86) || CPU(X86_64)
    if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
        // If the source1P address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<uintptr_t>(source1P) & 0x0F) && n) {
            *destP = *source1P + *source2P;
            source1P++;
            source2P++;
            destP++;
            n--;
        }

        // Now that source1P is aligned, use SSE.
        int group = n / 4;
        __m128* pSource1;
        __m128* pSource2;
        __m128* pDest;
        __m128 source2;
        __m128 dest;

        bool source2Aligned = !(reinterpret_cast<uintptr_t>(source2P) & 0x0F);
        bool destAligned = !(reinterpret_cast<uintptr_t>(destP) & 0x0F);

        if (source2Aligned && destAligned) { // Both aligned.
            while (group--) {
                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
                pSource2 = reinterpret_cast<__m128*>(const_cast<float*>(source2P));
                pDest = reinterpret_cast<__m128*>(destP);
                *pDest = _mm_add_ps(*pSource1, *pSource2);

                source1P += 4;
                source2P += 4;
                destP += 4;
            }

        } else if (source2Aligned && !destAligned) { // source2 aligned but dest not aligned.
            while (group--) {
                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
                pSource2 = reinterpret_cast<__m128*>(const_cast<float*>(source2P));
                dest = _mm_add_ps(*pSource1, *pSource2);
                _mm_storeu_ps(destP, dest);

                source1P += 4;
                source2P += 4;
                destP += 4;
            }

        } else if (!source2Aligned && destAligned) { // source2 not aligned but dest aligned.
            while (group--) {
                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
                source2 = _mm_loadu_ps(source2P);
                pDest = reinterpret_cast<__m128*>(destP);
                *pDest = _mm_add_ps(*pSource1, source2);

                source1P += 4;
                source2P += 4;
                destP += 4;
            }
        } else if (!source2Aligned && !destAligned) { // Neither source2 nor dest aligned.
            while (group--) {
                pSource1 = reinterpret_cast<__m128*>(const_cast<float*>(source1P));
                source2 = _mm_loadu_ps(source2P);
                dest = _mm_add_ps(*pSource1, source2);
                _mm_storeu_ps(destP, dest);

                source1P += 4;
                source2P += 4;
                destP += 4;
            }
        }

        // Handle the remaining frames (fewer than four) without SSE.
        n %= 4;
        while (n) {
            *destP = *source1P + *source2P;
            source1P++;
            source2P++;
            destP++;
            n--;
        }
    } else { // If the strides are not 1, fall back to the scalar algorithm.
#elif HAVE(ARM_NEON_INTRINSICS)
    if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        while (destP < endP) {
            float32x4_t source1 = vld1q_f32(source1P);
            float32x4_t source2 = vld1q_f32(source2P);
            vst1q_f32(destP, vaddq_f32(source1, source2));

            source1P += 4;
            source2P += 4;
            destP += 4;
        }
        n = tailFrames;
    }
#endif
    while (n--) {
        *destP = *source1P + *source2P;
        source1P += sourceStride1;
        source2P += sourceStride2;
        destP += destStride;
    }
#if CPU(X86) || CPU(X86_64)
    }
#endif
}

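// vmul(): element-wise multiplication, i.e. destP[k] = source1P[k] * source2P[k].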
void vmul(const float* source1P, int sourceStride1, const float* source2P, int sourceStride2, float* destP, int destStride, size_t framesToProcess)
{
    int n = framesToProcess;

#if CPU(X86) || CPU(X86_64)
    if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
        // If the source1P address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<uintptr_t>(source1P) & 0x0F) && n) {
            *destP = *source1P * *source2P;
            source1P++;
            source2P++;
            destP++;
            n--;
        }

        // Now that source1P is aligned, use SSE.
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;
        __m128 pSource1;
        __m128 pSource2;
        __m128 dest;

        bool source2Aligned = !(reinterpret_cast<uintptr_t>(source2P) & 0x0F);
        bool destAligned = !(reinterpret_cast<uintptr_t>(destP) & 0x0F);

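        // SSE2_MULT processes four frames per iteration, computing
        // destP[0..3] = source1P[0..3] * source2P[0..3]. source1P is 16-byte
        // aligned here; loadInstr/storeInstr pick aligned or unaligned
        // accesses for source2P and destP.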
#define SSE2_MULT(loadInstr, storeInstr)                   \
            while (destP < endP)                           \
            {                                              \
                pSource1 = _mm_load_ps(source1P);          \
                pSource2 = _mm_##loadInstr##_ps(source2P); \
                dest = _mm_mul_ps(pSource1, pSource2);     \
                _mm_##storeInstr##_ps(destP, dest);        \
                source1P += 4;                             \
                source2P += 4;                             \
                destP += 4;                                \
            }

        if (source2Aligned && destAligned) // Both aligned.
            SSE2_MULT(load, store)
        else if (source2Aligned && !destAligned) // Source2 is aligned but dest not.
            SSE2_MULT(load, storeu)
        else if (!source2Aligned && destAligned) // Dest is aligned but source2 not.
            SSE2_MULT(loadu, store)
        else // Neither aligned.
            SSE2_MULT(loadu, storeu)

        n = tailFrames;
    }
#elif HAVE(ARM_NEON_INTRINSICS)
    if ((sourceStride1 == 1) && (sourceStride2 == 1) && (destStride == 1)) {
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        while (destP < endP) {
            float32x4_t source1 = vld1q_f32(source1P);
            float32x4_t source2 = vld1q_f32(source2P);
            vst1q_f32(destP, vmulq_f32(source1, source2));

            source1P += 4;
            source2P += 4;
            destP += 4;
        }
        n = tailFrames;
    }
#endif
    while (n) {
        *destP = *source1P * *source2P;
        source1P += sourceStride1;
        source2P += sourceStride2;
        destP += destStride;
        n--;
    }
}

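// zvmul(): element-wise multiplication of two split-complex vectors. Since
// (a + bi)(c + di) = (ac - bd) + (ad + bc)i, this computes
//   realDestP[k] = real1P[k] * real2P[k] - imag1P[k] * imag2P[k]
//   imagDestP[k] = real1P[k] * imag2P[k] + imag1P[k] * real2P[k]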
void zvmul(const float* real1P, const float* imag1P, const float* real2P, const float* imag2P, float* realDestP, float* imagDestP, size_t framesToProcess)
{
    unsigned i = 0;
#if CPU(X86) || CPU(X86_64)
    // Only use the SSE optimization in the very common case that all addresses are 16-byte aligned.
    // Otherwise, fall through to the scalar code below.
    if (!(reinterpret_cast<uintptr_t>(real1P) & 0x0F)
        && !(reinterpret_cast<uintptr_t>(imag1P) & 0x0F)
        && !(reinterpret_cast<uintptr_t>(real2P) & 0x0F)
        && !(reinterpret_cast<uintptr_t>(imag2P) & 0x0F)
        && !(reinterpret_cast<uintptr_t>(realDestP) & 0x0F)
        && !(reinterpret_cast<uintptr_t>(imagDestP) & 0x0F)) {

        unsigned endSize = framesToProcess - framesToProcess % 4;
        while (i < endSize) {
            __m128 real1 = _mm_load_ps(real1P + i);
            __m128 real2 = _mm_load_ps(real2P + i);
            __m128 imag1 = _mm_load_ps(imag1P + i);
            __m128 imag2 = _mm_load_ps(imag2P + i);
            __m128 real = _mm_mul_ps(real1, real2);
            real = _mm_sub_ps(real, _mm_mul_ps(imag1, imag2));
            __m128 imag = _mm_mul_ps(real1, imag2);
            imag = _mm_add_ps(imag, _mm_mul_ps(imag1, real2));
            _mm_store_ps(realDestP + i, real);
            _mm_store_ps(imagDestP + i, imag);
            i += 4;
        }
    }
#elif HAVE(ARM_NEON_INTRINSICS)
    unsigned endSize = framesToProcess - framesToProcess % 4;
    while (i < endSize) {
        float32x4_t real1 = vld1q_f32(real1P + i);
        float32x4_t real2 = vld1q_f32(real2P + i);
        float32x4_t imag1 = vld1q_f32(imag1P + i);
        float32x4_t imag2 = vld1q_f32(imag2P + i);

        float32x4_t realResult = vmlsq_f32(vmulq_f32(real1, real2), imag1, imag2);
        float32x4_t imagResult = vmlaq_f32(vmulq_f32(real1, imag2), imag1, real2);

        vst1q_f32(realDestP + i, realResult);
        vst1q_f32(imagDestP + i, imagResult);

        i += 4;
    }
#endif
    for (; i < framesToProcess; ++i) {
        // Read and compute result before storing them, in case the
        // destination is the same as one of the sources.
        float realResult = real1P[i] * real2P[i] - imag1P[i] * imag2P[i];
        float imagResult = real1P[i] * imag2P[i] + imag1P[i] * real2P[i];

        realDestP[i] = realResult;
        imagDestP[i] = imagResult;
    }
}

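// vsvesq(): sum of squares, i.e. *sumP = sum over all frames of sourceP[k] * sourceP[k].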
void vsvesq(const float* sourceP, int sourceStride, float* sumP, size_t framesToProcess)
{
    int n = framesToProcess;
    float sum = 0;

#if CPU(X86) || CPU(X86_64)
    if (sourceStride == 1) {
        // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) {
            float sample = *sourceP;
            sum += sample * sample;
            sourceP++;
            n--;
        }

        // Now that sourceP is aligned, use SSE.
        int tailFrames = n % 4;
        const float* endP = sourceP + n - tailFrames;
        __m128 source;
        __m128 mSum = _mm_setzero_ps();

        while (sourceP < endP) {
            source = _mm_load_ps(sourceP);
            source = _mm_mul_ps(source, source);
            mSum = _mm_add_ps(mSum, source);
            sourceP += 4;
        }

        // Add up the four partial sums held in the SSE register.
        const float* groupSumP = reinterpret_cast<float*>(&mSum);
        sum += groupSumP[0] + groupSumP[1] + groupSumP[2] + groupSumP[3];

        n = tailFrames;
    }
#elif HAVE(ARM_NEON_INTRINSICS)
    if (sourceStride == 1) {
        int tailFrames = n % 4;
        const float* endP = sourceP + n - tailFrames;

        float32x4_t fourSum = vdupq_n_f32(0);
        while (sourceP < endP) {
            float32x4_t source = vld1q_f32(sourceP);
            fourSum = vmlaq_f32(fourSum, source, source);
            sourceP += 4;
        }
        float32x2_t twoSum = vadd_f32(vget_low_f32(fourSum), vget_high_f32(fourSum));

        float groupSum[2];
        vst1_f32(groupSum, twoSum);
        sum += groupSum[0] + groupSum[1];

        n = tailFrames;
    }
#endif

    while (n--) {
        float sample = *sourceP;
        sum += sample * sample;
        sourceP += sourceStride;
    }

    ASSERT(sumP);
    *sumP = sum;
}

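// vmaxmgv(): maximum magnitude, i.e. *maxP = max over all frames of fabsf(sourceP[k]).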
void vmaxmgv(const float* sourceP, int sourceStride, float* maxP, size_t framesToProcess)
{
    int n = framesToProcess;
    float max = 0;

#if CPU(X86) || CPU(X86_64)
    if (sourceStride == 1) {
        // If the sourceP address is not 16-byte aligned, the first several frames (at most three) should be processed separately.
        while ((reinterpret_cast<uintptr_t>(sourceP) & 0x0F) && n) {
            max = std::max(max, fabsf(*sourceP));
            sourceP++;
            n--;
        }

        // Now that sourceP is aligned, use SSE.
        int tailFrames = n % 4;
        const float* endP = sourceP + n - tailFrames;
        __m128 source;
        __m128 mMax = _mm_setzero_ps();
        int mask = 0x7FFFFFFF;
        __m128 mMask = _mm_set1_ps(*reinterpret_cast<float*>(&mask));

        while (sourceP < endP) {
            source = _mm_load_ps(sourceP);
            // Calculate the absolute value by ANDing source with the mask, which clears the sign bit.
            source = _mm_and_ps(source, mMask);
            mMax = _mm_max_ps(mMax, source);
            sourceP += 4;
        }

        // Get the max from the SSE results.
        const float* groupMaxP = reinterpret_cast<float*>(&mMax);
        max = std::max(max, groupMaxP[0]);
        max = std::max(max, groupMaxP[1]);
        max = std::max(max, groupMaxP[2]);
        max = std::max(max, groupMaxP[3]);

        n = tailFrames;
    }
#elif HAVE(ARM_NEON_INTRINSICS)
    if (sourceStride == 1) {
        int tailFrames = n % 4;
        const float* endP = sourceP + n - tailFrames;

        float32x4_t fourMax = vdupq_n_f32(0);
        while (sourceP < endP) {
            float32x4_t source = vld1q_f32(sourceP);
            fourMax = vmaxq_f32(fourMax, vabsq_f32(source));
            sourceP += 4;
        }
        float32x2_t twoMax = vmax_f32(vget_low_f32(fourMax), vget_high_f32(fourMax));

        float groupMax[2];
        vst1_f32(groupMax, twoMax);
        max = std::max(groupMax[0], groupMax[1]);

        n = tailFrames;
    }
#endif

    while (n--) {
        max = std::max(max, fabsf(*sourceP));
        sourceP += sourceStride;
    }

    ASSERT(maxP);
    *maxP = max;
}

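// vclip(): clip each frame to the range [*lowThresholdP, *highThresholdP],
// i.e. destP[k] = max(min(sourceP[k], *highThresholdP), *lowThresholdP).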
void vclip(const float* sourceP, int sourceStride, const float* lowThresholdP, const float* highThresholdP, float* destP, int destStride, size_t framesToProcess)
{
    int n = framesToProcess;
    float lowThreshold = *lowThresholdP;
    float highThreshold = *highThresholdP;

    // FIXME: Optimize for SSE2.
#if HAVE(ARM_NEON_INTRINSICS)
    if ((sourceStride == 1) && (destStride == 1)) {
        int tailFrames = n % 4;
        const float* endP = destP + n - tailFrames;

        float32x4_t low = vdupq_n_f32(lowThreshold);
        float32x4_t high = vdupq_n_f32(highThreshold);
        while (destP < endP) {
            float32x4_t source = vld1q_f32(sourceP);
            vst1q_f32(destP, vmaxq_f32(vminq_f32(source, high), low));
            sourceP += 4;
            destP += 4;
        }
        n = tailFrames;
    }
#endif
    while (n--) {
        *destP = std::max(std::min(*sourceP, highThreshold), lowThreshold);
        sourceP += sourceStride;
        destP += destStride;
    }
}

#endif // OS(MACOSX)

} // namespace VectorMath

} // namespace blink

#endif // ENABLE(WEB_AUDIO)