1c9adb05b64fa0bfadf9d1a782afcda470da68c9emtklein/*
2c9adb05b64fa0bfadf9d1a782afcda470da68c9emtklein * Copyright 2015 Google Inc.
3c9adb05b64fa0bfadf9d1a782afcda470da68c9emtklein *
4c9adb05b64fa0bfadf9d1a782afcda470da68c9emtklein * Use of this source code is governed by a BSD-style license that can be
5c9adb05b64fa0bfadf9d1a782afcda470da68c9emtklein * found in the LICENSE file.
6c9adb05b64fa0bfadf9d1a782afcda470da68c9emtklein */
7c9adb05b64fa0bfadf9d1a782afcda470da68c9emtklein
8c9adb05b64fa0bfadf9d1a782afcda470da68c9emtklein#ifndef SkNx_neon_DEFINED
9c9adb05b64fa0bfadf9d1a782afcda470da68c9emtklein#define SkNx_neon_DEFINED
10c9adb05b64fa0bfadf9d1a782afcda470da68c9emtklein
11c9adb05b64fa0bfadf9d1a782afcda470da68c9emtklein#include <arm_neon.h>
12c9adb05b64fa0bfadf9d1a782afcda470da68c9emtklein
// The NEON immediate-shift intrinsics require a compile-time constant shift
// count, so a runtime count is dispatched through a switch that has one case
// per supported constant.
//
// SHIFTn(op, v, bits) expands to the statements of a function body:
//   - for bits in [1, n) it returns op(v, bits) with bits as a literal;
//   - for any other count (including 0) it returns v unchanged.
// The expansion deliberately ends without a semicolon so the use site supplies
// it, e.g.  T f(int bits) const { SHIFT8(op, fVec, bits); }
//
//   op   - intrinsic (or any callable) taking (value, constant count)
//   v    - value to shift; also the fall-through result
//   bits - runtime shift count
//
// The macros only refer to their own parameters; they do not rely on any
// identifier being present in the expanding scope.

#define SHIFT8(op, v, bits)                                                   \
    switch (bits) {                                                           \
        case  1: return op((v),  1);                                          \
        case  2: return op((v),  2);                                          \
        case  3: return op((v),  3);                                          \
        case  4: return op((v),  4);                                          \
        case  5: return op((v),  5);                                          \
        case  6: return op((v),  6);                                          \
        case  7: return op((v),  7);                                          \
    }                                                                         \
    return (v)

/* Counts below 8 are delegated to SHIFT8, which always returns. */
#define SHIFT16(op, v, bits)                                                  \
    if ((bits) < 8) { SHIFT8(op, v, bits); }                                  \
    switch (bits) {                                                           \
        case  8: return op((v),  8);                                          \
        case  9: return op((v),  9);                                          \
        case 10: return op((v), 10);                                          \
        case 11: return op((v), 11);                                          \
        case 12: return op((v), 12);                                          \
        case 13: return op((v), 13);                                          \
        case 14: return op((v), 14);                                          \
        case 15: return op((v), 15);                                          \
    }                                                                         \
    return (v)

/* Counts below 16 are delegated to SHIFT16, which always returns. */
#define SHIFT32(op, v, bits)                                                  \
    if ((bits) < 16) { SHIFT16(op, v, bits); }                                \
    switch (bits) {                                                           \
        case 16: return op((v), 16);                                          \
        case 17: return op((v), 17);                                          \
        case 18: return op((v), 18);                                          \
        case 19: return op((v), 19);                                          \
        case 20: return op((v), 20);                                          \
        case 21: return op((v), 21);                                          \
        case 22: return op((v), 22);                                          \
        case 23: return op((v), 23);                                          \
        case 24: return op((v), 24);                                          \
        case 25: return op((v), 25);                                          \
        case 26: return op((v), 26);                                          \
        case 27: return op((v), 27);                                          \
        case 28: return op((v), 28);                                          \
        case 29: return op((v), 29);                                          \
        case 30: return op((v), 30);                                          \
        case 31: return op((v), 31);                                          \
    }                                                                         \
    return (v)

34d2ffd36eb62e99abe2920369d1e040954cc2044fmtklein
// Per-lane boolean mask for two 4-byte lanes, stored as a uint32x2_t.
// A lane counts as "true" when it is non-zero.
template <>
class SkNb<2, 4> {
public:
    // Leaves fVec uninitialized.
    SkNb() {}

    // Wraps an existing mask vector (implicit on purpose, so intrinsic results
    // can be returned directly).
    SkNb(uint32x2_t vec) : fVec(vec) {}

    // True when both lanes are non-zero.
    bool allTrue() const {
        const uint32_t lane0 = vget_lane_u32(fVec, 0);
        const uint32_t lane1 = vget_lane_u32(fVec, 1);
        return lane0 && lane1;
    }

    // True when at least one lane is non-zero.
    bool anyTrue() const {
        const uint32_t lane0 = vget_lane_u32(fVec, 0);
        const uint32_t lane1 = vget_lane_u32(fVec, 1);
        return lane0 || lane1;
    }

    uint32x2_t fVec;
};
46c9adb05b64fa0bfadf9d1a782afcda470da68c9emtklein
// Per-lane boolean mask for four 4-byte lanes, stored as a uint32x4_t.
// A lane counts as "true" when it is non-zero.
template <>
class SkNb<4, 4> {
public:
    // Leaves fVec uninitialized.
    SkNb() {}

    // Wraps an existing mask vector (implicit on purpose, so intrinsic results
    // can be returned directly).
    SkNb(uint32x4_t vec) : fVec(vec) {}

    // True when all four lanes are non-zero.
    bool allTrue() const {
        const uint32_t lane0 = vgetq_lane_u32(fVec, 0);
        const uint32_t lane1 = vgetq_lane_u32(fVec, 1);
        const uint32_t lane2 = vgetq_lane_u32(fVec, 2);
        const uint32_t lane3 = vgetq_lane_u32(fVec, 3);
        return lane0 && lane1 && lane2 && lane3;
    }

    // True when at least one of the four lanes is non-zero.
    bool anyTrue() const {
        const uint32_t lane0 = vgetq_lane_u32(fVec, 0);
        const uint32_t lane1 = vgetq_lane_u32(fVec, 1);
        const uint32_t lane2 = vgetq_lane_u32(fVec, 2);
        const uint32_t lane3 = vgetq_lane_u32(fVec, 3);
        return lane0 || lane1 || lane2 || lane3;
    }

    uint32x4_t fVec;
};
60c9adb05b64fa0bfadf9d1a782afcda470da68c9emtklein
// Two-lane float vector stored as a float32x2_t.
template <>
class SkNf<2, float> {
    typedef SkNb<2, 4> Nb;
public:
    // Leaves fVec uninitialized.
    SkNf() {}

    // Wraps an existing vector (implicit, so intrinsic results convert directly).
    SkNf(float32x2_t vec) : fVec(vec) {}

    // Broadcasts val to both lanes.
    explicit SkNf(float val) : fVec(vdup_n_f32(val)) {}

    // Lane 0 = a, lane 1 = b.
    SkNf(float a, float b) { fVec = (float32x2_t) { a, b }; }

    // Reads two floats starting at vals.
    static SkNf Load(const float vals[2]) { return vld1_f32(vals); }

    // Writes both lanes to vals[0..1].
    void store(float vals[2]) const { vst1_f32(vals, fVec); }

    // Reciprocal: the vrecpe estimate improved by one vrecps step.
    SkNf approxInvert() const {
        const float32x2_t seed = vrecpe_f32(fVec);
        return vmul_f32(vrecps_f32(seed, fVec), seed);
    }

    // Reciprocal: approxInvert() improved by one further vrecps step.
    SkNf invert() const {
        const float32x2_t coarse = this->approxInvert().fVec;
        return vmul_f32(vrecps_f32(coarse, fVec), coarse);
    }

    SkNf operator + (const SkNf& o) const {
        return vadd_f32(fVec, o.fVec);
    }
    SkNf operator - (const SkNf& o) const {
        return vsub_f32(fVec, o.fVec);
    }
    SkNf operator * (const SkNf& o) const {
        return vmul_f32(fVec, o.fVec);
    }

    // Uses the divide instruction on ARM64; elsewhere multiplies by o.invert().
    SkNf operator / (const SkNf& o) const {
    #if defined(SK_CPU_ARM64)
        return vdiv_f32(fVec, o.fVec);
    #else
        const float32x2_t reciprocal = o.invert().fVec;
        return vmul_f32(fVec, reciprocal);
    #endif
    }

    // Lane-wise comparisons; each returns a per-lane mask.
    Nb operator == (const SkNf& o) const {
        return vceq_f32(fVec, o.fVec);
    }
    Nb operator < (const SkNf& o) const {
        return vclt_f32(fVec, o.fVec);
    }
    Nb operator > (const SkNf& o) const {
        return vcgt_f32(fVec, o.fVec);
    }
    Nb operator <= (const SkNf& o) const {
        return vcle_f32(fVec, o.fVec);
    }
    Nb operator >= (const SkNf& o) const {
        return vcge_f32(fVec, o.fVec);
    }
    // Bitwise complement of the equality mask.
    Nb operator != (const SkNf& o) const {
        const uint32x2_t same = vceq_f32(fVec, o.fVec);
        return vmvn_u32(same);
    }

    static SkNf Min(const SkNf& l, const SkNf& r) {
        return vmin_f32(l.fVec, r.fVec);
    }
    static SkNf Max(const SkNf& l, const SkNf& r) {
        return vmax_f32(l.fVec, r.fVec);
    }

    // Reciprocal square root: rsqrt0 is the raw vrsqrte estimate, and each
    // higher-numbered variant applies one more vrsqrts step to the previous one.
    SkNf rsqrt0() const { return vrsqrte_f32(fVec); }

    SkNf rsqrt1() const {
        const float32x2_t guess   = this->rsqrt0().fVec;
        const float32x2_t squared = vmul_f32(guess, guess);
        return vmul_f32(vrsqrts_f32(fVec, squared), guess);
    }

    SkNf rsqrt2() const {
        const float32x2_t guess   = this->rsqrt1().fVec;
        const float32x2_t squared = vmul_f32(guess, guess);
        return vmul_f32(vrsqrts_f32(fVec, squared), guess);
    }

    // Uses the sqrt instruction on ARM64; elsewhere computes x * rsqrt2(x).
    SkNf sqrt() const {
    #if defined(SK_CPU_ARM64)
        return vsqrt_f32(fVec);
    #else
        const SkNf inverseRoot = this->rsqrt2();
        return *this * inverseRoot;
    #endif
    }

    // Extracts lane k (k must be 0 or 1).  The mask keeps the lane index a
    // valid constant even when the assert is compiled out.
    template <int k> float kth() const {
        SkASSERT(0 <= k && k < 2);
        return vget_lane_f32(fVec, k&1);
    }

    float32x2_t fVec;
};
131c9adb05b64fa0bfadf9d1a782afcda470da68c9emtklein
132c9adb05b64fa0bfadf9d1a782afcda470da68c9emtklein#if defined(SK_CPU_ARM64)
// Per-lane boolean mask for two 8-byte lanes, stored as a uint64x2_t.
// A lane counts as "true" when it is non-zero.
template <>
class SkNb<2, 8> {
public:
    // Leaves fVec uninitialized.
    SkNb() {}

    // Wraps an existing mask vector (implicit on purpose, so intrinsic results
    // can be returned directly).
    SkNb(uint64x2_t vec) : fVec(vec) {}

    // True when both lanes are non-zero.
    bool allTrue() const {
        const uint64_t lane0 = vgetq_lane_u64(fVec, 0);
        const uint64_t lane1 = vgetq_lane_u64(fVec, 1);
        return lane0 && lane1;
    }

    // True when at least one lane is non-zero.
    bool anyTrue() const {
        const uint64_t lane0 = vgetq_lane_u64(fVec, 0);
        const uint64_t lane1 = vgetq_lane_u64(fVec, 1);
        return lane0 || lane1;
    }

    uint64x2_t fVec;
};
144c9adb05b64fa0bfadf9d1a782afcda470da68c9emtklein
// Two-lane double vector stored as a float64x2_t.
template <>
class SkNf<2, double> {
    typedef SkNb<2, 8> Nb;
public:
    // Leaves fVec uninitialized.
    SkNf() {}

    // Wraps an existing vector (implicit, so intrinsic results convert directly).
    SkNf(float64x2_t vec) : fVec(vec) {}

    // Broadcasts val to both lanes.
    explicit SkNf(double val) : fVec(vdupq_n_f64(val)) {}

    // Lane 0 = a, lane 1 = b.
    SkNf(double a, double b) { fVec = (float64x2_t) { a, b }; }

    // Reads two doubles starting at vals.
    static SkNf Load(const double vals[2]) { return vld1q_f64(vals); }

    // Writes both lanes to vals[0..1].
    void store(double vals[2]) const { vst1q_f64(vals, fVec); }

    SkNf operator + (const SkNf& o) const {
        return vaddq_f64(fVec, o.fVec);
    }
    SkNf operator - (const SkNf& o) const {
        return vsubq_f64(fVec, o.fVec);
    }
    SkNf operator * (const SkNf& o) const {
        return vmulq_f64(fVec, o.fVec);
    }
    SkNf operator / (const SkNf& o) const {
        return vdivq_f64(fVec, o.fVec);
    }

    // Lane-wise comparisons; each returns a per-lane mask.
    Nb operator == (const SkNf& o) const {
        return vceqq_f64(fVec, o.fVec);
    }
    Nb operator < (const SkNf& o) const {
        return vcltq_f64(fVec, o.fVec);
    }
    Nb operator > (const SkNf& o) const {
        return vcgtq_f64(fVec, o.fVec);
    }
    Nb operator <= (const SkNf& o) const {
        return vcleq_f64(fVec, o.fVec);
    }
    Nb operator >= (const SkNf& o) const {
        return vcgeq_f64(fVec, o.fVec);
    }
    // Bitwise complement of the equality mask.  The complement is taken on a
    // 32-bit view of the mask and the result is viewed as 64-bit lanes again.
    Nb operator != (const SkNf& o) const {
        const uint64x2_t same     = vceqq_f64(fVec, o.fVec);
        const uint32x4_t inverted = vmvnq_u32(vreinterpretq_u32_u64(same));
        return vreinterpretq_u64_u32(inverted);
    }

    static SkNf Min(const SkNf& l, const SkNf& r) {
        return vminq_f64(l.fVec, r.fVec);
    }
    static SkNf Max(const SkNf& l, const SkNf& r) {
        return vmaxq_f64(l.fVec, r.fVec);
    }

    SkNf sqrt() const { return vsqrtq_f64(fVec); }

    // Reciprocal square root: rsqrt0 is the raw vrsqrte estimate, and each
    // higher-numbered variant applies one more vrsqrts step to the previous one.
    SkNf rsqrt0() const { return vrsqrteq_f64(fVec); }

    SkNf rsqrt1() const {
        const float64x2_t guess   = this->rsqrt0().fVec;
        const float64x2_t squared = vmulq_f64(guess, guess);
        return vmulq_f64(vrsqrtsq_f64(fVec, squared), guess);
    }

    SkNf rsqrt2() const {
        const float64x2_t guess   = this->rsqrt1().fVec;
        const float64x2_t squared = vmulq_f64(guess, guess);
        return vmulq_f64(vrsqrtsq_f64(fVec, squared), guess);
    }

    // Reciprocal: the vrecpe estimate improved by one vrecps step.
    SkNf approxInvert() const {
        const float64x2_t seed = vrecpeq_f64(fVec);
        return vmulq_f64(vrecpsq_f64(seed, fVec), seed);
    }

    // Reciprocal: approxInvert() improved by two further vrecps steps.
    SkNf invert() const {
        const float64x2_t coarse = this->approxInvert().fVec;
        const float64x2_t better = vmulq_f64(vrecpsq_f64(coarse, fVec), coarse);
        return vmulq_f64(vrecpsq_f64(better, fVec), better);
    }

    // Extracts lane k (k must be 0 or 1).  The mask keeps the lane index a
    // valid constant even when the assert is compiled out.
    template <int k> double kth() const {
        SkASSERT(0 <= k && k < 2);
        return vgetq_lane_f64(fVec, k&1);
    }

    float64x2_t fVec;
};
207c9adb05b64fa0bfadf9d1a782afcda470da68c9emtklein#endif//defined(SK_CPU_ARM64)
208c9adb05b64fa0bfadf9d1a782afcda470da68c9emtklein
// Four-lane int vector stored as an int32x4_t.
template <>
class SkNi<4, int> {
public:
    // Leaves fVec uninitialized.
    SkNi() {}

    // Wraps an existing vector (implicit, so intrinsic results convert directly).
    SkNi(const int32x4_t& vec) : fVec(vec) {}

    // Broadcasts val to all four lanes.
    explicit SkNi(int val) : fVec(vdupq_n_s32(val)) {}

    // Lanes 0..3 = a, b, c, d.
    SkNi(int a, int b, int c, int d) { fVec = (int32x4_t) { a, b, c, d }; }

    // Reads four ints starting at vals.
    static SkNi Load(const int vals[4]) { return vld1q_s32(vals); }

    // Writes all four lanes to vals[0..3].
    void store(int vals[4]) const { vst1q_s32(vals, fVec); }

    SkNi operator + (const SkNi& o) const {
        return vaddq_s32(fVec, o.fVec);
    }
    SkNi operator - (const SkNi& o) const {
        return vsubq_s32(fVec, o.fVec);
    }
    SkNi operator * (const SkNi& o) const {
        return vmulq_s32(fVec, o.fVec);
    }

    // Shift every lane by a runtime count.  SHIFT32 dispatches counts 1..31 to
    // the immediate-shift intrinsic; any other count yields this vector as-is.
    SkNi operator << (int bits) const {
        SHIFT32(vshlq_n_s32, fVec, bits);
    }
    SkNi operator >> (int bits) const {
        SHIFT32(vshrq_n_s32, fVec, bits);
    }

    // Extracts lane k (k must be in 0..3).  The mask keeps the lane index a
    // valid constant even when the assert is compiled out.
    template <int k> int kth() const {
        SkASSERT(0 <= k && k < 4);
        return vgetq_lane_s32(fVec, k&3);
    }

    int32x4_t fVec;
};
2351113da72eced20480491bb87ade0ffcff4eb8ea7mtklein
// Four-lane float vector stored as a float32x4_t.
template <>
class SkNf<4, float> {
    typedef SkNb<4, 4> Nb;
public:
    // Leaves fVec uninitialized.
    SkNf() {}

    // Wraps an existing vector (implicit, so intrinsic results convert directly).
    SkNf(float32x4_t vec) : fVec(vec) {}

    // Broadcasts val to all four lanes.
    explicit SkNf(float val) : fVec(vdupq_n_f32(val)) {}

    // Lanes 0..3 = a, b, c, d.
    SkNf(float a, float b, float c, float d) {
        fVec = (float32x4_t) { a, b, c, d };
    }

    // Reads four floats starting at vals.
    static SkNf Load(const float vals[4]) { return vld1q_f32(vals); }

    // Writes all four lanes to vals[0..3].
    void store(float vals[4]) const { vst1q_f32(vals, fVec); }

    // Converts each lane to int with vcvtq_s32_f32.
    SkNi<4, int> castTrunc() const {
        return vcvtq_s32_f32(fVec);
    }

    // Reciprocal: the vrecpe estimate improved by one vrecps step.
    SkNf approxInvert() const {
        const float32x4_t seed = vrecpeq_f32(fVec);
        return vmulq_f32(vrecpsq_f32(seed, fVec), seed);
    }

    // Reciprocal: approxInvert() improved by one further vrecps step.
    SkNf invert() const {
        const float32x4_t coarse = this->approxInvert().fVec;
        return vmulq_f32(vrecpsq_f32(coarse, fVec), coarse);
    }

    SkNf operator + (const SkNf& o) const {
        return vaddq_f32(fVec, o.fVec);
    }
    SkNf operator - (const SkNf& o) const {
        return vsubq_f32(fVec, o.fVec);
    }
    SkNf operator * (const SkNf& o) const {
        return vmulq_f32(fVec, o.fVec);
    }

    // Uses the divide instruction on ARM64; elsewhere multiplies by o.invert().
    SkNf operator / (const SkNf& o) const {
    #if defined(SK_CPU_ARM64)
        return vdivq_f32(fVec, o.fVec);
    #else
        const float32x4_t reciprocal = o.invert().fVec;
        return vmulq_f32(fVec, reciprocal);
    #endif
    }

    // Lane-wise comparisons; each returns a per-lane mask.
    Nb operator == (const SkNf& o) const {
        return vceqq_f32(fVec, o.fVec);
    }
    Nb operator < (const SkNf& o) const {
        return vcltq_f32(fVec, o.fVec);
    }
    Nb operator > (const SkNf& o) const {
        return vcgtq_f32(fVec, o.fVec);
    }
    Nb operator <= (const SkNf& o) const {
        return vcleq_f32(fVec, o.fVec);
    }
    Nb operator >= (const SkNf& o) const {
        return vcgeq_f32(fVec, o.fVec);
    }
    // Bitwise complement of the equality mask.
    Nb operator != (const SkNf& o) const {
        const uint32x4_t same = vceqq_f32(fVec, o.fVec);
        return vmvnq_u32(same);
    }

    static SkNf Min(const SkNf& l, const SkNf& r) {
        return vminq_f32(l.fVec, r.fVec);
    }
    static SkNf Max(const SkNf& l, const SkNf& r) {
        return vmaxq_f32(l.fVec, r.fVec);
    }

    // Reciprocal square root: rsqrt0 is the raw vrsqrte estimate, and each
    // higher-numbered variant applies one more vrsqrts step to the previous one.
    SkNf rsqrt0() const { return vrsqrteq_f32(fVec); }

    SkNf rsqrt1() const {
        const float32x4_t guess   = this->rsqrt0().fVec;
        const float32x4_t squared = vmulq_f32(guess, guess);
        return vmulq_f32(vrsqrtsq_f32(fVec, squared), guess);
    }

    SkNf rsqrt2() const {
        const float32x4_t guess   = this->rsqrt1().fVec;
        const float32x4_t squared = vmulq_f32(guess, guess);
        return vmulq_f32(vrsqrtsq_f32(fVec, squared), guess);
    }

    // Uses the sqrt instruction on ARM64; elsewhere computes x * rsqrt2(x).
    SkNf sqrt() const {
    #if defined(SK_CPU_ARM64)
        return vsqrtq_f32(fVec);
    #else
        const SkNf inverseRoot = this->rsqrt2();
        return *this * inverseRoot;
    #endif
    }

    // Extracts lane k (k must be in 0..3).  The mask keeps the lane index a
    // valid constant even when the assert is compiled out.
    template <int k> float kth() const {
        SkASSERT(0 <= k && k < 4);
        return vgetq_lane_f32(fVec, k&3);
    }

    float32x4_t fVec;
};
308c9adb05b64fa0bfadf9d1a782afcda470da68c9emtklein
// Eight-lane uint16_t vector stored as a uint16x8_t.
template <>
class SkNi<8, uint16_t> {
public:
    // Leaves fVec uninitialized.
    SkNi() {}

    // Wraps an existing vector (implicit, so intrinsic results convert directly).
    SkNi(const uint16x8_t& vec) : fVec(vec) {}

    // Broadcasts val to all eight lanes.
    explicit SkNi(uint16_t val) : fVec(vdupq_n_u16(val)) {}

    // Lanes 0..7 = a..h, in argument order.
    SkNi(uint16_t a, uint16_t b, uint16_t c, uint16_t d,
         uint16_t e, uint16_t f, uint16_t g, uint16_t h) {
        fVec = (uint16x8_t) { a, b, c, d,
                              e, f, g, h };
    }

    // Reads eight values starting at vals.
    static SkNi Load(const uint16_t vals[8]) { return vld1q_u16(vals); }

    // Writes all eight lanes to vals[0..7].
    void store(uint16_t vals[8]) const { vst1q_u16(vals, fVec); }

    SkNi operator + (const SkNi& o) const {
        return vaddq_u16(fVec, o.fVec);
    }
    SkNi operator - (const SkNi& o) const {
        return vsubq_u16(fVec, o.fVec);
    }
    SkNi operator * (const SkNi& o) const {
        return vmulq_u16(fVec, o.fVec);
    }

    // Shift every lane by a runtime count.  SHIFT16 dispatches counts 1..15 to
    // the immediate-shift intrinsic; any other count yields this vector as-is.
    SkNi operator << (int bits) const {
        SHIFT16(vshlq_n_u16, fVec, bits);
    }
    SkNi operator >> (int bits) const {
        SHIFT16(vshrq_n_u16, fVec, bits);
    }

    // Lane-wise minimum.
    static SkNi Min(const SkNi& a, const SkNi& b) {
        return vminq_u16(a.fVec, b.fVec);
    }

    // Extracts lane k (k must be in 0..7).  The mask keeps the lane index a
    // valid constant even when the assert is compiled out.
    template <int k> uint16_t kth() const {
        SkASSERT(0 <= k && k < 8);
        return vgetq_lane_u16(fVec, k&7);
    }

    uint16x8_t fVec;
};
341d2ffd36eb62e99abe2920369d1e040954cc2044fmtklein
// Sixteen-lane uint8_t vector stored as a uint8x16_t.
template <>
class SkNi<16, uint8_t> {
public:
    // Wraps an existing vector (implicit, so intrinsic results convert directly).
    SkNi(const uint8x16_t& vec) : fVec(vec) {}

    // Leaves fVec uninitialized.
    SkNi() {}

    // Broadcasts val to all sixteen lanes.
    explicit SkNi(uint8_t val) : fVec(vdupq_n_u8(val)) {}

    // Reads sixteen bytes starting at vals.
    static SkNi Load(const uint8_t vals[16]) { return vld1q_u8(vals); }

    // Lanes 0..15 = a..p, in argument order.
    SkNi(uint8_t a, uint8_t b, uint8_t c, uint8_t d,
         uint8_t e, uint8_t f, uint8_t g, uint8_t h,
         uint8_t i, uint8_t j, uint8_t k, uint8_t l,
         uint8_t m, uint8_t n, uint8_t o, uint8_t p) {
        fVec = (uint8x16_t) { a,b,c,d, e,f,g,h, i,j,k,l, m,n,o,p };
    }

    // Writes all sixteen lanes to vals[0..15].
    void store(uint8_t vals[16]) const { vst1q_u8(vals, fVec); }

    // Lane-wise saturating add (vqaddq_u8).
    SkNi saturatedAdd(const SkNi& o) const { return vqaddq_u8(fVec, o.fVec); }

    SkNi operator + (const SkNi& o) const { return vaddq_u8(fVec, o.fVec); }
    SkNi operator - (const SkNi& o) const { return vsubq_u8(fVec, o.fVec); }
    SkNi operator * (const SkNi& o) const { return vmulq_u8(fVec, o.fVec); }

    // Shift every lane by a runtime count.  SHIFT8 dispatches counts 1..7 to
    // the immediate-shift intrinsic; any other count yields this vector as-is.
    SkNi operator << (int bits) const { SHIFT8(vshlq_n_u8, fVec, bits); }
    SkNi operator >> (int bits) const { SHIFT8(vshrq_n_u8, fVec, bits); }

    // Lane-wise minimum.
    static SkNi Min(const SkNi& a, const SkNi& b) { return vminq_u8(a.fVec, b.fVec); }

    // Extracts lane k (k must be in 0..15).
    //
    // The bound is 16 so that the last lane is accepted, and the mask is 15
    // (0b1111) so that every valid k maps to itself.  A mask of 16 would turn
    // every k in 0..15 into 0 and always read lane 0.  The mask also keeps the
    // lane index a valid constant even when the assert is compiled out.
    template <int k> uint8_t kth() const {
        SkASSERT(0 <= k && k < 16);
        return vgetq_lane_u8(fVec, k&15);
    }

    uint8x16_t fVec;
};
378d2ffd36eb62e99abe2920369d1e040954cc2044fmtklein
379d2ffd36eb62e99abe2920369d1e040954cc2044fmtklein#undef SHIFT32
380d2ffd36eb62e99abe2920369d1e040954cc2044fmtklein#undef SHIFT16
381d2ffd36eb62e99abe2920369d1e040954cc2044fmtklein#undef SHIFT8
382d2ffd36eb62e99abe2920369d1e040954cc2044fmtklein
383c9adb05b64fa0bfadf9d1a782afcda470da68c9emtklein#endif//SkNx_neon_DEFINED
384