// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// fixedpoint_neon.h: optimized NEON specializations of the templates
// in fixedpoint.h.
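//
// This header is not meant to be included directly: fixedpoint.h pulls it
// in when building for NEON. A rough usage sketch, assuming the FixedPoint
// wrapper from fixedpoint.h (values are illustrative only):
//
//   using F0 = gemmlowp::FixedPoint<int32x4_t, 0>;  // 4 lanes of Q0.31
//   F0 half = F0::FromScalarRaw(1 << 30);           // ~0.5 in every lane
//   auto quarter = half * half;  // uses SaturatingRoundingDoublingHighMul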

#ifndef GEMMLOWP_INTERNAL_FIXEDPOINT_NEON_H_
#define GEMMLOWP_INTERNAL_FIXEDPOINT_NEON_H_

#include <arm_neon.h>
#include <cstdint>

namespace gemmlowp {

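// Trait specialization mapping the NEON int32x4_t vector type to its
// scalar lane type (std::int32_t) and lane count (4).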
template <>
struct FixedPointRawTypeTraits<int32x4_t> {
  typedef std::int32_t ScalarRawType;
  static const int kLanes = 4;
};

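// The basic bitwise operations map one-to-one onto NEON intrinsics.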
template <>
inline int32x4_t BitAnd(int32x4_t a, int32x4_t b) {
  return vandq_s32(a, b);
}

template <>
inline int32x4_t BitOr(int32x4_t a, int32x4_t b) {
  return vorrq_s32(a, b);
}

template <>
inline int32x4_t BitXor(int32x4_t a, int32x4_t b) {
  return veorq_s32(a, b);
}

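// Bitwise NOT, expressed as XOR with an all-ones vector.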
template <>
inline int32x4_t BitNot(int32x4_t a) {
  return veorq_s32(a, vdupq_n_s32(-1));
}

template <>
inline int32x4_t Add(int32x4_t a, int32x4_t b) {
  return vaddq_s32(a, b);
}

template <>
inline int32x4_t Sub(int32x4_t a, int32x4_t b) {
  return vsubq_s32(a, b);
}

template <>
inline int32x4_t Neg(int32x4_t a) {
  return vnegq_s32(a);
}

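// vshlq_s32 shifts each lane by a per-lane signed amount taken from the
// second vector, so the runtime scalar offset is broadcast first.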
template <>
inline int32x4_t ShiftLeft(int32x4_t a, int offset) {
  return vshlq_s32(a, vdupq_n_s32(offset));
}

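// NEON has no variable-amount right-shift intrinsic; shifting left by the
// negated amount yields the arithmetic right shift.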
template <>
inline int32x4_t ShiftRight(int32x4_t a, int offset) {
  return vshlq_s32(a, vdupq_n_s32(-offset));
}

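// Bit-select: where a mask bit is set, take the bit from then_val,
// otherwise from else_val. The MaskIf* functions below produce per-lane
// all-zeros/all-ones masks, making this a per-lane select.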
template <>
inline int32x4_t SelectUsingMask(int32x4_t if_mask, int32x4_t then_val,
                                 int32x4_t else_val) {
  return vbslq_s32(vreinterpretq_u32_s32(if_mask), then_val, else_val);
}

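// NEON comparisons return unsigned all-zeros/all-ones lane masks;
// reinterpret them as signed to match the int32x4_t mask convention.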
template <>
inline int32x4_t MaskIfEqual(int32x4_t a, int32x4_t b) {
  return vreinterpretq_s32_u32(vceqq_s32(a, b));
}

template <>
inline int32x4_t MaskIfNotEqual(int32x4_t a, int32x4_t b) {
  return BitNot(MaskIfEqual(a, b));
}

template <>
inline int32x4_t MaskIfZero(int32x4_t a) {
  return MaskIfEqual(a, vdupq_n_s32(0));
}

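// vtstq_s32(a, a) sets a lane to all-ones iff (a & a) != 0, i.e. iff the
// lane is nonzero.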
template <>
inline int32x4_t MaskIfNonZero(int32x4_t a) {
  return vreinterpretq_s32_u32(vtstq_s32(a, a));
}

template <>
inline int32x4_t MaskIfGreaterThan(int32x4_t a, int32x4_t b) {
  return vreinterpretq_s32_u32(vcgtq_s32(a, b));
}

template <>
inline int32x4_t MaskIfGreaterThanOrEqual(int32x4_t a, int32x4_t b) {
  return vreinterpretq_s32_u32(vcgeq_s32(a, b));
}

template <>
inline int32x4_t MaskIfLessThan(int32x4_t a, int32x4_t b) {
  return vreinterpretq_s32_u32(vcltq_s32(a, b));
}

template <>
inline int32x4_t MaskIfLessThanOrEqual(int32x4_t a, int32x4_t b) {
  return vreinterpretq_s32_u32(vcleq_s32(a, b));
}

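// Horizontal reductions over mask vectors (each lane all-zeros or
// all-ones): two rotate-and-combine steps with vextq_s32 fold all four
// lanes into lane 0, which is then read out as the boolean result.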
template <>
inline bool All(int32x4_t a) {
  a = vandq_s32(a, vextq_s32(a, a, 1));
  a = vandq_s32(a, vextq_s32(a, a, 2));
  return vgetq_lane_s32(a, 0);
}

template <>
inline bool Any(int32x4_t a) {
  a = vorrq_s32(a, vextq_s32(a, a, 1));
  a = vorrq_s32(a, vextq_s32(a, a, 2));
  return vgetq_lane_s32(a, 0);
}

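// vrhaddq_s32 computes (a + b + 1) >> 1 per lane without overflowing the
// intermediate sum.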
template <>
inline int32x4_t RoundingHalfSum(int32x4_t a, int32x4_t b) {
  return vrhaddq_s32(a, b);
}

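// Saturating rounding doubling multiply-high maps directly onto
// vqrdmulhq_s32: the high 32 bits of 2*a*b with rounding, with the one
// overflow case a == b == INT32_MIN saturated to INT32_MAX.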
template <>
inline int32x4_t SaturatingRoundingDoublingHighMul(int32x4_t a, int32x4_t b) {
  return vqrdmulhq_s32(a, b);
}

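// vrshlq_s32 rounds ties upward (it adds 1 << (exponent - 1) before
// shifting), whereas RoundingDivideByPOT rounds ties away from zero.
// Subtracting 1 from negative lanes beforehand converts the former into
// the latter. ANDing x with shift_vec makes the fixup zero when
// exponent == 0 (shift_vec == 0) while preserving the sign bit whenever
// exponent >= 1 (shift_vec is then negative), and the saturating add
// keeps INT32_MIN lanes from wrapping.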
template <>
inline int32x4_t RoundingDivideByPOT(int32x4_t x, int exponent) {
  const int32x4_t shift_vec = vdupq_n_s32(-exponent);
  const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift_vec), 31);
  const int32x4_t fixed_up_x = vqaddq_s32(x, fixup);
  return vrshlq_s32(fixed_up_x, shift_vec);
}

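// Positive exponents: a saturating left shift by an immediate.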
template <int Exponent>
struct ImplSaturatingRoundingMultiplyByPOT<Exponent, int32x4_t, 1> {
  static int32x4_t eval(int32x4_t x) { return vqshlq_n_s32(x, Exponent); }
};

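// Negative exponents: the same negative-lane fixup as in
// RoundingDivideByPOT above (unconditional here, since the right-shift
// amount is at least 1), followed by a rounding right shift by an
// immediate.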
template <int Exponent>
struct ImplSaturatingRoundingMultiplyByPOT<Exponent, int32x4_t, -1> {
  static int32x4_t eval(int32x4_t x) {
    const int32x4_t fixup = vshrq_n_s32(x, 31);
    const int32x4_t fixed_up_x = vqaddq_s32(x, fixup);
    return vrshrq_n_s32(fixed_up_x, -Exponent);
  }
};

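// Broadcast a scalar into all four lanes.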
template <>
inline int32x4_t Dup<int32x4_t>(std::int32_t x) {
  return vdupq_n_s32(x);
}

}  // end namespace gemmlowp

#endif  // GEMMLOWP_INTERNAL_FIXEDPOINT_NEON_H_