// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// fixedpoint_neon.h: optimized NEON specializations of the templates
// in fixedpoint.h.

#ifndef GEMMLOWP_INTERNAL_FIXEDPOINT_NEON_H_
#define GEMMLOWP_INTERNAL_FIXEDPOINT_NEON_H_

#include <arm_neon.h>

#include <cstdint>  // include-what-you-use: std::int32_t below

namespace gemmlowp {

// Raw-type traits for the NEON 128-bit vector of four int32 lanes,
// letting the generic fixedpoint.h templates treat int32x4_t as a
// 4-lane SIMD counterpart of std::int32_t.
template <>
struct FixedPointRawTypeTraits<int32x4_t> {
  typedef std::int32_t ScalarRawType;
  static const int kLanes = 4;
};

template <>
inline int32x4_t BitAnd(int32x4_t a, int32x4_t b) {
  return vandq_s32(a, b);
}

template <>
inline int32x4_t BitOr(int32x4_t a, int32x4_t b) {
  return vorrq_s32(a, b);
}

template <>
inline int32x4_t BitXor(int32x4_t a, int32x4_t b) {
  return veorq_s32(a, b);
}

template <>
inline int32x4_t BitNot(int32x4_t a) {
  // XOR with all-ones flips every bit.
  return veorq_s32(a, vdupq_n_s32(-1));
}

template <>
inline int32x4_t Add(int32x4_t a, int32x4_t b) {
  return vaddq_s32(a, b);
}

template <>
inline int32x4_t Sub(int32x4_t a, int32x4_t b) {
  return vsubq_s32(a, b);
}

template <>
inline int32x4_t Neg(int32x4_t a) {
  return vnegq_s32(a);
}

template <>
inline int32x4_t ShiftLeft(int32x4_t a, int offset) {
  return vshlq_s32(a, vdupq_n_s32(offset));
}

template <>
inline int32x4_t ShiftRight(int32x4_t a, int offset) {
  // NEON has no variable right-shift; shift left by a negative amount.
  return vshlq_s32(a, vdupq_n_s32(-offset));
}

// Per-bit select: wherever if_mask has a 1 bit, take the bit from
// then_val, otherwise from else_val. Masks here are all-ones /
// all-zeros per lane, as produced by the MaskIf* functions below.
template <>
inline int32x4_t SelectUsingMask(int32x4_t if_mask, int32x4_t then_val,
                                 int32x4_t else_val) {
  return vbslq_s32(vreinterpretq_u32_s32(if_mask), then_val, else_val);
}

template <>
inline int32x4_t MaskIfEqual(int32x4_t a, int32x4_t b) {
  return vreinterpretq_s32_u32(vceqq_s32(a, b));
}

template <>
inline int32x4_t MaskIfNotEqual(int32x4_t a, int32x4_t b) {
  return BitNot(MaskIfEqual(a, b));
}

template <>
inline int32x4_t MaskIfZero(int32x4_t a) {
  return MaskIfEqual(a, vdupq_n_s32(0));
}

template <>
inline int32x4_t MaskIfNonZero(int32x4_t a) {
  // vtst sets a lane to all-ones iff (a & a) != 0, i.e. iff a != 0.
  return vreinterpretq_s32_u32(vtstq_s32(a, a));
}

template <>
inline int32x4_t MaskIfGreaterThan(int32x4_t a, int32x4_t b) {
  return vreinterpretq_s32_u32(vcgtq_s32(a, b));
}

template <>
inline int32x4_t MaskIfGreaterThanOrEqual(int32x4_t a, int32x4_t b) {
  return vreinterpretq_s32_u32(vcgeq_s32(a, b));
}

template <>
inline int32x4_t MaskIfLessThan(int32x4_t a, int32x4_t b) {
  return vreinterpretq_s32_u32(vcltq_s32(a, b));
}

template <>
inline int32x4_t MaskIfLessThanOrEqual(int32x4_t a, int32x4_t b) {
  return vreinterpretq_s32_u32(vcleq_s32(a, b));
}

// True iff every lane of the mask is non-zero. Reduces the 4 lanes
// with two AND steps, rotating by 1 then 2 lanes via vext, so lane 0
// ends up holding the AND of all four lanes.
template <>
inline bool All(int32x4_t a) {
  a = vandq_s32(a, vextq_s32(a, a, 1));
  a = vandq_s32(a, vextq_s32(a, a, 2));
  return vgetq_lane_s32(a, 0);
}

// True iff any lane of the mask is non-zero; same rotate-and-combine
// reduction as All, with OR instead of AND.
template <>
inline bool Any(int32x4_t a) {
  a = vorrq_s32(a, vextq_s32(a, a, 1));
  a = vorrq_s32(a, vextq_s32(a, a, 2));
  return vgetq_lane_s32(a, 0);
}

template <>
inline int32x4_t RoundingHalfSum(int32x4_t a, int32x4_t b) {
  return vrhaddq_s32(a, b);
}

template <>
inline int32x4_t SaturatingRoundingDoublingHighMul(int32x4_t a, int32x4_t b) {
  return vqrdmulhq_s32(a, b);
}

template <>
inline int32x4_t RoundingDivideByPOT(int32x4_t x, int exponent) {
  const int32x4_t shift_vec = vdupq_n_s32(-exponent);
  // vrshl's rounding right shift rounds ties upward. For negative x
  // (detected via the sign bit surviving the AND with shift_vec, then
  // smeared across the lane by the arithmetic shift by 31), first
  // subtract 1 (saturating) so that ties round away from zero instead,
  // matching the scalar RoundingDivideByPOT in fixedpoint.h.
  const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift_vec), 31);
  const int32x4_t fixed_up_x = vqaddq_s32(x, fixup);
  return vrshlq_s32(fixed_up_x, shift_vec);
}

// Multiply by 2^Exponent, Exponent > 0: saturating left shift.
template <int Exponent>
struct ImplSaturatingRoundingMultiplyByPOT<Exponent, int32x4_t, 1> {
  static int32x4_t eval(int32x4_t x) { return vqshlq_n_s32(x, Exponent); }
};

// Multiply by 2^Exponent, Exponent < 0: rounding right shift, with the
// same negative-input fixup as RoundingDivideByPOT so ties round away
// from zero.
template <int Exponent>
struct ImplSaturatingRoundingMultiplyByPOT<Exponent, int32x4_t, -1> {
  static int32x4_t eval(int32x4_t x) {
    const int32x4_t fixup = vshrq_n_s32(x, 31);
    const int32x4_t fixed_up_x = vqaddq_s32(x, fixup);
    return vrshrq_n_s32(fixed_up_x, -Exponent);
  }
};

template <>
inline int32x4_t Dup<int32x4_t>(std::int32_t x) {
  return vdupq_n_s32(x);
}

}  // end namespace gemmlowp

#endif  // GEMMLOWP_INTERNAL_FIXEDPOINT_NEON_H_