// Copyright 2015 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// unpack_neon.h: optimized NEON specializations of the templates in unpack.h.

#ifndef GEMMLOWP_INTERNAL_UNPACK_NEON_H_
#define GEMMLOWP_INTERNAL_UNPACK_NEON_H_

#include "output_neon.h"
#include "unpack.h"

#include <arm_neon.h>

namespace gemmlowp {

template <std::uint32_t numerator, std::uint32_t denominator>
int32x4_t RoundingMultiplyByConstantFraction(int32x4_t x) {
  static_assert(numerator > 0 && denominator > 0,
                "only supporting positive num/denom");

  if (numerator == denominator) {
    return x;
  }

  static const std::int32_t int_quotient =
      (numerator + denominator / 2) / denominator;
  static const std::int32_t remaining_numerator =
      numerator - int_quotient * denominator;
  static const std::int32_t scaled_remaining_numerator =
      static_cast<std::int32_t>(
          (static_cast<std::int64_t>(remaining_numerator) * (1ll << 31)) /
          denominator);
  // Note: vqrdmulh instruction is rounding doubling multiply high.
  const int32x4_t remaining_product =
      vqrdmulhq_n_s32(x, scaled_remaining_numerator);

  return vmlaq_n_s32(remaining_product, x, int_quotient);
}
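
// Worked example of the constants above, assuming a 7-bit LHS depth so that
// kLhsMax = 127 (this particular depth is just an illustration): for
// numerator = 255 and denominator = 127,
//   int_quotient               = (255 + 63) / 127 = 2
//   remaining_numerator        = 255 - 2 * 127    = 1
//   scaled_remaining_numerator = (1 << 31) / 127  = 16909320
// so the result approximates x * 255 / 127 (~= 2.0079 * x) as
//   x * 2 + round(x * 16909320 / 2^31).
//
// The following scalar sketch spells out the same arithmetic lane-wise. It is
// exposition only: it is not used by the NEON path below and its name is not
// part of gemmlowp. It models vqrdmulhq_n_s32 per lane (rounding doubling
// multiply returning the high 32 bits), ignoring the saturating corner case
// x == multiplier == INT32_MIN, which cannot arise for these constants.
template <std::uint32_t numerator, std::uint32_t denominator>
std::int32_t ScalarRoundingMultiplyByConstantFractionSketch(std::int32_t x) {
  static const std::int32_t int_quotient =
      (numerator + denominator / 2) / denominator;
  static const std::int32_t remaining_numerator =
      numerator - int_quotient * denominator;
  static const std::int32_t scaled_remaining_numerator =
      static_cast<std::int32_t>(
          (static_cast<std::int64_t>(remaining_numerator) * (1ll << 31)) /
          denominator);
  // Scalar analogue of vqrdmulhq_n_s32: round-to-nearest of
  // x * scaled_remaining_numerator / 2^31, using the same rounding bias
  // as the instruction.
  const std::int64_t product =
      static_cast<std::int64_t>(x) * scaled_remaining_numerator;
  const std::int32_t remaining_product =
      static_cast<std::int32_t>((product + (1ll << 30)) >> 31);
  // Scalar analogue of vmlaq_n_s32: accumulate the integer-quotient part.
  return remaining_product + x * int_quotient;
}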

template <typename tScalar, VectorShape tShape>
int32x4_t get_int32x4_t_and_inc(
    ConstIterator<VectorMap<tScalar, tShape>>* iterator) {
  const int32x4_t result = vld1q_s32(iterator->get());
  *iterator += 4;
  return result;
}

template <typename tScalar, VectorShape tShape>
int32x4_t get_int32x4_t_and_inc(
    ConstIterator<VectorDup<tScalar, tShape>>* iterator) {
  const int32x4_t result = vdupq_n_s32(**iterator);
  // Increment really does nothing for VectorDup.
  *iterator += 4;
  return result;
}

template <typename BitDepthParams, typename PackedResultType,
          typename OutputScalar, typename LhsOffset, typename RhsOffset,
          typename OutputPipelineType>
struct UnpackResultImpl<BitDepthParams,
                        MatrixMap<OutputScalar, MapOrder::ColMajor>,
                        PackedResultType, LhsOffset, RhsOffset,
                        OutputPipelineType> {
  typedef MatrixMap<OutputScalar, MapOrder::ColMajor> ResultBlockType;
  static void Unpack(ResultBlockType* dst, const PackedResultType& src,
                     int depth, const std::int32_t* lhs_sums_of_each_slice,
                     const std::int32_t* rhs_sums_of_each_slice,
                     const LhsOffset& lhs_offset, const RhsOffset& rhs_offset,
                     const OutputPipelineType& output_pipeline) {
    ScopedProfilingLabel label("optimized path (NEON)");
    const int kLhsBits = BitDepthParams::LhsBitDepth::kBits;
    const int kRhsBits = BitDepthParams::RhsBitDepth::kBits;
    const std::int32_t kLhsMax = (1 << kLhsBits) - 1;
    const std::int32_t kRhsMax = (1 << kRhsBits) - 1;
    auto src_map = src.Map();
    OutputPipelineExecutor<OutputPipelineType, FragmentInt32x1x1>
        output_pipeline_executor_int32x1x1(output_pipeline);
    OutputPipelineExecutor<OutputPipelineType, NEONFragmentInt32x4x1>
        output_pipeline_executor_int32x4x1(output_pipeline);
    OutputPipelineExecutor<OutputPipelineType, NEONFragmentInt32x16x1>
        output_pipeline_executor_int32x16x1(output_pipeline);

    for (int c = 0; c < dst->cols(); c++) {
      const std::int32_t* src_ptr = src_map.data(0, c);
      const std::int32_t* sums_of_each_slice_ptr = lhs_sums_of_each_slice;
      auto lhs_offset_iter = const_iterator(lhs_offset);
      const std::int32_t rhs_offset_c = rhs_offset(c);
      const std::int32_t rhs_sums_of_each_slice_c = rhs_sums_of_each_slice[c];
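
      // Each destination entry decomposes, as in the generic code in
      // unpack.h, into the sum of four terms:
      //   sum_d (lhs(r, d) + lhs_offset(r)) * (rhs(d, c) + rhs_offset(c))
      //     = term_xx + term_x1 + term_1x + term_11
      // where
      //   term_xx = sum_d lhs(r, d) * rhs(d, c)      (the packed accumulators)
      //   term_x1 = rhs_offset(c) * sum_d lhs(r, d)  (lhs_sums_of_each_slice)
      //   term_1x = lhs_offset(r) * sum_d rhs(d, c)  (rhs_sums_of_each_slice)
      //   term_11 = lhs_offset(r) * rhs_offset(c) * depth.
      // The raw_* values in the loops below hold these terms at the packed
      // bit depth; the RoundingMultiplyByConstantFraction calls rescale them
      // from the [0, kLhsMax] / [0, kRhsMax] ranges back to the 8-bit
      // [0, 255] scale.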
      // Handle 16 values at once for higher performance.
      int dst_rows_aligned16 = RoundDown<16>(dst->rows());
      for (int r = 0; r < dst_rows_aligned16; r += 16) {
        // Compute the sum of the 4 terms,
        //   q = term_xx + term_x1 + term_1x + term_11
        // Refer to the generic code in unpack.h.
        int32x4_t raw_xx[4];
        for (int i = 0; i < 4; i++) {
          raw_xx[i] = vld1q_s32(src_ptr);
          src_ptr += 4;
        }
        int32x4_t raw_x1[4];
        for (int i = 0; i < 4; i++) {
          const int32x4_t sum_x1 = vld1q_s32(sums_of_each_slice_ptr);
          raw_x1[i] = vmulq_n_s32(sum_x1, rhs_offset_c);
          sums_of_each_slice_ptr += 4;
        }
        int32x4_t raw_1x[4];
        int32x4_t term_11[4];
        for (int i = 0; i < 4; i++) {
          const int32x4_t lhs_offsets = get_int32x4_t_and_inc(&lhs_offset_iter);
          raw_1x[i] = vmulq_n_s32(lhs_offsets, rhs_sums_of_each_slice_c);
          term_11[i] = vmulq_n_s32(lhs_offsets, rhs_offset_c * depth);
        }
        int32x4_t term_xx[4];
        for (int i = 0; i < 4; i++) {
          term_xx[i] =
              RoundingMultiplyByConstantFraction<255 * 255, kLhsMax * kRhsMax>(
                  raw_xx[i]);
        }
        int32x4_t term_x1[4];
        for (int i = 0; i < 4; i++) {
          term_x1[i] =
              RoundingMultiplyByConstantFraction<255, kLhsMax>(raw_x1[i]);
        }
        int32x4_t term_1x[4];
        for (int i = 0; i < 4; i++) {
          term_1x[i] =
              RoundingMultiplyByConstantFraction<255, kRhsMax>(raw_1x[i]);
        }
        int32x4x4_t q;
        for (int i = 0; i < 4; i++) {
          q.val[i] = vaddq_s32(vaddq_s32(term_xx[i], term_x1[i]),
                               vaddq_s32(term_1x[i], term_11[i]));
        }
        NEONFragmentInt32x16x1 f(q);
        output_pipeline_executor_int32x16x1.Execute(f, dst, r, c);
      }
      // We have finished handling groups of 16 entries at once; now
      // try to handle 4 entries at once.
      int dst_rows_aligned4 = RoundDown<4>(dst->rows());
      for (int r = dst_rows_aligned16; r < dst_rows_aligned4; r += 4) {
        // Compute the sum of the 4 terms,
        //   q = term_xx + term_x1 + term_1x + term_11
        // Refer to the generic code in unpack.h.
        const int32x4_t raw_xx = vld1q_s32(src_ptr);
        src_ptr += 4;
        const int32x4_t term_xx =
            RoundingMultiplyByConstantFraction<255 * 255, kLhsMax * kRhsMax>(
                raw_xx);
        const int32x4_t sum_x1 = vld1q_s32(sums_of_each_slice_ptr);
        const int32x4_t raw_x1 = vmulq_n_s32(sum_x1, rhs_offset_c);
        sums_of_each_slice_ptr += 4;
        const int32x4_t term_x1 =
            RoundingMultiplyByConstantFraction<255, kLhsMax>(raw_x1);
        const int32x4_t lhs_offsets = get_int32x4_t_and_inc(&lhs_offset_iter);
        const int32x4_t raw_1x =
            vmulq_n_s32(lhs_offsets, rhs_sums_of_each_slice_c);
        const int32x4_t term_1x =
            RoundingMultiplyByConstantFraction<255, kRhsMax>(raw_1x);
        const int32x4_t term_11 =
            vmulq_n_s32(lhs_offsets, rhs_offset_c * depth);
        int32x4_t q = vaddq_s32(vaddq_s32(term_xx, term_x1),
                                vaddq_s32(term_1x, term_11));
        NEONFragmentInt32x4x1 f(q);
        output_pipeline_executor_int32x4x1.Execute(f, dst, r, c);
      }
      // We have finished handling 4 entries at once; now handle
      // remaining entries one by one. This scalar code is similar
      // to the code in unpack.h, see comments there.
      for (int r = dst_rows_aligned4; r < dst->rows(); r++) {
        const std::int32_t raw_xx = src_map(r, c);
        const std::int32_t raw_x1 = lhs_sums_of_each_slice[r] * rhs_offset_c;
        const std::int32_t raw_1x = rhs_sums_of_each_slice_c * lhs_offset(r);
        const std::int32_t term_xx =
            RoundingMultiplyByConstantFraction<255 * 255, kLhsMax * kRhsMax>(
                raw_xx);
        const std::int32_t term_x1 =
            RoundingMultiplyByConstantFraction<255, kLhsMax>(raw_x1);
        const std::int32_t term_1x =
            RoundingMultiplyByConstantFraction<255, kRhsMax>(raw_1x);
        const std::int32_t term_11 = lhs_offset(r) * rhs_offset(c) * depth;
        FragmentInt32x1x1 sum = term_xx + term_x1 + term_1x + term_11;
        output_pipeline_executor_int32x1x1.Execute(sum, dst, r, c);
      }
    }
  }
};

}  // namespace gemmlowp

#endif  // GEMMLOWP_INTERNAL_UNPACK_NEON_H_