// Copyright 2015 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// pack_neon.h: optimized NEON specializations of the templates in pack.h.

#ifndef GEMMLOWP_INTERNAL_PACK_NEON_H_
#define GEMMLOWP_INTERNAL_PACK_NEON_H_

#include "pack.h"

#include <arm_neon.h>

namespace gemmlowp {

template <RoundingMode tRoundingMode>
class NEONRoundingOffsetGenerator {
 public:
  uint8x16_t get() {
    assert(false);  // This generic path should never be called.
    return vdupq_n_u8(0);
  }
};

// A RoundingOffsetGenerator for rounding-to-nearest, always returning
// the midpoint value 127.
template <>
class NEONRoundingOffsetGenerator<RoundingMode::Nearest> {
 public:
  uint8x16_t get() { return vdupq_n_u8(127); }
};

// Variant of NEONRoundingOffsetGenerator that produces
// random NEON 128-bit vectors using an 8-bit Xorshift.
template <>
class NEONRoundingOffsetGenerator<RoundingMode::ProbabilisticXorshift> {
 public:
  NEONRoundingOffsetGenerator() {
    uint8_t s = 128;
    std::uint8_t a[16];
    for (int i = 0; i < 16; i++) {
      a[i] = s;
      // Xorshift8(7,7,1). Very important to choose a different
      // xorshift than we do in get(), otherwise lanes would contain
      // the same values!
      s ^= s << 7;
      s ^= s >> 7;
      s ^= s << 1;
    }
    x_ = vld1q_u8(a);
  }

  uint8x16_t get() {
    // Xorshift produces values in [1..255], we want [0..254].
    uint8x16_t result = vsubq_u8(x_, vdupq_n_u8(1));
    // Xorshift8(7,5,3)
    x_ = veorq_u8(x_, vshlq_n_u8(x_, 7));
    x_ = veorq_u8(x_, vshrq_n_u8(x_, 5));
    x_ = veorq_u8(x_, vshlq_n_u8(x_, 3));
    return result;
  }

 private:
  // State
  uint8x16_t x_;
};

// Variant of NEONRoundingOffsetGenerator that produces
// rounding vectors using an 8-bit add/mod low-discrepancy sequence.
template <>
class NEONRoundingOffsetGenerator<RoundingMode::ProbabilisticAddmod> {
 public:
  NEONRoundingOffsetGenerator() {
    uint8_t s = 128;
    std::uint8_t a[16];
    // The initial offset is set by offsetting each lane to one
    // more iteration of the sequence (s0...s15). Then, upon iteration,
    // each lane moves ahead by 16.
    for (int i = 0; i < 16; i++) {
      a[i] = s;
      s += (97 + (s >= 158));
    }
    x_ = vld1q_u8(a);
  }

  uint8x16_t get() {
    // Get moves the lane ahead by 16 iterations of the sequence
    // x_ = (x + (16*97)) % 255.  (16*97)%255 = 22.  255-22=233,
    // so x_ += (22 + (x >= 233)).
    // There's an excessively opaque bit hack here:
    // A "true" compare on NEON produces an all-1s result (0xff).
    // So instead of adding in the comparison result, we subtract it
    // to get the same effect as adding 1.
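    // Concretely: the compare mask is 0xff in lanes where x_ >= 233, and
    // subtracting 0xff is the same as adding 1 modulo 256, which supplies
    // the extra wrap-around increment needed by the mod-255 step.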
    uint8x16_t extra_one = vcgeq_u8(x_, vdupq_n_u8(233));
    x_ = vaddq_u8(x_, vdupq_n_u8(22));
    x_ = vsubq_u8(x_, extra_one);
    return x_;
  }

 private:
  // State
  uint8x16_t x_;
};

// Requantizes source uint8 values in [0..255] range
// to the range specified by BitDepth, [0..((2^bits)-1)].
// Bias must be avoided. Currently this is achieved
// by probabilistic rounding.
template <typename QuantizationParams>
uint8x16_t Requantize(
    uint8x16_t raw_src_data,
    NEONRoundingOffsetGenerator<QuantizationParams::kRoundingMode>*
        rounding_offset_generator) {
  static const int kBits = QuantizationParams::BitDepth::kBits;
  static const std::uint8_t kMaxVal = (1 << kBits) - 1;

  if (kBits == 8) {
    return raw_src_data;
  }

  uint8x16_t rounding_offset = rounding_offset_generator->get();

  // Compute:
  //   x = maxval * src + rounding_offset
  uint16x8_t x[2];
  const uint8x8_t maxval_dup = vdup_n_u8(kMaxVal);
  x[0] = vmlal_u8(vmovl_u8(vget_low_u8(rounding_offset)), maxval_dup,
                  vget_low_u8(raw_src_data));
  x[1] = vmlal_u8(vmovl_u8(vget_high_u8(rounding_offset)), maxval_dup,
                  vget_high_u8(raw_src_data));

  // Divide by 255 (truncating).
  //
  // Here we use the following formula, valid for all integers y in 0..65534
  // (which is more than we need since we've already early-returned
  // if kBits==8).
  //
  //   y/255 = (y + 1 + (y >> 8)) >> 8.
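  //
  // For example, with kBits == 7 the largest possible value here is
  // y = 127*255 + 254 = 32639: then y >> 8 = 127, and
  // (32639 + 1 + 127) >> 8 = 32767 >> 8 = 127, which equals 32639/255
  // truncated.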
  uint8x8_t result[2];
  for (int i = 0; i < 2; i++) {
    result[i] = vshrn_n_u16(
        vaddq_u16(vaddq_u16(x[i], vdupq_n_u16(1)), vshrq_n_u16(x[i], 8)), 8);
  }

  return vcombine_u8(result[0], result[1]);
}

typedef SideMap<const std::uint8_t, SideMapOrder::WidthMajor>
    WidthMajorUint8SideMap;

template <int Cells>
using DepthMajorSideFormatNCells4x2 = KernelSideFormat<CellFormat<4, 2>, Cells>;

template <typename QuantizationParams, int Cells>
class PackingRegisterBlock<
    QuantizationParams, WidthMajorUint8SideMap,
    PackedSideBlock<DepthMajorSideFormatNCells4x2<Cells> > >
    : public PackingRegisterBlockBase<
          QuantizationParams, WidthMajorUint8SideMap,
          PackedSideBlock<DepthMajorSideFormatNCells4x2<Cells> > > {
 public:
  typedef DepthMajorSideFormatNCells4x2<Cells> KernelSideFormat;
  typedef typename KernelSideFormat::Cell CellFormat;
  static const int kCells = KernelSideFormat::kCells;
  static const int kCellWidth = CellFormat::kWidth;
  static const int kKernelWidth = CellFormat::kWidth * kCells;
  static const int kCellDepth = CellFormat::kDepth;
  static const int kCellSize = CellFormat::kSize;

  typedef NEONRoundingOffsetGenerator<QuantizationParams::kRoundingMode>
      RoundingOffsetGenerator;

  void Pack(PackedSideBlock<KernelSideFormat>* dst, int start_width,
            RoundingOffsetGenerator* rounding_offset_generator) {
    std::uint8_t* dst_ptr = dst->current_data();
    const std::uint8_t* const src_ptr = this->complete_src_.data();
    const int stride = this->complete_src_.stride();
    // Load and requantize source WidthMajor data
    uint8x16_t src_lines[4 * kCells];
    for (int i = 0; i < 4 * kCells; i++) {
      src_lines[i] = Requantize<QuantizationParams>(
          vld1q_u8(src_ptr + i * stride), rounding_offset_generator);
    }
    // Reorder the data within registers to make DepthMajor 4x2 cells
    uint8x16x2_t src_lines_intertwined_2x[2 * kCells];
    for (int i = 0; i < kCells; i++) {
      src_lines_intertwined_2x[2 * i] =
          vzipq_u8(src_lines[4 * i], src_lines[4 * i + 2]);
      src_lines_intertwined_2x[2 * i + 1] =
          vzipq_u8(src_lines[4 * i + 1], src_lines[4 * i + 3]);
    }
    uint8x16x2_t src_lines_intertwined_4x[2 * kCells];
    for (int i = 0; i < kCells; i++) {
      src_lines_intertwined_4x[2 * i] =
          vzipq_u8(src_lines_intertwined_2x[2 * i].val[0],
                   src_lines_intertwined_2x[2 * i + 1].val[0]);
      src_lines_intertwined_4x[2 * i + 1] =
          vzipq_u8(src_lines_intertwined_2x[2 * i].val[1],
                   src_lines_intertwined_2x[2 * i + 1].val[1]);
    }
    // Store the resulting DepthMajor 4x2 cells in the destination packed block
    for (int outer = 0; outer < 2; outer++) {
      for (int inner = 0; inner < 2; inner++) {
        for (int cell = 0; cell < kCells; cell++) {
          uint8x8_t value = vget_low_u8(
              src_lines_intertwined_4x[2 * cell + outer].val[inner]);
          vst1_u8(dst_ptr, value);
          dst_ptr += 8;
        }
        for (int cell = 0; cell < kCells; cell++) {
          uint8x8_t value = vget_high_u8(
              src_lines_intertwined_4x[2 * cell + outer].val[inner]);
          vst1_u8(dst_ptr, value);
          dst_ptr += 8;
        }
      }
    }
    // Compute sums across the depth dimension
    uint16x8_t sums_of_2_cells[kCells][4];
    for (int outer = 0; outer < 2; outer++) {
      for (int inner = 0; inner < 2; inner++) {
        int i = 2 * outer + inner;
        for (int cell = 0; cell < kCells; cell++) {
          sums_of_2_cells[cell][i] = vaddl_u8(
              vget_low_u8(
                  src_lines_intertwined_4x[2 * cell + outer].val[inner]),
              vget_high_u8(
                  src_lines_intertwined_4x[2 * cell + outer].val[inner]));
        }
      }
    }
    int32x4_t sums_of_4_cells[kCells][4];
    for (int i = 0; i < 4; i++) {
      for (int cell = 0; cell < kCells; cell++) {
        sums_of_4_cells[cell][i] = vreinterpretq_s32_u32(
            vaddl_u16(vget_low_u16(sums_of_2_cells[cell][i]),
                      vget_high_u16(sums_of_2_cells[cell][i])));
      }
    }
    // Update the sums_of_each_slice vector
    for (int cell = 0; cell < kCells; cell++) {
      int32x4_t s01 =
          vaddq_s32(sums_of_4_cells[cell][0], sums_of_4_cells[cell][1]);
      int32x4_t s23 =
          vaddq_s32(sums_of_4_cells[cell][2], sums_of_4_cells[cell][3]);
      int32x4_t s = vaddq_s32(s01, s23);
      std::int32_t* sums_of_each_slice_ptr =
          dst->sums_of_each_slice() + start_width + 4 * cell;
      vst1q_s32(sums_of_each_slice_ptr,
                vaddq_s32(s, vld1q_s32(sums_of_each_slice_ptr)));
    }
    dst->seek_forward_n_cells(kCells * kRegisterSize / kCellDepth);
  }
};
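
// The specialization below packs the same WidthMajor source into WidthMajor
// 4x2 cells. It follows the same load/zip/store/sum structure as the
// DepthMajor case above, but zips 16-bit lanes so that each pair of
// depth-consecutive source bytes stays together within a cell.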
template <int Cells>
using WidthMajorSideFormatNCells4x2 =
    KernelSideFormat<CellFormat<4, 2, CellOrder::WidthMajor>, Cells>;

template <typename QuantizationParams, int Cells>
class PackingRegisterBlock<
    QuantizationParams, WidthMajorUint8SideMap,
    PackedSideBlock<WidthMajorSideFormatNCells4x2<Cells> > >
    : public PackingRegisterBlockBase<
          QuantizationParams, WidthMajorUint8SideMap,
          PackedSideBlock<WidthMajorSideFormatNCells4x2<Cells> > > {
 public:
  typedef WidthMajorSideFormatNCells4x2<Cells> KernelSideFormat;
  typedef typename KernelSideFormat::Cell CellFormat;
  static const int kCells = KernelSideFormat::kCells;
  static const int kCellWidth = CellFormat::kWidth;
  static const int kKernelWidth = CellFormat::kWidth * kCells;
  static const int kCellDepth = CellFormat::kDepth;
  static const int kCellSize = CellFormat::kSize;

  typedef NEONRoundingOffsetGenerator<QuantizationParams::kRoundingMode>
      RoundingOffsetGenerator;

  void Pack(PackedSideBlock<KernelSideFormat>* dst, int start_width,
            RoundingOffsetGenerator* rounding_offset_generator) {
    std::uint8_t* dst_ptr = dst->current_data();
    const std::uint8_t* src_ptr = this->complete_src_.data();
    const int stride = this->complete_src_.stride();
    // Load and requantize source WidthMajor data
    uint16x8_t src_lines[kCells * 4];
    for (int i = 0; i < kCells; i++) {
      // This packing path is used with our current
      // less-than-8-bit kernel, and the partial unrolling of this loop
      // results in substantially faster code (thanks to better
      // register allocation) on Nexus 5.

#define GEMMLOWP_UNROLLED_LOOP_ITER(k)                                        \
  src_lines[4 * i + k] = vreinterpretq_u16_u8(Requantize<QuantizationParams>( \
      vld1q_u8(src_ptr), rounding_offset_generator));                         \
  src_ptr += stride;

      GEMMLOWP_UNROLLED_LOOP_ITER(0)
      GEMMLOWP_UNROLLED_LOOP_ITER(1)
      GEMMLOWP_UNROLLED_LOOP_ITER(2)
      GEMMLOWP_UNROLLED_LOOP_ITER(3)

#undef GEMMLOWP_UNROLLED_LOOP_ITER
    }
    // Reorder the data within registers to make WidthMajor 4x2 cells
    uint16x8x2_t src_lines_intertwined_2x[2 * kCells];
    for (int i = 0; i < kCells; i++) {
      src_lines_intertwined_2x[2 * i] =
          vzipq_u16(src_lines[4 * i], src_lines[4 * i + 2]);
      src_lines_intertwined_2x[2 * i + 1] =
          vzipq_u16(src_lines[4 * i + 1], src_lines[4 * i + 3]);
    }
    uint16x8x2_t src_lines_intertwined_4x[2 * kCells];
    for (int i = 0; i < kCells; i++) {
      src_lines_intertwined_4x[2 * i] =
          vzipq_u16(src_lines_intertwined_2x[2 * i].val[0],
                    src_lines_intertwined_2x[2 * i + 1].val[0]);
      src_lines_intertwined_4x[2 * i + 1] =
          vzipq_u16(src_lines_intertwined_2x[2 * i].val[1],
                    src_lines_intertwined_2x[2 * i + 1].val[1]);
    }
    // Store the resulting WidthMajor 4x2 cells in the destination packed block
    for (int outer = 0; outer < 2; outer++) {
      for (int inner = 0; inner < 2; inner++) {
        for (int cell = 0; cell < kCells; cell++) {
          uint8x8_t value = vreinterpret_u8_u16(vget_low_u16(
              src_lines_intertwined_4x[2 * cell + outer].val[inner]));
          vst1_u8(dst_ptr, value);
          dst_ptr += 8;
        }
        for (int cell = 0; cell < kCells; cell++) {
          uint8x8_t value = vreinterpret_u8_u16(vget_high_u16(
              src_lines_intertwined_4x[2 * cell + outer].val[inner]));
          vst1_u8(dst_ptr, value);
          dst_ptr += 8;
        }
      }
    }
    // Compute sums across the depth dimension
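    // (vpaddlq_u8 adds the two depth-consecutive bytes in each 16-bit lane;
    // the additions that follow fold those partial sums until sums_of_16
    // holds, per cell, one 16-bit sum per width position covering the 16
    // depth levels handled in this pass.)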
    uint16x8_t sums_of_2[kCells][4];
    for (int outer = 0; outer < 2; outer++) {
      for (int inner = 0; inner < 2; inner++) {
        int i = 2 * outer + inner;
        for (int cell = 0; cell < kCells; cell++) {
          sums_of_2[cell][i] = vpaddlq_u8(vreinterpretq_u8_u16(
              src_lines_intertwined_4x[2 * cell + outer].val[inner]));
        }
      }
    }
    uint16x8_t sums_of_4[kCells][2];
    for (int i = 0; i < 2; i++) {
      for (int cell = 0; cell < kCells; cell++) {
        sums_of_4[cell][i] =
            vaddq_u16(sums_of_2[cell][2 * i], sums_of_2[cell][2 * i + 1]);
      }
    }
    uint16x8_t sums_of_8[kCells];
    for (int cell = 0; cell < kCells; cell++) {
      sums_of_8[cell] = vaddq_u16(sums_of_4[cell][0], sums_of_4[cell][1]);
    }

    uint16x4_t sums_of_16[kCells];
    for (int cell = 0; cell < kCells; cell++) {
      sums_of_16[cell] = vadd_u16(vget_low_u16(sums_of_8[cell]),
                                  vget_high_u16(sums_of_8[cell]));
    }
    // Update the sums_of_each_slice vector
    for (int cell = 0; cell < kCells; cell++) {
      int32x4_t s = vreinterpretq_s32_u32(vmovl_u16(sums_of_16[cell]));
      std::int32_t* sums_of_each_slice_ptr =
          dst->sums_of_each_slice() + start_width + 4 * cell;
      vst1q_s32(sums_of_each_slice_ptr,
                vaddq_s32(s, vld1q_s32(sums_of_each_slice_ptr)));
    }
    dst->seek_forward_n_cells(kCells * kRegisterSize / kCellDepth);
  }
};

}  // namespace gemmlowp

#endif  // GEMMLOWP_INTERNAL_PACK_NEON_H_