// Copyright 2015 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// output.h: processing the 32-bit accumulators output by the unpack
// stage, obtaining the final result matrix entries and storing them into
// the destination matrix.

#ifndef GEMMLOWP_INTERNAL_OUTPUT_H_
#define GEMMLOWP_INTERNAL_OUTPUT_H_

#include <algorithm>  // std::min / std::max, used by OutputStageClamp below
#include <cassert>
#include <cmath>
#include <tuple>
#include <type_traits>

#include "../public/output_stages.h"
#include "fixedpoint.h"

namespace gemmlowp {

// A Fragment is a small fixed-size matrix typically stored in one or
// a few architecture-specific SIMD vectors. Besides plain old scalar types
// such as int32_t, Fragment types are what can be used as input/output data
// types for output pipeline stages.
//
// More details:
//
// In the generic scalar code in this file, we have only implemented
// evaluation of output stages for scalar inputs (e.g. plain int32_t values).
// Other files (e.g. output_neon.h) are to provide SIMD paths by implementing
// evaluation of output stages for SIMD vector types. However, this raises
// the question of how the different values ("lanes") in a SIMD vector
// correspond to different values in the whole matrices. For simple entry-wise
// output stages, this doesn't matter, but for other output stages depending
// on position within the whole matrix, this does matter.
To solve this 46// problem, rather than implementing evaluation of output stages for raw 47// SIMD vector types, we wrap SIMD vector types in "fragment" structs that 48// bring the additional structure of "shape" i.e. mapping SIMD lanes to 49// matrix entries, and we specialize evaluation of output stage for such 50// fragment types. The Fragment template struct here is how we generate 51// all fragment structs. For example, in output_neon.h, it may be specialized 52// with DataType=int32x4_t, Rows=4, Cols=1. MapOrder doesn't matter for 53// vector shapes. While Fragment is only used for SIMD paths, we leave it 54// here in this platform-generic file because this same template should 55// cover the needs of any SIMD architectures. 56template <typename tDataType, int tRows, int tCols, MapOrder tOrder> 57struct Fragment { 58 typedef tDataType DataType; 59 static const int kRows = tRows; 60 static const int kCols = tCols; 61 static const MapOrder kOrder = tOrder; 62 63 Fragment() {} 64 Fragment(const DataType& d) : data(d) {} 65 operator DataType() const { return data; } 66 67 DataType data; 68}; 69 70typedef Fragment<std::int32_t, 1, 1, MapOrder::ColMajor> FragmentInt32x1x1; 71typedef Fragment<std::uint8_t, 1, 1, MapOrder::ColMajor> FragmentUint8x1x1; 72 73// OutputStageEvalImpl is the template that we specialize to provide 74// implementations of each output stage for each type of input data. 75// 76// Each specialization provides a OutputType typedef and an Eval function 77// returning OutputType. The OutputType typically depends on the InputType. 78// 79// There are two dimensions in which input data types can vary: 80// 1. Different output stages may expect different data types. The 81// only hard constraint is that the first stage accepts int32, as 82// the unpack stage produces int32 accumulators. 83// 2. 
For a given scalar data type such as int32, there is still the 84// possibility of having SIMD vector types such as NEON int32x4_t, 85// typically wrapped as "fragment" types, see struct Fragment. 86// Thus, there can be several OutputStageEvalImpl 87// specializations for a single OutputStageType, for different 88// InputType's. 89template <typename OutputStageType, typename InputType> 90struct OutputStageEvalImpl { 91 // This generic template body should never be hit. 92 static_assert( 93 std::is_same<InputType, void>::value, 94 "Unimplemented: missing implementation of this output pipeline stage " 95 "for this data type. This would happen if some architecture-specific " 96 "SIMD back-end (output_$arch.h) were incomplete."); 97 98 OutputStageEvalImpl(const OutputStageType&) {} 99}; 100 101// Implementation of OutputStageQuantizeDownInt32ToUint8Scale for scalar data 102template <> 103struct OutputStageEvalImpl<OutputStageQuantizeDownInt32ToUint8Scale, 104 FragmentInt32x1x1> { 105 typedef FragmentInt32x1x1 InputType; 106 typedef FragmentInt32x1x1 OutputType; 107 typedef OutputStageQuantizeDownInt32ToUint8Scale OutputStage; 108 109 OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {} 110 111 OutputType Eval(InputType input, int, int) const { 112 const std::int32_t result_shift = output_stage.result_shift; 113 const std::int32_t result_mult_int = output_stage.result_mult_int; 114 const std::int32_t result_offset = output_stage.result_offset; 115 const std::int32_t kRoundingTerm = 116 (result_shift < 1) ? 
0 : (1 << (result_shift - 1)); 117 return ((input + result_offset) * result_mult_int + kRoundingTerm) >> 118 result_shift; 119 } 120 121 const OutputStage& output_stage; 122}; 123 124template <> 125struct OutputStageEvalImpl< 126 OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Col>, 127 FragmentInt32x1x1> { 128 typedef FragmentInt32x1x1 InputType; 129 typedef FragmentInt32x1x1 OutputType; 130 typedef OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Col> 131 OutputStage; 132 133 OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {} 134 135 OutputType Eval(InputType input, int row, int col) const { 136 const std::int32_t result_shift = output_stage.result_shift; 137 const std::int32_t result_mult_int = output_stage.result_mult_int(row); 138 const std::int32_t result_offset = output_stage.result_offset(row); 139 const std::int32_t kRoundingTerm = 140 (result_shift < 1) ? 0 : (1 << (result_shift - 1)); 141 return ((input + result_offset) * result_mult_int + kRoundingTerm) >> 142 result_shift; 143 } 144 145 const OutputStage& output_stage; 146}; 147 148template <> 149struct OutputStageEvalImpl< 150 OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Row>, 151 FragmentInt32x1x1> { 152 typedef FragmentInt32x1x1 InputType; 153 typedef FragmentInt32x1x1 OutputType; 154 typedef OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Row> 155 OutputStage; 156 157 OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {} 158 159 OutputType Eval(InputType input, int row, int col) const { 160 const std::int32_t result_shift = output_stage.result_shift; 161 const std::int32_t result_mult_int = output_stage.result_mult_int(col); 162 const std::int32_t result_offset = output_stage.result_offset(col); 163 const std::int32_t kRoundingTerm = 164 (result_shift < 1) ? 
0 : (1 << (result_shift - 1)); 165 return ((input + result_offset) * result_mult_int + kRoundingTerm) >> 166 result_shift; 167 } 168 169 const OutputStage& output_stage; 170}; 171 172// Implementation of OutputStageSaturatingCastToUint8 for scalar data 173template <> 174struct OutputStageEvalImpl<OutputStageSaturatingCastToUint8, 175 FragmentInt32x1x1> { 176 typedef FragmentInt32x1x1 InputType; 177 typedef FragmentUint8x1x1 OutputType; 178 typedef OutputStageSaturatingCastToUint8 OutputStage; 179 180 OutputStageEvalImpl(const OutputStage&) {} 181 182 OutputType Eval(InputType input, int, int) const { 183 std::int32_t data = input.data; 184 return data > 255 ? 255 : data < 0 ? 0 : data; 185 } 186}; 187 188// Implementation of OutputStageBiasAddition for scalar data 189template <typename VectorType> 190struct OutputStageEvalImpl<OutputStageBiasAddition<VectorType>, 191 FragmentInt32x1x1> { 192 typedef FragmentInt32x1x1 InputType; 193 typedef FragmentInt32x1x1 OutputType; 194 typedef OutputStageBiasAddition<VectorType> OutputStage; 195 196 OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {} 197 198 OutputType Eval(InputType input, int row, int col) const { 199 if (VectorType::kShape == VectorShape::Row) { 200 return input + output_stage.bias_vector(col); 201 } else { 202 return input + output_stage.bias_vector(row); 203 } 204 } 205 206 const OutputStage& output_stage; 207}; 208 209// Implementation of OutputStageClamp for scalar data 210template <> 211struct OutputStageEvalImpl<OutputStageClamp, FragmentInt32x1x1> { 212 typedef FragmentInt32x1x1 InputType; 213 typedef FragmentInt32x1x1 OutputType; 214 typedef OutputStageClamp OutputStage; 215 216 OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {} 217 218 OutputType Eval(InputType input, int, int) const { 219 const std::int32_t min = output_stage.min; 220 const std::int32_t max = output_stage.max; 221 return std::min(std::max(input.data, min), max); 222 } 223 224 const OutputStage& output_stage; 
225}; 226 227// Implementation of OutputStageTanh for either scalar or SIMD data 228template <typename tInputType> 229struct OutputStageTanhEvalImpl { 230 typedef tInputType InputType; 231 typedef InputType OutputType; 232 typedef typename InputType::DataType DataType; 233 typedef OutputStageTanh OutputStage; 234 235 OutputStageTanhEvalImpl(const OutputStage& s) : output_stage(s) { 236 const std::int32_t real_zero_as_int32 = output_stage.real_zero_as_int32; 237 const std::int32_t real_amplitude_as_int32 = 238 output_stage.real_amplitude_as_int32; 239 240 input_cutoff_min = real_zero_as_int32 - 8 * real_amplitude_as_int32; 241 input_cutoff_max = real_zero_as_int32 + 8 * real_amplitude_as_int32; 242 output_min = real_zero_as_int32 - real_amplitude_as_int32; 243 output_max = real_zero_as_int32 + real_amplitude_as_int32; 244 245 double inverse_amplitude_normalized_double = 1.0 / real_amplitude_as_int32; 246 inverse_amplitude_neg_exponent = 0; 247 while (inverse_amplitude_normalized_double < 0.5) { 248 inverse_amplitude_normalized_double *= 2; 249 inverse_amplitude_neg_exponent++; 250 } 251 inverse_amplitude_normalized = 252 ToFixedPoint<DataType, 0>(inverse_amplitude_normalized_double); 253 254 double amplitude_normalized_double = real_amplitude_as_int32; 255 amplitude_exponent = 0; 256 while (amplitude_normalized_double >= 1.0) { 257 amplitude_normalized_double *= 0.5; 258 amplitude_exponent++; 259 } 260 amplitude_normalized = 261 ToFixedPoint<DataType, 0>(amplitude_normalized_double); 262 } 263 264 OutputType Eval(InputType input, int, int) const { 265 const std::int32_t real_zero_as_int32 = output_stage.real_zero_as_int32; 266 267 typedef FixedPoint<DataType, 3> F3; 268 typedef FixedPoint<DataType, 0> F0; 269 270 // fixed-point affine transformation 271 DataType input_centered = 272 Sub(input.data, Dup<DataType>(real_zero_as_int32)); 273 F3 fixedpoint_input = 274 F3::FromRaw(input_centered) * inverse_amplitude_normalized; 275 // left shift 276 fixedpoint_input.raw() 
= 277 ShiftLeft(fixedpoint_input.raw(), 28 - inverse_amplitude_neg_exponent); 278 // fixed-point tanh and multiplication 279 F0 fixedpoint_output = tanh(fixedpoint_input) * amplitude_normalized; 280 // right shift 281 DataType int32_output = 282 Add(Dup<DataType>(real_zero_as_int32), 283 ShiftRight(fixedpoint_output.raw(), 31 - amplitude_exponent)); 284 285 DataType mask_if_below_cutoff_min = 286 MaskIfLessThanOrEqual(input.data, Dup<DataType>(input_cutoff_min)); 287 DataType mask_if_above_cutoff_max = 288 MaskIfGreaterThanOrEqual(input.data, Dup<DataType>(input_cutoff_max)); 289 290 return SelectUsingMask( 291 mask_if_below_cutoff_min, Dup<DataType>(output_min), 292 SelectUsingMask(mask_if_above_cutoff_max, Dup<DataType>(output_max), 293 int32_output)); 294 } 295 296 const OutputStage& output_stage; 297 std::int32_t input_cutoff_min, input_cutoff_max; 298 std::int32_t output_min, output_max; 299 FixedPoint<DataType, 0> inverse_amplitude_normalized; 300 int inverse_amplitude_neg_exponent; 301 FixedPoint<DataType, 0> amplitude_normalized; 302 int amplitude_exponent; 303}; 304 305template <> 306struct OutputStageEvalImpl<OutputStageTanh, FragmentInt32x1x1> 307 : OutputStageTanhEvalImpl<FragmentInt32x1x1> { 308 OutputStageEvalImpl(const OutputStageTanh& output_stage) 309 : OutputStageTanhEvalImpl(output_stage) {} 310}; 311 312// OutputPipelineOutputType is a helper to determine the output data type of a 313// pipeline, for a 314// given input data type. It is a recursive template; see the explanation on 315// OutputPipelineEvalImpl below. 
316template <typename OutputPipelineType, int FirstStage, typename InputType, 317 bool StopRecursion = 318 FirstStage == std::tuple_size<OutputPipelineType>::value> 319struct OutputPipelineOutputType { 320 typedef typename std::tuple_element<FirstStage, OutputPipelineType>::type 321 FirstStageType; 322 typedef typename OutputStageEvalImpl<FirstStageType, InputType>::OutputType 323 FirstStageOutputType; 324 typedef typename OutputPipelineOutputType<OutputPipelineType, FirstStage + 1, 325 FirstStageOutputType>::Type Type; 326}; 327 328template <typename OutputPipelineType, int FirstStage, typename InputType> 329struct OutputPipelineOutputType<OutputPipelineType, FirstStage, InputType, 330 true> { 331 typedef InputType Type; 332}; 333 334// OutputPipelineEvalImpl is a helper to implement the evaluation of 335// the whole pipeline. It is a recursive template to implement compile-time 336// unrolling of the loop over all pipeline stages. The 'FirstStage' parameter 337// is how we implement recursion: each specialization implements only 338// evaluation starting at 'FirstStage'. The StopRecursion parameter is just a 339// helper to implement the termination of the recursion as a partial 340// specialization below. 
341template <typename OutputPipelineType, int FirstStage, typename InputType, 342 bool StopRecursion = 343 FirstStage == std::tuple_size<OutputPipelineType>::value> 344struct OutputPipelineEvalImpl { 345 typedef typename std::tuple_element<FirstStage, OutputPipelineType>::type 346 FirstStageType; 347 typedef typename OutputStageEvalImpl<FirstStageType, InputType>::OutputType 348 FirstStageOutputType; 349 typedef typename OutputPipelineOutputType<OutputPipelineType, FirstStage, 350 InputType>::Type OutputType; 351 352 OutputPipelineEvalImpl(const OutputPipelineType& output_pipeline) 353 : head_impl(std::get<FirstStage>(output_pipeline)), 354 tail_impl(output_pipeline) {} 355 356 OutputType Eval(InputType input, int row, int col) const { 357 // Evaluate the first stage. 358 FirstStageOutputType first_stage_output = head_impl.Eval(input, row, col); 359 // Recurse into the remaining stages. 360 return tail_impl.Eval(first_stage_output, row, col); 361 } 362 363 const OutputStageEvalImpl<FirstStageType, InputType> head_impl; 364 const OutputPipelineEvalImpl<OutputPipelineType, FirstStage + 1, 365 FirstStageOutputType> 366 tail_impl; 367}; 368 369// Specialization on 'StopRecursion' for terminating the recursion. 370template <typename OutputPipelineType, int FirstStage, typename InputType> 371struct OutputPipelineEvalImpl<OutputPipelineType, FirstStage, InputType, true> { 372 OutputPipelineEvalImpl(const OutputPipelineType&) {} 373 374 InputType Eval(InputType input, int, int) const { 375 // Terminating the recursion. 376 return input; 377 } 378}; 379 380// StoreFinalOutput takes the final value at the end of the output pipeline and 381// stores it into the destination matrix. It can be specialized for different 382// data types; the generic implementation here is typically used only for plain 383// old scalar (not SIMD) types. 
384template <typename OutputType, typename DstType> 385void StoreFinalOutput(OutputType value, DstType* dst, int row, int col) { 386 *dst->data(row, col) = value; 387} 388 389template <typename OutputPipelineType, typename InputType> 390struct OutputPipelineExecutor { 391 OutputPipelineExecutor(const OutputPipelineType& output_pipeline) 392 : output_pipeline_eval_impl_(output_pipeline) {} 393 394 // RunOutputPipeline is the entry point into the output pipeline evaluation 395 // code. It should be the only thing that unpack code calls. It takes the 396 // result 397 // of the unpack stage and stores it into the destination matrix. 398 template <typename DstType> 399 void Execute(InputType input, DstType* dst, int row, int col) { 400 // Statically assert that the output pipeline matches the given destination 401 // matrix's scalar type. 402 typedef typename OutputPipelineOutputType<OutputPipelineType, 0, 403 FragmentInt32x1x1>::Type::DataType 404 ScalarOutputType; 405 typedef typename DstType::Scalar ScalarDstType; 406 static_assert(std::is_same<ScalarOutputType, ScalarDstType>::value, 407 "mismatched destination scalar type and output pipeline"); 408 409 // Evaluate the output pipeline. 410 auto output = output_pipeline_eval_impl_.Eval(input, row, col); 411 // Store the result into the destination matrix. 412 StoreFinalOutput(output, dst, row, col); 413 } 414 415 const OutputPipelineEvalImpl<OutputPipelineType, 0, InputType> 416 output_pipeline_eval_impl_; 417}; 418 419} // namespace gemmlowp 420 421#endif // GEMMLOWP_INTERNAL_OUTPUT_H_ 422