// Copyright 2015 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// output.h: processing the 32-bit accumulators output by the unpack
// stage, obtaining the final result matrix entries and storing them into
// the destination matrix.

#ifndef GEMMLOWP_INTERNAL_OUTPUT_H_
#define GEMMLOWP_INTERNAL_OUTPUT_H_

#include <algorithm>
#include <cmath>
#include <tuple>
#include <type_traits>

#include "../public/output_stages.h"
#include "fixedpoint.h"

namespace gemmlowp {

// A Fragment is a small fixed-size matrix typically stored in one or
// a few architecture-specific SIMD vectors. Besides plain old scalar types
// such as int32_t, Fragment types are what can be used as input/output data
// types for output pipeline stages.
//
// More details:
//
// The generic scalar code in this file only implements evaluation of output
// stages for scalar inputs (e.g. plain int32_t values). Other files
// (e.g. output_neon.h) provide SIMD paths by implementing evaluation of
// output stages for SIMD vector types. However, this raises the question of
// how the different values ("lanes") in a SIMD vector correspond to different
// entries in the whole matrices. For simple entry-wise output stages this
// doesn't matter, but for output stages that depend on the position within
// the whole matrix, it does. To solve this problem, rather than implementing
// evaluation of output stages for raw SIMD vector types, we wrap SIMD vector
// types in "fragment" structs that bring the additional structure of "shape",
// i.e. the mapping of SIMD lanes to matrix entries, and we specialize
// evaluation of output stages for such fragment types. The Fragment template
// struct below is how all fragment structs are generated. For example, in
// output_neon.h, it may be specialized with DataType=int32x4_t, Rows=4,
// Cols=1. MapOrder doesn't matter for vector shapes. Although nontrivial
// fragment shapes arise only on SIMD paths, Fragment lives in this
// platform-generic file because the same template covers the needs of any
// SIMD architecture, as well as the scalar 1x1 fragments used below.
template <typename tDataType, int tRows, int tCols, MapOrder tOrder>
struct Fragment {
  typedef tDataType DataType;
  static const int kRows = tRows;
  static const int kCols = tCols;
  static const MapOrder kOrder = tOrder;

  Fragment() {}
  Fragment(const DataType& d) : data(d) {}
  operator DataType() const { return data; }

  DataType data;
};

typedef Fragment<std::int32_t, 1, 1, MapOrder::ColMajor> FragmentInt32x1x1;
typedef Fragment<std::uint8_t, 1, 1, MapOrder::ColMajor> FragmentUint8x1x1;
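
// For illustration only (hypothetical, not defined in this header): a SIMD
// back-end would define analogous fragment types wrapping its vector
// registers, e.g.
//
//   typedef Fragment<int32x4_t, 4, 1, MapOrder::ColMajor> FragmentInt32x4x1;
//
// and would then specialize the OutputStageEvalImpl templates below for them.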

// OutputStageEvalImpl is the template that we specialize to provide
// implementations of each output stage for each type of input data.
//
// Each specialization provides an OutputType typedef and an Eval function
// returning OutputType. The OutputType typically depends on the InputType.
//
// There are two dimensions in which input data types can vary:
//   1. Different output stages may expect different data types. The
//      only hard constraint is that the first stage accepts int32, as
//      the unpack stage produces int32 accumulators.
//   2. For a given scalar data type such as int32, there is still the
//      possibility of having SIMD vector types such as NEON int32x4_t,
//      typically wrapped as "fragment" types; see struct Fragment.
//      Thus, there can be several OutputStageEvalImpl specializations
//      for a single OutputStageType, for different InputTypes.
template <typename OutputStageType, typename InputType>
struct OutputStageEvalImpl {
  // This generic template body should never be hit.
  static_assert(
      std::is_same<InputType, void>::value,
      "Unimplemented: missing implementation of this output pipeline stage "
      "for this data type. This would happen if some architecture-specific "
      "SIMD back-end (output_$arch.h) were incomplete.");

  OutputStageEvalImpl(const OutputStageType&) {}
};

// Implementation of OutputStageQuantizeDownInt32ToUint8Scale for scalar data
template <>
struct OutputStageEvalImpl<OutputStageQuantizeDownInt32ToUint8Scale,
                           FragmentInt32x1x1> {
  typedef FragmentInt32x1x1 InputType;
  typedef FragmentInt32x1x1 OutputType;
  typedef OutputStageQuantizeDownInt32ToUint8Scale OutputStage;

  OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

  OutputType Eval(InputType input, int, int) const {
    const std::int32_t result_shift = output_stage.result_shift;
    const std::int32_t result_mult_int = output_stage.result_mult_int;
    const std::int32_t result_offset = output_stage.result_offset;
    const std::int32_t kRoundingTerm =
        (result_shift < 1) ? 0 : (1 << (result_shift - 1));
    return ((input + result_offset) * result_mult_int + kRoundingTerm) >>
           result_shift;
  }

  const OutputStage& output_stage;
};
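
// Worked example (illustrative values only): with result_offset = 10,
// result_mult_int = 3 and result_shift = 4, an accumulator of 100 yields
//   kRoundingTerm = 1 << 3 = 8
//   ((100 + 10) * 3 + 8) >> 4 = 338 >> 4 = 21,
// i.e. rounding-to-nearest of 330 / 16 = 20.625.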

// Implementation of OutputStageQuantizeDownInt32ToUint8ScalePC for scalar
// data, with a column vector of per-row multipliers and offsets.
template <>
struct OutputStageEvalImpl<
    OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Col>,
    FragmentInt32x1x1> {
  typedef FragmentInt32x1x1 InputType;
  typedef FragmentInt32x1x1 OutputType;
  typedef OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Col>
      OutputStage;

  OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

  OutputType Eval(InputType input, int row, int col) const {
    const std::int32_t result_shift = output_stage.result_shift;
    const std::int32_t result_mult_int = output_stage.result_mult_int(row);
    const std::int32_t result_offset = output_stage.result_offset(row);
    const std::int32_t kRoundingTerm =
        (result_shift < 1) ? 0 : (1 << (result_shift - 1));
    return ((input + result_offset) * result_mult_int + kRoundingTerm) >>
           result_shift;
  }

  const OutputStage& output_stage;
};

// Implementation of OutputStageQuantizeDownInt32ToUint8ScalePC for scalar
// data, with a row vector of per-column multipliers and offsets.
template <>
struct OutputStageEvalImpl<
    OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Row>,
    FragmentInt32x1x1> {
  typedef FragmentInt32x1x1 InputType;
  typedef FragmentInt32x1x1 OutputType;
  typedef OutputStageQuantizeDownInt32ToUint8ScalePC<VectorShape::Row>
      OutputStage;

  OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

  OutputType Eval(InputType input, int row, int col) const {
    const std::int32_t result_shift = output_stage.result_shift;
    const std::int32_t result_mult_int = output_stage.result_mult_int(col);
    const std::int32_t result_offset = output_stage.result_offset(col);
    const std::int32_t kRoundingTerm =
        (result_shift < 1) ? 0 : (1 << (result_shift - 1));
    return ((input + result_offset) * result_mult_int + kRoundingTerm) >>
           result_shift;
  }

  const OutputStage& output_stage;
};

// Implementation of OutputStageSaturatingCastToUint8 for scalar data
template <>
struct OutputStageEvalImpl<OutputStageSaturatingCastToUint8,
                           FragmentInt32x1x1> {
  typedef FragmentInt32x1x1 InputType;
  typedef FragmentUint8x1x1 OutputType;
  typedef OutputStageSaturatingCastToUint8 OutputStage;

  OutputStageEvalImpl(const OutputStage&) {}

  OutputType Eval(InputType input, int, int) const {
    std::int32_t data = input.data;
    return data > 255 ? 255 : data < 0 ? 0 : data;
  }
};
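
// For example (illustrative values): an input of 300 saturates to 255, an
// input of -7 saturates to 0, and an input of 100 passes through as 100.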

// Implementation of OutputStageBiasAddition for scalar data
template <typename VectorType>
struct OutputStageEvalImpl<OutputStageBiasAddition<VectorType>,
                           FragmentInt32x1x1> {
  typedef FragmentInt32x1x1 InputType;
  typedef FragmentInt32x1x1 OutputType;
  typedef OutputStageBiasAddition<VectorType> OutputStage;

  OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

  OutputType Eval(InputType input, int row, int col) const {
    if (VectorType::kShape == VectorShape::Row) {
      return input + output_stage.bias_vector(col);
    } else {
      return input + output_stage.bias_vector(row);
    }
  }

  const OutputStage& output_stage;
};
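
// In other words (illustrative): with a column-shaped bias_vector, entry
// (row, col) of the result gets bias_vector(row) added to it, so every entry
// in a given row receives the same bias; with a row-shaped bias_vector, the
// bias bias_vector(col) is instead shared by all entries of a given column.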

// Implementation of OutputStageClamp for scalar data
template <>
struct OutputStageEvalImpl<OutputStageClamp, FragmentInt32x1x1> {
  typedef FragmentInt32x1x1 InputType;
  typedef FragmentInt32x1x1 OutputType;
  typedef OutputStageClamp OutputStage;

  OutputStageEvalImpl(const OutputStage& s) : output_stage(s) {}

  OutputType Eval(InputType input, int, int) const {
    const std::int32_t min = output_stage.min;
    const std::int32_t max = output_stage.max;
    return std::min(std::max(input.data, min), max);
  }

  const OutputStage& output_stage;
};
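
// For example (illustrative values): with min = 0 and max = 255, an input of
// -3 is clamped to 0, 300 is clamped to 255, and 42 is left unchanged.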

// Implementation of OutputStageTanh for either scalar or SIMD data
template <typename tInputType>
struct OutputStageTanhEvalImpl {
  typedef tInputType InputType;
  typedef InputType OutputType;
  typedef typename InputType::DataType DataType;
  typedef OutputStageTanh OutputStage;

  OutputStageTanhEvalImpl(const OutputStage& s) : output_stage(s) {
    const std::int32_t real_zero_as_int32 = output_stage.real_zero_as_int32;
    const std::int32_t real_amplitude_as_int32 =
        output_stage.real_amplitude_as_int32;

    input_cutoff_min = real_zero_as_int32 - 8 * real_amplitude_as_int32;
    input_cutoff_max = real_zero_as_int32 + 8 * real_amplitude_as_int32;
    output_min = real_zero_as_int32 - real_amplitude_as_int32;
    output_max = real_zero_as_int32 + real_amplitude_as_int32;

    double inverse_amplitude_normalized_double = 1.0 / real_amplitude_as_int32;
    inverse_amplitude_neg_exponent = 0;
    while (inverse_amplitude_normalized_double < 0.5) {
      inverse_amplitude_normalized_double *= 2;
      inverse_amplitude_neg_exponent++;
    }
    inverse_amplitude_normalized =
        ToFixedPoint<DataType, 0>(inverse_amplitude_normalized_double);

    double amplitude_normalized_double = real_amplitude_as_int32;
    amplitude_exponent = 0;
    while (amplitude_normalized_double >= 1.0) {
      amplitude_normalized_double *= 0.5;
      amplitude_exponent++;
    }
    amplitude_normalized =
        ToFixedPoint<DataType, 0>(amplitude_normalized_double);
  }
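
  // The two loops above split the amplitude and its reciprocal into a
  // normalized mantissa in [0.5, 1) and a power-of-two exponent:
  //   real_amplitude_as_int32 == amplitude_normalized * 2^amplitude_exponent
  //   1 / real_amplitude_as_int32 == inverse_amplitude_normalized *
  //                                  2^-inverse_amplitude_neg_exponent
  // For example (illustrative value only), real_amplitude_as_int32 = 100
  // gives amplitude_normalized = 0.78125 with amplitude_exponent = 7, and
  // inverse_amplitude_normalized = 0.64 with inverse_amplitude_neg_exponent
  // = 6.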

  OutputType Eval(InputType input, int, int) const {
    const std::int32_t real_zero_as_int32 = output_stage.real_zero_as_int32;

    typedef FixedPoint<DataType, 3> F3;
    typedef FixedPoint<DataType, 0> F0;

    // fixed-point affine transformation
    DataType input_centered =
        Sub(input.data, Dup<DataType>(real_zero_as_int32));
    F3 fixedpoint_input =
        F3::FromRaw(input_centered) * inverse_amplitude_normalized;
    // left shift
    fixedpoint_input.raw() =
        ShiftLeft(fixedpoint_input.raw(), 28 - inverse_amplitude_neg_exponent);
    // fixed-point tanh and multiplication
    F0 fixedpoint_output = tanh(fixedpoint_input) * amplitude_normalized;
    // right shift
    DataType int32_output =
        Add(Dup<DataType>(real_zero_as_int32),
            ShiftRight(fixedpoint_output.raw(), 31 - amplitude_exponent));

    DataType mask_if_below_cutoff_min =
        MaskIfLessThanOrEqual(input.data, Dup<DataType>(input_cutoff_min));
    DataType mask_if_above_cutoff_max =
        MaskIfGreaterThanOrEqual(input.data, Dup<DataType>(input_cutoff_max));

    return SelectUsingMask(
        mask_if_below_cutoff_min, Dup<DataType>(output_min),
        SelectUsingMask(mask_if_above_cutoff_max, Dup<DataType>(output_max),
                        int32_output));
  }

  const OutputStage& output_stage;
  std::int32_t input_cutoff_min, input_cutoff_max;
  std::int32_t output_min, output_max;
  FixedPoint<DataType, 0> inverse_amplitude_normalized;
  int inverse_amplitude_neg_exponent;
  FixedPoint<DataType, 0> amplitude_normalized;
  int amplitude_exponent;
};

template <>
struct OutputStageEvalImpl<OutputStageTanh, FragmentInt32x1x1>
    : OutputStageTanhEvalImpl<FragmentInt32x1x1> {
  OutputStageEvalImpl(const OutputStageTanh& output_stage)
      : OutputStageTanhEvalImpl(output_stage) {}
};

// OutputPipelineOutputType is a helper to determine the output data type of a
// pipeline, for a given input data type. It is a recursive template; see the
// explanation on OutputPipelineEvalImpl below.
template <typename OutputPipelineType, int FirstStage, typename InputType,
          bool StopRecursion =
              FirstStage == std::tuple_size<OutputPipelineType>::value>
struct OutputPipelineOutputType {
  typedef typename std::tuple_element<FirstStage, OutputPipelineType>::type
      FirstStageType;
  typedef typename OutputStageEvalImpl<FirstStageType, InputType>::OutputType
      FirstStageOutputType;
  typedef typename OutputPipelineOutputType<OutputPipelineType, FirstStage + 1,
                                            FirstStageOutputType>::Type Type;
};

template <typename OutputPipelineType, int FirstStage, typename InputType>
struct OutputPipelineOutputType<OutputPipelineType, FirstStage, InputType,
                                true> {
  typedef InputType Type;
};
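
// For instance, given the scalar specializations above, for a pipeline type
//   std::tuple<OutputStageQuantizeDownInt32ToUint8Scale,
//              OutputStageSaturatingCastToUint8>
// and InputType = FragmentInt32x1x1, this recursion yields
// OutputPipelineOutputType<...>::Type = FragmentUint8x1x1.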

// OutputPipelineEvalImpl is a helper to implement the evaluation of
// the whole pipeline. It is a recursive template to implement compile-time
// unrolling of the loop over all pipeline stages. The 'FirstStage' parameter
// is how we implement recursion: each specialization implements only
// evaluation starting at 'FirstStage'. The StopRecursion parameter is just a
// helper to implement the termination of the recursion as a partial
// specialization below.
template <typename OutputPipelineType, int FirstStage, typename InputType,
          bool StopRecursion =
              FirstStage == std::tuple_size<OutputPipelineType>::value>
struct OutputPipelineEvalImpl {
  typedef typename std::tuple_element<FirstStage, OutputPipelineType>::type
      FirstStageType;
  typedef typename OutputStageEvalImpl<FirstStageType, InputType>::OutputType
      FirstStageOutputType;
  typedef typename OutputPipelineOutputType<OutputPipelineType, FirstStage,
                                            InputType>::Type OutputType;

  OutputPipelineEvalImpl(const OutputPipelineType& output_pipeline)
      : head_impl(std::get<FirstStage>(output_pipeline)),
        tail_impl(output_pipeline) {}

  OutputType Eval(InputType input, int row, int col) const {
    // Evaluate the first stage.
    FirstStageOutputType first_stage_output = head_impl.Eval(input, row, col);
    // Recurse into the remaining stages.
    return tail_impl.Eval(first_stage_output, row, col);
  }

  const OutputStageEvalImpl<FirstStageType, InputType> head_impl;
  const OutputPipelineEvalImpl<OutputPipelineType, FirstStage + 1,
                               FirstStageOutputType>
      tail_impl;
};

// Specialization on 'StopRecursion' for terminating the recursion.
template <typename OutputPipelineType, int FirstStage, typename InputType>
struct OutputPipelineEvalImpl<OutputPipelineType, FirstStage, InputType, true> {
  OutputPipelineEvalImpl(const OutputPipelineType&) {}

  InputType Eval(InputType input, int, int) const {
    // Terminating the recursion.
    return input;
  }
};
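
// Concretely (illustrative): for a two-stage pipeline, Eval at FirstStage=0
// evaluates stage 0 and forwards its result to the FirstStage=1 instance,
// which evaluates stage 1 and forwards to the FirstStage=2 instance; that one
// matches the StopRecursion specialization and returns its input unchanged.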

// StoreFinalOutput takes the final value at the end of the output pipeline and
// stores it into the destination matrix. It can be specialized for different
// data types; the generic implementation here is typically used only for plain
// old scalar (not SIMD) types.
template <typename OutputType, typename DstType>
void StoreFinalOutput(OutputType value, DstType* dst, int row, int col) {
  *dst->data(row, col) = value;
}

template <typename OutputPipelineType, typename InputType>
struct OutputPipelineExecutor {
  OutputPipelineExecutor(const OutputPipelineType& output_pipeline)
      : output_pipeline_eval_impl_(output_pipeline) {}

  // Execute is the entry point into the output pipeline evaluation code. It
  // should be the only thing that unpack code calls. It takes the result of
  // the unpack stage and stores it into the destination matrix.
  template <typename DstType>
  void Execute(InputType input, DstType* dst, int row, int col) {
    // Statically assert that the output pipeline matches the given destination
    // matrix's scalar type.
    typedef typename OutputPipelineOutputType<OutputPipelineType, 0,
                                              FragmentInt32x1x1>::Type::DataType
        ScalarOutputType;
    typedef typename DstType::Scalar ScalarDstType;
    static_assert(std::is_same<ScalarOutputType, ScalarDstType>::value,
                  "mismatched destination scalar type and output pipeline");

    // Evaluate the output pipeline.
    auto output = output_pipeline_eval_impl_.Eval(input, row, col);
    // Store the result into the destination matrix.
    StoreFinalOutput(output, dst, row, col);
  }

  const OutputPipelineEvalImpl<OutputPipelineType, 0, InputType>
      output_pipeline_eval_impl_;
};
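
// Typical usage (illustrative sketch; dst_map and accumulator are placeholder
// names): unpack code constructs one executor per output pipeline and calls
// Execute once per accumulator, e.g.
//
//   OutputPipelineExecutor<OutputPipelineType, FragmentInt32x1x1> executor(
//       output_pipeline);
//   executor.Execute(FragmentInt32x1x1(accumulator), &dst_map, row, col);
//
// where dst_map is the destination matrix map and accumulator is the int32
// value produced by the unpack stage for entry (row, col).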

}  // namespace gemmlowp

#endif  // GEMMLOWP_INTERNAL_OUTPUT_H_