// output_stages.h revision a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1
// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// output_stages.h: public definitions of the output stages that can
// be assembled into an output pipeline, to control how internal
// 32-bit accumulators are transformed to obtain the final uint8
// result matrix entries.

#ifndef GEMMLOWP_PUBLIC_OUTPUT_STAGES_H_
#define GEMMLOWP_PUBLIC_OUTPUT_STAGES_H_

#include <cstdint>
#include <tuple>

#include "../internal/common.h"

namespace gemmlowp {

// Output stage: uniform "quantize down" from int32 to the uint8 scale.
//
// This output stage takes int32 values and returns still int32 values,
// but "quantized down" to the uint8 scale; in other words, its output
// is typically what one would then clamp to [0..255] and cast to uint8
// (see OutputStageSaturatingCastToUint8).
//
// This "quantization down" process depends on 3 parameters,
//   result_offset, result_mult_int, result_shift,
// and the result is:
//   ((input + result_offset) * result_mult_int + rounding) >> result_shift
// where
//   rounding = (result_shift < 1) ? 0 : (1 << (result_shift - 1));
struct OutputStageQuantizeDownInt32ToUint8Scale {
  // Added to each input value before the multiplication.
  std::int32_t result_offset;
  // Integer multiplier applied to (input + result_offset).
  std::int32_t result_mult_int;
  // Right-shift amount applied last, after the rounding term is added.
  std::int32_t result_shift;
};

467b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang// This output stage takes int32 values and returns still int32 values,
477b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang// but "quantized down" to the uint8 scale; in other words, its output
487b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang// is typically what one would then clamp to [0..255] and cast to uint8
497b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang// (see OutputStageSaturatingCastToUint8).
507b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang//
517b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang// This "quantization down" process depends on 3 parameters,
527b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang//   result_offset, result_mult_int, result_shift,
537b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang// and the result is:
547b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang//   ((input + result_offset) * result_mult_int + rounding) >> result_shift
557b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang// where
567b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang//   rounding = (result_shift < 1) ? 0 : (1 << (result_shift - 1));
577b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang//
587b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang// Difference from OutputStageQuantizeDownInt32ToUint8Scale here is that each
597b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang// row or column of the output (depending on tShape) has its own result_offset
607b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang// and result_mult_int numbers.
617b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wangtemplate <VectorShape tShape>
627b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wangstruct OutputStageQuantizeDownInt32ToUint8ScalePC {
637b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang  VectorMap<const std::int32_t, tShape> result_offset;
647b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang  VectorMap<const std::int32_t, tShape> result_mult_int;
657b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang  std::int32_t result_shift;
667b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang};
677b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
// Output stage: "quantize down" from int32 to the uint8 scale using a
// fixed-point multiplier.
//
// This output stage takes int32 values and returns still int32 values,
// but "quantized down" to the uint8 scale; in other words, its output
// is typically what one would then clamp to [0..255] and cast to uint8
// (see OutputStageSaturatingCastToUint8).
//
// This "quantization down" process depends on 3 parameters,
//   result_fixedpoint_multiplier, result_shift, result_offset_after_shift,
// and the result is:
//   ((FixedPointMul(input, result_fixedpoint_multiplier) +
//   rounding) >> result_shift) + result_offset_after_shift
// where
//   rounding = (result_shift < 1) ? 0 : (1 << (result_shift - 1));
// and where FixedPointMul(x, y) is the nearest integer to the following
// mathematical expression, evaluated without overflow or intermediate
// rounding:
//   (x * y) / 2^31
// In practice, it is expected that FixedPointMul will be implemented
// using hardware "rounding doubling int32 multiply high" instructions,
// such as VQRDMULH on ARM. See in fixedpoint.h the generic function,
// SaturatingRoundingDoublingHighMul.
//
// Notice that the other difference from
// OutputStageQuantizeDownInt32ToUint8Scale is that the result offset
// is applied after the multiplier and shift, not before. This ensures
// that no matter what the multiplier and shift are, the result offset
// is effectively integral: offsetting the final result by an integer.
// The motivation for this is to faithfully support quantization schemes
// where the formula linking quantized values to the real mathematical
// values that they represent, is of the form
//
//   real_value = scale * (quantized_value - zero_point)
//
// where scale is a real number (represented in quantized form by
// result_fixedpoint_multiplier and result_shift) and zero_point
// is an integer telling which quantized value corresponds to the
// real value 0, and is represented here by (the opposite of)
// result_offset_after_shift.
// The motivation for such a quantization scheme, designed to
// ensure that 0 is always a representable value, is that in
// many applications, we need to 0-pad arrays and that can only be
// done for quantized arrays if 0 is a representable value in
// quantized form. In particular, convolution-like operations
// are often implemented using 0-padding, or "im2col"-like
// expansions that implicitly rely on 0-padding. If 0 were not
// a representable value, such operations would have to pad
// using a nonzero value, introducing bias in the computation.
struct OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint {
  // Second operand of FixedPointMul, i.e. the multiplier y in (x * y) / 2^31.
  std::int32_t result_fixedpoint_multiplier;
  // Right-shift amount applied after the fixed-point multiplication.
  std::int32_t result_shift;
  // Integer added to the result after the multiplier and shift.
  std::int32_t result_offset_after_shift;
};

// This output stage takes int32 values that are expected to be already
// on the final uint8 scale, but not necessarily in the [0..255] range.
// It clamps them to the [0..255] range and returns them cast to uint8.
// The stage carries no parameters, hence the empty struct.
struct OutputStageSaturatingCastToUint8 {};

// This output stage depends on a "bias vector" that should contain int32
// entries, and be either a row-vector of the same number of columns as the
// result matrix, or a column-vector of the same number of rows as the
// result matrix. This output stage takes int32 values and adds to them
// the corresponding entry of the bias vector (broadcasted in the other
// direction to fit the matrix's shape), outputting int32 values.
//
// VectorType is the type used to hold the bias vector; the stage stores it
// by value.
template <typename VectorType>
struct OutputStageBiasAddition {
  // The bias entries to add to the int32 input values.
  VectorType bias_vector;
};

// This output stage clamps value between the specified min and max bounds.
// It can be used to implement "rectified linear unit" activation functions
// in neural networks.
struct OutputStageClamp {
  // Lower bound of the clamp.
  std::int32_t min;
  // Upper bound of the clamp.
  std::int32_t max;
};

// Parameters of a tanh output stage.
// NOTE(review): this header only declares the two fields below; judging by
// the names, the stage presumably applies a tanh function to int32 values
// that encode real numbers -- confirm against the stage's implementation.
struct OutputStageTanh {
  // Presumably the int32 value that represents the real number 0
  // -- TODO confirm.
  std::int32_t real_zero_as_int32;
  // Presumably the int32 magnitude that represents the real amplitude of
  // the tanh output -- TODO confirm.
  std::int32_t real_amplitude_as_int32;
};

1497b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang// An output pipeline is just a std::tuple of output stages.
1507b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang// This function generates a standard output pipeline consisting of two stages:
1517b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang// OutputStageQuantizeDownInt32ToUint8Scale, OutputStageSaturatingCastToUint8.
1527b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wanginline std::tuple<OutputStageQuantizeDownInt32ToUint8Scale,
1537b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang                  OutputStageSaturatingCastToUint8>
1547b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao WangMakeStandardOutputPipeline(std::int32_t result_offset,
1557b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang                           std::int32_t result_mult_int,
1567b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang                           std::int32_t result_shift) {
1577b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang  OutputStageQuantizeDownInt32ToUint8Scale quantize_down_stage;
1587b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang  quantize_down_stage.result_offset = result_offset;
1597b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang  quantize_down_stage.result_mult_int = result_mult_int;
1607b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang  quantize_down_stage.result_shift = result_shift;
1617b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang  OutputStageSaturatingCastToUint8 saturating_cast_stage;
1627b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang  return std::make_tuple(quantize_down_stage, saturating_cast_stage);
1637b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang}
1647b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
1657b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang// An output pipeline is just a std::tuple of output stages.
1667b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang// This function generates a standard output pipeline consisting of two stages:
1677b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang// OutputStageQuantizeDownInt32ToUint8ScalePC, OutputStageSaturatingCastToUint8.
1687b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wangtemplate <VectorShape tShape>
1697b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wanginline std::tuple<OutputStageQuantizeDownInt32ToUint8ScalePC<tShape>,
1707b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang                  OutputStageSaturatingCastToUint8>
171a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao WangMakeStandardOutputPipeline(
172a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    const VectorMap<const std::int32_t, tShape>& result_offset,
173a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    const VectorMap<const std::int32_t, tShape>& result_mult_int,
174a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang    std::int32_t result_shift) {
1757b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang  OutputStageQuantizeDownInt32ToUint8ScalePC<tShape> quantize_down_stage;
1767b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang  quantize_down_stage.result_offset = result_offset;
1777b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang  quantize_down_stage.result_mult_int = result_mult_int;
1787b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang  quantize_down_stage.result_shift = result_shift;
1797b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang  OutputStageSaturatingCastToUint8 saturating_cast_stage;
1807b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang  return std::make_tuple(quantize_down_stage, saturating_cast_stage);
1817b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang}
1827b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
}  // namespace gemmlowp

#endif  // GEMMLOWP_PUBLIC_OUTPUT_STAGES_H_