// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// output_stages.h: public definitions of the output stages that can
// be assembled into an output pipeline, to control how internal
// 32-bit accumulators are transformed to obtain the final uint8
// result matrix entries.
#ifndef GEMMLOWP_PUBLIC_OUTPUT_STAGES_H_
#define GEMMLOWP_PUBLIC_OUTPUT_STAGES_H_

#include <tuple>

#include "../internal/common.h"

namespace gemmlowp {

// This output stage takes int32 values and returns still int32 values,
// but "quantized down" to the uint8 scale; in other words, its output
// is typically what one would then clamp to [0..255] and cast to uint8
// (see OutputStageSaturatingCastToUint8).
//
// This "quantization down" process depends on 3 parameters,
//   result_offset, result_mult_int, result_shift,
// and the result is:
//   ((input + result_offset) * result_mult_int + rounding) >> result_shift
// where
//   rounding = (result_shift < 1) ? 0 : (1 << (result_shift - 1));
struct OutputStageQuantizeDownInt32ToUint8Scale {
  std::int32_t result_offset;    // Added to each input value before scaling.
  std::int32_t result_mult_int;  // Integer multiplier applied after the offset.
  std::int32_t result_shift;     // Rounding right-shift applied last.
};

// This output stage takes int32 values and returns still int32 values,
// but "quantized down" to the uint8 scale; in other words, its output
// is typically what one would then clamp to [0..255] and cast to uint8
// (see OutputStageSaturatingCastToUint8).
//
// This "quantization down" process depends on 3 parameters,
//   result_offset, result_mult_int, result_shift,
// and the result is:
//   ((input + result_offset) * result_mult_int + rounding) >> result_shift
// where
//   rounding = (result_shift < 1) ? 0 : (1 << (result_shift - 1));
//
// Difference from OutputStageQuantizeDownInt32ToUint8Scale here is that each
// row or column of the output (depending on tShape) has its own result_offset
// and result_mult_int numbers.
template <VectorShape tShape>
struct OutputStageQuantizeDownInt32ToUint8ScalePC {
  // Per-row or per-column (depending on tShape) offsets and multipliers;
  // the shift amount is still a single scalar shared by all channels.
  VectorMap<const std::int32_t, tShape> result_offset;
  VectorMap<const std::int32_t, tShape> result_mult_int;
  std::int32_t result_shift;
};

// This output stage takes int32 values and returns still int32 values,
// but "quantized down" to the uint8 scale; in other words, its output
// is typically what one would then clamp to [0..255] and cast to uint8
// (see OutputStageSaturatingCastToUint8).
//
// This "quantization down" process depends on 3 parameters,
//   result_offset, result_fixedpoint_multiplier, result_shift,
// and the result is:
//   ((FixedPointMul(input, result_fixedpoint_multiplier) +
//     rounding) >> result_shift) + result_offset_after_shift
// where
//   rounding = (result_shift < 1) ? 0 : (1 << (result_shift - 1));
// and where FixedPointMul(x, y) is the nearest integer to the following
// mathematical expression, evaluated without overflow or intermediate
// rounding:
//   (x * y) / 2^31
// In practice, it is expected that FixedPointMul will be implemented
// using hardware "rounding doubling int32 multiply high" instructions,
// such as VQRDMULH on ARM. See in fixedpoint.h the generic function,
// SaturatingRoundingDoublingHighMul.
//
// Notice that the other difference from
// OutputStageQuantizeDownInt32ToUint8Scale is that the result offset
// is applied after the multiplier and shift, not before. This ensures
// that no matter what the multiplier and shift are, the result offset
// is effectively integral: offsetting the final result by an integer.
// The motivation for this is to faithfully support quantization schemes
// where the formula linking quantized values to the real mathematical
// values that they represent, is of the form
//
//   real_value = scale * (quantized_value - zero_point)
//
// where scale is a real number (represented in quantized form by
// result_fixedpoint_multiplier and result_shift) and zero_point
// is an integer telling which quantized value correspond to the
// real value 0, and is represented here by (the opposite of)
// result_offset_after_shift.
// The motivation for such a quantization scheme, designed to
// ensure that 0 is always a representable value, is that in
// many applications, we need to 0-pad arrays and that can only be
// done for quantized arrays if 0 is a representable value in
// quantized form. In particular, convolution-like operations
// are often implemented using 0-padding, or "im2col"-like
// expansions that implicitly rely on 0-padding. If 0 were not
// a representable value, such operations would have to pad
// using a nonzero value, introducing bias in the computation.
struct OutputStageQuantizeDownInt32ToUint8ScaleByFixedPoint {
  std::int32_t result_fixedpoint_multiplier;  // Q0.31 fixed-point multiplier.
  std::int32_t result_shift;                  // Rounding right-shift amount.
  std::int32_t result_offset_after_shift;     // Integral offset, applied last.
};

// This output stage takes int32 values that are expected to be already
// on the final uint8 scale, but not necessarily in the [0..255] range.
// It clamps them to the [0..255] range and returns them casted to uint8.
struct OutputStageSaturatingCastToUint8 {};

// This output stage depends on a "bias vector" that should contain int32
// entries, and be either a row-vector of the same number of columns as the
// result matrix, or a column-vector of the same number of rows as the
// result matrix. This output stage takes int32 values and adds to them
// the corresponding entry of the bias vector (broadcasted in the other
// direction to fit the matrix's shape), outputting int32 values.
template <typename VectorType>
struct OutputStageBiasAddition {
  VectorType bias_vector;  // Row- or column-vector of int32 bias entries.
};

// This output stage clamps value between the specified min and max bounds.
// It can be used to implement "rectified linear unit" activation functions
// in neural networks.
struct OutputStageClamp {
  std::int32_t min;
  std::int32_t max;
};

// Output stage applying a tanh-shaped response curve to int32 values.
// NOTE(review): the field names suggest real_zero_as_int32 is the int32
// representation of the real value 0 and real_amplitude_as_int32 that of
// the tanh amplitude — confirm against the output-stage implementation.
struct OutputStageTanh {
  std::int32_t real_zero_as_int32;
  std::int32_t real_amplitude_as_int32;
};

// An output pipeline is just a std::tuple of output stages.
// This function generates a standard output pipeline consisting of two stages:
// OutputStageQuantizeDownInt32ToUint8Scale, OutputStageSaturatingCastToUint8.
1527b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wanginline std::tuple<OutputStageQuantizeDownInt32ToUint8Scale, 1537b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang OutputStageSaturatingCastToUint8> 1547b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao WangMakeStandardOutputPipeline(std::int32_t result_offset, 1557b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang std::int32_t result_mult_int, 1567b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang std::int32_t result_shift) { 1577b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang OutputStageQuantizeDownInt32ToUint8Scale quantize_down_stage; 1587b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang quantize_down_stage.result_offset = result_offset; 1597b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang quantize_down_stage.result_mult_int = result_mult_int; 1607b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang quantize_down_stage.result_shift = result_shift; 1617b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang OutputStageSaturatingCastToUint8 saturating_cast_stage; 1627b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang return std::make_tuple(quantize_down_stage, saturating_cast_stage); 1637b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang} 1647b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang 1657b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang// An output pipeline is just a std::tuple of output stages. 1667b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang// This function generates a standard output pipeline consisting of two stages: 1677b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang// OutputStageQuantizeDownInt32ToUint8ScalePC, OutputStageSaturatingCastToUint8. 
1687b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wangtemplate <VectorShape tShape> 1697b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wanginline std::tuple<OutputStageQuantizeDownInt32ToUint8ScalePC<tShape>, 1707b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang OutputStageSaturatingCastToUint8> 171a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao WangMakeStandardOutputPipeline( 172a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang const VectorMap<const std::int32_t, tShape>& result_offset, 173a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang const VectorMap<const std::int32_t, tShape>& result_mult_int, 174a9fd919a0080e2c3c7ed1ce451c85a4d86f2f8c1Miao Wang std::int32_t result_shift) { 1757b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang OutputStageQuantizeDownInt32ToUint8ScalePC<tShape> quantize_down_stage; 1767b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang quantize_down_stage.result_offset = result_offset; 1777b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang quantize_down_stage.result_mult_int = result_mult_int; 1787b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang quantize_down_stage.result_shift = result_shift; 1797b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang OutputStageSaturatingCastToUint8 saturating_cast_stage; 1807b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang return std::make_tuple(quantize_down_stage, saturating_cast_stage); 1817b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang} 1827b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang 1837b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang} // namespace gemmlowp 1847b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang 1857b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang#endif // GEMMLOWP_PUBLIC_OUTPUT_STAGES_H_ 186