// Copyright 2015 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// pack.h: packing blocks of the LHS and RHS into the data layout
// that is expected by compute.h and eventually by kernels.
// Because this data layout depends on the kernel format, code here
// is templated in KernelLhsFormat/KernelRhsFormat.
//
// Readers note: an important theme around here is that we try hard
// to handle both Lhs and Rhs with a single piece of code. We indifferently
// refer to the Lhs and Rhs as a 'Side'. Instead of addressing matrices
// by (row, column) indices, we address them by (width, depth), as explained
// in kernel.h. This allows us to handle both Lhs and Rhs on an equal footing,
// at once.

#ifndef GEMMLOWP_INTERNAL_PACK_H_
#define GEMMLOWP_INTERNAL_PACK_H_

#include <cstring>

#include "../public/bit_depth.h"
#include "allocator.h"
#include "block_params.h"
#include "common.h"
#include "kernel.h"

namespace gemmlowp {

// A PackedSideBlock instance is a packed block of either the LHS or RHS
// (whence the generic 'Side' name).
4275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// 4375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// 'Packed' means that it is laid out in the storage order that 4475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// is expected by the specified kernel format. From a block of the input 4575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// LHS or RHS matrix, one obtains a PackedSideBlock by calling PackLhs() 4675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// or PackRhs(). 470a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wangtemplate <typename tKernelSideFormat> 4875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacobclass PackedSideBlock { 4975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob public: 500a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang typedef tKernelSideFormat KernelSideFormat; 510a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang 5275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob PackedSideBlock(Side side, Allocator* allocator, 537b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang const BlockParams& block_params) 5475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob : allocator_(allocator), 5575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob pos_(0) { 5675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob GetSideBlockParams(side, ¶ms_, block_params); 5775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob data_handle_ = 5875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob allocator_->Reserve<std::uint8_t>(params_.l2_width * params_.l2_depth); 597b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang sums_of_each_slice_handle_ = 6075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob allocator_->Reserve<std::int32_t>(params_.l2_width); 6175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 6275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 6375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob ~PackedSideBlock() {} 6475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 6575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob void seek_run(int 
start_width, int start_depth) const { 6675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob int kernel_run_depth = 6775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob std::min<int>(params_.l1_depth, params_.l2_depth - start_depth); 6875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob pos_ = params_.l2_width * start_depth + start_width * kernel_run_depth; 6975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 7075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 7175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob void seek_next_cell() const { pos_ += KernelSideFormat::Cell::kSize; } 7275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 7375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob void seek_forward_n_cells(int n) const { 7475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob pos_ += n * KernelSideFormat::Cell::kSize; 7575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 7675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 7775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob const std::uint8_t* current_data() const { 7875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob return allocator_->GetPointer<std::uint8_t>(data_handle_) + pos_; 7975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 8075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 8175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob std::uint8_t* current_data() { 8275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob return allocator_->GetPointer<std::uint8_t>(data_handle_) + pos_; 8375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 8475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 857b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang std::int32_t* sums_of_each_slice() { 867b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang return allocator_->GetPointer<std::int32_t>(sums_of_each_slice_handle_); 8775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 8875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 897b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang const std::int32_t* 
sums_of_each_slice() const { 907b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang return allocator_->GetPointer<const std::int32_t>( 917b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang sums_of_each_slice_handle_); 9275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 9375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 9475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob const SideBlockParams& params() const { return params_; } 9575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 9675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob private: 9775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob // The block size parameters that this PackedSizeBlock follows. 9875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob // The L2 parameters determine its overall size, while the L1 parameters, 9975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob // together with the kernel format template parameter, determine 10075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob // the fine details of the storage/traversal order. 10175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob SideBlockParams params_; 10275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 10375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob // Pointer to the allocator provided by the caller. Not owned. 10475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob // The Allocator is assumed to outlive the PackedSideBlock. 10575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob Allocator* const allocator_; 10675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 10775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob // Handle on the buffer backing this packed block. Owned. 10875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob Allocator::Handle data_handle_; 10975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 1107b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang // Handle on the additional buffer backing the vector of sums of slices 11175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob // associated with this block. 
Owned. 1127b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang Allocator::Handle sums_of_each_slice_handle_; 11375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 11475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob // pos_ is the current position in the buffer, which we access 11575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob // sequentially, like a file. 11675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob // The idea is that we pack data in the same order as it is 11775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob // going to be traversed during the computation, which for 11875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob // cache-friendliness reasons is complicated to random-access, 11975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob // as the offsets calculations would be intricate. So we 12075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob // give up random-access addressing, and instead content ourselves 12175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob // with sequential access. 12275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob // 12375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob // pos_ is mutable because during the computation we will want to 12475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob // be able to iterate on the data in a const PackedSideBlock. 12575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob mutable int pos_; 12675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob}; 12775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 12875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// WidthMajor and DepthMajor are custom phrases modelled after the 12975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// standard terminology 'row-major' and 'column-major'. 
Their meaning 13075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// should be transparent once one has read the explanation in kernel.h: 13175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// for example, in the Lhs, the 'width' dimension is the rows dimension, 13275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// so there WidthMajor means RowMajor, while in the Rhs it is the opposite. 13375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// Another way to put it: WidthMajor means that contiguous storage is used 13475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// for entries having the same 'width' index. 13575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacobenum class SideMapOrder { WidthMajor, DepthMajor }; 13675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 13775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// Similar to MatrixMap from map.h, but in terms of width/depth instead of 13875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// rows/columns. Used to address blocks of the input LHS/RHS matrices when 13975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// packing them. 
14075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacobtemplate <typename tScalar, SideMapOrder tOrder> 14175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacobclass SideMap { 14275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob public: 14375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob typedef tScalar Scalar; 14475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob static const SideMapOrder kOrder = tOrder; 14575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 14675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob SideMap(Scalar* data, int width, int depth, int stride) 14775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob : data_(data), width_(width), depth_(depth), stride_(stride) {} 14875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 1490a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang SideMap(Scalar* data, int width, int depth) 1500a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang : data_(data), width_(width), depth_(depth) { 1510a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang stride_ = kOrder == SideMapOrder::WidthMajor ? 
depth_ : width_; 1520a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang } 1530a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang 15475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob SideMap(const SideMap& other) 15575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob : data_(other.data_), 15675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob width_(other.width_), 15775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob depth_(other.depth_), 15875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob stride_(other.stride_) {} 15975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 16075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob int width() const { return width_; } 16175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob int depth() const { return depth_; } 16275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob int stride() const { return stride_; } 16375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob int width_stride() const { 16475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob return kOrder == SideMapOrder::DepthMajor ? 1 : stride_; 16575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 16675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob int depth_stride() const { 16775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob return kOrder == SideMapOrder::WidthMajor ? 
1 : stride_; 16875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 16975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob Scalar* data() const { return data_; } 17075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob Scalar* data(int w, int d) const { 17175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob return data_ + w * width_stride() + d * depth_stride(); 17275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 17375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob Scalar operator()(int w, int d) const { return *data(w, d); } 17475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob Scalar& operator()(int w, int d) { return *data(w, d); } 17575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 17675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob SideMap block(int start_width, int start_depth, int block_width, 17775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob int block_depth) const { 17875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob assert(start_width >= 0); 17975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob assert(start_width + block_width <= width_); 18075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob assert(start_depth >= 0); 18175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob assert(start_depth + block_depth <= depth_); 18275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 18375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob return SideMap(data(start_width, start_depth), block_width, block_depth, 18475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob stride_); 18575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 18675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 18775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob private: 18875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob Scalar* data_; // not owned. 
18975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob int width_, depth_, stride_; 19075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob}; 19175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 1927b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wangtemplate <RoundingMode tRoundingMode> 1937b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wangclass ScalarRoundingOffsetGenerator { 1947b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang public: 1957b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang std::uint8_t get() { 1967b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang assert(false); // This generic path should never be called. 1977b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang return 0; 1987b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang } 1997b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang}; 2007b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang 2017b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang// A RoundingOffsetGenerator for rounding-to-nearest, always returning 2027b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang// the midpoint value 127. 2037b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wangtemplate <> 2047b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wangclass ScalarRoundingOffsetGenerator<RoundingMode::Nearest> { 2057b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang public: 2067b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang std::uint8_t get() { return 127; } 2077b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang}; 2087b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang 2097b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang// A RoundingOffsetGenerator based on a 8-bit Xorshift. 2107b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang// This gives good results as Xorshift naturally generates 2117b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang// uniform random *nonzero* bytes i.e. 255 different values, 2127b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang// so it only remains for us to subtract one. 
2137b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wangtemplate <> 2147b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wangclass ScalarRoundingOffsetGenerator<RoundingMode::ProbabilisticXorshift> { 2150a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang public: 2167b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang ScalarRoundingOffsetGenerator() { x_ = 128; } 2170a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang 2180a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang std::uint8_t get() { 2197b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang std::uint8_t result = x_ - 1; 2200a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // Xorshift8(7,5,3) 2210a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang x_ ^= x_ << 7; 2220a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang x_ ^= x_ >> 5; 2230a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang x_ ^= x_ << 3; 2247b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang return result; 2257b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang } 2267b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang 2277b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang private: 2287b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang // State 2297b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang std::uint8_t x_; 2307b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang}; 2317b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang 2327b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang// A RoundingOffsetGenerator based on an 8-bit add/mod 2337b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang// low-discrepancy sequence. See less-than-8-bit.txt for 2347b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang// an explanation (the constant 97 is important - it must 2357b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang// be both relatively prime to 255, in order for the sequence 2367b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang// to be full-period, and c/255 should be close to 0.38 to 2377b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang// obtain low discrepancy). 
Uses a small bit hack to avoid 2387b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang// expensive % operations. 2397b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wangtemplate <> 2407b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wangclass ScalarRoundingOffsetGenerator<RoundingMode::ProbabilisticAddmod> { 2417b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang static const uint8_t AddConst = 97; 2427b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang 2437b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang public: 2447b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang ScalarRoundingOffsetGenerator() { x_ = 1; } // Start must be non-zero 2457b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang 2467b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang std::uint8_t get() { 2477b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang // The +'d boolean term causes the increment to skip over 255, 2487b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang // (recalling that 255+1 = 256 = 0 for an 8 bit uint), 2497b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang // thus implementing %255 2507b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang x_ += (AddConst + (x_ >= (255 - AddConst))); 2510a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang return x_; 2520a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang } 2530a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang 2540a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang private: 2550a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // State 2560a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang std::uint8_t x_; 2570a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang}; 2580a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang 2590a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// Requantizes a source uint8 value in [0..255] range 2600a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// to the range specified by BitDepth, [0..((2^bits)-1)]. 2610a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// Bias must be avoided. 
Currently this is achieved 2620a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// by probabilistic rounding. 2637b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wangtemplate <typename QuantizationParams> 2647b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wangstd::uint8_t Requantize( 2657b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang std::uint8_t raw_src_val, 2667b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang ScalarRoundingOffsetGenerator<QuantizationParams::kRoundingMode>* 2677b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang rounding_offset_generator) { 2687b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang static const int kBits = QuantizationParams::BitDepth::kBits; 2690a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang static const std::uint8_t kMaxVal = (1 << kBits) - 1; 2700a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang 2710a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang if (kBits == 8) { 2720a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang return raw_src_val; 2730a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang } 2740a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang 2750a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang std::uint16_t scaled = static_cast<std::uint16_t>(raw_src_val) * kMaxVal; 2767b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang std::uint8_t rounding_offset = rounding_offset_generator->get(); 2770a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang return (scaled + rounding_offset) / 255; 2780a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang} 2790a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang 2800a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// A PackingRegisterBlock is a small fixed-size block of a matrix being 2810a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// packed. 
This class is the generic non-optimized implementation, 2820a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// it is inherited by the generic implementation of PackingRegisterBlock, 2830a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// which may be overriden by template specialization. Overriding it is how 2840a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// one may provide optimized packing code paths. 2850a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// 2860a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// The packing of a block proceeds in two steps: 2870a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// 1. Ensuring that we have a complete block of source data, i.e. a block of 2880a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// the compile-time prescribed size. This is where we handle unaligned 2890a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// boundaries: if we don't have a complete block of source data, then 2900a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// we copy and zero-extend it into a local temporary (complete_src_), 2910a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// see MakeCompleteSrc. In the generic case, we do have a complete block, 2920a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// so we just use it in-place, see UseCompleteSrcInPlace. 2930a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// 2. Packing a complete block into the destination, see Pack. This is the 2940a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// most critical part, so it's convenient that unaligned boundaries have 2950a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// already been handled in step 1. 
2967b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wangtemplate <typename QuantizationParams, typename SrcMapType, 2977b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang typename PackedSideBlock> 2980a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wangclass PackingRegisterBlockBase { 2990a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang public: 3000a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang typedef typename PackedSideBlock::KernelSideFormat KernelSideFormat; 3010a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang typedef typename KernelSideFormat::Cell CellFormat; 3020a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang static const int kCells = KernelSideFormat::kCells; 3030a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang static const int kCellWidth = CellFormat::kWidth; 3040a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang static const int kKernelWidth = CellFormat::kWidth * kCells; 3050a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang static const int kCellDepth = CellFormat::kDepth; 3060a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang static const int kCellSize = CellFormat::kSize; 3070a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang static const SideMapOrder kSrcOrder = SrcMapType::kOrder; 3080a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang 3097b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang typedef ScalarRoundingOffsetGenerator<QuantizationParams::kRoundingMode> 3107b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang RoundingOffsetGenerator; 3110a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang 3120a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang PackingRegisterBlockBase() : complete_src_(nullptr, 0, 0, 0) {} 3130a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang 3140a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang protected: 3150a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // The source data that's ready for packing. 
May point to 3160a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // in-place actual source data if it's already a complete block, 3170a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // (see UseCompleteSrcInPlace) 3180a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // or to the local buf_ below into which we copy incomplete blocks 3190a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // (see MakeCompleteSrc) 3200a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang SrcMapType complete_src_; 3210a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang 3220a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // Temporary buffer for loading incomplete blocks to, 3230a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // in the source storage order 3240a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang std::uint8_t buf_[kKernelWidth * kRegisterSize]; 3250a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang 3260a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang public: 3270a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // Selects a block if in-place source data that's already a complete block 3280a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang void UseCompleteSrcInPlace(const SrcMapType& src) { complete_src_ = src; } 3290a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // Copies an incomplete block of source data into a local temporary 3300a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // complete block by zero-extending it. 
3310a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang void MakeCompleteSrc(const SrcMapType& src) { 3320a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang memset(buf_, 0, kKernelWidth * kRegisterSize); 3330a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang if (kSrcOrder == SideMapOrder::WidthMajor) { 3340a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang for (int w = 0; w < src.width(); w++) { 3350a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang memcpy(buf_ + w * kRegisterSize, src.data(w, 0), src.depth()); 3360a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang } 3370a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang } else { 3380a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang assert(kSrcOrder == SideMapOrder::DepthMajor); 3390a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang for (int d = 0; d < src.depth(); d++) { 3400a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang memcpy(buf_ + d * kKernelWidth, src.data(0, d), src.width()); 3410a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang } 3420a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang } 3430a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang complete_src_ = SrcMapType(buf_, kKernelWidth, kRegisterSize); 3440a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang } 3450a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // Packs a complete block into the destination. This is the most 3460a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // critical part and the part that we most typically want to 3470a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // override in architecture-specific optimized specializations. 
3480a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang void Pack(PackedSideBlock* dst, int start_width, 3497b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang RoundingOffsetGenerator* rounding_offset_generator) { 3500a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang std::uint8_t* dst_ptr = dst->current_data(); 3510a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang for (int cell_start_depth = 0; cell_start_depth < kRegisterSize; 3520a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang cell_start_depth += kCellDepth) { 3530a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang for (int cell_start_width = 0; cell_start_width < kKernelWidth; 3540a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang cell_start_width += kCellWidth) { 3557b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang std::int32_t* cell_sums_of_each_slice_ptr = 3567b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang dst->sums_of_each_slice() + start_width + cell_start_width; 3570a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang const SideMap<const std::uint8_t, kSrcOrder> src_cell_map( 3580a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang complete_src_.block(cell_start_width, cell_start_depth, kCellWidth, 3590a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang kCellDepth)); 3600a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang for (int w = 0; w < kCellWidth; w++) { 3610a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang std::int32_t sum = 0; 3620a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang for (int d = 0; d < kCellDepth; d++) { 3630a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang const std::uint8_t raw_src_val = src_cell_map(w, d); 3647b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang const std::uint8_t requantized = Requantize<QuantizationParams>( 3657b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang raw_src_val, rounding_offset_generator); 3660a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang dst_ptr[OffsetIntoCell<CellFormat>(w, d)] = requantized; 3670a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang sum += requantized; 
3680a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang } 3697b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang cell_sums_of_each_slice_ptr[w] += sum; 3700a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang } 3710a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang dst_ptr += kCellSize; 3720a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang } 3730a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang } 3740a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang dst->seek_forward_n_cells(kCells * kRegisterSize / kCellDepth); 3750a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang } 3760a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang}; 3770a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang 3787b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wangtemplate <typename QuantizationParams, typename SrcMapType, 3797b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang typename PackedSideBlock> 3800a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wangclass PackingRegisterBlock 3817b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang : public PackingRegisterBlockBase<QuantizationParams, SrcMapType, 3827b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang PackedSideBlock> {}; 3830a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang 3840a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// Large-scale implementation of packing. 
3857b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wangtemplate <typename QuantizationParams, typename SrcMapType, 3867b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang typename PackedSideBlock> 3870a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wangclass PackSideBlockImpl { 38875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob public: 3890a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang typedef typename PackedSideBlock::KernelSideFormat KernelSideFormat; 39075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob typedef typename KernelSideFormat::Cell CellFormat; 3910a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang static const int kCells = KernelSideFormat::kCells; 39275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob static const int kCellWidth = CellFormat::kWidth; 3930a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang static const int kKernelWidth = CellFormat::kWidth * kCells; 39475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob static const int kCellDepth = CellFormat::kDepth; 39575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 3967b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang typedef PackingRegisterBlock<QuantizationParams, SrcMapType, PackedSideBlock> 3977b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang PackingRegisterBlockType; 3987b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang typedef typename PackingRegisterBlockType::RoundingOffsetGenerator 3997b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang RoundingOffsetGenerator; 40075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 4010a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang PackSideBlockImpl(PackedSideBlock* packed_side_block, 4020a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang const SrcMapType& src_map) 4037b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang : packed_side_block_(packed_side_block), src_map_(src_map) {} 4040a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang 4050a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang PackedSideBlock* packed_side_block() const { return packed_side_block_; } 
4060a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang 4070a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang const SrcMapType& src_map() const { return src_map_; } 40875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 40975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob // The public entry point to pack a block. 41075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob void PackL2() { 4117b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang memset(packed_side_block_->sums_of_each_slice(), 0, 41275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob sizeof(std::int32_t) * packed_side_block_->params().l2_width); 41375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob for (int d = 0; d < src_map_.depth(); 41475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob d += packed_side_block_->params().l1_depth) { 41575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob int ds = std::min<int>(packed_side_block_->params().l1_depth, 41675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob src_map_.depth() - d); 41775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 41875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob for (int w = 0; w < src_map_.width(); 41975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob w += packed_side_block_->params().l1_width) { 42075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob int ws = std::min<int>(packed_side_block_->params().l1_width, 42175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob src_map_.width() - w); 42275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 4230a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang PrefetchL1(w, ws, d, ds); 42475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob PackL1(w, ws, d, ds); 42575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 42675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 42775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 42875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 42975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob protected: 43075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 
// The intermediate-level loops, between PackL2 and PackRun. 43175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob void PackL1(int start_width, int width, int start_depth, int depth) { 43275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob for (int w = 0; w < width; w += kKernelWidth) { 43375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob int ws = std::min(+kKernelWidth, width - w); 43475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob packed_side_block_->seek_run(start_width + w, start_depth); 43575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob PackRun(start_width + w, ws, start_depth, depth); 43675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 43775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 43875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 4390a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // Prefetches the data that will be read by PackL1 4400a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang void PrefetchL1(int start_width, int width, int start_depth, int depth) { 4410a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang if (SrcMapType::kOrder == SideMapOrder::WidthMajor) { 4420a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang for (int d = 0; d < depth; d += kDefaultCacheLineSize) { 4430a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang for (int w = 0; w < width; w += 1) { 4440a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang Prefetch(src_map_.data(start_width + w, start_depth + d)); 4450a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang } 4460a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang } 4470a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang } else { 4480a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang for (int d = 0; d < depth; d++) { 4490a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang for (int w = 0; w < width; w += kDefaultCacheLineSize) { 4500a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang Prefetch(src_map_.data(start_width + w, start_depth + d)); 4510a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang } 
4520a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang } 4530a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang } 4540a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang } 4550a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang 4560a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // PackRun packs only a run i.e. is the inner loop in the depth dimension. 4570a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang void PackRun(int start_width, int width, int start_depth, int depth) { 4587b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang PackingRegisterBlockType b; 4590a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang if (width == kKernelWidth) { 4600a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang const int register_aligned_depth = RoundDown<kRegisterSize>(depth); 4610a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang if (register_aligned_depth) { 4620a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang for (int d = 0; d < register_aligned_depth; d += kRegisterSize) { 4630a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang b.UseCompleteSrcInPlace(src_map_.block(start_width, start_depth + d, 4640a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang width, kRegisterSize)); 4657b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang b.Pack(packed_side_block_, start_width, &rounding_offset_generator_); 46675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 46775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 4680a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang if (register_aligned_depth < depth) { 4690a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang b.MakeCompleteSrc( 4700a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang src_map_.block(start_width, start_depth + register_aligned_depth, 4710a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang width, depth - register_aligned_depth)); 4727b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang b.Pack(packed_side_block_, start_width, &rounding_offset_generator_); 4730a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang } 4740a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang 
} else { 4750a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang assert(width < kKernelWidth); 4760a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang for (int d = 0; d < depth; d += kRegisterSize) { 4770a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang const int ds = std::min(+kRegisterSize, depth - d); 4780a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang b.MakeCompleteSrc( 4790a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang src_map_.block(start_width, start_depth + d, width, ds)); 4807b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang b.Pack(packed_side_block_, start_width, &rounding_offset_generator_); 4810a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang } 48275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 4830a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang } 48475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 48575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob // The PackedSideBlock being packed, i.e. the 'destination'. 4860a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang PackedSideBlock* const packed_side_block_; 48775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 48875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob // A map on the block of the original matrix block being packed, 48975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob // i.e. the 'source'. 49075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob const SrcMapType& src_map_; 49175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 4927b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang // Used for requantization in the less-than-8-bit case. 4930a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang // Otherwise unused. 
4947b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang RoundingOffsetGenerator rounding_offset_generator_; 4957b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang}; 4967b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang 4977b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang// Quantization parameters for the side (LHS or RHS) being packed, 4987b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang// with the rounding strategy having been already resolved to a specific 4997b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang// rounding mode. 5007b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wangtemplate <typename tBitDepth, RoundingMode tRoundingMode> 5017b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wangstruct QuantizationParams { 5027b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang typedef tBitDepth BitDepth; 5037b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang static const RoundingMode kRoundingMode = tRoundingMode; 50475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob}; 50575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 50675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// Packs a block of the input LHS matrix, into a PackedSideBlock 5077b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wangtemplate <typename BitDepthParams, typename PackedSideBlock, 5087b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang typename MatrixMapType> 5090a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wangvoid PackLhs(PackedSideBlock* dst, const MatrixMapType& src) { 51075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob ScopedProfilingLabel label("pack LHS"); 511544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang static const SideMapOrder kSideMapOrder = 512544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang MatrixMapType::kOrder == MapOrder::RowMajor ? 
SideMapOrder::WidthMajor 513544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang : SideMapOrder::DepthMajor; 51475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob typedef typename MatrixMapType::Scalar Scalar; 51575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob typedef SideMap<Scalar, kSideMapOrder> SideMapType; 51675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob SideMapType src_side_map(src.data(), src.rows(), src.cols(), src.stride()); 5177b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang typedef typename BitDepthParams::LhsBitDepth BitDepth; 5187b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang typedef typename BitDepthParams::RoundingStrategy RoundingStrategy; 5197b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang const int accumulation_depth = src_side_map.depth(); 5207b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang if (accumulation_depth < RoundingStrategy::kRoundingModeSizeThreshold) { 5217b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang typedef QuantizationParams<BitDepth, 5227b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang RoundingStrategy::kRoundingModeForSmallSizes> 5237b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang QParams; 5247b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang typedef PackSideBlockImpl<QParams, SideMapType, PackedSideBlock> ImplType; 5257b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang ImplType impl(dst, src_side_map); 5267b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang impl.PackL2(); 5277b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang } else { 5287b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang typedef QuantizationParams<BitDepth, 5297b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang RoundingStrategy::kRoundingModeForLargeSizes> 5307b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang QParams; 5317b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang typedef PackSideBlockImpl<QParams, SideMapType, PackedSideBlock> ImplType; 5327b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang ImplType impl(dst, src_side_map); 
5337b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang impl.PackL2(); 5347b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang } 53575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob} 53675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 53775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// Packs a block of the input RHS matrix, into a PackedSideBlock 5387b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wangtemplate <typename BitDepthParams, typename PackedSideBlock, 5397b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang typename MatrixMapType> 5400a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wangvoid PackRhs(PackedSideBlock* dst, const MatrixMapType& src) { 54175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob ScopedProfilingLabel label("pack RHS"); 542544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang static const SideMapOrder kSideMapOrder = 543544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang MatrixMapType::kOrder == MapOrder::ColMajor ? SideMapOrder::WidthMajor 544544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang : SideMapOrder::DepthMajor; 54575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob typedef typename MatrixMapType::Scalar Scalar; 54675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob typedef SideMap<Scalar, kSideMapOrder> SideMapType; 54775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob SideMapType src_side_map(src.data(), src.cols(), src.rows(), src.stride()); 5487b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang typedef typename BitDepthParams::RhsBitDepth BitDepth; 5497b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang typedef typename BitDepthParams::RoundingStrategy RoundingStrategy; 5507b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang const int accumulation_depth = src_side_map.depth(); 5517b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang if (accumulation_depth < RoundingStrategy::kRoundingModeSizeThreshold) { 5527b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang typedef QuantizationParams<BitDepth, 5537b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang 
RoundingStrategy::kRoundingModeForSmallSizes> 5547b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang QParams; 5557b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang typedef PackSideBlockImpl<QParams, SideMapType, PackedSideBlock> ImplType; 5567b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang ImplType impl(dst, src_side_map); 5577b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang impl.PackL2(); 5587b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang } else { 5597b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang typedef QuantizationParams<BitDepth, 5607b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang RoundingStrategy::kRoundingModeForLargeSizes> 5617b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang QParams; 5627b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang typedef PackSideBlockImpl<QParams, SideMapType, PackedSideBlock> ImplType; 5637b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang ImplType impl(dst, src_side_map); 5647b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang impl.PackL2(); 5657b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang } 56675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob} 56775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 56875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob} // namespace gemmlowp 56975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 57075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob#ifdef GEMMLOWP_NEON 571544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang#include "pack_neon.h" 5727b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang#elif defined(GEMMLOWP_SSE4) 5737b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang#include "pack_SSE.h" 57475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob#endif 57575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 57675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob#endif // GEMMLOWP_INTERNAL_PACK_H_ 577