// Copyright 2015 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
1475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
// pack.h: packing blocks of the LHS and RHS into the data layout
// that is expected by compute.h and eventually by kernels.
// Because this data layout depends on the kernel format, code here
// is templated in KernelLhsFormat/KernelRhsFormat.
//
// Reader's note: an important theme around here is that we try hard
// to handle both Lhs and Rhs with a single piece of code. We indifferently
// refer to the Lhs and Rhs as a 'Side'. Instead of addressing matrices
// by (row, column) indices, we address them by (width, depth), as explained
// in kernel.h. This allows us to handle both Lhs and Rhs on an equal footing,
// at once.
2675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
2775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob#ifndef GEMMLOWP_INTERNAL_PACK_H_
2875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob#define GEMMLOWP_INTERNAL_PACK_H_
2975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
3075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob#include <cstring>
3175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
327b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang#include "../public/bit_depth.h"
337b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang#include "allocator.h"
34544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang#include "block_params.h"
35544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang#include "common.h"
367b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang#include "kernel.h"
3775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
3875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacobnamespace gemmlowp {
3975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
4075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// A PackedSideBlock instance is a packed block of either the LHS or RHS
4175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// (whence the generic 'Side' name).
4275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob//
4375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// 'Packed' means that it is laid out in the storage order that
4475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// is expected by the specified kernel format. From a block of the input
4575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// LHS or RHS matrix, one obtains a PackedSideBlock by calling PackLhs()
4675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// or PackRhs().
470a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wangtemplate <typename tKernelSideFormat>
4875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacobclass PackedSideBlock {
4975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob public:
500a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  typedef tKernelSideFormat KernelSideFormat;
510a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
5275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  PackedSideBlock(Side side, Allocator* allocator,
537b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang                  const BlockParams& block_params)
5475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob      : allocator_(allocator),
5575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        pos_(0) {
5675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    GetSideBlockParams(side, &params_, block_params);
5775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    data_handle_ =
5875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        allocator_->Reserve<std::uint8_t>(params_.l2_width * params_.l2_depth);
597b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang    sums_of_each_slice_handle_ =
6075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        allocator_->Reserve<std::int32_t>(params_.l2_width);
6175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  }
6275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
6375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  ~PackedSideBlock() {}
6475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
6575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  void seek_run(int start_width, int start_depth) const {
6675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    int kernel_run_depth =
6775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        std::min<int>(params_.l1_depth, params_.l2_depth - start_depth);
6875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    pos_ = params_.l2_width * start_depth + start_width * kernel_run_depth;
6975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  }
7075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
7175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  void seek_next_cell() const { pos_ += KernelSideFormat::Cell::kSize; }
7275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
7375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  void seek_forward_n_cells(int n) const {
7475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    pos_ += n * KernelSideFormat::Cell::kSize;
7575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  }
7675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
7775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  const std::uint8_t* current_data() const {
7875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    return allocator_->GetPointer<std::uint8_t>(data_handle_) + pos_;
7975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  }
8075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
8175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  std::uint8_t* current_data() {
8275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    return allocator_->GetPointer<std::uint8_t>(data_handle_) + pos_;
8375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  }
8475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
857b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang  std::int32_t* sums_of_each_slice() {
867b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang    return allocator_->GetPointer<std::int32_t>(sums_of_each_slice_handle_);
8775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  }
8875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
897b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang  const std::int32_t* sums_of_each_slice() const {
907b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang    return allocator_->GetPointer<const std::int32_t>(
917b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        sums_of_each_slice_handle_);
9275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  }
9375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
9475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  const SideBlockParams& params() const { return params_; }
9575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
9675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob private:
9775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  // The block size parameters that this PackedSizeBlock follows.
9875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  // The L2 parameters determine its overall size, while the L1 parameters,
9975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  // together with the kernel format template parameter, determine
10075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  // the fine details of the storage/traversal order.
10175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  SideBlockParams params_;
10275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
10375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  // Pointer to the allocator provided by the caller. Not owned.
10475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  // The Allocator is assumed to outlive the PackedSideBlock.
10575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  Allocator* const allocator_;
10675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
10775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  // Handle on the buffer backing this packed block. Owned.
10875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  Allocator::Handle data_handle_;
10975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
1107b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang  // Handle on the additional buffer backing the vector of sums of slices
11175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  // associated with this block. Owned.
1127b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang  Allocator::Handle sums_of_each_slice_handle_;
11375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
11475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  // pos_ is the current position in the buffer, which we access
11575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  // sequentially, like a file.
11675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  // The idea is that we pack data in the same order as it is
11775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  // going to be traversed during the computation, which for
11875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  // cache-friendliness reasons is complicated to random-access,
11975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  // as the offsets calculations would be intricate. So we
12075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  // give up random-access addressing, and instead content ourselves
12175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  // with sequential access.
12275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  //
12375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  // pos_ is mutable because during the computation we will want to
12475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  // be able to iterate on the data in a const PackedSideBlock.
12575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  mutable int pos_;
12675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob};
12775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
// WidthMajor and DepthMajor are custom phrases modelled after the
// standard terminology 'row-major' and 'column-major'. Their meaning
// should be transparent once one has read the explanation in kernel.h:
// for example, in the Lhs, the 'width' dimension is the rows dimension,
// so there WidthMajor means RowMajor, while in the Rhs it is the opposite.
// In terms of the addressing below: with WidthMajor, stepping along depth
// moves by 1 entry and stepping along width moves by 'stride' entries;
// with DepthMajor it is the other way around.
enum class SideMapOrder { WidthMajor, DepthMajor };

// Similar to MatrixMap from map.h, but in terms of width/depth instead of
// rows/columns. Used to address blocks of the input LHS/RHS matrices when
// packing them.
//
// A SideMap is a non-owning view: it stores a pointer plus three ints and
// relies on the compiler-generated copy operations (no user-declared copy
// constructor, so copy-assignment is not a deprecated implicit operation).
template <typename tScalar, SideMapOrder tOrder>
class SideMap {
 public:
  typedef tScalar Scalar;
  static const SideMapOrder kOrder = tOrder;

  // View over 'data' with an explicit stride, i.e. the distance in entries
  // between consecutive indices along the major (non-contiguous) dimension.
  SideMap(Scalar* data, int width, int depth, int stride)
      : data_(data), width_(width), depth_(depth), stride_(stride) {}

  // View over densely stored data: the stride is the extent of the
  // contiguous dimension (depth for WidthMajor, width for DepthMajor).
  SideMap(Scalar* data, int width, int depth)
      : data_(data),
        width_(width),
        depth_(depth),
        stride_(kOrder == SideMapOrder::WidthMajor ? depth : width) {}

  int width() const { return width_; }
  int depth() const { return depth_; }
  int stride() const { return stride_; }

  // Distance in entries between (w, d) and (w + 1, d).
  int width_stride() const {
    return kOrder == SideMapOrder::DepthMajor ? 1 : stride_;
  }
  // Distance in entries between (w, d) and (w, d + 1).
  int depth_stride() const {
    return kOrder == SideMapOrder::WidthMajor ? 1 : stride_;
  }

  // Pointer to the first entry of the view.
  Scalar* data() const { return data_; }
  // Pointer to the entry at (w, d). No bounds checking is performed.
  Scalar* data(int w, int d) const {
    return data_ + w * width_stride() + d * depth_stride();
  }
  Scalar operator()(int w, int d) const { return *data(w, d); }
  Scalar& operator()(int w, int d) { return *data(w, d); }

  // Returns a view on the sub-block of size block_width x block_depth whose
  // first entry is (start_width, start_depth). The sub-block shares the
  // underlying storage and stride of this view. In debug builds, asserts
  // that the requested sub-block lies inside this view.
  SideMap block(int start_width, int start_depth, int block_width,
                int block_depth) const {
    assert(start_width >= 0);
    assert(block_width >= 0);
    assert(start_width + block_width <= width_);
    assert(start_depth >= 0);
    assert(block_depth >= 0);
    assert(start_depth + block_depth <= depth_);

    return SideMap(data(start_width, start_depth), block_width, block_depth,
                   stride_);
  }

 private:
  Scalar* data_;  // not owned.
  int width_, depth_, stride_;
};
19175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
1927b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wangtemplate <RoundingMode tRoundingMode>
1937b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wangclass ScalarRoundingOffsetGenerator {
1947b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang public:
1957b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang  std::uint8_t get() {
1967b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang    assert(false);  // This generic path should never be called.
1977b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang    return 0;
1987b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang  }
1997b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang};
2007b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
2017b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang// A RoundingOffsetGenerator for rounding-to-nearest, always returning
2027b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang// the midpoint value 127.
2037b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wangtemplate <>
2047b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wangclass ScalarRoundingOffsetGenerator<RoundingMode::Nearest> {
2057b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang public:
2067b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang  std::uint8_t get() { return 127; }
2077b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang};
2087b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
2097b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang// A RoundingOffsetGenerator based on a 8-bit Xorshift.
2107b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang// This gives good results as Xorshift naturally generates
2117b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang// uniform random *nonzero* bytes i.e. 255 different values,
2127b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang// so it only remains for us to subtract one.
2137b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wangtemplate <>
2147b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wangclass ScalarRoundingOffsetGenerator<RoundingMode::ProbabilisticXorshift> {
2150a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang public:
2167b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang  ScalarRoundingOffsetGenerator() { x_ = 128; }
2170a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
2180a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  std::uint8_t get() {
2197b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang    std::uint8_t result = x_ - 1;
2200a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    // Xorshift8(7,5,3)
2210a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    x_ ^= x_ << 7;
2220a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    x_ ^= x_ >> 5;
2230a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    x_ ^= x_ << 3;
2247b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang    return result;
2257b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang  }
2267b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
2277b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang private:
2287b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang  // State
2297b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang  std::uint8_t x_;
2307b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang};
2317b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
2327b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang// A RoundingOffsetGenerator based on an 8-bit add/mod
2337b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang// low-discrepancy sequence.  See less-than-8-bit.txt for
2347b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang// an explanation (the constant 97 is important - it must
2357b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang// be both relatively prime to 255, in order for the sequence
2367b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang// to be full-period, and c/255 should be close to 0.38 to
2377b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang// obtain low discrepancy).  Uses a small bit hack to avoid
2387b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang// expensive % operations.
2397b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wangtemplate <>
2407b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wangclass ScalarRoundingOffsetGenerator<RoundingMode::ProbabilisticAddmod> {
2417b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang  static const uint8_t AddConst = 97;
2427b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
2437b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang public:
2447b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang  ScalarRoundingOffsetGenerator() { x_ = 1; }  // Start must be non-zero
2457b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
2467b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang  std::uint8_t get() {
2477b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang    // The +'d boolean term causes the increment to skip over 255,
2487b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang    // (recalling that 255+1 = 256 = 0 for an 8 bit uint),
2497b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang    // thus implementing %255
2507b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang    x_ += (AddConst + (x_ >= (255 - AddConst)));
2510a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    return x_;
2520a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  }
2530a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
2540a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang private:
2550a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  // State
2560a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  std::uint8_t x_;
2570a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang};
2580a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
2590a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// Requantizes a source uint8 value in [0..255] range
2600a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// to the range specified by BitDepth, [0..((2^bits)-1)].
2610a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// Bias must be avoided. Currently this is achieved
2620a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// by probabilistic rounding.
2637b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wangtemplate <typename QuantizationParams>
2647b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wangstd::uint8_t Requantize(
2657b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang    std::uint8_t raw_src_val,
2667b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang    ScalarRoundingOffsetGenerator<QuantizationParams::kRoundingMode>*
2677b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        rounding_offset_generator) {
2687b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang  static const int kBits = QuantizationParams::BitDepth::kBits;
2690a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  static const std::uint8_t kMaxVal = (1 << kBits) - 1;
2700a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
2710a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  if (kBits == 8) {
2720a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    return raw_src_val;
2730a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  }
2740a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
2750a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  std::uint16_t scaled = static_cast<std::uint16_t>(raw_src_val) * kMaxVal;
2767b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang  std::uint8_t rounding_offset = rounding_offset_generator->get();
2770a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  return (scaled + rounding_offset) / 255;
2780a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang}
2790a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
2800a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// A PackingRegisterBlock is a small fixed-size block of a matrix being
2810a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// packed. This class is the generic non-optimized implementation,
2820a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// it is inherited by the generic implementation of PackingRegisterBlock,
2830a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// which may be overriden by template specialization. Overriding it is how
2840a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// one may provide optimized packing code paths.
2850a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang//
2860a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// The packing of a block proceeds in two steps:
2870a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang//   1. Ensuring that we have a complete block of source data, i.e. a block of
2880a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang//      the compile-time prescribed size. This is where we handle unaligned
2890a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang//      boundaries: if we don't have a complete block of source data, then
2900a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang//      we copy and zero-extend it into a local temporary (complete_src_),
2910a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang//      see MakeCompleteSrc. In the generic case, we do have a complete block,
2920a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang//      so we just use it in-place, see UseCompleteSrcInPlace.
2930a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang//   2. Packing a complete block into the destination, see Pack. This is the
2940a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang//      most critical part, so it's convenient that unaligned boundaries have
2950a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang//      already been handled in step 1.
2967b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wangtemplate <typename QuantizationParams, typename SrcMapType,
2977b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang          typename PackedSideBlock>
2980a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wangclass PackingRegisterBlockBase {
2990a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang public:
3000a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  typedef typename PackedSideBlock::KernelSideFormat KernelSideFormat;
3010a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  typedef typename KernelSideFormat::Cell CellFormat;
3020a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  static const int kCells = KernelSideFormat::kCells;
3030a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  static const int kCellWidth = CellFormat::kWidth;
3040a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  static const int kKernelWidth = CellFormat::kWidth * kCells;
3050a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  static const int kCellDepth = CellFormat::kDepth;
3060a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  static const int kCellSize = CellFormat::kSize;
3070a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  static const SideMapOrder kSrcOrder = SrcMapType::kOrder;
3080a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
3097b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang  typedef ScalarRoundingOffsetGenerator<QuantizationParams::kRoundingMode>
3107b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang      RoundingOffsetGenerator;
3110a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
3120a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  PackingRegisterBlockBase() : complete_src_(nullptr, 0, 0, 0) {}
3130a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
3140a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang protected:
3150a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  // The source data that's ready for packing. May point to
3160a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  // in-place actual source data if it's already a complete block,
3170a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  // (see UseCompleteSrcInPlace)
3180a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  // or to the local buf_ below into which we copy incomplete blocks
3190a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  // (see MakeCompleteSrc)
3200a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  SrcMapType complete_src_;
3210a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
3220a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  // Temporary buffer for loading incomplete blocks to,
3230a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  // in the source storage order
3240a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  std::uint8_t buf_[kKernelWidth * kRegisterSize];
3250a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
3260a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang public:
3270a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  // Selects a block if in-place source data that's already a complete block
3280a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  void UseCompleteSrcInPlace(const SrcMapType& src) { complete_src_ = src; }
3290a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  // Copies an incomplete block of source data into a local temporary
3300a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  // complete block by zero-extending it.
3310a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  void MakeCompleteSrc(const SrcMapType& src) {
3320a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    memset(buf_, 0, kKernelWidth * kRegisterSize);
3330a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    if (kSrcOrder == SideMapOrder::WidthMajor) {
3340a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang      for (int w = 0; w < src.width(); w++) {
3350a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        memcpy(buf_ + w * kRegisterSize, src.data(w, 0), src.depth());
3360a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang      }
3370a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    } else {
3380a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang      assert(kSrcOrder == SideMapOrder::DepthMajor);
3390a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang      for (int d = 0; d < src.depth(); d++) {
3400a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        memcpy(buf_ + d * kKernelWidth, src.data(0, d), src.width());
3410a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang      }
3420a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    }
3430a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    complete_src_ = SrcMapType(buf_, kKernelWidth, kRegisterSize);
3440a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  }
3450a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  // Packs a complete block into the destination. This is the most
3460a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  // critical part and the part that we most typically want to
3470a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  // override in architecture-specific optimized specializations.
3480a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  void Pack(PackedSideBlock* dst, int start_width,
3497b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang            RoundingOffsetGenerator* rounding_offset_generator) {
3500a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    std::uint8_t* dst_ptr = dst->current_data();
3510a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    for (int cell_start_depth = 0; cell_start_depth < kRegisterSize;
3520a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang         cell_start_depth += kCellDepth) {
3530a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang      for (int cell_start_width = 0; cell_start_width < kKernelWidth;
3540a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang           cell_start_width += kCellWidth) {
3557b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        std::int32_t* cell_sums_of_each_slice_ptr =
3567b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang            dst->sums_of_each_slice() + start_width + cell_start_width;
3570a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        const SideMap<const std::uint8_t, kSrcOrder> src_cell_map(
3580a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang            complete_src_.block(cell_start_width, cell_start_depth, kCellWidth,
3590a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang                                kCellDepth));
3600a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        for (int w = 0; w < kCellWidth; w++) {
3610a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang          std::int32_t sum = 0;
3620a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang          for (int d = 0; d < kCellDepth; d++) {
3630a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang            const std::uint8_t raw_src_val = src_cell_map(w, d);
3647b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang            const std::uint8_t requantized = Requantize<QuantizationParams>(
3657b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang                raw_src_val, rounding_offset_generator);
3660a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang            dst_ptr[OffsetIntoCell<CellFormat>(w, d)] = requantized;
3670a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang            sum += requantized;
3680a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang          }
3697b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang          cell_sums_of_each_slice_ptr[w] += sum;
3700a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        }
3710a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        dst_ptr += kCellSize;
3720a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang      }
3730a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    }
3740a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    dst->seek_forward_n_cells(kCells * kRegisterSize / kCellDepth);
3750a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  }
3760a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang};
3770a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
3787b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wangtemplate <typename QuantizationParams, typename SrcMapType,
3797b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang          typename PackedSideBlock>
3800a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wangclass PackingRegisterBlock
3817b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang    : public PackingRegisterBlockBase<QuantizationParams, SrcMapType,
3827b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang                                      PackedSideBlock> {};
3830a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
3840a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang// Large-scale implementation of packing.
3857b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wangtemplate <typename QuantizationParams, typename SrcMapType,
3867b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang          typename PackedSideBlock>
3870a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wangclass PackSideBlockImpl {
38875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob public:
3890a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  typedef typename PackedSideBlock::KernelSideFormat KernelSideFormat;
39075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  typedef typename KernelSideFormat::Cell CellFormat;
3910a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  static const int kCells = KernelSideFormat::kCells;
39275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  static const int kCellWidth = CellFormat::kWidth;
3930a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  static const int kKernelWidth = CellFormat::kWidth * kCells;
39475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  static const int kCellDepth = CellFormat::kDepth;
39575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
3967b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang  typedef PackingRegisterBlock<QuantizationParams, SrcMapType, PackedSideBlock>
3977b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang      PackingRegisterBlockType;
3987b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang  typedef typename PackingRegisterBlockType::RoundingOffsetGenerator
3997b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang      RoundingOffsetGenerator;
40075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
4010a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  PackSideBlockImpl(PackedSideBlock* packed_side_block,
4020a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang                    const SrcMapType& src_map)
4037b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang      : packed_side_block_(packed_side_block), src_map_(src_map) {}
4040a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
4050a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  PackedSideBlock* packed_side_block() const { return packed_side_block_; }
4060a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
4070a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  const SrcMapType& src_map() const { return src_map_; }
40875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
40975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  // The public entry point to pack a block.
41075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  void PackL2() {
4117b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang    memset(packed_side_block_->sums_of_each_slice(), 0,
41275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob           sizeof(std::int32_t) * packed_side_block_->params().l2_width);
41375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    for (int d = 0; d < src_map_.depth();
41475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob         d += packed_side_block_->params().l1_depth) {
41575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob      int ds = std::min<int>(packed_side_block_->params().l1_depth,
41675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob                             src_map_.depth() - d);
41775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
41875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob      for (int w = 0; w < src_map_.width();
41975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob           w += packed_side_block_->params().l1_width) {
42075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        int ws = std::min<int>(packed_side_block_->params().l1_width,
42175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob                               src_map_.width() - w);
42275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
4230a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        PrefetchL1(w, ws, d, ds);
42475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        PackL1(w, ws, d, ds);
42575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob      }
42675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    }
42775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  }
42875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
42975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob protected:
43075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  // The intermediate-level loops, between PackL2 and PackRun.
43175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  void PackL1(int start_width, int width, int start_depth, int depth) {
43275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    for (int w = 0; w < width; w += kKernelWidth) {
43375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob      int ws = std::min(+kKernelWidth, width - w);
43475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob      packed_side_block_->seek_run(start_width + w, start_depth);
43575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob      PackRun(start_width + w, ws, start_depth, depth);
43675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    }
43775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  }
43875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
4390a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  // Prefetches the data that will be read by PackL1
4400a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  void PrefetchL1(int start_width, int width, int start_depth, int depth) {
4410a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    if (SrcMapType::kOrder == SideMapOrder::WidthMajor) {
4420a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang      for (int d = 0; d < depth; d += kDefaultCacheLineSize) {
4430a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        for (int w = 0; w < width; w += 1) {
4440a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang          Prefetch(src_map_.data(start_width + w, start_depth + d));
4450a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        }
4460a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang      }
4470a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    } else {
4480a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang      for (int d = 0; d < depth; d++) {
4490a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        for (int w = 0; w < width; w += kDefaultCacheLineSize) {
4500a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang          Prefetch(src_map_.data(start_width + w, start_depth + d));
4510a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        }
4520a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang      }
4530a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    }
4540a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  }
4550a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang
4560a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  // PackRun packs only a run i.e. is the inner loop in the depth dimension.
4570a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  void PackRun(int start_width, int width, int start_depth, int depth) {
4587b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang    PackingRegisterBlockType b;
4590a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    if (width == kKernelWidth) {
4600a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang      const int register_aligned_depth = RoundDown<kRegisterSize>(depth);
4610a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang      if (register_aligned_depth) {
4620a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        for (int d = 0; d < register_aligned_depth; d += kRegisterSize) {
4630a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang          b.UseCompleteSrcInPlace(src_map_.block(start_width, start_depth + d,
4640a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang                                                 width, kRegisterSize));
4657b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang          b.Pack(packed_side_block_, start_width, &rounding_offset_generator_);
46675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        }
46775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob      }
4680a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang      if (register_aligned_depth < depth) {
4690a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        b.MakeCompleteSrc(
4700a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang            src_map_.block(start_width, start_depth + register_aligned_depth,
4710a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang                           width, depth - register_aligned_depth));
4727b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        b.Pack(packed_side_block_, start_width, &rounding_offset_generator_);
4730a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang      }
4740a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    } else {
4750a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang      assert(width < kKernelWidth);
4760a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang      for (int d = 0; d < depth; d += kRegisterSize) {
4770a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        const int ds = std::min(+kRegisterSize, depth - d);
4780a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        b.MakeCompleteSrc(
4790a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang            src_map_.block(start_width, start_depth + d, width, ds));
4807b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        b.Pack(packed_side_block_, start_width, &rounding_offset_generator_);
4810a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang      }
48275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    }
4830a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  }
48475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
48575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  // The PackedSideBlock being packed, i.e. the 'destination'.
4860a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  PackedSideBlock* const packed_side_block_;
48775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
48875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  // A map on the block of the original matrix block being packed,
48975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  // i.e. the 'source'.
49075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  const SrcMapType& src_map_;
49175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
4927b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang  // Used for requantization in the less-than-8-bit case.
4930a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  // Otherwise unused.
4947b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang  RoundingOffsetGenerator rounding_offset_generator_;
4957b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang};
4967b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang
4977b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang// Quantization parameters for the side (LHS or RHS) being packed,
4987b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang// with the rounding strategy having been already resolved to a specific
4997b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang// rounding mode.
5007b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wangtemplate <typename tBitDepth, RoundingMode tRoundingMode>
5017b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wangstruct QuantizationParams {
5027b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang  typedef tBitDepth BitDepth;
5037b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang  static const RoundingMode kRoundingMode = tRoundingMode;
50475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob};
50575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
50675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// Packs a block of the input LHS matrix, into a PackedSideBlock
5077b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wangtemplate <typename BitDepthParams, typename PackedSideBlock,
5087b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang          typename MatrixMapType>
5090a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wangvoid PackLhs(PackedSideBlock* dst, const MatrixMapType& src) {
51075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  ScopedProfilingLabel label("pack LHS");
511544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang  static const SideMapOrder kSideMapOrder =
512544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang      MatrixMapType::kOrder == MapOrder::RowMajor ? SideMapOrder::WidthMajor
513544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang                                                  : SideMapOrder::DepthMajor;
51475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  typedef typename MatrixMapType::Scalar Scalar;
51575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  typedef SideMap<Scalar, kSideMapOrder> SideMapType;
51675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  SideMapType src_side_map(src.data(), src.rows(), src.cols(), src.stride());
5177b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang  typedef typename BitDepthParams::LhsBitDepth BitDepth;
5187b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang  typedef typename BitDepthParams::RoundingStrategy RoundingStrategy;
5197b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang  const int accumulation_depth = src_side_map.depth();
5207b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang  if (accumulation_depth < RoundingStrategy::kRoundingModeSizeThreshold) {
5217b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang    typedef QuantizationParams<BitDepth,
5227b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang                               RoundingStrategy::kRoundingModeForSmallSizes>
5237b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        QParams;
5247b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang    typedef PackSideBlockImpl<QParams, SideMapType, PackedSideBlock> ImplType;
5257b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang    ImplType impl(dst, src_side_map);
5267b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang    impl.PackL2();
5277b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang  } else {
5287b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang    typedef QuantizationParams<BitDepth,
5297b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang                               RoundingStrategy::kRoundingModeForLargeSizes>
5307b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        QParams;
5317b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang    typedef PackSideBlockImpl<QParams, SideMapType, PackedSideBlock> ImplType;
5327b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang    ImplType impl(dst, src_side_map);
5337b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang    impl.PackL2();
5347b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang  }
53575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob}
53675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
53775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob// Packs a block of the input RHS matrix, into a PackedSideBlock
5387b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wangtemplate <typename BitDepthParams, typename PackedSideBlock,
5397b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang          typename MatrixMapType>
5400a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wangvoid PackRhs(PackedSideBlock* dst, const MatrixMapType& src) {
54175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  ScopedProfilingLabel label("pack RHS");
542544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang  static const SideMapOrder kSideMapOrder =
543544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang      MatrixMapType::kOrder == MapOrder::ColMajor ? SideMapOrder::WidthMajor
544544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang                                                  : SideMapOrder::DepthMajor;
54575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  typedef typename MatrixMapType::Scalar Scalar;
54675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  typedef SideMap<Scalar, kSideMapOrder> SideMapType;
54775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  SideMapType src_side_map(src.data(), src.cols(), src.rows(), src.stride());
5487b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang  typedef typename BitDepthParams::RhsBitDepth BitDepth;
5497b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang  typedef typename BitDepthParams::RoundingStrategy RoundingStrategy;
5507b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang  const int accumulation_depth = src_side_map.depth();
5517b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang  if (accumulation_depth < RoundingStrategy::kRoundingModeSizeThreshold) {
5527b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang    typedef QuantizationParams<BitDepth,
5537b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang                               RoundingStrategy::kRoundingModeForSmallSizes>
5547b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        QParams;
5557b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang    typedef PackSideBlockImpl<QParams, SideMapType, PackedSideBlock> ImplType;
5567b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang    ImplType impl(dst, src_side_map);
5577b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang    impl.PackL2();
5587b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang  } else {
5597b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang    typedef QuantizationParams<BitDepth,
5607b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang                               RoundingStrategy::kRoundingModeForLargeSizes>
5617b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang        QParams;
5627b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang    typedef PackSideBlockImpl<QParams, SideMapType, PackedSideBlock> ImplType;
5637b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang    ImplType impl(dst, src_side_map);
5647b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang    impl.PackL2();
5657b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang  }
56675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob}
56775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
56875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob}  // namespace gemmlowp
56975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
57075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob#ifdef GEMMLOWP_NEON
571544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang#include "pack_neon.h"
5727b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang#elif defined(GEMMLOWP_SSE4)
5737b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang#include "pack_SSE.h"
57475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob#endif
57575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
57675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob#endif  // GEMMLOWP_INTERNAL_PACK_H_
577