// Copyright 2015 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// pack.h: packing blocks of the LHS and RHS into the data layout
// that is expected by compute.h and eventually by kernels.
// Because this data layout depends on the kernel format, code here
// is templated in KernelLhsFormat/KernelRhsFormat.
//
// Readers note: an important theme around here is that we try hard
// to handle both Lhs and Rhs with a single piece of code. We refer to
// either of them generically as a 'Side'. Instead of addressing matrices
// by (row, column) indices, we address them by (width, depth), as
// explained in kernel.h: for example, an MxK Lhs is addressed as
// (width=M, depth=K) and a KxN Rhs as (width=N, depth=K), 'depth' being
// the dimension along which the GEMM accumulates. This puts Lhs and Rhs
// on an equal footing and lets one piece of code handle both.

#ifndef GEMMLOWP_INTERNAL_PACK_H_
#define GEMMLOWP_INTERNAL_PACK_H_

#include <cstring>

#include "../public/bit_depth.h"
#include "allocator.h"
#include "block_params.h"
#include "common.h"
#include "kernel.h"

namespace gemmlowp {

// A PackedSideBlock instance is a packed block of either the LHS or RHS
// (whence the generic 'Side' name).
//
// 'Packed' means that it is laid out in the storage order that
// is expected by the specified kernel format. From a block of the input
// LHS or RHS matrix, one obtains a PackedSideBlock by calling PackLhs()
// or PackRhs().
template <typename tKernelSideFormat>
class PackedSideBlock {
 public:
  typedef tKernelSideFormat KernelSideFormat;

  PackedSideBlock(Side side, Allocator* allocator,
                  const BlockParams& block_params)
      : allocator_(allocator),
        pos_(0) {
    GetSideBlockParams(side, &params_, block_params);
    data_handle_ =
        allocator_->Reserve<std::uint8_t>(params_.l2_width * params_.l2_depth);
    sums_of_each_slice_handle_ =
        allocator_->Reserve<std::int32_t>(params_.l2_width);
  }

  ~PackedSideBlock() {}

  void seek_run(int start_width, int start_depth) const {
    int kernel_run_depth =
        std::min<int>(params_.l1_depth, params_.l2_depth - start_depth);
    pos_ = params_.l2_width * start_depth + start_width * kernel_run_depth;
  }

  void seek_next_cell() const { pos_ += KernelSideFormat::Cell::kSize; }

  void seek_forward_n_cells(int n) const {
    pos_ += n * KernelSideFormat::Cell::kSize;
  }

  const std::uint8_t* current_data() const {
    return allocator_->GetPointer<std::uint8_t>(data_handle_) + pos_;
  }

  std::uint8_t* current_data() {
    return allocator_->GetPointer<std::uint8_t>(data_handle_) + pos_;
  }

  std::int32_t* sums_of_each_slice() {
    return allocator_->GetPointer<std::int32_t>(sums_of_each_slice_handle_);
  }

  const std::int32_t* sums_of_each_slice() const {
    return allocator_->GetPointer<const std::int32_t>(
        sums_of_each_slice_handle_);
  }

  const SideBlockParams& params() const { return params_; }

 private:
  // The block size parameters that this PackedSideBlock follows.
  // The L2 parameters determine its overall size, while the L1 parameters,
  // together with the kernel format template parameter, determine
  // the fine details of the storage/traversal order.
  SideBlockParams params_;

  // Pointer to the allocator provided by the caller. Not owned.
  // The Allocator is assumed to outlive the PackedSideBlock.
  Allocator* const allocator_;

  // Handle on the buffer backing this packed block. Owned.
  Allocator::Handle data_handle_;

  // Handle on the additional buffer backing the vector of sums of slices
  // associated with this block. Owned.
  Allocator::Handle sums_of_each_slice_handle_;

  // pos_ is the current position in the buffer, which we access
  // sequentially, like a file.
  // The idea is that we pack data in the same order as it is
  // going to be traversed during the computation. That order is chosen
  // for cache-friendliness, which makes random access awkward: the
  // offset calculations would be intricate. So we give up random-access
  // addressing and content ourselves with sequential access.
  //
  // pos_ is mutable because during the computation we will want to
  // be able to iterate on the data in a const PackedSideBlock.
  mutable int pos_;
};
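
// Usage sketch (illustrative only, not part of the library): compute code
// traverses a packed block sequentially, in the same order as it was packed,
// using the seek/current_data accessors above:
//
//   packed_block.seek_run(start_width, start_depth);
//   const std::uint8_t* data = packed_block.current_data();
//   const std::int32_t* sums = packed_block.sums_of_each_slice();
//   packed_block.seek_next_cell();  // advance to the next cell in the run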

// WidthMajor and DepthMajor are custom phrases modelled after the
// standard terminology 'row-major' and 'column-major'. Their meaning
// should be transparent once one has read the explanation in kernel.h:
// for example, in the Lhs, the 'width' dimension is the rows dimension,
// so there WidthMajor means RowMajor, while in the Rhs it is the opposite.
// Another way to put it: WidthMajor means that contiguous storage is used
// for entries having the same 'width' index.
enum class SideMapOrder { WidthMajor, DepthMajor };

// Similar to MatrixMap from map.h, but in terms of width/depth instead of
// rows/columns. Used to address blocks of the input LHS/RHS matrices when
// packing them.
template <typename tScalar, SideMapOrder tOrder>
class SideMap {
 public:
  typedef tScalar Scalar;
  static const SideMapOrder kOrder = tOrder;

  SideMap(Scalar* data, int width, int depth, int stride)
      : data_(data), width_(width), depth_(depth), stride_(stride) {}

  SideMap(Scalar* data, int width, int depth)
      : data_(data), width_(width), depth_(depth) {
    stride_ = kOrder == SideMapOrder::WidthMajor ? depth_ : width_;
  }

  SideMap(const SideMap& other)
      : data_(other.data_),
        width_(other.width_),
        depth_(other.depth_),
        stride_(other.stride_) {}

  int width() const { return width_; }
  int depth() const { return depth_; }
  int stride() const { return stride_; }
  int width_stride() const {
    return kOrder == SideMapOrder::DepthMajor ? 1 : stride_;
  }
  int depth_stride() const {
    return kOrder == SideMapOrder::WidthMajor ? 1 : stride_;
  }
  Scalar* data() const { return data_; }
  Scalar* data(int w, int d) const {
    return data_ + w * width_stride() + d * depth_stride();
  }
  Scalar operator()(int w, int d) const { return *data(w, d); }
  Scalar& operator()(int w, int d) { return *data(w, d); }

  SideMap block(int start_width, int start_depth, int block_width,
                int block_depth) const {
    assert(start_width >= 0);
    assert(start_width + block_width <= width_);
    assert(start_depth >= 0);
    assert(start_depth + block_depth <= depth_);

    return SideMap(data(start_width, start_depth), block_width, block_depth,
                   stride_);
  }

 private:
  Scalar* data_;  // not owned.
  int width_, depth_, stride_;
};
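
// Example (illustrative sketch only): a WidthMajor map over a raw buffer,
// where entries sharing the same 'width' index are stored contiguously.
// With width=3, depth=4 and stride=4, entry (w, d) lives at offset w*4 + d:
//
//   const std::uint8_t buffer[12] = {0};  // a 3x4 block, one row per width
//   SideMap<const std::uint8_t, SideMapOrder::WidthMajor> map(buffer, 3, 4, 4);
//   std::uint8_t x = map(2, 1);  // reads buffer[2*4 + 1] == buffer[9]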

template <RoundingMode tRoundingMode>
class ScalarRoundingOffsetGenerator {
 public:
  std::uint8_t get() {
    assert(false);  // This generic path should never be called.
    return 0;
  }
};

// A RoundingOffsetGenerator for rounding-to-nearest, always returning
// the midpoint value 127.
template <>
class ScalarRoundingOffsetGenerator<RoundingMode::Nearest> {
 public:
  std::uint8_t get() { return 127; }
};

// A RoundingOffsetGenerator based on an 8-bit Xorshift.
// This gives good results because Xorshift naturally generates
// uniform random *nonzero* bytes, i.e. 255 different values,
// so it only remains for us to subtract one.
template <>
class ScalarRoundingOffsetGenerator<RoundingMode::ProbabilisticXorshift> {
 public:
  ScalarRoundingOffsetGenerator() { x_ = 128; }

  std::uint8_t get() {
    std::uint8_t result = x_ - 1;
    // Xorshift8(7,5,3)
    x_ ^= x_ << 7;
    x_ ^= x_ >> 5;
    x_ ^= x_ << 3;
    return result;
  }

 private:
  // State
  std::uint8_t x_;
};
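
// Worked example (arithmetic mod 256): starting from the seed x_ = 128,
// the first three calls to get() return 127, 163, 168, as x_ steps through
// 128 -> 164 -> 169 under the three xorshift updates above.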

// A RoundingOffsetGenerator based on an 8-bit add/mod
// low-discrepancy sequence.  See less-than-8-bit.txt for
// an explanation (the constant 97 matters: it must be
// relatively prime to 255 so that the sequence has full period,
// and c/255 should be close to 0.38 to obtain low discrepancy).
// Uses a small bit hack to avoid expensive % operations.
template <>
class ScalarRoundingOffsetGenerator<RoundingMode::ProbabilisticAddmod> {
  static const std::uint8_t AddConst = 97;

 public:
  ScalarRoundingOffsetGenerator() { x_ = 1; }  // Start must be non-zero

  std::uint8_t get() {
    // The +'d boolean term causes the increment to skip over 255
    // (recalling that 255+1 = 256 = 0 for an 8-bit uint),
    // thus implementing %255.
    x_ += (AddConst + (x_ >= (255 - AddConst)));
    return x_;
  }

 private:
  // State
  std::uint8_t x_;
};
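
// Worked example: starting from x_ = 1, successive calls return 98, 195, 37.
// The first two increments are plain +97; on the third call x_ = 195 >= 158,
// so the extra +1 makes the 8-bit wraparound skip the value 255:
// 195 + 98 = 293 = 37 (mod 256), which indeed equals (195 + 97) % 255.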

// Requantizes a source uint8 value in the [0..255] range
// to the range specified by BitDepth, [0..((2^bits)-1)].
// Bias must be avoided: currently this is achieved
// by probabilistic rounding.
template <typename QuantizationParams>
std::uint8_t Requantize(
    std::uint8_t raw_src_val,
    ScalarRoundingOffsetGenerator<QuantizationParams::kRoundingMode>*
        rounding_offset_generator) {
  static const int kBits = QuantizationParams::BitDepth::kBits;
  static const std::uint8_t kMaxVal = (1 << kBits) - 1;

  if (kBits == 8) {
    return raw_src_val;
  }

  std::uint16_t scaled = static_cast<std::uint16_t>(raw_src_val) * kMaxVal;
  std::uint8_t rounding_offset = rounding_offset_generator->get();
  return (scaled + rounding_offset) / 255;
}
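
// Worked example (not part of the code): for a hypothetical 5-bit BitDepth,
// kMaxVal = 31. With raw_src_val = 200, scaled = 200 * 31 = 6200, and the
// exact rescaled value is 6200 / 255 ~= 24.31. A rounding offset uniform in
// [0..254] yields 25 whenever the offset is >= 175, i.e. with probability
// 80/255 ~= 0.31, and 24 otherwise, so the result is unbiased on average;
// the Nearest mode's fixed offset of 127 always gives 24 here.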

// A PackingRegisterBlock is a small fixed-size block of a matrix being
// packed. This class is the generic, non-optimized implementation;
// it is inherited by the generic implementation of PackingRegisterBlock,
// which may be overridden by template specialization. Overriding it is how
// one may provide optimized packing code paths.
//
// The packing of a block proceeds in two steps:
//   1. Ensuring that we have a complete block of source data, i.e. a block of
//      the compile-time prescribed size. This is where we handle unaligned
//      boundaries: if we don't have a complete block of source data, then
//      we copy and zero-extend it into a local temporary (complete_src_),
//      see MakeCompleteSrc. In the generic case, we do have a complete block,
//      so we just use it in-place, see UseCompleteSrcInPlace.
//   2. Packing a complete block into the destination, see Pack. This is the
//      most critical part, so it's convenient that unaligned boundaries have
//      already been handled in step 1.
template <typename QuantizationParams, typename SrcMapType,
          typename PackedSideBlock>
class PackingRegisterBlockBase {
 public:
  typedef typename PackedSideBlock::KernelSideFormat KernelSideFormat;
  typedef typename KernelSideFormat::Cell CellFormat;
  static const int kCells = KernelSideFormat::kCells;
  static const int kCellWidth = CellFormat::kWidth;
  static const int kKernelWidth = CellFormat::kWidth * kCells;
  static const int kCellDepth = CellFormat::kDepth;
  static const int kCellSize = CellFormat::kSize;
  static const SideMapOrder kSrcOrder = SrcMapType::kOrder;

  typedef ScalarRoundingOffsetGenerator<QuantizationParams::kRoundingMode>
      RoundingOffsetGenerator;

  PackingRegisterBlockBase() : complete_src_(nullptr, 0, 0, 0) {}

 protected:
  // The source data that's ready for packing. May point in-place to the
  // actual source data if it's already a complete block
  // (see UseCompleteSrcInPlace),
  // or to the local buf_ below, into which we copy incomplete blocks
  // (see MakeCompleteSrc).
  SrcMapType complete_src_;

  // Temporary buffer into which incomplete blocks are loaded,
  // in the source storage order.
  std::uint8_t buf_[kKernelWidth * kRegisterSize];

 public:
  // Selects a block of in-place source data that's already a complete block.
  void UseCompleteSrcInPlace(const SrcMapType& src) { complete_src_ = src; }
  // Copies an incomplete block of source data into a local temporary
  // complete block by zero-extending it.
  void MakeCompleteSrc(const SrcMapType& src) {
    memset(buf_, 0, kKernelWidth * kRegisterSize);
    if (kSrcOrder == SideMapOrder::WidthMajor) {
      for (int w = 0; w < src.width(); w++) {
        memcpy(buf_ + w * kRegisterSize, src.data(w, 0), src.depth());
      }
    } else {
      assert(kSrcOrder == SideMapOrder::DepthMajor);
      for (int d = 0; d < src.depth(); d++) {
        memcpy(buf_ + d * kKernelWidth, src.data(0, d), src.width());
      }
    }
    complete_src_ = SrcMapType(buf_, kKernelWidth, kRegisterSize);
  }
  // Packs a complete block into the destination. This is the most
  // critical part and the part that we most typically want to
  // override in architecture-specific optimized specializations.
  void Pack(PackedSideBlock* dst, int start_width,
            RoundingOffsetGenerator* rounding_offset_generator) {
    std::uint8_t* dst_ptr = dst->current_data();
    for (int cell_start_depth = 0; cell_start_depth < kRegisterSize;
         cell_start_depth += kCellDepth) {
      for (int cell_start_width = 0; cell_start_width < kKernelWidth;
           cell_start_width += kCellWidth) {
        std::int32_t* cell_sums_of_each_slice_ptr =
            dst->sums_of_each_slice() + start_width + cell_start_width;
        const SideMap<const std::uint8_t, kSrcOrder> src_cell_map(
            complete_src_.block(cell_start_width, cell_start_depth, kCellWidth,
                                kCellDepth));
        for (int w = 0; w < kCellWidth; w++) {
          std::int32_t sum = 0;
          for (int d = 0; d < kCellDepth; d++) {
            const std::uint8_t raw_src_val = src_cell_map(w, d);
            const std::uint8_t requantized = Requantize<QuantizationParams>(
                raw_src_val, rounding_offset_generator);
            dst_ptr[OffsetIntoCell<CellFormat>(w, d)] = requantized;
            sum += requantized;
          }
          cell_sums_of_each_slice_ptr[w] += sum;
        }
        dst_ptr += kCellSize;
      }
    }
    dst->seek_forward_n_cells(kCells * kRegisterSize / kCellDepth);
  }
};
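
// Illustrative call pattern (a sketch of what PackSideBlockImpl::PackRun
// below does; the names are local variables, not library API):
// register-aligned chunks of source are used in place, while an unaligned
// remainder is first zero-extended into a complete block:
//
//   PackingRegisterBlock<QParams, SrcMapType, PackedBlockType> b;
//   b.UseCompleteSrcInPlace(
//       src_map.block(start_width, start_depth, kKernelWidth, kRegisterSize));
//   b.Pack(&packed_block, start_width, &rounding_offset_generator);
//   // ...and for a leftover, incomplete chunk:
//   b.MakeCompleteSrc(
//       src_map.block(start_width, start_depth, leftover_width, leftover_depth));
//   b.Pack(&packed_block, start_width, &rounding_offset_generator);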

template <typename QuantizationParams, typename SrcMapType,
          typename PackedSideBlock>
class PackingRegisterBlock
    : public PackingRegisterBlockBase<QuantizationParams, SrcMapType,
                                      PackedSideBlock> {};

// Large-scale implementation of packing.
template <typename QuantizationParams, typename SrcMapType,
          typename PackedSideBlock>
class PackSideBlockImpl {
 public:
  typedef typename PackedSideBlock::KernelSideFormat KernelSideFormat;
  typedef typename KernelSideFormat::Cell CellFormat;
  static const int kCells = KernelSideFormat::kCells;
  static const int kCellWidth = CellFormat::kWidth;
  static const int kKernelWidth = CellFormat::kWidth * kCells;
  static const int kCellDepth = CellFormat::kDepth;

  typedef PackingRegisterBlock<QuantizationParams, SrcMapType, PackedSideBlock>
      PackingRegisterBlockType;
  typedef typename PackingRegisterBlockType::RoundingOffsetGenerator
      RoundingOffsetGenerator;

  PackSideBlockImpl(PackedSideBlock* packed_side_block,
                    const SrcMapType& src_map)
      : packed_side_block_(packed_side_block), src_map_(src_map) {}

  PackedSideBlock* packed_side_block() const { return packed_side_block_; }

  const SrcMapType& src_map() const { return src_map_; }

  // The public entry point to pack a block.
  void PackL2() {
    memset(packed_side_block_->sums_of_each_slice(), 0,
           sizeof(std::int32_t) * packed_side_block_->params().l2_width);
    for (int d = 0; d < src_map_.depth();
         d += packed_side_block_->params().l1_depth) {
      int ds = std::min<int>(packed_side_block_->params().l1_depth,
                             src_map_.depth() - d);

      for (int w = 0; w < src_map_.width();
           w += packed_side_block_->params().l1_width) {
        int ws = std::min<int>(packed_side_block_->params().l1_width,
                               src_map_.width() - w);

        PrefetchL1(w, ws, d, ds);
        PackL1(w, ws, d, ds);
      }
    }
  }

 protected:
  // The intermediate-level loops, between PackL2 and PackRun.
  void PackL1(int start_width, int width, int start_depth, int depth) {
    for (int w = 0; w < width; w += kKernelWidth) {
      int ws = std::min(+kKernelWidth, width - w);
      packed_side_block_->seek_run(start_width + w, start_depth);
      PackRun(start_width + w, ws, start_depth, depth);
    }
  }

  // Prefetches the data that will be read by PackL1.
  void PrefetchL1(int start_width, int width, int start_depth, int depth) {
    if (SrcMapType::kOrder == SideMapOrder::WidthMajor) {
      for (int d = 0; d < depth; d += kDefaultCacheLineSize) {
        for (int w = 0; w < width; w += 1) {
          Prefetch(src_map_.data(start_width + w, start_depth + d));
        }
      }
    } else {
      for (int d = 0; d < depth; d++) {
        for (int w = 0; w < width; w += kDefaultCacheLineSize) {
          Prefetch(src_map_.data(start_width + w, start_depth + d));
        }
      }
    }
  }

  // PackRun packs a single run, i.e. it is the inner loop along the depth
  // dimension.
  void PackRun(int start_width, int width, int start_depth, int depth) {
    PackingRegisterBlockType b;
    if (width == kKernelWidth) {
      const int register_aligned_depth = RoundDown<kRegisterSize>(depth);
      if (register_aligned_depth) {
        for (int d = 0; d < register_aligned_depth; d += kRegisterSize) {
          b.UseCompleteSrcInPlace(src_map_.block(start_width, start_depth + d,
                                                 width, kRegisterSize));
          b.Pack(packed_side_block_, start_width, &rounding_offset_generator_);
        }
      }
      if (register_aligned_depth < depth) {
        b.MakeCompleteSrc(
            src_map_.block(start_width, start_depth + register_aligned_depth,
                           width, depth - register_aligned_depth));
        b.Pack(packed_side_block_, start_width, &rounding_offset_generator_);
      }
    } else {
      assert(width < kKernelWidth);
      for (int d = 0; d < depth; d += kRegisterSize) {
        const int ds = std::min(+kRegisterSize, depth - d);
        b.MakeCompleteSrc(
            src_map_.block(start_width, start_depth + d, width, ds));
        b.Pack(packed_side_block_, start_width, &rounding_offset_generator_);
      }
    }
  }

  // The PackedSideBlock being packed, i.e. the 'destination'.
  PackedSideBlock* const packed_side_block_;

  // A map on the block of the original matrix being packed,
  // i.e. the 'source'.
  const SrcMapType& src_map_;

  // Used for requantization in the less-than-8-bit case.
  // Otherwise unused.
  RoundingOffsetGenerator rounding_offset_generator_;
};

// Quantization parameters for the side (LHS or RHS) being packed,
// with the rounding strategy having been already resolved to a specific
// rounding mode.
template <typename tBitDepth, RoundingMode tRoundingMode>
struct QuantizationParams {
  typedef tBitDepth BitDepth;
  static const RoundingMode kRoundingMode = tRoundingMode;
};

// Packs a block of the input LHS matrix into a PackedSideBlock.
template <typename BitDepthParams, typename PackedSideBlock,
          typename MatrixMapType>
void PackLhs(PackedSideBlock* dst, const MatrixMapType& src) {
  ScopedProfilingLabel label("pack LHS");
  static const SideMapOrder kSideMapOrder =
      MatrixMapType::kOrder == MapOrder::RowMajor ? SideMapOrder::WidthMajor
                                                  : SideMapOrder::DepthMajor;
  typedef typename MatrixMapType::Scalar Scalar;
  typedef SideMap<Scalar, kSideMapOrder> SideMapType;
  SideMapType src_side_map(src.data(), src.rows(), src.cols(), src.stride());
  typedef typename BitDepthParams::LhsBitDepth BitDepth;
  typedef typename BitDepthParams::RoundingStrategy RoundingStrategy;
  const int accumulation_depth = src_side_map.depth();
  if (accumulation_depth < RoundingStrategy::kRoundingModeSizeThreshold) {
    typedef QuantizationParams<BitDepth,
                               RoundingStrategy::kRoundingModeForSmallSizes>
        QParams;
    typedef PackSideBlockImpl<QParams, SideMapType, PackedSideBlock> ImplType;
    ImplType impl(dst, src_side_map);
    impl.PackL2();
  } else {
    typedef QuantizationParams<BitDepth,
                               RoundingStrategy::kRoundingModeForLargeSizes>
        QParams;
    typedef PackSideBlockImpl<QParams, SideMapType, PackedSideBlock> ImplType;
    ImplType impl(dst, src_side_map);
    impl.PackL2();
  }
}

// Packs a block of the input RHS matrix into a PackedSideBlock.
template <typename BitDepthParams, typename PackedSideBlock,
          typename MatrixMapType>
void PackRhs(PackedSideBlock* dst, const MatrixMapType& src) {
  ScopedProfilingLabel label("pack RHS");
  static const SideMapOrder kSideMapOrder =
      MatrixMapType::kOrder == MapOrder::ColMajor ? SideMapOrder::WidthMajor
                                                  : SideMapOrder::DepthMajor;
  typedef typename MatrixMapType::Scalar Scalar;
  typedef SideMap<Scalar, kSideMapOrder> SideMapType;
  SideMapType src_side_map(src.data(), src.cols(), src.rows(), src.stride());
  typedef typename BitDepthParams::RhsBitDepth BitDepth;
  typedef typename BitDepthParams::RoundingStrategy RoundingStrategy;
  const int accumulation_depth = src_side_map.depth();
  if (accumulation_depth < RoundingStrategy::kRoundingModeSizeThreshold) {
    typedef QuantizationParams<BitDepth,
                               RoundingStrategy::kRoundingModeForSmallSizes>
        QParams;
    typedef PackSideBlockImpl<QParams, SideMapType, PackedSideBlock> ImplType;
    ImplType impl(dst, src_side_map);
    impl.PackL2();
  } else {
    typedef QuantizationParams<BitDepth,
                               RoundingStrategy::kRoundingModeForLargeSizes>
        QParams;
    typedef PackSideBlockImpl<QParams, SideMapType, PackedSideBlock> ImplType;
    ImplType impl(dst, src_side_map);
    impl.PackL2();
  }
}
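
// Usage sketch (illustrative only, loosely mirroring how the single-threaded
// GEMM driver calls into this header; KernelFormat, DefaultL8R8BitDepthParams,
// Side::Lhs/Rhs, the allocator, the lhs/rhs MatrixMaps and the loop bounds
// r, c, rs, cs, depth are all assumed from surrounding gemmlowp code, not
// defined here):
//
//   PackedSideBlock<typename KernelFormat::Lhs> packed_lhs(
//       Side::Lhs, &allocator, block_params);
//   PackedSideBlock<typename KernelFormat::Rhs> packed_rhs(
//       Side::Rhs, &allocator, block_params);
//   allocator.Commit();
//   PackRhs<DefaultL8R8BitDepthParams>(&packed_rhs,
//                                      rhs.block(0, c, depth, cs));
//   PackLhs<DefaultL8R8BitDepthParams>(&packed_lhs,
//                                      lhs.block(r, 0, rs, depth));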

}  // namespace gemmlowp

#ifdef GEMMLOWP_NEON
#include "pack_neon.h"
#elif defined(GEMMLOWP_SSE4)
#include "pack_SSE.h"
#endif

#endif  // GEMMLOWP_INTERNAL_PACK_H_