// Copyright 2015 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
1475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
// compute.h: the central stage of the Gemm computation, operates
// on already-packed LHS and RHS blocks and calls the Gemm kernel
// to compute a block of the product.
1875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
1975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob#ifndef GEMMLOWP_INTERNAL_COMPUTE_H_
2075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob#define GEMMLOWP_INTERNAL_COMPUTE_H_
2175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
22544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang#include "block_params.h"
237b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang#include "kernel.h"
24544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang#include "pack.h"
2575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
2675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacobnamespace gemmlowp {
2775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
280a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wangtemplate <typename PackedLhs, typename PackedRhs, typename PackedResult>
2975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacobclass ComputeImpl {
300a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  typedef typename PackedLhs::KernelSideFormat KernelLhsFormat;
310a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  typedef typename PackedRhs::KernelSideFormat KernelRhsFormat;
320a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  typedef KernelFormat<KernelLhsFormat, KernelRhsFormat> Format;
3375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
3475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  const KernelBase& kernel_;
3575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  const BlockParams& block_params_;
3675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
3775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  PackedResult* const packed_result_;
380a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  const PackedLhs& packed_lhs_;
390a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  const PackedRhs& packed_rhs_;
4075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
4175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob public:
4275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  ComputeImpl(const KernelBase& _kernel, const BlockParams& _block_params,
430a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang              PackedResult* _packed_result, const PackedLhs& _packed_lhs,
440a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang              const PackedRhs& _packed_rhs)
4575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob      : kernel_(_kernel),
4675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        block_params_(_block_params),
4775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        packed_result_(_packed_result),
4875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        packed_lhs_(_packed_lhs),
4975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        packed_rhs_(_packed_rhs) {}
5075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
5175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  void Compute() {
5275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    for (int d = 0; d < block_params_.l2_depth; d += block_params_.l1_depth) {
5375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob      int ds = std::min(block_params_.l1_depth, block_params_.l2_depth - d);
5475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
5575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob      for (int r = 0; r < block_params_.l2_rows; r += block_params_.l1_rows) {
5675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        int rs = std::min(block_params_.l1_rows, block_params_.l2_rows - r);
5775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
5875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        ComputeL1(r, rs, 0, block_params_.l2_cols, d, ds);
5975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob      }
6075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    }
6175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  }
6275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
6375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob private:
647b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang  void ComputeRun(int start_row, int start_col, int start_depth,
657b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang                  int depth) GEMMLOWP_NOINLINE {
6675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    packed_lhs_.seek_run(start_row, start_depth);
6775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    packed_rhs_.seek_run(start_col, start_depth);
6875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    auto packed_result_block = packed_result_->Map().block(
690a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang        start_row, start_col, Format::kRows, Format::kCols);
7075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    kernel_.Run(packed_result_block.data(), packed_result_block.rows_stride(),
7175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob                packed_result_block.cols_stride(), packed_lhs_.current_data(),
7275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob                packed_rhs_.current_data(), start_depth, depth);
7375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  }
7475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
7575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  void ComputeL1(int start_row, int rows, int start_col, int cols,
7675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob                 int start_depth, int depth) {
770a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    assert(rows % Format::kRows == 0);
780a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    assert(cols % Format::kCols == 0);
790a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    assert(depth % Format::kDepth == 0);
8075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
810a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang    for (int c = 0; c < cols; c += Format::kCols) {
820a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang      for (int r = 0; r < rows; r += Format::kRows) {
8375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob        ComputeRun(start_row + r, start_col + c, start_depth, depth);
8475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob      }
8575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob    }
8675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  }
8775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob};
8875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
890a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wangtemplate <typename PackedLhs, typename PackedRhs, typename PackedResult>
9075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacobvoid Compute(const KernelBase& kernel, const BlockParams& block_params,
910a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang             PackedResult* packed_result, const PackedLhs& packed_lhs,
920a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang             const PackedRhs& packed_rhs) {
9375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  ScopedProfilingLabel label("compute");
940a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang  ComputeImpl<PackedLhs, PackedRhs, PackedResult> impl(
950a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang      kernel, block_params, packed_result, packed_lhs, packed_rhs);
9675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
9775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob  impl.Compute();
9875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob}
9975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
10075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob}  // namespace gemmlowp
10175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob
10275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob#endif  // GEMMLOWP_INTERNAL_COMPUTE_H_
103