// Copyright 2015 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// compute.h: the central stage of the Gemm computation, operates
// on already-packed LHS and RHS blocks and calls the Gemm kernel
// to compute a block of the product.
1875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 1975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob#ifndef GEMMLOWP_INTERNAL_COMPUTE_H_ 2075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob#define GEMMLOWP_INTERNAL_COMPUTE_H_ 2175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 22544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang#include "block_params.h" 237b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang#include "kernel.h" 24544690cac8f06f1b2f5fa3799e1e8f13c75d95e9Miao Wang#include "pack.h" 2575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 2675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacobnamespace gemmlowp { 2775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 280a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wangtemplate <typename PackedLhs, typename PackedRhs, typename PackedResult> 2975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacobclass ComputeImpl { 300a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang typedef typename PackedLhs::KernelSideFormat KernelLhsFormat; 310a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang typedef typename PackedRhs::KernelSideFormat KernelRhsFormat; 320a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang typedef KernelFormat<KernelLhsFormat, KernelRhsFormat> Format; 3375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 3475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob const KernelBase& kernel_; 3575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob const BlockParams& block_params_; 3675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 3775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob PackedResult* const packed_result_; 380a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang const PackedLhs& packed_lhs_; 390a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang const PackedRhs& packed_rhs_; 4075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 4175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob public: 4275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob ComputeImpl(const KernelBase& _kernel, const BlockParams& 
_block_params, 430a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang PackedResult* _packed_result, const PackedLhs& _packed_lhs, 440a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang const PackedRhs& _packed_rhs) 4575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob : kernel_(_kernel), 4675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob block_params_(_block_params), 4775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob packed_result_(_packed_result), 4875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob packed_lhs_(_packed_lhs), 4975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob packed_rhs_(_packed_rhs) {} 5075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 5175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob void Compute() { 5275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob for (int d = 0; d < block_params_.l2_depth; d += block_params_.l1_depth) { 5375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob int ds = std::min(block_params_.l1_depth, block_params_.l2_depth - d); 5475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 5575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob for (int r = 0; r < block_params_.l2_rows; r += block_params_.l1_rows) { 5675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob int rs = std::min(block_params_.l1_rows, block_params_.l2_rows - r); 5775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 5875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob ComputeL1(r, rs, 0, block_params_.l2_cols, d, ds); 5975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 6075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 6175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 6275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 6375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob private: 647b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang void ComputeRun(int start_row, int start_col, int start_depth, 657b05d573cf2e0fd3a58e98cdbfc65153a83fd6f1Miao Wang int depth) GEMMLOWP_NOINLINE { 6675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 
packed_lhs_.seek_run(start_row, start_depth); 6775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob packed_rhs_.seek_run(start_col, start_depth); 6875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob auto packed_result_block = packed_result_->Map().block( 690a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang start_row, start_col, Format::kRows, Format::kCols); 7075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob kernel_.Run(packed_result_block.data(), packed_result_block.rows_stride(), 7175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob packed_result_block.cols_stride(), packed_lhs_.current_data(), 7275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob packed_rhs_.current_data(), start_depth, depth); 7375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 7475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 7575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob void ComputeL1(int start_row, int rows, int start_col, int cols, 7675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob int start_depth, int depth) { 770a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang assert(rows % Format::kRows == 0); 780a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang assert(cols % Format::kCols == 0); 790a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang assert(depth % Format::kDepth == 0); 8075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 810a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang for (int c = 0; c < cols; c += Format::kCols) { 820a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang for (int r = 0; r < rows; r += Format::kRows) { 8375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob ComputeRun(start_row + r, start_col + c, start_depth, depth); 8475c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 8575c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 8675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob } 8775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob}; 8875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 890a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wangtemplate 
<typename PackedLhs, typename PackedRhs, typename PackedResult> 9075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacobvoid Compute(const KernelBase& kernel, const BlockParams& block_params, 910a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang PackedResult* packed_result, const PackedLhs& packed_lhs, 920a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang const PackedRhs& packed_rhs) { 9375c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob ScopedProfilingLabel label("compute"); 940a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang ComputeImpl<PackedLhs, PackedRhs, PackedResult> impl( 950a70f98b4be89f51cdd54bf739c953e82ec7fb55Miao Wang kernel, block_params, packed_result, packed_lhs, packed_rhs); 9675c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 9775c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob impl.Compute(); 9875c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob} 9975c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 10075c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob} // namespace gemmlowp 10175c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob 10275c4ec0ba4dd86e4f763a54e01002ff29f1d57aBenoit Jacob#endif // GEMMLOWP_INTERNAL_COMPUTE_H_ 103