1// Copyright 2015 The Gemmlowp Authors. All Rights Reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15// single_thread_gemm.h: Single-threaded GEMM implementation. 16// This is a good place to start reading code, as it shows the overall 17// structure of a GEMM and is much simpler than multi_thread_gemm.h. 18 19#ifndef GEMMLOWP_INTERNAL_SINGLE_THREAD_GEMM_H_ 20#define GEMMLOWP_INTERNAL_SINGLE_THREAD_GEMM_H_ 21 22#include <cassert> 23 24#include "../public/map.h" 25#include "allocator.h" 26#include "compute.h" 27#include "kernel.h" 28#include "pack.h" 29#include "unpack.h" 30 31#ifdef GEMMLOWP_PROFILING_SIZES 32#ifndef GEMMLOWP_PROFILING 33#error GEMMLOWP_PROFILING_SIZES without GEMMLOWP_PROFILING 34#endif 35#include <string> 36#include <unordered_map> 37#endif 38 39namespace gemmlowp { 40 41class SingleThreadGemmContext { 42 public: 43 Allocator* allocator() { return &allocator_; } 44 45 void set_l1_bytes_to_use(int n) { l1_bytes_to_use_ = n; } 46 void set_l2_bytes_to_use(int n) { l2_bytes_to_use_ = n; } 47 void set_l2_rhs_factor(float n) { l2_rhs_factor_ = n; } 48 49 int l1_bytes_to_use() const { return l1_bytes_to_use_; } 50 int l2_bytes_to_use() const { return l2_bytes_to_use_; } 51 float l2_rhs_factor() const { return l2_rhs_factor_; } 52 53 protected: 54 Allocator allocator_; 55 56 // The cache configurationt to use. 57 int l1_bytes_to_use_ = kDefaultL1CacheSize; 58 int l2_bytes_to_use_ = kDefaultL2CacheSize; 59 float l2_rhs_factor_ = kDefaultL2RhsFactor; 60}; 61 62template <typename KernelFormat, typename InputScalar, typename OutputScalar, 63 typename BitDepthParams, MapOrder LhsOrder, MapOrder RhsOrder, 64 MapOrder ResultOrder, typename LhsOffset, typename RhsOffset, 65 typename OutputPipelineType> 66void SingleThreadGemm(SingleThreadGemmContext* context, 67 const KernelBase& kernel, 68 const MatrixMap<const InputScalar, LhsOrder>& lhs, 69 const MatrixMap<const InputScalar, RhsOrder>& rhs, 70 MatrixMap<OutputScalar, ResultOrder>* result, 71 const LhsOffset& lhs_offset, const RhsOffset& rhs_offset, 72 const OutputPipelineType& output_pipeline) { 73 ScopedProfilingLabel label("gemmlowp::SingleThreadGemm"); 74 75 assert(lhs.cols() == rhs.rows()); 76 77 int rows = result->rows(); 78 int cols = result->cols(); 79 int depth = lhs.cols(); 80 81 // zero sizes should have been caught earlier and early-returned. 82 assert(rows > 0); 83 assert(cols > 0); 84 assert(depth > 0); 85 86 // The case of rows<cols should have been caught earlier and transposed. 87 assert(rows >= cols); 88 89 Allocator* allocator = context->allocator(); 90 91 BlockParams block_params; 92 block_params.Init<KernelFormat>( 93 rows, cols, depth, 1, context->l1_bytes_to_use(), 94 context->l2_bytes_to_use(), context->l2_rhs_factor()); 95 96#ifdef GEMMLOWP_PROFILING_SIZES 97 // Using a static map of label strings. Not reentrant at all! 98 static std::unordered_map<std::uint64_t, std::string> labels_map; 99 std::uint64_t sizes_hash = static_cast<std::uint64_t>(rows) ^ 100 (static_cast<std::uint64_t>(depth) << 16) ^ 101 (static_cast<std::uint64_t>(cols) << 32); 102 if (!labels_map.count(sizes_hash)) { 103 char label[256]; 104 snprintf(label, sizeof(label), 105 "(rows = %d, depth = %d, cols = %d, l2_rows = %d, l2_depth = %d, " 106 "l2_cols = %d, l1_rows = %d, l1_depth = %d, l1_cols = %d)", 107 rows, depth, cols, block_params.l2_rows, block_params.l2_depth, 108 block_params.l2_cols, block_params.l1_rows, block_params.l1_depth, 109 block_params.l1_cols); 110 labels_map[sizes_hash] = label; 111 } 112 ScopedProfilingLabel size_label(labels_map[sizes_hash].c_str()); 113#endif 114 115 PackedSideBlock<typename KernelFormat::Lhs> packed_lhs(Side::Lhs, allocator, 116 block_params); 117 PackedSideBlock<typename KernelFormat::Rhs> packed_rhs(Side::Rhs, allocator, 118 block_params); 119 120 PackedResult packed_result(allocator, block_params); 121 122 allocator->Commit(); 123 124 const bool pack_rhs_once = block_params.l2_cols >= cols; 125 126 if (pack_rhs_once) { 127 PackRhs(&packed_rhs, rhs); 128 } 129 130 for (int r = 0; r < rows; r += block_params.l2_rows) { 131 int rs = std::min(block_params.l2_rows, rows - r); 132 133 PackLhs(&packed_lhs, lhs.block(r, 0, rs, depth)); 134 135 for (int c = 0; c < cols; c += block_params.l2_cols) { 136 int cs = std::min(block_params.l2_cols, cols - c); 137 138 if (!pack_rhs_once) { 139 PackRhs(&packed_rhs, rhs.block(0, c, depth, cs)); 140 } 141 142 Compute(kernel, block_params, &packed_result, packed_lhs, packed_rhs, 143 depth); 144 145 UnpackResult<KernelFormat>( 146 result, MatrixBlockBounds(r, c, rs, cs), packed_result, depth, 147 packed_lhs.sums_of_each_slice(), packed_rhs.sums_of_each_slice(), 148 lhs_offset.block(r, rs), rhs_offset.block(c, cs), output_pipeline); 149 } 150 } 151 152 allocator->Decommit(); 153} 154 155} // namespace gemmlowp 156 157#endif // GEMMLOWP_INTERNAL_SINGLE_THREAD_GEMM_H_ 158