1// Copyright 2015 Google Inc. All Rights Reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15// block_params.h: Logic to choose L1 and L2 block sizes 16// to optimize cache-friendliness. 17 18#ifndef GEMMLOWP_INTERNAL_BLOCK_PARAMS_H_ 19#define GEMMLOWP_INTERNAL_BLOCK_PARAMS_H_ 20 21#include "common.h" 22 23namespace gemmlowp { 24 25// A BlockParams instance contains a full description of all the block size 26// parameters to be used by a Gemm. 27// There are two nested levels of block subdivisions: first a subdivision 28// into large blocks that should fit in last-level cache (what we call L2 here) 29// and then another subdivision into smaller blocks that should fit in 30// L1 cache. There is then actually a third level of subdivision to fit 31// in registers, but we are not concerned with that here. 32struct BlockParams { 33 // L1 block parameters determine the size of small blocks that should 34 // fit in L1 cache. 35 int l1_rows; 36 int l1_cols; 37 int l1_depth; 38 39 // L2 block parameters determine the size of larger blocks that should 40 // fit in L2 cache. 41 int l2_rows; 42 int l2_cols; 43 int l2_depth; 44 45 template <typename KernelFormat> 46 void Init(int rows, int cols, int depth, int num_threads) { 47 FindL2BlockSizes<KernelFormat>(rows, cols, depth, num_threads, &l2_rows, 48 &l2_cols, &l2_depth); 49 FindL1BlockSizes<KernelFormat>(l2_rows, l2_cols, l2_depth, &l1_rows, 50 &l1_cols, &l1_depth); 51 } 52 53 template <typename KernelFormat> 54 static void FindL2BlockSizes(int rows, int cols, int depth, int num_threads, 55 int* out_l2_rows, int* out_l2_cols, 56 int* out_l2_depth) { 57 int l2_rows = 0; 58 int l2_cols = 0; 59 int l2_depth = 0; 60 // No L2 blocking in the depth dimension at the moment. 61 // Too much loss of accuracy due to storing intermediate results in 62 // low precision. 63 // However, we still want to round l2_depth up to the next multiple 64 // of register size, so as to avoid having to special-case unaligned depths. 65 l2_depth = RoundUp<kRegisterSize>(depth); 66 67 const int l2_bytes_to_use = kDefaultL2CacheSize; 68 const float l2_rhs_factor = kDefaultL2RhsFactor; 69 70 { 71 int max_cache_friendly_l2_cols = std::max( 72 1, static_cast<int>(l2_rhs_factor * (l2_bytes_to_use / l2_depth))); 73 int min_l2_cols_blocks = 74 std::max(1, CeilQuotient(cols, max_cache_friendly_l2_cols)); 75 l2_cols = 76 RoundUp<KernelFormat::kCols>(CeilQuotient(cols, min_l2_cols_blocks)); 77 } 78 79 // No L2 blocking in the row dimension if l2_rhs_factor is 1.0 as the row 80 // dimension concerns only the LHS. Blocking only RHS matrix for L2 enhances 81 // the performance on x86. 82 if (l2_rhs_factor == 1.0f) { 83 l2_rows = RoundUp<KernelFormat::kRows>(rows); 84 } else { 85 int max_cache_friendly_l2_rows = 86 std::max(1, (l2_bytes_to_use - l2_depth * l2_cols) / 87 (num_threads * (l2_depth + 4 * l2_cols))); 88 int min_l2_rows_blocks = 89 std::max(1, CeilQuotient(rows, max_cache_friendly_l2_rows)); 90 l2_rows = 91 RoundUp<KernelFormat::kRows>(CeilQuotient(rows, min_l2_rows_blocks)); 92 } 93 94 *out_l2_rows = l2_rows; 95 *out_l2_cols = l2_cols; 96 *out_l2_depth = l2_depth; 97 } 98 99 template <typename KernelFormat> 100 static void FindL1BlockSizes(int rows, int cols, int depth, int* out_l1_rows, 101 int* out_l1_cols, int* out_l1_depth) { 102 int l1_rows = 0; 103 int l1_cols = 0; 104 int l1_depth = 0; 105 106 // L2 block sizes should already be multiples of kernel block sizes. 107 assert(rows % KernelFormat::kRows == 0); 108 assert(cols % KernelFormat::kCols == 0); 109 assert(depth % KernelFormat::kDepth == 0); 110 111 // No L1 blocking in the columns dimension at the moment. 112 // Thought not to be needed. Similar to Eigen. 113 l1_cols = cols; 114 115 const int l1_bytes_to_use = kDefaultL1CacheSize; 116 117 { 118 int max_cache_friendly_l1_depth = std::max( 119 1, (l1_bytes_to_use - 4 * KernelFormat::kRows * KernelFormat::kCols) / 120 (KernelFormat::kRows + KernelFormat::kCols)); 121 int min_l1_depth_blocks = 122 std::max(1, CeilQuotient(depth, max_cache_friendly_l1_depth)); 123 l1_depth = 124 RoundUp<kRegisterSize>(CeilQuotient(depth, min_l1_depth_blocks)); 125 } 126 127 { 128 int max_cache_friendly_l1_rows = 129 std::max(1, l1_bytes_to_use / (l1_depth + 4 * l1_cols)); 130 int min_l1_rows_blocks = 131 std::max(1, CeilQuotient(rows, max_cache_friendly_l1_rows)); 132 l1_rows = 133 RoundUp<KernelFormat::kRows>(CeilQuotient(rows, min_l1_rows_blocks)); 134 } 135 136 *out_l1_rows = l1_rows; 137 *out_l1_cols = l1_cols; 138 *out_l1_depth = l1_depth; 139 } 140}; 141 142// A SideBlockParams instance contains only the block params relevant to 143// one side (LHS or RHS), expressed in terms of 'width' instead of 144// rows/colums. See the explanation in kernel.h: in the LHS, 'width' means 145// the number of rows, while in the RHS, 'width' means the number of columns. 146// That allows us to write generic code that applies to either LHS or RHS. 147struct SideBlockParams { 148 // L1 block parameters determine the size of small blocks that should 149 // fit in L1 cache. 150 int l1_width; 151 int l1_depth; 152 153 // L2 block parameters determine the size of larger blocks that should 154 // fit in L2 cache. 155 int l2_width; 156 int l2_depth; 157}; 158 159enum class Side { Lhs, Rhs }; 160 161inline void GetSideBlockParams(Side side, SideBlockParams* side_block_params, 162 const BlockParams& block_params) { 163 side_block_params->l1_width = 164 side == Side::Lhs ? block_params.l1_rows : block_params.l1_cols; 165 side_block_params->l2_width = 166 side == Side::Lhs ? block_params.l2_rows : block_params.l2_cols; 167 168 side_block_params->l1_depth = block_params.l1_depth; 169 side_block_params->l2_depth = block_params.l2_depth; 170} 171 172} // namespace gemmlowp 173 174#endif // GEMMLOWP_INTERNAL_BLOCK_PARAMS_H_ 175