1// Copyright 2015 Google Inc. All Rights Reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15// common.h: contains stuff that's used throughout gemmlowp 16// and should always be available. 17 18#ifndef GEMMLOWP_INTERNAL_COMMON_H_ 19#define GEMMLOWP_INTERNAL_COMMON_H_ 20 21#include <pthread.h> 22 23#include <algorithm> 24#include <cassert> 25#include <cmath> 26#include <cstdlib> 27 28#include "../profiling/instrumentation.h" 29 30// Our inline assembly path assume GCC/Clang syntax. 31// Native Client doesn't seem to support inline assembly(?). 32#if defined(__GNUC__) && !defined(__native_client__) 33#define GEMMLOWP_ALLOW_INLINE_ASM 34#endif 35 36// Define macro statement that avoids inlining for GCC. 37// For non-GCC, define as empty macro. 38#if defined(__GNUC__) 39#define GEMMLOWP_NOINLINE __attribute__((noinline)) 40#else 41#define GEMMLOWP_NOINLINE 42#endif 43 44// Detect ARM, 32-bit or 64-bit 45#ifdef __arm__ 46#define GEMMLOWP_ARM_32 47#endif 48 49#ifdef __aarch64__ 50#define GEMMLOWP_ARM_64 51#endif 52 53#if defined(GEMMLOWP_ARM_32) || defined(GEMMLOWP_ARM_64) 54#define GEMMLOWP_ARM 55#endif 56 57// Detect x86, 32-bit or 64-bit 58#if defined(__i386__) || defined(_M_IX86) || defined(_X86_) || defined(__i386) 59#define GEMMLOWP_X86_32 60#endif 61 62#if defined(__x86_64__) || defined(_M_X64) || defined(__amd64) 63#define GEMMLOWP_X86_64 64#endif 65 66#if defined(GEMMLOWP_X86_32) || defined(GEMMLOWP_X86_64) 67#define GEMMLOWP_X86 68#endif 69 70// Some of our optimized paths use inline assembly and for 71// now we don't bother enabling some other optimized paths using intrinddics 72// where we can't use inline assembly paths. 73#ifdef GEMMLOWP_ALLOW_INLINE_ASM 74 75// Detect NEON. It's important to check for both tokens. 76#if (defined __ARM_NEON) || (defined __ARM_NEON__) 77#define GEMMLOWP_NEON 78#endif 79 80// Convenience NEON tokens for 32-bit or 64-bit 81#if defined(GEMMLOWP_NEON) && defined(GEMMLOWP_ARM_32) 82#define GEMMLOWP_NEON_32 83#endif 84 85#if defined(GEMMLOWP_NEON) && defined(GEMMLOWP_ARM_64) 86#define GEMMLOWP_NEON_64 87#endif 88 89// Detect SSE4. 90#if defined __SSE4_1__ 91#define GEMMLOWP_SSE4 92#endif 93 94// Convenience SSE4 tokens for 32-bit or 64-bit 95#if defined(GEMMLOWP_SSE4) && defined(GEMMLOWP_X86_32) 96#define GEMMLOWP_SSE4_32 97#endif 98 99#if defined(GEMMLOWP_SSE4) && defined(GEMMLOWP_X86_64) 100#define GEMMLOWP_SSE4_64 101#endif 102 103#endif // GEMMLOWP_ALLOW_INLINE_ASM 104 105// Detect Android. Don't conflate with ARM - we care about tuning 106// for non-ARM Android devices too. This can be used in conjunction 107// with x86 to tune differently for mobile x86 CPUs (Atom) vs. desktop x86 CPUs. 108#if defined(__ANDROID__) || defined(ANDROID) 109#define GEMMLOWP_ANDROID 110#endif 111 112namespace gemmlowp { 113 114// Standard cache line size. Useful to optimize alignment and 115// prefetches. Ideally we would query this at runtime, however 116// 64 byte cache lines are the vast majority, and even if it's 117// wrong on some device, it will be wrong by no more than a 2x factor, 118// which should be acceptable. 119const int kDefaultCacheLineSize = 64; 120 121// Default L1 and L2 data cache sizes. 122// The L1 cache size is assumed to be for each core. 123// The L2 cache size is assumed to be shared among all cores. What 124// we call 'L2' here is effectively top-level cache. 125// 126// On x86, we should ideally query this at 127// runtime. On ARM, the instruction to query this is privileged and 128// Android kernels do not expose it to userspace. Fortunately, the majority 129// of ARM devices have roughly comparable values: 130// Nexus 5: L1 16k, L2 1M 131// Android One: L1 32k, L2 512k 132// The following values are equal to or somewhat lower than that, and were 133// found to perform well on both the Nexus 5 and Android One. 134// Of course, these values are in principle too low for typical x86 CPUs 135// where we should set the L2 value to (L3 cache size / number of cores) at 136// least. 137#if defined(GEMMLOWP_ARM) || defined(GEMMLOWP_ANDROID) 138// ARM or ARM-like hardware (Android implies ARM-like) so here it's OK 139// to tune for ARM, although on x86 Atom we might be able to query 140// cache sizes at runtime, which would be better. 141const int kDefaultL1CacheSize = 16 * 1024; 142const int kDefaultL2CacheSize = 384 * 1024; 143#elif defined(GEMMLOWP_X86_64) 144// x86-64 and not Android. Therefore, likely desktop-class x86 hardware. 145// Thus we assume larger cache sizes, though we really should query 146// them at runtime. 147const int kDefaultL1CacheSize = 32 * 1024; 148const int kDefaultL2CacheSize = 4 * 1024 * 1024; 149#elif defined(GEMMLOWP_X86_32) 150// x86-32 and not Android. Same as x86-64 but less bullish. 151const int kDefaultL1CacheSize = 32 * 1024; 152const int kDefaultL2CacheSize = 2 * 1024 * 1024; 153#else 154// Less common hardware. Maybe some unusual or older or embedded thing. 155// Assume smaller caches, but don't depart too far from what we do 156// on ARM/Android to avoid accidentally exposing unexpected behavior. 157const int kDefaultL1CacheSize = 16 * 1024; 158const int kDefaultL2CacheSize = 256 * 1024; 159#endif 160 161// The proportion of the cache that we intend to use for storing 162// RHS blocks. This should be between 0 and 1, and typically closer to 1, 163// as we typically want to use most of the L2 cache for storing a large 164// RHS block. 165#if defined(GEMMLOWP_X86) 166// For IA, use the entire L2 cache for the RHS matrix. LHS matrix is not blocked 167// for L2 cache. 168const float kDefaultL2RhsFactor = 1.00f; 169#else 170const float kDefaultL2RhsFactor = 0.75f; 171#endif 172 173// The number of bytes in a SIMD register. This is used to determine 174// the dimensions of PackingRegisterBlock so that such blocks can 175// be efficiently loaded into registers, so that packing code can 176// work within registers as much as possible. 177// In the non-SIMD generic fallback code, this is just a generic array 178// size, so any size would work there. Different platforms may set this 179// to different values but must ensure that their own optimized packing paths 180// are consistent with this value. 181const int kRegisterSize = 16; 182 183// Requantization to less-than-8-bit is costly, so it only worth 184// doing if the GEMM width is large enough 185const int kMinimumWidthForRequantization = 100; 186 187// Hints the CPU to prefetch the cache line containing ptr. 188inline void Prefetch(const void* ptr) { 189#ifdef __GNUC__ // Clang and GCC define __GNUC__ and have __builtin_prefetch. 190 __builtin_prefetch(ptr); 191#else 192 (void)ptr; 193#endif 194} 195 196// Returns the runtime argument rounded down to the nearest multiple of 197// the fixed Modulus. 198template <unsigned Modulus, typename Integer> 199Integer RoundDown(Integer i) { 200 return i - (i % Modulus); 201} 202 203// Returns the runtime argument rounded up to the nearest multiple of 204// the fixed Modulus. 205template <unsigned Modulus, typename Integer> 206Integer RoundUp(Integer i) { 207 return RoundDown<Modulus>(i + Modulus - 1); 208} 209 210// Returns the quotient a / b rounded up ('ceil') to the nearest integer. 211template <typename Integer> 212Integer CeilQuotient(Integer a, Integer b) { 213 return (a + b - 1) / b; 214} 215 216// Returns the argument rounded up to the nearest power of two. 217template <typename Integer> 218Integer RoundUpToPowerOfTwo(Integer n) { 219 Integer i = n - 1; 220 i |= i >> 1; 221 i |= i >> 2; 222 i |= i >> 4; 223 i |= i >> 8; 224 i |= i >> 16; 225 return i + 1; 226} 227 228template <int N> 229struct IsPowerOfTwo { 230 static const bool value = !(N & (N - 1)); 231}; 232 233} // namespace gemmlowp 234 235#endif // GEMMLOWP_INTERNAL_COMMON_H_ 236