// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// common.h: contains stuff that's used throughout gemmlowp
// and should always be available.

#ifndef GEMMLOWP_INTERNAL_COMMON_H_
#define GEMMLOWP_INTERNAL_COMMON_H_

#include "../internal/platform.h"
#include "../profiling/pthread_everywhere.h"

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdlib>

#include "../profiling/instrumentation.h"

// Our inline assembly paths assume GCC/Clang syntax.
// Native Client doesn't seem to support inline assembly(?).
#if defined(__GNUC__) && !defined(__native_client__)
#define GEMMLOWP_ALLOW_INLINE_ASM
#endif

// Define macro statement that avoids inlining for GCC.
// For non-GCC, define as empty macro.
#if defined(__GNUC__)
#define GEMMLOWP_NOINLINE __attribute__((noinline))
#else
#define GEMMLOWP_NOINLINE
#endif

// Detect ARM, 32-bit or 64-bit
#ifdef __arm__
#define GEMMLOWP_ARM_32
#endif

#ifdef __aarch64__
#define GEMMLOWP_ARM_64
#endif

#if defined(GEMMLOWP_ARM_32) || defined(GEMMLOWP_ARM_64)
#define GEMMLOWP_ARM
#endif

// Detect MIPS, 32-bit or 64-bit
#if defined(__mips) && !defined(__LP64__)
#define GEMMLOWP_MIPS_32
#endif

#if defined(__mips) && defined(__LP64__)
#define GEMMLOWP_MIPS_64
#endif

#if defined(GEMMLOWP_MIPS_32) || defined(GEMMLOWP_MIPS_64)
#define GEMMLOWP_MIPS
#endif

// Detect x86, 32-bit or 64-bit
#if defined(__i386__) || defined(_M_IX86) || defined(_X86_) || defined(__i386)
#define GEMMLOWP_X86_32
#endif

#if defined(__x86_64__) || defined(_M_X64) || defined(__amd64)
#define GEMMLOWP_X86_64
#endif

#if defined(GEMMLOWP_X86_32) || defined(GEMMLOWP_X86_64)
#define GEMMLOWP_X86
#endif

// Some of our optimized paths use inline assembly, and for
// now we don't bother enabling some other optimized paths using intrinsics
// where we can't use inline assembly paths.
#ifdef GEMMLOWP_ALLOW_INLINE_ASM

// Detect NEON. It's important to check for both tokens:
// some toolchains define only one of __ARM_NEON / __ARM_NEON__.
#if (defined __ARM_NEON) || (defined __ARM_NEON__)
#define GEMMLOWP_NEON
#endif

// Convenience NEON tokens for 32-bit or 64-bit
#if defined(GEMMLOWP_NEON) && defined(GEMMLOWP_ARM_32)
#define GEMMLOWP_NEON_32
#endif

#if defined(GEMMLOWP_NEON) && defined(GEMMLOWP_ARM_64)
#define GEMMLOWP_NEON_64
#endif

// Detect MIPS MSA.
// Limit MSA optimizations to little-endian CPUs for now.
// TODO: Perhaps, eventually support MSA optimizations on big-endian CPUs?
#if defined(GEMMLOWP_MIPS) && (__mips_isa_rev >= 5) && defined(__mips_msa) && \
    defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
#define GEMMLOWP_MSA
#endif

// Convenience MIPS MSA tokens for 32-bit or 64-bit.
#if defined(GEMMLOWP_MSA) && defined(GEMMLOWP_MIPS_32)
#define GEMMLOWP_MSA_32
#endif

#if defined(GEMMLOWP_MSA) && defined(GEMMLOWP_MIPS_64)
#define GEMMLOWP_MSA_64
#endif

// Detect SSE.
#ifdef __SSE4_1__
#define GEMMLOWP_SSE4
#endif

#ifdef __SSE3__
#define GEMMLOWP_SSE3
#endif

// Convenience SSE4 tokens for 32-bit or 64-bit
#if defined(GEMMLOWP_SSE4) && defined(GEMMLOWP_X86_32) && \
    !defined(GEMMLOWP_DISABLE_SSE4)
#define GEMMLOWP_SSE4_32
#endif

#if defined(GEMMLOWP_SSE3) && defined(GEMMLOWP_X86_32)
#define GEMMLOWP_SSE3_32
#endif

#if defined(GEMMLOWP_SSE4) && defined(GEMMLOWP_X86_64) && \
    !defined(GEMMLOWP_DISABLE_SSE4)
#define GEMMLOWP_SSE4_64
#endif

#if defined(GEMMLOWP_SSE3) && defined(GEMMLOWP_X86_64)
#define GEMMLOWP_SSE3_64
#endif

// Under MemorySanitizer / AddressSanitizer, map
// GEMMLOWP_MARK_MEMORY_AS_INITIALIZED to the matching sanitizer hook
// (__msan_unpoison / __asan_unpoison_memory_region) so that code can mark
// memory regions as initialized/addressable for the tool. When neither
// sanitizer is active the macro is left undefined.
#if defined(__has_feature)
#if __has_feature(memory_sanitizer)
#include <sanitizer/msan_interface.h>
#define GEMMLOWP_MARK_MEMORY_AS_INITIALIZED __msan_unpoison
#elif __has_feature(address_sanitizer)
#include <sanitizer/asan_interface.h>
#define GEMMLOWP_MARK_MEMORY_AS_INITIALIZED __asan_unpoison_memory_region
#endif
#endif

#endif  // GEMMLOWP_ALLOW_INLINE_ASM

// Detect Android. Don't conflate with ARM - we care about tuning
// for non-ARM Android devices too. This can be used in conjunction
// with x86 to tune differently for mobile x86 CPUs (Atom) vs. desktop x86 CPUs.
#if defined(__ANDROID__) || defined(ANDROID)
#define GEMMLOWP_ANDROID
#endif

namespace gemmlowp {

// Standard cache line size. Useful to optimize alignment and
// prefetches. Ideally we would query this at runtime, however
// 64 byte cache lines are the vast majority, and even if it's
// wrong on some device, it will be wrong by no more than a 2x factor,
// which should be acceptable.
const int kDefaultCacheLineSize = 64;

// Default L1 and L2 data cache sizes.
// The L1 cache size is assumed to be for each core.
// The L2 cache size is assumed to be shared among all cores. What
// we call 'L2' here is effectively top-level cache.
//
// On x86, we should ideally query this at
// runtime. On ARM, the instruction to query this is privileged and
// Android kernels do not expose it to userspace. Fortunately, the majority
// of ARM devices have roughly comparable values:
//   Nexus 5: L1 16k, L2 1M
//   Android One: L1 32k, L2 512k
// The following values are equal to or somewhat lower than that, and were
// found to perform well on both the Nexus 5 and Android One.
// Of course, these values are in principle too low for typical x86 CPUs
// where we should set the L2 value to (L3 cache size / number of cores) at
// least.
//
#if defined(GEMMLOWP_ARM) && defined(__APPLE__)
// iPhone/iPad
const int kDefaultL1CacheSize = 48 * 1024;
const int kDefaultL2CacheSize = 2 * 1024 * 1024;
#elif defined(GEMMLOWP_ARM) || defined(GEMMLOWP_ANDROID)
// Other ARM or ARM-like hardware (Android implies ARM-like) so here it's OK
// to tune for ARM, although on x86 Atom we might be able to query
// cache sizes at runtime, which would be better.
const int kDefaultL1CacheSize = 16 * 1024;
const int kDefaultL2CacheSize = 384 * 1024;
#elif defined(GEMMLOWP_X86_64)
// x86-64 and not Android. Therefore, likely desktop-class x86 hardware.
// Thus we assume larger cache sizes, though we really should query
// them at runtime.
const int kDefaultL1CacheSize = 32 * 1024;
const int kDefaultL2CacheSize = 4 * 1024 * 1024;
#elif defined(GEMMLOWP_X86_32)
// x86-32 and not Android. Same as x86-64 but less bullish.
const int kDefaultL1CacheSize = 32 * 1024;
const int kDefaultL2CacheSize = 2 * 1024 * 1024;
#elif defined(GEMMLOWP_MIPS)
// MIPS and not Android. TODO: MIPS and Android?
const int kDefaultL1CacheSize = 32 * 1024;
const int kDefaultL2CacheSize = 1024 * 1024;
#else
// Less common hardware. Maybe some unusual or older or embedded thing.
// Assume smaller caches, but don't depart too far from what we do
// on ARM/Android to avoid accidentally exposing unexpected behavior.
const int kDefaultL1CacheSize = 16 * 1024;
const int kDefaultL2CacheSize = 256 * 1024;
#endif

// The proportion of the cache that we intend to use for storing
// RHS blocks. This should be between 0 and 1, and typically closer to 1,
// as we typically want to use most of the L2 cache for storing a large
// RHS block.
#if defined(GEMMLOWP_X86)
// For IA, use the entire L2 cache for the RHS matrix. LHS matrix is not blocked
// for L2 cache.
const float kDefaultL2RhsFactor = 1.00f;
#else
const float kDefaultL2RhsFactor = 0.75f;
#endif

// The number of bytes in a SIMD register. This is used to determine
// the dimensions of PackingRegisterBlock so that such blocks can
// be efficiently loaded into registers, so that packing code can
// work within registers as much as possible.
// In the non-SIMD generic fallback code, this is just a generic array
// size, so any size would work there. Different platforms may set this
// to different values but must ensure that their own optimized packing paths
// are consistent with this value.
const int kRegisterSize = 16;

// Hints the CPU to prefetch the cache line containing ptr.
inline void Prefetch(const void* ptr) {
#if defined GEMMLOWP_ARM_64 && defined GEMMLOWP_ALLOW_INLINE_ASM
  // AArch64 offers a whole family of prefetch instructions, so compilers
  // can't pick one for __builtin_prefetch and end up emitting nothing.
  // Emit the variant we want directly: "pldl1keep" means "prefetch for
  // load, into L1 cache, expecting the value to be used multiple times".
  asm volatile("prfm pldl1keep, [%[ptr]]\n" ::[ptr] "r"(ptr) :);
#elif defined __GNUC__
  // Both Clang and GCC define __GNUC__ and provide __builtin_prefetch.
  __builtin_prefetch(ptr);
#else
  // No portable prefetch available; just silence unused-parameter warnings.
  (void)ptr;
#endif
}

// Rounds the runtime argument i down to the nearest multiple of the
// compile-time constant Modulus.
template <unsigned Modulus, typename Integer>
Integer RoundDown(Integer i) {
  // Equivalent to i - (i % Modulus): truncating division drops the
  // remainder, and multiplying back restores the largest multiple <= i.
  return (i / Modulus) * Modulus;
}

// Rounds the runtime argument i up to the nearest multiple of the
// compile-time constant Modulus.
template <unsigned Modulus, typename Integer>
Integer RoundUp(Integer i) {
  // Bump i just past the previous multiple, then round down.
  const Integer bumped = i + Modulus - 1;
  return RoundDown<Modulus>(bumped);
}

// Returns the quotient a / b rounded up ('ceil') to the nearest integer.
template <typename Integer>
Integer CeilQuotient(Integer a, Integer b) {
  const Integer numerator = a + b - 1;
  return numerator / b;
}

// Returns the argument rounded up to the nearest power of two.
285template <typename Integer> 286Integer RoundUpToPowerOfTwo(Integer n) { 287 Integer i = n - 1; 288 i |= i >> 1; 289 i |= i >> 2; 290 i |= i >> 4; 291 i |= i >> 8; 292 i |= i >> 16; 293 return i + 1; 294} 295 296template <int N> 297struct IsPowerOfTwo { 298 static const bool value = !(N & (N - 1)); 299}; 300 301template <typename T> 302void MarkMemoryAsInitialized(T* ptr, int size) { 303#ifdef GEMMLOWP_MARK_MEMORY_AS_INITIALIZED 304 GEMMLOWP_MARK_MEMORY_AS_INITIALIZED(static_cast<void*>(ptr), 305 size * sizeof(T)); 306#else 307 (void)ptr; 308 (void)size; 309#endif 310} 311 312} // namespace gemmlowp 313 314#endif // GEMMLOWP_INTERNAL_COMMON_H_ 315