1// Copyright 2015 Google Inc. All Rights Reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15// common.h: contains stuff that's used throughout gemmlowp
16// and should always be available.
17
18#ifndef GEMMLOWP_INTERNAL_COMMON_H_
19#define GEMMLOWP_INTERNAL_COMMON_H_
20
21#include <pthread.h>
22
23#include <algorithm>
24#include <cassert>
25#include <cmath>
26#include <cstdlib>
27
28#include "../profiling/instrumentation.h"
29
// Our inline assembly paths assume GCC/Clang syntax, so they are only
// allowed when the compiler defines __GNUC__.
// Native Client is excluded because it doesn't seem to support
// inline assembly(?).
#ifdef __GNUC__
#ifndef __native_client__
#define GEMMLOWP_ALLOW_INLINE_ASM
#endif
#endif
35
// GEMMLOWP_NOINLINE: prefix for declarations that must not be inlined.
// Only compilers defining __GNUC__ get the noinline attribute; everywhere
// else the macro expands to nothing.
#ifndef __GNUC__
#define GEMMLOWP_NOINLINE
#else
#define GEMMLOWP_NOINLINE __attribute__((noinline))
#endif
43
// ARM detection.
// GEMMLOWP_ARM_32 follows __arm__, GEMMLOWP_ARM_64 follows __aarch64__.
#if defined(__arm__)
#define GEMMLOWP_ARM_32
#endif

#if defined(__aarch64__)
#define GEMMLOWP_ARM_64
#endif

// GEMMLOWP_ARM is the width-agnostic token: set whenever either of the
// two tokens above is set.
#if defined(GEMMLOWP_ARM_32) || defined(GEMMLOWP_ARM_64)
#define GEMMLOWP_ARM
#endif
56
// x86 detection.
// GEMMLOWP_X86_32 is set when any of the 32-bit x86 predefined macros
// below is present.
#if defined(__i386__) || \
    defined(_M_IX86) ||  \
    defined(_X86_) ||    \
    defined(__i386)
#define GEMMLOWP_X86_32
#endif

// GEMMLOWP_X86_64 is set when any of the 64-bit x86 predefined macros
// below is present.
#if defined(__x86_64__) || \
    defined(_M_X64) ||     \
    defined(__amd64)
#define GEMMLOWP_X86_64
#endif

// GEMMLOWP_X86 is the width-agnostic token.
#if defined(GEMMLOWP_X86_32) || defined(GEMMLOWP_X86_64)
#define GEMMLOWP_X86
#endif
69
// Some of our optimized paths use inline assembly, and for now we don't
// bother enabling the other optimized paths (the ones using intrinsics)
// on toolchains where the inline assembly paths can't be used. Hence all
// SIMD detection below is gated on GEMMLOWP_ALLOW_INLINE_ASM.
#if defined(GEMMLOWP_ALLOW_INLINE_ASM)

// NEON detection. It's important to check for both spellings of the token.
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#define GEMMLOWP_NEON
#endif

// Convenience tokens: NEON on 32-bit ARM / NEON on 64-bit ARM.
#if defined(GEMMLOWP_NEON) && defined(GEMMLOWP_ARM_32)
#define GEMMLOWP_NEON_32
#endif

#if defined(GEMMLOWP_NEON) && defined(GEMMLOWP_ARM_64)
#define GEMMLOWP_NEON_64
#endif

// SSE4 detection, keyed on __SSE4_1__.
#ifdef __SSE4_1__
#define GEMMLOWP_SSE4
#endif

// Convenience tokens: SSE4 on 32-bit x86 / SSE4 on 64-bit x86.
#if defined(GEMMLOWP_SSE4) && defined(GEMMLOWP_X86_32)
#define GEMMLOWP_SSE4_32
#endif

#if defined(GEMMLOWP_SSE4) && defined(GEMMLOWP_X86_64)
#define GEMMLOWP_SSE4_64
#endif

#endif  // GEMMLOWP_ALLOW_INLINE_ASM
104
// Android detection, deliberately kept separate from ARM detection: we care
// about tuning for non-ARM Android devices too. Combined with the x86
// tokens, this lets us tune differently for mobile x86 CPUs (Atom) vs.
// desktop x86 CPUs.
#if defined(ANDROID) || defined(__ANDROID__)
#define GEMMLOWP_ANDROID
#endif
111
112namespace gemmlowp {
113
// Standard cache line size, in bytes. Useful to optimize alignment and
// prefetches. Ideally we would query this at runtime; however,
// 64-byte cache lines are the vast majority, and even if this is
// wrong on some device, it will be wrong by no more than a 2x factor,
// which should be acceptable.
const int kDefaultCacheLineSize = 64;
120
// Default L1 and L2 data cache sizes, in bytes.
// The L1 cache size is assumed to be for each core.
// The L2 cache size is assumed to be shared among all cores. What
// we call 'L2' here is effectively the top-level cache.
//
// On x86, we should ideally query this at
// runtime. On ARM, the instruction to query this is privileged and
// Android kernels do not expose it to userspace. Fortunately, the majority
// of ARM devices have roughly comparable values:
//   Nexus 5: L1 16k, L2 1M
//   Android One: L1 32k, L2 512k
// The ARM/Android values below are equal to or somewhat lower than that,
// and were found to perform well on both the Nexus 5 and Android One.
// Of course, these values are in principle too low for typical x86 CPUs
// where we should set the L2 value to (L3 cache size / number of cores) at
// least.
//
// The order of the branches below matters: the ARM/Android test comes
// first, so an Android x86 build takes the ARM-like values and never
// reaches the x86 branches.
#if defined(GEMMLOWP_ARM) || defined(GEMMLOWP_ANDROID)
// ARM or ARM-like hardware (Android implies ARM-like) so here it's OK
// to tune for ARM, although on x86 Atom we might be able to query
// cache sizes at runtime, which would be better.
const int kDefaultL1CacheSize = 16 * 1024;
const int kDefaultL2CacheSize = 384 * 1024;
#elif defined(GEMMLOWP_X86_64)
// x86-64 and not Android. Therefore, likely desktop-class x86 hardware.
// Thus we assume larger cache sizes, though we really should query
// them at runtime.
const int kDefaultL1CacheSize = 32 * 1024;
const int kDefaultL2CacheSize = 4 * 1024 * 1024;
#elif defined(GEMMLOWP_X86_32)
// x86-32 and not Android. Same as x86-64 but less bullish: half the
// assumed L2 size.
const int kDefaultL1CacheSize = 32 * 1024;
const int kDefaultL2CacheSize = 2 * 1024 * 1024;
#else
// Less common hardware. Maybe some unusual or older or embedded thing.
// Assume smaller caches, but don't depart too far from what we do
// on ARM/Android to avoid accidentally exposing unexpected behavior.
const int kDefaultL1CacheSize = 16 * 1024;
const int kDefaultL2CacheSize = 256 * 1024;
#endif
160
// The proportion of the L2 cache that we intend to use for storing
// RHS blocks. This should be between 0 and 1, and typically closer to 1,
// as we typically want to use most of the L2 cache for storing a large
// RHS block.
#if defined(GEMMLOWP_X86)
// For IA (x86), use the entire L2 cache for the RHS matrix. The LHS matrix
// is not blocked for L2 cache.
const float kDefaultL2RhsFactor = 1.00f;
#else
// Everywhere else, reserve three quarters of L2 for the RHS block.
const float kDefaultL2RhsFactor = 0.75f;
#endif
172
// The number of bytes in a SIMD register. This is used to determine
// the dimensions of PackingRegisterBlock so that such blocks can
// be efficiently loaded into registers, so that packing code can
// work within registers as much as possible.
// In the non-SIMD generic fallback code, this is just a generic array
// size, so any size would work there. Different platforms may set this
// to different values but must ensure that their own optimized packing paths
// are consistent with this value.
const int kRegisterSize = 16;
182
// Requantization to less-than-8-bit is costly, so it is only worth
// doing if the GEMM width is large enough. This constant is the
// threshold for that width.
const int kMinimumWidthForRequantization = 100;
186
// Issues a prefetch hint for the cache line containing ptr.
// On compilers that do not define __GNUC__ this is a no-op.
inline void Prefetch(const void* ptr) {
#if !defined(__GNUC__)
  // No prefetch builtin assumed here; just mark the argument as used.
  (void)ptr;
#else
  // Clang and GCC both define __GNUC__ and provide __builtin_prefetch.
  __builtin_prefetch(ptr);
#endif
}
195
// Rounds the runtime argument i down to a multiple of the compile-time
// Modulus by subtracting the remainder i % Modulus from it.
template <unsigned Modulus, typename Integer>
Integer RoundDown(Integer i) {
  Integer rounded = i;
  // Compound assignment keeps the same mixed Integer/unsigned arithmetic
  // as writing `i - (i % Modulus)` directly.
  rounded -= i % Modulus;
  return rounded;
}
202
// Returns the runtime argument rounded up to the nearest multiple of
// the fixed Modulus.
//
// Implemented as RoundDown applied to i + Modulus - 1.
// Note that Modulus is unsigned, so the expression `i + Modulus - 1`
// follows the usual arithmetic conversions and its type (which is what
// RoundDown gets instantiated with) may differ from Integer; the result
// is converted back to Integer on return.
// NOTE(review): presumably only meant for non-negative i small enough that
// i + Modulus - 1 does not overflow -- confirm against callers.
template <unsigned Modulus, typename Integer>
Integer RoundUp(Integer i) {
  return RoundDown<Modulus>(i + Modulus - 1);
}
209
// Returns the quotient a / b rounded up ('ceil') to the nearest integer.
//
// Computed as (a + b - 1) / b using integer division, which matches
// ceil(a / b) for a >= 0 and b > 0. b must be nonzero.
// NOTE(review): a + b - 1 can overflow for values near the top of
// Integer's range, and negative operands do not appear to be
// handled -- confirm that callers only pass small non-negative sizes.
template <typename Integer>
Integer CeilQuotient(Integer a, Integer b) {
  return (a + b - 1) / b;
}
215
// Returns the argument rounded up to the nearest power of two.
//
// Works by taking n - 1, copying its highest set bit into every lower bit
// position (doubling the shift distance each step), and adding 1.
// A value that is already a power of two is returned unchanged, and
// n == 0 yields 0.
//
// The shift distance runs up to the full bit width of Integer, so 64-bit
// types are handled correctly (a fixed sequence stopping at >> 16 only
// covers 32 bits). For types of 32 bits or fewer the result is the same as
// with the fixed 1/2/4/8/16 sequence.
//
// NOTE(review): if n exceeds the largest power of two representable in
// Integer, the final + 1 overflows -- callers are assumed not to do that.
template <typename Integer>
Integer RoundUpToPowerOfTwo(Integer n) {
  // Bit width of Integer, assuming 8-bit bytes.
  const unsigned kBitWidth = static_cast<unsigned>(sizeof(Integer) * 8);
  Integer i = n - 1;
  for (unsigned shift = 1; shift < kBitWidth; shift *= 2) {
    i |= i >> shift;
  }
  return i + 1;
}
227
// Compile-time test for whether N is a power of two.
// `value` is true when N and N - 1 share no set bits, i.e. when N has at
// most one bit set. Note that this makes `value` true for N == 0 as well.
template <int N>
struct IsPowerOfTwo {
  static const bool value = (N & (N - 1)) == 0;
};
232
233}  // namespace gemmlowp
234
235#endif  // GEMMLOWP_INTERNAL_COMMON_H_
236