// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// common.h: contains stuff that's used throughout gemmlowp
// and should always be available.

#ifndef GEMMLOWP_INTERNAL_COMMON_H_
#define GEMMLOWP_INTERNAL_COMMON_H_

#include "../internal/platform.h"
#include "../profiling/pthread_everywhere.h"

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdlib>

#include "../profiling/instrumentation.h"

// Our inline assembly paths assume GCC/Clang syntax.
// Native Client doesn't seem to support inline assembly(?).
#if defined(__GNUC__) && !defined(__native_client__)
#define GEMMLOWP_ALLOW_INLINE_ASM
#endif

// Define a macro that avoids inlining for GCC.
// For non-GCC compilers, define it as an empty macro.
#if defined(__GNUC__)
#define GEMMLOWP_NOINLINE __attribute__((noinline))
#else
#define GEMMLOWP_NOINLINE
#endif

// Detect ARM, 32-bit or 64-bit
#ifdef __arm__
#define GEMMLOWP_ARM_32
#endif

#ifdef __aarch64__
#define GEMMLOWP_ARM_64
#endif

#if defined(GEMMLOWP_ARM_32) || defined(GEMMLOWP_ARM_64)
#define GEMMLOWP_ARM
#endif

// Detect MIPS, 32-bit or 64-bit
#if defined(__mips) && !defined(__LP64__)
#define GEMMLOWP_MIPS_32
#endif

#if defined(__mips) && defined(__LP64__)
#define GEMMLOWP_MIPS_64
#endif

#if defined(GEMMLOWP_MIPS_32) || defined(GEMMLOWP_MIPS_64)
#define GEMMLOWP_MIPS
#endif

// Detect x86, 32-bit or 64-bit
#if defined(__i386__) || defined(_M_IX86) || defined(_X86_) || defined(__i386)
#define GEMMLOWP_X86_32
#endif

#if defined(__x86_64__) || defined(_M_X64) || defined(__amd64)
#define GEMMLOWP_X86_64
#endif

#if defined(GEMMLOWP_X86_32) || defined(GEMMLOWP_X86_64)
#define GEMMLOWP_X86
#endif


// Some of our optimized paths use inline assembly, and for now we don't
// bother enabling other optimized paths using intrinsics where we can't
// use inline assembly paths.
#ifdef GEMMLOWP_ALLOW_INLINE_ASM

// Detect NEON. It's important to check for both tokens.
#if (defined __ARM_NEON) || (defined __ARM_NEON__)
#define GEMMLOWP_NEON
#endif

// Convenience NEON tokens for 32-bit or 64-bit
#if defined(GEMMLOWP_NEON) && defined(GEMMLOWP_ARM_32)
#define GEMMLOWP_NEON_32
#endif

#if defined(GEMMLOWP_NEON) && defined(GEMMLOWP_ARM_64)
#define GEMMLOWP_NEON_64
#endif

// Detect MIPS MSA.
// Limit MSA optimizations to little-endian CPUs for now.
// TODO: Perhaps eventually support MSA optimizations on big-endian CPUs?
#if defined(GEMMLOWP_MIPS) && (__mips_isa_rev >= 5) && defined(__mips_msa) && \
    defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
#define GEMMLOWP_MSA
#endif

// Convenience MIPS MSA tokens for 32-bit or 64-bit.
#if defined(GEMMLOWP_MSA) && defined(GEMMLOWP_MIPS_32)
#define GEMMLOWP_MSA_32
#endif

#if defined(GEMMLOWP_MSA) && defined(GEMMLOWP_MIPS_64)
#define GEMMLOWP_MSA_64
#endif

// Detect SSE.
#ifdef __SSE4_1__
#define GEMMLOWP_SSE4
#endif

#ifdef __SSE3__
#define GEMMLOWP_SSE3
#endif

// Convenience SSE4 tokens for 32-bit or 64-bit
#if defined(GEMMLOWP_SSE4) && defined(GEMMLOWP_X86_32) && \
    !defined(GEMMLOWP_DISABLE_SSE4)
#define GEMMLOWP_SSE4_32
#endif

#if defined(GEMMLOWP_SSE3) && defined(GEMMLOWP_X86_32)
#define GEMMLOWP_SSE3_32
#endif

#if defined(GEMMLOWP_SSE4) && defined(GEMMLOWP_X86_64) && \
    !defined(GEMMLOWP_DISABLE_SSE4)
#define GEMMLOWP_SSE4_64
#endif

#if defined(GEMMLOWP_SSE3) && defined(GEMMLOWP_X86_64)
#define GEMMLOWP_SSE3_64
#endif

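// Sanitizer integration. A note added here (an assumption, not from the
// original comments): this block sits inside the GEMMLOWP_ALLOW_INLINE_ASM
// guard presumably because sanitizers cannot observe stores performed by
// inline assembly, so buffers written by asm kernels must be manually
// marked as initialized.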
#if defined(__has_feature)
#if __has_feature(memory_sanitizer)
#include <sanitizer/msan_interface.h>
#define GEMMLOWP_MARK_MEMORY_AS_INITIALIZED __msan_unpoison
#elif __has_feature(address_sanitizer)
#include <sanitizer/asan_interface.h>
#define GEMMLOWP_MARK_MEMORY_AS_INITIALIZED __asan_unpoison_memory_region
#endif
#endif

#endif  // GEMMLOWP_ALLOW_INLINE_ASM

// Detect Android. Don't conflate with ARM - we care about tuning
// for non-ARM Android devices too. This can be used in conjunction
// with x86 to tune differently for mobile x86 CPUs (Atom) vs. desktop x86 CPUs.
#if defined(__ANDROID__) || defined(ANDROID)
#define GEMMLOWP_ANDROID
#endif

namespace gemmlowp {

// Standard cache line size. Useful to optimize alignment and
// prefetches. Ideally we would query this at runtime, however
// 64 byte cache lines are the vast majority, and even if it's
// wrong on some device, it will be wrong by no more than a 2x factor,
// which should be acceptable.
const int kDefaultCacheLineSize = 64;

// Default L1 and L2 data cache sizes.
// The L1 cache size is assumed to be for each core.
// The L2 cache size is assumed to be shared among all cores. What
// we call 'L2' here is effectively top-level cache.
//
// On x86, we should ideally query this at
// runtime. On ARM, the instruction to query this is privileged and
// Android kernels do not expose it to userspace. Fortunately, the majority
// of ARM devices have roughly comparable values:
//   Nexus 5: L1 16k, L2 1M
//   Android One: L1 32k, L2 512k
// The following values are equal to or somewhat lower than that, and were
// found to perform well on both the Nexus 5 and Android One.
// Of course, these values are in principle too low for typical x86 CPUs
// where we should set the L2 value to (L3 cache size / number of cores) at
// least.
//
#if defined(GEMMLOWP_ARM) && defined(__APPLE__)
// iPhone/iPad
const int kDefaultL1CacheSize = 48 * 1024;
const int kDefaultL2CacheSize = 2 * 1024 * 1024;
#elif defined(GEMMLOWP_ARM) || defined(GEMMLOWP_ANDROID)
// Other ARM or ARM-like hardware (Android implies ARM-like), so it's OK
// here to tune for ARM, although on x86 Atom we might be able to query
// cache sizes at runtime, which would be better.
const int kDefaultL1CacheSize = 16 * 1024;
const int kDefaultL2CacheSize = 384 * 1024;
#elif defined(GEMMLOWP_X86_64)
// x86-64 and not Android. Therefore, likely desktop-class x86 hardware.
// Thus we assume larger cache sizes, though we really should query
// them at runtime.
const int kDefaultL1CacheSize = 32 * 1024;
const int kDefaultL2CacheSize = 4 * 1024 * 1024;
#elif defined(GEMMLOWP_X86_32)
// x86-32 and not Android. Same as x86-64 but less bullish.
const int kDefaultL1CacheSize = 32 * 1024;
const int kDefaultL2CacheSize = 2 * 1024 * 1024;
#elif defined(GEMMLOWP_MIPS)
// MIPS and not Android. TODO: MIPS and Android?
const int kDefaultL1CacheSize = 32 * 1024;
const int kDefaultL2CacheSize = 1024 * 1024;
#else
// Less common hardware. Maybe some unusual or older or embedded thing.
// Assume smaller caches, but don't depart too far from what we do
// on ARM/Android to avoid accidentally exposing unexpected behavior.
const int kDefaultL1CacheSize = 16 * 1024;
const int kDefaultL2CacheSize = 256 * 1024;
#endif

// The proportion of the cache that we intend to use for storing
// RHS blocks. This should be between 0 and 1, and typically closer to 1,
// as we typically want to use most of the L2 cache for storing a large
// RHS block.
#if defined(GEMMLOWP_X86)
// For Intel architectures (IA), use the entire L2 cache for the RHS matrix.
// The LHS matrix is not blocked for L2 cache.
const float kDefaultL2RhsFactor = 1.00f;
#else
const float kDefaultL2RhsFactor = 0.75f;
#endif
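// Worked example (illustrative): with the default ARM/Android settings above,
// kDefaultL2CacheSize = 384 * 1024 and kDefaultL2RhsFactor = 0.75f budget
// about 288 KiB of L2 for the packed RHS block, leaving the remaining
// quarter of the cache for everything else.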

// The number of bytes in a SIMD register. This is used to determine
// the dimensions of PackingRegisterBlock so that such blocks can
// be efficiently loaded into registers, so that packing code can
// work within registers as much as possible.
// In the non-SIMD generic fallback code, this is just a generic array
// size, so any size would work there. Different platforms may set this
// to different values but must ensure that their own optimized packing paths
// are consistent with this value.
const int kRegisterSize = 16;
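// For reference: 16 bytes is the width of one 128-bit NEON "q" register or
// one SSE XMM register, i.e. sixteen uint8 lanes or four int32 lanes.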

// Hints the CPU to prefetch the cache line containing ptr.
inline void Prefetch(const void* ptr) {
#if defined GEMMLOWP_ARM_64 && defined GEMMLOWP_ALLOW_INLINE_ASM
  // AArch64 has very detailed prefetch instructions that compilers
  // don't know how to map __builtin_prefetch onto, so they leave
  // __builtin_prefetch a no-op on this architecture.
  // For our purposes, "pldl1keep" is usually what we want, meaning:
  // "prefetch for load, into L1 cache, using each value multiple times".
  asm volatile("prfm pldl1keep, [%[ptr]]\n" ::[ptr] "r"(ptr) :);
#elif defined(__GNUC__)
  // Clang and GCC define __GNUC__ and have __builtin_prefetch.
  __builtin_prefetch(ptr);
#else
  (void)ptr;
#endif
}
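// Hypothetical usage sketch (not from the original header): when streaming
// through a source buffer, one might prefetch one cache line ahead of the
// current read position, e.g.
//   Prefetch(src_ptr + kDefaultCacheLineSize);
// where src_ptr is an illustrative pointer into the buffer being read.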

// Returns the runtime argument rounded down to the nearest multiple of
// the fixed Modulus.
template <unsigned Modulus, typename Integer>
Integer RoundDown(Integer i) {
  return i - (i % Modulus);
}
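// Worked example (illustrative, assuming a non-negative argument):
//   RoundDown<8>(13) == 8,  RoundDown<8>(16) == 16.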

// Returns the runtime argument rounded up to the nearest multiple of
// the fixed Modulus.
template <unsigned Modulus, typename Integer>
Integer RoundUp(Integer i) {
  return RoundDown<Modulus>(i + Modulus - 1);
}
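// Worked example (illustrative, assuming a non-negative argument):
//   RoundUp<8>(13) == 16,  RoundUp<8>(16) == 16.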

// Returns the quotient a / b rounded up ('ceil') to the nearest integer.
template <typename Integer>
Integer CeilQuotient(Integer a, Integer b) {
  return (a + b - 1) / b;
}
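// Worked example (illustrative, assuming positive arguments):
//   CeilQuotient(13, 8) == 2, whereas plain integer division 13 / 8 == 1.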

// Returns the argument rounded up to the nearest power of two.
template <typename Integer>
Integer RoundUpToPowerOfTwo(Integer n) {
  Integer i = n - 1;
  i |= i >> 1;
  i |= i >> 2;
  i |= i >> 4;
  i |= i >> 8;
  i |= i >> 16;
  return i + 1;
}
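// Worked example (illustrative): RoundUpToPowerOfTwo(17) == 32, and powers
// of two map to themselves. Note (an observation, not an original comment):
// the bit smearing above covers 32-bit values; a 64-bit Integer exceeding
// 2^32 would additionally need an `i |= i >> 32` step.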

template <int N>
struct IsPowerOfTwo {
  static const bool value = !(N & (N - 1));
};
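// Illustrative compile-time checks (not part of the original header):
//   static_assert(IsPowerOfTwo<16>::value, "16 is a power of two");
//   static_assert(!IsPowerOfTwo<24>::value, "24 is not a power of two");
// Note that N == 0 also yields true under this definition.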

template <typename T>
void MarkMemoryAsInitialized(T* ptr, int size) {
#ifdef GEMMLOWP_MARK_MEMORY_AS_INITIALIZED
  GEMMLOWP_MARK_MEMORY_AS_INITIALIZED(static_cast<void*>(ptr),
                                      size * sizeof(T));
#else
  (void)ptr;
  (void)size;
#endif
}
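// Illustrative usage (hypothetical): after an inline-asm kernel has filled
// `buffer` with `count` elements, calling MarkMemoryAsInitialized(buffer,
// count) tells sanitizer builds not to flag accesses to that region, and
// compiles to nothing in regular builds.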

}  // namespace gemmlowp

#endif  // GEMMLOWP_INTERNAL_COMMON_H_