1/*--------------------------------------------------------------------*/
2/*--- Cachegrind: cache configuration.                   cg-arch.c ---*/
3/*--------------------------------------------------------------------*/
4
5/*
6   This file is part of Cachegrind, a Valgrind tool for cache
7   profiling programs.
8
9   Copyright (C) 2011-2015 Nicholas Nethercote
10      njn@valgrind.org
11
12   This program is free software; you can redistribute it and/or
13   modify it under the terms of the GNU General Public License as
14   published by the Free Software Foundation; either version 2 of the
15   License, or (at your option) any later version.
16
17   This program is distributed in the hope that it will be useful, but
18   WITHOUT ANY WARRANTY; without even the implied warranty of
19   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20   General Public License for more details.
21
22   You should have received a copy of the GNU General Public License
23   along with this program; if not, write to the Free Software
24   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
25   02111-1307, USA.
26
27   The GNU General Public License is contained in the file COPYING.
28*/
29
30#include "pub_tool_basics.h"
31#include "pub_tool_libcassert.h"
32#include "pub_tool_libcbase.h"
33#include "pub_tool_libcprint.h"
34#include "pub_tool_options.h"
35#include "pub_tool_machine.h"
36
37#include "cg_arch.h"
38
39static void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* LLc,
40                             Bool all_caches_clo_defined);
41
42// Checks cache config is ok.  Returns NULL if ok, or a pointer to an error
43// string otherwise.
44static const HChar* check_cache(cache_t* cache)
45{
46   // Simulator requires set count to be a power of two.
47   if ((cache->size % (cache->line_size * cache->assoc) != 0) ||
48       (-1 == VG_(log2)(cache->size/cache->line_size/cache->assoc)))
49   {
50      return "Cache set count is not a power of two.\n";
51   }
52
53   // Simulator requires line size to be a power of two.
54   if (-1 == VG_(log2)(cache->line_size)) {
55      return "Cache line size is not a power of two.\n";
56   }
57
58   // Then check line size >= 16 -- any smaller and a single instruction could
59   // straddle three cache lines, which breaks a simulation assertion and is
60   // stupid anyway.
61   if (cache->line_size < MIN_LINE_SIZE) {
62      return "Cache line size is too small.\n";
63   }
64
65   /* Then check cache size > line size (causes seg faults if not). */
66   if (cache->size <= cache->line_size) {
67      return "Cache size <= line size.\n";
68   }
69
70   /* Then check assoc <= (size / line size) (seg faults otherwise). */
71   if (cache->assoc > (cache->size / cache->line_size)) {
72      return "Cache associativity > (size / line size).\n";
73   }
74
75   return NULL;
76}
77
78
79static void parse_cache_opt ( cache_t* cache, const HChar* opt,
80                              const HChar* optval )
81{
82   Long i1, i2, i3;
83   HChar* endptr;
84   const HChar* checkRes;
85
86   // Option argument looks like "65536,2,64".  Extract them.
87   i1 = VG_(strtoll10)(optval,   &endptr); if (*endptr != ',')  goto bad;
88   i2 = VG_(strtoll10)(endptr+1, &endptr); if (*endptr != ',')  goto bad;
89   i3 = VG_(strtoll10)(endptr+1, &endptr); if (*endptr != '\0') goto bad;
90
91   // Check for overflow.
92   cache->size      = (Int)i1;
93   cache->assoc     = (Int)i2;
94   cache->line_size = (Int)i3;
95   if (cache->size      != i1) goto overflow;
96   if (cache->assoc     != i2) goto overflow;
97   if (cache->line_size != i3) goto overflow;
98
99   checkRes = check_cache(cache);
100   if (checkRes) {
101      VG_(fmsg)("%s", checkRes);
102      goto bad;
103   }
104
105   return;
106
107  bad:
108   VG_(fmsg_bad_option)(opt, "Bad argument '%s'\n", optval);
109
110  overflow:
111   VG_(fmsg_bad_option)(opt,
112      "One of the cache parameters was too large and overflowed.\n");
113}
114
115
116Bool VG_(str_clo_cache_opt)(const HChar *arg,
117                            cache_t* clo_I1c,
118                            cache_t* clo_D1c,
119                            cache_t* clo_LLc)
120{
121   const HChar* tmp_str;
122
123   if      VG_STR_CLO(arg, "--I1", tmp_str) {
124      parse_cache_opt(clo_I1c, arg, tmp_str);
125      return True;
126   } else if VG_STR_CLO(arg, "--D1", tmp_str) {
127      parse_cache_opt(clo_D1c, arg, tmp_str);
128      return True;
129   } else if (VG_STR_CLO(arg, "--L2", tmp_str) || // for backwards compatibility
130              VG_STR_CLO(arg, "--LL", tmp_str)) {
131      parse_cache_opt(clo_LLc, arg, tmp_str);
132      return True;
133   } else
134      return False;
135}
136
137static void umsg_cache_img(const HChar* desc, cache_t* c)
138{
139   VG_(umsg)("  %s: %'d B, %d-way, %d B lines\n", desc,
140             c->size, c->assoc, c->line_size);
141}
142
143// Verifies if c is a valid cache.
144// An invalid value causes an assert, unless clo_redefined is True.
145static void check_cache_or_override(const HChar* desc, cache_t* c, Bool clo_redefined)
146{
147   const HChar* checkRes;
148
149   checkRes = check_cache(c);
150   if (checkRes) {
151      VG_(umsg)("Auto-detected %s cache configuration not supported: %s",
152                desc, checkRes);
153      umsg_cache_img(desc, c);
154      if (!clo_redefined) {
155         VG_(umsg)("As it probably should be supported, please report a bug!\n");
156         VG_(umsg)("Bypass this message by using option --%s=...\n", desc);
157         tl_assert(0);
158      }
159   }
160}
161
162
163/* If the LL cache config isn't something the simulation functions
164   can handle, try to adjust it so it is.  Caches are characterised
165   by (total size T, line size L, associativity A), and then we
166   have
167
168     number of sets S = T / (L * A)
169
170   The required constraints are:
171
172   * L must be a power of 2, but it always is in practice, so
173     no problem there
174
175   * A can be any value >= 1
176
177   * T can be any value, but ..
178
179   * S must be a power of 2.
180
181   That sometimes gives a problem.  For example, some Core iX based
182   Intel CPUs have T = 12MB, A = 16, L = 64, which gives 12288
183   sets.  Some AMD cpus have T = 5MB, A = 48, L = 64, which gives
184   1706.667 sets (!).
185
186   The "fix" is to force S down to the nearest power of two below its
187   original value, and increase A proportionately, so as to keep the
188   total cache size the same.  In fact to be safe we recalculate the
189   cache size afterwards anyway, to guarantee that it divides exactly
190   between the new number of sets.
191
192   The "fix" is "justified" (cough, cough) by alleging that
193   increases of associativity above about 4 have very little effect
194   on the actual miss rate.  It would be far more inaccurate to
195   fudge this by changing the size of the simulated cache --
196   changing the associativity is a much better option.
197*/
198
199/* (Helper function) Returns the largest power of 2 that is <= |x|.
200   Even works when |x| == 0. */
201static UInt floor_power_of_2 ( UInt x )
202{
203   x = x | (x >> 1);
204   x = x | (x >> 2);
205   x = x | (x >> 4);
206   x = x | (x >> 8);
207   x = x | (x >> 16);
208   return x - (x >> 1);
209}
210
211static void
212maybe_tweak_LLc(cache_t *LLc)
213{
214  if (LLc->size == 0 || LLc->assoc == 0 || LLc->line_size == 0)
215     return;
216
217  tl_assert(LLc->size > 0 && LLc->assoc > 0 && LLc->line_size > 0);
218
219  UInt old_size      = (UInt)LLc->size;
220  UInt old_assoc     = (UInt)LLc->assoc;
221  UInt old_line_size = (UInt)LLc->line_size;
222
223  UInt new_size      = old_size;
224  UInt new_assoc     = old_assoc;
225  UInt new_line_size = old_line_size;
226
227  UInt old_nSets = old_size / (old_assoc * old_line_size);
228  if (old_nSets == 0) {
229     /* This surely can't happen; but would cause chaos with the maths
230      * below if it did.  Just give up if it does. */
231     return;
232  }
233
234  if (-1 != VG_(log2_64)(old_nSets)) {
235     /* The number of sets is already a power of 2.  Make sure that
236        the size divides exactly between the sets.  Almost all of the
237        time this will have no effect. */
238     new_size = old_line_size * old_assoc * old_nSets;
239  } else {
240     /* The number of sets isn't a power of two.  Calculate some
241        scale-down factor which causes the number of sets to become a
242        power of two.  Then, increase the associativity by that
243        factor.  Finally, re-calculate the total size so as to make
244        sure it divides exactly between the sets. */
245     tl_assert(old_nSets >= 0);
246     UInt new_nSets = floor_power_of_2 ( old_nSets );
247     tl_assert(new_nSets > 0 && new_nSets < old_nSets);
248     Double factor = (Double)old_nSets / (Double)new_nSets;
249     tl_assert(factor >= 1.0);
250
251     new_assoc = (UInt)(0.5 + factor * (Double)old_assoc);
252     tl_assert(new_assoc >= old_assoc);
253
254     new_size = old_line_size * new_assoc * new_nSets;
255  }
256
257  tl_assert(new_line_size == old_line_size); /* we never change this */
258  if (new_size == old_size && new_assoc == old_assoc)
259     return;
260
261  VG_(dmsg)("warning: "
262            "specified LL cache: line_size %u  assoc %u  total_size %'u\n",
263            old_line_size, old_assoc, old_size);
264  VG_(dmsg)("warning: "
265            "simulated LL cache: line_size %u  assoc %u  total_size %'u\n",\
266            new_line_size, new_assoc, new_size);
267
268  LLc->size      = new_size;
269  LLc->assoc     = new_assoc;
270  LLc->line_size = new_line_size;
271}
272
273void VG_(post_clo_init_configure_caches)(cache_t* I1c,
274                                         cache_t* D1c,
275                                         cache_t* LLc,
276                                         cache_t* clo_I1c,
277                                         cache_t* clo_D1c,
278                                         cache_t* clo_LLc)
279{
280#define DEFINED(L)   (-1 != L->size  || -1 != L->assoc || -1 != L->line_size)
281
282   // Count how many were defined on the command line.
283   Bool all_caches_clo_defined =
284      (DEFINED(clo_I1c) &&
285       DEFINED(clo_D1c) &&
286       DEFINED(clo_LLc));
287
288   // Set the cache config (using auto-detection, if supported by the
289   // architecture).
290   configure_caches( I1c, D1c, LLc, all_caches_clo_defined );
291
292   maybe_tweak_LLc( LLc );
293
294   // Check the default/auto-detected values.
295   // Allow the user to override invalid auto-detected caches
296   // with command line.
297   check_cache_or_override ("I1", I1c, DEFINED(clo_I1c));
298   check_cache_or_override ("D1", D1c, DEFINED(clo_D1c));
299   check_cache_or_override ("LL", LLc, DEFINED(clo_LLc));
300
301   // Then replace with any defined on the command line.  (Already checked in
302   // VG(parse_clo_cache_opt)().)
303   if (DEFINED(clo_I1c)) { *I1c = *clo_I1c; }
304   if (DEFINED(clo_D1c)) { *D1c = *clo_D1c; }
305   if (DEFINED(clo_LLc)) { *LLc = *clo_LLc; }
306
307   if (VG_(clo_verbosity) >= 2) {
308      VG_(umsg)("Cache configuration used:\n");
309      umsg_cache_img ("I1", I1c);
310      umsg_cache_img ("D1", D1c);
311      umsg_cache_img ("LL", LLc);
312   }
313#undef DEFINED
314}
315
316void VG_(print_cache_clo_opts)()
317{
318   VG_(printf)(
319"    --I1=<size>,<assoc>,<line_size>  set I1 cache manually\n"
320"    --D1=<size>,<assoc>,<line_size>  set D1 cache manually\n"
321"    --LL=<size>,<assoc>,<line_size>  set LL cache manually\n"
322               );
323}
324
325
326// Traverse the cache info and return a cache of the given kind and level.
327// Return NULL if no such cache exists.
328static const VexCache *
329locate_cache(const VexCacheInfo *ci, VexCacheKind kind, UInt level)
330{
331   const VexCache *c;
332
333   for (c = ci->caches; c != ci->caches + ci->num_caches; ++c) {
334      if (c->level == level && c->kind == kind) {
335         return c;
336      }
337   }
338   return NULL;  // not found
339}
340
341
342// Gives the auto-detected configuration of I1, D1 and LL caches.  They get
343// overridden by any cache configurations specified on the command line.
344static void
345configure_caches(cache_t *I1c, cache_t *D1c, cache_t *LLc,
346                 Bool all_caches_clo_defined)
347{
348   VexArchInfo vai;
349   const VexCacheInfo *ci;
350   const VexCache *i1, *d1, *ll;
351
352   VG_(machine_get_VexArchInfo)(NULL, &vai);
353   ci = &vai.hwcache_info;
354
355   // Extract what we need
356   i1 = locate_cache(ci, INSN_CACHE, 1);
357   d1 = locate_cache(ci, DATA_CACHE, 1);
358   ll = locate_cache(ci, UNIFIED_CACHE, ci->num_levels);
359
360   if (ci->num_caches > 0 && ll == NULL) {
361      VG_(dmsg)("warning: L2 cache not installed, ignore LL results.\n");
362   }
363
364   if (ll && ci->num_levels > 2) {
365      VG_(dmsg)("warning: L%u cache found, using its data for the "
366                "LL simulation.\n", ci->num_levels);
367   }
368
369   if (i1 && d1 && ll) {
370      if (i1->is_trace_cache) {
371         /* HACK ALERT: Instruction trace cache -- capacity is micro-ops based.
372          * conversion to byte size is a total guess;  treat the 12K and 16K
373          * cases the same since the cache byte size must be a power of two for
374          * everything to work!.  Also guessing 32 bytes for the line size...
375          */
376         UInt adjusted_size, guessed_line_size = 32;
377
378         if (i1->sizeB == 12 * 1024 || i1->sizeB == 16 * 1024) {
379            adjusted_size = 16 * 1024;
380         } else {
381            adjusted_size = 32 * 1024;
382         }
383         VG_(dmsg)("warning: Pentium 4 with %u KB micro-op instruction trace cache\n",
384                   i1->sizeB / 1024);
385         VG_(dmsg)("         Simulating a %u KB I-cache with %u B lines\n",
386                   adjusted_size / 1024, guessed_line_size);
387
388         *I1c = (cache_t) { adjusted_size, i1->assoc, guessed_line_size };
389      } else {
390         *I1c = (cache_t) { i1->sizeB, i1->assoc, i1->line_sizeB };
391      }
392      *D1c = (cache_t) { d1->sizeB, d1->assoc, d1->line_sizeB };
393      *LLc = (cache_t) { ll->sizeB, ll->assoc, ll->line_sizeB };
394
395      return;
396   }
397
398   // Cache information could not be queried; choose some default
399   // architecture specific default setting.
400
401#if defined(VGA_ppc32)
402
403   // Default cache configuration
404   *I1c = (cache_t) {  65536, 2, 64 };
405   *D1c = (cache_t) {  65536, 2, 64 };
406   *LLc = (cache_t) { 262144, 8, 64 };
407
408#elif defined(VGA_ppc64be) || defined(VGA_ppc64le)
409
410   // Default cache configuration
411   *I1c = (cache_t) {  65536, 2, 64 };
412   *D1c = (cache_t) {  65536, 2, 64 };
413   *LLc = (cache_t) { 262144, 8, 64 };
414
415#elif defined(VGA_arm)
416
417   // Set caches to default (for Cortex-A8 ?)
418   *I1c = (cache_t) {  16384, 4, 64 };
419   *D1c = (cache_t) {  16384, 4, 64 };
420   *LLc = (cache_t) { 262144, 8, 64 };
421
422#elif defined(VGA_arm64)
423
424   // Copy the 32-bit ARM version until such time as we have
425   // some real hardware to run on
426   *I1c = (cache_t) {  16384, 4, 64 };
427   *D1c = (cache_t) {  16384, 4, 64 };
428   *LLc = (cache_t) { 262144, 8, 64 };
429
430#elif defined(VGA_s390x)
431   //
432   // Here is the cache data from older machine models:
433   //
434   //           I1            D1      I/D L2
435   // z900  256k/256/4    256k/256/4   16MB
436   // z800  256k/256/4    256k/256/4    8MB
437   // z990  256k/256/4    256k/256/4   32MB
438   // z890  256k/256/4    256k/256/4   32MB
439   // z9    256k/256/4    256k/256/4   40MB
440   //
441   // Sources:
442   // (1) IBM System z9 109 Technical Introduction
443   //     www.redbooks.ibm.com/redbooks/pdfs/sg246669.pdf
444   // (2) The microarchitecture of the IBM eServer z900 processor
445   //     IBM Journal of Research and Development
446   //     Volume 46, Number 4/5, pp 381-395, July/September 2002
447   // (3) The IBM eServer z990 microprocessor
448   //     IBM Journal of Research and Development
449   //     Volume 48, Number 3/4, pp 295-309, May/July 2004
450   // (4) Charles Webb, IBM
451   //
452   // L2 data is unfortunately incomplete. Otherwise, we could support
453   // machines without the ECAG insn by looking at VEX_S390X_MODEL(hwcaps).
454
455   // Default cache configuration is z10-EC  (Source: ECAG insn)
456   *I1c = (cache_t) {    65536,  4, 256 };
457   *D1c = (cache_t) {   131072,  8, 256 };
458   *LLc = (cache_t) { 50331648, 24, 256 };
459
460#elif defined(VGA_mips32)
461
462   // Set caches to default (for MIPS32-r2(mips 74kc))
463   *I1c = (cache_t) {  32768, 4, 32 };
464   *D1c = (cache_t) {  32768, 4, 32 };
465   *LLc = (cache_t) { 524288, 8, 32 };
466
467#elif defined(VGA_mips64)
468
469   // Set caches to default (for MIPS64 - 5kc)
470   *I1c = (cache_t) {  32768, 4, 32 };
471   *D1c = (cache_t) {  32768, 4, 32 };
472   *LLc = (cache_t) { 524288, 8, 32 };
473
474#elif defined(VGA_x86) || defined(VGA_amd64)
475
476   *I1c = (cache_t) {  65536, 2, 64 };
477   *D1c = (cache_t) {  65536, 2, 64 };
478   *LLc = (cache_t) { 262144, 8, 64 };
479
480#elif defined(VGA_tilegx)
481
482   // Set caches to default for Tilegx.
483   *I1c = (cache_t) { 0x8000,  2, 64 };
484   *D1c = (cache_t) { 0x8000,  2, 64 };
485   *LLc = (cache_t) { 0x40000, 8, 64 };
486
487#else
488
489#error "Unknown arch"
490
491#endif
492
493   if (!all_caches_clo_defined) {
494      const HChar warning[] =
495        "Warning: Cannot auto-detect cache config, using defaults.\n"
496        "         Run with -v to see.\n";
497      VG_(dmsg)("%s", warning);
498   }
499}
500
501/*--------------------------------------------------------------------*/
502/*--- end                                                          ---*/
503/*--------------------------------------------------------------------*/
504