1/*--------------------------------------------------------------------*/ 2/*--- Cachegrind: cache configuration. cg-arch.c ---*/ 3/*--------------------------------------------------------------------*/ 4 5/* 6 This file is part of Cachegrind, a Valgrind tool for cache 7 profiling programs. 8 9 Copyright (C) 2011-2015 Nicholas Nethercote 10 njn@valgrind.org 11 12 This program is free software; you can redistribute it and/or 13 modify it under the terms of the GNU General Public License as 14 published by the Free Software Foundation; either version 2 of the 15 License, or (at your option) any later version. 16 17 This program is distributed in the hope that it will be useful, but 18 WITHOUT ANY WARRANTY; without even the implied warranty of 19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 20 General Public License for more details. 21 22 You should have received a copy of the GNU General Public License 23 along with this program; if not, write to the Free Software 24 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 25 02111-1307, USA. 26 27 The GNU General Public License is contained in the file COPYING. 28*/ 29 30#include "pub_tool_basics.h" 31#include "pub_tool_libcassert.h" 32#include "pub_tool_libcbase.h" 33#include "pub_tool_libcprint.h" 34#include "pub_tool_options.h" 35#include "pub_tool_machine.h" 36 37#include "cg_arch.h" 38 39static void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* LLc, 40 Bool all_caches_clo_defined); 41 42// Checks cache config is ok. Returns NULL if ok, or a pointer to an error 43// string otherwise. 44static const HChar* check_cache(cache_t* cache) 45{ 46 // Simulator requires set count to be a power of two. 47 if ((cache->size % (cache->line_size * cache->assoc) != 0) || 48 (-1 == VG_(log2)(cache->size/cache->line_size/cache->assoc))) 49 { 50 return "Cache set count is not a power of two.\n"; 51 } 52 53 // Simulator requires line size to be a power of two. 54 if (-1 == VG_(log2)(cache->line_size)) { 55 return "Cache line size is not a power of two.\n"; 56 } 57 58 // Then check line size >= 16 -- any smaller and a single instruction could 59 // straddle three cache lines, which breaks a simulation assertion and is 60 // stupid anyway. 61 if (cache->line_size < MIN_LINE_SIZE) { 62 return "Cache line size is too small.\n"; 63 } 64 65 /* Then check cache size > line size (causes seg faults if not). */ 66 if (cache->size <= cache->line_size) { 67 return "Cache size <= line size.\n"; 68 } 69 70 /* Then check assoc <= (size / line size) (seg faults otherwise). */ 71 if (cache->assoc > (cache->size / cache->line_size)) { 72 return "Cache associativity > (size / line size).\n"; 73 } 74 75 return NULL; 76} 77 78 79static void parse_cache_opt ( cache_t* cache, const HChar* opt, 80 const HChar* optval ) 81{ 82 Long i1, i2, i3; 83 HChar* endptr; 84 const HChar* checkRes; 85 86 // Option argument looks like "65536,2,64". Extract them. 87 i1 = VG_(strtoll10)(optval, &endptr); if (*endptr != ',') goto bad; 88 i2 = VG_(strtoll10)(endptr+1, &endptr); if (*endptr != ',') goto bad; 89 i3 = VG_(strtoll10)(endptr+1, &endptr); if (*endptr != '\0') goto bad; 90 91 // Check for overflow. 92 cache->size = (Int)i1; 93 cache->assoc = (Int)i2; 94 cache->line_size = (Int)i3; 95 if (cache->size != i1) goto overflow; 96 if (cache->assoc != i2) goto overflow; 97 if (cache->line_size != i3) goto overflow; 98 99 checkRes = check_cache(cache); 100 if (checkRes) { 101 VG_(fmsg)("%s", checkRes); 102 goto bad; 103 } 104 105 return; 106 107 bad: 108 VG_(fmsg_bad_option)(opt, "Bad argument '%s'\n", optval); 109 110 overflow: 111 VG_(fmsg_bad_option)(opt, 112 "One of the cache parameters was too large and overflowed.\n"); 113} 114 115 116Bool VG_(str_clo_cache_opt)(const HChar *arg, 117 cache_t* clo_I1c, 118 cache_t* clo_D1c, 119 cache_t* clo_LLc) 120{ 121 const HChar* tmp_str; 122 123 if VG_STR_CLO(arg, "--I1", tmp_str) { 124 parse_cache_opt(clo_I1c, arg, tmp_str); 125 return True; 126 } else if VG_STR_CLO(arg, "--D1", tmp_str) { 127 parse_cache_opt(clo_D1c, arg, tmp_str); 128 return True; 129 } else if (VG_STR_CLO(arg, "--L2", tmp_str) || // for backwards compatibility 130 VG_STR_CLO(arg, "--LL", tmp_str)) { 131 parse_cache_opt(clo_LLc, arg, tmp_str); 132 return True; 133 } else 134 return False; 135} 136 137static void umsg_cache_img(const HChar* desc, cache_t* c) 138{ 139 VG_(umsg)(" %s: %'d B, %d-way, %d B lines\n", desc, 140 c->size, c->assoc, c->line_size); 141} 142 143// Verifies if c is a valid cache. 144// An invalid value causes an assert, unless clo_redefined is True. 145static void check_cache_or_override(const HChar* desc, cache_t* c, Bool clo_redefined) 146{ 147 const HChar* checkRes; 148 149 checkRes = check_cache(c); 150 if (checkRes) { 151 VG_(umsg)("Auto-detected %s cache configuration not supported: %s", 152 desc, checkRes); 153 umsg_cache_img(desc, c); 154 if (!clo_redefined) { 155 VG_(umsg)("As it probably should be supported, please report a bug!\n"); 156 VG_(umsg)("Bypass this message by using option --%s=...\n", desc); 157 tl_assert(0); 158 } 159 } 160} 161 162 163/* If the LL cache config isn't something the simulation functions 164 can handle, try to adjust it so it is. Caches are characterised 165 by (total size T, line size L, associativity A), and then we 166 have 167 168 number of sets S = T / (L * A) 169 170 The required constraints are: 171 172 * L must be a power of 2, but it always is in practice, so 173 no problem there 174 175 * A can be any value >= 1 176 177 * T can be any value, but .. 178 179 * S must be a power of 2. 180 181 That sometimes gives a problem. For example, some Core iX based 182 Intel CPUs have T = 12MB, A = 16, L = 64, which gives 12288 183 sets. Some AMD cpus have T = 5MB, A = 48, L = 64, which gives 184 1706.667 sets (!). 185 186 The "fix" is to force S down to the nearest power of two below its 187 original value, and increase A proportionately, so as to keep the 188 total cache size the same. In fact to be safe we recalculate the 189 cache size afterwards anyway, to guarantee that it divides exactly 190 between the new number of sets. 191 192 The "fix" is "justified" (cough, cough) by alleging that 193 increases of associativity above about 4 have very little effect 194 on the actual miss rate. It would be far more inaccurate to 195 fudge this by changing the size of the simulated cache -- 196 changing the associativity is a much better option. 197*/ 198 199/* (Helper function) Returns the largest power of 2 that is <= |x|. 200 Even works when |x| == 0. */ 201static UInt floor_power_of_2 ( UInt x ) 202{ 203 x = x | (x >> 1); 204 x = x | (x >> 2); 205 x = x | (x >> 4); 206 x = x | (x >> 8); 207 x = x | (x >> 16); 208 return x - (x >> 1); 209} 210 211static void 212maybe_tweak_LLc(cache_t *LLc) 213{ 214 if (LLc->size == 0 || LLc->assoc == 0 || LLc->line_size == 0) 215 return; 216 217 tl_assert(LLc->size > 0 && LLc->assoc > 0 && LLc->line_size > 0); 218 219 UInt old_size = (UInt)LLc->size; 220 UInt old_assoc = (UInt)LLc->assoc; 221 UInt old_line_size = (UInt)LLc->line_size; 222 223 UInt new_size = old_size; 224 UInt new_assoc = old_assoc; 225 UInt new_line_size = old_line_size; 226 227 UInt old_nSets = old_size / (old_assoc * old_line_size); 228 if (old_nSets == 0) { 229 /* This surely can't happen; but would cause chaos with the maths 230 * below if it did. Just give up if it does. */ 231 return; 232 } 233 234 if (-1 != VG_(log2_64)(old_nSets)) { 235 /* The number of sets is already a power of 2. Make sure that 236 the size divides exactly between the sets. Almost all of the 237 time this will have no effect. */ 238 new_size = old_line_size * old_assoc * old_nSets; 239 } else { 240 /* The number of sets isn't a power of two. Calculate some 241 scale-down factor which causes the number of sets to become a 242 power of two. Then, increase the associativity by that 243 factor. Finally, re-calculate the total size so as to make 244 sure it divides exactly between the sets. */ 245 tl_assert(old_nSets >= 0); 246 UInt new_nSets = floor_power_of_2 ( old_nSets ); 247 tl_assert(new_nSets > 0 && new_nSets < old_nSets); 248 Double factor = (Double)old_nSets / (Double)new_nSets; 249 tl_assert(factor >= 1.0); 250 251 new_assoc = (UInt)(0.5 + factor * (Double)old_assoc); 252 tl_assert(new_assoc >= old_assoc); 253 254 new_size = old_line_size * new_assoc * new_nSets; 255 } 256 257 tl_assert(new_line_size == old_line_size); /* we never change this */ 258 if (new_size == old_size && new_assoc == old_assoc) 259 return; 260 261 VG_(dmsg)("warning: " 262 "specified LL cache: line_size %u assoc %u total_size %'u\n", 263 old_line_size, old_assoc, old_size); 264 VG_(dmsg)("warning: " 265 "simulated LL cache: line_size %u assoc %u total_size %'u\n",\ 266 new_line_size, new_assoc, new_size); 267 268 LLc->size = new_size; 269 LLc->assoc = new_assoc; 270 LLc->line_size = new_line_size; 271} 272 273void VG_(post_clo_init_configure_caches)(cache_t* I1c, 274 cache_t* D1c, 275 cache_t* LLc, 276 cache_t* clo_I1c, 277 cache_t* clo_D1c, 278 cache_t* clo_LLc) 279{ 280#define DEFINED(L) (-1 != L->size || -1 != L->assoc || -1 != L->line_size) 281 282 // Count how many were defined on the command line. 283 Bool all_caches_clo_defined = 284 (DEFINED(clo_I1c) && 285 DEFINED(clo_D1c) && 286 DEFINED(clo_LLc)); 287 288 // Set the cache config (using auto-detection, if supported by the 289 // architecture). 290 configure_caches( I1c, D1c, LLc, all_caches_clo_defined ); 291 292 maybe_tweak_LLc( LLc ); 293 294 // Check the default/auto-detected values. 295 // Allow the user to override invalid auto-detected caches 296 // with command line. 297 check_cache_or_override ("I1", I1c, DEFINED(clo_I1c)); 298 check_cache_or_override ("D1", D1c, DEFINED(clo_D1c)); 299 check_cache_or_override ("LL", LLc, DEFINED(clo_LLc)); 300 301 // Then replace with any defined on the command line. (Already checked in 302 // VG(parse_clo_cache_opt)().) 303 if (DEFINED(clo_I1c)) { *I1c = *clo_I1c; } 304 if (DEFINED(clo_D1c)) { *D1c = *clo_D1c; } 305 if (DEFINED(clo_LLc)) { *LLc = *clo_LLc; } 306 307 if (VG_(clo_verbosity) >= 2) { 308 VG_(umsg)("Cache configuration used:\n"); 309 umsg_cache_img ("I1", I1c); 310 umsg_cache_img ("D1", D1c); 311 umsg_cache_img ("LL", LLc); 312 } 313#undef DEFINED 314} 315 316void VG_(print_cache_clo_opts)() 317{ 318 VG_(printf)( 319" --I1=<size>,<assoc>,<line_size> set I1 cache manually\n" 320" --D1=<size>,<assoc>,<line_size> set D1 cache manually\n" 321" --LL=<size>,<assoc>,<line_size> set LL cache manually\n" 322 ); 323} 324 325 326// Traverse the cache info and return a cache of the given kind and level. 327// Return NULL if no such cache exists. 328static const VexCache * 329locate_cache(const VexCacheInfo *ci, VexCacheKind kind, UInt level) 330{ 331 const VexCache *c; 332 333 for (c = ci->caches; c != ci->caches + ci->num_caches; ++c) { 334 if (c->level == level && c->kind == kind) { 335 return c; 336 } 337 } 338 return NULL; // not found 339} 340 341 342// Gives the auto-detected configuration of I1, D1 and LL caches. They get 343// overridden by any cache configurations specified on the command line. 344static void 345configure_caches(cache_t *I1c, cache_t *D1c, cache_t *LLc, 346 Bool all_caches_clo_defined) 347{ 348 VexArchInfo vai; 349 const VexCacheInfo *ci; 350 const VexCache *i1, *d1, *ll; 351 352 VG_(machine_get_VexArchInfo)(NULL, &vai); 353 ci = &vai.hwcache_info; 354 355 // Extract what we need 356 i1 = locate_cache(ci, INSN_CACHE, 1); 357 d1 = locate_cache(ci, DATA_CACHE, 1); 358 ll = locate_cache(ci, UNIFIED_CACHE, ci->num_levels); 359 360 if (ci->num_caches > 0 && ll == NULL) { 361 VG_(dmsg)("warning: L2 cache not installed, ignore LL results.\n"); 362 } 363 364 if (ll && ci->num_levels > 2) { 365 VG_(dmsg)("warning: L%u cache found, using its data for the " 366 "LL simulation.\n", ci->num_levels); 367 } 368 369 if (i1 && d1 && ll) { 370 if (i1->is_trace_cache) { 371 /* HACK ALERT: Instruction trace cache -- capacity is micro-ops based. 372 * conversion to byte size is a total guess; treat the 12K and 16K 373 * cases the same since the cache byte size must be a power of two for 374 * everything to work!. Also guessing 32 bytes for the line size... 375 */ 376 UInt adjusted_size, guessed_line_size = 32; 377 378 if (i1->sizeB == 12 * 1024 || i1->sizeB == 16 * 1024) { 379 adjusted_size = 16 * 1024; 380 } else { 381 adjusted_size = 32 * 1024; 382 } 383 VG_(dmsg)("warning: Pentium 4 with %u KB micro-op instruction trace cache\n", 384 i1->sizeB / 1024); 385 VG_(dmsg)(" Simulating a %u KB I-cache with %u B lines\n", 386 adjusted_size / 1024, guessed_line_size); 387 388 *I1c = (cache_t) { adjusted_size, i1->assoc, guessed_line_size }; 389 } else { 390 *I1c = (cache_t) { i1->sizeB, i1->assoc, i1->line_sizeB }; 391 } 392 *D1c = (cache_t) { d1->sizeB, d1->assoc, d1->line_sizeB }; 393 *LLc = (cache_t) { ll->sizeB, ll->assoc, ll->line_sizeB }; 394 395 return; 396 } 397 398 // Cache information could not be queried; choose some default 399 // architecture specific default setting. 400 401#if defined(VGA_ppc32) 402 403 // Default cache configuration 404 *I1c = (cache_t) { 65536, 2, 64 }; 405 *D1c = (cache_t) { 65536, 2, 64 }; 406 *LLc = (cache_t) { 262144, 8, 64 }; 407 408#elif defined(VGA_ppc64be) || defined(VGA_ppc64le) 409 410 // Default cache configuration 411 *I1c = (cache_t) { 65536, 2, 64 }; 412 *D1c = (cache_t) { 65536, 2, 64 }; 413 *LLc = (cache_t) { 262144, 8, 64 }; 414 415#elif defined(VGA_arm) 416 417 // Set caches to default (for Cortex-A8 ?) 418 *I1c = (cache_t) { 16384, 4, 64 }; 419 *D1c = (cache_t) { 16384, 4, 64 }; 420 *LLc = (cache_t) { 262144, 8, 64 }; 421 422#elif defined(VGA_arm64) 423 424 // Copy the 32-bit ARM version until such time as we have 425 // some real hardware to run on 426 *I1c = (cache_t) { 16384, 4, 64 }; 427 *D1c = (cache_t) { 16384, 4, 64 }; 428 *LLc = (cache_t) { 262144, 8, 64 }; 429 430#elif defined(VGA_s390x) 431 // 432 // Here is the cache data from older machine models: 433 // 434 // I1 D1 I/D L2 435 // z900 256k/256/4 256k/256/4 16MB 436 // z800 256k/256/4 256k/256/4 8MB 437 // z990 256k/256/4 256k/256/4 32MB 438 // z890 256k/256/4 256k/256/4 32MB 439 // z9 256k/256/4 256k/256/4 40MB 440 // 441 // Sources: 442 // (1) IBM System z9 109 Technical Introduction 443 // www.redbooks.ibm.com/redbooks/pdfs/sg246669.pdf 444 // (2) The microarchitecture of the IBM eServer z900 processor 445 // IBM Journal of Research and Development 446 // Volume 46, Number 4/5, pp 381-395, July/September 2002 447 // (3) The IBM eServer z990 microprocessor 448 // IBM Journal of Research and Development 449 // Volume 48, Number 3/4, pp 295-309, May/July 2004 450 // (4) Charles Webb, IBM 451 // 452 // L2 data is unfortunately incomplete. Otherwise, we could support 453 // machines without the ECAG insn by looking at VEX_S390X_MODEL(hwcaps). 454 455 // Default cache configuration is z10-EC (Source: ECAG insn) 456 *I1c = (cache_t) { 65536, 4, 256 }; 457 *D1c = (cache_t) { 131072, 8, 256 }; 458 *LLc = (cache_t) { 50331648, 24, 256 }; 459 460#elif defined(VGA_mips32) 461 462 // Set caches to default (for MIPS32-r2(mips 74kc)) 463 *I1c = (cache_t) { 32768, 4, 32 }; 464 *D1c = (cache_t) { 32768, 4, 32 }; 465 *LLc = (cache_t) { 524288, 8, 32 }; 466 467#elif defined(VGA_mips64) 468 469 // Set caches to default (for MIPS64 - 5kc) 470 *I1c = (cache_t) { 32768, 4, 32 }; 471 *D1c = (cache_t) { 32768, 4, 32 }; 472 *LLc = (cache_t) { 524288, 8, 32 }; 473 474#elif defined(VGA_x86) || defined(VGA_amd64) 475 476 *I1c = (cache_t) { 65536, 2, 64 }; 477 *D1c = (cache_t) { 65536, 2, 64 }; 478 *LLc = (cache_t) { 262144, 8, 64 }; 479 480#elif defined(VGA_tilegx) 481 482 // Set caches to default for Tilegx. 483 *I1c = (cache_t) { 0x8000, 2, 64 }; 484 *D1c = (cache_t) { 0x8000, 2, 64 }; 485 *LLc = (cache_t) { 0x40000, 8, 64 }; 486 487#else 488 489#error "Unknown arch" 490 491#endif 492 493 if (!all_caches_clo_defined) { 494 const HChar warning[] = 495 "Warning: Cannot auto-detect cache config, using defaults.\n" 496 " Run with -v to see.\n"; 497 VG_(dmsg)("%s", warning); 498 } 499} 500 501/*--------------------------------------------------------------------*/ 502/*--- end ---*/ 503/*--------------------------------------------------------------------*/ 504