/* sim.c revision 09ee78ec9675201840d895623d49efba1ffe05d8 */
/*--------------------------------------------------------------------*/
/*--- Cache simulation.                                            ---*/
/*---                                                        sim.c ---*/
/*--------------------------------------------------------------------*/

/*
   This file is part of Callgrind, a Valgrind tool for call graph
   profiling programs.

   Copyright (C) 2003-2005, Josef Weidendorfer (Josef.Weidendorfer@gmx.de)

   This tool is derived from and contains code from Cachegrind
   Copyright (C) 2002-2008 Nicholas Nethercote (njn@valgrind.org)

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307, USA.

   The GNU General Public License is contained in the file COPYING.
*/

#include "global.h"


/* Notes:
   - simulates a write-allocate cache
   - (block --> set) hash function uses simple bit selection
   - handling of references straddling two cache blocks:
       - counts as only one cache access (not two)
       - both blocks hit                  --> one hit
       - one block hits, the other misses --> one miss
       - both blocks miss                 --> one miss (not two)
*/
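#if 0
/* Illustrative sketch (not used by the simulator): how the bit-selection
 * hash mentioned above splits an address.  The field widths here are
 * made-up example values; the real ones are computed in
 * cachesim_initcache() below. */
static void example_addr_decompose(void)
{
   /* Assume 64 B lines and 512 sets: 6 offset bits, 9 set bits. */
   Addr  a      = 0x12345678;
   UInt  offset = a & (64-1);          /* byte offset inside the line  */
   UInt  set    = (a >> 6) & (512-1);  /* simple bit selection         */
   UWord tag    = a >> (6+9);          /* remaining upper address bits */
   VG_(printf)("tag %#lx, set %u, offset %u\n", tag, set, offset);
}
#endif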
/* Cache configuration */
#include "cg_arch.h"

/* additional structures for cache use info, separated
 * according to usage frequency:
 * - line_loaded : pointer to the cost center of the instruction
 *                 which loaded the line into the cache.
 *                 Needed to increment counters when the line is evicted.
 * - line_use    : updated on every access
 */
typedef struct {
   UInt count;
   UInt mask; /* e.g. for 64 Byte line size, 1 bit per 2 Bytes */
} line_use;

typedef struct {
   Addr memline, iaddr;
   line_use* dep_use; /* points to the higher-level cache block for this memline */
   ULong* use_base;
} line_loaded;

/* Cache state */
typedef struct {
   char* name;
   int size;        /* bytes */
   int assoc;
   int line_size;   /* bytes */
   Bool sectored;   /* prefetch nearside cacheline on read */
   int sets;
   int sets_min_1;
   int line_size_bits;
   int tag_shift;
   UWord tag_mask;
   char desc_line[128];
   UWord* tags;

   /* for cache use */
   int line_size_mask;
   int* line_start_mask;
   int* line_end_mask;
   line_loaded* loaded;
   line_use* use;
} cache_t2;

/*
 * States of flat caches in our model.
 * We use a 2-level hierarchy.
 */
static cache_t2 I1, D1, L2;

/* Lower bits of cache tags are used as flags for a cache line */
#define CACHELINE_FLAGMASK (MIN_LINE_SIZE-1)
#define CACHELINE_DIRTY    1


/* Cache simulator options */
static Bool clo_simulate_writeback = False;
static Bool clo_simulate_hwpref = False;
static Bool clo_simulate_sectors = False;
static Bool clo_collect_cacheuse = False;

/* The following global variables are set up beforehand by
 * setup_bbcc()/cachesim_after_bbsetup():
 *
 * - Addr   bb_base     (instruction start address of original BB)
 * - ULong* cost_base   (start of cost array for BB)
 * - BBCC*  nonskipped  (only != 0 when in a function not skipped)
 */

/* Offset to events in event set, used in log_* functions */
static Int off_D0_Ir;
static Int off_D1r_Ir;
static Int off_D1r_Dr;
static Int off_D1w_Ir;
static Int off_D1w_Dw;
static Int off_D2_Ir;
static Int off_D2_Dr;
static Int off_D2_Dw;

static Addr   bb_base;
static ULong* cost_base;
static InstrInfo* current_ii;

/* Cache use offsets */
/* FIXME: The offsets are only correct because all event sets get
 * the "Use" set added first!
 */
static Int off_I1_AcCost = 0;
static Int off_I1_SpLoss = 1;
static Int off_D1_AcCost = 0;
static Int off_D1_SpLoss = 1;
static Int off_L2_AcCost = 2;
static Int off_L2_SpLoss = 3;

/* Cache access types */
typedef enum { Read = 0, Write = CACHELINE_DIRTY } RefType;

/* Result of a reference into a flat cache */
typedef enum { Hit = 0, Miss, MissDirty } CacheResult;

/* Result of a reference into a hierarchical cache model */
typedef enum {
   L1_Hit,
   L2_Hit,
   MemAccess,
   WriteBackMemAccess } CacheModelResult;

typedef CacheModelResult (*simcall_type)(Addr, UChar);

static struct {
   simcall_type I1_Read;
   simcall_type D1_Read;
   simcall_type D1_Write;
} simulator;

/*------------------------------------------------------------*/
/*--- Cache Simulator Initialization                       ---*/
/*------------------------------------------------------------*/

static void cachesim_clearcache(cache_t2* c)
{
   Int i;

   for (i = 0; i < c->sets * c->assoc; i++)
      c->tags[i] = 0;
   if (c->use) {
      for (i = 0; i < c->sets * c->assoc; i++) {
         c->loaded[i].memline  = 0;
         c->loaded[i].use_base = 0;
         c->loaded[i].dep_use  = 0;
         c->loaded[i].iaddr    = 0;
         c->use[i].mask  = 0;
         c->use[i].count = 0;
         c->tags[i] = i % c->assoc; /* init lower bits as pointer */
      }
   }
}

static void cacheuse_initcache(cache_t2* c);

/* By this point, the size/assoc/line_size has been checked. */
static void cachesim_initcache(cache_t config, cache_t2* c)
{
   c->size      = config.size;
   c->assoc     = config.assoc;
   c->line_size = config.line_size;
   c->sectored  = False; // FIXME

   c->sets           = (c->size / c->line_size) / c->assoc;
   c->sets_min_1     = c->sets - 1;
   c->line_size_bits = VG_(log2)(c->line_size);
   c->tag_shift      = c->line_size_bits + VG_(log2)(c->sets);
   c->tag_mask       = ~((1<<c->tag_shift)-1);

   /* Can bits in tag entries be used for flags?
    * Should always be true, as MIN_LINE_SIZE >= 16 */
   CLG_ASSERT( (c->tag_mask & CACHELINE_FLAGMASK) == 0);

   if (c->assoc == 1) {
      VG_(sprintf)(c->desc_line, "%d B, %d B, direct-mapped%s",
                   c->size, c->line_size,
                   c->sectored ? ", sectored":"");
   } else {
      VG_(sprintf)(c->desc_line, "%d B, %d B, %d-way associative%s",
                   c->size, c->line_size, c->assoc,
                   c->sectored ? ", sectored":"");
   }

   c->tags = (UWord*) CLG_MALLOC("cl.sim.cs_ic.1",
                                 sizeof(UWord) * c->sets * c->assoc);
   if (clo_collect_cacheuse)
      cacheuse_initcache(c);
   else
      c->use = 0;
   cachesim_clearcache(c);
}
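#if 0
/* Worked example (illustrative only, never called): the derived
 * parameters for an assumed 64 KiB, 2-way cache with 64 B lines. */
static void example_initcache_params(void)
{
   cache_t  config = (cache_t) { 65536, 2, 64 };
   cache_t2 c;
   cachesim_initcache(config, &c);
   CLG_ASSERT(c.sets == 512);          /* (65536/64)/2  */
   CLG_ASSERT(c.line_size_bits == 6);  /* log2(64)      */
   CLG_ASSERT(c.tag_shift == 15);      /* 6 + log2(512) */
   /* so bits 0..5 select the byte, 6..14 the set, 15.. the tag */
}
#endif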
#if 0
static void print_cache(cache_t2* c)
{
   UInt set, way, i;

   /* Note initialisation and update of 'i'. */
   for (i = 0, set = 0; set < c->sets; set++) {
      for (way = 0; way < c->assoc; way++, i++) {
         VG_(printf)("%8x ", c->tags[i]);
      }
      VG_(printf)("\n");
   }
}
#endif


/*------------------------------------------------------------*/
/*--- Write Through Cache Simulation                       ---*/
/*------------------------------------------------------------*/

/*
 * Simple model: L1 & L2 Write Through
 * Does not distinguish between read and write references
 *
 * Simulator functions:
 *  CacheModelResult cachesim_I1_ref(Addr a, UChar size)
 *  CacheModelResult cachesim_D1_ref(Addr a, UChar size)
 */

static __inline__
CacheResult cachesim_setref(cache_t2* c, UInt set_no, UWord tag)
{
   int i, j;
   UWord *set;

   set = &(c->tags[set_no * c->assoc]);

   /* This loop is unrolled for just the first case, which is the most */
   /* common.  We can't unroll any further because it would screw up   */
   /* if we have a direct-mapped (1-way) cache.                        */
   if (tag == set[0])
      return Hit;

   /* If the tag is one other than the MRU, move it into the MRU spot  */
   /* and shuffle the rest down.                                       */
   for (i = 1; i < c->assoc; i++) {
      if (tag == set[i]) {
         for (j = i; j > 0; j--) {
            set[j] = set[j - 1];
         }
         set[0] = tag;
         return Hit;
      }
   }

   /* A miss; install this tag as MRU, shuffle rest down. */
   for (j = c->assoc - 1; j > 0; j--) {
      set[j] = set[j - 1];
   }
   set[0] = tag;

   return Miss;
}

static CacheResult cachesim_ref(cache_t2* c, Addr a, UChar size)
{
   UInt set1 = ( a         >> c->line_size_bits) & (c->sets_min_1);
   UInt set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1);
   UWord tag = a >> c->tag_shift;

   /* Access entirely within line. */
   if (set1 == set2)
      return cachesim_setref(c, set1, tag);

   /* Access straddles two lines. */
   /* Nb: this is a fast way of doing ((set1+1) % c->sets) */
   else if (((set1 + 1) & (c->sets-1)) == set2) {
      UWord tag2 = (a+size-1) >> c->tag_shift;

      /* the call updates cache structures as side effect */
      CacheResult res1 = cachesim_setref(c, set1, tag);
      CacheResult res2 = cachesim_setref(c, set2, tag2);
      return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit;

   } else {
      VG_(printf)("addr: %lx  size: %u  sets: %d %d", a, size, set1, set2);
      VG_(tool_panic)("item straddles more than two cache sets");
   }
   return Hit;
}

static
CacheModelResult cachesim_I1_ref(Addr a, UChar size)
{
   if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
   if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
   return MemAccess;
}

static
CacheModelResult cachesim_D1_ref(Addr a, UChar size)
{
   if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
   if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
   return MemAccess;
}
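#if 0
/* Illustrative sketch (not part of the simulator): the MRU/LRU update
 * done by cachesim_setref(), on an assumed 128 B, 2-way cache with a
 * single set.  All tag values are made-up examples. */
static void example_setref_lru(void)
{
   cache_t2 c;
   cachesim_initcache((cache_t) { 128, 2, 64 }, &c);

   CLG_ASSERT(cachesim_setref(&c, 0, 0x100) == Miss); /* cold miss     */
   CLG_ASSERT(cachesim_setref(&c, 0, 0x200) == Miss); /* cold miss     */
   CLG_ASSERT(cachesim_setref(&c, 0, 0x100) == Hit);  /* both cached   */
   CLG_ASSERT(cachesim_setref(&c, 0, 0x300) == Miss); /* evicts 0x200,
                                                       * the LRU entry */
   CLG_ASSERT(cachesim_setref(&c, 0, 0x200) == Miss); /* was evicted   */
}
#endif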
/*------------------------------------------------------------*/
/*--- Write Back Cache Simulation                          ---*/
/*------------------------------------------------------------*/

/*
 * More complex model: L1 Write-through, L2 Write-back
 * This needs to distinguish between read and write references.
 *
 * Simulator functions:
 *  CacheModelResult cachesim_I1_Read(Addr a, UChar size)
 *  CacheModelResult cachesim_D1_Read(Addr a, UChar size)
 *  CacheModelResult cachesim_D1_Write(Addr a, UChar size)
 */

/*
 * With write-back, the result can be a miss evicting a dirty line.
 * The dirty state of a cache line is stored in bit 0 of the tag for
 * this cache line (CACHELINE_DIRTY = 1). By OR'ing the reference
 * type (Read/Write), the line gets dirty on a write.
 */
static __inline__
CacheResult cachesim_setref_wb(cache_t2* c, RefType ref, UInt set_no, UWord tag)
{
   int i, j;
   UWord *set, tmp_tag;

   set = &(c->tags[set_no * c->assoc]);

   /* This loop is unrolled for just the first case, which is the most */
   /* common.  We can't unroll any further because it would screw up   */
   /* if we have a direct-mapped (1-way) cache.                        */
   if (tag == (set[0] & ~CACHELINE_DIRTY)) {
      set[0] |= ref;
      return Hit;
   }
   /* If the tag is one other than the MRU, move it into the MRU spot  */
   /* and shuffle the rest down.                                       */
   for (i = 1; i < c->assoc; i++) {
      if (tag == (set[i] & ~CACHELINE_DIRTY)) {
         tmp_tag = set[i] | ref; // update dirty flag
         for (j = i; j > 0; j--) {
            set[j] = set[j - 1];
         }
         set[0] = tmp_tag;
         return Hit;
      }
   }

   /* A miss; install this tag as MRU, shuffle rest down. */
   tmp_tag = set[c->assoc - 1];
   for (j = c->assoc - 1; j > 0; j--) {
      set[j] = set[j - 1];
   }
   set[0] = tag | ref;

   return (tmp_tag & CACHELINE_DIRTY) ? MissDirty : Miss;
}


static __inline__
CacheResult cachesim_ref_wb(cache_t2* c, RefType ref, Addr a, UChar size)
{
   UInt set1 = ( a         >> c->line_size_bits) & (c->sets_min_1);
   UInt set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1);
   UWord tag = a & c->tag_mask;

   /* Access entirely within line. */
   if (set1 == set2)
      return cachesim_setref_wb(c, ref, set1, tag);

   /* Access straddles two lines. */
   /* Nb: this is a fast way of doing ((set1+1) % c->sets) */
   else if (((set1 + 1) & (c->sets-1)) == set2) {
      UWord tag2 = (a+size-1) & c->tag_mask;

      /* the call updates cache structures as side effect */
      CacheResult res1 = cachesim_setref_wb(c, ref, set1, tag);
      CacheResult res2 = cachesim_setref_wb(c, ref, set2, tag2);

      if ((res1 == MissDirty) || (res2 == MissDirty)) return MissDirty;
      return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit;

   } else {
      VG_(printf)("addr: %lx  size: %u  sets: %d %d", a, size, set1, set2);
      VG_(tool_panic)("item straddles more than two cache sets");
   }
   return Hit;
}


static
CacheModelResult cachesim_I1_Read(Addr a, UChar size)
{
   if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
   switch( cachesim_ref_wb( &L2, Read, a, size) ) {
      case Hit:  return L2_Hit;
      case Miss: return MemAccess;
      default: break;
   }
   return WriteBackMemAccess;
}

static
CacheModelResult cachesim_D1_Read(Addr a, UChar size)
{
   if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
   switch( cachesim_ref_wb( &L2, Read, a, size) ) {
      case Hit:  return L2_Hit;
      case Miss: return MemAccess;
      default: break;
   }
   return WriteBackMemAccess;
}

static
CacheModelResult cachesim_D1_Write(Addr a, UChar size)
{
   if ( cachesim_ref( &D1, a, size) == Hit ) {
      /* Even for a L1 hit, the write-through L1 passes
       * the write to the L2 to make the L2 line dirty.
       * But this causes no latency, so return the hit.
       */
      cachesim_ref_wb( &L2, Write, a, size);
      return L1_Hit;
   }
   switch( cachesim_ref_wb( &L2, Write, a, size) ) {
      case Hit:  return L2_Hit;
      case Miss: return MemAccess;
      default: break;
   }
   return WriteBackMemAccess;
}
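#if 0
/* Illustrative sketch (not part of the simulator): the dirty flag lives
 * in bit 0 of a stored tag, which is free because line sizes >= 16 zero
 * the low tag bits.  The tag value is a made-up example. */
static void example_dirty_tag(void)
{
   UWord stored = 0x4000;                 /* clean line                */
   stored |= Write;                       /* a write marks it dirty    */
   CLG_ASSERT((stored & ~CACHELINE_DIRTY) == 0x4000); /* tag unchanged */
   CLG_ASSERT((stored & CACHELINE_DIRTY) == CACHELINE_DIRTY);
   stored |= Read;                        /* Read == 0: no effect      */
   CLG_ASSERT(stored == (0x4000 | CACHELINE_DIRTY));
}
#endif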
/*------------------------------------------------------------*/
/*--- Hardware Prefetch Simulation                         ---*/
/*------------------------------------------------------------*/

static ULong prefetch_up = 0;
static ULong prefetch_down = 0;

#define PF_STREAMS  8
#define PF_PAGEBITS 12

static UInt pf_lastblock[PF_STREAMS];
static Int  pf_seqblocks[PF_STREAMS];

static
void prefetch_clear(void)
{
   int i;
   for(i=0;i<PF_STREAMS;i++)
      pf_lastblock[i] = pf_seqblocks[i] = 0;
}

/*
 * HW Prefetch emulation
 * Start prefetching when detecting sequential access to 3 memory blocks.
 * One stream can be detected per 4k page.
 */
static __inline__
void prefetch_L2_doref(Addr a)
{
   UInt stream = (a >> PF_PAGEBITS) % PF_STREAMS;
   UInt block = ( a >> L2.line_size_bits);

   if (block != pf_lastblock[stream]) {
      if (pf_seqblocks[stream] == 0) {
         if (pf_lastblock[stream] +1 == block) pf_seqblocks[stream]++;
         else if (pf_lastblock[stream] -1 == block) pf_seqblocks[stream]--;
      }
      else if (pf_seqblocks[stream] >0) {
         if (pf_lastblock[stream] +1 == block) {
            pf_seqblocks[stream]++;
            if (pf_seqblocks[stream] >= 2) {
               prefetch_up++;
               cachesim_ref(&L2, a + 5 * L2.line_size,1);
            }
         }
         else pf_seqblocks[stream] = 0;
      }
      else if (pf_seqblocks[stream] <0) {
         if (pf_lastblock[stream] -1 == block) {
            pf_seqblocks[stream]--;
            if (pf_seqblocks[stream] <= -2) {
               prefetch_down++;
               cachesim_ref(&L2, a - 5 * L2.line_size,1);
            }
         }
         else pf_seqblocks[stream] = 0;
      }
      pf_lastblock[stream] = block;
   }
}
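#if 0
/* Illustrative sketch (not part of the simulator): an upward stream of
 * three consecutive L2 blocks triggers a prefetch.  The address and the
 * line size (64 B assumed) are made-up example values. */
static void example_prefetch_trigger(void)
{
   Addr a = 0x10000;            /* start of some 4k page               */
   prefetch_clear();
   prefetch_L2_doref(a);        /* first touch: seqblocks stays 0      */
   prefetch_L2_doref(a + 64);   /* next block:  seqblocks becomes 1    */
   prefetch_L2_doref(a + 128);  /* third block: seqblocks reaches 2,
                                 * so the line 5 blocks ahead is
                                 * fetched into the L2 (prefetch_up++) */
}
#endif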
/* simple model with hardware prefetch */

static
CacheModelResult prefetch_I1_ref(Addr a, UChar size)
{
   if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
   prefetch_L2_doref(a);
   if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
   return MemAccess;
}

static
CacheModelResult prefetch_D1_ref(Addr a, UChar size)
{
   if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
   prefetch_L2_doref(a);
   if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
   return MemAccess;
}


/* complex model with hardware prefetch */

static
CacheModelResult prefetch_I1_Read(Addr a, UChar size)
{
   if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
   prefetch_L2_doref(a);
   switch( cachesim_ref_wb( &L2, Read, a, size) ) {
      case Hit:  return L2_Hit;
      case Miss: return MemAccess;
      default: break;
   }
   return WriteBackMemAccess;
}

static
CacheModelResult prefetch_D1_Read(Addr a, UChar size)
{
   if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
   prefetch_L2_doref(a);
   switch( cachesim_ref_wb( &L2, Read, a, size) ) {
      case Hit:  return L2_Hit;
      case Miss: return MemAccess;
      default: break;
   }
   return WriteBackMemAccess;
}

static
CacheModelResult prefetch_D1_Write(Addr a, UChar size)
{
   prefetch_L2_doref(a);
   if ( cachesim_ref( &D1, a, size) == Hit ) {
      /* Even for a L1 hit, the write-through L1 passes
       * the write to the L2 to make the L2 line dirty.
       * But this causes no latency, so return the hit.
       */
      cachesim_ref_wb( &L2, Write, a, size);
      return L1_Hit;
   }
   switch( cachesim_ref_wb( &L2, Write, a, size) ) {
      case Hit:  return L2_Hit;
      case Miss: return MemAccess;
      default: break;
   }
   return WriteBackMemAccess;
}


/*------------------------------------------------------------*/
/*--- Cache Simulation with use metric collection          ---*/
/*------------------------------------------------------------*/

/* cannot be combined with write-back or prefetch */

static
void cacheuse_initcache(cache_t2* c)
{
   int i;
   unsigned int start_mask, start_val;
   unsigned int end_mask, end_val;

   c->use    = CLG_MALLOC("cl.sim.cu_ic.1",
                          sizeof(line_use) * c->sets * c->assoc);
   c->loaded = CLG_MALLOC("cl.sim.cu_ic.2",
                          sizeof(line_loaded) * c->sets * c->assoc);
   c->line_start_mask = CLG_MALLOC("cl.sim.cu_ic.3",
                                   sizeof(int) * c->line_size);
   c->line_end_mask = CLG_MALLOC("cl.sim.cu_ic.4",
                                 sizeof(int) * c->line_size);

   c->line_size_mask = c->line_size-1;

   /* Meaning of line_start_mask/line_end_mask
    * Example: for a given cache line, you get an access starting at
    * byte offset 5 with length 4, i.e. bytes 5 - 8 are touched. For a
    * cache line size of 32, you have 1 bit per byte in the mask:
    *
    *   bit31   bit8 bit5  bit 0
    *       |      |  |    |
    *       11..111111100000   line_start_mask[5]
    *       00..000111111111   line_end_mask[(5+4)-1]
    *
    *  use_mask |= line_start_mask[5] & line_end_mask[8]
    *
    */
   start_val = end_val = ~0;
   if (c->line_size < 32) {
      int bits_per_byte = 32/c->line_size;
      start_mask = (1<<bits_per_byte)-1;
      end_mask   = start_mask << (32-bits_per_byte);
      for(i=0;i<c->line_size;i++) {
         c->line_start_mask[i] = start_val;
         start_val  = start_val & ~start_mask;
         start_mask = start_mask << bits_per_byte;

         c->line_end_mask[c->line_size-i-1] = end_val;
         end_val  = end_val & ~end_mask;
         end_mask = end_mask >> bits_per_byte;
      }
   }
   else {
      int bytes_per_bit = c->line_size/32;
      start_mask = 1;
      end_mask   = 1 << 31;
      for(i=0;i<c->line_size;i++) {
         c->line_start_mask[i] = start_val;
         c->line_end_mask[c->line_size-i-1] = end_val;
         if ( ((i+1)%bytes_per_bit) == 0) {
            start_val   &= ~start_mask;
            end_val     &= ~end_mask;
            start_mask <<= 1;
            end_mask   >>= 1;
         }
      }
   }

   CLG_DEBUG(6, "Config %s:\n", c->desc_line);
   for(i=0;i<c->line_size;i++) {
      CLG_DEBUG(6, " [%2d]: start mask %8x, end mask %8x\n",
                i, c->line_start_mask[i], c->line_end_mask[i]);
   }

   /* We use the lower tag bits as offset pointers into the cache use
    * info, i.e. not every cache configuration is supported.
    */
   if ( (1<<c->tag_shift) < c->assoc) {
      VG_(message)(Vg_DebugMsg,
                   "error: Use associativity < %d for cache use statistics!",
                   (1<<c->tag_shift) );
      VG_(tool_panic)("Unsupported cache configuration");
   }
}
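#if 0
/* Illustrative sketch (not part of the simulator): the mask lookup for
 * an assumed 64 B line, where each of the 32 mask bits covers 2 bytes.
 * The mask constants below were derived by hand from the loops above. */
static void example_use_mask(void)
{
   /* line_start_mask[6] == 0xFFFFFFF8: bits 3..31, i.e. bytes >= 6 */
   /* line_end_mask[9]   == 0x0000001F: bits 0..4,  i.e. bytes <= 9 */
   UInt use_mask = 0xFFFFFFF8 & 0x0000001F;
   CLG_ASSERT(use_mask == 0x18);  /* bits 3 and 4: bytes 6..9 touched */
}
#endif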
/* for I1/D1 caches */
#define CACHEUSE(L) \
 \
static CacheModelResult cacheuse##_##L##_doRead(Addr a, UChar size) \
{ \
   UInt set1 = ( a         >> L.line_size_bits) & (L.sets_min_1); \
   UInt set2 = ((a+size-1) >> L.line_size_bits) & (L.sets_min_1); \
   UWord tag  = a & L.tag_mask; \
   UWord tag2; \
   int i, j, idx; \
   UWord *set, tmp_tag; \
   UInt use_mask; \
 \
   CLG_DEBUG(6,"%s.Acc(Addr %#lx, size %d): Sets [%d/%d]\n", \
             L.name, a, size, set1, set2); \
 \
   /* First case: word entirely within line. */ \
   if (set1 == set2) { \
 \
      set = &(L.tags[set1 * L.assoc]); \
      use_mask = L.line_start_mask[a & L.line_size_mask] & \
                 L.line_end_mask[(a+size-1) & L.line_size_mask]; \
 \
      /* This loop is unrolled for just the first case, which is the most */ \
      /* common.  We can't unroll any further because it would screw up   */ \
      /* if we have a direct-mapped (1-way) cache.                        */ \
      if (tag == (set[0] & L.tag_mask)) { \
         idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask); \
         L.use[idx].count ++; \
         L.use[idx].mask |= use_mask; \
         CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n", \
                   idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
                   use_mask, L.use[idx].mask, L.use[idx].count); \
         return L1_Hit; \
      } \
      /* If the tag is one other than the MRU, move it into the MRU spot */ \
      /* and shuffle the rest down.                                      */ \
      for (i = 1; i < L.assoc; i++) { \
         if (tag == (set[i] & L.tag_mask)) { \
            tmp_tag = set[i]; \
            for (j = i; j > 0; j--) { \
               set[j] = set[j - 1]; \
            } \
            set[0] = tmp_tag; \
            idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask); \
            L.use[idx].count ++; \
            L.use[idx].mask |= use_mask; \
            CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n", \
                      i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
                      use_mask, L.use[idx].mask, L.use[idx].count); \
            return L1_Hit; \
         } \
      } \
 \
      /* A miss; install this tag as MRU, shuffle rest down. */ \
      tmp_tag = set[L.assoc - 1] & ~L.tag_mask; \
      for (j = L.assoc - 1; j > 0; j--) { \
         set[j] = set[j - 1]; \
      } \
      set[0] = tag | tmp_tag; \
      idx = (set1 * L.assoc) + tmp_tag; \
      return update_##L##_use(&L, idx, \
                              use_mask, a & ~L.line_size_mask); \
   /* Second case: word straddles two lines. */ \
   /* Nb: this is a fast way of doing ((set1+1) % L.sets) */ \
   } else if (((set1 + 1) & (L.sets-1)) == set2) { \
      Int miss1=0, miss2=0; /* 0: L1 hit, 1: L1 miss, 2: L2 miss */ \
      set = &(L.tags[set1 * L.assoc]); \
      use_mask = L.line_start_mask[a & L.line_size_mask]; \
      if (tag == (set[0] & L.tag_mask)) { \
         idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask); \
         L.use[idx].count ++; \
         L.use[idx].mask |= use_mask; \
         CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n", \
                   idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
                   use_mask, L.use[idx].mask, L.use[idx].count); \
         goto block2; \
      } \
      for (i = 1; i < L.assoc; i++) { \
         if (tag == (set[i] & L.tag_mask)) { \
            tmp_tag = set[i]; \
            for (j = i; j > 0; j--) { \
               set[j] = set[j - 1]; \
            } \
            set[0] = tmp_tag; \
            idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask); \
            L.use[idx].count ++; \
            L.use[idx].mask |= use_mask; \
            CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n", \
                      i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
                      use_mask, L.use[idx].mask, L.use[idx].count); \
            goto block2; \
         } \
      } \
      tmp_tag = set[L.assoc - 1] & ~L.tag_mask; \
      for (j = L.assoc - 1; j > 0; j--) { \
         set[j] = set[j - 1]; \
      } \
      set[0] = tag | tmp_tag; \
      idx = (set1 * L.assoc) + tmp_tag; \
      miss1 = update_##L##_use(&L, idx, \
                               use_mask, a & ~L.line_size_mask); \
block2: \
      set = &(L.tags[set2 * L.assoc]); \
      use_mask = L.line_end_mask[(a+size-1) & L.line_size_mask]; \
      tag2 = (a+size-1) & L.tag_mask; \
      if (tag2 == (set[0] & L.tag_mask)) { \
         idx = (set2 * L.assoc) + (set[0] & ~L.tag_mask); \
         L.use[idx].count ++; \
         L.use[idx].mask |= use_mask; \
         CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n", \
                   idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
                   use_mask, L.use[idx].mask, L.use[idx].count); \
         return miss1; \
      } \
      for (i = 1; i < L.assoc; i++) { \
         if (tag2 == (set[i] & L.tag_mask)) { \
            tmp_tag = set[i]; \
            for (j = i; j > 0; j--) { \
               set[j] = set[j - 1]; \
            } \
            set[0] = tmp_tag; \
            idx = (set2 * L.assoc) + (tmp_tag & ~L.tag_mask); \
            L.use[idx].count ++; \
            L.use[idx].mask |= use_mask; \
            CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n", \
                      i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr, \
                      use_mask, L.use[idx].mask, L.use[idx].count); \
            return miss1; \
         } \
      } \
      tmp_tag = set[L.assoc - 1] & ~L.tag_mask; \
      for (j = L.assoc - 1; j > 0; j--) { \
         set[j] = set[j - 1]; \
      } \
      set[0] = tag2 | tmp_tag; \
      idx = (set2 * L.assoc) + tmp_tag; \
      miss2 = update_##L##_use(&L, idx, \
                               use_mask, (a+size-1) & ~L.line_size_mask); \
      return (miss1==MemAccess || miss2==MemAccess) ? MemAccess : L2_Hit; \
 \
   } else { \
      VG_(printf)("addr: %#lx  size: %u  sets: %d %d", a, size, set1, set2); \
      VG_(tool_panic)("item straddles more than two cache sets"); \
   } \
   return 0; \
}
/* logarithmic bitcounting algorithm, see
 * http://graphics.stanford.edu/~seander/bithacks.html
 */
static __inline__ unsigned int countBits(unsigned int bits)
{
   unsigned int c; // store the total here
   const int S[] = {1, 2, 4, 8, 16}; // Magic Binary Numbers
   const int B[] = {0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF, 0x0000FFFF};

   c = bits;
   c = ((c >> S[0]) & B[0]) + (c & B[0]);
   c = ((c >> S[1]) & B[1]) + (c & B[1]);
   c = ((c >> S[2]) & B[2]) + (c & B[2]);
   c = ((c >> S[3]) & B[3]) + (c & B[3]);
   c = ((c >> S[4]) & B[4]) + (c & B[4]);
   return c;
}
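#if 0
/* Illustrative sketch (not part of the simulator): countBits() feeds
 * the spatial-loss metric used below.  Mask and line size (64 B) are
 * made-up example values. */
static void example_spatial_loss(void)
{
   UInt mask = 0x0000FFFF;              /* lower half of the line used */
   CLG_ASSERT(countBits(mask) == 16);
   /* lost bytes = ((32 - used bits) * line_size) >> 5 */
   CLG_ASSERT((((32 - countBits(mask)) * 64) >> 5) == 32);
}
#endif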
static void update_L2_use(int idx, Addr memline)
{
   line_loaded* loaded = &(L2.loaded[idx]);
   line_use* use = &(L2.use[idx]);
   int i = ((32 - countBits(use->mask)) * L2.line_size)>>5;

   CLG_DEBUG(2, " L2.miss [%d]: at %#lx accessing memline %#lx\n",
             idx, bb_base + current_ii->instr_offset, memline);
   if (use->count>0) {
      CLG_DEBUG(2, "   old: used %d, loss bits %d (%08x) [line %#lx from %#lx]\n",
                use->count, i, use->mask, loaded->memline, loaded->iaddr);
      CLG_DEBUG(2, "   collect: %d, use_base %p\n",
                CLG_(current_state).collect, loaded->use_base);

      if (CLG_(current_state).collect && loaded->use_base) {
         (loaded->use_base)[off_L2_AcCost] += 1000 / use->count;
         (loaded->use_base)[off_L2_SpLoss] += i;
      }
   }

   use->count = 0;
   use->mask  = 0;

   loaded->memline = memline;
   loaded->iaddr   = bb_base + current_ii->instr_offset;
   loaded->use_base = (CLG_(current_state).nonskipped) ?
                      CLG_(current_state).nonskipped->skipped :
                      cost_base + current_ii->cost_offset;
}

static
CacheModelResult cacheuse_L2_access(Addr memline, line_loaded* l1_loaded)
{
   UInt setNo = (memline >> L2.line_size_bits) & (L2.sets_min_1);
   UWord* set = &(L2.tags[setNo * L2.assoc]);
   UWord tag  = memline & L2.tag_mask;

   int i, j, idx;
   UWord tmp_tag;

   CLG_DEBUG(6,"L2.Acc(Memline %#lx): Set %d\n", memline, setNo);

   if (tag == (set[0] & L2.tag_mask)) {
      idx = (setNo * L2.assoc) + (set[0] & ~L2.tag_mask);
      l1_loaded->dep_use = &(L2.use[idx]);

      CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): => %08x, count %d\n",
                idx, L2.loaded[idx].memline, L2.loaded[idx].iaddr,
                L2.use[idx].mask, L2.use[idx].count);
      return L2_Hit;
   }
   for (i = 1; i < L2.assoc; i++) {
      if (tag == (set[i] & L2.tag_mask)) {
         tmp_tag = set[i];
         for (j = i; j > 0; j--) {
            set[j] = set[j - 1];
         }
         set[0] = tmp_tag;
         idx = (setNo * L2.assoc) + (tmp_tag & ~L2.tag_mask);
         l1_loaded->dep_use = &(L2.use[idx]);

         CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): => %08x, count %d\n",
                   i, idx, L2.loaded[idx].memline, L2.loaded[idx].iaddr,
                   L2.use[idx].mask, L2.use[idx].count);
         return L2_Hit;
      }
   }

   /* A miss; install this tag as MRU, shuffle rest down. */
   tmp_tag = set[L2.assoc - 1] & ~L2.tag_mask;
   for (j = L2.assoc - 1; j > 0; j--) {
      set[j] = set[j - 1];
   }
   set[0] = tag | tmp_tag;
   idx = (setNo * L2.assoc) + tmp_tag;
   l1_loaded->dep_use = &(L2.use[idx]);

   update_L2_use(idx, memline);

   return MemAccess;
}




#define UPDATE_USE(L) \
 \
static CacheModelResult update##_##L##_use(cache_t2* cache, int idx, \
                                           UInt mask, Addr memline) \
{ \
   line_loaded* loaded = &(cache->loaded[idx]); \
   line_use* use = &(cache->use[idx]); \
   int c = ((32 - countBits(use->mask)) * cache->line_size)>>5; \
 \
   CLG_DEBUG(2, " %s.miss [%d]: at %#lx accessing memline %#lx (mask %08x)\n", \
             cache->name, idx, bb_base + current_ii->instr_offset, memline, mask); \
   if (use->count>0) { \
      CLG_DEBUG(2, "   old: used %d, loss bits %d (%08x) [line %#lx from %#lx]\n", \
                use->count, c, use->mask, loaded->memline, loaded->iaddr); \
      CLG_DEBUG(2, "   collect: %d, use_base %p\n", \
                CLG_(current_state).collect, loaded->use_base); \
 \
      if (CLG_(current_state).collect && loaded->use_base) { \
         (loaded->use_base)[off_##L##_AcCost] += 1000 / use->count; \
         (loaded->use_base)[off_##L##_SpLoss] += c; \
 \
         /* FIXME (?): L1/L2 line sizes must be equal! */ \
         loaded->dep_use->mask |= use->mask; \
         loaded->dep_use->count += use->count; \
      } \
   } \
 \
   use->count = 1; \
   use->mask  = mask; \
   loaded->memline = memline; \
   loaded->iaddr   = bb_base + current_ii->instr_offset; \
   loaded->use_base = (CLG_(current_state).nonskipped) ? \
                      CLG_(current_state).nonskipped->skipped : \
                      cost_base + current_ii->cost_offset; \
 \
   if (memline == 0) return L2_Hit; \
   return cacheuse_L2_access(memline, loaded); \
}

UPDATE_USE(I1);
UPDATE_USE(D1);

CACHEUSE(I1);
CACHEUSE(D1);


static
void cacheuse_finish(void)
{
   int i;
   InstrInfo ii = { 0,0,0,0,0 };

   if (!CLG_(current_state).collect) return;

   bb_base    = 0;
   current_ii = &ii;
   cost_base  = 0;

   /* update usage counters */
   if (I1.use)
      for (i = 0; i < I1.sets * I1.assoc; i++)
         if (I1.loaded[i].use_base)
            update_I1_use( &I1, i, 0,0);

   if (D1.use)
      for (i = 0; i < D1.sets * D1.assoc; i++)
         if (D1.loaded[i].use_base)
            update_D1_use( &D1, i, 0,0);

   if (L2.use)
      for (i = 0; i < L2.sets * L2.assoc; i++)
         if (L2.loaded[i].use_base)
            update_L2_use(i, 0);
}



/*------------------------------------------------------------*/
/*--- Helper functions called by instrumented code         ---*/
/*------------------------------------------------------------*/


static __inline__
void inc_costs(CacheModelResult r, ULong* c1, ULong* c2)
{
   switch(r) {
   case WriteBackMemAccess:
      if (clo_simulate_writeback) {
         c1[3]++;
         c2[3]++;
      }
      // fall through

   case MemAccess:
      c1[2]++;
      c2[2]++;
      // fall through

   case L2_Hit:
      c1[1]++;
      c2[1]++;
      // fall through

   default:
      c1[0]++;
      c2[0]++;
   }
}
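#if 0
/* Illustrative sketch (not part of the simulator): the fall-through in
 * inc_costs() turns one hierarchical result into per-level counts.
 * Cost layout assumed: [accesses, L1 misses, L2 misses, write-backs]. */
static void example_inc_costs(void)
{
   ULong c1[4] = {0,0,0,0}, c2[4] = {0,0,0,0};
   inc_costs(L1_Hit,    c1, c2);  /* c1 = {1,0,0,0}: access only     */
   inc_costs(MemAccess, c1, c2);  /* c1 = {2,1,1,0}: counted at every
                                   * level it passed through         */
   CLG_ASSERT(c1[0] == 2 && c1[1] == 1 && c1[2] == 1 && c1[3] == 0);
}
#endif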
VG_REGPARM(1)
static void log_1I0D(InstrInfo* ii)
{
   CacheModelResult IrRes;

   current_ii = ii;
   IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size);

   CLG_DEBUG(6, "log_1I0D:  Ir=%#lx/%u => Ir %d\n",
             bb_base + ii->instr_offset, ii->instr_size, IrRes);

   if (CLG_(current_state).collect) {
      ULong* cost_Ir;

      if (CLG_(current_state).nonskipped)
         cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Ir;
      else
         cost_Ir = cost_base + ii->cost_offset + off_D0_Ir;

      inc_costs(IrRes, cost_Ir,
                CLG_(current_state).cost + CLG_(sets).off_full_Ir );
   }
}


/* Instruction doing a read access */

VG_REGPARM(2)
static void log_1I1Dr(InstrInfo* ii, Addr data)
{
   CacheModelResult IrRes, DrRes;

   current_ii = ii;
   IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size);
   DrRes = (*simulator.D1_Read)(data, ii->data_size);

   CLG_DEBUG(6, "log_1I1Dr: Ir=%#lx/%u, Dr=%#lx/%u => Ir %d, Dr %d\n",
             bb_base + ii->instr_offset, ii->instr_size,
             data, ii->data_size, IrRes, DrRes);

   if (CLG_(current_state).collect) {
      ULong *cost_Ir, *cost_Dr;

      if (CLG_(current_state).nonskipped) {
         cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Ir;
         cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Dr;
      }
      else {
         cost_Ir = cost_base + ii->cost_offset + off_D1r_Ir;
         cost_Dr = cost_base + ii->cost_offset + off_D1r_Dr;
      }

      inc_costs(IrRes, cost_Ir,
                CLG_(current_state).cost + CLG_(sets).off_full_Ir );
      inc_costs(DrRes, cost_Dr,
                CLG_(current_state).cost + CLG_(sets).off_full_Dr );
   }
}


VG_REGPARM(2)
static void log_0I1Dr(InstrInfo* ii, Addr data)
{
   CacheModelResult DrRes;

   current_ii = ii;
   DrRes = (*simulator.D1_Read)(data, ii->data_size);

   CLG_DEBUG(6, "log_0I1Dr: Dr=%#lx/%u => Dr %d\n",
             data, ii->data_size, DrRes);

   if (CLG_(current_state).collect) {
      ULong *cost_Dr;

      if (CLG_(current_state).nonskipped) {
         cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Dr;
      }
      else {
         cost_Dr = cost_base + ii->cost_offset + off_D1r_Dr;
      }

      inc_costs(DrRes, cost_Dr,
                CLG_(current_state).cost + CLG_(sets).off_full_Dr );
   }
}
/* Instruction doing a write access */

VG_REGPARM(2)
static void log_1I1Dw(InstrInfo* ii, Addr data)
{
   CacheModelResult IrRes, DwRes;

   current_ii = ii;
   IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size);
   DwRes = (*simulator.D1_Write)(data, ii->data_size);

   CLG_DEBUG(6, "log_1I1Dw: Ir=%#lx/%u, Dw=%#lx/%u => Ir %d, Dw %d\n",
             bb_base + ii->instr_offset, ii->instr_size,
             data, ii->data_size, IrRes, DwRes);

   if (CLG_(current_state).collect) {
      ULong *cost_Ir, *cost_Dw;

      if (CLG_(current_state).nonskipped) {
         cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Ir;
         cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dw;
      }
      else {
         cost_Ir = cost_base + ii->cost_offset + off_D1w_Ir;
         cost_Dw = cost_base + ii->cost_offset + off_D1w_Dw;
      }

      inc_costs(IrRes, cost_Ir,
                CLG_(current_state).cost + CLG_(sets).off_full_Ir );
      inc_costs(DwRes, cost_Dw,
                CLG_(current_state).cost + CLG_(sets).off_full_Dw );
   }
}

VG_REGPARM(2)
static void log_0I1Dw(InstrInfo* ii, Addr data)
{
   CacheModelResult DwRes;

   current_ii = ii;
   DwRes = (*simulator.D1_Write)(data, ii->data_size);

   CLG_DEBUG(6, "log_0I1Dw: Dw=%#lx/%u => Dw %d\n",
             data, ii->data_size, DwRes);

   if (CLG_(current_state).collect) {
      ULong *cost_Dw;

      if (CLG_(current_state).nonskipped) {
         cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Dw;
      }
      else {
         cost_Dw = cost_base + ii->cost_offset + off_D1w_Dw;
      }

      inc_costs(DwRes, cost_Dw,
                CLG_(current_state).cost + CLG_(sets).off_full_Dw );
   }
}

/* Instruction doing a read and a write access */

VG_REGPARM(3)
static void log_1I2D(InstrInfo* ii, Addr data1, Addr data2)
{
   CacheModelResult IrRes, DrRes, DwRes;

   current_ii = ii;
   IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size);
   DrRes = (*simulator.D1_Read)(data1, ii->data_size);
   DwRes = (*simulator.D1_Write)(data2, ii->data_size);

   CLG_DEBUG(6,
             "log_1I2D: Ir=%#lx/%u, Dr=%#lx/%u, Dw=%#lx/%u => Ir %d, Dr %d, Dw %d\n",
             bb_base + ii->instr_offset, ii->instr_size,
             data1, ii->data_size, data2, ii->data_size, IrRes, DrRes, DwRes);

   if (CLG_(current_state).collect) {
      ULong *cost_Ir, *cost_Dr, *cost_Dw;

      if (CLG_(current_state).nonskipped) {
         cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Ir;
         cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dr;
         cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dw;
      }
      else {
         cost_Ir = cost_base + ii->cost_offset + off_D2_Ir;
         cost_Dr = cost_base + ii->cost_offset + off_D2_Dr;
         cost_Dw = cost_base + ii->cost_offset + off_D2_Dw;
      }

      inc_costs(IrRes, cost_Ir,
                CLG_(current_state).cost + CLG_(sets).off_full_Ir );
      inc_costs(DrRes, cost_Dr,
                CLG_(current_state).cost + CLG_(sets).off_full_Dr );
      inc_costs(DwRes, cost_Dw,
                CLG_(current_state).cost + CLG_(sets).off_full_Dw );
   }
}

VG_REGPARM(3)
static void log_0I2D(InstrInfo* ii, Addr data1, Addr data2)
{
   CacheModelResult DrRes, DwRes;

   current_ii = ii;
   DrRes = (*simulator.D1_Read)(data1, ii->data_size);
   DwRes = (*simulator.D1_Write)(data2, ii->data_size);

   CLG_DEBUG(6,
             "log_0I2D: Dr=%#lx/%u, Dw=%#lx/%u => Dr %d, Dw %d\n",
             data1, ii->data_size, data2, ii->data_size, DrRes, DwRes);

   if (CLG_(current_state).collect) {
      ULong *cost_Dr, *cost_Dw;

      if (CLG_(current_state).nonskipped) {
         cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dr;
         cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dw;
      }
      else {
         cost_Dr = cost_base + ii->cost_offset + off_D2_Dr;
         cost_Dw = cost_base + ii->cost_offset + off_D2_Dw;
      }

      inc_costs(DrRes, cost_Dr,
                CLG_(current_state).cost + CLG_(sets).off_full_Dr );
      inc_costs(DwRes, cost_Dw,
                CLG_(current_state).cost + CLG_(sets).off_full_Dw );
   }
}


/*------------------------------------------------------------*/
/*--- Cache configuration                                  ---*/
/*------------------------------------------------------------*/

#define UNDEFINED_CACHE     ((cache_t) { -1, -1, -1 })

static cache_t clo_I1_cache = UNDEFINED_CACHE;
static cache_t clo_D1_cache = UNDEFINED_CACHE;
static cache_t clo_L2_cache = UNDEFINED_CACHE;


/* Checks cache config is ok; aborts if not. */
static
void check_cache(cache_t* cache, Char *name)
{
   /* Simulator requires line size and set count to be powers of two */
   if (( cache->size % (cache->line_size * cache->assoc) != 0) ||
       (-1 == VG_(log2)(cache->size/cache->line_size/cache->assoc))) {
      VG_(message)(Vg_UserMsg,
                   "error: %s set count not a power of two; aborting.",
                   name);
      VG_(exit)(1);
   }

   if (-1 == VG_(log2)(cache->line_size)) {
      VG_(message)(Vg_UserMsg,
                   "error: %s line size of %dB not a power of two; aborting.",
                   name, cache->line_size);
      VG_(exit)(1);
   }

   // Then check line size >= 16 -- any smaller and a single instruction could
   // straddle three cache lines, which breaks a simulation assertion and is
   // stupid anyway.
   if (cache->line_size < MIN_LINE_SIZE) {
      VG_(message)(Vg_UserMsg,
                   "error: %s line size of %dB too small; aborting.",
                   name, cache->line_size);
      VG_(exit)(1);
   }

   /* Then check cache size > line size (causes seg faults if not). */
   if (cache->size <= cache->line_size) {
      VG_(message)(Vg_UserMsg,
                   "error: %s cache size of %dB <= line size of %dB; aborting.",
                   name, cache->size, cache->line_size);
      VG_(exit)(1);
   }

   /* Then check assoc <= (size / line size) (seg faults otherwise). */
   if (cache->assoc > (cache->size / cache->line_size)) {
      VG_(message)(Vg_UserMsg,
                   "error: %s associativity > (size / line size); aborting.", name);
      VG_(exit)(1);
   }
}

static
void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* L2c)
{
#define DEFINED(L)   (-1 != L.size  || -1 != L.assoc || -1 != L.line_size)

   Int n_clos = 0;

   // Count how many were defined on the command line.
   if (DEFINED(clo_I1_cache)) { n_clos++; }
   if (DEFINED(clo_D1_cache)) { n_clos++; }
   if (DEFINED(clo_L2_cache)) { n_clos++; }

   // Set the cache config (using auto-detection, if supported by the
   // architecture).
   VG_(configure_caches)( I1c, D1c, L2c, (3 == n_clos) );

   // Then replace with any defined on the command line.
   if (DEFINED(clo_I1_cache)) { *I1c = clo_I1_cache; }
   if (DEFINED(clo_D1_cache)) { *D1c = clo_D1_cache; }
   if (DEFINED(clo_L2_cache)) { *L2c = clo_L2_cache; }

   // Then check values and abort if not acceptable.
   check_cache(I1c, "I1");
   check_cache(D1c, "D1");
   check_cache(L2c, "L2");

   if (VG_(clo_verbosity) > 1) {
      VG_(message)(Vg_UserMsg, "Cache configuration used:");
      VG_(message)(Vg_UserMsg, "  I1: %dB, %d-way, %dB lines",
                   I1c->size, I1c->assoc, I1c->line_size);
      VG_(message)(Vg_UserMsg, "  D1: %dB, %d-way, %dB lines",
                   D1c->size, D1c->assoc, D1c->line_size);
      VG_(message)(Vg_UserMsg, "  L2: %dB, %d-way, %dB lines",
                   L2c->size, L2c->assoc, L2c->line_size);
   }
#undef DEFINED
}
/* Initialize and clear simulator state */
static void cachesim_post_clo_init(void)
{
   /* Cache configurations. */
   cache_t I1c, D1c, L2c;

   /* Initialize access handlers */
   if (!CLG_(clo).simulate_cache) {
      CLG_(cachesim).log_1I0D  = 0;
      CLG_(cachesim).log_1I0D_name = "(no function)";

      CLG_(cachesim).log_1I1Dr = 0;
      CLG_(cachesim).log_1I1Dw = 0;
      CLG_(cachesim).log_1I2D  = 0;
      CLG_(cachesim).log_1I1Dr_name = "(no function)";
      CLG_(cachesim).log_1I1Dw_name = "(no function)";
      CLG_(cachesim).log_1I2D_name = "(no function)";

      CLG_(cachesim).log_0I1Dr = 0;
      CLG_(cachesim).log_0I1Dw = 0;
      CLG_(cachesim).log_0I2D  = 0;
      CLG_(cachesim).log_0I1Dr_name = "(no function)";
      CLG_(cachesim).log_0I1Dw_name = "(no function)";
      CLG_(cachesim).log_0I2D_name = "(no function)";
      return;
   }

   /* Configuration of caches only needed with real cache simulation */
   configure_caches(&I1c, &D1c, &L2c);

   I1.name = "I1";
   D1.name = "D1";
   L2.name = "L2";

   cachesim_initcache(I1c, &I1);
   cachesim_initcache(D1c, &D1);
   cachesim_initcache(L2c, &L2);

   /* the other cache simulators use the standard helpers
    * with dispatching via simulator struct */

   CLG_(cachesim).log_1I0D  = log_1I0D;
   CLG_(cachesim).log_1I0D_name  = "log_1I0D";

   CLG_(cachesim).log_1I1Dr = log_1I1Dr;
   CLG_(cachesim).log_1I1Dw = log_1I1Dw;
   CLG_(cachesim).log_1I2D  = log_1I2D;
   CLG_(cachesim).log_1I1Dr_name = "log_1I1Dr";
   CLG_(cachesim).log_1I1Dw_name = "log_1I1Dw";
   CLG_(cachesim).log_1I2D_name  = "log_1I2D";

   CLG_(cachesim).log_0I1Dr = log_0I1Dr;
   CLG_(cachesim).log_0I1Dw = log_0I1Dw;
   CLG_(cachesim).log_0I2D  = log_0I2D;
   CLG_(cachesim).log_0I1Dr_name = "log_0I1Dr";
   CLG_(cachesim).log_0I1Dw_name = "log_0I1Dw";
   CLG_(cachesim).log_0I2D_name  = "log_0I2D";

   if (clo_collect_cacheuse) {

      /* Output warning for unsupported option combinations */
      if (clo_simulate_hwpref) {
         VG_(message)(Vg_DebugMsg,
                      "warning: prefetch simulation cannot be used with cache usage");
         clo_simulate_hwpref = False;
      }

      if (clo_simulate_writeback) {
         VG_(message)(Vg_DebugMsg,
                      "warning: write-back simulation cannot be used with cache usage");
         clo_simulate_writeback = False;
      }

      simulator.I1_Read  = cacheuse_I1_doRead;
      simulator.D1_Read  = cacheuse_D1_doRead;
      simulator.D1_Write = cacheuse_D1_doRead;
      return;
   }

   if (clo_simulate_hwpref) {
      prefetch_clear();

      if (clo_simulate_writeback) {
         simulator.I1_Read  = prefetch_I1_Read;
         simulator.D1_Read  = prefetch_D1_Read;
         simulator.D1_Write = prefetch_D1_Write;
      }
      else {
         simulator.I1_Read  = prefetch_I1_ref;
         simulator.D1_Read  = prefetch_D1_ref;
         simulator.D1_Write = prefetch_D1_ref;
      }

      return;
   }

   if (clo_simulate_writeback) {
      simulator.I1_Read  = cachesim_I1_Read;
      simulator.D1_Read  = cachesim_D1_Read;
      simulator.D1_Write = cachesim_D1_Write;
   }
   else {
      simulator.I1_Read  = cachesim_I1_ref;
      simulator.D1_Read  = cachesim_D1_ref;
      simulator.D1_Write = cachesim_D1_ref;
   }
}
/* Clear simulator state. Has to be initialized before. */
static
void cachesim_clear(void)
{
   cachesim_clearcache(&I1);
   cachesim_clearcache(&D1);
   cachesim_clearcache(&L2);

   prefetch_clear();
}


static void cachesim_getdesc(Char* buf)
{
   Int p;
   p = VG_(sprintf)(buf, "\ndesc: I1 cache: %s\n", I1.desc_line);
   p += VG_(sprintf)(buf+p, "desc: D1 cache: %s\n", D1.desc_line);
   VG_(sprintf)(buf+p, "desc: L2 cache: %s\n", L2.desc_line);
}

static
void cachesim_print_opts(void)
{
   VG_(printf)(
"\n   cache simulator options:\n"
"    --simulate-cache=no|yes   Do cache simulation [no]\n"
"    --simulate-wb=no|yes      Count write-back events [no]\n"
"    --simulate-hwpref=no|yes  Simulate hardware prefetch [no]\n"
#if CLG_EXPERIMENTAL
"    --simulate-sectors=no|yes Simulate sectored behaviour [no]\n"
#endif
"    --cacheuse=no|yes         Collect cache block use [no]\n"
"    --I1=<size>,<assoc>,<line_size>  set I1 cache manually\n"
"    --D1=<size>,<assoc>,<line_size>  set D1 cache manually\n"
"    --L2=<size>,<assoc>,<line_size>  set L2 cache manually\n"
               );
}

static void parse_opt ( cache_t* cache, char* orig_opt, int opt_len )
{
   int   i1, i2, i3;
   int   i;
   char *opt = VG_(strdup)("cl.sim.po.1", orig_opt);

   i = i1 = opt_len;

   /* Option looks like "--I1=65536,2,64".
    * Find commas, replace with NULs to make three independent
    * strings, then extract numbers.  Yuck. */
   while (VG_(isdigit)(opt[i])) i++;
   if (',' == opt[i]) {
      opt[i++] = '\0';
      i2 = i;
   } else goto bad;
   while (VG_(isdigit)(opt[i])) i++;
   if (',' == opt[i]) {
      opt[i++] = '\0';
      i3 = i;
   } else goto bad;
   while (VG_(isdigit)(opt[i])) i++;
   if ('\0' != opt[i]) goto bad;

   cache->size      = (Int)VG_(atoll)(opt + i1);
   cache->assoc     = (Int)VG_(atoll)(opt + i2);
   cache->line_size = (Int)VG_(atoll)(opt + i3);

   VG_(free)(opt);

   return;

  bad:
   VG_(err_bad_option)(orig_opt);
}
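#if 0
/* Illustrative sketch (not part of the simulator): what parse_opt()
 * extracts from a typical option string (made-up example values). */
static void example_parse_opt(void)
{
   cache_t c = UNDEFINED_CACHE;
   parse_opt(&c, "--I1=65536,2,64", 5);  /* 5 == length of "--I1=" */
   CLG_ASSERT(c.size == 65536 && c.assoc == 2 && c.line_size == 64);
}
#endif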
/* Check for command line option for cache configuration.
 * Return False if unknown and not handled.
 *
 * Called from CLG_(process_cmd_line_option)() in clo.c
 */
static Bool cachesim_parse_opt(Char* arg)
{
   if (0 == VG_(strcmp)(arg, "--simulate-wb=yes"))
      clo_simulate_writeback = True;
   else if (0 == VG_(strcmp)(arg, "--simulate-wb=no"))
      clo_simulate_writeback = False;

   else if (0 == VG_(strcmp)(arg, "--simulate-hwpref=yes"))
      clo_simulate_hwpref = True;
   else if (0 == VG_(strcmp)(arg, "--simulate-hwpref=no"))
      clo_simulate_hwpref = False;

   else if (0 == VG_(strcmp)(arg, "--simulate-sectors=yes"))
      clo_simulate_sectors = True;
   else if (0 == VG_(strcmp)(arg, "--simulate-sectors=no"))
      clo_simulate_sectors = False;

   else if (0 == VG_(strcmp)(arg, "--cacheuse=yes")) {
      clo_collect_cacheuse = True;
      /* Use counters only make sense with fine-grained dumping */
      CLG_(clo).dump_instr = True;
   }
   else if (0 == VG_(strcmp)(arg, "--cacheuse=no"))
      clo_collect_cacheuse = False;

   /* 5 is length of "--I1=" */
   else if (0 == VG_(strncmp)(arg, "--I1=", 5))
      parse_opt(&clo_I1_cache, arg, 5);
   else if (0 == VG_(strncmp)(arg, "--D1=", 5))
      parse_opt(&clo_D1_cache, arg, 5);
   else if (0 == VG_(strncmp)(arg, "--L2=", 5))
      parse_opt(&clo_L2_cache, arg, 5);
   else
      return False;

   return True;
}

/* Adds commas to ULong, right justifying in a field field_width wide; returns
 * the string in buf. */
static
Int commify(ULong n, int field_width, char* buf)
{
   int len, n_commas, i, j, new_len, space;

   VG_(sprintf)(buf, "%llu", n);
   len = VG_(strlen)(buf);
   n_commas = (len - 1) / 3;
   new_len = len + n_commas;
   space = field_width - new_len;

   /* Allow for printing a number in a field_width smaller than its size */
   if (space < 0) space = 0;

   /* Make j = -1 because we copy the '\0' before doing the numbers in groups
    * of three. */
   for (j = -1, i = len ; i >= 0; i--) {
      buf[i + n_commas + space] = buf[i];

      if ((i>0) && (3 == ++j)) {
         j = 0;
         n_commas--;
         buf[i + n_commas + space] = ',';
      }
   }
   /* Right justify in field. */
   for (i = 0; i < space; i++)  buf[i] = ' ';
   return new_len;
}

static
void percentify(Int n, Int ex, Int field_width, char buf[])
{
   int i, len, space;

   VG_(sprintf)(buf, "%d.%d%%", n / ex, n % ex);
   len = VG_(strlen)(buf);
   space = field_width - len;
   if (space < 0) space = 0;  /* Allow for v. small field_width */
   i = len;

   /* Right justify in field */
   for (     ; i >= 0; i--)  buf[i + space] = buf[i];
   for (i = 0; i < space; i++)  buf[i] = ' ';
}
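#if 0
/* Illustrative sketch (not part of the simulator): expected output of
 * the two formatting helpers for made-up example values. */
static void example_formatting(void)
{
   char buf[RESULTS_BUF_LEN];
   commify(1234567, 0, buf);       /* buf = "1,234,567"               */
   commify(1234567, 12, buf);      /* buf = "   1,234,567", justified */
   percentify(1234, 100, 8, buf);  /* 1234 in units of 1/100 percent:
                                    * buf = "  12.34%"                */
}
#endif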
static
void cachesim_printstat(void)
{
   FullCost total = CLG_(total_cost), D_total = 0;
   ULong L2_total_m, L2_total_mr, L2_total_mw,
         L2_total, L2_total_r, L2_total_w;
   char buf1[RESULTS_BUF_LEN],
        buf2[RESULTS_BUF_LEN],
        buf3[RESULTS_BUF_LEN];
   Int l1, l2, l3;
   Int p;

   if ((VG_(clo_verbosity) > 1) && clo_simulate_hwpref) {
      VG_(message)(Vg_DebugMsg, "Prefetch Up:       %llu",
                   prefetch_up);
      VG_(message)(Vg_DebugMsg, "Prefetch Down:     %llu",
                   prefetch_down);
      VG_(message)(Vg_DebugMsg, "");
   }

   /* I cache results.  Use the I_refs value to determine the first column
    * width. */
   l1 = commify(total[CLG_(sets).off_full_Ir], 0, buf1);
   VG_(message)(Vg_UserMsg, "I   refs:      %s", buf1);

   if (!CLG_(clo).simulate_cache) return;

   commify(total[CLG_(sets).off_full_Ir +1], l1, buf1);
   VG_(message)(Vg_UserMsg, "I1  misses:    %s", buf1);

   commify(total[CLG_(sets).off_full_Ir +2], l1, buf1);
   VG_(message)(Vg_UserMsg, "L2i misses:    %s", buf1);

   p = 100;

   if (0 == total[CLG_(sets).off_full_Ir])
      total[CLG_(sets).off_full_Ir] = 1;

   percentify(total[CLG_(sets).off_full_Ir+1] * 100 * p /
              total[CLG_(sets).off_full_Ir], p, l1+1, buf1);
   VG_(message)(Vg_UserMsg, "I1  miss rate: %s", buf1);

   percentify(total[CLG_(sets).off_full_Ir+2] * 100 * p /
              total[CLG_(sets).off_full_Ir], p, l1+1, buf1);
   VG_(message)(Vg_UserMsg, "L2i miss rate: %s", buf1);
   VG_(message)(Vg_UserMsg, "");

   /* D cache results.  Use the D_refs.rd and D_refs.wr values to
    * determine the width of columns 2 & 3. */

   D_total = CLG_(get_eventset_cost)( CLG_(sets).full );
   CLG_(init_cost)( CLG_(sets).full, D_total);
   CLG_(copy_cost)( CLG_(sets).Dr, D_total, total + CLG_(sets).off_full_Dr );
   CLG_(add_cost) ( CLG_(sets).Dw, D_total, total + CLG_(sets).off_full_Dw );

   commify( D_total[0], l1, buf1);
   l2 = commify(total[CLG_(sets).off_full_Dr], 0, buf2);
   l3 = commify(total[CLG_(sets).off_full_Dw], 0, buf3);
   VG_(message)(Vg_UserMsg, "D   refs:      %s  (%s rd + %s wr)",
                buf1, buf2, buf3);

   commify( D_total[1], l1, buf1);
   commify(total[CLG_(sets).off_full_Dr+1], l2, buf2);
   commify(total[CLG_(sets).off_full_Dw+1], l3, buf3);
   VG_(message)(Vg_UserMsg, "D1  misses:    %s  (%s rd + %s wr)",
                buf1, buf2, buf3);

   commify( D_total[2], l1, buf1);
   commify(total[CLG_(sets).off_full_Dr+2], l2, buf2);
   commify(total[CLG_(sets).off_full_Dw+2], l3, buf3);
   VG_(message)(Vg_UserMsg, "L2d misses:    %s  (%s rd + %s wr)",
                buf1, buf2, buf3);

   p = 10;

   if (0 == D_total[0])                    D_total[0] = 1;
   if (0 == total[CLG_(sets).off_full_Dr]) total[CLG_(sets).off_full_Dr] = 1;
   if (0 == total[CLG_(sets).off_full_Dw]) total[CLG_(sets).off_full_Dw] = 1;

   percentify( D_total[1] * 100 * p / D_total[0], p, l1+1, buf1);
   percentify(total[CLG_(sets).off_full_Dr+1] * 100 * p /
              total[CLG_(sets).off_full_Dr], p, l2+1, buf2);
   percentify(total[CLG_(sets).off_full_Dw+1] * 100 * p /
              total[CLG_(sets).off_full_Dw], p, l3+1, buf3);
   VG_(message)(Vg_UserMsg, "D1  miss rate: %s (%s   + %s  )", buf1, buf2, buf3);

   percentify( D_total[2] * 100 * p / D_total[0], p, l1+1, buf1);
   percentify(total[CLG_(sets).off_full_Dr+2] * 100 * p /
              total[CLG_(sets).off_full_Dr], p, l2+1, buf2);
   percentify(total[CLG_(sets).off_full_Dw+2] * 100 * p /
              total[CLG_(sets).off_full_Dw], p, l3+1, buf3);
   VG_(message)(Vg_UserMsg, "L2d miss rate: %s (%s   + %s  )", buf1, buf2, buf3);
   VG_(message)(Vg_UserMsg, "");



   /* L2 overall results */

   L2_total   =
      total[CLG_(sets).off_full_Dr +1] +
      total[CLG_(sets).off_full_Dw +1] +
      total[CLG_(sets).off_full_Ir +1];
   L2_total_r =
      total[CLG_(sets).off_full_Dr +1] +
      total[CLG_(sets).off_full_Ir +1];
   L2_total_w = total[CLG_(sets).off_full_Dw +1];
   commify(L2_total,   l1, buf1);
   commify(L2_total_r, l2, buf2);
   commify(L2_total_w, l3, buf3);
   VG_(message)(Vg_UserMsg, "L2 refs:       %s  (%s rd + %s wr)",
                buf1, buf2, buf3);
   L2_total_m  =
      total[CLG_(sets).off_full_Dr +2] +
      total[CLG_(sets).off_full_Dw +2] +
      total[CLG_(sets).off_full_Ir +2];
   L2_total_mr =
      total[CLG_(sets).off_full_Dr +2] +
      total[CLG_(sets).off_full_Ir +2];
   L2_total_mw = total[CLG_(sets).off_full_Dw +2];
   commify(L2_total_m,  l1, buf1);
   commify(L2_total_mr, l2, buf2);
   commify(L2_total_mw, l3, buf3);
   VG_(message)(Vg_UserMsg, "L2 misses:     %s  (%s rd + %s wr)",
                buf1, buf2, buf3);

   percentify(L2_total_m  * 100 * p /
              (total[CLG_(sets).off_full_Ir] + D_total[0]),  p, l1+1, buf1);
   percentify(L2_total_mr * 100 * p /
              (total[CLG_(sets).off_full_Ir] + total[CLG_(sets).off_full_Dr]),
              p, l2+1, buf2);
   percentify(L2_total_mw * 100 * p /
              total[CLG_(sets).off_full_Dw], p, l3+1, buf3);
   VG_(message)(Vg_UserMsg, "L2 miss rate:  %s (%s   + %s  )",
                buf1, buf2, buf3);
}


/*------------------------------------------------------------*/
/*--- Setup for Event set.                                 ---*/
/*------------------------------------------------------------*/

struct event_sets CLG_(sets);

void CLG_(init_eventsets)(Int max_user)
{
   EventType * e1, *e2, *e3, *e4;
   EventSet *Ir, *Dr, *Dw;
   EventSet *D0, *D1r, *D1w, *D2;
   EventSet *sim, *full;
   EventSet *use;
   int sizeOfUseIr;

   use = CLG_(get_eventset)("Use", 4);
   if (clo_collect_cacheuse) {
      /* if TUse is 0, there was never a load, and no loss, too */
      e1 = CLG_(register_eventtype)("AcCost1");
      CLG_(add_eventtype)(use, e1);
      e1 = CLG_(register_eventtype)("SpLoss1");
      CLG_(add_eventtype)(use, e1);
      e1 = CLG_(register_eventtype)("AcCost2");
      CLG_(add_eventtype)(use, e1);
      e1 = CLG_(register_eventtype)("SpLoss2");
      CLG_(add_eventtype)(use, e1);
   }

   Ir = CLG_(get_eventset)("Ir", 4);
   Dr = CLG_(get_eventset)("Dr", 4);
   Dw = CLG_(get_eventset)("Dw", 4);
   if (CLG_(clo).simulate_cache) {
      e1 = CLG_(register_eventtype)("Ir");
      e2 = CLG_(register_eventtype)("I1mr");
      e3 = CLG_(register_eventtype)("I2mr");
      if (clo_simulate_writeback) {
         e4 = CLG_(register_eventtype)("I2dmr");
         CLG_(add_dep_event4)(Ir, e1,e2,e3,e4);
      }
      else
         CLG_(add_dep_event3)(Ir, e1,e2,e3);

      e1 = CLG_(register_eventtype)("Dr");
      e2 = CLG_(register_eventtype)("D1mr");
      e3 = CLG_(register_eventtype)("D2mr");
      if (clo_simulate_writeback) {
         e4 = CLG_(register_eventtype)("D2dmr");
         CLG_(add_dep_event4)(Dr, e1,e2,e3,e4);
      }
      else
         CLG_(add_dep_event3)(Dr, e1,e2,e3);

      e1 = CLG_(register_eventtype)("Dw");
      e2 = CLG_(register_eventtype)("D1mw");
      e3 = CLG_(register_eventtype)("D2mw");
      if (clo_simulate_writeback) {
         e4 = CLG_(register_eventtype)("D2dmw");
         CLG_(add_dep_event4)(Dw, e1,e2,e3,e4);
      }
      else
         CLG_(add_dep_event3)(Dw, e1,e2,e3);

   }
   else {
      e1 = CLG_(register_eventtype)("Ir");
      CLG_(add_eventtype)(Ir, e1);
   }

   sizeOfUseIr = use->size + Ir->size;
   D0 = CLG_(get_eventset)("D0", sizeOfUseIr);
   CLG_(add_eventset)(D0, use);
   off_D0_Ir  = CLG_(add_eventset)(D0, Ir);

   D1r = CLG_(get_eventset)("D1r", sizeOfUseIr + Dr->size);
   CLG_(add_eventset)(D1r, use);
   off_D1r_Ir = CLG_(add_eventset)(D1r, Ir);
   off_D1r_Dr = CLG_(add_eventset)(D1r, Dr);

   D1w = CLG_(get_eventset)("D1w", sizeOfUseIr + Dw->size);
   CLG_(add_eventset)(D1w, use);
   off_D1w_Ir = CLG_(add_eventset)(D1w, Ir);
   off_D1w_Dw = CLG_(add_eventset)(D1w, Dw);

   sizeOfUseIr = use->size + Ir->size;
   D0 = CLG_(get_eventset)("D0", sizeOfUseIr);
   CLG_(add_eventset)(D0, use);
   off_D0_Ir = CLG_(add_eventset)(D0, Ir);

   D1r = CLG_(get_eventset)("D1r", sizeOfUseIr + Dr->size);
   CLG_(add_eventset)(D1r, use);
   off_D1r_Ir = CLG_(add_eventset)(D1r, Ir);
   off_D1r_Dr = CLG_(add_eventset)(D1r, Dr);

   D1w = CLG_(get_eventset)("D1w", sizeOfUseIr + Dw->size);
   CLG_(add_eventset)(D1w, use);
   off_D1w_Ir = CLG_(add_eventset)(D1w, Ir);
   off_D1w_Dw = CLG_(add_eventset)(D1w, Dw);

   D2 = CLG_(get_eventset)("D2", sizeOfUseIr + Dr->size + Dw->size);
   CLG_(add_eventset)(D2, use);
   off_D2_Ir = CLG_(add_eventset)(D2, Ir);
   off_D2_Dr = CLG_(add_eventset)(D2, Dr);
   off_D2_Dw = CLG_(add_eventset)(D2, Dw);

   sim = CLG_(get_eventset)("sim", sizeOfUseIr + Dr->size + Dw->size);
   CLG_(add_eventset)(sim, use);
   CLG_(sets).off_sim_Ir = CLG_(add_eventset)(sim, Ir);
   CLG_(sets).off_sim_Dr = CLG_(add_eventset)(sim, Dr);
   CLG_(sets).off_sim_Dw = CLG_(add_eventset)(sim, Dw);

   if (CLG_(clo).collect_alloc)   max_user += 2;
   if (CLG_(clo).collect_systime) max_user += 2;

   full = CLG_(get_eventset)("full", sim->size + max_user);
   CLG_(add_eventset)(full, sim);
   CLG_(sets).off_full_Ir = CLG_(sets).off_sim_Ir;
   CLG_(sets).off_full_Dr = CLG_(sets).off_sim_Dr;
   CLG_(sets).off_full_Dw = CLG_(sets).off_sim_Dw;

   CLG_(sets).use = use;
   CLG_(sets).Ir  = Ir;
   CLG_(sets).Dr  = Dr;
   CLG_(sets).Dw  = Dw;

   CLG_(sets).D0  = D0;
   CLG_(sets).D1r = D1r;
   CLG_(sets).D1w = D1w;
   CLG_(sets).D2  = D2;

   CLG_(sets).sim  = sim;
   CLG_(sets).full = full;

   if (CLG_(clo).collect_alloc) {
      e1 = CLG_(register_eventtype)("allocCount");
      e2 = CLG_(register_eventtype)("allocSize");
      CLG_(sets).off_full_user = CLG_(add_dep_event2)(full, e1,e2);
   }

   if (CLG_(clo).collect_systime) {
      e1 = CLG_(register_eventtype)("sysCount");
      e2 = CLG_(register_eventtype)("sysTime");
      CLG_(sets).off_full_systime = CLG_(add_dep_event2)(full, e1,e2);
   }

   CLG_DEBUGIF(1) {
      CLG_DEBUG(1, "EventSets:\n");
      CLG_(print_eventset)(-2, use);
      CLG_(print_eventset)(-2, Ir);
      CLG_(print_eventset)(-2, Dr);
      CLG_(print_eventset)(-2, Dw);
      CLG_(print_eventset)(-2, sim);
      CLG_(print_eventset)(-2, full);
   }

   /* Events that do not exist in "full" are silently ignored */
   CLG_(dumpmap) = CLG_(get_eventmapping)(full);
   CLG_(append_event)(CLG_(dumpmap), "Ir");
   CLG_(append_event)(CLG_(dumpmap), "Dr");
   CLG_(append_event)(CLG_(dumpmap), "Dw");
   CLG_(append_event)(CLG_(dumpmap), "I1mr");
   CLG_(append_event)(CLG_(dumpmap), "D1mr");
   CLG_(append_event)(CLG_(dumpmap), "D1mw");
   CLG_(append_event)(CLG_(dumpmap), "I2mr");
   CLG_(append_event)(CLG_(dumpmap), "D2mr");
   CLG_(append_event)(CLG_(dumpmap), "D2mw");
   CLG_(append_event)(CLG_(dumpmap), "I2dmr");
   CLG_(append_event)(CLG_(dumpmap), "D2dmr");
   CLG_(append_event)(CLG_(dumpmap), "D2dmw");
   CLG_(append_event)(CLG_(dumpmap), "AcCost1");
   CLG_(append_event)(CLG_(dumpmap), "SpLoss1");
   CLG_(append_event)(CLG_(dumpmap), "AcCost2");
   CLG_(append_event)(CLG_(dumpmap), "SpLoss2");
   CLG_(append_event)(CLG_(dumpmap), "allocCount");
   CLG_(append_event)(CLG_(dumpmap), "allocSize");
   CLG_(append_event)(CLG_(dumpmap), "sysCount");
   CLG_(append_event)(CLG_(dumpmap), "sysTime");
}
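
/* For orientation, a sketch of the "sim" event set resulting from the
 * assembly above, assuming cache simulation and clo_collect_cacheuse
 * are both enabled and write-back simulation is not (the offsets just
 * follow from the insertion order; shown for illustration only):
 *
 *   [ 0.. 3]  AcCost1 SpLoss1 AcCost2 SpLoss2   ("Use" subset)
 *   [ 4.. 6]  Ir I1mr I2mr                      (off_sim_Ir = 4)
 *   [ 7.. 9]  Dr D1mr D2mr                      (off_sim_Dr = 7)
 *   [10..12]  Dw D1mw D2mw                      (off_sim_Dw = 10)
 *
 * "full" starts with the same layout, so the off_full_* offsets equal
 * the off_sim_* ones; the optional user events (allocCount/allocSize,
 * sysCount/sysTime) are appended behind them.
 */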


static
void add_and_zero_Dx(EventSet* es, SimCost dst, ULong* cost)
{
   /* If an event set has the "Use" subset, it always comes first (hardcoded!) */
   CLG_(add_and_zero_cost)( CLG_(sets).use, dst, cost);

   /* FIXME: The event set -> offset mapping below is hardcoded, too... */
   if (es == CLG_(sets).D0) {
      CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_sim_Ir,
                               cost + off_D0_Ir);
   }
   else if (es == CLG_(sets).D1r) {
      CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_sim_Ir,
                               cost + off_D1r_Ir);
      CLG_(add_and_zero_cost)( CLG_(sets).Dr, dst + CLG_(sets).off_sim_Dr,
                               cost + off_D1r_Dr);
   }
   else if (es == CLG_(sets).D1w) {
      CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_sim_Ir,
                               cost + off_D1w_Ir);
      CLG_(add_and_zero_cost)( CLG_(sets).Dw, dst + CLG_(sets).off_sim_Dw,
                               cost + off_D1w_Dw);
   }
   else {
      CLG_ASSERT(es == CLG_(sets).D2);
      CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_sim_Ir,
                               cost + off_D2_Ir);
      CLG_(add_and_zero_cost)( CLG_(sets).Dr, dst + CLG_(sets).off_sim_Dr,
                               cost + off_D2_Dr);
      CLG_(add_and_zero_cost)( CLG_(sets).Dw, dst + CLG_(sets).off_sim_Dw,
                               cost + off_D2_Dw);
   }
}

/* This is called at dump time for every instruction executed. */
static void cachesim_add_icost(SimCost cost, BBCC* bbcc,
                               InstrInfo* ii, ULong exe_count)
{
   if (!CLG_(clo).simulate_cache)
      cost[CLG_(sets).off_sim_Ir] += exe_count;
   else {

#if 0
      /* There always is a trivial case where exe_count and the Ir event
       * can differ slightly, because ecounter is only updated when the
       * next BB starts executing: e.g. for the last BB executed, or
       * when collection is toggled.
       */
      /* FIXME: Hardcoded that each event set has Ir as first entry */
      if ((bbcc->cost + ii->cost_offset)[0] != exe_count) {
         VG_(printf)("==> Ir %llu, exe %llu\n",
                     (bbcc->cost + ii->cost_offset)[0], exe_count);
         CLG_(print_bbcc_cost)(-2, bbcc);
         //CLG_ASSERT((bbcc->cost + ii->cost_offset)[0] == exe_count);
      }
#endif

      add_and_zero_Dx(ii->eventset, cost,
                      bbcc->cost + ii->cost_offset);
   }
}
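
/* A worked example of the folding done by add_and_zero_Dx(), using the
 * layout sketched after CLG_(init_eventsets)() (illustrative only): an
 * instruction with event set D1r carries the per-instruction slice
 *
 *    [ Use(4) | Ir I1mr I2mr | Dr D1mr D2mr ]
 *
 * at bbcc->cost + ii->cost_offset. The Ir group (at off_D1r_Ir) is
 * accumulated into dst + off_sim_Ir and the Dr group (at off_D1r_Dr)
 * into dst + off_sim_Dr; the source slice is zeroed along the way,
 * which presumably keeps a later dump from counting the same events
 * twice.
 */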

static
void cachesim_after_bbsetup(void)
{
   BBCC* bbcc = CLG_(current_state).bbcc;

   if (CLG_(clo).simulate_cache) {
      BB* bb = bbcc->bb;

      /* only needed if log_* functions are called */
      bb_base   = bb->obj->offset + bb->offset;
      cost_base = bbcc->cost;
   }
}

static
void cachesim_finish(void)
{
   if (clo_collect_cacheuse)
      cacheuse_finish();
}

/*------------------------------------------------------------*/
/*--- The simulator defined in this file                   ---*/
/*------------------------------------------------------------*/

struct cachesim_if CLG_(cachesim) = {
   .print_opts    = cachesim_print_opts,
   .parse_opt     = cachesim_parse_opt,
   .post_clo_init = cachesim_post_clo_init,
   .clear         = cachesim_clear,
   .getdesc       = cachesim_getdesc,
   .printstat     = cachesim_printstat,
   .add_icost     = cachesim_add_icost,
   .after_bbsetup = cachesim_after_bbsetup,
   .finish        = cachesim_finish,

   /* these will be set by cachesim_post_clo_init */
   .log_1I0D  = 0,

   .log_1I1Dr = 0,
   .log_1I1Dw = 0,
   .log_1I2D  = 0,

   .log_0I1Dr = 0,
   .log_0I1Dw = 0,
   .log_0I2D  = 0,

   .log_1I0D_name  = "(no function)",

   .log_1I1Dr_name = "(no function)",
   .log_1I1Dw_name = "(no function)",
   .log_1I2D_name  = "(no function)",

   .log_0I1Dr_name = "(no function)",
   .log_0I1Dw_name = "(no function)",
   .log_0I2D_name  = "(no function)"
};


/*--------------------------------------------------------------------*/
/*--- end                                                    sim.c ---*/
/*--------------------------------------------------------------------*/