/*--------------------------------------------------------------------*/
/*--- Cache simulation.                                            ---*/
/*---                                                        sim.c ---*/
/*--------------------------------------------------------------------*/

/*
   This file is part of Callgrind, a Valgrind tool for call graph
   profiling programs.

   Copyright (C) 2003-2013, Josef Weidendorfer (Josef.Weidendorfer@gmx.de)

   This tool is derived from and contains code from Cachegrind
   Copyright (C) 2002-2013 Nicholas Nethercote (njn@valgrind.org)

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307, USA.

   The GNU General Public License is contained in the file COPYING.
*/

#include "global.h"


/* Notes:
  - simulates a write-allocate cache
  - (block --> set) hash function uses simple bit selection
  - handling of references straddling two cache blocks:
      - counts as only one cache access (not two)
      - both blocks hit                  --> one hit
      - one block hits, the other misses --> one miss
      - both blocks miss                 --> one miss (not two)
*/

/* Cache configuration */
#include "cg_arch.c"
/* Additional structures for cache use info, separated
 * according to usage frequency:
 * - line_loaded : pointer to the cost center of the instruction
 *                 which loaded the line into the cache.
 *                 Needed to increment counters when the line is evicted.
 * - line_use    : updated on every access
 */
typedef struct {
  UInt count;
  UInt mask; /* e.g. for a 64 byte line size: 1 bit per 2 bytes */
} line_use;

typedef struct {
  Addr memline, iaddr;
  line_use* dep_use; /* points to the higher-level cache block for this memline */
  ULong* use_base;
} line_loaded;

/* Cache state */
typedef struct {
   const HChar* name;
   int          size;                   /* bytes */
   int          assoc;
   int          line_size;              /* bytes */
   Bool         sectored;  /* prefetch nearside cacheline on read */
   int          sets;
   int          sets_min_1;
   int          line_size_bits;
   int          tag_shift;
   UWord        tag_mask;
   HChar        desc_line[128];
   UWord*       tags;

   /* for cache use */
   int          line_size_mask;
   int*         line_start_mask;
   int*         line_end_mask;
   line_loaded* loaded;
   line_use*    use;
} cache_t2;
/*
 * States of the flat caches in our model.
 * We use a 2-level hierarchy.
 */
static cache_t2 I1, D1, LL;

/* Lower bits of cache tags are used as flags for a cache line */
#define CACHELINE_FLAGMASK (MIN_LINE_SIZE-1)
#define CACHELINE_DIRTY    1


/* Cache simulator options */
static Bool clo_simulate_writeback = False;
static Bool clo_simulate_hwpref = False;
static Bool clo_simulate_sectors = False;
static Bool clo_collect_cacheuse = False;

/* The following global vars are set up beforehand by setup_bbcc():
 *
 * - Addr   CLG_(bb_base)     (instruction start address of original BB)
 * - ULong* CLG_(cost_base)   (start of cost array for BB)
 */

Addr   CLG_(bb_base);
ULong* CLG_(cost_base);

static InstrInfo* current_ii;

/* Cache use offsets */
/* The offsets are only correct because all per-instruction event sets
 * get the "Use" set added first!
 */
static Int off_I1_AcCost  = 0;
static Int off_I1_SpLoss  = 1;
static Int off_D1_AcCost  = 0;
static Int off_D1_SpLoss  = 1;
static Int off_LL_AcCost  = 2;
static Int off_LL_SpLoss  = 3;

/* Cache access types */
typedef enum { Read = 0, Write = CACHELINE_DIRTY } RefType;

/* Result of a reference into a flat cache */
typedef enum { Hit  = 0, Miss, MissDirty } CacheResult;

/* Result of a reference into a hierarchical cache model */
typedef enum {
    L1_Hit,
    LL_Hit,
    MemAccess,
    WriteBackMemAccess } CacheModelResult;

typedef CacheModelResult (*simcall_type)(Addr, UChar);

static struct {
    simcall_type I1_Read;
    simcall_type D1_Read;
    simcall_type D1_Write;
} simulator;

/*------------------------------------------------------------*/
/*--- Cache Simulator Initialization                       ---*/
/*------------------------------------------------------------*/

static void cachesim_clearcache(cache_t2* c)
{
  Int i;

  for (i = 0; i < c->sets * c->assoc; i++)
    c->tags[i] = 0;
  if (c->use) {
    for (i = 0; i < c->sets * c->assoc; i++) {
      c->loaded[i].memline  = 0;
      c->loaded[i].use_base = 0;
      c->loaded[i].dep_use  = 0;
      c->loaded[i].iaddr    = 0;
      c->use[i].mask  = 0;
      c->use[i].count = 0;
      c->tags[i] = i % c->assoc; /* init lower bits as pointer */
    }
  }
}

static void cacheuse_initcache(cache_t2* c);

/* By this point, the size/assoc/line_size has been checked. */
static void cachesim_initcache(cache_t config, cache_t2* c)
{
   c->size      = config.size;
   c->assoc     = config.assoc;
   c->line_size = config.line_size;
   c->sectored  = False; // FIXME

   c->sets           = (c->size / c->line_size) / c->assoc;
   c->sets_min_1     = c->sets - 1;
   c->line_size_bits = VG_(log2)(c->line_size);
   c->tag_shift      = c->line_size_bits + VG_(log2)(c->sets);
   c->tag_mask       = ~((1<<c->tag_shift)-1);

   /* Can bits in tag entries be used for flags?
    * Should always be true, as MIN_LINE_SIZE >= 16 */
   CLG_ASSERT( (c->tag_mask & CACHELINE_FLAGMASK) == 0);

   if (c->assoc == 1) {
      VG_(sprintf)(c->desc_line, "%d B, %d B, direct-mapped%s",
                   c->size, c->line_size,
                   c->sectored ? ", sectored":"");
   } else {
      VG_(sprintf)(c->desc_line, "%d B, %d B, %d-way associative%s",
                   c->size, c->line_size, c->assoc,
                   c->sectored ? ", sectored":"");
   }

   c->tags = (UWord*) CLG_MALLOC("cl.sim.cs_ic.1",
                                 sizeof(UWord) * c->sets * c->assoc);
   if (clo_collect_cacheuse)
      cacheuse_initcache(c);
   else
      c->use = 0;
   cachesim_clearcache(c);
}
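
/* Worked example for the derivation above (an illustration; the
 * 32 KB, 8-way, 64 B line configuration is hypothetical, not a default):
 *   sets           = (32768 / 64) / 8 = 64
 *   sets_min_1     = 63 = 0x3f           (used as "block & sets_min_1")
 *   line_size_bits = log2(64) = 6
 *   tag_shift      = 6 + log2(64) = 12
 *   tag_mask       = ~((1<<12)-1) = ~0xfff
 * E.g. address 0x40a123: block = 0x40a123>>6 = 0x10284,
 * set = 0x10284 & 0x3f = 4, tag bits = 0x40a123 & ~0xfff = 0x40a000.
 */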


#if 0
static void print_cache(cache_t2* c)
{
   UInt set, way, i;

   /* Note initialisation and update of 'i'. */
   for (i = 0, set = 0; set < c->sets; set++) {
      for (way = 0; way < c->assoc; way++, i++) {
         VG_(printf)("%8x ", c->tags[i]);
      }
      VG_(printf)("\n");
   }
}
#endif


/*------------------------------------------------------------*/
/*--- Simple Cache Simulation                              ---*/
/*------------------------------------------------------------*/

/*
 * Model: single inclusive, 2-level cache hierarchy (L1/LL)
 *        with write-allocate
 *
 * For simple cache hit/miss counts, we do not have to
 * maintain the dirty state of lines (no need to distinguish
 * read/write references), and the resulting counts are the
 * same for write-through and write-back caches.
 *
 * Simulator functions:
 *  CacheModelResult cachesim_I1_ref(Addr a, UChar size)
 *  CacheModelResult cachesim_D1_ref(Addr a, UChar size)
 */
__attribute__((always_inline))
static __inline__
CacheResult cachesim_setref(cache_t2* c, UInt set_no, UWord tag)
{
    int i, j;
    UWord *set;

    set = &(c->tags[set_no * c->assoc]);

    /* This loop is unrolled for just the first case, which is the most */
    /* common.  We can't unroll any further because it would screw up   */
    /* if we have a direct-mapped (1-way) cache.                        */
    if (tag == set[0])
        return Hit;

    /* If the tag is one other than the MRU, move it into the MRU spot  */
    /* and shuffle the rest down.                                       */
    for (i = 1; i < c->assoc; i++) {
        if (tag == set[i]) {
            for (j = i; j > 0; j--) {
                set[j] = set[j - 1];
            }
            set[0] = tag;
            return Hit;
        }
    }

    /* A miss;  install this tag as MRU, shuffle rest down. */
    for (j = c->assoc - 1; j > 0; j--) {
        set[j] = set[j - 1];
    }
    set[0] = tag;

    return Miss;
}

__attribute__((always_inline))
static __inline__
CacheResult cachesim_ref(cache_t2* c, Addr a, UChar size)
{
    UWord block1 =  a         >> c->line_size_bits;
    UWord block2 = (a+size-1) >> c->line_size_bits;
    UInt  set1   = block1 & c->sets_min_1;
    /* the tag does not need to include bits specifying the set,
     * but it can, and this saves instructions */
    UWord tag1   = block1;

    /* Access entirely within line. */
    if (block1 == block2)
        return cachesim_setref(c, set1, tag1);

    /* Access straddles two lines. */
    else if (block1 + 1 == block2) {
        UInt  set2 = block2 & c->sets_min_1;
        UWord tag2 = block2;

        /* the call updates cache structures as side effect */
        CacheResult res1 = cachesim_setref(c, set1, tag1);
        CacheResult res2 = cachesim_setref(c, set2, tag2);
        return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit;

    } else {
        VG_(printf)("addr: %lx  size: %u  blocks: %ld %ld",
                    a, size, block1, block2);
        VG_(tool_panic)("item straddles more than two cache sets");
    }
    return Hit;
}
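
/* A sketch of the straddling rule from the Notes at the top (kept out
 * of the build like print_cache above; the 64 B line size and the
 * example function name are assumptions for illustration):
 */
#if 0
static void example_straddling_ref(void)
{
   /* With line_size_bits = 6 (64 B lines), an 8-byte access at 0x103c
    * covers bytes 0x103c..0x1043, i.e. block1 = 0x40 and block2 = 0x41.
    * cachesim_ref makes two setref calls but yields one combined
    * result: Miss if either line misses, and only one event is counted.
    */
   CacheResult r = cachesim_ref(&D1, 0x103c, 8);
   VG_(printf)("straddling access => %s\n", (r == Hit) ? "Hit" : "Miss");
}
#endif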

static
CacheModelResult cachesim_I1_ref(Addr a, UChar size)
{
    if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
    if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
    return MemAccess;
}

static
CacheModelResult cachesim_D1_ref(Addr a, UChar size)
{
    if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
    if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
    return MemAccess;
}


/*------------------------------------------------------------*/
/*--- Write Back Cache Simulation                          ---*/
/*------------------------------------------------------------*/

/*
 * More complex model: L1 write-through, LL write-back.
 * This needs to distinguish between read and write references.
 *
 * Simulator functions:
 *  CacheModelResult cachesim_I1_Read(Addr a, UChar size)
 *  CacheModelResult cachesim_D1_Read(Addr a, UChar size)
 *  CacheModelResult cachesim_D1_Write(Addr a, UChar size)
 */

/*
 * With write-back, the result can be a miss that evicts a dirty line.
 * The dirty state of a cache line is stored in bit 0 of the tag for
 * this cache line (CACHELINE_DIRTY = 1). By OR'ing in the reference
 * type (Read/Write), the line becomes dirty on a write.
 */
__attribute__((always_inline))
static __inline__
CacheResult cachesim_setref_wb(cache_t2* c, RefType ref, UInt set_no, UWord tag)
{
    int i, j;
    UWord *set, tmp_tag;

    set = &(c->tags[set_no * c->assoc]);

    /* This loop is unrolled for just the first case, which is the most */
    /* common.  We can't unroll any further because it would screw up   */
    /* if we have a direct-mapped (1-way) cache.                        */
    if (tag == (set[0] & ~CACHELINE_DIRTY)) {
        set[0] |= ref;
        return Hit;
    }
    /* If the tag is one other than the MRU, move it into the MRU spot  */
    /* and shuffle the rest down.                                       */
    for (i = 1; i < c->assoc; i++) {
        if (tag == (set[i] & ~CACHELINE_DIRTY)) {
            tmp_tag = set[i] | ref; // update dirty flag
            for (j = i; j > 0; j--) {
                set[j] = set[j - 1];
            }
            set[0] = tmp_tag;
            return Hit;
        }
    }

    /* A miss;  install this tag as MRU, shuffle rest down. */
    tmp_tag = set[c->assoc - 1];
    for (j = c->assoc - 1; j > 0; j--) {
        set[j] = set[j - 1];
    }
    set[0] = tag | ref;

    return (tmp_tag & CACHELINE_DIRTY) ? MissDirty : Miss;
}
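
/* Sketch of the tag encoding used above (illustration only): a tag
 * entry is "tag | flags", with CACHELINE_DIRTY in bit 0.
 *
 *   set[0] = tag | Write;                      line becomes dirty on a write
 *   tag    = set[0] & ~CACHELINE_DIRTY;        compare without the flag
 *   dirty  = (set[0] & CACHELINE_DIRTY) != 0;  checked on eviction
 *
 * This is safe because tag_mask leaves the low line-offset bits free;
 * see the CLG_ASSERT on CACHELINE_FLAGMASK in cachesim_initcache.
 */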

__attribute__((always_inline))
static __inline__
CacheResult cachesim_ref_wb(cache_t2* c, RefType ref, Addr a, UChar size)
{
    UInt set1 = ( a         >> c->line_size_bits) & (c->sets_min_1);
    UInt set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1);
    UWord tag = a & c->tag_mask;

    /* Access entirely within line. */
    if (set1 == set2)
        return cachesim_setref_wb(c, ref, set1, tag);

    /* Access straddles two lines. */
    /* Nb: this is a fast way of doing ((set1+1) % c->sets) */
    else if (((set1 + 1) & (c->sets_min_1)) == set2) {
        UWord tag2 = (a+size-1) & c->tag_mask;

        /* the call updates cache structures as side effect */
        CacheResult res1 = cachesim_setref_wb(c, ref, set1, tag);
        CacheResult res2 = cachesim_setref_wb(c, ref, set2, tag2);

        if ((res1 == MissDirty) || (res2 == MissDirty)) return MissDirty;
        return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit;

    } else {
        VG_(printf)("addr: %lx  size: %u  sets: %d %d", a, size, set1, set2);
        VG_(tool_panic)("item straddles more than two cache sets");
    }
    return Hit;
}


static
CacheModelResult cachesim_I1_Read(Addr a, UChar size)
{
    if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
    switch( cachesim_ref_wb( &LL, Read, a, size) ) {
        case Hit:  return LL_Hit;
        case Miss: return MemAccess;
        default: break;
    }
    return WriteBackMemAccess;
}

static
CacheModelResult cachesim_D1_Read(Addr a, UChar size)
{
    if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
    switch( cachesim_ref_wb( &LL, Read, a, size) ) {
        case Hit:  return LL_Hit;
        case Miss: return MemAccess;
        default: break;
    }
    return WriteBackMemAccess;
}

static
CacheModelResult cachesim_D1_Write(Addr a, UChar size)
{
    if ( cachesim_ref( &D1, a, size) == Hit ) {
        /* Even for an L1 hit, the write-through L1 passes
         * the write on to the LL to make the LL line dirty.
         * But this causes no latency, so return the hit.
         */
        cachesim_ref_wb( &LL, Write, a, size);
        return L1_Hit;
    }
    switch( cachesim_ref_wb( &LL, Write, a, size) ) {
        case Hit:  return LL_Hit;
        case Miss: return MemAccess;
        default: break;
    }
    return WriteBackMemAccess;
}


/*------------------------------------------------------------*/
/*--- Hardware Prefetch Simulation                         ---*/
/*------------------------------------------------------------*/

static ULong prefetch_up = 0;
static ULong prefetch_down = 0;

#define PF_STREAMS  8
#define PF_PAGEBITS 12

static UInt pf_lastblock[PF_STREAMS];
static Int  pf_seqblocks[PF_STREAMS];

static
void prefetch_clear(void)
{
  int i;
  for(i=0;i<PF_STREAMS;i++)
    pf_lastblock[i] = pf_seqblocks[i] = 0;
}

/*
 * HW prefetch emulation:
 * Start prefetching when detecting sequential access to 3 memory blocks.
 * One stream can be detected per 4k page.
 */
static __inline__
void prefetch_LL_doref(Addr a)
{
  UInt stream = (a >> PF_PAGEBITS) % PF_STREAMS;
  UInt block = ( a >> LL.line_size_bits);

  if (block != pf_lastblock[stream]) {
    if (pf_seqblocks[stream] == 0) {
      if (pf_lastblock[stream] +1 == block) pf_seqblocks[stream]++;
      else if (pf_lastblock[stream] -1 == block) pf_seqblocks[stream]--;
    }
    else if (pf_seqblocks[stream] >0) {
      if (pf_lastblock[stream] +1 == block) {
        pf_seqblocks[stream]++;
        if (pf_seqblocks[stream] >= 2) {
          prefetch_up++;
          cachesim_ref(&LL, a + 5 * LL.line_size,1);
        }
      }
      else pf_seqblocks[stream] = 0;
    }
    else if (pf_seqblocks[stream] <0) {
      if (pf_lastblock[stream] -1 == block) {
        pf_seqblocks[stream]--;
        if (pf_seqblocks[stream] <= -2) {
          prefetch_down++;
          cachesim_ref(&LL, a - 5 * LL.line_size,1);
        }
      }
      else pf_seqblocks[stream] = 0;
    }
    pf_lastblock[stream] = block;
  }
}
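
/* Worked example for the stream detection above (a sketch; the
 * addresses and the 64 B LL line size are assumptions): loads at
 * 0x5000, 0x5040 and 0x5080 fall into the same 4 KB page, so they map
 * to stream (0x5000 >> 12) % 8 = 5, and touch blocks b, b+1 and b+2.
 * The second access sets pf_seqblocks[5] = 1, the third raises it to 2,
 * which triggers a prefetch of the line 5 blocks ahead:
 * 0x5080 + 5*64 = 0x51c0.
 */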

/* simple model with hardware prefetch */

static
CacheModelResult prefetch_I1_ref(Addr a, UChar size)
{
    if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
    prefetch_LL_doref(a);
    if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
    return MemAccess;
}

static
CacheModelResult prefetch_D1_ref(Addr a, UChar size)
{
    if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
    prefetch_LL_doref(a);
    if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
    return MemAccess;
}


/* complex model with hardware prefetch */

static
CacheModelResult prefetch_I1_Read(Addr a, UChar size)
{
    if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
    prefetch_LL_doref(a);
    switch( cachesim_ref_wb( &LL, Read, a, size) ) {
        case Hit:  return LL_Hit;
        case Miss: return MemAccess;
        default: break;
    }
    return WriteBackMemAccess;
}

static
CacheModelResult prefetch_D1_Read(Addr a, UChar size)
{
    if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
    prefetch_LL_doref(a);
    switch( cachesim_ref_wb( &LL, Read, a, size) ) {
        case Hit:  return LL_Hit;
        case Miss: return MemAccess;
        default: break;
    }
    return WriteBackMemAccess;
}

static
CacheModelResult prefetch_D1_Write(Addr a, UChar size)
{
    prefetch_LL_doref(a);
    if ( cachesim_ref( &D1, a, size) == Hit ) {
        /* Even for an L1 hit, the write-through L1 passes
         * the write on to the LL to make the LL line dirty.
         * But this causes no latency, so return the hit.
         */
        cachesim_ref_wb( &LL, Write, a, size);
        return L1_Hit;
    }
    switch( cachesim_ref_wb( &LL, Write, a, size) ) {
        case Hit:  return LL_Hit;
        case Miss: return MemAccess;
        default: break;
    }
    return WriteBackMemAccess;
}


/*------------------------------------------------------------*/
/*--- Cache Simulation with use metric collection          ---*/
/*------------------------------------------------------------*/

/* cannot be combined with write-back or prefetch */

static
void cacheuse_initcache(cache_t2* c)
{
    int i;
    unsigned int start_mask, start_val;
    unsigned int end_mask, end_val;

    c->use    = CLG_MALLOC("cl.sim.cu_ic.1",
                           sizeof(line_use) * c->sets * c->assoc);
    c->loaded = CLG_MALLOC("cl.sim.cu_ic.2",
                           sizeof(line_loaded) * c->sets * c->assoc);
    c->line_start_mask = CLG_MALLOC("cl.sim.cu_ic.3",
                                    sizeof(int) * c->line_size);
    c->line_end_mask = CLG_MALLOC("cl.sim.cu_ic.4",
                                  sizeof(int) * c->line_size);

    c->line_size_mask = c->line_size-1;

    /* Meaning of line_start_mask/line_end_mask
     * Example: for a given cache line, you get an access starting at
     * byte offset 5 with length 4, i.e. bytes 5 to 8 are touched. For
     * a cache line size of 32, you have 1 bit per byte in the mask:
     *
     *   bit31   bit8 bit5  bit 0
     *       |      |  |    |
     *       11..111111100000   line_start_mask[5]
     *       00..000111111111   line_end_mask[(5+4)-1]
     *
     *  use_mask |= line_start_mask[5] & line_end_mask[8]
     *
     */
    start_val = end_val = ~0;
    if (c->line_size < 32) {
        int bits_per_byte = 32/c->line_size;
        start_mask = (1<<bits_per_byte)-1;
        end_mask   = start_mask << (32-bits_per_byte);
        for(i=0;i<c->line_size;i++) {
            c->line_start_mask[i] = start_val;
            start_val  = start_val & ~start_mask;
            start_mask = start_mask << bits_per_byte;

            c->line_end_mask[c->line_size-i-1] = end_val;
            end_val  = end_val & ~end_mask;
            end_mask = end_mask >> bits_per_byte;
        }
    }
    else {
        int bytes_per_bit = c->line_size/32;
        start_mask = 1;
        end_mask   = 1 << 31;
        for(i=0;i<c->line_size;i++) {
            c->line_start_mask[i] = start_val;
            c->line_end_mask[c->line_size-i-1] = end_val;
            if ( ((i+1)%bytes_per_bit) == 0) {
                start_val   &= ~start_mask;
                end_val     &= ~end_mask;
                start_mask <<= 1;
                end_mask   >>= 1;
            }
        }
    }

    CLG_DEBUG(6, "Config %s:\n", c->desc_line);
    for(i=0;i<c->line_size;i++) {
        CLG_DEBUG(6, " [%2d]: start mask %8x, end mask %8x\n",
                  i, c->line_start_mask[i], c->line_end_mask[i]);
    }

    /* We use lower tag bits as offset pointers to cache use info.
     * I.e. some cache parameters don't work.
     */
    if ( (1<<c->tag_shift) < c->assoc) {
        VG_(message)(Vg_DebugMsg,
                     "error: Use associativity < %d for cache use statistics!\n",
                     (1<<c->tag_shift) );
        VG_(tool_panic)("Unsupported cache configuration");
    }
}


/* for I1/D1 caches */
#define CACHEUSE(L)                                                         \
                                                                            \
static CacheModelResult cacheuse##_##L##_doRead(Addr a, UChar size)         \
{                                                                           \
   UInt set1 = ( a         >> L.line_size_bits) & (L.sets_min_1);           \
   UInt set2 = ((a+size-1) >> L.line_size_bits) & (L.sets_min_1);           \
   UWord tag  = a & L.tag_mask;                                             \
   UWord tag2;                                                              \
   int i, j, idx;                                                           \
   UWord *set, tmp_tag;                                                     \
   UInt use_mask;                                                           \
                                                                            \
   CLG_DEBUG(6,"%s.Acc(Addr %#lx, size %d): Sets [%d/%d]\n",                \
             L.name, a, size, set1, set2);                                  \
                                                                            \
   /* First case: word entirely within line. */                             \
   if (set1 == set2) {                                                      \
                                                                            \
      set = &(L.tags[set1 * L.assoc]);                                      \
      use_mask = L.line_start_mask[a & L.line_size_mask] &                  \
                 L.line_end_mask[(a+size-1) & L.line_size_mask];            \
                                                                            \
      /* This loop is unrolled for just the first case, which is the most */\
      /* common.  We can't unroll any further because it would screw up   */\
      /* if we have a direct-mapped (1-way) cache.                        */\
      if (tag == (set[0] & L.tag_mask)) {                                   \
         idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask);                   \
         L.use[idx].count ++;                                               \
         L.use[idx].mask |= use_mask;                                       \
         CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
                   idx, L.loaded[idx].memline, L.loaded[idx].iaddr,         \
                   use_mask, L.use[idx].mask, L.use[idx].count);            \
         return L1_Hit;                                                     \
      }                                                                     \
      /* If the tag is one other than the MRU, move it into the MRU spot  */\
      /* and shuffle the rest down.                                       */\
      for (i = 1; i < L.assoc; i++) {                                       \
         if (tag == (set[i] & L.tag_mask)) {                                \
            tmp_tag = set[i];                                               \
            for (j = i; j > 0; j--) {                                       \
               set[j] = set[j - 1];                                         \
            }                                                               \
            set[0] = tmp_tag;                                               \
            idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask);               \
            L.use[idx].count ++;                                            \
            L.use[idx].mask |= use_mask;                                    \
            CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
                      i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr,   \
                      use_mask, L.use[idx].mask, L.use[idx].count);         \
            return L1_Hit;                                                  \
         }                                                                  \
      }                                                                     \
                                                                            \
      /* A miss;  install this tag as MRU, shuffle rest down. */            \
      tmp_tag = set[L.assoc - 1] & ~L.tag_mask;                             \
      for (j = L.assoc - 1; j > 0; j--) {                                   \
         set[j] = set[j - 1];                                               \
      }                                                                     \
      set[0] = tag | tmp_tag;                                               \
      idx = (set1 * L.assoc) + tmp_tag;                                     \
      return update_##L##_use(&L, idx,                                      \
                              use_mask, a & ~L.line_size_mask);             \
                                                                            \
   /* Second case: word straddles two lines. */                             \
   /* Nb: this is a fast way of doing ((set1+1) % L.sets) */                \
   } else if (((set1 + 1) & (L.sets_min_1)) == set2) {                      \
      Int miss1=0, miss2=0; /* 0: L1 hit, 1: L1 miss, 2: LL miss */         \
      set = &(L.tags[set1 * L.assoc]);                                      \
      use_mask = L.line_start_mask[a & L.line_size_mask];                   \
      if (tag == (set[0] & L.tag_mask)) {                                   \
         idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask);                   \
         L.use[idx].count ++;                                               \
         L.use[idx].mask |= use_mask;                                       \
         CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
                   idx, L.loaded[idx].memline, L.loaded[idx].iaddr,         \
                   use_mask, L.use[idx].mask, L.use[idx].count);            \
         goto block2;                                                       \
      }                                                                     \
      for (i = 1; i < L.assoc; i++) {                                       \
         if (tag == (set[i] & L.tag_mask)) {                                \
            tmp_tag = set[i];                                               \
            for (j = i; j > 0; j--) {                                       \
               set[j] = set[j - 1];                                         \
            }                                                               \
            set[0] = tmp_tag;                                               \
            idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask);               \
            L.use[idx].count ++;                                            \
            L.use[idx].mask |= use_mask;                                    \
            CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
                      i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr,   \
                      use_mask, L.use[idx].mask, L.use[idx].count);         \
            goto block2;                                                    \
         }                                                                  \
      }                                                                     \
      tmp_tag = set[L.assoc - 1] & ~L.tag_mask;                             \
      for (j = L.assoc - 1; j > 0; j--) {                                   \
         set[j] = set[j - 1];                                               \
      }                                                                     \
      set[0] = tag | tmp_tag;                                               \
      idx = (set1 * L.assoc) + tmp_tag;                                     \
      miss1 = update_##L##_use(&L, idx,                                     \
                               use_mask, a & ~L.line_size_mask);            \
block2:                                                                     \
      set = &(L.tags[set2 * L.assoc]);                                      \
      use_mask = L.line_end_mask[(a+size-1) & L.line_size_mask];            \
      tag2 = (a+size-1) & L.tag_mask;                                       \
      if (tag2 == (set[0] & L.tag_mask)) {                                  \
         idx = (set2 * L.assoc) + (set[0] & ~L.tag_mask);                   \
         L.use[idx].count ++;                                               \
         L.use[idx].mask |= use_mask;                                       \
         CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
                   idx, L.loaded[idx].memline, L.loaded[idx].iaddr,         \
                   use_mask, L.use[idx].mask, L.use[idx].count);            \
         return miss1;                                                      \
      }                                                                     \
      for (i = 1; i < L.assoc; i++) {                                       \
         if (tag2 == (set[i] & L.tag_mask)) {                               \
            tmp_tag = set[i];                                               \
            for (j = i; j > 0; j--) {                                       \
               set[j] = set[j - 1];                                         \
            }                                                               \
            set[0] = tmp_tag;                                               \
            idx = (set2 * L.assoc) + (tmp_tag & ~L.tag_mask);               \
            L.use[idx].count ++;                                            \
            L.use[idx].mask |= use_mask;                                    \
            CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
                      i, idx, L.loaded[idx].memline, L.loaded[idx].iaddr,   \
                      use_mask, L.use[idx].mask, L.use[idx].count);         \
            return miss1;                                                   \
         }                                                                  \
      }                                                                     \
      tmp_tag = set[L.assoc - 1] & ~L.tag_mask;                             \
      for (j = L.assoc - 1; j > 0; j--) {                                   \
         set[j] = set[j - 1];                                               \
      }                                                                     \
      set[0] = tag2 | tmp_tag;                                              \
      idx = (set2 * L.assoc) + tmp_tag;                                     \
      miss2 = update_##L##_use(&L, idx,                                     \
                               use_mask, (a+size-1) & ~L.line_size_mask);   \
      return (miss1==MemAccess || miss2==MemAccess) ? MemAccess : LL_Hit;   \
                                                                            \
   } else {                                                                 \
      VG_(printf)("addr: %#lx  size: %u  sets: %d %d", a, size, set1, set2);\
      VG_(tool_panic)("item straddles more than two cache sets");           \
   }                                                                        \
   return 0;                                                                \
}


/* logarithmic bitcounting algorithm, see
 * http://graphics.stanford.edu/~seander/bithacks.html
 */
static __inline__ unsigned int countBits(unsigned int bits)
{
  unsigned int c; // store the total here
  const int S[] = {1, 2, 4, 8, 16}; // Magic Binary Numbers
  const int B[] = {0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF, 0x0000FFFF};

  c = bits;
  c = ((c >> S[0]) & B[0]) + (c & B[0]);
  c = ((c >> S[1]) & B[1]) + (c & B[1]);
  c = ((c >> S[2]) & B[2]) + (c & B[2]);
  c = ((c >> S[3]) & B[3]) + (c & B[3]);
  c = ((c >> S[4]) & B[4]) + (c & B[4]);
  return c;
}
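
/* For instance (a sketch): countBits(0xff00f0f0) sums adjacent pairs,
 * nibbles, bytes and halfwords in turn and yields 16. The use-metric
 * code below converts unset mask bits into a byte count via
 *   (32 - countBits(use->mask)) * line_size / 32,
 * so with a 64 B line and 16 of 32 bits set, 32 bytes were loaded but
 * never touched.
 */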

static void update_LL_use(int idx, Addr memline)
{
  line_loaded* loaded = &(LL.loaded[idx]);
  line_use* use = &(LL.use[idx]);
  int i = ((32 - countBits(use->mask)) * LL.line_size)>>5;

  CLG_DEBUG(2, " LL.miss [%d]: at %#lx accessing memline %#lx\n",
            idx, CLG_(bb_base) + current_ii->instr_offset, memline);
  if (use->count>0) {
    CLG_DEBUG(2, "   old: used %d, loss bits %d (%08x) [line %#lx from %#lx]\n",
              use->count, i, use->mask, loaded->memline, loaded->iaddr);
    CLG_DEBUG(2, "   collect: %d, use_base %p\n",
              CLG_(current_state).collect, loaded->use_base);

    if (CLG_(current_state).collect && loaded->use_base) {
      (loaded->use_base)[off_LL_AcCost] += 1000 / use->count;
      (loaded->use_base)[off_LL_SpLoss] += i;
    }
  }

  use->count = 0;
  use->mask  = 0;

  loaded->memline = memline;
  loaded->iaddr   = CLG_(bb_base) + current_ii->instr_offset;
  loaded->use_base = (CLG_(current_state).nonskipped) ?
    CLG_(current_state).nonskipped->skipped :
    CLG_(cost_base) + current_ii->cost_offset;
}

static
CacheModelResult cacheuse_LL_access(Addr memline, line_loaded* l1_loaded)
{
   UInt setNo = (memline >> LL.line_size_bits) & (LL.sets_min_1);
   UWord* set = &(LL.tags[setNo * LL.assoc]);
   UWord tag  = memline & LL.tag_mask;

   int i, j, idx;
   UWord tmp_tag;

   CLG_DEBUG(6,"LL.Acc(Memline %#lx): Set %d\n", memline, setNo);

   if (tag == (set[0] & LL.tag_mask)) {
      idx = (setNo * LL.assoc) + (set[0] & ~LL.tag_mask);
      l1_loaded->dep_use = &(LL.use[idx]);

      CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): => %08x, count %d\n",
                idx, LL.loaded[idx].memline, LL.loaded[idx].iaddr,
                LL.use[idx].mask, LL.use[idx].count);
      return LL_Hit;
   }
   for (i = 1; i < LL.assoc; i++) {
      if (tag == (set[i] & LL.tag_mask)) {
         tmp_tag = set[i];
         for (j = i; j > 0; j--) {
            set[j] = set[j - 1];
         }
         set[0] = tmp_tag;
         idx = (setNo * LL.assoc) + (tmp_tag & ~LL.tag_mask);
         l1_loaded->dep_use = &(LL.use[idx]);

         CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): => %08x, count %d\n",
                   i, idx, LL.loaded[idx].memline, LL.loaded[idx].iaddr,
                   LL.use[idx].mask, LL.use[idx].count);
         return LL_Hit;
      }
   }

   /* A miss;  install this tag as MRU, shuffle rest down. */
   tmp_tag = set[LL.assoc - 1] & ~LL.tag_mask;
   for (j = LL.assoc - 1; j > 0; j--) {
      set[j] = set[j - 1];
   }
   set[0] = tag | tmp_tag;
   idx = (setNo * LL.assoc) + tmp_tag;
   l1_loaded->dep_use = &(LL.use[idx]);

   update_LL_use(idx, memline);

   return MemAccess;
}


#define UPDATE_USE(L)                                                \
                                                                     \
static CacheModelResult update##_##L##_use(cache_t2* cache, int idx, \
                                           UInt mask, Addr memline)  \
{                                                                    \
  line_loaded* loaded = &(cache->loaded[idx]);                       \
  line_use* use = &(cache->use[idx]);                                \
  int c = ((32 - countBits(use->mask)) * cache->line_size)>>5;       \
                                                                     \
  CLG_DEBUG(2, " %s.miss [%d]: at %#lx accessing memline %#lx (mask %08x)\n", \
            cache->name, idx, CLG_(bb_base) + current_ii->instr_offset, memline, mask); \
  if (use->count>0) {                                                \
    CLG_DEBUG(2, "   old: used %d, loss bits %d (%08x) [line %#lx from %#lx]\n",\
              use->count, c, use->mask, loaded->memline, loaded->iaddr); \
    CLG_DEBUG(2, "   collect: %d, use_base %p\n",                    \
              CLG_(current_state).collect, loaded->use_base);        \
                                                                     \
    if (CLG_(current_state).collect && loaded->use_base) {           \
      (loaded->use_base)[off_##L##_AcCost] += 1000 / use->count;     \
      (loaded->use_base)[off_##L##_SpLoss] += c;                     \
                                                                     \
      /* FIXME (?): L1/LL line sizes must be equal! */               \
      loaded->dep_use->mask |= use->mask;                            \
      loaded->dep_use->count += use->count;                          \
    }                                                                \
  }                                                                  \
                                                                     \
  use->count = 1;                                                    \
  use->mask  = mask;                                                 \
  loaded->memline = memline;                                         \
  loaded->iaddr   = CLG_(bb_base) + current_ii->instr_offset;        \
  loaded->use_base = (CLG_(current_state).nonskipped) ?              \
    CLG_(current_state).nonskipped->skipped :                        \
    CLG_(cost_base) + current_ii->cost_offset;                       \
                                                                     \
  if (memline == 0) return LL_Hit;                                   \
  return cacheuse_LL_access(memline, loaded);                        \
}

UPDATE_USE(I1);
UPDATE_USE(D1);

CACHEUSE(I1);
CACHEUSE(D1);


static
void cacheuse_finish(void)
{
  int i;
  InstrInfo ii = { 0,0,0,0 };

  if (!CLG_(current_state).collect) return;

  CLG_(bb_base) = 0;
  current_ii = &ii; /* needs to be set for update_XX_use */
  CLG_(cost_base) = 0;

  /* update usage counters */
  if (I1.use)
    for (i = 0; i < I1.sets * I1.assoc; i++)
      if (I1.loaded[i].use_base)
        update_I1_use( &I1, i, 0,0);

  if (D1.use)
    for (i = 0; i < D1.sets * D1.assoc; i++)
      if (D1.loaded[i].use_base)
        update_D1_use( &D1, i, 0,0);

  if (LL.use)
    for (i = 0; i < LL.sets * LL.assoc; i++)
      if (LL.loaded[i].use_base)
        update_LL_use(i, 0);

  current_ii = 0;
}



/*------------------------------------------------------------*/
/*--- Helper functions called by instrumented code         ---*/
/*------------------------------------------------------------*/


static __inline__
void inc_costs(CacheModelResult r, ULong* c1, ULong* c2)
{
    switch(r) {
        case WriteBackMemAccess:
            if (clo_simulate_writeback) {
                c1[3]++;
                c2[3]++;
            }
            // fall through

        case MemAccess:
            c1[2]++;
            c2[2]++;
            // fall through

        case LL_Hit:
            c1[1]++;
            c2[1]++;
            // fall through

        default:
            c1[0]++;
            c2[0]++;
    }
}
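
/* The fall-throughs above make the counters cumulative. E.g. for an
 * event group laid out as [accesses, L1 misses, LL misses, writebacks]
 * (a sketch of the layout implied by the indices used above):
 *   L1_Hit             increments c[0]
 *   LL_Hit             increments c[0] and c[1]
 *   MemAccess          increments c[0], c[1] and c[2]
 *   WriteBackMemAccess additionally increments c[3],
 *                      but only with --simulate-wb=yes
 */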

static
const HChar* cacheRes(CacheModelResult r)
{
    switch(r) {
    case L1_Hit:    return "L1 Hit ";
    case LL_Hit:    return "LL Hit ";
    case MemAccess: return "LL Miss";
    case WriteBackMemAccess: return "LL Miss (dirty)";
    default:
        tl_assert(0);
    }
    return "??";
}

VG_REGPARM(1)
static void log_1I0D(InstrInfo* ii)
{
    CacheModelResult IrRes;

    current_ii = ii;
    IrRes = (*simulator.I1_Read)(CLG_(bb_base) + ii->instr_offset, ii->instr_size);

    CLG_DEBUG(6, "log_1I0D:  Ir  %#lx/%u => %s\n",
              CLG_(bb_base) + ii->instr_offset, ii->instr_size, cacheRes(IrRes));

    if (CLG_(current_state).collect) {
        ULong* cost_Ir;

        if (CLG_(current_state).nonskipped)
            cost_Ir = CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
        else
            cost_Ir = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_IR];

        inc_costs(IrRes, cost_Ir,
                  CLG_(current_state).cost + fullOffset(EG_IR) );
    }
}

VG_REGPARM(2)
static void log_2I0D(InstrInfo* ii1, InstrInfo* ii2)
{
    CacheModelResult Ir1Res, Ir2Res;
    ULong *global_cost_Ir;

    current_ii = ii1;
    Ir1Res = (*simulator.I1_Read)(CLG_(bb_base) + ii1->instr_offset, ii1->instr_size);
    current_ii = ii2;
    Ir2Res = (*simulator.I1_Read)(CLG_(bb_base) + ii2->instr_offset, ii2->instr_size);

    CLG_DEBUG(6, "log_2I0D:  Ir1 %#lx/%u => %s, Ir2 %#lx/%u => %s\n",
              CLG_(bb_base) + ii1->instr_offset, ii1->instr_size, cacheRes(Ir1Res),
              CLG_(bb_base) + ii2->instr_offset, ii2->instr_size, cacheRes(Ir2Res) );

    if (!CLG_(current_state).collect) return;

    global_cost_Ir = CLG_(current_state).cost + fullOffset(EG_IR);
    if (CLG_(current_state).nonskipped) {
        ULong* skipped_cost_Ir =
            CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);

        inc_costs(Ir1Res, global_cost_Ir, skipped_cost_Ir);
        inc_costs(Ir2Res, global_cost_Ir, skipped_cost_Ir);
        return;
    }

    inc_costs(Ir1Res, global_cost_Ir,
              CLG_(cost_base) + ii1->cost_offset + ii1->eventset->offset[EG_IR]);
    inc_costs(Ir2Res, global_cost_Ir,
              CLG_(cost_base) + ii2->cost_offset + ii2->eventset->offset[EG_IR]);
}

VG_REGPARM(3)
static void log_3I0D(InstrInfo* ii1, InstrInfo* ii2, InstrInfo* ii3)
{
    CacheModelResult Ir1Res, Ir2Res, Ir3Res;
    ULong *global_cost_Ir;

    current_ii = ii1;
    Ir1Res = (*simulator.I1_Read)(CLG_(bb_base) + ii1->instr_offset, ii1->instr_size);
    current_ii = ii2;
    Ir2Res = (*simulator.I1_Read)(CLG_(bb_base) + ii2->instr_offset, ii2->instr_size);
    current_ii = ii3;
    Ir3Res = (*simulator.I1_Read)(CLG_(bb_base) + ii3->instr_offset, ii3->instr_size);

    CLG_DEBUG(6, "log_3I0D:  Ir1 %#lx/%u => %s, Ir2 %#lx/%u => %s, Ir3 %#lx/%u => %s\n",
              CLG_(bb_base) + ii1->instr_offset, ii1->instr_size, cacheRes(Ir1Res),
              CLG_(bb_base) + ii2->instr_offset, ii2->instr_size, cacheRes(Ir2Res),
              CLG_(bb_base) + ii3->instr_offset, ii3->instr_size, cacheRes(Ir3Res) );

    if (!CLG_(current_state).collect) return;

    global_cost_Ir = CLG_(current_state).cost + fullOffset(EG_IR);
    if (CLG_(current_state).nonskipped) {
        ULong* skipped_cost_Ir =
            CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
        inc_costs(Ir1Res, global_cost_Ir, skipped_cost_Ir);
        inc_costs(Ir2Res, global_cost_Ir, skipped_cost_Ir);
        inc_costs(Ir3Res, global_cost_Ir, skipped_cost_Ir);
        return;
    }

    inc_costs(Ir1Res, global_cost_Ir,
              CLG_(cost_base) + ii1->cost_offset + ii1->eventset->offset[EG_IR]);
    inc_costs(Ir2Res, global_cost_Ir,
              CLG_(cost_base) + ii2->cost_offset + ii2->eventset->offset[EG_IR]);
    inc_costs(Ir3Res, global_cost_Ir,
              CLG_(cost_base) + ii3->cost_offset + ii3->eventset->offset[EG_IR]);
}

/* Instruction doing a read access */

VG_REGPARM(3)
static void log_1I1Dr(InstrInfo* ii, Addr data_addr, Word data_size)
{
    CacheModelResult IrRes, DrRes;

    current_ii = ii;
    IrRes = (*simulator.I1_Read)(CLG_(bb_base) + ii->instr_offset, ii->instr_size);
    DrRes = (*simulator.D1_Read)(data_addr, data_size);

    CLG_DEBUG(6, "log_1I1Dr: Ir  %#lx/%u => %s, Dr  %#lx/%lu => %s\n",
              CLG_(bb_base) + ii->instr_offset, ii->instr_size, cacheRes(IrRes),
              data_addr, data_size, cacheRes(DrRes));

    if (CLG_(current_state).collect) {
        ULong *cost_Ir, *cost_Dr;

        if (CLG_(current_state).nonskipped) {
            cost_Ir = CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
            cost_Dr = CLG_(current_state).nonskipped->skipped + fullOffset(EG_DR);
        }
        else {
            cost_Ir = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_IR];
            cost_Dr = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DR];
        }

        inc_costs(IrRes, cost_Ir,
                  CLG_(current_state).cost + fullOffset(EG_IR) );
        inc_costs(DrRes, cost_Dr,
                  CLG_(current_state).cost + fullOffset(EG_DR) );
    }
}


/* Note that addEvent_D_guarded assumes that log_0I1Dr and log_0I1Dw
   have exactly the same prototype.  If you change them, you must
   change addEvent_D_guarded too. */
VG_REGPARM(3)
static void log_0I1Dr(InstrInfo* ii, Addr data_addr, Word data_size)
{
    CacheModelResult DrRes;

    current_ii = ii;
    DrRes = (*simulator.D1_Read)(data_addr, data_size);

    CLG_DEBUG(6, "log_0I1Dr: Dr  %#lx/%lu => %s\n",
              data_addr, data_size, cacheRes(DrRes));

    if (CLG_(current_state).collect) {
        ULong *cost_Dr;

        if (CLG_(current_state).nonskipped)
            cost_Dr = CLG_(current_state).nonskipped->skipped + fullOffset(EG_DR);
        else
            cost_Dr = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DR];

        inc_costs(DrRes, cost_Dr,
                  CLG_(current_state).cost + fullOffset(EG_DR) );
    }
}


/* Instruction doing a write access */

VG_REGPARM(3)
static void log_1I1Dw(InstrInfo* ii, Addr data_addr, Word data_size)
{
    CacheModelResult IrRes, DwRes;

    current_ii = ii;
    IrRes = (*simulator.I1_Read)(CLG_(bb_base) + ii->instr_offset, ii->instr_size);
    DwRes = (*simulator.D1_Write)(data_addr, data_size);

    CLG_DEBUG(6, "log_1I1Dw: Ir  %#lx/%u => %s, Dw  %#lx/%lu => %s\n",
              CLG_(bb_base) + ii->instr_offset, ii->instr_size, cacheRes(IrRes),
              data_addr, data_size, cacheRes(DwRes));

    if (CLG_(current_state).collect) {
        ULong *cost_Ir, *cost_Dw;

        if (CLG_(current_state).nonskipped) {
            cost_Ir = CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
            cost_Dw = CLG_(current_state).nonskipped->skipped + fullOffset(EG_DW);
        }
        else {
            cost_Ir = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_IR];
            cost_Dw = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DW];
        }

        inc_costs(IrRes, cost_Ir,
                  CLG_(current_state).cost + fullOffset(EG_IR) );
        inc_costs(DwRes, cost_Dw,
                  CLG_(current_state).cost + fullOffset(EG_DW) );
    }
}

/* See comment on log_0I1Dr. */
VG_REGPARM(3)
static void log_0I1Dw(InstrInfo* ii, Addr data_addr, Word data_size)
{
    CacheModelResult DwRes;

    current_ii = ii;
    DwRes = (*simulator.D1_Write)(data_addr, data_size);

    CLG_DEBUG(6, "log_0I1Dw: Dw  %#lx/%lu => %s\n",
              data_addr, data_size, cacheRes(DwRes));

    if (CLG_(current_state).collect) {
        ULong *cost_Dw;

        if (CLG_(current_state).nonskipped)
            cost_Dw = CLG_(current_state).nonskipped->skipped + fullOffset(EG_DW);
        else
            cost_Dw = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DW];

        inc_costs(DwRes, cost_Dw,
                  CLG_(current_state).cost + fullOffset(EG_DW) );
    }
}



/*------------------------------------------------------------*/
/*--- Cache configuration                                  ---*/
/*------------------------------------------------------------*/

static cache_t clo_I1_cache = UNDEFINED_CACHE;
static cache_t clo_D1_cache = UNDEFINED_CACHE;
static cache_t clo_LL_cache = UNDEFINED_CACHE;

/* Initialize and clear simulator state */
static void cachesim_post_clo_init(void)
{
  /* Cache configurations. */
  cache_t I1c, D1c, LLc;

  /* Initialize access handlers */
  if (!CLG_(clo).simulate_cache) {
    CLG_(cachesim).log_1I0D  = 0;
    CLG_(cachesim).log_1I0D_name = "(no function)";
    CLG_(cachesim).log_2I0D  = 0;
    CLG_(cachesim).log_2I0D_name = "(no function)";
    CLG_(cachesim).log_3I0D  = 0;
    CLG_(cachesim).log_3I0D_name = "(no function)";

    CLG_(cachesim).log_1I1Dr = 0;
    CLG_(cachesim).log_1I1Dr_name = "(no function)";
    CLG_(cachesim).log_1I1Dw = 0;
    CLG_(cachesim).log_1I1Dw_name = "(no function)";

    CLG_(cachesim).log_0I1Dr = 0;
    CLG_(cachesim).log_0I1Dr_name = "(no function)";
    CLG_(cachesim).log_0I1Dw = 0;
    CLG_(cachesim).log_0I1Dw_name = "(no function)";
    return;
  }

  /* Configuration of caches only needed with real cache simulation */
  VG_(post_clo_init_configure_caches)(&I1c, &D1c, &LLc,
                                      &clo_I1_cache,
                                      &clo_D1_cache,
                                      &clo_LL_cache);

  I1.name = "I1";
  D1.name = "D1";
  LL.name = "LL";

  // min_line_size is used to make sure that we never feed
  // accesses to the simulator straddling more than two
  // cache lines at any cache level
  CLG_(min_line_size) = (I1c.line_size < D1c.line_size)
                           ? I1c.line_size : D1c.line_size;
  CLG_(min_line_size) = (LLc.line_size < CLG_(min_line_size))
                           ? LLc.line_size : CLG_(min_line_size);

  Int largest_load_or_store_size
     = VG_(machine_get_size_of_largest_guest_register)();
  if (CLG_(min_line_size) < largest_load_or_store_size) {
     /* We can't continue, because the cache simulation might
        straddle more than 2 lines, and it will assert.  So let's
        just stop before we start. */
     VG_(umsg)("Callgrind: cannot continue: the minimum line size (%d)\n",
               (Int)CLG_(min_line_size));
     VG_(umsg)("  must be equal to or larger than the maximum register size (%d)\n",
               largest_load_or_store_size );
     VG_(umsg)("  but it is not.  Exiting now.\n");
     VG_(exit)(1);
  }

  cachesim_initcache(I1c, &I1);
  cachesim_initcache(D1c, &D1);
  cachesim_initcache(LLc, &LL);

  /* the other cache simulators use the standard helpers
   * with dispatching via the simulator struct */

  CLG_(cachesim).log_1I0D  = log_1I0D;
  CLG_(cachesim).log_1I0D_name  = "log_1I0D";
  CLG_(cachesim).log_2I0D  = log_2I0D;
  CLG_(cachesim).log_2I0D_name  = "log_2I0D";
  CLG_(cachesim).log_3I0D  = log_3I0D;
  CLG_(cachesim).log_3I0D_name  = "log_3I0D";

  CLG_(cachesim).log_1I1Dr = log_1I1Dr;
  CLG_(cachesim).log_1I1Dw = log_1I1Dw;
  CLG_(cachesim).log_1I1Dr_name = "log_1I1Dr";
  CLG_(cachesim).log_1I1Dw_name = "log_1I1Dw";

  CLG_(cachesim).log_0I1Dr = log_0I1Dr;
  CLG_(cachesim).log_0I1Dw = log_0I1Dw;
  CLG_(cachesim).log_0I1Dr_name = "log_0I1Dr";
  CLG_(cachesim).log_0I1Dw_name = "log_0I1Dw";

  if (clo_collect_cacheuse) {

      /* Output a warning for unsupported option combinations */
      if (clo_simulate_hwpref) {
          VG_(message)(Vg_DebugMsg,
                       "warning: prefetch simulation cannot be "
                       "used with cache usage\n");
          clo_simulate_hwpref = False;
      }

      if (clo_simulate_writeback) {
          VG_(message)(Vg_DebugMsg,
                       "warning: write-back simulation cannot be "
                       "used with cache usage\n");
          clo_simulate_writeback = False;
      }

      simulator.I1_Read  = cacheuse_I1_doRead;
      simulator.D1_Read  = cacheuse_D1_doRead;
      /* the use model does not distinguish reads and writes */
      simulator.D1_Write = cacheuse_D1_doRead;
1392      return;
1393  }
1394
1395  if (clo_simulate_hwpref) {
1396    prefetch_clear();
1397
1398    if (clo_simulate_writeback) {
1399      simulator.I1_Read  = prefetch_I1_Read;
1400      simulator.D1_Read  = prefetch_D1_Read;
1401      simulator.D1_Write = prefetch_D1_Write;
1402    }
1403    else {
1404      simulator.I1_Read  = prefetch_I1_ref;
1405      simulator.D1_Read  = prefetch_D1_ref;
1406      simulator.D1_Write = prefetch_D1_ref;
1407    }
1408
1409    return;
1410  }
1411
1412  if (clo_simulate_writeback) {
1413      simulator.I1_Read  = cachesim_I1_Read;
1414      simulator.D1_Read  = cachesim_D1_Read;
1415      simulator.D1_Write = cachesim_D1_Write;
1416  }
1417  else {
1418      simulator.I1_Read  = cachesim_I1_ref;
1419      simulator.D1_Read  = cachesim_D1_ref;
1420      simulator.D1_Write = cachesim_D1_ref;
1421  }
1422}
1423
1424
1425/* Clear simulator state. Has to be initialized before */
1426static
1427void cachesim_clear(void)
1428{
1429  cachesim_clearcache(&I1);
1430  cachesim_clearcache(&D1);
1431  cachesim_clearcache(&LL);
1432
1433  prefetch_clear();
1434}
1435
1436
1437static void cachesim_getdesc(HChar* buf)
1438{
1439  Int p;
1440  p = VG_(sprintf)(buf, "\ndesc: I1 cache: %s\n", I1.desc_line);
1441  p += VG_(sprintf)(buf+p, "desc: D1 cache: %s\n", D1.desc_line);
1442  VG_(sprintf)(buf+p, "desc: LL cache: %s\n", LL.desc_line);
1443}
1444
1445static
1446void cachesim_print_opts(void)
1447{
1448  VG_(printf)(
1449"\n   cache simulator options (does cache simulation if used):\n"
1450"    --simulate-wb=no|yes      Count write-back events [no]\n"
1451"    --simulate-hwpref=no|yes  Simulate hardware prefetch [no]\n"
1452#if CLG_EXPERIMENTAL
1453"    --simulate-sectors=no|yes Simulate sectored behaviour [no]\n"
1454#endif
1455"    --cacheuse=no|yes         Collect cache block use [no]\n");
1456  VG_(print_cache_clo_opts)();
1457}

/* Check for a command line option for cache configuration.
 * Return False if the option is unknown and not handled here.
 *
 * Called from CLG_(process_cmd_line_option)() in clo.c
 */
static Bool cachesim_parse_opt(const HChar* arg)
{
   if      VG_BOOL_CLO(arg, "--simulate-wb",      clo_simulate_writeback) {}
   else if VG_BOOL_CLO(arg, "--simulate-hwpref",  clo_simulate_hwpref)    {}
   else if VG_BOOL_CLO(arg, "--simulate-sectors", clo_simulate_sectors)   {}

   else if VG_BOOL_CLO(arg, "--cacheuse", clo_collect_cacheuse) {
      if (clo_collect_cacheuse) {
         /* Use counters only make sense with instruction-level dumping */
         CLG_(clo).dump_instr = True;
      }
   }

   else if (VG_(str_clo_cache_opt)(arg,
                                   &clo_I1_cache,
                                   &clo_D1_cache,
                                   &clo_LL_cache)) {}

   else
     return False;

  return True;
}
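
/* Illustrative invocations this parser accepts (option names as listed in
 * cachesim_print_opts above; the cache geometry syntax is handled by
 * VG_(str_clo_cache_opt) and is shown here only as an assumed example of
 * the size,associativity,line-size form):
 *
 *   valgrind --tool=callgrind --simulate-wb=yes --cacheuse=yes ./prog
 *   valgrind --tool=callgrind --D1=32768,8,64 ./prog
 */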

/* Add commas to the decimal representation of n, right justified in a
 * field field_width characters wide; the result is returned in buf. */
static
Int commify(ULong n, int field_width, HChar* buf)
{
   int len, n_commas, i, j, new_len, space;

   VG_(sprintf)(buf, "%llu", n);
   len = VG_(strlen)(buf);
   n_commas = (len - 1) / 3;
   new_len = len + n_commas;
   space = field_width - new_len;

   /* Allow for printing a number in a field_width smaller than its size */
   if (space < 0) space = 0;

   /* Make j = -1 because we copy the '\0' before doing the numbers in groups
    * of three. */
   for (j = -1, i = len ; i >= 0; i--) {
      buf[i + n_commas + space] = buf[i];

      if ((i > 0) && (3 == ++j)) {
         j = 0;
         n_commas--;
         buf[i + n_commas + space] = ',';
      }
   }
   /* Right justify in field. */
   for (i = 0; i < space; i++)  buf[i] = ' ';
   return new_len;
}
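
/* Worked example (illustrative):
 *
 *   HChar buf[16];
 *   Int n = commify(1234567, 12, buf);
 *
 * len = 7, n_commas = 2, new_len = 9, space = 3, so buf ends up as
 * "   1,234,567" (9 significant characters, right justified in a 12-wide
 * field) and the return value is 9. */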

static
void percentify(Int n, Int ex, Int field_width, HChar buf[])
{
   int i, len, space;

   /* ex is a power of ten (10 or 100 in this file) giving the fractional
    * precision. Zero-pad the two-digit fraction so that e.g. a value
    * representing 5.05% does not print as "5.5%". */
   if (ex == 100)
      VG_(sprintf)(buf, "%d.%02d%%", n / ex, n % ex);
   else
      VG_(sprintf)(buf, "%d.%d%%", n / ex, n % ex);
   len = VG_(strlen)(buf);
   space = field_width - len;
   if (space < 0) space = 0;     /* Allow for v. small field_width */
   i = len;

   /* Right justify in field */
   for (     ; i >= 0;    i--)  buf[i + space] = buf[i];
   for (i = 0; i < space; i++)  buf[i] = ' ';
}
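
/* Worked example (illustrative): percentify(2345, 100, 7, buf) splits
 * 2345 into 23 and 45, renders "23.45%" (6 characters), and right
 * justifies it in 7, giving " 23.45%". The callers in
 * cachesim_printstat below pass miss counts scaled by 100 * p and
 * divided by the reference count, with ex = p, so the printed value is
 * a percentage with one (p = 10) or two (p = 100) fractional digits. */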

static
void cachesim_printstat(Int l1, Int l2, Int l3)
{
  FullCost total = CLG_(total_cost), D_total = 0;
  ULong LL_total_m, LL_total_mr, LL_total_mw,
    LL_total, LL_total_r, LL_total_w;
  HChar buf1[RESULTS_BUF_LEN],
    buf2[RESULTS_BUF_LEN],
    buf3[RESULTS_BUF_LEN];
  Int p;

  if ((VG_(clo_verbosity) > 1) && clo_simulate_hwpref) {
    VG_(message)(Vg_DebugMsg, "Prefetch Up:       %llu\n",
		 prefetch_up);
    VG_(message)(Vg_DebugMsg, "Prefetch Down:     %llu\n",
		 prefetch_down);
    VG_(message)(Vg_DebugMsg, "\n");
  }

  commify(total[fullOffset(EG_IR) +1], l1, buf1);
  VG_(message)(Vg_UserMsg, "I1  misses:    %s\n", buf1);

  commify(total[fullOffset(EG_IR) +2], l1, buf1);
  VG_(message)(Vg_UserMsg, "LLi misses:    %s\n", buf1);

  p = 100;

  /* avoid division by zero for an empty run */
  if (0 == total[fullOffset(EG_IR)])
    total[fullOffset(EG_IR)] = 1;

  percentify(total[fullOffset(EG_IR)+1] * 100 * p /
	     total[fullOffset(EG_IR)], p, l1+1, buf1);
  VG_(message)(Vg_UserMsg, "I1  miss rate: %s\n", buf1);

  percentify(total[fullOffset(EG_IR)+2] * 100 * p /
	     total[fullOffset(EG_IR)], p, l1+1, buf1);
  VG_(message)(Vg_UserMsg, "LLi miss rate: %s\n", buf1);
  VG_(message)(Vg_UserMsg, "\n");

  /* D cache results.
   * Use the D_refs.rd and D_refs.wr values to determine the
   * width of columns 2 & 3. */

  D_total = CLG_(get_eventset_cost)( CLG_(sets).full );
  CLG_(init_cost)( CLG_(sets).full, D_total);
  // we only use the first 3 values of D_total, adding up Dr and Dw costs
  CLG_(copy_cost)( CLG_(get_event_set)(EG_DR), D_total, total + fullOffset(EG_DR) );
  CLG_(add_cost) ( CLG_(get_event_set)(EG_DW), D_total, total + fullOffset(EG_DW) );

  commify( D_total[0], l1, buf1);
  commify(total[fullOffset(EG_DR)], l2,  buf2);
  commify(total[fullOffset(EG_DW)], l3,  buf3);
  VG_(message)(Vg_UserMsg, "D   refs:      %s  (%s rd + %s wr)\n",
	       buf1, buf2, buf3);

  commify( D_total[1], l1, buf1);
  commify(total[fullOffset(EG_DR)+1], l2, buf2);
  commify(total[fullOffset(EG_DW)+1], l3, buf3);
  VG_(message)(Vg_UserMsg, "D1  misses:    %s  (%s rd + %s wr)\n",
	       buf1, buf2, buf3);

  commify( D_total[2], l1, buf1);
  commify(total[fullOffset(EG_DR)+2], l2, buf2);
  commify(total[fullOffset(EG_DW)+2], l3, buf3);
  VG_(message)(Vg_UserMsg, "LLd misses:    %s  (%s rd + %s wr)\n",
	       buf1, buf2, buf3);

  p = 10;

  /* avoid division by zero for empty counters */
  if (0 == D_total[0])   D_total[0] = 1;
  if (0 == total[fullOffset(EG_DR)]) total[fullOffset(EG_DR)] = 1;
  if (0 == total[fullOffset(EG_DW)]) total[fullOffset(EG_DW)] = 1;

  percentify( D_total[1] * 100 * p / D_total[0],  p, l1+1, buf1);
  percentify(total[fullOffset(EG_DR)+1] * 100 * p /
	     total[fullOffset(EG_DR)], p, l2+1, buf2);
  percentify(total[fullOffset(EG_DW)+1] * 100 * p /
	     total[fullOffset(EG_DW)], p, l3+1, buf3);
  VG_(message)(Vg_UserMsg, "D1  miss rate: %s (%s   + %s  )\n",
               buf1, buf2, buf3);

  percentify( D_total[2] * 100 * p / D_total[0],  p, l1+1, buf1);
  percentify(total[fullOffset(EG_DR)+2] * 100 * p /
	     total[fullOffset(EG_DR)], p, l2+1, buf2);
  percentify(total[fullOffset(EG_DW)+2] * 100 * p /
	     total[fullOffset(EG_DW)], p, l3+1, buf3);
  VG_(message)(Vg_UserMsg, "LLd miss rate: %s (%s   + %s  )\n",
               buf1, buf2, buf3);
  VG_(message)(Vg_UserMsg, "\n");

  /* LL overall results */

  LL_total   =
    total[fullOffset(EG_DR) +1] +
    total[fullOffset(EG_DW) +1] +
    total[fullOffset(EG_IR) +1];
  LL_total_r =
    total[fullOffset(EG_DR) +1] +
    total[fullOffset(EG_IR) +1];
  LL_total_w = total[fullOffset(EG_DW) +1];
  commify(LL_total,   l1, buf1);
  commify(LL_total_r, l2, buf2);
  commify(LL_total_w, l3, buf3);
  VG_(message)(Vg_UserMsg, "LL refs:       %s  (%s rd + %s wr)\n",
	       buf1, buf2, buf3);

  LL_total_m  =
    total[fullOffset(EG_DR) +2] +
    total[fullOffset(EG_DW) +2] +
    total[fullOffset(EG_IR) +2];
  LL_total_mr =
    total[fullOffset(EG_DR) +2] +
    total[fullOffset(EG_IR) +2];
  LL_total_mw = total[fullOffset(EG_DW) +2];
  commify(LL_total_m,  l1, buf1);
  commify(LL_total_mr, l2, buf2);
  commify(LL_total_mw, l3, buf3);
  VG_(message)(Vg_UserMsg, "LL misses:     %s  (%s rd + %s wr)\n",
	       buf1, buf2, buf3);

  percentify(LL_total_m  * 100 * p /
	     (total[fullOffset(EG_IR)] + D_total[0]),  p, l1+1, buf1);
  percentify(LL_total_mr * 100 * p /
	     (total[fullOffset(EG_IR)] + total[fullOffset(EG_DR)]),
	     p, l2+1, buf2);
  percentify(LL_total_mw * 100 * p /
	     total[fullOffset(EG_DW)], p, l3+1, buf3);
  VG_(message)(Vg_UserMsg, "LL miss rate:  %s (%s   + %s  )\n",
	       buf1, buf2, buf3);
}
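
/* Illustrative terminal output produced by the code above (the numbers
 * are made up; the column widths come from the l1/l2/l3 arguments):
 *
 *   I1  misses:          4,053
 *   LLi misses:          3,660
 *   I1  miss rate:        0.02%
 *   LLi miss rate:        0.02%
 *
 *   D   refs:        1,258,130  (875,559 rd + 382,571 wr)
 *   D1  misses:         14,425  (  9,276 rd +   5,149 wr)
 *   D1  miss rate:         1.1% (    1.0%   +     1.3%  )
 */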


/*------------------------------------------------------------*/
/*--- Setup for Event set.                                 ---*/
/*------------------------------------------------------------*/

struct event_sets CLG_(sets);

void CLG_(init_eventsets)()
{
    // Event groups from which the event sets are composed.
    // The "Use" group is only used with "cacheuse" simulation.
    if (clo_collect_cacheuse)
	CLG_(register_event_group4)(EG_USE,
				    "AcCost1", "SpLoss1", "AcCost2", "SpLoss2");

    if (!CLG_(clo).simulate_cache)
	CLG_(register_event_group)(EG_IR, "Ir");
    else if (!clo_simulate_writeback) {
	CLG_(register_event_group3)(EG_IR, "Ir", "I1mr", "ILmr");
	CLG_(register_event_group3)(EG_DR, "Dr", "D1mr", "DLmr");
	CLG_(register_event_group3)(EG_DW, "Dw", "D1mw", "DLmw");
    }
    else { // clo_simulate_writeback
	CLG_(register_event_group4)(EG_IR, "Ir", "I1mr", "ILmr", "ILdmr");
        CLG_(register_event_group4)(EG_DR, "Dr", "D1mr", "DLmr", "DLdmr");
        CLG_(register_event_group4)(EG_DW, "Dw", "D1mw", "DLmw", "DLdmw");
    }

    if (CLG_(clo).simulate_branch) {
        CLG_(register_event_group2)(EG_BC, "Bc", "Bcm");
        CLG_(register_event_group2)(EG_BI, "Bi", "Bim");
    }

    if (CLG_(clo).collect_bus)
	CLG_(register_event_group)(EG_BUS, "Ge");

    if (CLG_(clo).collect_alloc)
	CLG_(register_event_group2)(EG_ALLOC, "allocCount", "allocSize");

    if (CLG_(clo).collect_systime)
	CLG_(register_event_group2)(EG_SYS, "sysCount", "sysTime");

    // event set used as base for instruction self cost
    CLG_(sets).base = CLG_(get_event_set2)(EG_USE, EG_IR);

    // event set comprising all event groups, used for inclusive cost
    CLG_(sets).full = CLG_(add_event_group2)(CLG_(sets).base, EG_DR, EG_DW);
    CLG_(sets).full = CLG_(add_event_group2)(CLG_(sets).full, EG_BC, EG_BI);
    CLG_(sets).full = CLG_(add_event_group) (CLG_(sets).full, EG_BUS);
    CLG_(sets).full = CLG_(add_event_group2)(CLG_(sets).full, EG_ALLOC, EG_SYS);

    CLG_DEBUGIF(1) {
	CLG_DEBUG(1, "EventSets:\n");
	CLG_(print_eventset)(-2, CLG_(sets).base);
	CLG_(print_eventset)(-2, CLG_(sets).full);
    }

    /* Events that were not registered above are silently ignored here */
    CLG_(dumpmap) = CLG_(get_eventmapping)(CLG_(sets).full);
    CLG_(append_event)(CLG_(dumpmap), "Ir");
    CLG_(append_event)(CLG_(dumpmap), "Dr");
    CLG_(append_event)(CLG_(dumpmap), "Dw");
    CLG_(append_event)(CLG_(dumpmap), "I1mr");
    CLG_(append_event)(CLG_(dumpmap), "D1mr");
    CLG_(append_event)(CLG_(dumpmap), "D1mw");
    CLG_(append_event)(CLG_(dumpmap), "ILmr");
    CLG_(append_event)(CLG_(dumpmap), "DLmr");
    CLG_(append_event)(CLG_(dumpmap), "DLmw");
    CLG_(append_event)(CLG_(dumpmap), "ILdmr");
    CLG_(append_event)(CLG_(dumpmap), "DLdmr");
    CLG_(append_event)(CLG_(dumpmap), "DLdmw");
    CLG_(append_event)(CLG_(dumpmap), "Bc");
    CLG_(append_event)(CLG_(dumpmap), "Bcm");
    CLG_(append_event)(CLG_(dumpmap), "Bi");
    CLG_(append_event)(CLG_(dumpmap), "Bim");
    CLG_(append_event)(CLG_(dumpmap), "AcCost1");
    CLG_(append_event)(CLG_(dumpmap), "SpLoss1");
    CLG_(append_event)(CLG_(dumpmap), "AcCost2");
    CLG_(append_event)(CLG_(dumpmap), "SpLoss2");
    CLG_(append_event)(CLG_(dumpmap), "Ge");
    CLG_(append_event)(CLG_(dumpmap), "allocCount");
    CLG_(append_event)(CLG_(dumpmap), "allocSize");
    CLG_(append_event)(CLG_(dumpmap), "sysCount");
    CLG_(append_event)(CLG_(dumpmap), "sysTime");
}
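
/* Illustrative composition (assuming plain cache simulation without
 * write-back, cacheuse, branch, bus, alloc or systime collection):
 * only the EG_IR/EG_DR/EG_DW groups get registered above, so
 *
 *   base: Ir I1mr ILmr
 *   full: Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw
 *
 * Groups that were never registered contribute no events, which is why
 * the unconditional append_event calls above are harmless. */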


/* this is called at dump time for every instruction executed */
static void cachesim_add_icost(SimCost cost, BBCC* bbcc,
			       InstrInfo* ii, ULong exe_count)
{
    if (!CLG_(clo).simulate_cache)
	cost[ fullOffset(EG_IR) ] += exe_count;

    if (ii->eventset)
	CLG_(add_and_zero_cost2)( CLG_(sets).full, cost,
				  ii->eventset, bbcc->cost + ii->cost_offset);
}

static
void cachesim_finish(void)
{
  if (clo_collect_cacheuse)
    cacheuse_finish();
}

/*------------------------------------------------------------*/
/*--- The simulator defined in this file                   ---*/
/*------------------------------------------------------------*/

struct cachesim_if CLG_(cachesim) = {
  .print_opts    = cachesim_print_opts,
  .parse_opt     = cachesim_parse_opt,
  .post_clo_init = cachesim_post_clo_init,
  .clear         = cachesim_clear,
  .getdesc       = cachesim_getdesc,
  .printstat     = cachesim_printstat,
  .add_icost     = cachesim_add_icost,
  .finish        = cachesim_finish,

  /* these will be set by cachesim_post_clo_init */
  .log_1I0D        = 0,
  .log_2I0D        = 0,
  .log_3I0D        = 0,

  .log_1I1Dr       = 0,
  .log_1I1Dw       = 0,

  .log_0I1Dr       = 0,
  .log_0I1Dw       = 0,

  .log_1I0D_name = "(no function)",
  .log_2I0D_name = "(no function)",
  .log_3I0D_name = "(no function)",

  .log_1I1Dr_name = "(no function)",
  .log_1I1Dw_name = "(no function)",

  .log_0I1Dr_name = "(no function)",
  .log_0I1Dw_name = "(no function)",
};
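
/* Sketch of how the Callgrind core is expected to drive this interface
 * (illustrative; the exact call sites live outside this file):
 *
 *   CLG_(cachesim).parse_opt(arg);       // while processing the command line
 *   CLG_(cachesim).post_clo_init();      // pick handlers, build the caches
 *   // ... instrumented run, invoking the log_* handlers set up there ...
 *   CLG_(cachesim).printstat(l1, l2, l3);// summary statistics at exit
 */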


/*--------------------------------------------------------------------*/
/*--- end                                                    sim.c ---*/
/*--------------------------------------------------------------------*/