/*--------------------------------------------------------------------*/
/*--- Cache simulation.                                            ---*/
/*---                                                        sim.c ---*/
/*--------------------------------------------------------------------*/

/*
   This file is part of Callgrind, a Valgrind tool for call graph
   profiling programs.

   Copyright (C) 2003-2010, Josef Weidendorfer (Josef.Weidendorfer@gmx.de)

   This tool is derived from and contains code from Cachegrind
   Copyright (C) 2002-2010 Nicholas Nethercote (njn@valgrind.org)

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307, USA.

   The GNU General Public License is contained in the file COPYING.
*/

#include "global.h"


/* Notes:
  - simulates a write-allocate cache
  - (block --> set) hash function uses simple bit selection
  - handling of references straddling two cache blocks:
      - counts as only one cache access (not two)
      - both blocks hit                  --> one hit
      - one block hits, the other misses --> one miss
      - both blocks miss                 --> one miss (not two)
*/
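/* For example, a 4-byte access whose last byte falls into the next
 * cache block touches two blocks but is counted as a single access;
 * even if both blocks miss, only one miss is recorded. */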

/* Cache configuration */
#include "cg_arch.h"

/* Additional structures for cache use info, separated
 * according to usage frequency:
 * - line_loaded : pointer to cost center of instruction
 *                 which loaded the line into cache.
 *                 Needed to increment counters when line is evicted.
 * - line_use    : updated on every access
 */
typedef struct {
  UInt count;
  UInt mask; /* e.g. for 64 byte line size: 1 bit per 2 bytes */
} line_use;

typedef struct {
  Addr memline, iaddr;
  line_use* dep_use; /* points to the higher-level cache block for this memline */
  ULong* use_base;
} line_loaded;

/* Cache state */
typedef struct {
   char*        name;
   int          size;                   /* bytes */
   int          assoc;
   int          line_size;              /* bytes */
   Bool         sectored;  /* prefetch nearside cacheline on read */
   int          sets;
   int          sets_min_1;
   int          line_size_bits;
   int          tag_shift;
   UWord        tag_mask;
   char         desc_line[128];
   UWord*       tags;

  /* for cache use */
   int          line_size_mask;
   int*         line_start_mask;
   int*         line_end_mask;
   line_loaded* loaded;
   line_use*    use;
} cache_t2;

/*
 * States of flat caches in our model.
 * We use a 2-level hierarchy.
 */
static cache_t2 I1, D1, LL;

/* Lower bits of cache tags are used as flags for a cache line */
#define CACHELINE_FLAGMASK (MIN_LINE_SIZE-1)
#define CACHELINE_DIRTY    1
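/* Example: with MIN_LINE_SIZE 16, the low 4 bits of every stored tag
 * word are zero and hence free for flags; bit 0 carries the dirty
 * state in the write-back model below. */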


/* Cache simulator options */
static Bool clo_simulate_writeback = False;
static Bool clo_simulate_hwpref = False;
static Bool clo_simulate_sectors = False;
static Bool clo_collect_cacheuse = False;

/* The following global vars are set up beforehand by setup_bbcc():
 *
 * - Addr   CLG_(bb_base)     (instruction start address of original BB)
 * - ULong* CLG_(cost_base)   (start of cost array for BB)
 */

Addr   CLG_(bb_base);
ULong* CLG_(cost_base);

static InstrInfo* current_ii;

/* Cache use offsets */
/* The offsets are only correct because all per-instruction event sets get
 * the "Use" set added first!
 */
static Int off_I1_AcCost  = 0;
static Int off_I1_SpLoss  = 1;
static Int off_D1_AcCost  = 0;
static Int off_D1_SpLoss  = 1;
static Int off_LL_AcCost  = 2;
static Int off_LL_SpLoss  = 3;

/* Cache access types */
typedef enum { Read = 0, Write = CACHELINE_DIRTY } RefType;

/* Result of a reference into a flat cache */
typedef enum { Hit  = 0, Miss, MissDirty } CacheResult;

/* Result of a reference into a hierarchical cache model */
typedef enum {
    L1_Hit,
    LL_Hit,
    MemAccess,
    WriteBackMemAccess } CacheModelResult;

typedef CacheModelResult (*simcall_type)(Addr, UChar);

static struct {
    simcall_type I1_Read;
    simcall_type D1_Read;
    simcall_type D1_Write;
} simulator;
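
/* The simulator struct is filled in by cachesim_post_clo_init() below;
 * the log_* handlers then dispatch through it, e.g.
 *   res = (*simulator.I1_Read)(CLG_(bb_base) + ii->instr_offset,
 *                              ii->instr_size);
 */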

/*------------------------------------------------------------*/
/*--- Cache Simulator Initialization                       ---*/
/*------------------------------------------------------------*/

static void cachesim_clearcache(cache_t2* c)
{
  Int i;

  for (i = 0; i < c->sets * c->assoc; i++)
    c->tags[i] = 0;
  if (c->use) {
    for (i = 0; i < c->sets * c->assoc; i++) {
      c->loaded[i].memline  = 0;
      c->loaded[i].use_base = 0;
      c->loaded[i].dep_use = 0;
      c->loaded[i].iaddr = 0;
      c->use[i].mask    = 0;
      c->use[i].count   = 0;
      c->tags[i] = i % c->assoc; /* init lower bits as pointer */
    }
  }
}

static void cacheuse_initcache(cache_t2* c);

/* By this point, the size/assoc/line_size has been checked. */
static void cachesim_initcache(cache_t config, cache_t2* c)
{
   c->size      = config.size;
   c->assoc     = config.assoc;
   c->line_size = config.line_size;
   c->sectored  = False; // FIXME

   c->sets           = (c->size / c->line_size) / c->assoc;
   c->sets_min_1     = c->sets - 1;
   c->line_size_bits = VG_(log2)(c->line_size);
   c->tag_shift      = c->line_size_bits + VG_(log2)(c->sets);
   c->tag_mask       = ~((1<<c->tag_shift)-1);
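
   /* Worked example, assuming a 64 KiB, 2-way cache with 64 B lines:
    *   sets           = 65536/64/2 = 512
    *   line_size_bits = 6
    *   tag_shift      = 6 + 9 = 15
    *   tag_mask       = ~0x7fff
    */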

   /* Can bits in tag entries be used for flags?
    * Should always be true, as MIN_LINE_SIZE >= 16 */
   CLG_ASSERT( (c->tag_mask & CACHELINE_FLAGMASK) == 0);

   if (c->assoc == 1) {
      VG_(sprintf)(c->desc_line, "%d B, %d B, direct-mapped%s",
		   c->size, c->line_size,
		   c->sectored ? ", sectored":"");
   } else {
      VG_(sprintf)(c->desc_line, "%d B, %d B, %d-way associative%s",
		   c->size, c->line_size, c->assoc,
		   c->sectored ? ", sectored":"");
   }

   c->tags = (UWord*) CLG_MALLOC("cl.sim.cs_ic.1",
                                 sizeof(UWord) * c->sets * c->assoc);
   if (clo_collect_cacheuse)
       cacheuse_initcache(c);
   else
     c->use = 0;
   cachesim_clearcache(c);
}


#if 0
static void print_cache(cache_t2* c)
{
   UInt set, way, i;

   /* Note initialisation and update of 'i'. */
   for (i = 0, set = 0; set < c->sets; set++) {
      for (way = 0; way < c->assoc; way++, i++) {
         VG_(printf)("%8x ", c->tags[i]);
      }
      VG_(printf)("\n");
   }
}
#endif


/*------------------------------------------------------------*/
/*--- Write Through Cache Simulation                       ---*/
/*------------------------------------------------------------*/

/*
 * Simple model: L1 & LL Write Through
 * Does not distinguish between read and write references.
 *
 * Simulator functions:
 *  CacheModelResult cachesim_I1_ref(Addr a, UChar size)
 *  CacheModelResult cachesim_D1_ref(Addr a, UChar size)
 */

static __inline__
CacheResult cachesim_setref(cache_t2* c, UInt set_no, UWord tag)
{
    int i, j;
    UWord *set;

    set = &(c->tags[set_no * c->assoc]);

    /* This loop is unrolled for just the first case, which is the most */
    /* common.  We can't unroll any further because it would screw up   */
    /* if we have a direct-mapped (1-way) cache.                        */
    if (tag == set[0])
        return Hit;

    /* If the tag is one other than the MRU, move it into the MRU spot  */
    /* and shuffle the rest down.                                       */
    for (i = 1; i < c->assoc; i++) {
        if (tag == set[i]) {
            for (j = i; j > 0; j--) {
                set[j] = set[j - 1];
            }
            set[0] = tag;
            return Hit;
        }
    }

    /* A miss;  install this tag as MRU, shuffle rest down. */
    for (j = c->assoc - 1; j > 0; j--) {
        set[j] = set[j - 1];
    }
    set[0] = tag;

    return Miss;
}

static CacheResult cachesim_ref(cache_t2* c, Addr a, UChar size)
{
    UInt  set1 = ( a         >> c->line_size_bits) & (c->sets_min_1);
    UInt  set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1);
    UWord tag  = a >> c->tag_shift;

    /* Access entirely within line. */
    if (set1 == set2)
	return cachesim_setref(c, set1, tag);

    /* Access straddles two lines. */
    /* Nb: this is a fast way of doing ((set1+1) % c->sets) */
    else if (((set1 + 1) & (c->sets-1)) == set2) {
	UWord tag2  = (a+size-1) >> c->tag_shift;

	/* the calls update cache structures as a side effect */
	CacheResult res1 =  cachesim_setref(c, set1, tag);
	CacheResult res2 =  cachesim_setref(c, set2, tag2);
	return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit;

   } else {
       VG_(printf)("addr: %lx  size: %u  sets: %d %d", a, size, set1, set2);
       VG_(tool_panic)("item straddles more than two cache sets");
   }
   return Hit;
}
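
/* Example, assuming 512 sets and 64 B lines: a 2-byte access at
 * address 0x1003f ends at 0x10040, so set1 = 0 and set2 = 1; this is
 * the straddling case above, where both lines are referenced but a
 * single combined result is returned. */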

static
CacheModelResult cachesim_I1_ref(Addr a, UChar size)
{
    if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
    if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
    return MemAccess;
}

static
CacheModelResult cachesim_D1_ref(Addr a, UChar size)
{
    if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
    if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
    return MemAccess;
}


/*------------------------------------------------------------*/
/*--- Write Back Cache Simulation                          ---*/
/*------------------------------------------------------------*/

/*
 * More complex model: L1 Write-through, LL Write-back
 * This needs to distinguish between read and write references.
 *
 * Simulator functions:
 *  CacheModelResult cachesim_I1_Read(Addr a, UChar size)
 *  CacheModelResult cachesim_D1_Read(Addr a, UChar size)
 *  CacheModelResult cachesim_D1_Write(Addr a, UChar size)
 */

/*
 * With write-back, a result can be a miss evicting a dirty line.
 * The dirty state of a cache line is stored in Bit0 of the tag for
 * this cache line (CACHELINE_DIRTY = 1). By OR'ing the reference
 * type (Read/Write), the line gets dirty on a write.
 */
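/* E.g. a Write hit ORs CACHELINE_DIRTY into the stored tag word; when
 * that word is later evicted by a miss, the set bit turns the result
 * into MissDirty, which callers report as WriteBackMemAccess. */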
static __inline__
CacheResult cachesim_setref_wb(cache_t2* c, RefType ref, UInt set_no, UWord tag)
{
    int i, j;
    UWord *set, tmp_tag;

    set = &(c->tags[set_no * c->assoc]);

    /* This loop is unrolled for just the first case, which is the most */
    /* common.  We can't unroll any further because it would screw up   */
    /* if we have a direct-mapped (1-way) cache.                        */
    if (tag == (set[0] & ~CACHELINE_DIRTY)) {
	set[0] |= ref;
        return Hit;
    }
    /* If the tag is one other than the MRU, move it into the MRU spot  */
    /* and shuffle the rest down.                                       */
    for (i = 1; i < c->assoc; i++) {
	if (tag == (set[i] & ~CACHELINE_DIRTY)) {
	    tmp_tag = set[i] | ref; // update dirty flag
            for (j = i; j > 0; j--) {
                set[j] = set[j - 1];
            }
            set[0] = tmp_tag;
            return Hit;
        }
    }

    /* A miss;  install this tag as MRU, shuffle rest down. */
    tmp_tag = set[c->assoc - 1];
    for (j = c->assoc - 1; j > 0; j--) {
        set[j] = set[j - 1];
    }
    set[0] = tag | ref;

    return (tmp_tag & CACHELINE_DIRTY) ? MissDirty : Miss;
}


static __inline__
CacheResult cachesim_ref_wb(cache_t2* c, RefType ref, Addr a, UChar size)
{
    UInt set1 = ( a         >> c->line_size_bits) & (c->sets_min_1);
    UInt set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1);
    UWord tag = a & c->tag_mask;

    /* Access entirely within line. */
    if (set1 == set2)
	return cachesim_setref_wb(c, ref, set1, tag);

    /* Access straddles two lines. */
    /* Nb: this is a fast way of doing ((set1+1) % c->sets) */
    else if (((set1 + 1) & (c->sets-1)) == set2) {
	UWord tag2  = (a+size-1) & c->tag_mask;

	/* the calls update cache structures as a side effect */
	CacheResult res1 =  cachesim_setref_wb(c, ref, set1, tag);
	CacheResult res2 =  cachesim_setref_wb(c, ref, set2, tag2);

	if ((res1 == MissDirty) || (res2 == MissDirty)) return MissDirty;
	return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit;

   } else {
       VG_(printf)("addr: %lx  size: %u  sets: %d %d", a, size, set1, set2);
       VG_(tool_panic)("item straddles more than two cache sets");
   }
   return Hit;
}


static
CacheModelResult cachesim_I1_Read(Addr a, UChar size)
{
    if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
    switch( cachesim_ref_wb( &LL, Read, a, size) ) {
	case Hit: return LL_Hit;
	case Miss: return MemAccess;
	default: break;
    }
    return WriteBackMemAccess;
}

static
CacheModelResult cachesim_D1_Read(Addr a, UChar size)
{
    if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
    switch( cachesim_ref_wb( &LL, Read, a, size) ) {
	case Hit: return LL_Hit;
	case Miss: return MemAccess;
	default: break;
    }
    return WriteBackMemAccess;
}

static
CacheModelResult cachesim_D1_Write(Addr a, UChar size)
{
    if ( cachesim_ref( &D1, a, size) == Hit ) {
	/* Even for a L1 hit, the write-through L1 passes
	 * the write to the LL to make the LL line dirty.
	 * But this causes no latency, so return the hit.
	 */
	cachesim_ref_wb( &LL, Write, a, size);
	return L1_Hit;
    }
    switch( cachesim_ref_wb( &LL, Write, a, size) ) {
	case Hit: return LL_Hit;
	case Miss: return MemAccess;
	default: break;
    }
    return WriteBackMemAccess;
}


/*------------------------------------------------------------*/
/*--- Hardware Prefetch Simulation                         ---*/
/*------------------------------------------------------------*/

static ULong prefetch_up = 0;
static ULong prefetch_down = 0;

#define PF_STREAMS  8
#define PF_PAGEBITS 12

static UInt pf_lastblock[PF_STREAMS];
static Int  pf_seqblocks[PF_STREAMS];

static
void prefetch_clear(void)
{
  int i;
  for(i=0;i<PF_STREAMS;i++)
    pf_lastblock[i] = pf_seqblocks[i] = 0;
}

/*
 * HW Prefetch emulation
 * Start prefetching when detecting sequential access to 3 memory blocks.
 * One stream can be detected per 4k page.
 */
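/* E.g. accesses to blocks b, b+1, b+2 of the same page raise
 * pf_seqblocks[] to 2 and trigger a prefetch of the line
 * 5 * LL.line_size ahead (prefetch_up); the mirrored branch handles
 * descending streams (prefetch_down). */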
static __inline__
void prefetch_LL_doref(Addr a)
{
  UInt stream = (a >> PF_PAGEBITS) % PF_STREAMS;
  UInt block = ( a >> LL.line_size_bits);

  if (block != pf_lastblock[stream]) {
    if (pf_seqblocks[stream] == 0) {
      if (pf_lastblock[stream] +1 == block) pf_seqblocks[stream]++;
      else if (pf_lastblock[stream] -1 == block) pf_seqblocks[stream]--;
    }
    else if (pf_seqblocks[stream] >0) {
      if (pf_lastblock[stream] +1 == block) {
	pf_seqblocks[stream]++;
	if (pf_seqblocks[stream] >= 2) {
	  prefetch_up++;
	  cachesim_ref(&LL, a + 5 * LL.line_size,1);
	}
      }
      else pf_seqblocks[stream] = 0;
    }
    else if (pf_seqblocks[stream] <0) {
      if (pf_lastblock[stream] -1 == block) {
	pf_seqblocks[stream]--;
	if (pf_seqblocks[stream] <= -2) {
	  prefetch_down++;
	  cachesim_ref(&LL, a - 5 * LL.line_size,1);
	}
      }
      else pf_seqblocks[stream] = 0;
    }
    pf_lastblock[stream] = block;
  }
}

/* simple model with hardware prefetch */

static
CacheModelResult prefetch_I1_ref(Addr a, UChar size)
{
    if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
    prefetch_LL_doref(a);
    if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
    return MemAccess;
}

static
CacheModelResult prefetch_D1_ref(Addr a, UChar size)
{
    if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
    prefetch_LL_doref(a);
    if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
    return MemAccess;
}


/* complex model with hardware prefetch */

static
CacheModelResult prefetch_I1_Read(Addr a, UChar size)
{
    if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
    prefetch_LL_doref(a);
    switch( cachesim_ref_wb( &LL, Read, a, size) ) {
	case Hit: return LL_Hit;
	case Miss: return MemAccess;
	default: break;
    }
    return WriteBackMemAccess;
}

static
CacheModelResult prefetch_D1_Read(Addr a, UChar size)
{
    if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
    prefetch_LL_doref(a);
    switch( cachesim_ref_wb( &LL, Read, a, size) ) {
	case Hit: return LL_Hit;
	case Miss: return MemAccess;
	default: break;
    }
    return WriteBackMemAccess;
}

static
CacheModelResult prefetch_D1_Write(Addr a, UChar size)
{
    prefetch_LL_doref(a);
    if ( cachesim_ref( &D1, a, size) == Hit ) {
	/* Even for a L1 hit, the write-through L1 passes
	 * the write to the LL to make the LL line dirty.
	 * But this causes no latency, so return the hit.
	 */
	cachesim_ref_wb( &LL, Write, a, size);
	return L1_Hit;
    }
    switch( cachesim_ref_wb( &LL, Write, a, size) ) {
	case Hit: return LL_Hit;
	case Miss: return MemAccess;
	default: break;
    }
    return WriteBackMemAccess;
}


/*------------------------------------------------------------*/
/*--- Cache Simulation with use metric collection          ---*/
/*------------------------------------------------------------*/

/* cannot be combined with write-back or prefetch */

static
void cacheuse_initcache(cache_t2* c)
{
    int i;
    unsigned int start_mask, start_val;
    unsigned int end_mask, end_val;

    c->use    = CLG_MALLOC("cl.sim.cu_ic.1",
                           sizeof(line_use) * c->sets * c->assoc);
    c->loaded = CLG_MALLOC("cl.sim.cu_ic.2",
                           sizeof(line_loaded) * c->sets * c->assoc);
    c->line_start_mask = CLG_MALLOC("cl.sim.cu_ic.3",
                                    sizeof(int) * c->line_size);
    c->line_end_mask = CLG_MALLOC("cl.sim.cu_ic.4",
                                  sizeof(int) * c->line_size);

    c->line_size_mask = c->line_size-1;

    /* Meaning of line_start_mask/line_end_mask
     * Example: for a given cache line, you get an access starting at
     * byte offset 5, length 4, so bytes 5 - 8 are touched. For a cache
     * line size of 32, you have 1 bit per byte in the mask:
     *
     *   bit31   bit8 bit5  bit 0
     *       |      |  |    |
     *       11..111111100000   line_start_mask[5]
     *       00..000111111111   line_end_mask[(5+4)-1]
     *
     *  use_mask |= line_start_mask[5] & line_end_mask[8]
     *
     */
    start_val = end_val = ~0;
    if (c->line_size < 32) {
	int bits_per_byte = 32/c->line_size;
	start_mask = (1<<bits_per_byte)-1;
	end_mask   = start_mask << (32-bits_per_byte);
	for(i=0;i<c->line_size;i++) {
	    c->line_start_mask[i] = start_val;
	    start_val  = start_val & ~start_mask;
	    start_mask = start_mask << bits_per_byte;

	    c->line_end_mask[c->line_size-i-1] = end_val;
	    end_val  = end_val & ~end_mask;
	    end_mask = end_mask >> bits_per_byte;
	}
    }
    else {
	int bytes_per_bit = c->line_size/32;
	start_mask = 1;
	end_mask   = 1 << 31;
	for(i=0;i<c->line_size;i++) {
	    c->line_start_mask[i] = start_val;
	    c->line_end_mask[c->line_size-i-1] = end_val;
	    if ( ((i+1)%bytes_per_bit) == 0) {
		start_val   &= ~start_mask;
		end_val     &= ~end_mask;
		start_mask <<= 1;
		end_mask   >>= 1;
	    }
	}
    }

    CLG_DEBUG(6, "Config %s:\n", c->desc_line);
    for(i=0;i<c->line_size;i++) {
	CLG_DEBUG(6, " [%2d]: start mask %8x, end mask %8x\n",
		  i, c->line_start_mask[i], c->line_end_mask[i]);
    }

    /* We use lower tag bits as offset pointers to cache use info.
     * I.e. some cache parameters don't work.
     */
    if ( (1<<c->tag_shift) < c->assoc) {
	VG_(message)(Vg_DebugMsg,
		     "error: Use associativity < %d for cache use statistics!\n",
		     (1<<c->tag_shift) );
	VG_(tool_panic)("Unsupported cache configuration");
    }
}


/* for I1/D1 caches */
#define CACHEUSE(L)                                                         \
                                                                            \
static CacheModelResult cacheuse##_##L##_doRead(Addr a, UChar size)         \
{                                                                           \
   UInt set1 = ( a         >> L.line_size_bits) & (L.sets_min_1);           \
   UInt set2 = ((a+size-1) >> L.line_size_bits) & (L.sets_min_1);           \
   UWord tag  = a & L.tag_mask;                                             \
   UWord tag2;                                                              \
   int i, j, idx;                                                           \
   UWord *set, tmp_tag;                                                     \
   UInt use_mask;                                                           \
                                                                            \
   CLG_DEBUG(6,"%s.Acc(Addr %#lx, size %d): Sets [%d/%d]\n",                \
	    L.name, a, size, set1, set2);                                   \
                                                                            \
   /* First case: word entirely within line. */                             \
   if (set1 == set2) {                                                      \
                                                                            \
      set = &(L.tags[set1 * L.assoc]);                                      \
      use_mask = L.line_start_mask[a & L.line_size_mask] &                  \
	         L.line_end_mask[(a+size-1) & L.line_size_mask];            \
                                                                            \
      /* This loop is unrolled for just the first case, which is the most */\
      /* common.  We can't unroll any further because it would screw up   */\
      /* if we have a direct-mapped (1-way) cache.                        */\
      if (tag == (set[0] & L.tag_mask)) {                                   \
        idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask);                    \
        L.use[idx].count ++;                                                \
        L.use[idx].mask |= use_mask;                                        \
	CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
		 idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,          \
		 use_mask, L.use[idx].mask, L.use[idx].count);              \
	return L1_Hit;                                                      \
      }                                                                     \
      /* If the tag is one other than the MRU, move it into the MRU spot  */\
      /* and shuffle the rest down.                                       */\
      for (i = 1; i < L.assoc; i++) {                                       \
	 if (tag == (set[i] & L.tag_mask)) {                                \
  	    tmp_tag = set[i];                                               \
            for (j = i; j > 0; j--) {                                       \
               set[j] = set[j - 1];                                         \
            }                                                               \
            set[0] = tmp_tag;                                               \
            idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask);               \
            L.use[idx].count ++;                                            \
            L.use[idx].mask |= use_mask;                                    \
	CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
		 i, idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,       \
		 use_mask, L.use[idx].mask, L.use[idx].count);              \
            return L1_Hit;                                                  \
         }                                                                  \
      }                                                                     \
                                                                            \
      /* A miss;  install this tag as MRU, shuffle rest down. */            \
      tmp_tag = set[L.assoc - 1] & ~L.tag_mask;                             \
      for (j = L.assoc - 1; j > 0; j--) {                                   \
         set[j] = set[j - 1];                                               \
      }                                                                     \
      set[0] = tag | tmp_tag;                                               \
      idx = (set1 * L.assoc) + tmp_tag;                                     \
      return update_##L##_use(&L, idx,                                      \
		       use_mask, a &~ L.line_size_mask);                    \
                                                                            \
   /* Second case: word straddles two lines. */                             \
   /* Nb: this is a fast way of doing ((set1+1) % L.sets) */                \
   } else if (((set1 + 1) & (L.sets-1)) == set2) {                          \
      Int miss1=0, miss2=0; /* 0: L1 hit, 1:L1 miss, 2:LL miss */           \
      set = &(L.tags[set1 * L.assoc]);                                      \
      use_mask = L.line_start_mask[a & L.line_size_mask];                   \
      if (tag == (set[0] & L.tag_mask)) {                                   \
         idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask);                   \
         L.use[idx].count ++;                                               \
         L.use[idx].mask |= use_mask;                                       \
	CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
		 idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,          \
		 use_mask, L.use[idx].mask, L.use[idx].count);              \
         goto block2;                                                       \
      }                                                                     \
      for (i = 1; i < L.assoc; i++) {                                       \
	 if (tag == (set[i] & L.tag_mask)) {                                \
  	    tmp_tag = set[i];                                               \
            for (j = i; j > 0; j--) {                                       \
               set[j] = set[j - 1];                                         \
            }                                                               \
            set[0] = tmp_tag;                                               \
            idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask);               \
            L.use[idx].count ++;                                            \
            L.use[idx].mask |= use_mask;                                    \
	CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
		 i, idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,       \
		 use_mask, L.use[idx].mask, L.use[idx].count);              \
            goto block2;                                                    \
         }                                                                  \
      }                                                                     \
      tmp_tag = set[L.assoc - 1] & ~L.tag_mask;                             \
      for (j = L.assoc - 1; j > 0; j--) {                                   \
         set[j] = set[j - 1];                                               \
      }                                                                     \
      set[0] = tag | tmp_tag;                                               \
      idx = (set1 * L.assoc) + tmp_tag;                                     \
      miss1 = update_##L##_use(&L, idx,                                     \
		       use_mask, a &~ L.line_size_mask);                    \
block2:                                                                     \
      set = &(L.tags[set2 * L.assoc]);                                      \
      use_mask = L.line_end_mask[(a+size-1) & L.line_size_mask];            \
      tag2  = (a+size-1) & L.tag_mask;                                      \
      if (tag2 == (set[0] & L.tag_mask)) {                                  \
         idx = (set2 * L.assoc) + (set[0] & ~L.tag_mask);                   \
         L.use[idx].count ++;                                               \
         L.use[idx].mask |= use_mask;                                       \
	CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
		 idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,          \
		 use_mask, L.use[idx].mask, L.use[idx].count);              \
         return miss1;                                                      \
      }                                                                     \
      for (i = 1; i < L.assoc; i++) {                                       \
	 if (tag2 == (set[i] & L.tag_mask)) {                               \
  	    tmp_tag = set[i];                                               \
            for (j = i; j > 0; j--) {                                       \
               set[j] = set[j - 1];                                         \
            }                                                               \
            set[0] = tmp_tag;                                               \
            idx = (set2 * L.assoc) + (tmp_tag & ~L.tag_mask);               \
            L.use[idx].count ++;                                            \
            L.use[idx].mask |= use_mask;                                    \
	CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
		 i, idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,       \
		 use_mask, L.use[idx].mask, L.use[idx].count);              \
            return miss1;                                                   \
         }                                                                  \
      }                                                                     \
      tmp_tag = set[L.assoc - 1] & ~L.tag_mask;                             \
      for (j = L.assoc - 1; j > 0; j--) {                                   \
         set[j] = set[j - 1];                                               \
      }                                                                     \
      set[0] = tag2 | tmp_tag;                                              \
      idx = (set2 * L.assoc) + tmp_tag;                                     \
      miss2 = update_##L##_use(&L, idx,                                     \
		       use_mask, (a+size-1) &~ L.line_size_mask);           \
      return (miss1==MemAccess || miss2==MemAccess) ? MemAccess:LL_Hit;     \
                                                                            \
   } else {                                                                 \
       VG_(printf)("addr: %#lx  size: %u  sets: %d %d", a, size, set1, set2); \
       VG_(tool_panic)("item straddles more than two cache sets");          \
   }                                                                        \
   return 0;                                                                \
}


/* logarithmic bitcounting algorithm, see
 * http://graphics.stanford.edu/~seander/bithacks.html
 */
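/* e.g. countBits(0xf1) == 5 (five bits set in 11110001b) */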
static __inline__ unsigned int countBits(unsigned int bits)
{
  unsigned int c; // store the total here
  const int S[] = {1, 2, 4, 8, 16}; // Magic Binary Numbers
  const int B[] = {0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF, 0x0000FFFF};

  c = bits;
  c = ((c >> S[0]) & B[0]) + (c & B[0]);
  c = ((c >> S[1]) & B[1]) + (c & B[1]);
  c = ((c >> S[2]) & B[2]) + (c & B[2]);
  c = ((c >> S[3]) & B[3]) + (c & B[3]);
  c = ((c >> S[4]) & B[4]) + (c & B[4]);
  return c;
}

static void update_LL_use(int idx, Addr memline)
{
  line_loaded* loaded = &(LL.loaded[idx]);
  line_use* use = &(LL.use[idx]);
  int i = ((32 - countBits(use->mask)) * LL.line_size)>>5;
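  /* The 32-bit use mask covers the whole line, so each zero bit stands
   * for line_size/32 bytes that were loaded but never touched: i is the
   * spatial loss in bytes. */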

  CLG_DEBUG(2, " LL.miss [%d]: at %#lx accessing memline %#lx\n",
           idx, CLG_(bb_base) + current_ii->instr_offset, memline);
  if (use->count>0) {
    CLG_DEBUG(2, "   old: used %d, loss bits %d (%08x) [line %#lx from %#lx]\n",
	     use->count, i, use->mask, loaded->memline, loaded->iaddr);
    CLG_DEBUG(2, "   collect: %d, use_base %p\n",
	     CLG_(current_state).collect, loaded->use_base);

    if (CLG_(current_state).collect && loaded->use_base) {
      (loaded->use_base)[off_LL_AcCost] += 1000 / use->count;
      (loaded->use_base)[off_LL_SpLoss] += i;
    }
  }

  use->count = 0;
  use->mask  = 0;

  loaded->memline = memline;
  loaded->iaddr   = CLG_(bb_base) + current_ii->instr_offset;
  loaded->use_base = (CLG_(current_state).nonskipped) ?
    CLG_(current_state).nonskipped->skipped :
    CLG_(cost_base) + current_ii->cost_offset;
}

static
CacheModelResult cacheuse_LL_access(Addr memline, line_loaded* l1_loaded)
{
   UInt setNo = (memline >> LL.line_size_bits) & (LL.sets_min_1);
   UWord* set = &(LL.tags[setNo * LL.assoc]);
   UWord tag  = memline & LL.tag_mask;

   int i, j, idx;
   UWord tmp_tag;

   CLG_DEBUG(6,"LL.Acc(Memline %#lx): Set %d\n", memline, setNo);

   if (tag == (set[0] & LL.tag_mask)) {
     idx = (setNo * LL.assoc) + (set[0] & ~LL.tag_mask);
     l1_loaded->dep_use = &(LL.use[idx]);

     CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): => %08x, count %d\n",
		 idx, LL.loaded[idx].memline,  LL.loaded[idx].iaddr,
		 LL.use[idx].mask, LL.use[idx].count);
     return LL_Hit;
   }
   for (i = 1; i < LL.assoc; i++) {
     if (tag == (set[i] & LL.tag_mask)) {
       tmp_tag = set[i];
       for (j = i; j > 0; j--) {
	 set[j] = set[j - 1];
       }
       set[0] = tmp_tag;
       idx = (setNo * LL.assoc) + (tmp_tag & ~LL.tag_mask);
       l1_loaded->dep_use = &(LL.use[idx]);

	CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): => %08x, count %d\n",
		 i, idx, LL.loaded[idx].memline,  LL.loaded[idx].iaddr,
		 LL.use[idx].mask, LL.use[idx].count);
	return LL_Hit;
     }
   }

   /* A miss;  install this tag as MRU, shuffle rest down. */
   tmp_tag = set[LL.assoc - 1] & ~LL.tag_mask;
   for (j = LL.assoc - 1; j > 0; j--) {
     set[j] = set[j - 1];
   }
   set[0] = tag | tmp_tag;
   idx = (setNo * LL.assoc) + tmp_tag;
   l1_loaded->dep_use = &(LL.use[idx]);

   update_LL_use(idx, memline);

   return MemAccess;
}




#define UPDATE_USE(L)                                                \
                                                                     \
static CacheModelResult update##_##L##_use(cache_t2* cache, int idx, \
			       UInt mask, Addr memline)              \
{                                                                    \
  line_loaded* loaded = &(cache->loaded[idx]);                       \
  line_use* use = &(cache->use[idx]);                                \
  int c = ((32 - countBits(use->mask)) * cache->line_size)>>5;       \
                                                                     \
  CLG_DEBUG(2, " %s.miss [%d]: at %#lx accessing memline %#lx (mask %08x)\n", \
           cache->name, idx, CLG_(bb_base) + current_ii->instr_offset, memline, mask); \
  if (use->count>0) {                                                \
    CLG_DEBUG(2, "   old: used %d, loss bits %d (%08x) [line %#lx from %#lx]\n",\
	     use->count, c, use->mask, loaded->memline, loaded->iaddr);	\
    CLG_DEBUG(2, "   collect: %d, use_base %p\n", \
	     CLG_(current_state).collect, loaded->use_base);         \
                                                                     \
    if (CLG_(current_state).collect && loaded->use_base) {           \
      (loaded->use_base)[off_##L##_AcCost] += 1000 / use->count;     \
      (loaded->use_base)[off_##L##_SpLoss] += c;                     \
                                                                     \
      /* FIXME (?): L1/LL line sizes must be equal! */               \
      loaded->dep_use->mask |= use->mask;                            \
      loaded->dep_use->count += use->count;                          \
    }                                                                \
  }                                                                  \
                                                                     \
  use->count = 1;                                                    \
  use->mask  = mask;                                                 \
  loaded->memline = memline;                                         \
  loaded->iaddr   = CLG_(bb_base) + current_ii->instr_offset;        \
  loaded->use_base = (CLG_(current_state).nonskipped) ?              \
    CLG_(current_state).nonskipped->skipped :                        \
    CLG_(cost_base) + current_ii->cost_offset;                       \
                                                                     \
  if (memline == 0) return LL_Hit;                                   \
  return cacheuse_LL_access(memline, loaded);                        \
}

UPDATE_USE(I1);
UPDATE_USE(D1);

CACHEUSE(I1);
CACHEUSE(D1);


static
void cacheuse_finish(void)
{
  int i;
  InstrInfo ii = { 0,0,0,0 };

  if (!CLG_(current_state).collect) return;

  CLG_(bb_base) = 0;
  current_ii = &ii;
  CLG_(cost_base) = 0;

  /* update usage counters */
  if (I1.use)
    for (i = 0; i < I1.sets * I1.assoc; i++)
      if (I1.loaded[i].use_base)
	update_I1_use( &I1, i, 0,0);

  if (D1.use)
    for (i = 0; i < D1.sets * D1.assoc; i++)
      if (D1.loaded[i].use_base)
	update_D1_use( &D1, i, 0,0);

  if (LL.use)
    for (i = 0; i < LL.sets * LL.assoc; i++)
      if (LL.loaded[i].use_base)
	update_LL_use(i, 0);
}



/*------------------------------------------------------------*/
/*--- Helper functions called by instrumented code         ---*/
/*------------------------------------------------------------*/

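/* inc_costs() increments counters cumulatively via the fall-through
 * switch below: c[0] counts all accesses, c[1] L1 misses, c[2] LL
 * misses, and c[3] LL write-backs (when simulated). */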
static __inline__
void inc_costs(CacheModelResult r, ULong* c1, ULong* c2)
{
    switch(r) {
	case WriteBackMemAccess:
	    if (clo_simulate_writeback) {
		c1[3]++;
		c2[3]++;
	    }
	    // fall through

	case MemAccess:
	    c1[2]++;
	    c2[2]++;
	    // fall through

	case LL_Hit:
	    c1[1]++;
	    c2[1]++;
	    // fall through

	default:
	    c1[0]++;
	    c2[0]++;
    }
}

static
Char* cacheRes(CacheModelResult r)
{
    switch(r) {
    case L1_Hit:    return "L1 Hit ";
    case LL_Hit:    return "LL Hit ";
    case MemAccess: return "LL Miss";
    case WriteBackMemAccess: return "LL Miss (dirty)";
    default:
	tl_assert(0);
    }
    return "??";
}

VG_REGPARM(1)
static void log_1I0D(InstrInfo* ii)
{
    CacheModelResult IrRes;

    current_ii = ii;
    IrRes = (*simulator.I1_Read)(CLG_(bb_base) + ii->instr_offset, ii->instr_size);

    CLG_DEBUG(6, "log_1I0D:  Ir  %#lx/%u => %s\n",
              CLG_(bb_base) + ii->instr_offset, ii->instr_size, cacheRes(IrRes));

    if (CLG_(current_state).collect) {
	ULong* cost_Ir;

	if (CLG_(current_state).nonskipped)
	    cost_Ir = CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
	else
            cost_Ir = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_IR];

	inc_costs(IrRes, cost_Ir,
		  CLG_(current_state).cost + fullOffset(EG_IR) );
    }
}

VG_REGPARM(2)
static void log_2I0D(InstrInfo* ii1, InstrInfo* ii2)
{
    CacheModelResult Ir1Res, Ir2Res;
    ULong *global_cost_Ir;

    current_ii = ii1;
    Ir1Res = (*simulator.I1_Read)(CLG_(bb_base) + ii1->instr_offset, ii1->instr_size);
    current_ii = ii2;
    Ir2Res = (*simulator.I1_Read)(CLG_(bb_base) + ii2->instr_offset, ii2->instr_size);

    CLG_DEBUG(6, "log_2I0D:  Ir1 %#lx/%u => %s, Ir2 %#lx/%u => %s\n",
              CLG_(bb_base) + ii1->instr_offset, ii1->instr_size, cacheRes(Ir1Res),
              CLG_(bb_base) + ii2->instr_offset, ii2->instr_size, cacheRes(Ir2Res) );

    if (!CLG_(current_state).collect) return;

    global_cost_Ir = CLG_(current_state).cost + fullOffset(EG_IR);
    if (CLG_(current_state).nonskipped) {
	ULong* skipped_cost_Ir =
	    CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);

	inc_costs(Ir1Res, global_cost_Ir, skipped_cost_Ir);
	inc_costs(Ir2Res, global_cost_Ir, skipped_cost_Ir);
	return;
    }

    inc_costs(Ir1Res, global_cost_Ir,
              CLG_(cost_base) + ii1->cost_offset + ii1->eventset->offset[EG_IR]);
    inc_costs(Ir2Res, global_cost_Ir,
              CLG_(cost_base) + ii2->cost_offset + ii2->eventset->offset[EG_IR]);
}

VG_REGPARM(3)
static void log_3I0D(InstrInfo* ii1, InstrInfo* ii2, InstrInfo* ii3)
{
    CacheModelResult Ir1Res, Ir2Res, Ir3Res;
    ULong *global_cost_Ir;

    current_ii = ii1;
    Ir1Res = (*simulator.I1_Read)(CLG_(bb_base) + ii1->instr_offset, ii1->instr_size);
    current_ii = ii2;
    Ir2Res = (*simulator.I1_Read)(CLG_(bb_base) + ii2->instr_offset, ii2->instr_size);
    current_ii = ii3;
    Ir3Res = (*simulator.I1_Read)(CLG_(bb_base) + ii3->instr_offset, ii3->instr_size);

    CLG_DEBUG(6, "log_3I0D:  Ir1 %#lx/%u => %s, Ir2 %#lx/%u => %s, Ir3 %#lx/%u => %s\n",
              CLG_(bb_base) + ii1->instr_offset, ii1->instr_size, cacheRes(Ir1Res),
              CLG_(bb_base) + ii2->instr_offset, ii2->instr_size, cacheRes(Ir2Res),
              CLG_(bb_base) + ii3->instr_offset, ii3->instr_size, cacheRes(Ir3Res) );

    if (!CLG_(current_state).collect) return;

    global_cost_Ir = CLG_(current_state).cost + fullOffset(EG_IR);
    if (CLG_(current_state).nonskipped) {
	ULong* skipped_cost_Ir =
	    CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
	inc_costs(Ir1Res, global_cost_Ir, skipped_cost_Ir);
	inc_costs(Ir2Res, global_cost_Ir, skipped_cost_Ir);
	inc_costs(Ir3Res, global_cost_Ir, skipped_cost_Ir);
	return;
    }

    inc_costs(Ir1Res, global_cost_Ir,
              CLG_(cost_base) + ii1->cost_offset + ii1->eventset->offset[EG_IR]);
    inc_costs(Ir2Res, global_cost_Ir,
              CLG_(cost_base) + ii2->cost_offset + ii2->eventset->offset[EG_IR]);
    inc_costs(Ir3Res, global_cost_Ir,
              CLG_(cost_base) + ii3->cost_offset + ii3->eventset->offset[EG_IR]);
}

/* Instruction doing a read access */

VG_REGPARM(3)
static void log_1I1Dr(InstrInfo* ii, Addr data_addr, Word data_size)
{
    CacheModelResult IrRes, DrRes;

    current_ii = ii;
    IrRes = (*simulator.I1_Read)(CLG_(bb_base) + ii->instr_offset, ii->instr_size);
    DrRes = (*simulator.D1_Read)(data_addr, data_size);

    CLG_DEBUG(6, "log_1I1Dr: Ir  %#lx/%u => %s, Dr  %#lx/%lu => %s\n",
              CLG_(bb_base) + ii->instr_offset, ii->instr_size, cacheRes(IrRes),
	      data_addr, data_size, cacheRes(DrRes));

    if (CLG_(current_state).collect) {
	ULong *cost_Ir, *cost_Dr;

	if (CLG_(current_state).nonskipped) {
	    cost_Ir = CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
	    cost_Dr = CLG_(current_state).nonskipped->skipped + fullOffset(EG_DR);
	}
	else {
            cost_Ir = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_IR];
            cost_Dr = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DR];
	}

	inc_costs(IrRes, cost_Ir,
		  CLG_(current_state).cost + fullOffset(EG_IR) );
	inc_costs(DrRes, cost_Dr,
		  CLG_(current_state).cost + fullOffset(EG_DR) );
    }
}


VG_REGPARM(3)
static void log_0I1Dr(InstrInfo* ii, Addr data_addr, Word data_size)
{
    CacheModelResult DrRes;

    current_ii = ii;
    DrRes = (*simulator.D1_Read)(data_addr, data_size);

    CLG_DEBUG(6, "log_0I1Dr: Dr  %#lx/%lu => %s\n",
	      data_addr, data_size, cacheRes(DrRes));

    if (CLG_(current_state).collect) {
	ULong *cost_Dr;

	if (CLG_(current_state).nonskipped)
	    cost_Dr = CLG_(current_state).nonskipped->skipped + fullOffset(EG_DR);
	else
            cost_Dr = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DR];

	inc_costs(DrRes, cost_Dr,
		  CLG_(current_state).cost + fullOffset(EG_DR) );
    }
}


/* Instruction doing a write access */

VG_REGPARM(3)
static void log_1I1Dw(InstrInfo* ii, Addr data_addr, Word data_size)
{
    CacheModelResult IrRes, DwRes;

    current_ii = ii;
    IrRes = (*simulator.I1_Read)(CLG_(bb_base) + ii->instr_offset, ii->instr_size);
    DwRes = (*simulator.D1_Write)(data_addr, data_size);

    CLG_DEBUG(6, "log_1I1Dw: Ir  %#lx/%u => %s, Dw  %#lx/%lu => %s\n",
              CLG_(bb_base) + ii->instr_offset, ii->instr_size, cacheRes(IrRes),
	      data_addr, data_size, cacheRes(DwRes));

    if (CLG_(current_state).collect) {
	ULong *cost_Ir, *cost_Dw;

	if (CLG_(current_state).nonskipped) {
	    cost_Ir = CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
	    cost_Dw = CLG_(current_state).nonskipped->skipped + fullOffset(EG_DW);
	}
	else {
            cost_Ir = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_IR];
            cost_Dw = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DW];
	}

	inc_costs(IrRes, cost_Ir,
		  CLG_(current_state).cost + fullOffset(EG_IR) );
	inc_costs(DwRes, cost_Dw,
		  CLG_(current_state).cost + fullOffset(EG_DW) );
    }
}

VG_REGPARM(3)
static void log_0I1Dw(InstrInfo* ii, Addr data_addr, Word data_size)
{
    CacheModelResult DwRes;

    current_ii = ii;
    DwRes = (*simulator.D1_Write)(data_addr, data_size);

    CLG_DEBUG(6, "log_0I1Dw: Dw  %#lx/%lu => %s\n",
	      data_addr, data_size, cacheRes(DwRes));

    if (CLG_(current_state).collect) {
	ULong *cost_Dw;

	if (CLG_(current_state).nonskipped)
	    cost_Dw = CLG_(current_state).nonskipped->skipped + fullOffset(EG_DW);
	else
            cost_Dw = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DW];

	inc_costs(DwRes, cost_Dw,
		  CLG_(current_state).cost + fullOffset(EG_DW) );
    }
}



/*------------------------------------------------------------*/
/*--- Cache configuration                                  ---*/
/*------------------------------------------------------------*/

#define UNDEFINED_CACHE     ((cache_t) { -1, -1, -1 })

static cache_t clo_I1_cache = UNDEFINED_CACHE;
static cache_t clo_D1_cache = UNDEFINED_CACHE;
static cache_t clo_LL_cache = UNDEFINED_CACHE;


// Checks cache config is ok.  Returns NULL if ok, or a pointer to an error
// string otherwise.
static Char* check_cache(cache_t* cache)
{
   // Simulator requires line size and set count to be powers of two.
   if (( cache->size % (cache->line_size * cache->assoc) != 0) ||
       (-1 == VG_(log2)(cache->size/cache->line_size/cache->assoc)))
   {
      return "Cache set count is not a power of two.\n";
   }

   // Simulator requires line size to be a power of two.
   if (-1 == VG_(log2)(cache->line_size)) {
      return "Cache line size is not a power of two.\n";
   }

   // Then check line size >= 16 -- any smaller and a single instruction could
   // straddle three cache lines, which breaks a simulation assertion and is
   // stupid anyway.
   if (cache->line_size < MIN_LINE_SIZE) {
      return "Cache line size is too small.\n";
   }

   /* Then check cache size > line size (causes seg faults if not). */
   if (cache->size <= cache->line_size) {
      return "Cache size <= line size.\n";
   }

   /* Then check assoc <= (size / line size) (seg faults otherwise). */
   if (cache->assoc > (cache->size / cache->line_size)) {
      return "Cache associativity > (size / line size).\n";
   }

   return NULL;
}

static
void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* LLc)
{
#define DEFINED(L)   (-1 != L.size  || -1 != L.assoc || -1 != L.line_size)

   Char* checkRes;

   Bool all_caches_clo_defined =
      (DEFINED(clo_I1_cache) &&
       DEFINED(clo_D1_cache) &&
       DEFINED(clo_LL_cache));

   // Set the cache config (using auto-detection, if supported by the
   // architecture).
   VG_(configure_caches)( I1c, D1c, LLc, all_caches_clo_defined );

   // Check the default/auto-detected values.
   checkRes = check_cache(I1c);  tl_assert(!checkRes);
   checkRes = check_cache(D1c);  tl_assert(!checkRes);
   checkRes = check_cache(LLc);  tl_assert(!checkRes);

   // Then replace with any defined on the command line.
   if (DEFINED(clo_I1_cache)) { *I1c = clo_I1_cache; }
   if (DEFINED(clo_D1_cache)) { *D1c = clo_D1_cache; }
   if (DEFINED(clo_LL_cache)) { *LLc = clo_LL_cache; }

   if (VG_(clo_verbosity) > 1) {
      VG_(umsg)("Cache configuration used:\n");
      VG_(umsg)("  I1: %dB, %d-way, %dB lines\n",
                I1c->size, I1c->assoc, I1c->line_size);
      VG_(umsg)("  D1: %dB, %d-way, %dB lines\n",
                D1c->size, D1c->assoc, D1c->line_size);
      VG_(umsg)("  LL: %dB, %d-way, %dB lines\n",
                LLc->size, LLc->assoc, LLc->line_size);
   }
#undef DEFINED
1346}
1347
1348
1349/* Initialize and clear simulator state */
1350static void cachesim_post_clo_init(void)
1351{
1352  /* Cache configurations. */
1353  cache_t  I1c, D1c, LLc;
1354
1355  /* Initialize access handlers */
1356  if (!CLG_(clo).simulate_cache) {
1357    CLG_(cachesim).log_1I0D  = 0;
1358    CLG_(cachesim).log_1I0D_name = "(no function)";
1359    CLG_(cachesim).log_2I0D  = 0;
1360    CLG_(cachesim).log_2I0D_name = "(no function)";
1361    CLG_(cachesim).log_3I0D  = 0;
1362    CLG_(cachesim).log_3I0D_name = "(no function)";
1363
1364    CLG_(cachesim).log_1I1Dr = 0;
1365    CLG_(cachesim).log_1I1Dr_name = "(no function)";
1366    CLG_(cachesim).log_1I1Dw = 0;
1367    CLG_(cachesim).log_1I1Dw_name = "(no function)";
1368
1369    CLG_(cachesim).log_0I1Dr = 0;
1370    CLG_(cachesim).log_0I1Dr_name = "(no function)";
1371    CLG_(cachesim).log_0I1Dw = 0;
1372    CLG_(cachesim).log_0I1Dw_name = "(no function)";
1373    return;
1374  }
1375
1376  /* Configuration of caches only needed with real cache simulation */
1377  configure_caches(&I1c, &D1c, &LLc);
1378
1379  I1.name = "I1";
1380  D1.name = "D1";
1381  LL.name = "LL";
1382
1383  cachesim_initcache(I1c, &I1);
1384  cachesim_initcache(D1c, &D1);
1385  cachesim_initcache(LLc, &LL);
1386
1387  /* the other cache simulators use the standard helpers
1388   * with dispatching via simulator struct */
1389
1390  CLG_(cachesim).log_1I0D  = log_1I0D;
1391  CLG_(cachesim).log_1I0D_name  = "log_1I0D";
1392  CLG_(cachesim).log_2I0D  = log_2I0D;
1393  CLG_(cachesim).log_2I0D_name  = "log_2I0D";
1394  CLG_(cachesim).log_3I0D  = log_3I0D;
1395  CLG_(cachesim).log_3I0D_name  = "log_3I0D";
1396
1397  CLG_(cachesim).log_1I1Dr = log_1I1Dr;
1398  CLG_(cachesim).log_1I1Dw = log_1I1Dw;
1399  CLG_(cachesim).log_1I1Dr_name = "log_1I1Dr";
1400  CLG_(cachesim).log_1I1Dw_name = "log_1I1Dw";
1401
1402  CLG_(cachesim).log_0I1Dr = log_0I1Dr;
1403  CLG_(cachesim).log_0I1Dw = log_0I1Dw;
1404  CLG_(cachesim).log_0I1Dr_name = "log_0I1Dr";
1405  CLG_(cachesim).log_0I1Dw_name = "log_0I1Dw";
1406
1407  if (clo_collect_cacheuse) {
1408
1409      /* Output warning for not supported option combinations */
1410      if (clo_simulate_hwpref) {
1411	  VG_(message)(Vg_DebugMsg,
1412		       "warning: prefetch simulation can not be "
1413                       "used with cache usage\n");
1414	  clo_simulate_hwpref = False;
1415      }
1416
1417      if (clo_simulate_writeback) {
1418	  VG_(message)(Vg_DebugMsg,
1419		       "warning: write-back simulation can not be "
1420                       "used with cache usage\n");
1421	  clo_simulate_writeback = False;
1422      }
1423
1424      simulator.I1_Read  = cacheuse_I1_doRead;
1425      simulator.D1_Read  = cacheuse_D1_doRead;
1426      simulator.D1_Write = cacheuse_D1_doRead;
1427      return;
1428  }
1429
1430  if (clo_simulate_hwpref) {
1431    prefetch_clear();
1432
1433    if (clo_simulate_writeback) {
1434      simulator.I1_Read  = prefetch_I1_Read;
1435      simulator.D1_Read  = prefetch_D1_Read;
1436      simulator.D1_Write = prefetch_D1_Write;
1437    }
1438    else {
1439      simulator.I1_Read  = prefetch_I1_ref;
1440      simulator.D1_Read  = prefetch_D1_ref;
1441      simulator.D1_Write = prefetch_D1_ref;
1442    }
1443
1444    return;
1445  }
1446
1447  if (clo_simulate_writeback) {
1448      simulator.I1_Read  = cachesim_I1_Read;
1449      simulator.D1_Read  = cachesim_D1_Read;
1450      simulator.D1_Write = cachesim_D1_Write;
1451  }
1452  else {
1453      simulator.I1_Read  = cachesim_I1_ref;
1454      simulator.D1_Read  = cachesim_D1_ref;
1455      simulator.D1_Write = cachesim_D1_ref;
1456  }
1457}
1458
1459
1460/* Clear simulator state. Has to be initialized before */
1461static
1462void cachesim_clear(void)
1463{
1464  cachesim_clearcache(&I1);
1465  cachesim_clearcache(&D1);
1466  cachesim_clearcache(&LL);
1467
1468  prefetch_clear();
1469}
1470
1471
1472static void cachesim_getdesc(Char* buf)
1473{
1474  Int p;
1475  p = VG_(sprintf)(buf, "\ndesc: I1 cache: %s\n", I1.desc_line);
1476  p += VG_(sprintf)(buf+p, "desc: D1 cache: %s\n", D1.desc_line);
1477  VG_(sprintf)(buf+p, "desc: LL cache: %s\n", LL.desc_line);
1478}

static
void cachesim_print_opts(void)
{
  VG_(printf)(
"\n   cache simulator options (does cache simulation if used):\n"
"    --simulate-wb=no|yes      Count write-back events [no]\n"
"    --simulate-hwpref=no|yes  Simulate hardware prefetch [no]\n"
#if CLG_EXPERIMENTAL
"    --simulate-sectors=no|yes Simulate sectored behaviour [no]\n"
#endif
"    --cacheuse=no|yes         Collect cache block use [no]\n"
"    --I1=<size>,<assoc>,<line_size>  set I1 cache manually\n"
"    --D1=<size>,<assoc>,<line_size>  set D1 cache manually\n"
"    --LL=<size>,<assoc>,<line_size>  set LL cache manually\n"
              );
}

static void parse_opt ( cache_t* cache,
                        Char* opt, Char* optval, UChar kind )
{
   Long i1, i2, i3;
   Char* endptr;
   Char* checkRes;

   // Option argument looks like "65536,2,64".  Extract the three numbers.
   i1 = VG_(strtoll10)(optval,   &endptr); if (*endptr != ',')  goto bad;
   i2 = VG_(strtoll10)(endptr+1, &endptr); if (*endptr != ',')  goto bad;
   i3 = VG_(strtoll10)(endptr+1, &endptr); if (*endptr != '\0') goto bad;

   // Check for overflow.
   cache->size      = (Int)i1;
   cache->assoc     = (Int)i2;
   cache->line_size = (Int)i3;
   if (cache->size      != i1) goto overflow;
   if (cache->assoc     != i2) goto overflow;
   if (cache->line_size != i3) goto overflow;

   checkRes = check_cache(cache);
   if (checkRes) {
      VG_(fmsg)("%s", checkRes);
      goto bad;
   }

   return;

  bad:
   VG_(fmsg_bad_option)(opt, "");

  overflow:
   VG_(fmsg_bad_option)(opt,
      "One of the cache parameters was too large and overflowed.\n");
}

/* Check for command line option for cache configuration.
 * Return False if unknown and not handled.
 *
 * Called from CLG_(process_cmd_line_option)() in clo.c
 */
static Bool cachesim_parse_opt(Char* arg)
{
   Char* tmp_str;

   if      VG_BOOL_CLO(arg, "--simulate-wb",      clo_simulate_writeback) {}
   else if VG_BOOL_CLO(arg, "--simulate-hwpref",  clo_simulate_hwpref)    {}
   else if VG_BOOL_CLO(arg, "--simulate-sectors", clo_simulate_sectors)   {}

   else if VG_BOOL_CLO(arg, "--cacheuse", clo_collect_cacheuse) {
      if (clo_collect_cacheuse) {
         /* Use counters only make sense with fine-grained
          * (per-instruction) dumping */
         CLG_(clo).dump_instr = True;
      }
   }

   else if VG_STR_CLO(arg, "--I1", tmp_str)
      parse_opt(&clo_I1_cache, arg, tmp_str, 'i');
   else if VG_STR_CLO(arg, "--D1", tmp_str)
      parse_opt(&clo_D1_cache, arg, tmp_str, '1');
   else if (VG_STR_CLO(arg, "--L2", tmp_str) || // for backwards compatibility
            VG_STR_CLO(arg, "--LL", tmp_str))
      parse_opt(&clo_LL_cache, arg, tmp_str, '2');
   else
      return False;

   return True;
}
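
/* Illustrative invocation ("myprog" is a placeholder):
 *
 *   valgrind --tool=callgrind --cache-sim=yes --D1=65536,2,64 ./myprog
 *
 * configures a 65536 B, 2-way associative D1 cache with 64 B lines. */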

/* Add commas as thousands separators to a ULong, right justified in a
 * field field_width characters wide; the resulting string is returned
 * in buf.  The return value is the length of the commified number,
 * excluding the leading padding. */
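/* Example (worked through the code below): commify(1234567, 12, buf)
 * stores "   1,234,567" in buf (9 significant characters, right
 * justified in a 12 character field) and returns 9. */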
static
Int commify(ULong n, int field_width, char* buf)
{
   int len, n_commas, i, j, new_len, space;

   VG_(sprintf)(buf, "%llu", n);
   len = VG_(strlen)(buf);
   n_commas = (len - 1) / 3;
   new_len = len + n_commas;
   space = field_width - new_len;

   /* Allow for printing a number in a field_width smaller than its size */
   if (space < 0) space = 0;

   /* Start with j = -1 because the '\0' is copied before the digits
    * are handled in groups of three. */
   for (j = -1, i = len ; i >= 0; i--) {
      buf[i + n_commas + space] = buf[i];

      if ((i>0) && (3 == ++j)) {
         j = 0;
         n_commas--;
         buf[i + n_commas + space] = ',';
      }
   }
   /* Right justify in field. */
   for (i = 0; i < space; i++)  buf[i] = ' ';
   return new_len;
}

static
void percentify(Int n, Int ex, Int field_width, char buf[])
{
   int i, len, space;

   /* Zero-pad the fractional part: with ex == 100, n == 1205 has to
    * be printed as "12.05%", not "12.5%". */
   if (ex == 100)
      VG_(sprintf)(buf, "%d.%02d%%", n / ex, n % ex);
   else
      VG_(sprintf)(buf, "%d.%d%%", n / ex, n % ex);
   len = VG_(strlen)(buf);
   space = field_width - len;
   if (space < 0) space = 0;     /* Allow for v. small field_width */
   i = len;

   /* Right justify in field */
   for (     ; i >= 0;    i--)  buf[i + space] = buf[i];
   for (i = 0; i < space; i++)  buf[i] = ' ';
}
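
/* Examples: percentify(1234, 100, 8, buf) stores "  12.34%" in buf;
 * percentify(105, 100, 8, buf) stores "   1.05%" (zero-padded). */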

static
void cachesim_printstat(Int l1, Int l2, Int l3)
{
  FullCost total = CLG_(total_cost), D_total = 0;
  ULong LL_total_m, LL_total_mr, LL_total_mw,
    LL_total, LL_total_r, LL_total_w;
  char buf1[RESULTS_BUF_LEN],
    buf2[RESULTS_BUF_LEN],
    buf3[RESULTS_BUF_LEN];
  Int p;

  if ((VG_(clo_verbosity) > 1) && clo_simulate_hwpref) {
    VG_(message)(Vg_DebugMsg, "Prefetch Up:       %llu\n",
                 prefetch_up);
    VG_(message)(Vg_DebugMsg, "Prefetch Down:     %llu\n",
                 prefetch_down);
    VG_(message)(Vg_DebugMsg, "\n");
  }

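  /* Within an event group, offset +0 counts accesses, +1 first-level
   * misses, and +2 last-level misses (cf. the group registration in
   * CLG_(init_eventsets) below: "Ir", "I1mr", "ILmr"). */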
  commify(total[fullOffset(EG_IR) +1], l1, buf1);
  VG_(message)(Vg_UserMsg, "I1  misses:    %s\n", buf1);

  commify(total[fullOffset(EG_IR) +2], l1, buf1);
  VG_(message)(Vg_UserMsg, "LLi misses:    %s\n", buf1);

  p = 100;

  if (0 == total[fullOffset(EG_IR)])
    total[fullOffset(EG_IR)] = 1;

  percentify(total[fullOffset(EG_IR)+1] * 100 * p /
             total[fullOffset(EG_IR)], p, l1+1, buf1);
  VG_(message)(Vg_UserMsg, "I1  miss rate: %s\n", buf1);

  percentify(total[fullOffset(EG_IR)+2] * 100 * p /
             total[fullOffset(EG_IR)], p, l1+1, buf1);
  VG_(message)(Vg_UserMsg, "LLi miss rate: %s\n", buf1);
  VG_(message)(Vg_UserMsg, "\n");

  /* D cache results.
   * Use the D_refs.rd and D_refs.wr values to determine the
   * width of columns 2 & 3. */

  D_total = CLG_(get_eventset_cost)( CLG_(sets).full );
  CLG_(init_cost)( CLG_(sets).full, D_total);
  // we only use the first 3 values of D_total, adding up Dr and Dw costs
  CLG_(copy_cost)( CLG_(get_event_set)(EG_DR), D_total, total + fullOffset(EG_DR) );
  CLG_(add_cost) ( CLG_(get_event_set)(EG_DW), D_total, total + fullOffset(EG_DW) );

  commify( D_total[0], l1, buf1);
  commify(total[fullOffset(EG_DR)], l2,  buf2);
  commify(total[fullOffset(EG_DW)], l3,  buf3);
  VG_(message)(Vg_UserMsg, "D   refs:      %s  (%s rd + %s wr)\n",
               buf1,  buf2,  buf3);

  commify( D_total[1], l1, buf1);
  commify(total[fullOffset(EG_DR)+1], l2, buf2);
  commify(total[fullOffset(EG_DW)+1], l3, buf3);
  VG_(message)(Vg_UserMsg, "D1  misses:    %s  (%s rd + %s wr)\n",
               buf1, buf2, buf3);

  commify( D_total[2], l1, buf1);
  commify(total[fullOffset(EG_DR)+2], l2, buf2);
  commify(total[fullOffset(EG_DW)+2], l3, buf3);
  VG_(message)(Vg_UserMsg, "LLd misses:    %s  (%s rd + %s wr)\n",
               buf1, buf2, buf3);

  p = 10;

  if (0 == D_total[0])   D_total[0] = 1;
  if (0 == total[fullOffset(EG_DR)]) total[fullOffset(EG_DR)] = 1;
  if (0 == total[fullOffset(EG_DW)]) total[fullOffset(EG_DW)] = 1;

  percentify( D_total[1] * 100 * p / D_total[0],  p, l1+1, buf1);
  percentify(total[fullOffset(EG_DR)+1] * 100 * p /
             total[fullOffset(EG_DR)], p, l2+1, buf2);
  percentify(total[fullOffset(EG_DW)+1] * 100 * p /
             total[fullOffset(EG_DW)], p, l3+1, buf3);
  VG_(message)(Vg_UserMsg, "D1  miss rate: %s (%s   + %s  )\n",
               buf1, buf2, buf3);

  percentify( D_total[2] * 100 * p / D_total[0],  p, l1+1, buf1);
  percentify(total[fullOffset(EG_DR)+2] * 100 * p /
             total[fullOffset(EG_DR)], p, l2+1, buf2);
  percentify(total[fullOffset(EG_DW)+2] * 100 * p /
             total[fullOffset(EG_DW)], p, l3+1, buf3);
  VG_(message)(Vg_UserMsg, "LLd miss rate: %s (%s   + %s  )\n",
               buf1, buf2, buf3);
  VG_(message)(Vg_UserMsg, "\n");

  /* LL overall results */

  LL_total   =
    total[fullOffset(EG_DR) +1] +
    total[fullOffset(EG_DW) +1] +
    total[fullOffset(EG_IR) +1];
  LL_total_r =
    total[fullOffset(EG_DR) +1] +
    total[fullOffset(EG_IR) +1];
  LL_total_w = total[fullOffset(EG_DW) +1];
  commify(LL_total,   l1, buf1);
  commify(LL_total_r, l2, buf2);
  commify(LL_total_w, l3, buf3);
  VG_(message)(Vg_UserMsg, "LL refs:       %s  (%s rd + %s wr)\n",
               buf1, buf2, buf3);

  LL_total_m  =
    total[fullOffset(EG_DR) +2] +
    total[fullOffset(EG_DW) +2] +
    total[fullOffset(EG_IR) +2];
  LL_total_mr =
    total[fullOffset(EG_DR) +2] +
    total[fullOffset(EG_IR) +2];
  LL_total_mw = total[fullOffset(EG_DW) +2];
  commify(LL_total_m,  l1, buf1);
  commify(LL_total_mr, l2, buf2);
  commify(LL_total_mw, l3, buf3);
  VG_(message)(Vg_UserMsg, "LL misses:     %s  (%s rd + %s wr)\n",
               buf1, buf2, buf3);

  percentify(LL_total_m  * 100 * p /
             (total[fullOffset(EG_IR)] + D_total[0]),  p, l1+1, buf1);
  percentify(LL_total_mr * 100 * p /
             (total[fullOffset(EG_IR)] + total[fullOffset(EG_DR)]),
             p, l2+1, buf2);
  percentify(LL_total_mw * 100 * p /
             total[fullOffset(EG_DW)], p, l3+1, buf3);
  VG_(message)(Vg_UserMsg, "LL miss rate:  %s (%s   + %s  )\n",
               buf1, buf2, buf3);
}
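
/* Illustrative excerpt of the output (all numbers made up):
 *
 *   I1  misses:         12,345
 *   LLi misses:          1,234
 *   I1  miss rate:        0.05%
 *   LLi miss rate:        0.00%
 *   D   refs:          654,321  (400,000 rd + 254,321 wr)
 */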


/*------------------------------------------------------------*/
/*--- Setup for Event set.                                 ---*/
/*------------------------------------------------------------*/

struct event_sets CLG_(sets);

void CLG_(init_eventsets)(void)
{
    // Event groups from which the event sets are composed.
    // The "Use" group is only used with "cacheuse" simulation.
    if (clo_collect_cacheuse)
        CLG_(register_event_group4)(EG_USE,
                                    "AcCost1", "SpLoss1", "AcCost2", "SpLoss2");

    if (!CLG_(clo).simulate_cache)
        CLG_(register_event_group)(EG_IR, "Ir");
    else if (!clo_simulate_writeback) {
        CLG_(register_event_group3)(EG_IR, "Ir", "I1mr", "ILmr");
        CLG_(register_event_group3)(EG_DR, "Dr", "D1mr", "DLmr");
        CLG_(register_event_group3)(EG_DW, "Dw", "D1mw", "DLmw");
    }
    else { // clo_simulate_writeback
        CLG_(register_event_group4)(EG_IR, "Ir", "I1mr", "ILmr", "ILdmr");
        CLG_(register_event_group4)(EG_DR, "Dr", "D1mr", "DLmr", "DLdmr");
        CLG_(register_event_group4)(EG_DW, "Dw", "D1mw", "DLmw", "DLdmw");
    }

    if (CLG_(clo).simulate_branch) {
        CLG_(register_event_group2)(EG_BC, "Bc", "Bcm");
        CLG_(register_event_group2)(EG_BI, "Bi", "Bim");
    }

    if (CLG_(clo).collect_bus)
        CLG_(register_event_group)(EG_BUS, "Ge");

    if (CLG_(clo).collect_alloc)
        CLG_(register_event_group2)(EG_ALLOC, "allocCount", "allocSize");

    if (CLG_(clo).collect_systime)
        CLG_(register_event_group2)(EG_SYS, "sysCount", "sysTime");

    // event set used as base for instruction self cost
    CLG_(sets).base = CLG_(get_event_set2)(EG_USE, EG_IR);

    // event set comprising all event groups, used for inclusive cost
    CLG_(sets).full = CLG_(add_event_group2)(CLG_(sets).base, EG_DR, EG_DW);
    CLG_(sets).full = CLG_(add_event_group2)(CLG_(sets).full, EG_BC, EG_BI);
    CLG_(sets).full = CLG_(add_event_group) (CLG_(sets).full, EG_BUS);
    CLG_(sets).full = CLG_(add_event_group2)(CLG_(sets).full, EG_ALLOC, EG_SYS);

    CLG_DEBUGIF(1) {
        CLG_DEBUG(1, "EventSets:\n");
        CLG_(print_eventset)(-2, CLG_(sets).base);
        CLG_(print_eventset)(-2, CLG_(sets).full);
    }

    /* Events not registered above are silently ignored when appended */
    CLG_(dumpmap) = CLG_(get_eventmapping)(CLG_(sets).full);
    CLG_(append_event)(CLG_(dumpmap), "Ir");
    CLG_(append_event)(CLG_(dumpmap), "Dr");
    CLG_(append_event)(CLG_(dumpmap), "Dw");
    CLG_(append_event)(CLG_(dumpmap), "I1mr");
    CLG_(append_event)(CLG_(dumpmap), "D1mr");
    CLG_(append_event)(CLG_(dumpmap), "D1mw");
    CLG_(append_event)(CLG_(dumpmap), "ILmr");
    CLG_(append_event)(CLG_(dumpmap), "DLmr");
    CLG_(append_event)(CLG_(dumpmap), "DLmw");
    CLG_(append_event)(CLG_(dumpmap), "ILdmr");
    CLG_(append_event)(CLG_(dumpmap), "DLdmr");
    CLG_(append_event)(CLG_(dumpmap), "DLdmw");
    CLG_(append_event)(CLG_(dumpmap), "Bc");
    CLG_(append_event)(CLG_(dumpmap), "Bcm");
    CLG_(append_event)(CLG_(dumpmap), "Bi");
    CLG_(append_event)(CLG_(dumpmap), "Bim");
    CLG_(append_event)(CLG_(dumpmap), "AcCost1");
    CLG_(append_event)(CLG_(dumpmap), "SpLoss1");
    CLG_(append_event)(CLG_(dumpmap), "AcCost2");
    CLG_(append_event)(CLG_(dumpmap), "SpLoss2");
    CLG_(append_event)(CLG_(dumpmap), "Ge");
    CLG_(append_event)(CLG_(dumpmap), "allocCount");
    CLG_(append_event)(CLG_(dumpmap), "allocSize");
    CLG_(append_event)(CLG_(dumpmap), "sysCount");
    CLG_(append_event)(CLG_(dumpmap), "sysTime");
}
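
/* Illustrative result: with plain cache simulation (no cacheuse,
 * write-back, branch, bus, alloc or systime collection),
 * CLG_(sets).base holds { Ir I1mr ILmr } and CLG_(sets).full
 * additionally holds { Dr D1mr DLmr } and { Dw D1mw DLmw }. */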


/* This is called at dump time for every instruction executed */
static void cachesim_add_icost(SimCost cost, BBCC* bbcc,
                               InstrInfo* ii, ULong exe_count)
{
    if (!CLG_(clo).simulate_cache)
        cost[ fullOffset(EG_IR) ] += exe_count;

    if (ii->eventset)
        CLG_(add_and_zero_cost2)( CLG_(sets).full, cost,
                                  ii->eventset, bbcc->cost + ii->cost_offset);
}

static
void cachesim_finish(void)
{
  if (clo_collect_cacheuse)
    cacheuse_finish();
}

/*------------------------------------------------------------*/
/*--- The simulator defined in this file                   ---*/
/*------------------------------------------------------------*/

struct cachesim_if CLG_(cachesim) = {
  .print_opts    = cachesim_print_opts,
  .parse_opt     = cachesim_parse_opt,
  .post_clo_init = cachesim_post_clo_init,
  .clear         = cachesim_clear,
  .getdesc       = cachesim_getdesc,
  .printstat     = cachesim_printstat,
  .add_icost     = cachesim_add_icost,
  .finish        = cachesim_finish,

  /* these will be set by cachesim_post_clo_init */
  .log_1I0D        = 0,
  .log_2I0D        = 0,
  .log_3I0D        = 0,

  .log_1I1Dr       = 0,
  .log_1I1Dw       = 0,

  .log_0I1Dr       = 0,
  .log_0I1Dw       = 0,

  .log_1I0D_name = "(no function)",
  .log_2I0D_name = "(no function)",
  .log_3I0D_name = "(no function)",

  .log_1I1Dr_name = "(no function)",
  .log_1I1Dw_name = "(no function)",

  .log_0I1Dr_name = "(no function)",
  .log_0I1Dw_name = "(no function)",
};


/*--------------------------------------------------------------------*/
/*--- end                                                    sim.c ---*/
/*--------------------------------------------------------------------*/