sim.c revision 09ee78ec9675201840d895623d49efba1ffe05d8

/*--------------------------------------------------------------------*/
/*--- Cache simulation.                                            ---*/
/*---                                                        sim.c ---*/
/*--------------------------------------------------------------------*/

/*
   This file is part of Callgrind, a Valgrind tool for call graph
   profiling programs.

   Copyright (C) 2003-2005, Josef Weidendorfer (Josef.Weidendorfer@gmx.de)

   This tool is derived from and contains code from Cachegrind
   Copyright (C) 2002-2008 Nicholas Nethercote (njn@valgrind.org)

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307, USA.

   The GNU General Public License is contained in the file COPYING.
*/

#include "global.h"


/* Notes:
  - simulates a write-allocate cache
  - (block --> set) hash function uses simple bit selection
  - handling of references straddling two cache blocks:
      - counts as only one cache access (not two)
      - both blocks hit                  --> one hit
      - one block hits, the other misses --> one miss
      - both blocks miss                 --> one miss (not two)
*/
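
/* Illustrative sketch (not part of the simulator, kept out of the build
 * like print_cache() below): how the straddling rule above collapses the
 * two per-block probes into a single event. 'block1_hit'/'block2_hit'
 * are hypothetical per-block lookup results. */
#if 0
static int straddle_result(int block1_hit, int block2_hit)
{
   /* one access, one event: a miss in either block makes the whole
    * reference count as a single miss (never two) */
   return (block1_hit && block2_hit) ? 1 /* Hit */ : 0 /* Miss */;
}
#endif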

/* Cache configuration */
#include "cg_arch.h"

/* additional structures for cache use info, separated
 * according to usage frequency:
 * - line_loaded : pointer to the cost center of the instruction
 *                 which loaded the line into cache.
 *                 Needed to increment counters when the line is evicted.
 * - line_use    : updated on every access
 */
typedef struct {
  UInt count;
  UInt mask; /* e.g. for a 64-byte line size, 1 bit per 2 bytes */
} line_use;

typedef struct {
  Addr memline, iaddr;
  line_use* dep_use; /* points to the higher-level cache block for this memline */
  ULong* use_base;
} line_loaded;

/* Cache state */
typedef struct {
   char*        name;
   int          size;                   /* bytes */
   int          assoc;
   int          line_size;              /* bytes */
   Bool         sectored;  /* prefetch the neighboring cache line on read */
   int          sets;
   int          sets_min_1;
   int          line_size_bits;
   int          tag_shift;
   UWord        tag_mask;
   char         desc_line[128];
   UWord*       tags;

  /* for cache use */
   int          line_size_mask;
   int*         line_start_mask;
   int*         line_end_mask;
   line_loaded* loaded;
   line_use*    use;
} cache_t2;

/*
 * States of the flat caches in our model.
 * We use a two-level hierarchy.
 */
static cache_t2 I1, D1, L2;

/* Lower bits of cache tags are used as flags for a cache line */
#define CACHELINE_FLAGMASK (MIN_LINE_SIZE-1)
#define CACHELINE_DIRTY    1


/* Cache simulator Options */
static Bool clo_simulate_writeback = False;
static Bool clo_simulate_hwpref = False;
static Bool clo_simulate_sectors = False;
static Bool clo_collect_cacheuse = False;

/* The following global variables are set up beforehand by
 * setup_bbcc()/cachesim_after_bbsetup():
 *
 * - Addr   bb_base     (instruction start address of original BB)
 * - ULong* cost_base   (start of cost array for BB)
 * - BBCC*  nonskipped  (only != 0 when in a function not skipped)
 */

/* Offset to events in event set, used in log_* functions */
static Int off_D0_Ir;
static Int off_D1r_Ir;
static Int off_D1r_Dr;
static Int off_D1w_Ir;
static Int off_D1w_Dw;
static Int off_D2_Ir;
static Int off_D2_Dr;
static Int off_D2_Dw;

static Addr   bb_base;
static ULong* cost_base;
static InstrInfo* current_ii;

/* Cache use offsets */
/* FIXME: The offsets are only correct because all event sets get
 * the "Use" set added first!
 */
static Int off_I1_AcCost  = 0;
static Int off_I1_SpLoss  = 1;
static Int off_D1_AcCost  = 0;
static Int off_D1_SpLoss  = 1;
static Int off_L2_AcCost  = 2;
static Int off_L2_SpLoss  = 3;

/* Cache access types */
typedef enum { Read = 0, Write = CACHELINE_DIRTY } RefType;

/* Result of a reference into a flat cache */
typedef enum { Hit  = 0, Miss, MissDirty } CacheResult;

/* Result of a reference into a hierarchical cache model */
typedef enum {
    L1_Hit,
    L2_Hit,
    MemAccess,
    WriteBackMemAccess } CacheModelResult;

typedef CacheModelResult (*simcall_type)(Addr, UChar);

static struct {
    simcall_type I1_Read;
    simcall_type D1_Read;
    simcall_type D1_Write;
} simulator;

/*------------------------------------------------------------*/
/*--- Cache Simulator Initialization                       ---*/
/*------------------------------------------------------------*/

static void cachesim_clearcache(cache_t2* c)
{
  Int i;

  for (i = 0; i < c->sets * c->assoc; i++)
    c->tags[i] = 0;
  if (c->use) {
    for (i = 0; i < c->sets * c->assoc; i++) {
      c->loaded[i].memline  = 0;
      c->loaded[i].use_base = 0;
      c->loaded[i].dep_use = 0;
      c->loaded[i].iaddr = 0;
      c->use[i].mask    = 0;
      c->use[i].count   = 0;
      c->tags[i] = i % c->assoc; /* init lower bits as pointer */
    }
  }
}

static void cacheuse_initcache(cache_t2* c);

/* By this point, the size/assoc/line_size has been checked. */
static void cachesim_initcache(cache_t config, cache_t2* c)
{
   c->size      = config.size;
   c->assoc     = config.assoc;
   c->line_size = config.line_size;
   c->sectored  = False; // FIXME

   c->sets           = (c->size / c->line_size) / c->assoc;
   c->sets_min_1     = c->sets - 1;
   c->line_size_bits = VG_(log2)(c->line_size);
   c->tag_shift      = c->line_size_bits + VG_(log2)(c->sets);
   c->tag_mask       = ~((1<<c->tag_shift)-1);

   /* Can bits in tag entries be used as flags?
    * This should always be true, as MIN_LINE_SIZE >= 16. */
   CLG_ASSERT( (c->tag_mask & CACHELINE_FLAGMASK) == 0);

   if (c->assoc == 1) {
      VG_(sprintf)(c->desc_line, "%d B, %d B, direct-mapped%s",
                   c->size, c->line_size,
                   c->sectored ? ", sectored":"");
   } else {
      VG_(sprintf)(c->desc_line, "%d B, %d B, %d-way associative%s",
                   c->size, c->line_size, c->assoc,
                   c->sectored ? ", sectored":"");
   }

   c->tags = (UWord*) CLG_MALLOC("cl.sim.cs_ic.1",
                                 sizeof(UWord) * c->sets * c->assoc);
   if (clo_collect_cacheuse)
      cacheuse_initcache(c);
   else
      c->use = 0;
   cachesim_clearcache(c);
}
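
/* Worked example of the derived parameters above, for a hypothetical
 * 65536 B, 2-way cache with 64 B lines (values are illustrative only):
 *   sets           = (65536 / 64) / 2 = 512
 *   line_size_bits = log2(64)         = 6
 *   tag_shift      = 6 + log2(512)    = 15
 * so address bits [0..5] select the byte within the line, bits [6..14]
 * select the set, and the bits above 14 form the tag. */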


#if 0
static void print_cache(cache_t2* c)
{
   UInt set, way, i;

   /* Note initialisation and update of 'i'. */
   for (i = 0, set = 0; set < c->sets; set++) {
      for (way = 0; way < c->assoc; way++, i++) {
         VG_(printf)("%8x ", c->tags[i]);
      }
      VG_(printf)("\n");
   }
}
#endif


/*------------------------------------------------------------*/
/*--- Write Through Cache Simulation                       ---*/
/*------------------------------------------------------------*/

/*
 * Simple model: L1 & L2 Write Through
 * Does not distinguish between read and write references.
 *
 * Simulator functions:
 *  CacheModelResult cachesim_I1_ref(Addr a, UChar size)
 *  CacheModelResult cachesim_D1_ref(Addr a, UChar size)
 */

static __inline__
CacheResult cachesim_setref(cache_t2* c, UInt set_no, UWord tag)
{
    int i, j;
    UWord *set;

    set = &(c->tags[set_no * c->assoc]);

    /* This loop is unrolled for just the first case, which is the most */
    /* common.  We can't unroll any further because it would screw up   */
    /* if we have a direct-mapped (1-way) cache.                        */
    if (tag == set[0])
        return Hit;

    /* If the tag is one other than the MRU, move it into the MRU spot  */
    /* and shuffle the rest down.                                       */
    for (i = 1; i < c->assoc; i++) {
        if (tag == set[i]) {
            for (j = i; j > 0; j--) {
                set[j] = set[j - 1];
            }
            set[0] = tag;
            return Hit;
        }
    }

    /* A miss;  install this tag as MRU, shuffle rest down. */
    for (j = c->assoc - 1; j > 0; j--) {
        set[j] = set[j - 1];
    }
    set[0] = tag;

    return Miss;
}
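
/* Minimal standalone sketch of the move-to-front (LRU) policy used by
 * cachesim_setref() above; 'set' holds 'assoc' tags ordered MRU..LRU.
 * Hypothetical demonstration code, kept out of the build like
 * print_cache() above. */
#if 0
static int example_setref(UWord* set, int assoc, UWord tag)
{
   int i, j;
   for (i = 0; i < assoc; i++) {
      if (set[i] == tag) {                  /* hit: promote to MRU */
         for (j = i; j > 0; j--) set[j] = set[j - 1];
         set[0] = tag;
         return 1;
      }
   }
   for (j = assoc - 1; j > 0; j--)          /* miss: drop the LRU way */
      set[j] = set[j - 1];
   set[0] = tag;                            /* install new tag as MRU */
   return 0;
}
#endif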

static CacheResult cachesim_ref(cache_t2* c, Addr a, UChar size)
{
    UInt  set1 = ( a         >> c->line_size_bits) & (c->sets_min_1);
    UInt  set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1);
    UWord tag  = a >> c->tag_shift;

    /* Access entirely within line. */
    if (set1 == set2)
        return cachesim_setref(c, set1, tag);

    /* Access straddles two lines. */
    /* Nb: this is a fast way of doing ((set1+1) % c->sets) */
    else if (((set1 + 1) & (c->sets-1)) == set2) {
        UWord tag2  = (a+size-1) >> c->tag_shift;

        /* the call updates cache structures as side effect */
        CacheResult res1 =  cachesim_setref(c, set1, tag);
        CacheResult res2 =  cachesim_setref(c, set2, tag2);
        return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit;

   } else {
       VG_(printf)("addr: %lx  size: %u  sets: %d %d", a, size, set1, set2);
       VG_(tool_panic)("item straddles more than two cache sets");
   }
   return Hit;
}
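
/* Example of the set-index arithmetic above, for a hypothetical cache
 * with 64 B lines and 512 sets: an 8-byte access at a = 0x1003c gives
 *   set1 = (0x1003c >> 6) & 511 = 0
 *   set2 = (0x10043 >> 6) & 511 = 1
 * so the reference straddles sets 0 and 1 and both are probed. */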

static
CacheModelResult cachesim_I1_ref(Addr a, UChar size)
{
    if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
    if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
    return MemAccess;
}

static
CacheModelResult cachesim_D1_ref(Addr a, UChar size)
{
    if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
    if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
    return MemAccess;
}


/*------------------------------------------------------------*/
/*--- Write Back Cache Simulation                          ---*/
/*------------------------------------------------------------*/

/*
 * More complex model: L1 Write-through, L2 Write-back
 * This needs to distinguish between read and write references.
 *
 * Simulator functions:
 *  CacheModelResult cachesim_I1_Read(Addr a, UChar size)
 *  CacheModelResult cachesim_D1_Read(Addr a, UChar size)
 *  CacheModelResult cachesim_D1_Write(Addr a, UChar size)
 */

/*
 * With write-back, the result can be a miss that evicts a dirty line.
 * The dirty state of a cache line is stored in bit 0 of the tag for
 * this cache line (CACHELINE_DIRTY = 1). By OR'ing in the reference
 * type (Read/Write), the line becomes dirty on a write.
 */
static __inline__
CacheResult cachesim_setref_wb(cache_t2* c, RefType ref, UInt set_no, UWord tag)
{
    int i, j;
    UWord *set, tmp_tag;

    set = &(c->tags[set_no * c->assoc]);

    /* This loop is unrolled for just the first case, which is the most */
    /* common.  We can't unroll any further because it would screw up   */
    /* if we have a direct-mapped (1-way) cache.                        */
    if (tag == (set[0] & ~CACHELINE_DIRTY)) {
        set[0] |= ref;
        return Hit;
    }
    /* If the tag is one other than the MRU, move it into the MRU spot  */
    /* and shuffle the rest down.                                       */
    for (i = 1; i < c->assoc; i++) {
        if (tag == (set[i] & ~CACHELINE_DIRTY)) {
            tmp_tag = set[i] | ref; // update dirty flag
            for (j = i; j > 0; j--) {
                set[j] = set[j - 1];
            }
            set[0] = tmp_tag;
            return Hit;
        }
    }

    /* A miss;  install this tag as MRU, shuffle rest down. */
    tmp_tag = set[c->assoc - 1];
    for (j = c->assoc - 1; j > 0; j--) {
        set[j] = set[j - 1];
    }
    set[0] = tag | ref;

    return (tmp_tag & CACHELINE_DIRTY) ? MissDirty : Miss;
}
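
/* Sketch of the tag encoding used above (hypothetical values, not built):
 * tags are aligned to tag_mask, so bit 0 is free to hold CACHELINE_DIRTY. */
#if 0
static void example_dirty_bit(void)
{
   UWord tag   = 0x12345678 & ~(UWord)CACHELINE_FLAGMASK; /* low bits clear */
   UWord entry = tag | Write;            /* a write marks the line dirty   */
   /* (entry & ~CACHELINE_DIRTY) == tag  -> the line still matches lookups */
   /* (entry &  CACHELINE_DIRTY) != 0    -> its eviction yields MissDirty  */
}
#endif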


static __inline__
CacheResult cachesim_ref_wb(cache_t2* c, RefType ref, Addr a, UChar size)
{
    UInt set1 = ( a         >> c->line_size_bits) & (c->sets_min_1);
    UInt set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1);
    UWord tag = a & c->tag_mask;

    /* Access entirely within line. */
    if (set1 == set2)
        return cachesim_setref_wb(c, ref, set1, tag);

    /* Access straddles two lines. */
    /* Nb: this is a fast way of doing ((set1+1) % c->sets) */
    else if (((set1 + 1) & (c->sets-1)) == set2) {
        UWord tag2  = (a+size-1) & c->tag_mask;

        /* the call updates cache structures as side effect */
        CacheResult res1 =  cachesim_setref_wb(c, ref, set1, tag);
        CacheResult res2 =  cachesim_setref_wb(c, ref, set2, tag2);

        if ((res1 == MissDirty) || (res2 == MissDirty)) return MissDirty;
        return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit;

   } else {
       VG_(printf)("addr: %lx  size: %u  sets: %d %d", a, size, set1, set2);
       VG_(tool_panic)("item straddles more than two cache sets");
   }
   return Hit;
}


static
CacheModelResult cachesim_I1_Read(Addr a, UChar size)
{
    if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
    switch( cachesim_ref_wb( &L2, Read, a, size) ) {
        case Hit: return L2_Hit;
        case Miss: return MemAccess;
        default: break;
    }
    return WriteBackMemAccess;
}

static
CacheModelResult cachesim_D1_Read(Addr a, UChar size)
{
    if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
    switch( cachesim_ref_wb( &L2, Read, a, size) ) {
        case Hit: return L2_Hit;
        case Miss: return MemAccess;
        default: break;
    }
    return WriteBackMemAccess;
}

static
CacheModelResult cachesim_D1_Write(Addr a, UChar size)
{
    if ( cachesim_ref( &D1, a, size) == Hit ) {
        /* Even for an L1 hit, the write-through L1 passes
         * the write on to L2 to make the L2 line dirty.
         * But this causes no latency, so return the hit.
         */
        cachesim_ref_wb( &L2, Write, a, size);
        return L1_Hit;
    }
    switch( cachesim_ref_wb( &L2, Write, a, size) ) {
        case Hit: return L2_Hit;
        case Miss: return MemAccess;
        default: break;
    }
    return WriteBackMemAccess;
}


/*------------------------------------------------------------*/
/*--- Hardware Prefetch Simulation                         ---*/
/*------------------------------------------------------------*/

static ULong prefetch_up = 0;
static ULong prefetch_down = 0;

#define PF_STREAMS  8
#define PF_PAGEBITS 12

static UInt pf_lastblock[PF_STREAMS];
static Int  pf_seqblocks[PF_STREAMS];

static
void prefetch_clear(void)
{
  int i;
  for(i=0;i<PF_STREAMS;i++)
    pf_lastblock[i] = pf_seqblocks[i] = 0;
}

/*
 * HW prefetch emulation:
 * Start prefetching once sequential access to 3 memory blocks is detected.
 * One stream can be detected per 4 KB page.
 */
static __inline__
void prefetch_L2_doref(Addr a)
{
  UInt stream = (a >> PF_PAGEBITS) % PF_STREAMS;
  UInt block = ( a >> L2.line_size_bits);

  if (block != pf_lastblock[stream]) {
    if (pf_seqblocks[stream] == 0) {
      if (pf_lastblock[stream] +1 == block) pf_seqblocks[stream]++;
      else if (pf_lastblock[stream] -1 == block) pf_seqblocks[stream]--;
    }
    else if (pf_seqblocks[stream] >0) {
      if (pf_lastblock[stream] +1 == block) {
        pf_seqblocks[stream]++;
        if (pf_seqblocks[stream] >= 2) {
          prefetch_up++;
          cachesim_ref(&L2, a + 5 * L2.line_size,1);
        }
      }
      else pf_seqblocks[stream] = 0;
    }
    else if (pf_seqblocks[stream] <0) {
      if (pf_lastblock[stream] -1 == block) {
        pf_seqblocks[stream]--;
        if (pf_seqblocks[stream] <= -2) {
          prefetch_down++;
          cachesim_ref(&L2, a - 5 * L2.line_size,1);
        }
      }
      else pf_seqblocks[stream] = 0;
    }
    pf_lastblock[stream] = block;
  }
}
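
/* Example of the detection logic above (hypothetical addresses, 64 B L2
 * lines): accesses mapping to blocks N, N+1, N+2 of one 4 KB page drive
 * pf_seqblocks for that stream from 0 to 2; at 2 the upward trend is
 * established and the line at a + 5*line_size is touched in L2
 * (prefetch_up++). A downward run mirrors this with a - 5*line_size. */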

/* simple model with hardware prefetch */

static
CacheModelResult prefetch_I1_ref(Addr a, UChar size)
{
    if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
    prefetch_L2_doref(a);
    if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
    return MemAccess;
}

static
CacheModelResult prefetch_D1_ref(Addr a, UChar size)
{
    if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
    prefetch_L2_doref(a);
    if ( cachesim_ref( &L2, a, size) == Hit ) return L2_Hit;
    return MemAccess;
}


/* complex model with hardware prefetch */

static
CacheModelResult prefetch_I1_Read(Addr a, UChar size)
{
    if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
    prefetch_L2_doref(a);
    switch( cachesim_ref_wb( &L2, Read, a, size) ) {
        case Hit: return L2_Hit;
        case Miss: return MemAccess;
        default: break;
    }
    return WriteBackMemAccess;
}

static
CacheModelResult prefetch_D1_Read(Addr a, UChar size)
{
    if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
    prefetch_L2_doref(a);
    switch( cachesim_ref_wb( &L2, Read, a, size) ) {
        case Hit: return L2_Hit;
        case Miss: return MemAccess;
        default: break;
    }
    return WriteBackMemAccess;
}

static
CacheModelResult prefetch_D1_Write(Addr a, UChar size)
{
    prefetch_L2_doref(a);
    if ( cachesim_ref( &D1, a, size) == Hit ) {
        /* Even for an L1 hit, the write-through L1 passes
         * the write on to L2 to make the L2 line dirty.
         * But this causes no latency, so return the hit.
         */
        cachesim_ref_wb( &L2, Write, a, size);
        return L1_Hit;
    }
    switch( cachesim_ref_wb( &L2, Write, a, size) ) {
        case Hit: return L2_Hit;
        case Miss: return MemAccess;
        default: break;
    }
    return WriteBackMemAccess;
}


/*------------------------------------------------------------*/
/*--- Cache Simulation with use metric collection          ---*/
/*------------------------------------------------------------*/

/* cannot be combined with write-back or prefetching */

static
void cacheuse_initcache(cache_t2* c)
{
    int i;
    unsigned int start_mask, start_val;
    unsigned int end_mask, end_val;

    c->use    = CLG_MALLOC("cl.sim.cu_ic.1",
                           sizeof(line_use) * c->sets * c->assoc);
    c->loaded = CLG_MALLOC("cl.sim.cu_ic.2",
                           sizeof(line_loaded) * c->sets * c->assoc);
    c->line_start_mask = CLG_MALLOC("cl.sim.cu_ic.3",
                                    sizeof(int) * c->line_size);
    c->line_end_mask = CLG_MALLOC("cl.sim.cu_ic.4",
                                  sizeof(int) * c->line_size);

    c->line_size_mask = c->line_size-1;

    /* Meaning of line_start_mask/line_end_mask
     * Example: for a given cache line, you get an access starting at
     * byte offset 5 with length 4, so bytes 5 to 8 are touched. For a
     * cache line size of 32, you have 1 bit per byte in the mask:
     *
     *   bit31   bit8 bit5  bit 0
     *       |      |  |    |
     *       11..111111100000   line_start_mask[5]
     *       00..000111111111   line_end_mask[(5+4)-1]
     *
     *  use_mask |= line_start_mask[5] & line_end_mask[8]
     *
     */
    start_val = end_val = ~0;
    if (c->line_size < 32) {
        int bits_per_byte = 32/c->line_size;
        start_mask = (1<<bits_per_byte)-1;
        end_mask   = start_mask << (32-bits_per_byte);
        for(i=0;i<c->line_size;i++) {
            c->line_start_mask[i] = start_val;
            start_val  = start_val & ~start_mask;
            start_mask = start_mask << bits_per_byte;

            c->line_end_mask[c->line_size-i-1] = end_val;
            end_val  = end_val & ~end_mask;
            end_mask = end_mask >> bits_per_byte;
        }
    }
    else {
        int bytes_per_bit = c->line_size/32;
        start_mask = 1;
        end_mask   = 1 << 31;
        for(i=0;i<c->line_size;i++) {
            c->line_start_mask[i] = start_val;
            c->line_end_mask[c->line_size-i-1] = end_val;
            if ( ((i+1)%bytes_per_bit) == 0) {
                start_val   &= ~start_mask;
                end_val     &= ~end_mask;
                start_mask <<= 1;
                end_mask   >>= 1;
            }
        }
    }

    CLG_DEBUG(6, "Config %s:\n", c->desc_line);
    for(i=0;i<c->line_size;i++) {
        CLG_DEBUG(6, " [%2d]: start mask %8x, end mask %8x\n",
                  i, c->line_start_mask[i], c->line_end_mask[i]);
    }

671
672    /* We use lower tag bits as offset pointers to cache use info.
673     * I.e. some cache parameters don't work.
674     */
675    if ( (1<<c->tag_shift) < c->assoc) {
676	VG_(message)(Vg_DebugMsg,
677		     "error: Use associativity < %d for cache use statistics!",
678		     (1<<c->tag_shift) );
679	VG_(tool_panic)("Unsupported cache configuration");
680    }
681}
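
/* Complementary worked example for the second branch above (hypothetical
 * 64 B line, so bytes_per_bit = 2): an access touching byte offsets 5..8
 * yields
 *   use_mask = line_start_mask[5] & line_end_mask[8]
 *            = 0xfffffffc & 0x0000001f = 0x0000001c
 * i.e. bits 2..4 set, covering the byte pairs 4-5, 6-7 and 8-9. */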


/* for I1/D1 caches */
#define CACHEUSE(L)                                                         \
                                                                            \
static CacheModelResult cacheuse##_##L##_doRead(Addr a, UChar size)         \
{                                                                           \
   UInt set1 = ( a         >> L.line_size_bits) & (L.sets_min_1);           \
   UInt set2 = ((a+size-1) >> L.line_size_bits) & (L.sets_min_1);           \
   UWord tag  = a & L.tag_mask;                                             \
   UWord tag2;                                                              \
   int i, j, idx;                                                           \
   UWord *set, tmp_tag;                                                     \
   UInt use_mask;                                                           \
                                                                            \
   CLG_DEBUG(6,"%s.Acc(Addr %#lx, size %d): Sets [%d/%d]\n",                \
            L.name, a, size, set1, set2);                                   \
                                                                            \
   /* First case: word entirely within line. */                             \
   if (set1 == set2) {                                                      \
                                                                            \
      set = &(L.tags[set1 * L.assoc]);                                      \
      use_mask = L.line_start_mask[a & L.line_size_mask] &                  \
                 L.line_end_mask[(a+size-1) & L.line_size_mask];            \
                                                                            \
      /* This loop is unrolled for just the first case, which is the most */\
      /* common.  We can't unroll any further because it would screw up   */\
      /* if we have a direct-mapped (1-way) cache.                        */\
      if (tag == (set[0] & L.tag_mask)) {                                   \
        idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask);                    \
        L.use[idx].count ++;                                                \
        L.use[idx].mask |= use_mask;                                        \
        CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
                 idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,          \
                 use_mask, L.use[idx].mask, L.use[idx].count);              \
        return L1_Hit;                                                      \
      }                                                                     \
      /* If the tag is one other than the MRU, move it into the MRU spot  */\
      /* and shuffle the rest down.                                       */\
      for (i = 1; i < L.assoc; i++) {                                       \
         if (tag == (set[i] & L.tag_mask)) {                                \
            tmp_tag = set[i];                                               \
            for (j = i; j > 0; j--) {                                       \
               set[j] = set[j - 1];                                         \
            }                                                               \
            set[0] = tmp_tag;                                               \
            idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask);               \
            L.use[idx].count ++;                                            \
            L.use[idx].mask |= use_mask;                                    \
            CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
                 i, idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,       \
                 use_mask, L.use[idx].mask, L.use[idx].count);              \
            return L1_Hit;                                                  \
         }                                                                  \
      }                                                                     \
                                                                            \
      /* A miss;  install this tag as MRU, shuffle rest down. */            \
      tmp_tag = set[L.assoc - 1] & ~L.tag_mask;                             \
      for (j = L.assoc - 1; j > 0; j--) {                                   \
         set[j] = set[j - 1];                                               \
      }                                                                     \
      set[0] = tag | tmp_tag;                                               \
      idx = (set1 * L.assoc) + tmp_tag;                                     \
      return update_##L##_use(&L, idx,                                      \
                       use_mask, a & ~L.line_size_mask);                    \
                                                                            \
   /* Second case: word straddles two lines. */                             \
   /* Nb: this is a fast way of doing ((set1+1) % L.sets) */                \
   } else if (((set1 + 1) & (L.sets-1)) == set2) {                          \
      Int miss1=0, miss2=0; /* CacheModelResult: 0: L1_Hit, 1: L2_Hit, 2: MemAccess */\
      set = &(L.tags[set1 * L.assoc]);                                      \
      use_mask = L.line_start_mask[a & L.line_size_mask];                   \
      if (tag == (set[0] & L.tag_mask)) {                                   \
         idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask);                   \
         L.use[idx].count ++;                                               \
         L.use[idx].mask |= use_mask;                                       \
         CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
                 idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,          \
                 use_mask, L.use[idx].mask, L.use[idx].count);              \
         goto block2;                                                       \
      }                                                                     \
      for (i = 1; i < L.assoc; i++) {                                       \
         if (tag == (set[i] & L.tag_mask)) {                                \
            tmp_tag = set[i];                                               \
            for (j = i; j > 0; j--) {                                       \
               set[j] = set[j - 1];                                         \
            }                                                               \
            set[0] = tmp_tag;                                               \
            idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask);               \
            L.use[idx].count ++;                                            \
            L.use[idx].mask |= use_mask;                                    \
            CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
                 i, idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,       \
                 use_mask, L.use[idx].mask, L.use[idx].count);              \
            goto block2;                                                    \
         }                                                                  \
      }                                                                     \
      tmp_tag = set[L.assoc - 1] & ~L.tag_mask;                             \
      for (j = L.assoc - 1; j > 0; j--) {                                   \
         set[j] = set[j - 1];                                               \
      }                                                                     \
      set[0] = tag | tmp_tag;                                               \
      idx = (set1 * L.assoc) + tmp_tag;                                     \
      miss1 = update_##L##_use(&L, idx,                                     \
                       use_mask, a & ~L.line_size_mask);                    \
block2:                                                                     \
      set = &(L.tags[set2 * L.assoc]);                                      \
      use_mask = L.line_end_mask[(a+size-1) & L.line_size_mask];            \
      tag2  = (a+size-1) & L.tag_mask;                                      \
      if (tag2 == (set[0] & L.tag_mask)) {                                  \
         idx = (set2 * L.assoc) + (set[0] & ~L.tag_mask);                   \
         L.use[idx].count ++;                                               \
         L.use[idx].mask |= use_mask;                                       \
         CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
                 idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,          \
                 use_mask, L.use[idx].mask, L.use[idx].count);              \
         return miss1;                                                      \
      }                                                                     \
      for (i = 1; i < L.assoc; i++) {                                       \
         if (tag2 == (set[i] & L.tag_mask)) {                               \
            tmp_tag = set[i];                                               \
            for (j = i; j > 0; j--) {                                       \
               set[j] = set[j - 1];                                         \
            }                                                               \
            set[0] = tmp_tag;                                               \
            idx = (set2 * L.assoc) + (tmp_tag & ~L.tag_mask);               \
            L.use[idx].count ++;                                            \
            L.use[idx].mask |= use_mask;                                    \
            CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
                 i, idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,       \
                 use_mask, L.use[idx].mask, L.use[idx].count);              \
            return miss1;                                                   \
         }                                                                  \
      }                                                                     \
      tmp_tag = set[L.assoc - 1] & ~L.tag_mask;                             \
      for (j = L.assoc - 1; j > 0; j--) {                                   \
         set[j] = set[j - 1];                                               \
      }                                                                     \
      set[0] = tag2 | tmp_tag;                                              \
      idx = (set2 * L.assoc) + tmp_tag;                                     \
      miss2 = update_##L##_use(&L, idx,                                     \
                       use_mask, (a+size-1) & ~L.line_size_mask);           \
      return (miss1==MemAccess || miss2==MemAccess) ? MemAccess:L2_Hit;     \
                                                                            \
   } else {                                                                 \
       VG_(printf)("addr: %#lx  size: %u  sets: %d %d", a, size, set1, set2); \
       VG_(tool_panic)("item straddles more than two cache sets");          \
   }                                                                        \
   return 0;                                                                \
}


/* logarithmic bitcounting algorithm, see
 * http://graphics.stanford.edu/~seander/bithacks.html
 */
static __inline__ unsigned int countBits(unsigned int bits)
{
  unsigned int c; // store the total here
  const int S[] = {1, 2, 4, 8, 16}; // Magic Binary Numbers
  const int B[] = {0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF, 0x0000FFFF};

  c = bits;
  c = ((c >> S[0]) & B[0]) + (c & B[0]);
  c = ((c >> S[1]) & B[1]) + (c & B[1]);
  c = ((c >> S[2]) & B[2]) + (c & B[2]);
  c = ((c >> S[3]) & B[3]) + (c & B[3]);
  c = ((c >> S[4]) & B[4]) + (c & B[4]);
  return c;
}
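
/* Worked example: countBits(0xF5), with 0xF5 = 11110101b (6 bits set):
 *   after S[0]/B[0]: 0xA5  (per-pair sums   10|10|01|01)
 *   after S[1]/B[1]: 0x42  (per-nibble sums 4 and 2)
 *   after S[2]/B[2]: 0x06  (per-byte sum 6); S[3] and S[4] keep it at 6.
 */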

static void update_L2_use(int idx, Addr memline)
{
  line_loaded* loaded = &(L2.loaded[idx]);
  line_use* use = &(L2.use[idx]);
  int i = ((32 - countBits(use->mask)) * L2.line_size)>>5;

  CLG_DEBUG(2, " L2.miss [%d]: at %#lx accessing memline %#lx\n",
           idx, bb_base + current_ii->instr_offset, memline);
  if (use->count>0) {
    CLG_DEBUG(2, "   old: used %d, loss bits %d (%08x) [line %#lx from %#lx]\n",
             use->count, i, use->mask, loaded->memline, loaded->iaddr);
    CLG_DEBUG(2, "   collect: %d, use_base %p\n",
             CLG_(current_state).collect, loaded->use_base);

    if (CLG_(current_state).collect && loaded->use_base) {
      (loaded->use_base)[off_L2_AcCost] += 1000 / use->count;
      (loaded->use_base)[off_L2_SpLoss] += i;
    }
  }

  use->count = 0;
  use->mask  = 0;

  loaded->memline = memline;
  loaded->iaddr   = bb_base + current_ii->instr_offset;
  loaded->use_base = (CLG_(current_state).nonskipped) ?
    CLG_(current_state).nonskipped->skipped :
    cost_base + current_ii->cost_offset;
}

static
CacheModelResult cacheuse_L2_access(Addr memline, line_loaded* l1_loaded)
{
   UInt setNo = (memline >> L2.line_size_bits) & (L2.sets_min_1);
   UWord* set = &(L2.tags[setNo * L2.assoc]);
   UWord tag  = memline & L2.tag_mask;

   int i, j, idx;
   UWord tmp_tag;

   CLG_DEBUG(6,"L2.Acc(Memline %#lx): Set %d\n", memline, setNo);

   if (tag == (set[0] & L2.tag_mask)) {
     idx = (setNo * L2.assoc) + (set[0] & ~L2.tag_mask);
     l1_loaded->dep_use = &(L2.use[idx]);

     CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): => %08x, count %d\n",
               idx, L2.loaded[idx].memline,  L2.loaded[idx].iaddr,
               L2.use[idx].mask, L2.use[idx].count);
     return L2_Hit;
   }
   for (i = 1; i < L2.assoc; i++) {
     if (tag == (set[i] & L2.tag_mask)) {
       tmp_tag = set[i];
       for (j = i; j > 0; j--) {
         set[j] = set[j - 1];
       }
       set[0] = tmp_tag;
       idx = (setNo * L2.assoc) + (tmp_tag & ~L2.tag_mask);
       l1_loaded->dep_use = &(L2.use[idx]);

       CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): => %08x, count %d\n",
                 i, idx, L2.loaded[idx].memline,  L2.loaded[idx].iaddr,
                 L2.use[idx].mask, L2.use[idx].count);
       return L2_Hit;
     }
   }

   /* A miss;  install this tag as MRU, shuffle rest down. */
   tmp_tag = set[L2.assoc - 1] & ~L2.tag_mask;
   for (j = L2.assoc - 1; j > 0; j--) {
     set[j] = set[j - 1];
   }
   set[0] = tag | tmp_tag;
   idx = (setNo * L2.assoc) + tmp_tag;
   l1_loaded->dep_use = &(L2.use[idx]);

   update_L2_use(idx, memline);

   return MemAccess;
}




#define UPDATE_USE(L)                                                \
                                                                     \
static CacheModelResult update##_##L##_use(cache_t2* cache, int idx, \
                               UInt mask, Addr memline)              \
{                                                                    \
  line_loaded* loaded = &(cache->loaded[idx]);                       \
  line_use* use = &(cache->use[idx]);                                \
  int c = ((32 - countBits(use->mask)) * cache->line_size)>>5;       \
                                                                     \
  CLG_DEBUG(2, " %s.miss [%d]: at %#lx accessing memline %#lx (mask %08x)\n", \
           cache->name, idx, bb_base + current_ii->instr_offset, memline, mask); \
  if (use->count>0) {                                                \
    CLG_DEBUG(2, "   old: used %d, loss bits %d (%08x) [line %#lx from %#lx]\n",\
             use->count, c, use->mask, loaded->memline, loaded->iaddr); \
    CLG_DEBUG(2, "   collect: %d, use_base %p\n",                    \
             CLG_(current_state).collect, loaded->use_base);         \
                                                                     \
    if (CLG_(current_state).collect && loaded->use_base) {           \
      (loaded->use_base)[off_##L##_AcCost] += 1000 / use->count;     \
      (loaded->use_base)[off_##L##_SpLoss] += c;                     \
                                                                     \
      /* FIXME (?): L1/L2 line sizes must be equal ! */              \
      loaded->dep_use->mask |= use->mask;                            \
      loaded->dep_use->count += use->count;                          \
    }                                                                \
  }                                                                  \
                                                                     \
  use->count = 1;                                                    \
  use->mask  = mask;                                                 \
  loaded->memline = memline;                                         \
  loaded->iaddr   = bb_base + current_ii->instr_offset;              \
  loaded->use_base = (CLG_(current_state).nonskipped) ?              \
    CLG_(current_state).nonskipped->skipped :                        \
    cost_base + current_ii->cost_offset;                             \
                                                                     \
  if (memline == 0) return L2_Hit;                                   \
  return cacheuse_L2_access(memline, loaded);                        \
}

UPDATE_USE(I1);
UPDATE_USE(D1);

CACHEUSE(I1);
CACHEUSE(D1);


static
void cacheuse_finish(void)
{
  int i;
  InstrInfo ii = { 0,0,0,0,0 };

  if (!CLG_(current_state).collect) return;

  bb_base = 0;
  current_ii = &ii;
  cost_base = 0;

  /* update usage counters */
  if (I1.use)
    for (i = 0; i < I1.sets * I1.assoc; i++)
      if (I1.loaded[i].use_base)
        update_I1_use( &I1, i, 0,0);

  if (D1.use)
    for (i = 0; i < D1.sets * D1.assoc; i++)
      if (D1.loaded[i].use_base)
        update_D1_use( &D1, i, 0,0);

  if (L2.use)
    for (i = 0; i < L2.sets * L2.assoc; i++)
      if (L2.loaded[i].use_base)
        update_L2_use(i, 0);
}



/*------------------------------------------------------------*/
/*--- Helper functions called by instrumented code         ---*/
/*------------------------------------------------------------*/


static __inline__
void inc_costs(CacheModelResult r, ULong* c1, ULong* c2)
{
    switch(r) {
        case WriteBackMemAccess:
            if (clo_simulate_writeback) {
                c1[3]++;
                c2[3]++;
            }
            // fall through

        case MemAccess:
            c1[2]++;
            c2[2]++;
            // fall through

        case L2_Hit:
            c1[1]++;
            c2[1]++;
            // fall through

        default:
            c1[0]++;
            c2[0]++;
    }
}
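
/* The deliberate fall-throughs above turn one result into inclusive
 * event counts, with c[0..3] = accesses, L1 misses, L2 misses and
 * write-backs:
 *   L1_Hit             -> c[0]++
 *   L2_Hit             -> c[0]++, c[1]++
 *   MemAccess          -> c[0]++, c[1]++, c[2]++
 *   WriteBackMemAccess -> additionally c[3]++, but only with
 *                         --simulate-wb=yes
 */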


VG_REGPARM(1)
static void log_1I0D(InstrInfo* ii)
{
    CacheModelResult IrRes;

    current_ii = ii;
    IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size);

    CLG_DEBUG(6, "log_1I0D:  Ir=%#lx/%u => Ir %d\n",
              bb_base + ii->instr_offset, ii->instr_size, IrRes);

    if (CLG_(current_state).collect) {
        ULong* cost_Ir;

        if (CLG_(current_state).nonskipped)
            cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Ir;
        else
            cost_Ir = cost_base + ii->cost_offset + off_D0_Ir;

        inc_costs(IrRes, cost_Ir,
                  CLG_(current_state).cost + CLG_(sets).off_full_Ir );
    }
}


/* Instruction doing a read access */

VG_REGPARM(2)
static void log_1I1Dr(InstrInfo* ii, Addr data)
{
    CacheModelResult IrRes, DrRes;

    current_ii = ii;
    IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size);
    DrRes = (*simulator.D1_Read)(data, ii->data_size);

    CLG_DEBUG(6, "log_1I1Dr: Ir=%#lx/%u, Dr=%#lx/%u => Ir %d, Dr %d\n",
              bb_base + ii->instr_offset, ii->instr_size,
              data, ii->data_size, IrRes, DrRes);

    if (CLG_(current_state).collect) {
        ULong *cost_Ir, *cost_Dr;

        if (CLG_(current_state).nonskipped) {
            cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Ir;
            cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Dr;
        }
        else {
            cost_Ir = cost_base + ii->cost_offset + off_D1r_Ir;
            cost_Dr = cost_base + ii->cost_offset + off_D1r_Dr;
        }

        inc_costs(IrRes, cost_Ir,
                  CLG_(current_state).cost + CLG_(sets).off_full_Ir );
        inc_costs(DrRes, cost_Dr,
                  CLG_(current_state).cost + CLG_(sets).off_full_Dr );
    }
}


VG_REGPARM(2)
static void log_0I1Dr(InstrInfo* ii, Addr data)
{
    CacheModelResult DrRes;

    current_ii = ii;
    DrRes = (*simulator.D1_Read)(data, ii->data_size);

    CLG_DEBUG(6, "log_0I1Dr: Dr=%#lx/%u => Dr %d\n",
              data, ii->data_size, DrRes);

    if (CLG_(current_state).collect) {
        ULong *cost_Dr;

        if (CLG_(current_state).nonskipped) {
            cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Dr;
        }
        else {
            cost_Dr = cost_base + ii->cost_offset + off_D1r_Dr;
        }

        inc_costs(DrRes, cost_Dr,
                  CLG_(current_state).cost + CLG_(sets).off_full_Dr );
    }
}


/* Instruction doing a write access */

VG_REGPARM(2)
static void log_1I1Dw(InstrInfo* ii, Addr data)
{
    CacheModelResult IrRes, DwRes;

    current_ii = ii;
    IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size);
    DwRes = (*simulator.D1_Write)(data, ii->data_size);

    CLG_DEBUG(6, "log_1I1Dw: Ir=%#lx/%u, Dw=%#lx/%u => Ir %d, Dw %d\n",
              bb_base + ii->instr_offset, ii->instr_size,
              data, ii->data_size, IrRes, DwRes);

    if (CLG_(current_state).collect) {
        ULong *cost_Ir, *cost_Dw;

        if (CLG_(current_state).nonskipped) {
            cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Ir;
            cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dw;
        }
        else {
            cost_Ir = cost_base + ii->cost_offset + off_D1w_Ir;
            cost_Dw = cost_base + ii->cost_offset + off_D1w_Dw;
        }

        inc_costs(IrRes, cost_Ir,
                  CLG_(current_state).cost + CLG_(sets).off_full_Ir );
        inc_costs(DwRes, cost_Dw,
                  CLG_(current_state).cost + CLG_(sets).off_full_Dw );
    }
}

VG_REGPARM(2)
static void log_0I1Dw(InstrInfo* ii, Addr data)
{
    CacheModelResult DwRes;

    current_ii = ii;
    DwRes = (*simulator.D1_Write)(data, ii->data_size);

    CLG_DEBUG(6, "log_0I1Dw: Dw=%#lx/%u => Dw %d\n",
              data, ii->data_size, DwRes);

    if (CLG_(current_state).collect) {
        ULong *cost_Dw;

        if (CLG_(current_state).nonskipped) {
            cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_full_Dw;
        }
        else {
            cost_Dw = cost_base + ii->cost_offset + off_D1w_Dw;
        }

        inc_costs(DwRes, cost_Dw,
                  CLG_(current_state).cost + CLG_(sets).off_full_Dw );
    }
}

/* Instruction doing a read and a write access */

VG_REGPARM(3)
static void log_1I2D(InstrInfo* ii, Addr data1, Addr data2)
{
    CacheModelResult IrRes, DrRes, DwRes;

    current_ii = ii;
    IrRes = (*simulator.I1_Read)(bb_base + ii->instr_offset, ii->instr_size);
    DrRes = (*simulator.D1_Read)(data1, ii->data_size);
    DwRes = (*simulator.D1_Write)(data2, ii->data_size);

    CLG_DEBUG(6,
              "log_1I2D: Ir=%#lx/%u, Dr=%#lx/%u, Dw=%#lx/%u => Ir %d, Dr %d, Dw %d\n",
              bb_base + ii->instr_offset, ii->instr_size,
              data1, ii->data_size, data2, ii->data_size, IrRes, DrRes, DwRes);

    if (CLG_(current_state).collect) {
        ULong *cost_Ir, *cost_Dr, *cost_Dw;

        if (CLG_(current_state).nonskipped) {
            cost_Ir = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Ir;
            cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dr;
            cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dw;
        }
        else {
            cost_Ir = cost_base + ii->cost_offset + off_D2_Ir;
            cost_Dr = cost_base + ii->cost_offset + off_D2_Dr;
            cost_Dw = cost_base + ii->cost_offset + off_D2_Dw;
        }

        inc_costs(IrRes, cost_Ir,
                  CLG_(current_state).cost + CLG_(sets).off_full_Ir );
        inc_costs(DrRes, cost_Dr,
                  CLG_(current_state).cost + CLG_(sets).off_full_Dr );
        inc_costs(DwRes, cost_Dw,
                  CLG_(current_state).cost + CLG_(sets).off_full_Dw );
    }
}

VG_REGPARM(3)
static void log_0I2D(InstrInfo* ii, Addr data1, Addr data2)
{
    CacheModelResult DrRes, DwRes;

    current_ii = ii;
    DrRes = (*simulator.D1_Read)(data1, ii->data_size);
    DwRes = (*simulator.D1_Write)(data2, ii->data_size);

    CLG_DEBUG(6,
              "log_0I2D: Dr=%#lx/%u, Dw=%#lx/%u => Dr %d, Dw %d\n",
              data1, ii->data_size, data2, ii->data_size, DrRes, DwRes);

    if (CLG_(current_state).collect) {
        ULong *cost_Dr, *cost_Dw;

        if (CLG_(current_state).nonskipped) {
            cost_Dr = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dr;
            cost_Dw = CLG_(current_state).nonskipped->skipped + CLG_(sets).off_sim_Dw;
        }
        else {
            cost_Dr = cost_base + ii->cost_offset + off_D2_Dr;
            cost_Dw = cost_base + ii->cost_offset + off_D2_Dw;
        }

        inc_costs(DrRes, cost_Dr,
                  CLG_(current_state).cost + CLG_(sets).off_full_Dr );
        inc_costs(DwRes, cost_Dw,
                  CLG_(current_state).cost + CLG_(sets).off_full_Dw );
    }
}


/*------------------------------------------------------------*/
/*--- Cache configuration                                  ---*/
/*------------------------------------------------------------*/

#define UNDEFINED_CACHE     ((cache_t) { -1, -1, -1 })

static cache_t clo_I1_cache = UNDEFINED_CACHE;
static cache_t clo_D1_cache = UNDEFINED_CACHE;
static cache_t clo_L2_cache = UNDEFINED_CACHE;


/* Checks that the cache config is ok; aborts if not. */
static
void check_cache(cache_t* cache, Char *name)
{
   /* Simulator requires line size and set count to be powers of two */
   if (( cache->size % (cache->line_size * cache->assoc) != 0) ||
       (-1 == VG_(log2)(cache->size/cache->line_size/cache->assoc))) {
      VG_(message)(Vg_UserMsg,
         "error: %s set count not a power of two; aborting.",
         name);
      VG_(exit)(1);
   }

   if (-1 == VG_(log2)(cache->line_size)) {
      VG_(message)(Vg_UserMsg,
         "error: %s line size of %dB not a power of two; aborting.",
         name, cache->line_size);
      VG_(exit)(1);
   }

   // Then check line size >= 16 -- any smaller and a single instruction could
   // straddle three cache lines, which breaks a simulation assertion and is
   // stupid anyway.
   if (cache->line_size < MIN_LINE_SIZE) {
      VG_(message)(Vg_UserMsg,
         "error: %s line size of %dB too small; aborting.",
         name, cache->line_size);
      VG_(exit)(1);
   }

   /* Then check cache size > line size (causes seg faults if not). */
   if (cache->size <= cache->line_size) {
      VG_(message)(Vg_UserMsg,
         "error: %s cache size of %dB <= line size of %dB; aborting.",
         name, cache->size, cache->line_size);
      VG_(exit)(1);
   }

   /* Then check assoc <= (size / line size) (seg faults otherwise). */
   if (cache->assoc > (cache->size / cache->line_size)) {
      VG_(message)(Vg_UserMsg,
         "error: %s associativity > (size / line size); aborting.", name);
      VG_(exit)(1);
   }
}

static
void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* L2c)
{
#define DEFINED(L)   (-1 != L.size  || -1 != L.assoc || -1 != L.line_size)

   Int n_clos = 0;

   // Count how many were defined on the command line.
   if (DEFINED(clo_I1_cache)) { n_clos++; }
   if (DEFINED(clo_D1_cache)) { n_clos++; }
   if (DEFINED(clo_L2_cache)) { n_clos++; }

   // Set the cache config (using auto-detection, if supported by the
   // architecture).
   VG_(configure_caches)( I1c, D1c, L2c, (3 == n_clos) );

   // Then replace with any defined on the command line.
   if (DEFINED(clo_I1_cache)) { *I1c = clo_I1_cache; }
   if (DEFINED(clo_D1_cache)) { *D1c = clo_D1_cache; }
   if (DEFINED(clo_L2_cache)) { *L2c = clo_L2_cache; }

   // Then check the values; check_cache() aborts if they are not acceptable.
   check_cache(I1c, "I1");
   check_cache(D1c, "D1");
   check_cache(L2c, "L2");

   if (VG_(clo_verbosity) > 1) {
      VG_(message)(Vg_UserMsg, "Cache configuration used:");
      VG_(message)(Vg_UserMsg, "  I1: %dB, %d-way, %dB lines",
                               I1c->size, I1c->assoc, I1c->line_size);
      VG_(message)(Vg_UserMsg, "  D1: %dB, %d-way, %dB lines",
                               D1c->size, D1c->assoc, D1c->line_size);
      VG_(message)(Vg_UserMsg, "  L2: %dB, %d-way, %dB lines",
                               L2c->size, L2c->assoc, L2c->line_size);
   }
#undef DEFINED
}


/* Initialize and clear simulator state */
static void cachesim_post_clo_init(void)
{
  /* Cache configurations. */
  cache_t  I1c, D1c, L2c;

  /* Initialize access handlers */
  if (!CLG_(clo).simulate_cache) {
    CLG_(cachesim).log_1I0D  = 0;
    CLG_(cachesim).log_1I0D_name = "(no function)";

    CLG_(cachesim).log_1I1Dr = 0;
    CLG_(cachesim).log_1I1Dw = 0;
    CLG_(cachesim).log_1I2D  = 0;
    CLG_(cachesim).log_1I1Dr_name = "(no function)";
    CLG_(cachesim).log_1I1Dw_name = "(no function)";
    CLG_(cachesim).log_1I2D_name = "(no function)";

    CLG_(cachesim).log_0I1Dr = 0;
    CLG_(cachesim).log_0I1Dw = 0;
    CLG_(cachesim).log_0I2D  = 0;
    CLG_(cachesim).log_0I1Dr_name = "(no function)";
    CLG_(cachesim).log_0I1Dw_name = "(no function)";
    CLG_(cachesim).log_0I2D_name = "(no function)";
    return;
  }

  /* Configuration of caches is only needed with real cache simulation */
  configure_caches(&I1c, &D1c, &L2c);

  I1.name = "I1";
  D1.name = "D1";
  L2.name = "L2";

  cachesim_initcache(I1c, &I1);
  cachesim_initcache(D1c, &D1);
  cachesim_initcache(L2c, &L2);

  /* All simulator variants use the standard logging helpers;
   * the concrete model is selected via the simulator struct. */

  CLG_(cachesim).log_1I0D  = log_1I0D;
  CLG_(cachesim).log_1I0D_name  = "log_1I0D";

  CLG_(cachesim).log_1I1Dr = log_1I1Dr;
  CLG_(cachesim).log_1I1Dw = log_1I1Dw;
  CLG_(cachesim).log_1I2D  = log_1I2D;
  CLG_(cachesim).log_1I1Dr_name = "log_1I1Dr";
  CLG_(cachesim).log_1I1Dw_name = "log_1I1Dw";
  CLG_(cachesim).log_1I2D_name  = "log_1I2D";

  CLG_(cachesim).log_0I1Dr = log_0I1Dr;
  CLG_(cachesim).log_0I1Dw = log_0I1Dw;
  CLG_(cachesim).log_0I2D  = log_0I2D;
  CLG_(cachesim).log_0I1Dr_name = "log_0I1Dr";
  CLG_(cachesim).log_0I1Dw_name = "log_0I1Dw";
  CLG_(cachesim).log_0I2D_name  = "log_0I2D";

  if (clo_collect_cacheuse) {

      /* Warn about unsupported option combinations */
      if (clo_simulate_hwpref) {
          VG_(message)(Vg_DebugMsg,
                       "warning: prefetch simulation cannot be used with cache usage");
          clo_simulate_hwpref = False;
      }

      if (clo_simulate_writeback) {
          VG_(message)(Vg_DebugMsg,
                       "warning: write-back simulation cannot be used with cache usage");
          clo_simulate_writeback = False;
      }

      simulator.I1_Read  = cacheuse_I1_doRead;
      simulator.D1_Read  = cacheuse_D1_doRead;
      /* use collection does not distinguish reads from writes,
       * so writes go through the same handler as reads */
      simulator.D1_Write = cacheuse_D1_doRead;
      return;
  }

  if (clo_simulate_hwpref) {
    prefetch_clear();

    if (clo_simulate_writeback) {
      simulator.I1_Read  = prefetch_I1_Read;
      simulator.D1_Read  = prefetch_D1_Read;
      simulator.D1_Write = prefetch_D1_Write;
    }
    else {
      simulator.I1_Read  = prefetch_I1_ref;
      simulator.D1_Read  = prefetch_D1_ref;
      simulator.D1_Write = prefetch_D1_ref;
    }

    return;
  }

  if (clo_simulate_writeback) {
      simulator.I1_Read  = cachesim_I1_Read;
      simulator.D1_Read  = cachesim_D1_Read;
      simulator.D1_Write = cachesim_D1_Write;
  }
  else {
      simulator.I1_Read  = cachesim_I1_ref;
      simulator.D1_Read  = cachesim_D1_ref;
      simulator.D1_Write = cachesim_D1_ref;
  }
}


/* Clear simulator state. The simulator must have been initialized before. */
static
void cachesim_clear(void)
{
  cachesim_clearcache(&I1);
  cachesim_clearcache(&D1);
  cachesim_clearcache(&L2);

  prefetch_clear();
}


static void cachesim_getdesc(Char* buf)
{
  Int p;
  p = VG_(sprintf)(buf, "\ndesc: I1 cache: %s\n", I1.desc_line);
  p += VG_(sprintf)(buf+p, "desc: D1 cache: %s\n", D1.desc_line);
  VG_(sprintf)(buf+p, "desc: L2 cache: %s\n", L2.desc_line);
}

static
void cachesim_print_opts(void)
{
  VG_(printf)(
"\n   cache simulator options:\n"
"    --simulate-cache=no|yes   Do cache simulation [no]\n"
"    --simulate-wb=no|yes      Count write-back events [no]\n"
"    --simulate-hwpref=no|yes  Simulate hardware prefetch [no]\n"
#if CLG_EXPERIMENTAL
"    --simulate-sectors=no|yes Simulate sectored behaviour [no]\n"
#endif
"    --cacheuse=no|yes         Collect cache block use [no]\n"
"    --I1=<size>,<assoc>,<line_size>  set I1 cache manually\n"
"    --D1=<size>,<assoc>,<line_size>  set D1 cache manually\n"
"    --L2=<size>,<assoc>,<line_size>  set L2 cache manually\n"
              );
}

1509static void parse_opt ( cache_t* cache, char* orig_opt, int opt_len )
1510{
1511   int   i1, i2, i3;
1512   int   i;
1513   char *opt = VG_(strdup)("cl.sim.po.1", orig_opt);
1514
1515   i = i1 = opt_len;
1516
1517   /* Option looks like "--I1=65536,2,64".
1518    * Find commas, replace with NULs to make three independent
1519    * strings, then extract numbers.  Yuck. */
1520   while (VG_(isdigit)(opt[i])) i++;
1521   if (',' == opt[i]) {
1522      opt[i++] = '\0';
1523      i2 = i;
1524   } else goto bad;
1525   while (VG_(isdigit)(opt[i])) i++;
1526   if (',' == opt[i]) {
1527      opt[i++] = '\0';
1528      i3 = i;
1529   } else goto bad;
1530   while (VG_(isdigit)(opt[i])) i++;
1531   if ('\0' != opt[i]) goto bad;
1532
1533   cache->size      = (Int)VG_(atoll)(opt + i1);
1534   cache->assoc     = (Int)VG_(atoll)(opt + i2);
1535   cache->line_size = (Int)VG_(atoll)(opt + i3);
1536
1537   VG_(free)(opt);
1538
1539   return;
1540
1541  bad:
1542   VG_(err_bad_option)(orig_opt);
1543}
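
/* Worked example: for orig_opt = "--I1=65536,2,64" and opt_len = 5,
 * the two commas in the private copy are overwritten with '\0',
 * leaving three independent strings:
 *   opt + i1  (i1 =  5)  ->  "65536"   (size in bytes)
 *   opt + i2  (i2 = 11)  ->  "2"       (associativity)
 *   opt + i3  (i3 = 13)  ->  "64"      (line size in bytes)
 * which VG_(atoll) then converts. */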

/* Check for a command-line option of the cache simulator.
 * Return False if the option is unknown, i.e. not handled here.
 *
 * Called from CLG_(process_cmd_line_option)() in clo.c
 */
static Bool cachesim_parse_opt(Char* arg)
{
  if (0 == VG_(strcmp)(arg, "--simulate-wb=yes"))
    clo_simulate_writeback = True;
  else if (0 == VG_(strcmp)(arg, "--simulate-wb=no"))
    clo_simulate_writeback = False;

  else if (0 == VG_(strcmp)(arg, "--simulate-hwpref=yes"))
    clo_simulate_hwpref = True;
  else if (0 == VG_(strcmp)(arg, "--simulate-hwpref=no"))
    clo_simulate_hwpref = False;

  else if (0 == VG_(strcmp)(arg, "--simulate-sectors=yes"))
    clo_simulate_sectors = True;
  else if (0 == VG_(strcmp)(arg, "--simulate-sectors=no"))
    clo_simulate_sectors = False;

  else if (0 == VG_(strcmp)(arg, "--cacheuse=yes")) {
    clo_collect_cacheuse = True;
    /* Use counters only make sense with instruction-granularity dumping */
    CLG_(clo).dump_instr = True;
  }
  else if (0 == VG_(strcmp)(arg, "--cacheuse=no"))
    clo_collect_cacheuse = False;

  /* 5 is the length of "--I1=" */
  else if (0 == VG_(strncmp)(arg, "--I1=", 5))
    parse_opt(&clo_I1_cache, arg,   5);
  else if (0 == VG_(strncmp)(arg, "--D1=", 5))
    parse_opt(&clo_D1_cache, arg,   5);
  else if (0 == VG_(strncmp)(arg, "--L2=", 5))
    parse_opt(&clo_L2_cache, arg,   5);
  else
    return False;

  return True;
}

/* Adds commas to a ULong, right-justified in a field field_width
 * characters wide; the string is returned in buf, and the return value
 * is the width of the commified number without padding. */
static
Int commify(ULong n, int field_width, char* buf)
{
   int len, n_commas, i, j, new_len, space;

   VG_(sprintf)(buf, "%llu", n);
   len = VG_(strlen)(buf);
   n_commas = (len - 1) / 3;
   new_len = len + n_commas;
   space = field_width - new_len;

   /* Allow for printing a number in a field_width smaller than its size */
   if (space < 0) space = 0;

   /* Make j = -1 because we copy the '\0' before doing the numbers in groups
    * of three. */
   for (j = -1, i = len ; i >= 0; i--) {
      buf[i + n_commas + space] = buf[i];

      if ((i>0) && (3 == ++j)) {
         j = 0;
         n_commas--;
         buf[i + n_commas + space] = ',';
      }
   }
   /* Right justify in field. */
   for (i = 0; i < space; i++)  buf[i] = ' ';
   return new_len;
}
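
/* Example: commify(1234567, 15, buf) yields "      1,234,567"
 * (six leading blanks) and returns 9, the width of "1,234,567"
 * without the padding. */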

static
void percentify(Int n, Int ex, Int field_width, char buf[])
{
   int i, len, space, frac;

   frac = n % ex;
   /* Zero-pad the fractional part: with ex == 100 (the only other
    * value used here is 10), n == 205 must print as "2.05%",
    * not "2.5%". */
   if ((ex == 100) && (frac < 10))
      VG_(sprintf)(buf, "%d.0%d%%", n / ex, frac);
   else
      VG_(sprintf)(buf, "%d.%d%%", n / ex, frac);
   len = VG_(strlen)(buf);
   space = field_width - len;
   if (space < 0) space = 0;     /* Allow for v. small field_width */
   i = len;

   /* Right justify in field */
   for (     ; i >= 0;    i--)  buf[i + space] = buf[i];
   for (i = 0; i < space; i++)  buf[i] = ' ';
}
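
/* Example: percentify(1234, 100, 8, buf) yields "  12.34%"; with the
 * zero-padding above, percentify(1205, 100, 8, buf) yields "  12.05%"
 * instead of the misleading "  12.5%". */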

static
void cachesim_printstat(void)
{
  FullCost total = CLG_(total_cost), D_total = 0;
  ULong L2_total_m, L2_total_mr, L2_total_mw,
    L2_total, L2_total_r, L2_total_w;
  char buf1[RESULTS_BUF_LEN],
    buf2[RESULTS_BUF_LEN],
    buf3[RESULTS_BUF_LEN];
  Int l1, l2, l3;
  Int p;

  if ((VG_(clo_verbosity) > 1) && clo_simulate_hwpref) {
    VG_(message)(Vg_DebugMsg, "Prefetch Up:       %llu",
		 prefetch_up);
    VG_(message)(Vg_DebugMsg, "Prefetch Down:     %llu",
		 prefetch_down);
    VG_(message)(Vg_DebugMsg, "");
  }

  /* I cache results.  Use the I_refs value to determine the first column
   * width. */
  l1 = commify(total[CLG_(sets).off_full_Ir], 0, buf1);
  VG_(message)(Vg_UserMsg, "I   refs:      %s", buf1);

  if (!CLG_(clo).simulate_cache) return;

  commify(total[CLG_(sets).off_full_Ir +1], l1, buf1);
  VG_(message)(Vg_UserMsg, "I1  misses:    %s", buf1);

  commify(total[CLG_(sets).off_full_Ir +2], l1, buf1);
  VG_(message)(Vg_UserMsg, "L2i misses:    %s", buf1);

  p = 100;
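
  /* With p == 100, the expressions below compute each miss rate as a
   * fixed-point number with two fractional digits: e.g. a 12.34% rate
   * reaches percentify() as 1234 together with ex == 100.  Later,
   * p = 10 drops to one fractional digit for the D and L2 rates. */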

  if (0 == total[CLG_(sets).off_full_Ir])
    total[CLG_(sets).off_full_Ir] = 1;

  percentify(total[CLG_(sets).off_full_Ir+1] * 100 * p /
	     total[CLG_(sets).off_full_Ir], p, l1+1, buf1);
  VG_(message)(Vg_UserMsg, "I1  miss rate: %s", buf1);

  percentify(total[CLG_(sets).off_full_Ir+2] * 100 * p /
	     total[CLG_(sets).off_full_Ir], p, l1+1, buf1);
  VG_(message)(Vg_UserMsg, "L2i miss rate: %s", buf1);
  VG_(message)(Vg_UserMsg, "");

  /* D cache results.  Use the D_refs.rd and D_refs.wr values to
   * determine the width of columns 2 & 3. */

  D_total = CLG_(get_eventset_cost)( CLG_(sets).full );
  CLG_(init_cost)( CLG_(sets).full, D_total);
  CLG_(copy_cost)( CLG_(sets).Dr, D_total, total + CLG_(sets).off_full_Dr );
  CLG_(add_cost) ( CLG_(sets).Dw, D_total, total + CLG_(sets).off_full_Dw );

  commify( D_total[0], l1, buf1);
  l2 = commify(total[CLG_(sets).off_full_Dr], 0,  buf2);
  l3 = commify(total[CLG_(sets).off_full_Dw], 0,  buf3);
  VG_(message)(Vg_UserMsg, "D   refs:      %s  (%s rd + %s wr)",
	       buf1,  buf2,  buf3);

  commify( D_total[1], l1, buf1);
  commify(total[CLG_(sets).off_full_Dr+1], l2, buf2);
  commify(total[CLG_(sets).off_full_Dw+1], l3, buf3);
  VG_(message)(Vg_UserMsg, "D1  misses:    %s  (%s rd + %s wr)",
	       buf1, buf2, buf3);

  commify( D_total[2], l1, buf1);
  commify(total[CLG_(sets).off_full_Dr+2], l2, buf2);
  commify(total[CLG_(sets).off_full_Dw+2], l3, buf3);
  VG_(message)(Vg_UserMsg, "L2d misses:    %s  (%s rd + %s wr)",
	       buf1, buf2, buf3);

  p = 10;

  if (0 == D_total[0])   D_total[0] = 1;
  if (0 == total[CLG_(sets).off_full_Dr]) total[CLG_(sets).off_full_Dr] = 1;
  if (0 == total[CLG_(sets).off_full_Dw]) total[CLG_(sets).off_full_Dw] = 1;

  percentify( D_total[1] * 100 * p / D_total[0],  p, l1+1, buf1);
  percentify(total[CLG_(sets).off_full_Dr+1] * 100 * p /
	     total[CLG_(sets).off_full_Dr], p, l2+1, buf2);
  percentify(total[CLG_(sets).off_full_Dw+1] * 100 * p /
	     total[CLG_(sets).off_full_Dw], p, l3+1, buf3);
  VG_(message)(Vg_UserMsg, "D1  miss rate: %s (%s   + %s  )", buf1, buf2, buf3);

  percentify( D_total[2] * 100 * p / D_total[0],  p, l1+1, buf1);
  percentify(total[CLG_(sets).off_full_Dr+2] * 100 * p /
	     total[CLG_(sets).off_full_Dr], p, l2+1, buf2);
  percentify(total[CLG_(sets).off_full_Dw+2] * 100 * p /
	     total[CLG_(sets).off_full_Dw], p, l3+1, buf3);
  VG_(message)(Vg_UserMsg, "L2d miss rate: %s (%s   + %s  )", buf1, buf2, buf3);
  VG_(message)(Vg_UserMsg, "");

  /* L2 overall results */

  L2_total   =
    total[CLG_(sets).off_full_Dr +1] +
    total[CLG_(sets).off_full_Dw +1] +
    total[CLG_(sets).off_full_Ir +1];
  L2_total_r =
    total[CLG_(sets).off_full_Dr +1] +
    total[CLG_(sets).off_full_Ir +1];
  L2_total_w = total[CLG_(sets).off_full_Dw +1];
  commify(L2_total,   l1, buf1);
  commify(L2_total_r, l2, buf2);
  commify(L2_total_w, l3, buf3);
  VG_(message)(Vg_UserMsg, "L2 refs:       %s  (%s rd + %s wr)",
	       buf1, buf2, buf3);

  L2_total_m  =
    total[CLG_(sets).off_full_Dr +2] +
    total[CLG_(sets).off_full_Dw +2] +
    total[CLG_(sets).off_full_Ir +2];
  L2_total_mr =
    total[CLG_(sets).off_full_Dr +2] +
    total[CLG_(sets).off_full_Ir +2];
  L2_total_mw = total[CLG_(sets).off_full_Dw +2];
  commify(L2_total_m,  l1, buf1);
  commify(L2_total_mr, l2, buf2);
  commify(L2_total_mw, l3, buf3);
  VG_(message)(Vg_UserMsg, "L2 misses:     %s  (%s rd + %s wr)",
	       buf1, buf2, buf3);

  percentify(L2_total_m  * 100 * p /
	     (total[CLG_(sets).off_full_Ir] + D_total[0]),  p, l1+1, buf1);
  percentify(L2_total_mr * 100 * p /
	     (total[CLG_(sets).off_full_Ir] + total[CLG_(sets).off_full_Dr]),
	     p, l2+1, buf2);
  percentify(L2_total_mw * 100 * p /
	     total[CLG_(sets).off_full_Dw], p, l3+1, buf3);
  VG_(message)(Vg_UserMsg, "L2 miss rate:  %s (%s   + %s  )",
	       buf1, buf2, buf3);
}
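
/* The start of the summary printed above looks like the following
 * sketch (numbers purely illustrative; the D and L2 sections follow
 * the same pattern):
 *
 *   I   refs:      1,234,567
 *   I1  misses:        1,234
 *   L2i misses:          123
 *   I1  miss rate:      0.09%
 *   L2i miss rate:      0.00%
 *   ...
 */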


/*------------------------------------------------------------*/
/*--- Setup for Event set.                                 ---*/
/*------------------------------------------------------------*/

struct event_sets CLG_(sets);

void CLG_(init_eventsets)(Int max_user)
{
  EventType *e1, *e2, *e3, *e4;
  EventSet *Ir, *Dr, *Dw;
  EventSet *D0, *D1r, *D1w, *D2;
  EventSet *sim, *full;
  EventSet *use;
  int sizeOfUseIr;

  use = CLG_(get_eventset)("Use", 4);
  if (clo_collect_cacheuse) {
    /* If the use count is 0, the line never was loaded, so there can
     * be no loss either */
    e1 = CLG_(register_eventtype)("AcCost1");
    CLG_(add_eventtype)(use, e1);
    e1 = CLG_(register_eventtype)("SpLoss1");
    CLG_(add_eventtype)(use, e1);
    e1 = CLG_(register_eventtype)("AcCost2");
    CLG_(add_eventtype)(use, e1);
    e1 = CLG_(register_eventtype)("SpLoss2");
    CLG_(add_eventtype)(use, e1);
  }
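
  /* The counter pairs map to the two cache levels: AcCost1/SpLoss1
   * accumulate access cost and spatial loss for L1 lines, and
   * AcCost2/SpLoss2 do the same for L2 lines. */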

  Ir = CLG_(get_eventset)("Ir", 4);
  Dr = CLG_(get_eventset)("Dr", 4);
  Dw = CLG_(get_eventset)("Dw", 4);
  if (CLG_(clo).simulate_cache) {
    e1 = CLG_(register_eventtype)("Ir");
    e2 = CLG_(register_eventtype)("I1mr");
    e3 = CLG_(register_eventtype)("I2mr");
    if (clo_simulate_writeback) {
      e4 = CLG_(register_eventtype)("I2dmr");
      CLG_(add_dep_event4)(Ir, e1,e2,e3,e4);
    }
    else
      CLG_(add_dep_event3)(Ir, e1,e2,e3);

    e1 = CLG_(register_eventtype)("Dr");
    e2 = CLG_(register_eventtype)("D1mr");
    e3 = CLG_(register_eventtype)("D2mr");
    if (clo_simulate_writeback) {
      e4 = CLG_(register_eventtype)("D2dmr");
      CLG_(add_dep_event4)(Dr, e1,e2,e3,e4);
    }
    else
      CLG_(add_dep_event3)(Dr, e1,e2,e3);

    e1 = CLG_(register_eventtype)("Dw");
    e2 = CLG_(register_eventtype)("D1mw");
    e3 = CLG_(register_eventtype)("D2mw");
    if (clo_simulate_writeback) {
      e4 = CLG_(register_eventtype)("D2dmw");
      CLG_(add_dep_event4)(Dw, e1,e2,e3,e4);
    }
    else
      CLG_(add_dep_event3)(Dw, e1,e2,e3);
  }
  else {
    e1 = CLG_(register_eventtype)("Ir");
    CLG_(add_eventtype)(Ir, e1);
  }

  sizeOfUseIr = use->size + Ir->size;
  D0 = CLG_(get_eventset)("D0", sizeOfUseIr);
  CLG_(add_eventset)(D0, use);
  off_D0_Ir  = CLG_(add_eventset)(D0, Ir);

  D1r = CLG_(get_eventset)("D1r", sizeOfUseIr + Dr->size);
  CLG_(add_eventset)(D1r, use);
  off_D1r_Ir = CLG_(add_eventset)(D1r, Ir);
  off_D1r_Dr = CLG_(add_eventset)(D1r, Dr);

  D1w = CLG_(get_eventset)("D1w", sizeOfUseIr + Dw->size);
  CLG_(add_eventset)(D1w, use);
  off_D1w_Ir   = CLG_(add_eventset)(D1w, Ir);
  off_D1w_Dw   = CLG_(add_eventset)(D1w, Dw);

  D2  = CLG_(get_eventset)("D2", sizeOfUseIr + Dr->size + Dw->size);
  CLG_(add_eventset)(D2, use);
  off_D2_Ir    = CLG_(add_eventset)(D2, Ir);
  off_D2_Dr    = CLG_(add_eventset)(D2, Dr);
  off_D2_Dw    = CLG_(add_eventset)(D2, Dw);

  sim = CLG_(get_eventset)("sim", sizeOfUseIr + Dr->size + Dw->size);
  CLG_(add_eventset)(sim, use);
  CLG_(sets).off_sim_Ir   = CLG_(add_eventset)(sim, Ir);
  CLG_(sets).off_sim_Dr   = CLG_(add_eventset)(sim, Dr);
  CLG_(sets).off_sim_Dw   = CLG_(add_eventset)(sim, Dw);

  if (CLG_(clo).collect_alloc)   max_user += 2;
  if (CLG_(clo).collect_systime) max_user += 2;

  full = CLG_(get_eventset)("full", sim->size + max_user);
  CLG_(add_eventset)(full, sim);
  CLG_(sets).off_full_Ir   = CLG_(sets).off_sim_Ir;
  CLG_(sets).off_full_Dr   = CLG_(sets).off_sim_Dr;
  CLG_(sets).off_full_Dw   = CLG_(sets).off_sim_Dw;

  CLG_(sets).use = use;
  CLG_(sets).Ir  = Ir;
  CLG_(sets).Dr  = Dr;
  CLG_(sets).Dw  = Dw;

  CLG_(sets).D0  = D0;
  CLG_(sets).D1r = D1r;
  CLG_(sets).D1w = D1w;
  CLG_(sets).D2  = D2;

  CLG_(sets).sim  = sim;
  CLG_(sets).full = full;

  if (CLG_(clo).collect_alloc) {
    e1 = CLG_(register_eventtype)("allocCount");
    e2 = CLG_(register_eventtype)("allocSize");
    CLG_(sets).off_full_user = CLG_(add_dep_event2)(full, e1,e2);
  }

  if (CLG_(clo).collect_systime) {
    e1 = CLG_(register_eventtype)("sysCount");
    e2 = CLG_(register_eventtype)("sysTime");
    CLG_(sets).off_full_systime = CLG_(add_dep_event2)(full, e1,e2);
  }

  CLG_DEBUGIF(1) {
    CLG_DEBUG(1, "EventSets:\n");
    CLG_(print_eventset)(-2, use);
    CLG_(print_eventset)(-2, Ir);
    CLG_(print_eventset)(-2, Dr);
    CLG_(print_eventset)(-2, Dw);
    CLG_(print_eventset)(-2, sim);
    CLG_(print_eventset)(-2, full);
  }

  /* Non-existing events are silently ignored */
  CLG_(dumpmap) = CLG_(get_eventmapping)(full);
  CLG_(append_event)(CLG_(dumpmap), "Ir");
  CLG_(append_event)(CLG_(dumpmap), "Dr");
  CLG_(append_event)(CLG_(dumpmap), "Dw");
  CLG_(append_event)(CLG_(dumpmap), "I1mr");
  CLG_(append_event)(CLG_(dumpmap), "D1mr");
  CLG_(append_event)(CLG_(dumpmap), "D1mw");
  CLG_(append_event)(CLG_(dumpmap), "I2mr");
  CLG_(append_event)(CLG_(dumpmap), "D2mr");
  CLG_(append_event)(CLG_(dumpmap), "D2mw");
  CLG_(append_event)(CLG_(dumpmap), "I2dmr");
  CLG_(append_event)(CLG_(dumpmap), "D2dmr");
  CLG_(append_event)(CLG_(dumpmap), "D2dmw");
  CLG_(append_event)(CLG_(dumpmap), "AcCost1");
  CLG_(append_event)(CLG_(dumpmap), "SpLoss1");
  CLG_(append_event)(CLG_(dumpmap), "AcCost2");
  CLG_(append_event)(CLG_(dumpmap), "SpLoss2");
  CLG_(append_event)(CLG_(dumpmap), "allocCount");
  CLG_(append_event)(CLG_(dumpmap), "allocSize");
  CLG_(append_event)(CLG_(dumpmap), "sysCount");
  CLG_(append_event)(CLG_(dumpmap), "sysTime");
}
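
/* Layout sketch of the event sets built above (which subevents exist
 * depends on the active options):
 *
 *   use  = [AcCost1 SpLoss1 AcCost2 SpLoss2]   (only with --cacheuse=yes)
 *   Ir   = [Ir I1mr I2mr (I2dmr)]              (subevents with simulation)
 *   Dr   = [Dr D1mr D2mr (D2dmr)]
 *   Dw   = [Dw D1mw D2mw (D2dmw)]
 *
 *   D0   = use | Ir
 *   D1r  = use | Ir | Dr
 *   D1w  = use | Ir | Dw
 *   D2   = use | Ir | Dr | Dw
 *   sim  = use | Ir | Dr | Dw
 *   full = sim | optional user events (alloc, systime)
 */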


static
void add_and_zero_Dx(EventSet* es, SimCost dst, ULong* cost)
{
  /* if the 'use' event set is defined, it always comes first (hardcoded!) */
  CLG_(add_and_zero_cost)( CLG_(sets).use, dst, cost);

  /* FIXME: This is hardcoded... */
  if (es == CLG_(sets).D0) {
    CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_sim_Ir,
			    cost + off_D0_Ir);
  }
  else if (es == CLG_(sets).D1r) {
    CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_sim_Ir,
			    cost + off_D1r_Ir);
    CLG_(add_and_zero_cost)( CLG_(sets).Dr, dst + CLG_(sets).off_sim_Dr,
			    cost + off_D1r_Dr);
  }
  else if (es == CLG_(sets).D1w) {
    CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_sim_Ir,
			    cost + off_D1w_Ir);
    CLG_(add_and_zero_cost)( CLG_(sets).Dw, dst + CLG_(sets).off_sim_Dw,
			    cost + off_D1w_Dw);
  }
  else {
    CLG_ASSERT(es == CLG_(sets).D2);
    CLG_(add_and_zero_cost)( CLG_(sets).Ir, dst + CLG_(sets).off_sim_Ir,
			    cost + off_D2_Ir);
    CLG_(add_and_zero_cost)( CLG_(sets).Dr, dst + CLG_(sets).off_sim_Dr,
			    cost + off_D2_Dr);
    CLG_(add_and_zero_cost)( CLG_(sets).Dw, dst + CLG_(sets).off_sim_Dw,
			    cost + off_D2_Dw);
  }
}
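
/* Example: for es == CLG_(sets).D1r, 'cost' is laid out as
 * [use | Ir | Dr]; the Ir part at cost+off_D1r_Ir is accumulated into
 * dst+off_sim_Ir and the Dr part at cost+off_D1r_Dr into
 * dst+off_sim_Dr, zeroing the source counters as a side effect. */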

/* this is called at dump time for every instruction executed */
static void cachesim_add_icost(SimCost cost, BBCC* bbcc,
			       InstrInfo* ii, ULong exe_count)
{
  if (!CLG_(clo).simulate_cache)
      cost[CLG_(sets).off_sim_Ir] += exe_count;
  else {

#if 0
/* There is a trivial case where exe_count and Ir can differ slightly:
 * ecounter is only updated when the next BB is executed, e.g. for the
 * last BB executed, or when collection is toggled.
 */
      /* FIXME: Hardcoded that each eventset has Ir as first */
      if ((bbcc->cost + ii->cost_offset)[0] != exe_count) {
	  VG_(printf)("==> Ir %llu, exe %llu\n",
		      (bbcc->cost + ii->cost_offset)[0], exe_count);
	  CLG_(print_bbcc_cost)(-2, bbcc);
	  //CLG_ASSERT((bbcc->cost + ii->cost_offset)[0] == exe_count);
      }
#endif

      add_and_zero_Dx(ii->eventset, cost,
		      bbcc->cost + ii->cost_offset);
  }
}

static
void cachesim_after_bbsetup(void)
{
  BBCC* bbcc = CLG_(current_state).bbcc;

  if (CLG_(clo).simulate_cache) {
    BB* bb = bbcc->bb;

    /* only needed if log_* functions are called */
    bb_base   = bb->obj->offset + bb->offset;
    cost_base = bbcc->cost;
  }
}
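
/* bb_base/cost_base are consumed by the log_* handlers: bb_base plus
 * an instruction's offset inside the BB gives the instruction address
 * used by the simulation, and cost_base plus a per-instruction cost
 * offset addresses that instruction's counters. */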

static
void cachesim_finish(void)
{
  if (clo_collect_cacheuse)
    cacheuse_finish();
}

/*------------------------------------------------------------*/
/*--- The simulator defined in this file                   ---*/
/*------------------------------------------------------------*/

struct cachesim_if CLG_(cachesim) = {
  .print_opts    = cachesim_print_opts,
  .parse_opt     = cachesim_parse_opt,
  .post_clo_init = cachesim_post_clo_init,
  .clear         = cachesim_clear,
  .getdesc       = cachesim_getdesc,
  .printstat     = cachesim_printstat,
  .add_icost     = cachesim_add_icost,
  .after_bbsetup = cachesim_after_bbsetup,
  .finish        = cachesim_finish,

  /* these will be set by cachesim_post_clo_init */
  .log_1I0D        = 0,

  .log_1I1Dr       = 0,
  .log_1I1Dw       = 0,
  .log_1I2D        = 0,

  .log_0I1Dr       = 0,
  .log_0I1Dw       = 0,
  .log_0I2D        = 0,

  .log_1I0D_name = "(no function)",

  .log_1I1Dr_name = "(no function)",
  .log_1I1Dw_name = "(no function)",
  .log_1I2D_name = "(no function)",

  .log_0I1Dr_name = "(no function)",
  .log_0I1Dw_name = "(no function)",
  .log_0I2D_name = "(no function)"
};


/*--------------------------------------------------------------------*/
/*--- end                                                    sim.c ---*/
/*--------------------------------------------------------------------*/