
/*--------------------------------------------------------------------*/
/*--- Callgrind                                                    ---*/
/*---                                                       main.c ---*/
/*--------------------------------------------------------------------*/

/*
   This file is part of Callgrind, a Valgrind tool for call-graph
   profiling of programs.

   Copyright (C) 2002-2013, Josef Weidendorfer (Josef.Weidendorfer@gmx.de)

   This tool is derived from and contains code from Cachegrind
   Copyright (C) 2002-2013 Nicholas Nethercote (njn@valgrind.org)

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307, USA.

   The GNU General Public License is contained in the file COPYING.
*/

#include "config.h"
#include "callgrind.h"
#include "global.h"

#include "pub_tool_threadstate.h"
#include "pub_tool_gdbserver.h"

#include "cg_branchpred.c"

/*------------------------------------------------------------*/
/*--- Global variables                                     ---*/
/*------------------------------------------------------------*/

/* for all threads */
CommandLineOptions CLG_(clo);
Statistics CLG_(stat);
Bool CLG_(instrument_state) = True; /* Instrumentation on ? */

/* thread and signal handler specific */
exec_state CLG_(current_state);

/* min of L1 and LL cache line sizes.  This only gets set to a
   non-zero value if we are doing cache simulation. */
Int CLG_(min_line_size) = 0;
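/* min_line_size bounds the data size passed to the cache-simulator
   handlers: see the asserts in addEvent_Dr/Dw and the clamping of
   dirty-helper access sizes in CLG_(instrument) below. */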


/*------------------------------------------------------------*/
/*--- Statistics                                           ---*/
/*------------------------------------------------------------*/

static void CLG_(init_statistics)(Statistics* s)
{
  s->call_counter        = 0;
  s->jcnd_counter        = 0;
  s->jump_counter        = 0;
  s->rec_call_counter    = 0;
  s->ret_counter         = 0;
  s->bb_executions       = 0;

  s->context_counter     = 0;
  s->bb_retranslations   = 0;

  s->distinct_objs       = 0;
  s->distinct_files      = 0;
  s->distinct_fns        = 0;
  s->distinct_contexts   = 0;
  s->distinct_bbs        = 0;
  s->distinct_bbccs      = 0;
  s->distinct_instrs     = 0;
  s->distinct_skips      = 0;

  s->bb_hash_resizes     = 0;
  s->bbcc_hash_resizes   = 0;
  s->jcc_hash_resizes    = 0;
  s->cxt_hash_resizes    = 0;
  s->fn_array_resizes    = 0;
  s->call_stack_resizes  = 0;
  s->fn_stack_resizes    = 0;

  s->full_debug_BBs      = 0;
  s->file_line_debug_BBs = 0;
  s->fn_name_debug_BBs   = 0;
  s->no_debug_BBs        = 0;
  s->bbcc_lru_misses     = 0;
  s->jcc_lru_misses      = 0;
  s->cxt_lru_misses      = 0;
  s->bbcc_clones         = 0;
}


/*------------------------------------------------------------*/
/*--- Simple callbacks (not cache simulator)               ---*/
/*------------------------------------------------------------*/

VG_REGPARM(1)
static void log_global_event(InstrInfo* ii)
{
    ULong* cost_Bus;

    CLG_DEBUG(6, "log_global_event:  Ir  %#lx/%u\n",
              CLG_(bb_base) + ii->instr_offset, ii->instr_size);

    if (!CLG_(current_state).collect) return;

    CLG_ASSERT( (ii->eventset->mask & (1u<<EG_BUS))>0 );

    CLG_(current_state).cost[ fullOffset(EG_BUS) ]++;

    if (CLG_(current_state).nonskipped)
        cost_Bus = CLG_(current_state).nonskipped->skipped + fullOffset(EG_BUS);
    else
        cost_Bus = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_BUS];
    cost_Bus[0]++;
}


/* For branches, we consult two different predictors, one which
   predicts taken/untaken for conditional branches, and the other
   which predicts the branch target address for indirect branches
   (jump-to-register style ones). */
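/* Both predictors come from cg_branchpred.c, textually included above;
   its do_cond_branch_predict()/do_ind_branch_predict() helpers signal a
   mispredict in the low bit of their result, hence the "1 &" masking
   below. */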

static VG_REGPARM(2)
void log_cond_branch(InstrInfo* ii, Word taken)
{
    Bool miss;
    Int fullOffset_Bc;
    ULong* cost_Bc;

    CLG_DEBUG(6, "log_cond_branch:  Ir %#lx, taken %lu\n",
              CLG_(bb_base) + ii->instr_offset, taken);

    miss = 1 & do_cond_branch_predict(CLG_(bb_base) + ii->instr_offset, taken);

    if (!CLG_(current_state).collect) return;

    CLG_ASSERT( (ii->eventset->mask & (1u<<EG_BC))>0 );

    if (CLG_(current_state).nonskipped)
        cost_Bc = CLG_(current_state).nonskipped->skipped + fullOffset(EG_BC);
    else
        cost_Bc = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_BC];

    fullOffset_Bc = fullOffset(EG_BC);
    CLG_(current_state).cost[ fullOffset_Bc ]++;
    cost_Bc[0]++;
    if (miss) {
        CLG_(current_state).cost[ fullOffset_Bc+1 ]++;
        cost_Bc[1]++;
    }
}

static VG_REGPARM(2)
void log_ind_branch(InstrInfo* ii, UWord actual_dst)
{
    Bool miss;
    Int fullOffset_Bi;
    ULong* cost_Bi;

    CLG_DEBUG(6, "log_ind_branch:  Ir  %#lx, dst %#lx\n",
              CLG_(bb_base) + ii->instr_offset, actual_dst);

    miss = 1 & do_ind_branch_predict(CLG_(bb_base) + ii->instr_offset, actual_dst);

    if (!CLG_(current_state).collect) return;

    CLG_ASSERT( (ii->eventset->mask & (1u<<EG_BI))>0 );

    if (CLG_(current_state).nonskipped)
        cost_Bi = CLG_(current_state).nonskipped->skipped + fullOffset(EG_BI);
    else
        cost_Bi = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_BI];

    fullOffset_Bi = fullOffset(EG_BI);
    CLG_(current_state).cost[ fullOffset_Bi ]++;
    cost_Bi[0]++;
    if (miss) {
        CLG_(current_state).cost[ fullOffset_Bi+1 ]++;
        cost_Bi[1]++;
    }
}

/*------------------------------------------------------------*/
/*--- Instrumentation structures and event queue handling  ---*/
/*------------------------------------------------------------*/

/* Maintain an ordered list of memory events which are outstanding, in
   the sense that no IR has yet been generated to do the relevant
   helper calls.  The BB is scanned top to bottom and memory events
   are added to the end of the list, merging with the most recent
   notified event where possible (Dw immediately following Dr and
   having the same size and EA can be merged).

   This merging is done so that for architectures which have
   load-op-store instructions (x86, amd64), the insn is treated as if
   it makes just one memory reference (a modify), rather than two (a
   read followed by a write at the same address).

   At various points the list will need to be flushed, that is, IR
   generated from it.  That must happen before any possible exit from
   the block (the end, or an IRStmt_Exit).  Flushing also takes place
   when there is no space to add a new event.

   If we require the simulation statistics to be up to date with
   respect to possible memory exceptions, then the list would have to
   be flushed before each memory reference.  That would however lose
   performance by inhibiting event-merging during flushing.

   Flushing the list consists of walking it start to end and emitting
   instrumentation IR for each event, in the order in which they
   appear.  It may be possible to emit a single call for two adjacent
   events in order to reduce the number of helper function calls made.
   For example, it could well be profitable to handle two adjacent Ir
   events with a single helper call.  */

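/* A worked example (hypothetical x86 insn, for illustration only):
   "incl (%eax)" is a load-op-store, so scanning its IR yields

      addEvent_Ir(...);              // from the IMark
      addEvent_Dr(..., 4, t_eax);    // the load
      addEvent_Dw(..., 4, t_eax);    // the store: same inode/size/EA,
                                     // so it is merged into the pending
                                     // Dr, whose tag becomes Ev_Dm

   and flushEvents() then emits a single log_1I1Dw helper call for the
   Ir+Dm pair instead of three separate calls. */
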
typedef
   IRExpr
   IRAtom;

typedef
   enum {
      Ev_Ir,  // Instruction read
      Ev_Dr,  // Data read
      Ev_Dw,  // Data write
      Ev_Dm,  // Data modify (read then write)
      Ev_Bc,  // branch conditional
      Ev_Bi,  // branch indirect (to unknown destination)
      Ev_G    // Global bus event
   }
   EventTag;

typedef
   struct {
      EventTag   tag;
      InstrInfo* inode;
      union {
	 struct {
	 } Ir;
	 struct {
	    IRAtom* ea;
	    Int     szB;
	 } Dr;
	 struct {
	    IRAtom* ea;
	    Int     szB;
	 } Dw;
	 struct {
	    IRAtom* ea;
	    Int     szB;
	 } Dm;
         struct {
            IRAtom* taken; /* :: Ity_I1 */
         } Bc;
         struct {
            IRAtom* dst;
         } Bi;
	 struct {
	 } G;
      } Ev;
   }
   Event;

static void init_Event ( Event* ev ) {
   VG_(memset)(ev, 0, sizeof(Event));
}

static IRAtom* get_Event_dea ( Event* ev ) {
   switch (ev->tag) {
      case Ev_Dr: return ev->Ev.Dr.ea;
      case Ev_Dw: return ev->Ev.Dw.ea;
      case Ev_Dm: return ev->Ev.Dm.ea;
      default:    tl_assert(0);
   }
}

static Int get_Event_dszB ( Event* ev ) {
   switch (ev->tag) {
      case Ev_Dr: return ev->Ev.Dr.szB;
      case Ev_Dw: return ev->Ev.Dw.szB;
      case Ev_Dm: return ev->Ev.Dm.szB;
      default:    tl_assert(0);
   }
}


/* Up to this many unnotified events are allowed.  Number is
   arbitrary.  Larger numbers allow more event merging to occur, but
   potentially induce more spilling due to extending live ranges of
   address temporaries. */
#define N_EVENTS 16


/* A struct which holds all the running state during instrumentation.
   Mostly to avoid passing loads of parameters everywhere. */
typedef struct {
    /* The current outstanding-memory-event list. */
    Event events[N_EVENTS];
    Int   events_used;

    /* The array of InstrInfo's is part of BB struct. */
    BB* bb;

    /* BB seen before (ie. re-instrumentation) */
    Bool seen_before;

    /* Number of InstrInfo bins 'used' so far. */
    UInt ii_index;

    // current offset of guest instructions from BB start
    UInt instr_offset;

    /* The output SB being constructed. */
    IRSB* sbOut;
} ClgState;
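
/* Note: one ClgState lives on the stack of CLG_(instrument)() per
   translated superblock; it is not shared across translations. */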


static void showEvent ( Event* ev )
{
   switch (ev->tag) {
      case Ev_Ir:
	 VG_(printf)("Ir (InstrInfo %p) at +%d\n",
		     ev->inode, ev->inode->instr_offset);
	 break;
      case Ev_Dr:
	 VG_(printf)("Dr (InstrInfo %p) at +%d %d EA=",
		     ev->inode, ev->inode->instr_offset, ev->Ev.Dr.szB);
	 ppIRExpr(ev->Ev.Dr.ea);
	 VG_(printf)("\n");
	 break;
      case Ev_Dw:
	 VG_(printf)("Dw (InstrInfo %p) at +%d %d EA=",
		     ev->inode, ev->inode->instr_offset, ev->Ev.Dw.szB);
	 ppIRExpr(ev->Ev.Dw.ea);
	 VG_(printf)("\n");
	 break;
      case Ev_Dm:
	 VG_(printf)("Dm (InstrInfo %p) at +%d %d EA=",
		     ev->inode, ev->inode->instr_offset, ev->Ev.Dm.szB);
	 ppIRExpr(ev->Ev.Dm.ea);
	 VG_(printf)("\n");
	 break;
      case Ev_Bc:
         VG_(printf)("Bc %p   GA=", ev->inode);
         ppIRExpr(ev->Ev.Bc.taken);
         VG_(printf)("\n");
         break;
      case Ev_Bi:
         VG_(printf)("Bi %p  DST=", ev->inode);
         ppIRExpr(ev->Ev.Bi.dst);
         VG_(printf)("\n");
         break;
      case Ev_G:
         VG_(printf)("G  %p\n", ev->inode);
         break;
      default:
	 tl_assert(0);
	 break;
   }
}

/* Generate code for all outstanding memory events, and mark the queue
   empty.  Code is generated into cgs->sbOut, and this activity
   'consumes' slots in cgs->bb. */

static void flushEvents ( ClgState* clgs )
{
   Int        i, regparms, inew;
   const HChar* helperName;
   void*      helperAddr;
   IRExpr**   argv;
   IRExpr*    i_node_expr;
   IRDirty*   di;
   Event*     ev;
   Event*     ev2;
   Event*     ev3;

   if (!clgs->seen_before) {
       // extend event sets as needed
       // available sets: D0 Dr
       for(i=0; i<clgs->events_used; i++) {
	   ev  = &clgs->events[i];
	   switch(ev->tag) {
	   case Ev_Ir:
	       // Ir event always is first for a guest instruction
	       CLG_ASSERT(ev->inode->eventset == 0);
	       ev->inode->eventset = CLG_(sets).base;
	       break;
	   case Ev_Dr:
               // extend event set by Dr counters
	       ev->inode->eventset = CLG_(add_event_group)(ev->inode->eventset,
							   EG_DR);
	       break;
	   case Ev_Dw:
	   case Ev_Dm:
               // extend event set by Dw counters
	       ev->inode->eventset = CLG_(add_event_group)(ev->inode->eventset,
							   EG_DW);
	       break;
           case Ev_Bc:
               // extend event set by Bc counters
               ev->inode->eventset = CLG_(add_event_group)(ev->inode->eventset,
                                                           EG_BC);
               break;
           case Ev_Bi:
               // extend event set by Bi counters
               ev->inode->eventset = CLG_(add_event_group)(ev->inode->eventset,
                                                           EG_BI);
               break;
	   case Ev_G:
               // extend event set by Bus counter
	       ev->inode->eventset = CLG_(add_event_group)(ev->inode->eventset,
							   EG_BUS);
	       break;
	   default:
	       tl_assert(0);
	   }
       }
   }

   for(i = 0; i < clgs->events_used; i = inew) {

      helperName = NULL;
      helperAddr = NULL;
      argv       = NULL;
      regparms   = 0;

      /* generate IR to notify event i and possibly the ones
	 immediately following it. */
      tl_assert(i >= 0 && i < clgs->events_used);

      ev  = &clgs->events[i];
      ev2 = ( i < clgs->events_used-1 ? &clgs->events[i+1] : NULL );
      ev3 = ( i < clgs->events_used-2 ? &clgs->events[i+2] : NULL );

      CLG_DEBUGIF(5) {
	 VG_(printf)("   flush ");
	 showEvent( ev );
      }

      i_node_expr = mkIRExpr_HWord( (HWord)ev->inode );

      /* Decide on the helper fn to call and the args to pass it, and
	 advance i appropriately.
	 Dm events have the same effect as Dw events. */
      switch (ev->tag) {
	 case Ev_Ir:
	    /* Merge an Ir with a following Dr. */
	    if (ev2 && ev2->tag == Ev_Dr) {
	       /* Why is this true?  It's because we're merging an Ir
		  with a following Dr.  The Ir derives from the
		  instruction's IMark and the Dr from data
		  references which follow it.  In short it holds
		  because each insn starts with an IMark, hence an
		  Ev_Ir, and so these Dr must pertain to the
		  immediately preceding Ir.  Same applies to analogous
		  assertions in the subsequent cases. */
	       tl_assert(ev2->inode == ev->inode);
	       helperName = CLG_(cachesim).log_1I1Dr_name;
	       helperAddr = CLG_(cachesim).log_1I1Dr;
	       argv = mkIRExprVec_3( i_node_expr,
				     get_Event_dea(ev2),
				     mkIRExpr_HWord( get_Event_dszB(ev2) ) );
	       regparms = 3;
	       inew = i+2;
	    }
	    /* Merge an Ir with a following Dw/Dm. */
	    else
	    if (ev2 && (ev2->tag == Ev_Dw || ev2->tag == Ev_Dm)) {
	       tl_assert(ev2->inode == ev->inode);
	       helperName = CLG_(cachesim).log_1I1Dw_name;
	       helperAddr = CLG_(cachesim).log_1I1Dw;
	       argv = mkIRExprVec_3( i_node_expr,
				     get_Event_dea(ev2),
				     mkIRExpr_HWord( get_Event_dszB(ev2) ) );
	       regparms = 3;
	       inew = i+2;
	    }
	    /* Merge an Ir with two following Irs. */
	    else
	    if (ev2 && ev3 && ev2->tag == Ev_Ir && ev3->tag == Ev_Ir) {
	       helperName = CLG_(cachesim).log_3I0D_name;
	       helperAddr = CLG_(cachesim).log_3I0D;
	       argv = mkIRExprVec_3( i_node_expr,
				     mkIRExpr_HWord( (HWord)ev2->inode ),
				     mkIRExpr_HWord( (HWord)ev3->inode ) );
	       regparms = 3;
	       inew = i+3;
	    }
	    /* Merge an Ir with one following Ir. */
	    else
	    if (ev2 && ev2->tag == Ev_Ir) {
	       helperName = CLG_(cachesim).log_2I0D_name;
	       helperAddr = CLG_(cachesim).log_2I0D;
	       argv = mkIRExprVec_2( i_node_expr,
				     mkIRExpr_HWord( (HWord)ev2->inode ) );
	       regparms = 2;
	       inew = i+2;
	    }
	    /* No merging possible; emit as-is. */
	    else {
	       helperName = CLG_(cachesim).log_1I0D_name;
	       helperAddr = CLG_(cachesim).log_1I0D;
	       argv = mkIRExprVec_1( i_node_expr );
	       regparms = 1;
	       inew = i+1;
	    }
	    break;
	 case Ev_Dr:
	    /* Data read or modify */
	    helperName = CLG_(cachesim).log_0I1Dr_name;
	    helperAddr = CLG_(cachesim).log_0I1Dr;
	    argv = mkIRExprVec_3( i_node_expr,
				  get_Event_dea(ev),
				  mkIRExpr_HWord( get_Event_dszB(ev) ) );
	    regparms = 3;
	    inew = i+1;
	    break;
	 case Ev_Dw:
	 case Ev_Dm:
	    /* Data write */
	    helperName = CLG_(cachesim).log_0I1Dw_name;
	    helperAddr = CLG_(cachesim).log_0I1Dw;
	    argv = mkIRExprVec_3( i_node_expr,
				  get_Event_dea(ev),
				  mkIRExpr_HWord( get_Event_dszB(ev) ) );
	    regparms = 3;
	    inew = i+1;
	    break;
         case Ev_Bc:
            /* Conditional branch */
            helperName = "log_cond_branch";
            helperAddr = &log_cond_branch;
            argv = mkIRExprVec_2( i_node_expr, ev->Ev.Bc.taken );
            regparms = 2;
            inew = i+1;
            break;
         case Ev_Bi:
            /* Branch to an unknown destination */
            helperName = "log_ind_branch";
            helperAddr = &log_ind_branch;
            argv = mkIRExprVec_2( i_node_expr, ev->Ev.Bi.dst );
            regparms = 2;
            inew = i+1;
            break;
         case Ev_G:
            /* Global bus event (CAS, LOCK-prefix, LL-SC, etc) */
            helperName = "log_global_event";
            helperAddr = &log_global_event;
            argv = mkIRExprVec_1( i_node_expr );
            regparms = 1;
            inew = i+1;
            break;
	 default:
	    tl_assert(0);
      }

      CLG_DEBUGIF(5) {
	  if (inew > i+1) {
	      VG_(printf)("   merge ");
	      showEvent( ev2 );
	  }
	  if (inew > i+2) {
	      VG_(printf)("   merge ");
	      showEvent( ev3 );
	  }
	  if (helperAddr)
	      VG_(printf)("   call  %s (%p)\n",
			  helperName, helperAddr);
      }

      /* helper could be unset depending on the simulator used */
      if (helperAddr == 0) continue;

      /* Add the helper. */
      tl_assert(helperName);
      tl_assert(helperAddr);
      tl_assert(argv);
      di = unsafeIRDirty_0_N( regparms,
			      helperName, VG_(fnptr_to_fnentry)( helperAddr ),
			      argv );
      addStmtToIRSB( clgs->sbOut, IRStmt_Dirty(di) );
   }

   clgs->events_used = 0;
}

static void addEvent_Ir ( ClgState* clgs, InstrInfo* inode )
{
   Event* evt;
   tl_assert(clgs->seen_before || (inode->eventset == 0));
   if (!CLG_(clo).simulate_cache) return;

   if (clgs->events_used == N_EVENTS)
      flushEvents(clgs);
   tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS);
   evt = &clgs->events[clgs->events_used];
   init_Event(evt);
   evt->tag      = Ev_Ir;
   evt->inode    = inode;
   clgs->events_used++;
}

static
void addEvent_Dr ( ClgState* clgs, InstrInfo* inode, Int datasize, IRAtom* ea )
{
   Event* evt;
   tl_assert(isIRAtom(ea));
   tl_assert(datasize >= 1);
   if (!CLG_(clo).simulate_cache) return;
   tl_assert(datasize <= CLG_(min_line_size));

   if (clgs->events_used == N_EVENTS)
      flushEvents(clgs);
   tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS);
   evt = &clgs->events[clgs->events_used];
   init_Event(evt);
   evt->tag       = Ev_Dr;
   evt->inode     = inode;
   evt->Ev.Dr.szB = datasize;
   evt->Ev.Dr.ea  = ea;
   clgs->events_used++;
}

static
void addEvent_Dw ( ClgState* clgs, InstrInfo* inode, Int datasize, IRAtom* ea )
{
   Event* lastEvt;
   Event* evt;
   tl_assert(isIRAtom(ea));
   tl_assert(datasize >= 1);
   if (!CLG_(clo).simulate_cache) return;
   tl_assert(datasize <= CLG_(min_line_size));

   /* Is it possible to merge this write with the preceding read? */
   lastEvt = &clgs->events[clgs->events_used-1];
   if (clgs->events_used > 0
       && lastEvt->tag       == Ev_Dr
       && lastEvt->Ev.Dr.szB == datasize
       && lastEvt->inode     == inode
       && eqIRAtom(lastEvt->Ev.Dr.ea, ea))
   {
      lastEvt->tag   = Ev_Dm;
      return;
   }

   /* No.  Add as normal. */
   if (clgs->events_used == N_EVENTS)
      flushEvents(clgs);
   tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS);
   evt = &clgs->events[clgs->events_used];
   init_Event(evt);
   evt->tag       = Ev_Dw;
   evt->inode     = inode;
   evt->Ev.Dw.szB = datasize;
   evt->Ev.Dw.ea  = ea;
   clgs->events_used++;
}

static
void addEvent_D_guarded ( ClgState* clgs, InstrInfo* inode,
                          Int datasize, IRAtom* ea, IRAtom* guard,
                          Bool isWrite )
{
   tl_assert(isIRAtom(ea));
   tl_assert(guard);
   tl_assert(isIRAtom(guard));
   tl_assert(datasize >= 1);
   if (!CLG_(clo).simulate_cache) return;
   tl_assert(datasize <= CLG_(min_line_size));

   /* Adding guarded memory actions and merging them with the existing
      queue is too complex.  Simply flush the queue and add this
      action immediately.  Since guarded loads and stores are pretty
      rare, this is not thought likely to cause any noticeable
      performance loss as a result of the loss of event-merging
      opportunities. */
   tl_assert(clgs->events_used >= 0);
   flushEvents(clgs);
   tl_assert(clgs->events_used == 0);
   /* Same as case Ev_Dw / case Ev_Dr in flushEvents, except with guard */
   IRExpr*      i_node_expr;
   const HChar* helperName;
   void*        helperAddr;
   IRExpr**     argv;
   Int          regparms;
   IRDirty*     di;
   i_node_expr = mkIRExpr_HWord( (HWord)inode );
   helperName  = isWrite ? CLG_(cachesim).log_0I1Dw_name
                         : CLG_(cachesim).log_0I1Dr_name;
   helperAddr  = isWrite ? CLG_(cachesim).log_0I1Dw
                         : CLG_(cachesim).log_0I1Dr;
   argv        = mkIRExprVec_3( i_node_expr,
                                ea, mkIRExpr_HWord( datasize ) );
   regparms    = 3;
   di          = unsafeIRDirty_0_N(
                    regparms,
                    helperName, VG_(fnptr_to_fnentry)( helperAddr ),
                    argv );
   di->guard = guard;
   addStmtToIRSB( clgs->sbOut, IRStmt_Dirty(di) );
}

static
void addEvent_Bc ( ClgState* clgs, InstrInfo* inode, IRAtom* guard )
{
   Event* evt;
   tl_assert(isIRAtom(guard));
   tl_assert(typeOfIRExpr(clgs->sbOut->tyenv, guard)
             == (sizeof(HWord)==4 ? Ity_I32 : Ity_I64));
   if (!CLG_(clo).simulate_branch) return;

   if (clgs->events_used == N_EVENTS)
      flushEvents(clgs);
   tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS);
   evt = &clgs->events[clgs->events_used];
   init_Event(evt);
   evt->tag         = Ev_Bc;
   evt->inode       = inode;
   evt->Ev.Bc.taken = guard;
   clgs->events_used++;
}

static
void addEvent_Bi ( ClgState* clgs, InstrInfo* inode, IRAtom* whereTo )
{
   Event* evt;
   tl_assert(isIRAtom(whereTo));
   tl_assert(typeOfIRExpr(clgs->sbOut->tyenv, whereTo)
             == (sizeof(HWord)==4 ? Ity_I32 : Ity_I64));
   if (!CLG_(clo).simulate_branch) return;

   if (clgs->events_used == N_EVENTS)
      flushEvents(clgs);
   tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS);
   evt = &clgs->events[clgs->events_used];
   init_Event(evt);
   evt->tag       = Ev_Bi;
   evt->inode     = inode;
   evt->Ev.Bi.dst = whereTo;
   clgs->events_used++;
}

static
void addEvent_G ( ClgState* clgs, InstrInfo* inode )
{
   Event* evt;
   if (!CLG_(clo).collect_bus) return;

   if (clgs->events_used == N_EVENTS)
      flushEvents(clgs);
   tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS);
   evt = &clgs->events[clgs->events_used];
   init_Event(evt);
   evt->tag       = Ev_G;
   evt->inode     = inode;
   clgs->events_used++;
}

/* Initialise or check (if seen before) the InstrInfo for the next insn.
   We can only set instr_offset/instr_size here. The required event set
   and resulting cost offset depend on the events (Ir/Dr/Dw/Dm) of the
   guest instruction. The event set is extended as required when the
   event queue is flushed (at which point Dm events have been
   determined); cost offsets are determined at the end of BB
   instrumentation. */
static
InstrInfo* next_InstrInfo ( ClgState* clgs, UInt instr_size )
{
   InstrInfo* ii;
   tl_assert(clgs->ii_index >= 0);
   tl_assert(clgs->ii_index < clgs->bb->instr_count);
   ii = &clgs->bb->instr[ clgs->ii_index ];

   if (clgs->seen_before) {
       CLG_ASSERT(ii->instr_offset == clgs->instr_offset);
       CLG_ASSERT(ii->instr_size == instr_size);
   }
   else {
       ii->instr_offset = clgs->instr_offset;
       ii->instr_size = instr_size;
       ii->cost_offset = 0;
       ii->eventset = 0;
   }

   clgs->ii_index++;
   clgs->instr_offset += instr_size;
   CLG_(stat).distinct_instrs++;

   return ii;
}

// return total number of cost values needed for this BB
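// Illustration (the sizes below are made up): if instr 0 only has the
// base set (say size 1) and instr 1 additionally has EG_DR (say total
// size 3), then instr 0 gets cost_offset 0, instr 1 gets cost_offset 1,
// and the BB needs 1+3 = 4 cost values overall.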
static
UInt update_cost_offsets( ClgState* clgs )
{
    Int i;
    InstrInfo* ii;
    UInt cost_offset = 0;

    CLG_ASSERT(clgs->bb->instr_count == clgs->ii_index);
    for(i=0; i<clgs->ii_index; i++) {
	ii = &clgs->bb->instr[i];
	if (clgs->seen_before) {
	    CLG_ASSERT(ii->cost_offset == cost_offset);
	} else
	    ii->cost_offset = cost_offset;
	cost_offset += ii->eventset ? ii->eventset->size : 0;
    }

    return cost_offset;
}

/*------------------------------------------------------------*/
/*--- Instrumentation                                      ---*/
/*------------------------------------------------------------*/

#if defined(VG_BIGENDIAN)
# define CLGEndness Iend_BE
#elif defined(VG_LITTLEENDIAN)
# define CLGEndness Iend_LE
#else
# error "Unknown endianness"
#endif

static
Addr IRConst2Addr(IRConst* con)
{
    Addr addr;

    if (sizeof(Addr) == 4) {
	CLG_ASSERT( con->tag == Ico_U32 );
	addr = con->Ico.U32;
    }
    else if (sizeof(Addr) == 8) {
	CLG_ASSERT( con->tag == Ico_U64 );
	addr = con->Ico.U64;
    }
    else
	VG_(tool_panic)("Callgrind: invalid Addr type");

    return addr;
}

/* First pass over a BB to instrument, counting instructions and jumps.
 * This is needed to determine the size of the BB struct to allocate.
 *
 * Called from CLG_(get_bb)
 */
void CLG_(collectBlockInfo)(IRSB* sbIn,
			    /*INOUT*/ UInt* instrs,
			    /*INOUT*/ UInt* cjmps,
			    /*INOUT*/ Bool* cjmp_inverted)
{
    Int i;
    IRStmt* st;
    Addr instrAddr =0, jumpDst;
    UInt instrLen = 0;
    Bool toNextInstr = False;

    // Ist_Exit has to be ignored in preamble code, before first IMark:
    // preamble code is added by VEX for self-modifying code, and has
    // nothing to do with client code
    Bool inPreamble = True;

    if (!sbIn) return;

    for (i = 0; i < sbIn->stmts_used; i++) {
	  st = sbIn->stmts[i];
	  if (Ist_IMark == st->tag) {
	      inPreamble = False;

	      instrAddr = (Addr)ULong_to_Ptr(st->Ist.IMark.addr);
	      instrLen  = st->Ist.IMark.len;

	      (*instrs)++;
	      toNextInstr = False;
	  }
	  if (inPreamble) continue;
	  if (Ist_Exit == st->tag) {
	      jumpDst = IRConst2Addr(st->Ist.Exit.dst);
	      toNextInstr =  (jumpDst == instrAddr + instrLen);

	      (*cjmps)++;
	  }
    }

    /* If the last instruction of the BB conditionally jumps to the next
     * instruction (= the first instruction of the next BB in memory),
     * the jump condition was inverted by VEX.
     */
    *cjmp_inverted = toNextInstr;
}

static
void addConstMemStoreStmt( IRSB* bbOut, UWord addr, UInt val, IRType hWordTy)
{
    addStmtToIRSB( bbOut,
		   IRStmt_Store(CLGEndness,
				IRExpr_Const(hWordTy == Ity_I32 ?
					     IRConst_U32( addr ) :
					     IRConst_U64( addr )),
				IRExpr_Const(IRConst_U32(val)) ));
}


/* add helper call to setup_bbcc, with pointer to BB struct as argument
 *
 * precondition for setup_bbcc:
 * - jmps_passed has the number of cond. jumps passed in the last executed BB
 * - current_bbcc has a pointer to the BBCC of the last executed BB
 *   Thus, if bbcc_jmpkind is != -1 (JmpNone),
 *     current_bbcc->bb->jmp_addr
 *   gives the address of the jump source.
 *
 * the setup does 2 things:
 * - trace call:
 *   * Unwind own call stack, i.e. sync our ESP with the real ESP
 *     This is for ESP manipulation (longjmps, C++ exception handling) and RET
 *   * For CALLs or JMPs crossing objects, record the call arg and
 *     push a new entry on our own call stack
 *
 * - prepare for cache log functions:
 *   set current_bbcc to the BBCC that gets the costs for this BB execution
 *   attached
 */
static
void addBBSetupCall(ClgState* clgs)
{
   IRDirty* di;
   IRExpr  *arg1, **argv;

   arg1 = mkIRExpr_HWord( (HWord)clgs->bb );
   argv = mkIRExprVec_1(arg1);
   di = unsafeIRDirty_0_N( 1, "setup_bbcc",
			      VG_(fnptr_to_fnentry)( & CLG_(setup_bbcc) ),
			      argv);
   addStmtToIRSB( clgs->sbOut, IRStmt_Dirty(di) );
}


static
IRSB* CLG_(instrument)( VgCallbackClosure* closure,
			IRSB* sbIn,
			VexGuestLayout* layout,
			VexGuestExtents* vge,
                        VexArchInfo* archinfo_host,
			IRType gWordTy, IRType hWordTy )
{
   Int        i;
   IRStmt*    st;
   Addr       origAddr;
   InstrInfo* curr_inode = NULL;
   ClgState   clgs;
   UInt       cJumps = 0;
   IRTypeEnv* tyenv = sbIn->tyenv;

   if (gWordTy != hWordTy) {
      /* We don't currently support this case. */
      VG_(tool_panic)("host/guest word size mismatch");
   }

   // No instrumentation if it is switched off
   if (! CLG_(instrument_state)) {
       CLG_DEBUG(5, "instrument(BB %#lx) [Instrumentation OFF]\n",
		 (Addr)closure->readdr);
       return sbIn;
   }

   CLG_DEBUG(3, "+ instrument(BB %#lx)\n", (Addr)closure->readdr);

   /* Set up SB for instrumented IR */
   clgs.sbOut = deepCopyIRSBExceptStmts(sbIn);

   // Copy verbatim any IR preamble preceding the first IMark
   i = 0;
   while (i < sbIn->stmts_used && sbIn->stmts[i]->tag != Ist_IMark) {
      addStmtToIRSB( clgs.sbOut, sbIn->stmts[i] );
      i++;
   }

   // Get the first statement, and origAddr from it
   CLG_ASSERT(sbIn->stmts_used >0);
   CLG_ASSERT(i < sbIn->stmts_used);
   st = sbIn->stmts[i];
   CLG_ASSERT(Ist_IMark == st->tag);

   origAddr = (Addr)st->Ist.IMark.addr + (Addr)st->Ist.IMark.delta;
   CLG_ASSERT(origAddr == st->Ist.IMark.addr
                          + st->Ist.IMark.delta);  // XXX: check no overflow

   /* Get BB struct (creating if necessary).
    * JS: The hash table is keyed with orig_addr_noredir -- important!
    * JW: Why? If it is because of different chasing of the redirection,
    *     this is not needed, as chasing is switched off in callgrind
    */
   clgs.bb = CLG_(get_bb)(origAddr, sbIn, &(clgs.seen_before));

   addBBSetupCall(&clgs);

   // Set up running state
   clgs.events_used = 0;
   clgs.ii_index = 0;
   clgs.instr_offset = 0;

   for (/*use current i*/; i < sbIn->stmts_used; i++) {

      st = sbIn->stmts[i];
      CLG_ASSERT(isFlatIRStmt(st));

      switch (st->tag) {
	 case Ist_NoOp:
	 case Ist_AbiHint:
	 case Ist_Put:
	 case Ist_PutI:
	 case Ist_MBE:
	    break;

	 case Ist_IMark: {
            Addr64 cia   = st->Ist.IMark.addr + st->Ist.IMark.delta;
            Int    isize = st->Ist.IMark.len;
            CLG_ASSERT(clgs.instr_offset == (Addr)cia - origAddr);
	    // If Vex fails to decode an instruction, the size will be zero.
	    // Pretend otherwise.
	    if (isize == 0) isize = VG_MIN_INSTR_SZB;

	    // Sanity-check size.
	    tl_assert( (VG_MIN_INSTR_SZB <= isize && isize <= VG_MAX_INSTR_SZB)
		     || VG_CLREQ_SZB == isize );

	    // Init the inode, record it as the current one.
	    // Subsequent Dr/Dw/Dm events from the same instruction will
	    // also use it.
	    curr_inode = next_InstrInfo (&clgs, isize);

	    addEvent_Ir( &clgs, curr_inode );
	    break;
	 }

	 case Ist_WrTmp: {
	    IRExpr* data = st->Ist.WrTmp.data;
	    if (data->tag == Iex_Load) {
	       IRExpr* aexpr = data->Iex.Load.addr;
	       // Note also, endianness info is ignored.  I guess
	       // that's not interesting.
	       addEvent_Dr( &clgs, curr_inode,
			    sizeofIRType(data->Iex.Load.ty), aexpr );
	    }
	    break;
	 }

	 case Ist_Store: {
	    IRExpr* data  = st->Ist.Store.data;
	    IRExpr* aexpr = st->Ist.Store.addr;
	    addEvent_Dw( &clgs, curr_inode,
			 sizeofIRType(typeOfIRExpr(sbIn->tyenv, data)), aexpr );
	    break;
	 }

         case Ist_StoreG: {
            IRStoreG* sg   = st->Ist.StoreG.details;
            IRExpr*   data = sg->data;
            IRExpr*   addr = sg->addr;
            IRType    type = typeOfIRExpr(tyenv, data);
            tl_assert(type != Ity_INVALID);
            addEvent_D_guarded( &clgs, curr_inode,
                                sizeofIRType(type), addr, sg->guard,
                                True/*isWrite*/ );
            break;
         }

         case Ist_LoadG: {
            IRLoadG* lg       = st->Ist.LoadG.details;
            IRType   type     = Ity_INVALID; /* loaded type */
            IRType   typeWide = Ity_INVALID; /* after implicit widening */
            IRExpr*  addr     = lg->addr;
            typeOfIRLoadGOp(lg->cvt, &typeWide, &type);
            tl_assert(type != Ity_INVALID);
            addEvent_D_guarded( &clgs, curr_inode,
                                sizeofIRType(type), addr, lg->guard,
                                False/*!isWrite*/ );
            break;
         }

	 case Ist_Dirty: {
	    Int      dataSize;
	    IRDirty* d = st->Ist.Dirty.details;
	    if (d->mFx != Ifx_None) {
	       /* This dirty helper accesses memory.  Collect the details. */
	       tl_assert(d->mAddr != NULL);
	       tl_assert(d->mSize != 0);
	       dataSize = d->mSize;
	       // Large (e.g. 28B, 108B, 512B on x86) data-sized
	       // instructions will be done inaccurately, but they're
	       // very rare and this avoids errors from hitting more
	       // than two cache lines in the simulation.
	       if (CLG_(clo).simulate_cache && dataSize > CLG_(min_line_size))
		  dataSize = CLG_(min_line_size);
	       if (d->mFx == Ifx_Read || d->mFx == Ifx_Modify)
		  addEvent_Dr( &clgs, curr_inode, dataSize, d->mAddr );
	       if (d->mFx == Ifx_Write || d->mFx == Ifx_Modify)
		  addEvent_Dw( &clgs, curr_inode, dataSize, d->mAddr );
	    } else {
	       tl_assert(d->mAddr == NULL);
	       tl_assert(d->mSize == 0);
	    }
	    break;
	 }

         case Ist_CAS: {
            /* We treat it as a read and a write of the location.  I
               think that is the same behaviour as it was before IRCAS
               was introduced, since prior to that point, the Vex
               front ends would translate a lock-prefixed instruction
               into a (normal) read followed by a (normal) write. */
            Int    dataSize;
            IRCAS* cas = st->Ist.CAS.details;
            CLG_ASSERT(cas->addr && isIRAtom(cas->addr));
            CLG_ASSERT(cas->dataLo);
            dataSize = sizeofIRType(typeOfIRExpr(sbIn->tyenv, cas->dataLo));
            if (cas->dataHi != NULL)
               dataSize *= 2; /* since this is a doubleword-cas */
            addEvent_Dr( &clgs, curr_inode, dataSize, cas->addr );
            addEvent_Dw( &clgs, curr_inode, dataSize, cas->addr );
            addEvent_G(  &clgs, curr_inode );
            break;
         }

         case Ist_LLSC: {
            IRType dataTy;
            if (st->Ist.LLSC.storedata == NULL) {
               /* LL */
               dataTy = typeOfIRTemp(sbIn->tyenv, st->Ist.LLSC.result);
               addEvent_Dr( &clgs, curr_inode,
                            sizeofIRType(dataTy), st->Ist.LLSC.addr );
               /* flush events before LL, should help SC to succeed */
               flushEvents( &clgs );
            } else {
               /* SC */
               dataTy = typeOfIRExpr(sbIn->tyenv, st->Ist.LLSC.storedata);
               addEvent_Dw( &clgs, curr_inode,
                            sizeofIRType(dataTy), st->Ist.LLSC.addr );
               /* I don't know whether the global-bus-lock cost should
                  be attributed to the LL or the SC, but it doesn't
                  really matter since they always have to be used in
                  pairs anyway.  Hence put it (quite arbitrarily) on
                  the SC. */
               addEvent_G(  &clgs, curr_inode );
            }
            break;
         }

	 case Ist_Exit: {
            Bool guest_exit, inverted;

            /* VEX code generation sometimes inverts conditional branches.
             * As Callgrind counts (conditional) jumps, it has to correct
             * inversions. The heuristic is the following:
             * (1) Callgrind switches off SB chasing and unrolling, and
             *     therefore assumes that only the last conditional branch
             *     in an SB is a candidate for inversion.
             * (2) Inversion is assumed if the branch jumps to the address
             *     of the next guest instruction in memory.
             * This heuristic is precalculated in CLG_(collectBlockInfo)().
             *
             * Branching behavior is also used for branch prediction. Note
             * that the above heuristic differs from what Cachegrind does:
             * Cachegrind uses (2) for all branches.
             */
            if (cJumps+1 == clgs.bb->cjmp_count)
                inverted = clgs.bb->cjmp_inverted;
            else
                inverted = False;

            // call branch predictor only if this is a branch in guest code
            guest_exit = (st->Ist.Exit.jk == Ijk_Boring) ||
                         (st->Ist.Exit.jk == Ijk_Call) ||
                         (st->Ist.Exit.jk == Ijk_Ret);

            if (guest_exit) {
                /* Stuff to widen the guard expression to a host word, so
                   we can pass it to the branch predictor simulation
                   functions easily. */
                IRType   tyW    = hWordTy;
                IROp     widen  = tyW==Ity_I32  ? Iop_1Uto32  : Iop_1Uto64;
                IROp     opXOR  = tyW==Ity_I32  ? Iop_Xor32   : Iop_Xor64;
                IRTemp   guard1 = newIRTemp(clgs.sbOut->tyenv, Ity_I1);
                IRTemp   guardW = newIRTemp(clgs.sbOut->tyenv, tyW);
                IRTemp   guard  = newIRTemp(clgs.sbOut->tyenv, tyW);
                IRExpr*  one    = tyW==Ity_I32 ? IRExpr_Const(IRConst_U32(1))
                                               : IRExpr_Const(IRConst_U64(1));

                /* Widen the guard expression. */
                addStmtToIRSB( clgs.sbOut,
                               IRStmt_WrTmp( guard1, st->Ist.Exit.guard ));
                addStmtToIRSB( clgs.sbOut,
                               IRStmt_WrTmp( guardW,
                                             IRExpr_Unop(widen,
                                                         IRExpr_RdTmp(guard1))) );
                /* If the exit is inverted, invert the sense of the guard. */
                addStmtToIRSB(
                        clgs.sbOut,
                        IRStmt_WrTmp(
                                guard,
                                inverted ? IRExpr_Binop(opXOR, IRExpr_RdTmp(guardW), one)
                                    : IRExpr_RdTmp(guardW)
                                    ));
                /* And post the event. */
                addEvent_Bc( &clgs, curr_inode, IRExpr_RdTmp(guard) );
            }

	    /* We may never reach the next statement, so need to flush
	       all outstanding transactions now. */
	    flushEvents( &clgs );

	    CLG_ASSERT(clgs.ii_index>0);
	    if (!clgs.seen_before) {
	      ClgJumpKind jk;

	      if      (st->Ist.Exit.jk == Ijk_Call) jk = jk_Call;
	      else if (st->Ist.Exit.jk == Ijk_Ret)  jk = jk_Return;
	      else {
		if (IRConst2Addr(st->Ist.Exit.dst) ==
		    origAddr + curr_inode->instr_offset + curr_inode->instr_size)
		  jk = jk_None;
		else
		  jk = jk_Jump;
	      }

	      clgs.bb->jmp[cJumps].instr = clgs.ii_index-1;
	      clgs.bb->jmp[cJumps].jmpkind = jk;
	    }

	    /* Update the global variable jmps_passed before the jump.
	     * A correction is needed if VEX inverted the last jump
	     * condition.
	     */
	    UInt val = inverted ? cJumps+1 : cJumps;
	    addConstMemStoreStmt( clgs.sbOut,
				  (UWord) &CLG_(current_state).jmps_passed,
				  val, hWordTy);
	    cJumps++;

	    break;
	 }

	 default:
	    tl_assert(0);
	    break;
      }

      /* Copy the original statement */
      addStmtToIRSB( clgs.sbOut, st );

      CLG_DEBUGIF(5) {
	 VG_(printf)("   pass  ");
	 ppIRStmt(st);
	 VG_(printf)("\n");
      }
   }

   /* Deal with branches to unknown destinations.  Except ignore ones
      which are function returns as we assume the return stack
      predictor never mispredicts. */
   if ((sbIn->jumpkind == Ijk_Boring) || (sbIn->jumpkind == Ijk_Call)) {
      if (0) { ppIRExpr( sbIn->next ); VG_(printf)("\n"); }
      switch (sbIn->next->tag) {
         case Iex_Const:
            break; /* boring - branch to known address */
         case Iex_RdTmp:
            /* looks like an indirect branch (branch to unknown) */
            addEvent_Bi( &clgs, curr_inode, sbIn->next );
            break;
         default:
            /* shouldn't happen - if the incoming IR is properly
               flattened, should only have tmp and const cases to
               consider. */
            tl_assert(0);
      }
   }

   /* At the end of the bb.  Flush outstandings. */
   flushEvents( &clgs );

   /* Update global variable jmps_passed at end of SB.
    * As CLG_(current_state).jmps_passed is reset to 0 in setup_bbcc,
    * this can be omitted if there is no conditional jump in this SB.
    * A correction is needed if VEX inverted the last jump condition
    */
   if (cJumps>0) {
      UInt jmps_passed = cJumps;
      if (clgs.bb->cjmp_inverted) jmps_passed--;
      addConstMemStoreStmt( clgs.sbOut,
			    (UWord) &CLG_(current_state).jmps_passed,
			    jmps_passed, hWordTy);
   }
   CLG_ASSERT(clgs.bb->cjmp_count == cJumps);
   CLG_ASSERT(clgs.bb->instr_count == clgs.ii_index);

   /* Info for final exit from BB */
   {
     ClgJumpKind jk;

     if      (sbIn->jumpkind == Ijk_Call) jk = jk_Call;
     else if (sbIn->jumpkind == Ijk_Ret)  jk = jk_Return;
     else {
       jk = jk_Jump;
       if ((sbIn->next->tag == Iex_Const) &&
	   (IRConst2Addr(sbIn->next->Iex.Const.con) ==
	    origAddr + clgs.instr_offset))
	 jk = jk_None;
     }
     clgs.bb->jmp[cJumps].jmpkind = jk;
     /* Instruction index of the call/ret at BB end
      * (it is wrong for fall-through, but does not matter) */
     clgs.bb->jmp[cJumps].instr = clgs.ii_index-1;
   }

   /* swap information of last exit with final exit if inverted */
   if (clgs.bb->cjmp_inverted) {
     ClgJumpKind jk;
     UInt instr;

     jk = clgs.bb->jmp[cJumps].jmpkind;
     clgs.bb->jmp[cJumps].jmpkind = clgs.bb->jmp[cJumps-1].jmpkind;
     clgs.bb->jmp[cJumps-1].jmpkind = jk;
     instr = clgs.bb->jmp[cJumps].instr;
     clgs.bb->jmp[cJumps].instr = clgs.bb->jmp[cJumps-1].instr;
     clgs.bb->jmp[cJumps-1].instr = instr;
   }

   if (clgs.seen_before) {
       CLG_ASSERT(clgs.bb->cost_count == update_cost_offsets(&clgs));
       CLG_ASSERT(clgs.bb->instr_len == clgs.instr_offset);
   }
   else {
       clgs.bb->cost_count = update_cost_offsets(&clgs);
       clgs.bb->instr_len = clgs.instr_offset;
   }

   CLG_DEBUG(3, "- instrument(BB %#lx): byteLen %u, CJumps %u, CostLen %u\n",
	     origAddr, clgs.bb->instr_len,
	     clgs.bb->cjmp_count, clgs.bb->cost_count);
   if (cJumps>0) {
       CLG_DEBUG(3, "                     [ ");
       for (i=0;i<cJumps;i++)
	   CLG_DEBUG(3, "%d ", clgs.bb->jmp[i].instr);
       CLG_DEBUG(3, "], last inverted: %s \n",
		 clgs.bb->cjmp_inverted ? "yes":"no");
   }

  return clgs.sbOut;
}

/*--------------------------------------------------------------------*/
/*--- Discarding BB info                                           ---*/
/*--------------------------------------------------------------------*/

// Called when a translation is removed from the translation cache for
// any reason at all: to free up space, because the guest code was
// unmapped or modified, or for any arbitrary reason.
static
void clg_discard_superblock_info ( Addr64 orig_addr64, VexGuestExtents vge )
{
    Addr orig_addr = (Addr)orig_addr64;

    tl_assert(vge.n_used > 0);

   if (0)
      VG_(printf)( "discard_superblock_info: %p, %p, %llu\n",
                   (void*)(Addr)orig_addr,
                   (void*)(Addr)vge.base[0], (ULong)vge.len[0]);

   // Get BB info, remove from table, free BB info.  Simple!  Note that we
   // use orig_addr, not the first instruction address in vge.
   CLG_(delete_bb)(orig_addr);
}


/*------------------------------------------------------------*/
/*--- CLG_(fini)() and related function                     ---*/
/*------------------------------------------------------------*/



static void zero_thread_cost(thread_info* t)
{
  Int i;

  for(i = 0; i < CLG_(current_call_stack).sp; i++) {
    if (!CLG_(current_call_stack).entry[i].jcc) continue;

    /* reset call counters to current for active calls */
    CLG_(copy_cost)( CLG_(sets).full,
		    CLG_(current_call_stack).entry[i].enter_cost,
		    CLG_(current_state).cost );
    CLG_(current_call_stack).entry[i].jcc->call_counter = 0;
  }

  CLG_(forall_bbccs)(CLG_(zero_bbcc));

  /* set counter for last dump */
  CLG_(copy_cost)( CLG_(sets).full,
		  t->lastdump_cost, CLG_(current_state).cost );
}

void CLG_(zero_all_cost)(Bool only_current_thread)
{
  if (VG_(clo_verbosity) > 1)
    VG_(message)(Vg_DebugMsg, "  Zeroing costs...\n");

  if (only_current_thread)
    zero_thread_cost(CLG_(get_current_thread)());
  else
    CLG_(forall_threads)(zero_thread_cost);

  if (VG_(clo_verbosity) > 1)
    VG_(message)(Vg_DebugMsg, "  ...done\n");
}

static
void unwind_thread(thread_info* t)
{
  /* unwind signal handlers */
  while(CLG_(current_state).sig !=0)
    CLG_(post_signal)(CLG_(current_tid),CLG_(current_state).sig);

  /* unwind regular call stack */
  while(CLG_(current_call_stack).sp>0)
    CLG_(pop_call_stack)();

  /* reset context and function stack for context generation */
  CLG_(init_exec_state)( &CLG_(current_state) );
  CLG_(current_fn_stack).top = CLG_(current_fn_stack).bottom;
}

static
void zero_state_cost(thread_info* t)
{
    CLG_(zero_cost)( CLG_(sets).full, CLG_(current_state).cost );
}

/* Oops, this can go very wrong...
   FIXME: We should export this function or provide other means to get a handle */
extern void VG_(discard_translations) ( Addr64 start, ULong range, const HChar* who );

void CLG_(set_instrument_state)(const HChar* reason, Bool state)
{
  if (CLG_(instrument_state) == state) {
    CLG_DEBUG(2, "%s: instrumentation already %s\n",
	     reason, state ? "ON" : "OFF");
    return;
  }
  CLG_(instrument_state) = state;
  CLG_DEBUG(2, "%s: Switching instrumentation %s ...\n",
	   reason, state ? "ON" : "OFF");

  VG_(discard_translations)( (Addr64)0x1000, (ULong) ~0xfffl, "callgrind");

  /* reset internal state: call stacks, simulator */
  CLG_(forall_threads)(unwind_thread);
  CLG_(forall_threads)(zero_state_cost);
  (*CLG_(cachesim).clear)();

  if (VG_(clo_verbosity) > 1)
    VG_(message)(Vg_DebugMsg, "%s: instrumentation switched %s\n",
		 reason, state ? "ON" : "OFF");
}

/* helper for dump_state_togdb */
static void dump_state_of_thread_togdb(thread_info* ti)
{
    static HChar buf[512];
    static FullCost sum = 0, tmp = 0;
    Int t, p, i;
    BBCC *from, *to;
    call_entry* ce;

    t = CLG_(current_tid);
    CLG_(init_cost_lz)( CLG_(sets).full, &sum );
    CLG_(copy_cost_lz)( CLG_(sets).full, &tmp, ti->lastdump_cost );
    CLG_(add_diff_cost)( CLG_(sets).full, sum, ti->lastdump_cost,
			 ti->states.entry[0]->cost);
    CLG_(copy_cost)( CLG_(sets).full, ti->lastdump_cost, tmp );
    CLG_(sprint_mappingcost)(buf, CLG_(dumpmap), sum);
    VG_(gdb_printf)("events-%d: %s\n", t, buf);
    VG_(gdb_printf)("frames-%d: %d\n", t, CLG_(current_call_stack).sp);

    ce = 0;
    for(i = 0; i < CLG_(current_call_stack).sp; i++) {
      ce = CLG_(get_call_entry)(i);
      /* if this frame is skipped, we don't have counters */
      if (!ce->jcc) continue;

      from = ce->jcc->from;
      VG_(gdb_printf)("function-%d-%d: %s\n",t, i, from->cxt->fn[0]->name);
      VG_(gdb_printf)("calls-%d-%d: %llu\n",t, i, ce->jcc->call_counter);

      /* FIXME: EventSets! */
      CLG_(copy_cost)( CLG_(sets).full, sum, ce->jcc->cost );
      CLG_(copy_cost)( CLG_(sets).full, tmp, ce->enter_cost );
      CLG_(add_diff_cost)( CLG_(sets).full, sum,
			  ce->enter_cost, CLG_(current_state).cost );
      CLG_(copy_cost)( CLG_(sets).full, ce->enter_cost, tmp );

      p = VG_(sprintf)(buf, "events-%d-%d: ",t, i);
      CLG_(sprint_mappingcost)(buf + p, CLG_(dumpmap), sum );
      VG_(gdb_printf)("%s\n", buf);
    }
    if (ce && ce->jcc) {
      to = ce->jcc->to;
      VG_(gdb_printf)("function-%d-%d: %s\n",t, i, to->cxt->fn[0]->name );
    }
}

/* Dump current state */
static void dump_state_togdb(void)
{
    static HChar buf[512];
    thread_info** th;
    int t, p;
    Int orig_tid = CLG_(current_tid);

    VG_(gdb_printf)("instrumentation: %s\n",
		    CLG_(instrument_state) ? "on":"off");
    if (!CLG_(instrument_state)) return;

    VG_(gdb_printf)("executed-bbs: %llu\n", CLG_(stat).bb_executions);
    VG_(gdb_printf)("executed-calls: %llu\n", CLG_(stat).call_counter);
    VG_(gdb_printf)("distinct-bbs: %d\n", CLG_(stat).distinct_bbs);
    VG_(gdb_printf)("distinct-calls: %d\n", CLG_(stat).distinct_jccs);
    VG_(gdb_printf)("distinct-functions: %d\n", CLG_(stat).distinct_fns);
    VG_(gdb_printf)("distinct-contexts: %d\n", CLG_(stat).distinct_contexts);

    /* "events:" line. Given here because it will be dynamic in the future */
    p = VG_(sprintf)(buf, "events: ");
    CLG_(sprint_eventmapping)(buf+p, CLG_(dumpmap));
    VG_(gdb_printf)("%s\n", buf);
    /* "part:" line (number of the last part; is 0 at start) */
    VG_(gdb_printf)("part: %d\n", CLG_(get_dump_counter)());

    /* threads */
    th = CLG_(get_threads)();
    p = VG_(sprintf)(buf, "threads:");
    for(t=1;t<VG_N_THREADS;t++) {
	if (!th[t]) continue;
	p += VG_(sprintf)(buf+p, " %d", t);
    }
    VG_(gdb_printf)("%s\n", buf);
    VG_(gdb_printf)("current-tid: %d\n", orig_tid);
    CLG_(forall_threads)(dump_state_of_thread_togdb);
}


static void print_monitor_help ( void )
{
   VG_(gdb_printf) ("\n");
   VG_(gdb_printf) ("callgrind monitor commands:\n");
   VG_(gdb_printf) ("  dump [<dump_hint>]\n");
   VG_(gdb_printf) ("        dump counters\n");
   VG_(gdb_printf) ("  zero\n");
   VG_(gdb_printf) ("        zero counters\n");
   VG_(gdb_printf) ("  status\n");
   VG_(gdb_printf) ("        print status\n");
   VG_(gdb_printf) ("  instrumentation [on|off]\n");
   VG_(gdb_printf) ("        get/set (if on/off given) instrumentation state\n");
   VG_(gdb_printf) ("\n");
}

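/* Usage sketch, assuming a gdb session attached via vgdb (the hint
   string after "dump" is an arbitrary example):

     (gdb) monitor status
     (gdb) monitor instrumentation off
     (gdb) monitor zero
     (gdb) monitor dump my-snapshot
*/
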
1578/* return True if request recognised, False otherwise */
1579static Bool handle_gdb_monitor_command (ThreadId tid, const HChar *req)
1580{
1581   HChar* wcmd;
1582   HChar s[VG_(strlen(req)) + 1]; /* copy for strtok_r */
1583   HChar *ssaveptr;
1584
1585   VG_(strcpy) (s, req);
1586
1587   wcmd = VG_(strtok_r) (s, " ", &ssaveptr);
1588   switch (VG_(keyword_id) ("help dump zero status instrumentation",
1589                            wcmd, kwd_report_duplicated_matches)) {
1590   case -2: /* multiple matches */
1591      return True;
1592   case -1: /* not found */
1593      return False;
1594   case  0: /* help */
1595      print_monitor_help();
1596      return True;
1597   case  1: { /* dump */
1598      CLG_(dump_profile)(req, False);
1599      return True;
1600   }
1601   case  2: { /* zero */
1602      CLG_(zero_all_cost)(False);
1603      return True;
1604   }
1605
1606   case 3: { /* status */
1607     HChar* arg = VG_(strtok_r) (0, " ", &ssaveptr);
1608     if (arg && (VG_(strcmp)(arg, "internal") == 0)) {
1609       /* internal interface to callgrind_control */
1610       dump_state_togdb();
1611       return True;
1612     }
1613
1614     if (!CLG_(instrument_state)) {
1615       VG_(gdb_printf)("No status available as instrumentation is switched off\n");
1616     } else {
1617       // Status information to be improved ...
1618       thread_info** th = CLG_(get_threads)();
1619       Int t, tcount = 0;
1620       for(t=1;t<VG_N_THREADS;t++)
1621	 if (th[t]) tcount++;
1622       VG_(gdb_printf)("%d thread(s) running.\n", tcount);
1623     }
1624     return True;
1625   }
1626
1627   case 4: { /* instrumentation */
1628     HChar* arg = VG_(strtok_r) (0, " ", &ssaveptr);
1629     if (!arg) {
1630       VG_(gdb_printf)("instrumentation: %s\n",
1631		       CLG_(instrument_state) ? "on":"off");
1632     }
1633     else
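       /* any argument other than "off" switches instrumentation on */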
1634       CLG_(set_instrument_state)("Command", VG_(strcmp)(arg,"off")!=0);
1635     return True;
1636   }
1637
1638   default:
1639      tl_assert(0);
1640      return False;
1641   }
1642}
1643
1644static
1645Bool CLG_(handle_client_request)(ThreadId tid, UWord *args, UWord *ret)
1646{
1647   if (!VG_IS_TOOL_USERREQ('C','T',args[0])
1648       && VG_USERREQ__GDB_MONITOR_COMMAND   != args[0])
1649      return False;
1650
1651   switch(args[0]) {
1652   case VG_USERREQ__DUMP_STATS:
1653      CLG_(dump_profile)("Client Request", True);
1654      *ret = 0;                 /* meaningless */
1655      break;
1656
1657   case VG_USERREQ__DUMP_STATS_AT:
1658     {
1659       HChar buf[512];
1660       VG_(sprintf)(buf,"Client Request: %s", (HChar*)args[1]);
1661       CLG_(dump_profile)(buf, True);
1662       *ret = 0;                 /* meaningless */
1663     }
1664     break;
1665
1666   case VG_USERREQ__ZERO_STATS:
1667     CLG_(zero_all_cost)(True);
1668      *ret = 0;                 /* meaningless */
1669      break;
1670
1671   case VG_USERREQ__TOGGLE_COLLECT:
1672     CLG_(current_state).collect = !CLG_(current_state).collect;
1673     CLG_DEBUG(2, "Client Request: toggled collection state to %s\n",
1674	      CLG_(current_state).collect ? "ON" : "OFF");
1675     *ret = 0;                 /* meaningless */
1676     break;
1677
1678   case VG_USERREQ__START_INSTRUMENTATION:
1679     CLG_(set_instrument_state)("Client Request", True);
1680     *ret = 0;                 /* meaningless */
1681     break;
1682
1683   case VG_USERREQ__STOP_INSTRUMENTATION:
1684     CLG_(set_instrument_state)("Client Request", False);
1685     *ret = 0;                 /* meaningless */
1686     break;
1687
1688   case VG_USERREQ__GDB_MONITOR_COMMAND: {
1689      Bool handled = handle_gdb_monitor_command (tid, (HChar*)args[1]);
1690      if (handled)
1691         *ret = 1;
1692      else
1693         *ret = 0;
1694      return handled;
1695   }
1696   default:
1697      return False;
1698   }
1699
1700   return True;
1701}
1702
1703
1704/* Syscall Timing */
1705
1706/* struct timeval syscalltime[VG_N_THREADS]; */
1707#if CLG_MICROSYSTIME
1708#include <sys/time.h>
1709#include <sys/syscall.h>
1710extern Int VG_(do_syscall) ( UInt, ... );
1711
1712ULong syscalltime[VG_N_THREADS];
1713#else
1714UInt syscalltime[VG_N_THREADS];
1715#endif
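
/* Time unit for syscalltime[]: microseconds (via gettimeofday) with
   CLG_MICROSYSTIME, milliseconds (via VG_(read_millisecond_timer))
   otherwise. */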
1716
1717static
1718void CLG_(pre_syscalltime)(ThreadId tid, UInt syscallno,
1719                           UWord* args, UInt nArgs)
1720{
1721  if (CLG_(clo).collect_systime) {
1722#if CLG_MICROSYSTIME
1723    struct vki_timeval tv_now;
1724    VG_(do_syscall)(__NR_gettimeofday, (UInt)&tv_now, (UInt)NULL);
1725    syscalltime[tid] = tv_now.tv_sec * 1000000ULL + tv_now.tv_usec;
1726#else
1727    syscalltime[tid] = VG_(read_millisecond_timer)();
1728#endif
1729  }
1730}
1731
1732static
1733void CLG_(post_syscalltime)(ThreadId tid, UInt syscallno,
1734                            UWord* args, UInt nArgs, SysRes res)
1735{
1736  if (CLG_(clo).collect_systime &&
1737      CLG_(current_state).bbcc) {
    Int o;
1739#if CLG_MICROSYSTIME
1740    struct vki_timeval tv_now;
1741    ULong diff;
1742
1743    VG_(do_syscall)(__NR_gettimeofday, (UInt)&tv_now, (UInt)NULL);
1744    diff = (tv_now.tv_sec * 1000000ULL + tv_now.tv_usec) - syscalltime[tid];
1745#else
1746    UInt diff = VG_(read_millisecond_timer)() - syscalltime[tid];
1747#endif
1748
1749    /* offset o is for "SysCount", o+1 for "SysTime" */
1750    o = fullOffset(EG_SYS);
1751    CLG_ASSERT(o>=0);
1752    CLG_DEBUG(0,"   Time (Off %d) for Syscall %d: %ull\n", o, syscallno, diff);
1753
1754    CLG_(current_state).cost[o] ++;
1755    CLG_(current_state).cost[o+1] += diff;
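    /* also attribute the syscall cost to the current BBCC's skipped-cost
       array, allocating it lazily on first use */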
1756    if (!CLG_(current_state).bbcc->skipped)
1757      CLG_(init_cost_lz)(CLG_(sets).full,
1758			&(CLG_(current_state).bbcc->skipped));
1759    CLG_(current_state).bbcc->skipped[o] ++;
1760    CLG_(current_state).bbcc->skipped[o+1] += diff;
1761  }
1762}
1763
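/* Width of n when printed with thousands separators: e.g. n = 1234567
   needs 7 digits + 2 commas = 9 characters. */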
1764static UInt ULong_width(ULong n)
1765{
1766   UInt w = 0;
1767   while (n > 0) {
1768      n = n / 10;
1769      w++;
1770   }
1771   if (w == 0) w = 1;
1772   return w + (w-1)/3;   // add space for commas
1773}
1774
1775static
1776void branchsim_printstat(int l1, int l2, int l3)
1777{
1778    static HChar buf1[128], buf2[128], buf3[128];
1779    static HChar fmt[128];
1780    FullCost total;
1781    ULong Bc_total_b, Bc_total_mp, Bi_total_b, Bi_total_mp;
1782    ULong B_total_b, B_total_mp;
1783
1784    total = CLG_(total_cost);
1785    Bc_total_b  = total[ fullOffset(EG_BC)   ];
1786    Bc_total_mp = total[ fullOffset(EG_BC)+1 ];
1787    Bi_total_b  = total[ fullOffset(EG_BI)   ];
1788    Bi_total_mp = total[ fullOffset(EG_BI)+1 ];
1789
1790    /* Make format string, getting width right for numbers */
1791    VG_(sprintf)(fmt, "%%s %%,%dllu  (%%,%dllu cond + %%,%dllu ind)\n",
1792                 l1, l2, l3);
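    /* e.g. l1=12, l2=10, l3=10 yields
       "%s %,12llu  (%,10llu cond + %,10llu ind)\n" */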
1793
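    /* avoid division by zero in the percentages computed below */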
1794    if (0 == Bc_total_b)  Bc_total_b = 1;
1795    if (0 == Bi_total_b)  Bi_total_b = 1;
1796    B_total_b  = Bc_total_b  + Bi_total_b;
1797    B_total_mp = Bc_total_mp + Bi_total_mp;
1798
1799    VG_(umsg)("\n");
1800    VG_(umsg)(fmt, "Branches:     ",
1801              B_total_b, Bc_total_b, Bi_total_b);
1802
1803    VG_(umsg)(fmt, "Mispredicts:  ",
1804              B_total_mp, Bc_total_mp, Bi_total_mp);
1805
1806    VG_(percentify)(B_total_mp,  B_total_b,  1, l1+1, buf1);
1807    VG_(percentify)(Bc_total_mp, Bc_total_b, 1, l2+1, buf2);
1808    VG_(percentify)(Bi_total_mp, Bi_total_b, 1, l3+1, buf3);
1809
1810    VG_(umsg)("Mispred rate:  %s (%s     + %s   )\n", buf1, buf2,buf3);
1811}
1812
1813static
1814void clg_print_stats(void)
1815{
1816   int BB_lookups =
1817     CLG_(stat).full_debug_BBs +
1818     CLG_(stat).fn_name_debug_BBs +
1819     CLG_(stat).file_line_debug_BBs +
1820     CLG_(stat).no_debug_BBs;
1821
1822   /* Hash table stats */
1823   VG_(message)(Vg_DebugMsg, "Distinct objects: %d\n",
1824		CLG_(stat).distinct_objs);
1825   VG_(message)(Vg_DebugMsg, "Distinct files:   %d\n",
1826		CLG_(stat).distinct_files);
1827   VG_(message)(Vg_DebugMsg, "Distinct fns:     %d\n",
1828		CLG_(stat).distinct_fns);
1829   VG_(message)(Vg_DebugMsg, "Distinct contexts:%d\n",
1830		CLG_(stat).distinct_contexts);
1831   VG_(message)(Vg_DebugMsg, "Distinct BBs:     %d\n",
1832		CLG_(stat).distinct_bbs);
1833   VG_(message)(Vg_DebugMsg, "Cost entries:     %d (Chunks %d)\n",
1834		CLG_(costarray_entries), CLG_(costarray_chunks));
1835   VG_(message)(Vg_DebugMsg, "Distinct BBCCs:   %d\n",
1836		CLG_(stat).distinct_bbccs);
1837   VG_(message)(Vg_DebugMsg, "Distinct JCCs:    %d\n",
1838		CLG_(stat).distinct_jccs);
1839   VG_(message)(Vg_DebugMsg, "Distinct skips:   %d\n",
1840		CLG_(stat).distinct_skips);
1841   VG_(message)(Vg_DebugMsg, "BB lookups:       %d\n",
1842		BB_lookups);
1843   if (BB_lookups>0) {
1844      VG_(message)(Vg_DebugMsg, "With full      debug info:%3d%% (%d)\n",
1845		   CLG_(stat).full_debug_BBs    * 100 / BB_lookups,
1846		   CLG_(stat).full_debug_BBs);
1847      VG_(message)(Vg_DebugMsg, "With file/line debug info:%3d%% (%d)\n",
1848		   CLG_(stat).file_line_debug_BBs * 100 / BB_lookups,
1849		   CLG_(stat).file_line_debug_BBs);
1850      VG_(message)(Vg_DebugMsg, "With fn name   debug info:%3d%% (%d)\n",
1851		   CLG_(stat).fn_name_debug_BBs * 100 / BB_lookups,
1852		   CLG_(stat).fn_name_debug_BBs);
1853      VG_(message)(Vg_DebugMsg, "With no        debug info:%3d%% (%d)\n",
1854		   CLG_(stat).no_debug_BBs      * 100 / BB_lookups,
1855		   CLG_(stat).no_debug_BBs);
1856   }
1857   VG_(message)(Vg_DebugMsg, "BBCC Clones:       %d\n",
1858		CLG_(stat).bbcc_clones);
1859   VG_(message)(Vg_DebugMsg, "BBs Retranslated:  %d\n",
1860		CLG_(stat).bb_retranslations);
1861   VG_(message)(Vg_DebugMsg, "Distinct instrs:   %d\n",
1862		CLG_(stat).distinct_instrs);
1863   VG_(message)(Vg_DebugMsg, "");
1864
1865   VG_(message)(Vg_DebugMsg, "LRU Contxt Misses: %d\n",
1866		CLG_(stat).cxt_lru_misses);
1867   VG_(message)(Vg_DebugMsg, "LRU BBCC Misses:   %d\n",
1868		CLG_(stat).bbcc_lru_misses);
1869   VG_(message)(Vg_DebugMsg, "LRU JCC Misses:    %d\n",
1870		CLG_(stat).jcc_lru_misses);
1871   VG_(message)(Vg_DebugMsg, "BBs Executed:      %llu\n",
1872		CLG_(stat).bb_executions);
1873   VG_(message)(Vg_DebugMsg, "Calls:             %llu\n",
1874		CLG_(stat).call_counter);
1875   VG_(message)(Vg_DebugMsg, "CondJMP followed:  %llu\n",
1876		CLG_(stat).jcnd_counter);
1877   VG_(message)(Vg_DebugMsg, "Boring JMPs:       %llu\n",
1878		CLG_(stat).jump_counter);
1879   VG_(message)(Vg_DebugMsg, "Recursive calls:   %llu\n",
1880		CLG_(stat).rec_call_counter);
1881   VG_(message)(Vg_DebugMsg, "Returns:           %llu\n",
1882		CLG_(stat).ret_counter);
1883}
1884
1885
1886static
1887void finish(void)
1888{
1889  HChar buf[32+COSTS_LEN];
1890  HChar fmt[128];
1891  Int l1, l2, l3;
1892  FullCost total;
1893
1894  CLG_DEBUG(0, "finish()\n");
1895
1896  (*CLG_(cachesim).finish)();
1897
  /* pop all remaining items from CallStack for a correct sum */
1900  CLG_(forall_threads)(unwind_thread);
1901
1902  CLG_(dump_profile)(0, False);
1903
1904  if (VG_(clo_verbosity) == 0) return;
1905
1906  if (VG_(clo_stats)) {
1907    VG_(message)(Vg_DebugMsg, "\n");
1908    clg_print_stats();
1909    VG_(message)(Vg_DebugMsg, "\n");
1910  }
1911
1912  CLG_(sprint_eventmapping)(buf, CLG_(dumpmap));
1913  VG_(message)(Vg_UserMsg, "Events    : %s\n", buf);
1914  CLG_(sprint_mappingcost)(buf, CLG_(dumpmap), CLG_(total_cost));
1915  VG_(message)(Vg_UserMsg, "Collected : %s\n", buf);
1916  VG_(message)(Vg_UserMsg, "\n");
1917
1918  /* determine value widths for statistics */
1919  total = CLG_(total_cost);
1920  l1 = ULong_width( total[fullOffset(EG_IR)] );
1921  l2 = l3 = 0;
1922  if (CLG_(clo).simulate_cache) {
1923      l2 = ULong_width( total[fullOffset(EG_DR)] );
1924      l3 = ULong_width( total[fullOffset(EG_DW)] );
1925  }
1926  if (CLG_(clo).simulate_branch) {
1927      int l2b = ULong_width( total[fullOffset(EG_BC)] );
1928      int l3b = ULong_width( total[fullOffset(EG_BI)] );
1929      if (l2b > l2) l2 = l2b;
1930      if (l3b > l3) l3 = l3b;
1931  }
1932
1933  /* Make format string, getting width right for numbers */
1934  VG_(sprintf)(fmt, "%%s %%,%dllu\n", l1);
1935
1936  /* Always print this */
1937  VG_(umsg)(fmt, "I   refs:     ", total[fullOffset(EG_IR)] );
1938
1939  if (CLG_(clo).simulate_cache)
1940      (*CLG_(cachesim).printstat)(l1, l2, l3);
1941
1942  if (CLG_(clo).simulate_branch)
1943      branchsim_printstat(l1, l2, l3);
1944
1945}
1946
1947
1948void CLG_(fini)(Int exitcode)
1949{
1950  finish();
1951}
1952
1953
1954/*--------------------------------------------------------------------*/
1955/*--- Setup                                                        ---*/
1956/*--------------------------------------------------------------------*/
1957
1958static void clg_start_client_code_callback ( ThreadId tid, ULong blocks_done )
1959{
1960   static ULong last_blocks_done = 0;
1961
1962   if (0)
1963      VG_(printf)("%d R %llu\n", (Int)tid, blocks_done);
1964
1965   /* throttle calls to CLG_(run_thread) by number of BBs executed */
1966   if (blocks_done - last_blocks_done < 5000) return;
1967   last_blocks_done = blocks_done;
1968
1969   CLG_(run_thread)( tid );
1970}
1971
1972static
1973void CLG_(post_clo_init)(void)
1974{
1975   if (VG_(clo_vex_control).iropt_register_updates
1976       != VexRegUpdSpAtMemAccess) {
1977      CLG_DEBUG(1, " Using user specified value for "
1978                "--vex-iropt-register-updates\n");
1979   } else {
1980      CLG_DEBUG(1,
1981                " Using default --vex-iropt-register-updates="
1982                "sp-at-mem-access\n");
1983   }
1984
1985   if (VG_(clo_vex_control).iropt_unroll_thresh != 0) {
1986      VG_(message)(Vg_UserMsg,
1987                   "callgrind only works with --vex-iropt-unroll-thresh=0\n"
1988                   "=> resetting it back to 0\n");
      VG_(clo_vex_control).iropt_unroll_thresh = 0;   // cannot be overridden.
1990   }
1991   if (VG_(clo_vex_control).guest_chase_thresh != 0) {
1992      VG_(message)(Vg_UserMsg,
1993                   "callgrind only works with --vex-guest-chase-thresh=0\n"
1994                   "=> resetting it back to 0\n");
      VG_(clo_vex_control).guest_chase_thresh = 0; // cannot be overridden.
1996   }
1997
1998   CLG_DEBUG(1, "  dump threads: %s\n", CLG_(clo).separate_threads ? "Yes":"No");
1999   CLG_DEBUG(1, "  call sep. : %d\n", CLG_(clo).separate_callers);
2000   CLG_DEBUG(1, "  rec. sep. : %d\n", CLG_(clo).separate_recursions);
2001
2002   if (!CLG_(clo).dump_line && !CLG_(clo).dump_instr && !CLG_(clo).dump_bb) {
2003       VG_(message)(Vg_UserMsg, "Using source line as position.\n");
2004       CLG_(clo).dump_line = True;
2005   }
2006
2007   CLG_(init_dumps)();
2008
2009   (*CLG_(cachesim).post_clo_init)();
2010
2011   CLG_(init_eventsets)();
2012   CLG_(init_statistics)(& CLG_(stat));
2013   CLG_(init_cost_lz)( CLG_(sets).full, &CLG_(total_cost) );
2014
2015   /* initialize hash tables */
2016   CLG_(init_obj_table)();
2017   CLG_(init_cxt_table)();
2018   CLG_(init_bb_hash)();
2019
2020   CLG_(init_threads)();
2021   CLG_(run_thread)(1);
2022
2023   CLG_(instrument_state) = CLG_(clo).instrument_atstart;
2024
   if (VG_(clo_verbosity) > 0) {
2026      VG_(message)(Vg_UserMsg,
2027                   "For interactive control, run 'callgrind_control%s%s -h'.\n",
2028                   (VG_(arg_vgdb_prefix) ? " " : ""),
2029                   (VG_(arg_vgdb_prefix) ? VG_(arg_vgdb_prefix) : ""));
2030   }
2031}
2032
2033static
2034void CLG_(pre_clo_init)(void)
2035{
2036    VG_(details_name)            ("Callgrind");
2037    VG_(details_version)         (NULL);
2038    VG_(details_description)     ("a call-graph generating cache profiler");
2039    VG_(details_copyright_author)("Copyright (C) 2002-2013, and GNU GPL'd, "
2040				  "by Josef Weidendorfer et al.");
2041    VG_(details_bug_reports_to)  (VG_BUGS_TO);
2042    VG_(details_avg_translation_sizeB) ( 500 );
2043
2044    VG_(clo_vex_control).iropt_register_updates
2045       = VexRegUpdSpAtMemAccess; // overridable by the user.
    VG_(clo_vex_control).iropt_unroll_thresh = 0;   // cannot be overridden.
    VG_(clo_vex_control).guest_chase_thresh = 0;    // cannot be overridden.
2048
2049    VG_(basic_tool_funcs)        (CLG_(post_clo_init),
2050                                  CLG_(instrument),
2051                                  CLG_(fini));
2052
2053    VG_(needs_superblock_discards)(clg_discard_superblock_info);
2054
2056    VG_(needs_command_line_options)(CLG_(process_cmd_line_option),
2057				    CLG_(print_usage),
2058				    CLG_(print_debug_usage));
2059
2060    VG_(needs_client_requests)(CLG_(handle_client_request));
2061    VG_(needs_print_stats)    (clg_print_stats);
2062    VG_(needs_syscall_wrapper)(CLG_(pre_syscalltime),
2063			       CLG_(post_syscalltime));
2064
2065    VG_(track_start_client_code)  ( & clg_start_client_code_callback );
2066    VG_(track_pre_deliver_signal) ( & CLG_(pre_signal) );
2067    VG_(track_post_deliver_signal)( & CLG_(post_signal) );
2068
2069    CLG_(set_clo_defaults)();
2070}
2071
2072VG_DETERMINE_INTERFACE_VERSION(CLG_(pre_clo_init))
2073
2074/*--------------------------------------------------------------------*/
2075/*--- end                                                   main.c ---*/
2076/*--------------------------------------------------------------------*/
2077