1
2/*--------------------------------------------------------------------*/
3/*--- Callgrind                                                    ---*/
4/*---                                                       main.c ---*/
5/*--------------------------------------------------------------------*/
6
7/*
8   This file is part of Callgrind, a Valgrind tool for call graph
9   profiling programs.
10
11   Copyright (C) 2002-2012, Josef Weidendorfer (Josef.Weidendorfer@gmx.de)
12
13   This tool is derived from and contains code from Cachegrind
14   Copyright (C) 2002-2012 Nicholas Nethercote (njn@valgrind.org)
15
16   This program is free software; you can redistribute it and/or
17   modify it under the terms of the GNU General Public License as
18   published by the Free Software Foundation; either version 2 of the
19   License, or (at your option) any later version.
20
21   This program is distributed in the hope that it will be useful, but
22   WITHOUT ANY WARRANTY; without even the implied warranty of
23   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
24   General Public License for more details.
25
26   You should have received a copy of the GNU General Public License
27   along with this program; if not, write to the Free Software
28   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
29   02111-1307, USA.
30
31   The GNU General Public License is contained in the file COPYING.
32*/
33
34#include "config.h"
35#include "callgrind.h"
36#include "global.h"
37
38#include "pub_tool_threadstate.h"
39#include "pub_tool_gdbserver.h"
40
41#include "cg_branchpred.c"
42
43/*------------------------------------------------------------*/
44/*--- Global variables                                     ---*/
45/*------------------------------------------------------------*/
46
/* for all threads */
/* Tool options.  In this file the fields simulate_cache, simulate_branch
   and collect_bus decide whether the corresponding events are queued. */
CommandLineOptions CLG_(clo);
/* Global counters, e.g. distinct_instrs is bumped in next_InstrInfo(). */
Statistics CLG_(stat);
/* When False, CLG_(instrument) hands back the input superblock unchanged. */
Bool CLG_(instrument_state) = True; /* Instrumentation on ? */

/* thread and signal handler specific */
/* The helpers in this file read its 'collect' and 'nonskipped' fields
   and increment entries of its 'cost' array. */
exec_state CLG_(current_state);

/* min of L1 and LL cache line sizes.  This only gets set to a
   non-zero value if we are doing cache simulation.
   addEvent_Dr()/addEvent_Dw() assert that a queued data access is not
   larger than this value. */
Int CLG_(min_line_size) = 0;
58
59
60/*------------------------------------------------------------*/
61/*--- Statistics                                           ---*/
62/*------------------------------------------------------------*/
63
64static void CLG_(init_statistics)(Statistics* s)
65{
66  s->call_counter        = 0;
67  s->jcnd_counter        = 0;
68  s->jump_counter        = 0;
69  s->rec_call_counter    = 0;
70  s->ret_counter         = 0;
71  s->bb_executions       = 0;
72
73  s->context_counter     = 0;
74  s->bb_retranslations   = 0;
75
76  s->distinct_objs       = 0;
77  s->distinct_files      = 0;
78  s->distinct_fns        = 0;
79  s->distinct_contexts   = 0;
80  s->distinct_bbs        = 0;
81  s->distinct_bbccs      = 0;
82  s->distinct_instrs     = 0;
83  s->distinct_skips      = 0;
84
85  s->bb_hash_resizes     = 0;
86  s->bbcc_hash_resizes   = 0;
87  s->jcc_hash_resizes    = 0;
88  s->cxt_hash_resizes    = 0;
89  s->fn_array_resizes    = 0;
90  s->call_stack_resizes  = 0;
91  s->fn_stack_resizes    = 0;
92
93  s->full_debug_BBs      = 0;
94  s->file_line_debug_BBs = 0;
95  s->fn_name_debug_BBs   = 0;
96  s->no_debug_BBs        = 0;
97  s->bbcc_lru_misses     = 0;
98  s->jcc_lru_misses      = 0;
99  s->cxt_lru_misses      = 0;
100  s->bbcc_clones         = 0;
101}
102
103
104/*------------------------------------------------------------*/
/*--- Simple callbacks (not cache simulator)               ---*/
106/*------------------------------------------------------------*/
107
108VG_REGPARM(1)
109static void log_global_event(InstrInfo* ii)
110{
111    ULong* cost_Bus;
112
113    CLG_DEBUG(6, "log_global_event:  Ir  %#lx/%u\n",
114              CLG_(bb_base) + ii->instr_offset, ii->instr_size);
115
116    if (!CLG_(current_state).collect) return;
117
118    CLG_ASSERT( (ii->eventset->mask & (1u<<EG_BUS))>0 );
119
120    CLG_(current_state).cost[ fullOffset(EG_BUS) ]++;
121
122    if (CLG_(current_state).nonskipped)
123        cost_Bus = CLG_(current_state).nonskipped->skipped + fullOffset(EG_BUS);
124    else
125        cost_Bus = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_BUS];
126    cost_Bus[0]++;
127}
128
129
130/* For branches, we consult two different predictors, one which
131   predicts taken/untaken for conditional branches, and the other
132   which predicts the branch target address for indirect branches
133   (jump-to-register style ones). */
134
135static VG_REGPARM(2)
136void log_cond_branch(InstrInfo* ii, Word taken)
137{
138    Bool miss;
139    Int fullOffset_Bc;
140    ULong* cost_Bc;
141
142    CLG_DEBUG(6, "log_cond_branch:  Ir %#lx, taken %lu\n",
143              CLG_(bb_base) + ii->instr_offset, taken);
144
145    miss = 1 & do_cond_branch_predict(CLG_(bb_base) + ii->instr_offset, taken);
146
147    if (!CLG_(current_state).collect) return;
148
149    CLG_ASSERT( (ii->eventset->mask & (1u<<EG_BC))>0 );
150
151    if (CLG_(current_state).nonskipped)
152        cost_Bc = CLG_(current_state).nonskipped->skipped + fullOffset(EG_BC);
153    else
154        cost_Bc = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_BC];
155
156    fullOffset_Bc = fullOffset(EG_BC);
157    CLG_(current_state).cost[ fullOffset_Bc ]++;
158    cost_Bc[0]++;
159    if (miss) {
160        CLG_(current_state).cost[ fullOffset_Bc+1 ]++;
161        cost_Bc[1]++;
162    }
163}
164
165static VG_REGPARM(2)
166void log_ind_branch(InstrInfo* ii, UWord actual_dst)
167{
168    Bool miss;
169    Int fullOffset_Bi;
170    ULong* cost_Bi;
171
172    CLG_DEBUG(6, "log_ind_branch:  Ir  %#lx, dst %#lx\n",
173              CLG_(bb_base) + ii->instr_offset, actual_dst);
174
175    miss = 1 & do_ind_branch_predict(CLG_(bb_base) + ii->instr_offset, actual_dst);
176
177    if (!CLG_(current_state).collect) return;
178
179    CLG_ASSERT( (ii->eventset->mask & (1u<<EG_BI))>0 );
180
181    if (CLG_(current_state).nonskipped)
182        cost_Bi = CLG_(current_state).nonskipped->skipped + fullOffset(EG_BI);
183    else
184        cost_Bi = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_BI];
185
186    fullOffset_Bi = fullOffset(EG_BI);
187    CLG_(current_state).cost[ fullOffset_Bi ]++;
188    cost_Bi[0]++;
189    if (miss) {
190        CLG_(current_state).cost[ fullOffset_Bi+1 ]++;
191        cost_Bi[1]++;
192    }
193}
194
195/*------------------------------------------------------------*/
196/*--- Instrumentation structures and event queue handling  ---*/
197/*------------------------------------------------------------*/
198
199/* Maintain an ordered list of memory events which are outstanding, in
200   the sense that no IR has yet been generated to do the relevant
201   helper calls.  The BB is scanned top to bottom and memory events
202   are added to the end of the list, merging with the most recent
203   notified event where possible (Dw immediately following Dr and
204   having the same size and EA can be merged).
205
206   This merging is done so that for architectures which have
207   load-op-store instructions (x86, amd64), the insn is treated as if
208   it makes just one memory reference (a modify), rather than two (a
209   read followed by a write at the same address).
210
211   At various points the list will need to be flushed, that is, IR
212   generated from it.  That must happen before any possible exit from
213   the block (the end, or an IRStmt_Exit).  Flushing also takes place
214   when there is no space to add a new event.
215
216   If we require the simulation statistics to be up to date with
217   respect to possible memory exceptions, then the list would have to
218   be flushed before each memory reference.  That would however lose
219   performance by inhibiting event-merging during flushing.
220
221   Flushing the list consists of walking it start to end and emitting
222   instrumentation IR for each event, in the order in which they
223   appear.  It may be possible to emit a single call for two adjacent
224   events in order to reduce the number of helper function calls made.
225   For example, it could well be profitable to handle two adjacent Ir
226   events with a single helper call.  */
227
228typedef
229   IRExpr
230   IRAtom;
231
/* Kinds of events which can be held in the event queue. */
typedef enum {
   Ev_Ir,   /* instruction read */
   Ev_Dr,   /* data read */
   Ev_Dw,   /* data write */
   Ev_Dm,   /* data modify (read then write) */
   Ev_Bc,   /* conditional branch */
   Ev_Bi,   /* indirect branch (to unknown destination) */
   Ev_G     /* global bus event */
} EventTag;
243
244typedef
245   struct {
246      EventTag   tag;
247      InstrInfo* inode;
248      union {
249	 struct {
250	 } Ir;
251	 struct {
252	    IRAtom* ea;
253	    Int     szB;
254	 } Dr;
255	 struct {
256	    IRAtom* ea;
257	    Int     szB;
258	 } Dw;
259	 struct {
260	    IRAtom* ea;
261	    Int     szB;
262	 } Dm;
263         struct {
264            IRAtom* taken; /* :: Ity_I1 */
265         } Bc;
266         struct {
267            IRAtom* dst;
268         } Bi;
269	 struct {
270	 } G;
271      } Ev;
272   }
273   Event;
274
275static void init_Event ( Event* ev ) {
276   VG_(memset)(ev, 0, sizeof(Event));
277}
278
279static IRAtom* get_Event_dea ( Event* ev ) {
280   switch (ev->tag) {
281      case Ev_Dr: return ev->Ev.Dr.ea;
282      case Ev_Dw: return ev->Ev.Dw.ea;
283      case Ev_Dm: return ev->Ev.Dm.ea;
284      default:    tl_assert(0);
285   }
286}
287
288static Int get_Event_dszB ( Event* ev ) {
289   switch (ev->tag) {
290      case Ev_Dr: return ev->Ev.Dr.szB;
291      case Ev_Dw: return ev->Ev.Dw.szB;
292      case Ev_Dm: return ev->Ev.Dm.szB;
293      default:    tl_assert(0);
294   }
295}
296
297
298/* Up to this many unnotified events are allowed.  Number is
299   arbitrary.  Larger numbers allow more event merging to occur, but
300   potentially induce more spilling due to extending live ranges of
301   address temporaries. */
302#define N_EVENTS 16
303
304
305/* A struct which holds all the running state during instrumentation.
306   Mostly to avoid passing loads of parameters everywhere. */
307typedef struct {
308    /* The current outstanding-memory-event list. */
309    Event events[N_EVENTS];
310    Int   events_used;
311
312    /* The array of InstrInfo's is part of BB struct. */
313    BB* bb;
314
315    /* BB seen before (ie. re-instrumentation) */
316    Bool seen_before;
317
318    /* Number InstrInfo bins 'used' so far. */
319    UInt ii_index;
320
321    // current offset of guest instructions from BB start
322    UInt instr_offset;
323
324    /* The output SB being constructed. */
325    IRSB* sbOut;
326} ClgState;
327
328
329static void showEvent ( Event* ev )
330{
331   switch (ev->tag) {
332      case Ev_Ir:
333	 VG_(printf)("Ir (InstrInfo %p) at +%d\n",
334		     ev->inode, ev->inode->instr_offset);
335	 break;
336      case Ev_Dr:
337	 VG_(printf)("Dr (InstrInfo %p) at +%d %d EA=",
338		     ev->inode, ev->inode->instr_offset, ev->Ev.Dr.szB);
339	 ppIRExpr(ev->Ev.Dr.ea);
340	 VG_(printf)("\n");
341	 break;
342      case Ev_Dw:
343	 VG_(printf)("Dw (InstrInfo %p) at +%d %d EA=",
344		     ev->inode, ev->inode->instr_offset, ev->Ev.Dw.szB);
345	 ppIRExpr(ev->Ev.Dw.ea);
346	 VG_(printf)("\n");
347	 break;
348      case Ev_Dm:
349	 VG_(printf)("Dm (InstrInfo %p) at +%d %d EA=",
350		     ev->inode, ev->inode->instr_offset, ev->Ev.Dm.szB);
351	 ppIRExpr(ev->Ev.Dm.ea);
352	 VG_(printf)("\n");
353	 break;
354      case Ev_Bc:
355         VG_(printf)("Bc %p   GA=", ev->inode);
356         ppIRExpr(ev->Ev.Bc.taken);
357         VG_(printf)("\n");
358         break;
359      case Ev_Bi:
360         VG_(printf)("Bi %p  DST=", ev->inode);
361         ppIRExpr(ev->Ev.Bi.dst);
362         VG_(printf)("\n");
363         break;
364      case Ev_G:
365         VG_(printf)("G  %p\n", ev->inode);
366         break;
367      default:
368	 tl_assert(0);
369	 break;
370   }
371}
372
373/* Generate code for all outstanding memory events, and mark the queue
374   empty.  Code is generated into cgs->sbOut, and this activity
375   'consumes' slots in cgs->bb. */
376
377static void flushEvents ( ClgState* clgs )
378{
379   Int        i, regparms, inew;
380   Char*      helperName;
381   void*      helperAddr;
382   IRExpr**   argv;
383   IRExpr*    i_node_expr;
384   IRDirty*   di;
385   Event*     ev;
386   Event*     ev2;
387   Event*     ev3;
388
389   if (!clgs->seen_before) {
390       // extend event sets as needed
391       // available sets: D0 Dr
392       for(i=0; i<clgs->events_used; i++) {
393	   ev  = &clgs->events[i];
394	   switch(ev->tag) {
395	   case Ev_Ir:
396	       // Ir event always is first for a guest instruction
397	       CLG_ASSERT(ev->inode->eventset == 0);
398	       ev->inode->eventset = CLG_(sets).base;
399	       break;
400	   case Ev_Dr:
401               // extend event set by Dr counters
402	       ev->inode->eventset = CLG_(add_event_group)(ev->inode->eventset,
403							   EG_DR);
404	       break;
405	   case Ev_Dw:
406	   case Ev_Dm:
407               // extend event set by Dw counters
408	       ev->inode->eventset = CLG_(add_event_group)(ev->inode->eventset,
409							   EG_DW);
410	       break;
411           case Ev_Bc:
412               // extend event set by Bc counters
413               ev->inode->eventset = CLG_(add_event_group)(ev->inode->eventset,
414                                                           EG_BC);
415               break;
416           case Ev_Bi:
417               // extend event set by Bi counters
418               ev->inode->eventset = CLG_(add_event_group)(ev->inode->eventset,
419                                                           EG_BI);
420               break;
421	   case Ev_G:
422               // extend event set by Bus counter
423	       ev->inode->eventset = CLG_(add_event_group)(ev->inode->eventset,
424							   EG_BUS);
425	       break;
426	   default:
427	       tl_assert(0);
428	   }
429       }
430   }
431
432   for(i = 0; i < clgs->events_used; i = inew) {
433
434      helperName = NULL;
435      helperAddr = NULL;
436      argv       = NULL;
437      regparms   = 0;
438
439      /* generate IR to notify event i and possibly the ones
440	 immediately following it. */
441      tl_assert(i >= 0 && i < clgs->events_used);
442
443      ev  = &clgs->events[i];
444      ev2 = ( i < clgs->events_used-1 ? &clgs->events[i+1] : NULL );
445      ev3 = ( i < clgs->events_used-2 ? &clgs->events[i+2] : NULL );
446
447      CLG_DEBUGIF(5) {
448	 VG_(printf)("   flush ");
449	 showEvent( ev );
450      }
451
452      i_node_expr = mkIRExpr_HWord( (HWord)ev->inode );
453
454      /* Decide on helper fn to call and args to pass it, and advance
455	 i appropriately.
456	 Dm events have same effect as Dw events */
457      switch (ev->tag) {
458	 case Ev_Ir:
459	    /* Merge an Ir with a following Dr. */
460	    if (ev2 && ev2->tag == Ev_Dr) {
461	       /* Why is this true?  It's because we're merging an Ir
462		  with a following Dr.  The Ir derives from the
463		  instruction's IMark and the Dr from data
464		  references which follow it.  In short it holds
465		  because each insn starts with an IMark, hence an
466		  Ev_Ir, and so these Dr must pertain to the
467		  immediately preceding Ir.  Same applies to analogous
468		  assertions in the subsequent cases. */
469	       tl_assert(ev2->inode == ev->inode);
470	       helperName = CLG_(cachesim).log_1I1Dr_name;
471	       helperAddr = CLG_(cachesim).log_1I1Dr;
472	       argv = mkIRExprVec_3( i_node_expr,
473				     get_Event_dea(ev2),
474				     mkIRExpr_HWord( get_Event_dszB(ev2) ) );
475	       regparms = 3;
476	       inew = i+2;
477	    }
478	    /* Merge an Ir with a following Dw/Dm. */
479	    else
480	    if (ev2 && (ev2->tag == Ev_Dw || ev2->tag == Ev_Dm)) {
481	       tl_assert(ev2->inode == ev->inode);
482	       helperName = CLG_(cachesim).log_1I1Dw_name;
483	       helperAddr = CLG_(cachesim).log_1I1Dw;
484	       argv = mkIRExprVec_3( i_node_expr,
485				     get_Event_dea(ev2),
486				     mkIRExpr_HWord( get_Event_dszB(ev2) ) );
487	       regparms = 3;
488	       inew = i+2;
489	    }
490	    /* Merge an Ir with two following Irs. */
491	    else
492	    if (ev2 && ev3 && ev2->tag == Ev_Ir && ev3->tag == Ev_Ir) {
493	       helperName = CLG_(cachesim).log_3I0D_name;
494	       helperAddr = CLG_(cachesim).log_3I0D;
495	       argv = mkIRExprVec_3( i_node_expr,
496				     mkIRExpr_HWord( (HWord)ev2->inode ),
497				     mkIRExpr_HWord( (HWord)ev3->inode ) );
498	       regparms = 3;
499	       inew = i+3;
500	    }
501	    /* Merge an Ir with one following Ir. */
502	    else
503	    if (ev2 && ev2->tag == Ev_Ir) {
504	       helperName = CLG_(cachesim).log_2I0D_name;
505	       helperAddr = CLG_(cachesim).log_2I0D;
506	       argv = mkIRExprVec_2( i_node_expr,
507				     mkIRExpr_HWord( (HWord)ev2->inode ) );
508	       regparms = 2;
509	       inew = i+2;
510	    }
511	    /* No merging possible; emit as-is. */
512	    else {
513	       helperName = CLG_(cachesim).log_1I0D_name;
514	       helperAddr = CLG_(cachesim).log_1I0D;
515	       argv = mkIRExprVec_1( i_node_expr );
516	       regparms = 1;
517	       inew = i+1;
518	    }
519	    break;
520	 case Ev_Dr:
521	    /* Data read or modify */
522	    helperName = CLG_(cachesim).log_0I1Dr_name;
523	    helperAddr = CLG_(cachesim).log_0I1Dr;
524	    argv = mkIRExprVec_3( i_node_expr,
525				  get_Event_dea(ev),
526				  mkIRExpr_HWord( get_Event_dszB(ev) ) );
527	    regparms = 3;
528	    inew = i+1;
529	    break;
530	 case Ev_Dw:
531	 case Ev_Dm:
532	    /* Data write */
533	    helperName = CLG_(cachesim).log_0I1Dw_name;
534	    helperAddr = CLG_(cachesim).log_0I1Dw;
535	    argv = mkIRExprVec_3( i_node_expr,
536				  get_Event_dea(ev),
537				  mkIRExpr_HWord( get_Event_dszB(ev) ) );
538	    regparms = 3;
539	    inew = i+1;
540	    break;
541         case Ev_Bc:
542            /* Conditional branch */
543            helperName = "log_cond_branch";
544            helperAddr = &log_cond_branch;
545            argv = mkIRExprVec_2( i_node_expr, ev->Ev.Bc.taken );
546            regparms = 2;
547            inew = i+1;
548            break;
549         case Ev_Bi:
550            /* Branch to an unknown destination */
551            helperName = "log_ind_branch";
552            helperAddr = &log_ind_branch;
553            argv = mkIRExprVec_2( i_node_expr, ev->Ev.Bi.dst );
554            regparms = 2;
555            inew = i+1;
556            break;
557         case Ev_G:
558            /* Global bus event (CAS, LOCK-prefix, LL-SC, etc) */
559            helperName = "log_global_event";
560            helperAddr = &log_global_event;
561            argv = mkIRExprVec_1( i_node_expr );
562            regparms = 1;
563            inew = i+1;
564            break;
565	 default:
566	    tl_assert(0);
567      }
568
569      CLG_DEBUGIF(5) {
570	  if (inew > i+1) {
571	      VG_(printf)("   merge ");
572	      showEvent( ev2 );
573	  }
574	  if (inew > i+2) {
575	      VG_(printf)("   merge ");
576	      showEvent( ev3 );
577	  }
578	  if (helperAddr)
579	      VG_(printf)("   call  %s (%p)\n",
580			  helperName, helperAddr);
581      }
582
583      /* helper could be unset depending on the simulator used */
584      if (helperAddr == 0) continue;
585
586      /* Add the helper. */
587      tl_assert(helperName);
588      tl_assert(helperAddr);
589      tl_assert(argv);
590      di = unsafeIRDirty_0_N( regparms,
591			      helperName, VG_(fnptr_to_fnentry)( helperAddr ),
592			      argv );
593      addStmtToIRSB( clgs->sbOut, IRStmt_Dirty(di) );
594   }
595
596   clgs->events_used = 0;
597}
598
599static void addEvent_Ir ( ClgState* clgs, InstrInfo* inode )
600{
601   Event* evt;
602   tl_assert(clgs->seen_before || (inode->eventset == 0));
603   if (!CLG_(clo).simulate_cache) return;
604
605   if (clgs->events_used == N_EVENTS)
606      flushEvents(clgs);
607   tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS);
608   evt = &clgs->events[clgs->events_used];
609   init_Event(evt);
610   evt->tag      = Ev_Ir;
611   evt->inode    = inode;
612   clgs->events_used++;
613}
614
615static
616void addEvent_Dr ( ClgState* clgs, InstrInfo* inode, Int datasize, IRAtom* ea )
617{
618   Event* evt;
619   tl_assert(isIRAtom(ea));
620   tl_assert(datasize >= 1);
621   if (!CLG_(clo).simulate_cache) return;
622   tl_assert(datasize <= CLG_(min_line_size));
623
624   if (clgs->events_used == N_EVENTS)
625      flushEvents(clgs);
626   tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS);
627   evt = &clgs->events[clgs->events_used];
628   init_Event(evt);
629   evt->tag       = Ev_Dr;
630   evt->inode     = inode;
631   evt->Ev.Dr.szB = datasize;
632   evt->Ev.Dr.ea  = ea;
633   clgs->events_used++;
634}
635
636static
637void addEvent_Dw ( ClgState* clgs, InstrInfo* inode, Int datasize, IRAtom* ea )
638{
639   Event* lastEvt;
640   Event* evt;
641   tl_assert(isIRAtom(ea));
642   tl_assert(datasize >= 1);
643   if (!CLG_(clo).simulate_cache) return;
644   tl_assert(datasize <= CLG_(min_line_size));
645
646   /* Is it possible to merge this write with the preceding read? */
647   lastEvt = &clgs->events[clgs->events_used-1];
648   if (clgs->events_used > 0
649       && lastEvt->tag       == Ev_Dr
650       && lastEvt->Ev.Dr.szB == datasize
651       && lastEvt->inode     == inode
652       && eqIRAtom(lastEvt->Ev.Dr.ea, ea))
653   {
654      lastEvt->tag   = Ev_Dm;
655      return;
656   }
657
658   /* No.  Add as normal. */
659   if (clgs->events_used == N_EVENTS)
660      flushEvents(clgs);
661   tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS);
662   evt = &clgs->events[clgs->events_used];
663   init_Event(evt);
664   evt->tag       = Ev_Dw;
665   evt->inode     = inode;
666   evt->Ev.Dw.szB = datasize;
667   evt->Ev.Dw.ea  = ea;
668   clgs->events_used++;
669}
670
671static
672void addEvent_Bc ( ClgState* clgs, InstrInfo* inode, IRAtom* guard )
673{
674   Event* evt;
675   tl_assert(isIRAtom(guard));
676   tl_assert(typeOfIRExpr(clgs->sbOut->tyenv, guard)
677             == (sizeof(HWord)==4 ? Ity_I32 : Ity_I64));
678   if (!CLG_(clo).simulate_branch) return;
679
680   if (clgs->events_used == N_EVENTS)
681      flushEvents(clgs);
682   tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS);
683   evt = &clgs->events[clgs->events_used];
684   init_Event(evt);
685   evt->tag         = Ev_Bc;
686   evt->inode       = inode;
687   evt->Ev.Bc.taken = guard;
688   clgs->events_used++;
689}
690
691static
692void addEvent_Bi ( ClgState* clgs, InstrInfo* inode, IRAtom* whereTo )
693{
694   Event* evt;
695   tl_assert(isIRAtom(whereTo));
696   tl_assert(typeOfIRExpr(clgs->sbOut->tyenv, whereTo)
697             == (sizeof(HWord)==4 ? Ity_I32 : Ity_I64));
698   if (!CLG_(clo).simulate_branch) return;
699
700   if (clgs->events_used == N_EVENTS)
701      flushEvents(clgs);
702   tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS);
703   evt = &clgs->events[clgs->events_used];
704   init_Event(evt);
705   evt->tag       = Ev_Bi;
706   evt->inode     = inode;
707   evt->Ev.Bi.dst = whereTo;
708   clgs->events_used++;
709}
710
711static
712void addEvent_G ( ClgState* clgs, InstrInfo* inode )
713{
714   Event* evt;
715   if (!CLG_(clo).collect_bus) return;
716
717   if (clgs->events_used == N_EVENTS)
718      flushEvents(clgs);
719   tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS);
720   evt = &clgs->events[clgs->events_used];
721   init_Event(evt);
722   evt->tag       = Ev_G;
723   evt->inode     = inode;
724   clgs->events_used++;
725}
726
727/* Initialise or check (if already seen before) an InstrInfo for next insn.
728   We only can set instr_offset/instr_size here. The required event set and
729   resulting cost offset depend on events (Ir/Dr/Dw/Dm) in guest
730   instructions. The event set is extended as required on flush of the event
731   queue (when Dm events were determined), cost offsets are determined at
732   end of BB instrumentation. */
733static
734InstrInfo* next_InstrInfo ( ClgState* clgs, UInt instr_size )
735{
736   InstrInfo* ii;
737   tl_assert(clgs->ii_index >= 0);
738   tl_assert(clgs->ii_index < clgs->bb->instr_count);
739   ii = &clgs->bb->instr[ clgs->ii_index ];
740
741   if (clgs->seen_before) {
742       CLG_ASSERT(ii->instr_offset == clgs->instr_offset);
743       CLG_ASSERT(ii->instr_size == instr_size);
744   }
745   else {
746       ii->instr_offset = clgs->instr_offset;
747       ii->instr_size = instr_size;
748       ii->cost_offset = 0;
749       ii->eventset = 0;
750   }
751
752   clgs->ii_index++;
753   clgs->instr_offset += instr_size;
754   CLG_(stat).distinct_instrs++;
755
756   return ii;
757}
758
759// return total number of cost values needed for this BB
760static
761UInt update_cost_offsets( ClgState* clgs )
762{
763    Int i;
764    InstrInfo* ii;
765    UInt cost_offset = 0;
766
767    CLG_ASSERT(clgs->bb->instr_count == clgs->ii_index);
768    for(i=0; i<clgs->ii_index; i++) {
769	ii = &clgs->bb->instr[i];
770	if (clgs->seen_before) {
771	    CLG_ASSERT(ii->cost_offset == cost_offset);
772	} else
773	    ii->cost_offset = cost_offset;
774	cost_offset += ii->eventset ? ii->eventset->size : 0;
775    }
776
777    return cost_offset;
778}
779
780/*------------------------------------------------------------*/
781/*--- Instrumentation                                      ---*/
782/*------------------------------------------------------------*/
783
/* Endianness tag used for stores generated by this file (passed to
   IRStmt_Store in addConstMemStoreStmt).  Chosen from the
   VG_BIGENDIAN / VG_LITTLEENDIAN build macros; the build fails if
   neither is defined. */
#if defined(VG_BIGENDIAN)
# define CLGEndness Iend_BE
#elif defined(VG_LITTLEENDIAN)
# define CLGEndness Iend_LE
#else
# error "Unknown endianness"
#endif
791
792static
793Addr IRConst2Addr(IRConst* con)
794{
795    Addr addr;
796
797    if (sizeof(Addr) == 4) {
798	CLG_ASSERT( con->tag == Ico_U32 );
799	addr = con->Ico.U32;
800    }
801    else if (sizeof(Addr) == 8) {
802	CLG_ASSERT( con->tag == Ico_U64 );
803	addr = con->Ico.U64;
804    }
805    else
806	VG_(tool_panic)("Callgrind: invalid Addr type");
807
808    return addr;
809}
810
811/* First pass over a BB to instrument, counting instructions and jumps
812 * This is needed for the size of the BB struct to allocate
813 *
814 * Called from CLG_(get_bb)
815 */
816void CLG_(collectBlockInfo)(IRSB* sbIn,
817			    /*INOUT*/ UInt* instrs,
818			    /*INOUT*/ UInt* cjmps,
819			    /*INOUT*/ Bool* cjmp_inverted)
820{
821    Int i;
822    IRStmt* st;
823    Addr instrAddr =0, jumpDst;
824    UInt instrLen = 0;
825    Bool toNextInstr = False;
826
827    // Ist_Exit has to be ignored in preamble code, before first IMark:
828    // preamble code is added by VEX for self modifying code, and has
829    // nothing to do with client code
830    Bool inPreamble = True;
831
832    if (!sbIn) return;
833
834    for (i = 0; i < sbIn->stmts_used; i++) {
835	  st = sbIn->stmts[i];
836	  if (Ist_IMark == st->tag) {
837	      inPreamble = False;
838
839	      instrAddr = (Addr)ULong_to_Ptr(st->Ist.IMark.addr);
840	      instrLen  = st->Ist.IMark.len;
841
842	      (*instrs)++;
843	      toNextInstr = False;
844	  }
845	  if (inPreamble) continue;
846	  if (Ist_Exit == st->tag) {
847	      jumpDst = IRConst2Addr(st->Ist.Exit.dst);
848	      toNextInstr =  (jumpDst == instrAddr + instrLen);
849
850	      (*cjmps)++;
851	  }
852    }
853
854    /* if the last instructions of BB conditionally jumps to next instruction
855     * (= first instruction of next BB in memory), this is a inverted by VEX.
856     */
857    *cjmp_inverted = toNextInstr;
858}
859
860static
861void addConstMemStoreStmt( IRSB* bbOut, UWord addr, UInt val, IRType hWordTy)
862{
863    addStmtToIRSB( bbOut,
864		   IRStmt_Store(CLGEndness,
865				IRExpr_Const(hWordTy == Ity_I32 ?
866					     IRConst_U32( addr ) :
867					     IRConst_U64( addr )),
868				IRExpr_Const(IRConst_U32(val)) ));
869}
870
871
872/* add helper call to setup_bbcc, with pointer to BB struct as argument
873 *
874 * precondition for setup_bbcc:
875 * - jmps_passed has number of cond.jumps passed in last executed BB
876 * - current_bbcc has a pointer to the BBCC of the last executed BB
877 *   Thus, if bbcc_jmpkind is != -1 (JmpNone),
878 *     current_bbcc->bb->jmp_addr
879 *   gives the address of the jump source.
880 *
881 * the setup does 2 things:
882 * - trace call:
883 *   * Unwind own call stack, i.e sync our ESP with real ESP
884 *     This is for ESP manipulation (longjmps, C++ exec handling) and RET
885 *   * For CALLs or JMPs crossing objects, record call arg +
886 *     push are on own call stack
887 *
888 * - prepare for cache log functions:
889 *   set current_bbcc to BBCC that gets the costs for this BB execution
890 *   attached
891 */
892static
893void addBBSetupCall(ClgState* clgs)
894{
895   IRDirty* di;
896   IRExpr  *arg1, **argv;
897
898   arg1 = mkIRExpr_HWord( (HWord)clgs->bb );
899   argv = mkIRExprVec_1(arg1);
900   di = unsafeIRDirty_0_N( 1, "setup_bbcc",
901			      VG_(fnptr_to_fnentry)( & CLG_(setup_bbcc) ),
902			      argv);
903   addStmtToIRSB( clgs->sbOut, IRStmt_Dirty(di) );
904}
905
906
907static
908IRSB* CLG_(instrument)( VgCallbackClosure* closure,
909			IRSB* sbIn,
910			VexGuestLayout* layout,
911			VexGuestExtents* vge,
912			IRType gWordTy, IRType hWordTy )
913{
914   Int      i;
915   IRStmt*  st;
916   Addr     origAddr;
917   InstrInfo* curr_inode = NULL;
918   ClgState clgs;
919   UInt     cJumps = 0;
920
921
922   if (gWordTy != hWordTy) {
923      /* We don't currently support this case. */
924      VG_(tool_panic)("host/guest word size mismatch");
925   }
926
927   // No instrumentation if it is switched off
928   if (! CLG_(instrument_state)) {
929       CLG_DEBUG(5, "instrument(BB %#lx) [Instrumentation OFF]\n",
930		 (Addr)closure->readdr);
931       return sbIn;
932   }
933
934   CLG_DEBUG(3, "+ instrument(BB %#lx)\n", (Addr)closure->readdr);
935
936   /* Set up SB for instrumented IR */
937   clgs.sbOut = deepCopyIRSBExceptStmts(sbIn);
938
939   // Copy verbatim any IR preamble preceding the first IMark
940   i = 0;
941   while (i < sbIn->stmts_used && sbIn->stmts[i]->tag != Ist_IMark) {
942      addStmtToIRSB( clgs.sbOut, sbIn->stmts[i] );
943      i++;
944   }
945
946   // Get the first statement, and origAddr from it
947   CLG_ASSERT(sbIn->stmts_used >0);
948   CLG_ASSERT(i < sbIn->stmts_used);
949   st = sbIn->stmts[i];
950   CLG_ASSERT(Ist_IMark == st->tag);
951
952   origAddr = (Addr)st->Ist.IMark.addr + (Addr)st->Ist.IMark.delta;
953   CLG_ASSERT(origAddr == st->Ist.IMark.addr
954                          + st->Ist.IMark.delta);  // XXX: check no overflow
955
956   /* Get BB struct (creating if necessary).
957    * JS: The hash table is keyed with orig_addr_noredir -- important!
958    * JW: Why? If it is because of different chasing of the redirection,
959    *     this is not needed, as chasing is switched off in callgrind
960    */
961   clgs.bb = CLG_(get_bb)(origAddr, sbIn, &(clgs.seen_before));
962
963   addBBSetupCall(&clgs);
964
965   // Set up running state
966   clgs.events_used = 0;
967   clgs.ii_index = 0;
968   clgs.instr_offset = 0;
969
970   for (/*use current i*/; i < sbIn->stmts_used; i++) {
971
972      st = sbIn->stmts[i];
973      CLG_ASSERT(isFlatIRStmt(st));
974
975      switch (st->tag) {
976	 case Ist_NoOp:
977	 case Ist_AbiHint:
978	 case Ist_Put:
979	 case Ist_PutI:
980	 case Ist_MBE:
981	    break;
982
983	 case Ist_IMark: {
984            Addr64 cia   = st->Ist.IMark.addr + st->Ist.IMark.delta;
985            Int    isize = st->Ist.IMark.len;
986            CLG_ASSERT(clgs.instr_offset == (Addr)cia - origAddr);
987	    // If Vex fails to decode an instruction, the size will be zero.
988	    // Pretend otherwise.
989	    if (isize == 0) isize = VG_MIN_INSTR_SZB;
990
991	    // Sanity-check size.
992	    tl_assert( (VG_MIN_INSTR_SZB <= isize && isize <= VG_MAX_INSTR_SZB)
993		     || VG_CLREQ_SZB == isize );
994
995	    // Init the inode, record it as the current one.
996	    // Subsequent Dr/Dw/Dm events from the same instruction will
997	    // also use it.
998	    curr_inode = next_InstrInfo (&clgs, isize);
999
1000	    addEvent_Ir( &clgs, curr_inode );
1001	    break;
1002	 }
1003
1004	 case Ist_WrTmp: {
1005	    IRExpr* data = st->Ist.WrTmp.data;
1006	    if (data->tag == Iex_Load) {
1007	       IRExpr* aexpr = data->Iex.Load.addr;
1008	       // Note also, endianness info is ignored.  I guess
1009	       // that's not interesting.
1010	       addEvent_Dr( &clgs, curr_inode,
1011			    sizeofIRType(data->Iex.Load.ty), aexpr );
1012	    }
1013	    break;
1014	 }
1015
1016	 case Ist_Store: {
1017	    IRExpr* data  = st->Ist.Store.data;
1018	    IRExpr* aexpr = st->Ist.Store.addr;
1019	    addEvent_Dw( &clgs, curr_inode,
1020			 sizeofIRType(typeOfIRExpr(sbIn->tyenv, data)), aexpr );
1021	    break;
1022	 }
1023
1024	 case Ist_Dirty: {
1025	    Int      dataSize;
1026	    IRDirty* d = st->Ist.Dirty.details;
1027	    if (d->mFx != Ifx_None) {
1028	       /* This dirty helper accesses memory.  Collect the details. */
1029	       tl_assert(d->mAddr != NULL);
1030	       tl_assert(d->mSize != 0);
1031	       dataSize = d->mSize;
1032	       // Large (eg. 28B, 108B, 512B on x86) data-sized
1033	       // instructions will be done inaccurately, but they're
1034	       // very rare and this avoids errors from hitting more
1035	       // than two cache lines in the simulation.
1036	       if (CLG_(clo).simulate_cache && dataSize > CLG_(min_line_size))
1037		  dataSize = CLG_(min_line_size);
1038	       if (d->mFx == Ifx_Read || d->mFx == Ifx_Modify)
1039		  addEvent_Dr( &clgs, curr_inode, dataSize, d->mAddr );
1040	       if (d->mFx == Ifx_Write || d->mFx == Ifx_Modify)
1041		  addEvent_Dw( &clgs, curr_inode, dataSize, d->mAddr );
1042	    } else {
1043	       tl_assert(d->mAddr == NULL);
1044	       tl_assert(d->mSize == 0);
1045	    }
1046	    break;
1047	 }
1048
1049         case Ist_CAS: {
1050            /* We treat it as a read and a write of the location.  I
1051               think that is the same behaviour as it was before IRCAS
1052               was introduced, since prior to that point, the Vex
1053               front ends would translate a lock-prefixed instruction
1054               into a (normal) read followed by a (normal) write. */
1055            Int    dataSize;
1056            IRCAS* cas = st->Ist.CAS.details;
1057            CLG_ASSERT(cas->addr && isIRAtom(cas->addr));
1058            CLG_ASSERT(cas->dataLo);
1059            dataSize = sizeofIRType(typeOfIRExpr(sbIn->tyenv, cas->dataLo));
1060            if (cas->dataHi != NULL)
1061               dataSize *= 2; /* since this is a doubleword-cas */
1062            addEvent_Dr( &clgs, curr_inode, dataSize, cas->addr );
1063            addEvent_Dw( &clgs, curr_inode, dataSize, cas->addr );
1064            addEvent_G(  &clgs, curr_inode );
1065            break;
1066         }
1067
1068         case Ist_LLSC: {
1069            IRType dataTy;
1070            if (st->Ist.LLSC.storedata == NULL) {
1071               /* LL */
1072               dataTy = typeOfIRTemp(sbIn->tyenv, st->Ist.LLSC.result);
1073               addEvent_Dr( &clgs, curr_inode,
1074                            sizeofIRType(dataTy), st->Ist.LLSC.addr );
1075            } else {
1076               /* SC */
1077               dataTy = typeOfIRExpr(sbIn->tyenv, st->Ist.LLSC.storedata);
1078               addEvent_Dw( &clgs, curr_inode,
1079                            sizeofIRType(dataTy), st->Ist.LLSC.addr );
1080               /* I don't know whether the global-bus-lock cost should
1081                  be attributed to the LL or the SC, but it doesn't
1082                  really matter since they always have to be used in
1083                  pairs anyway.  Hence put it (quite arbitrarily) on
1084                  the SC. */
1085               addEvent_G(  &clgs, curr_inode );
1086            }
1087            break;
1088         }
1089
1090 	 case Ist_Exit: {
1091            Bool guest_exit, inverted;
1092
1093            /* VEX code generation sometimes inverts conditional branches.
1094             * As Callgrind counts (conditional) jumps, it has to correct
1095             * inversions. The heuristic is the following:
1096             * (1) Callgrind switches off SB chasing and unrolling, and
1097             *     therefore it assumes that a candidate for inversion only is
1098             *     the last conditional branch in an SB.
1099             * (2) inversion is assumed if the branch jumps to the address of
1100             *     the next guest instruction in memory.
1101             * This heuristic is precalculated in CLG_(collectBlockInfo)().
1102             *
1103             * Branching behavior is also used for branch prediction. Note that
1104             * above heuristic is different from what Cachegrind does.
1105             * Cachegrind uses (2) for all branches.
1106             */
1107            if (cJumps+1 == clgs.bb->cjmp_count)
1108                inverted = clgs.bb->cjmp_inverted;
1109            else
1110                inverted = False;
1111
1112            // call branch predictor only if this is a branch in guest code
1113            guest_exit = (st->Ist.Exit.jk == Ijk_Boring) ||
1114                         (st->Ist.Exit.jk == Ijk_Call) ||
1115                         (st->Ist.Exit.jk == Ijk_Ret);
1116
1117            if (guest_exit) {
1118                /* Stuff to widen the guard expression to a host word, so
1119                   we can pass it to the branch predictor simulation
1120                   functions easily. */
1121                IRType   tyW    = hWordTy;
1122                IROp     widen  = tyW==Ity_I32  ? Iop_1Uto32  : Iop_1Uto64;
1123                IROp     opXOR  = tyW==Ity_I32  ? Iop_Xor32   : Iop_Xor64;
1124                IRTemp   guard1 = newIRTemp(clgs.sbOut->tyenv, Ity_I1);
1125                IRTemp   guardW = newIRTemp(clgs.sbOut->tyenv, tyW);
1126                IRTemp   guard  = newIRTemp(clgs.sbOut->tyenv, tyW);
1127                IRExpr*  one    = tyW==Ity_I32 ? IRExpr_Const(IRConst_U32(1))
1128                                               : IRExpr_Const(IRConst_U64(1));
1129
1130                /* Widen the guard expression. */
1131                addStmtToIRSB( clgs.sbOut,
1132                               IRStmt_WrTmp( guard1, st->Ist.Exit.guard ));
1133                addStmtToIRSB( clgs.sbOut,
1134                               IRStmt_WrTmp( guardW,
1135                                             IRExpr_Unop(widen,
1136                                                         IRExpr_RdTmp(guard1))) );
1137                /* If the exit is inverted, invert the sense of the guard. */
1138                addStmtToIRSB(
1139                        clgs.sbOut,
1140                        IRStmt_WrTmp(
1141                                guard,
1142                                inverted ? IRExpr_Binop(opXOR, IRExpr_RdTmp(guardW), one)
1143                                    : IRExpr_RdTmp(guardW)
1144                                    ));
1145                /* And post the event. */
1146                addEvent_Bc( &clgs, curr_inode, IRExpr_RdTmp(guard) );
1147            }
1148
1149	    /* We may never reach the next statement, so need to flush
1150	       all outstanding transactions now. */
1151	    flushEvents( &clgs );
1152
1153	    CLG_ASSERT(clgs.ii_index>0);
1154	    if (!clgs.seen_before) {
1155	      ClgJumpKind jk;
1156
1157	      if      (st->Ist.Exit.jk == Ijk_Call) jk = jk_Call;
1158	      else if (st->Ist.Exit.jk == Ijk_Ret)  jk = jk_Return;
1159	      else {
1160		if (IRConst2Addr(st->Ist.Exit.dst) ==
1161		    origAddr + curr_inode->instr_offset + curr_inode->instr_size)
1162		  jk = jk_None;
1163		else
1164		  jk = jk_Jump;
1165	      }
1166
1167	      clgs.bb->jmp[cJumps].instr = clgs.ii_index-1;
1168	      clgs.bb->jmp[cJumps].jmpkind = jk;
1169	    }
1170
1171	    /* Update global variable jmps_passed before the jump
1172	     * A correction is needed if VEX inverted the last jump condition
1173	    */
1174	    addConstMemStoreStmt( clgs.sbOut,
1175				  (UWord) &CLG_(current_state).jmps_passed,
1176                                  inverted ? cJumps+1 : cJumps, hWordTy);
1177	    cJumps++;
1178
1179	    break;
1180	 }
1181
1182	 default:
1183	    tl_assert(0);
1184	    break;
1185      }
1186
1187      /* Copy the original statement */
1188      addStmtToIRSB( clgs.sbOut, st );
1189
1190      CLG_DEBUGIF(5) {
1191	 VG_(printf)("   pass  ");
1192	 ppIRStmt(st);
1193	 VG_(printf)("\n");
1194      }
1195   }
1196
1197   /* Deal with branches to unknown destinations.  Except ignore ones
1198      which are function returns as we assume the return stack
1199      predictor never mispredicts. */
1200   if ((sbIn->jumpkind == Ijk_Boring) || (sbIn->jumpkind == Ijk_Call)) {
1201      if (0) { ppIRExpr( sbIn->next ); VG_(printf)("\n"); }
1202      switch (sbIn->next->tag) {
1203         case Iex_Const:
1204            break; /* boring - branch to known address */
1205         case Iex_RdTmp:
1206            /* looks like an indirect branch (branch to unknown) */
1207            addEvent_Bi( &clgs, curr_inode, sbIn->next );
1208            break;
1209         default:
1210            /* shouldn't happen - if the incoming IR is properly
1211               flattened, should only have tmp and const cases to
1212               consider. */
1213            tl_assert(0);
1214      }
1215   }
1216
1217   /* At the end of the bb.  Flush outstandings. */
1218   flushEvents( &clgs );
1219
1220   /* Always update global variable jmps_passed at end of bb.
1221    * A correction is needed if VEX inverted the last jump condition
1222    */
1223   {
1224      UInt jmps_passed = cJumps;
1225      if (clgs.bb->cjmp_inverted) jmps_passed--;
1226      addConstMemStoreStmt( clgs.sbOut,
1227			    (UWord) &CLG_(current_state).jmps_passed,
1228			    jmps_passed, hWordTy);
1229   }
1230   CLG_ASSERT(clgs.bb->cjmp_count == cJumps);
1231   CLG_ASSERT(clgs.bb->instr_count = clgs.ii_index);
1232
1233   /* Info for final exit from BB */
1234   {
1235     ClgJumpKind jk;
1236
1237     if      (sbIn->jumpkind == Ijk_Call) jk = jk_Call;
1238     else if (sbIn->jumpkind == Ijk_Ret)  jk = jk_Return;
1239     else {
1240       jk = jk_Jump;
1241       if ((sbIn->next->tag == Iex_Const) &&
1242	   (IRConst2Addr(sbIn->next->Iex.Const.con) ==
1243	    origAddr + clgs.instr_offset))
1244	 jk = jk_None;
1245     }
1246     clgs.bb->jmp[cJumps].jmpkind = jk;
1247     /* Instruction index of the call/ret at BB end
1248      * (it is wrong for fall-through, but does not matter) */
1249     clgs.bb->jmp[cJumps].instr = clgs.ii_index-1;
1250   }
1251
1252   /* swap information of last exit with final exit if inverted */
1253   if (clgs.bb->cjmp_inverted) {
1254     ClgJumpKind jk;
1255     UInt instr;
1256
1257     jk = clgs.bb->jmp[cJumps].jmpkind;
1258     clgs.bb->jmp[cJumps].jmpkind = clgs.bb->jmp[cJumps-1].jmpkind;
1259     clgs.bb->jmp[cJumps-1].jmpkind = jk;
1260     instr = clgs.bb->jmp[cJumps].instr;
1261     clgs.bb->jmp[cJumps].instr = clgs.bb->jmp[cJumps-1].instr;
1262     clgs.bb->jmp[cJumps-1].instr = instr;
1263   }
1264
1265   if (clgs.seen_before) {
1266       CLG_ASSERT(clgs.bb->cost_count == update_cost_offsets(&clgs));
1267       CLG_ASSERT(clgs.bb->instr_len = clgs.instr_offset);
1268   }
1269   else {
1270       clgs.bb->cost_count = update_cost_offsets(&clgs);
1271       clgs.bb->instr_len = clgs.instr_offset;
1272   }
1273
1274   CLG_DEBUG(3, "- instrument(BB %#lx): byteLen %u, CJumps %u, CostLen %u\n",
1275	     origAddr, clgs.bb->instr_len,
1276	     clgs.bb->cjmp_count, clgs.bb->cost_count);
1277   if (cJumps>0) {
1278       CLG_DEBUG(3, "                     [ ");
1279       for (i=0;i<cJumps;i++)
1280	   CLG_DEBUG(3, "%d ", clgs.bb->jmp[i].instr);
1281       CLG_DEBUG(3, "], last inverted: %s \n",
1282		 clgs.bb->cjmp_inverted ? "yes":"no");
1283   }
1284
1285  return clgs.sbOut;
1286}
1287
1288/*--------------------------------------------------------------------*/
1289/*--- Discarding BB info                                           ---*/
1290/*--------------------------------------------------------------------*/
1291
1292// Called when a translation is removed from the translation cache for
1293// any reason at all: to free up space, because the guest code was
1294// unmapped or modified, or for any arbitrary reason.
1295static
1296void clg_discard_superblock_info ( Addr64 orig_addr64, VexGuestExtents vge )
1297{
1298    Addr orig_addr = (Addr)orig_addr64;
1299
1300    tl_assert(vge.n_used > 0);
1301
1302   if (0)
1303      VG_(printf)( "discard_superblock_info: %p, %p, %llu\n",
1304                   (void*)(Addr)orig_addr,
1305                   (void*)(Addr)vge.base[0], (ULong)vge.len[0]);
1306
1307   // Get BB info, remove from table, free BB info.  Simple!  Note that we
1308   // use orig_addr, not the first instruction address in vge.
1309   CLG_(delete_bb)(orig_addr);
1310}
1311
1312
1313/*------------------------------------------------------------*/
1314/*--- CLG_(fini)() and related function                     ---*/
1315/*------------------------------------------------------------*/
1316
1317
1318
1319static void zero_thread_cost(thread_info* t)
1320{
1321  Int i;
1322
1323  for(i = 0; i < CLG_(current_call_stack).sp; i++) {
1324    if (!CLG_(current_call_stack).entry[i].jcc) continue;
1325
1326    /* reset call counters to current for active calls */
1327    CLG_(copy_cost)( CLG_(sets).full,
1328		    CLG_(current_call_stack).entry[i].enter_cost,
1329		    CLG_(current_state).cost );
1330    CLG_(current_call_stack).entry[i].jcc->call_counter = 0;
1331  }
1332
1333  CLG_(forall_bbccs)(CLG_(zero_bbcc));
1334
1335  /* set counter for last dump */
1336  CLG_(copy_cost)( CLG_(sets).full,
1337		  t->lastdump_cost, CLG_(current_state).cost );
1338}
1339
1340void CLG_(zero_all_cost)(Bool only_current_thread)
1341{
1342  if (VG_(clo_verbosity) > 1)
1343    VG_(message)(Vg_DebugMsg, "  Zeroing costs...\n");
1344
1345  if (only_current_thread)
1346    zero_thread_cost(CLG_(get_current_thread)());
1347  else
1348    CLG_(forall_threads)(zero_thread_cost);
1349
1350  if (VG_(clo_verbosity) > 1)
1351    VG_(message)(Vg_DebugMsg, "  ...done\n");
1352}
1353
1354static
1355void unwind_thread(thread_info* t)
1356{
1357  /* unwind signal handlers */
1358  while(CLG_(current_state).sig !=0)
1359    CLG_(post_signal)(CLG_(current_tid),CLG_(current_state).sig);
1360
1361  /* unwind regular call stack */
1362  while(CLG_(current_call_stack).sp>0)
1363    CLG_(pop_call_stack)();
1364
1365  /* reset context and function stack for context generation */
1366  CLG_(init_exec_state)( &CLG_(current_state) );
1367  CLG_(current_fn_stack).top = CLG_(current_fn_stack).bottom;
1368}
1369
/* Per-thread callback (used with CLG_(forall_threads)): zero the cost
 * of the current execution state over the full event set.
 * The parameter t is unused; the function acts on CLG_(current_state).
 */
static
void zero_state_cost(thread_info* t)
{
    CLG_(zero_cost)( CLG_(sets).full, CLG_(current_state).cost );
}
1375
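/* Oops, this can go very wrong: the prototype below is written out by
 * hand here, so it has to be kept in sync with the real definition. */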
1377extern void VG_(discard_translations) ( Addr64 start, ULong range, HChar* who );
1378
1379void CLG_(set_instrument_state)(Char* reason, Bool state)
1380{
1381  if (CLG_(instrument_state) == state) {
1382    CLG_DEBUG(2, "%s: instrumentation already %s\n",
1383	     reason, state ? "ON" : "OFF");
1384    return;
1385  }
1386  CLG_(instrument_state) = state;
1387  CLG_DEBUG(2, "%s: Switching instrumentation %s ...\n",
1388	   reason, state ? "ON" : "OFF");
1389
1390  VG_(discard_translations)( (Addr64)0x1000, (ULong) ~0xfffl, "callgrind");
1391
1392  /* reset internal state: call stacks, simulator */
1393  CLG_(forall_threads)(unwind_thread);
1394  CLG_(forall_threads)(zero_state_cost);
1395  (*CLG_(cachesim).clear)();
1396
1397  if (VG_(clo_verbosity) > 1)
1398    VG_(message)(Vg_DebugMsg, "%s: instrumentation switched %s\n",
1399		 reason, state ? "ON" : "OFF");
1400}
1401
/* helper for dump_state_togdb
 *
 * Prints, via VG_(gdb_printf), the state of one thread: an
 * "events-<tid>" line with costs, a "frames-<tid>" line with the call
 * stack depth, and per-frame "function-", "calls-" and "events-" lines.
 *
 * The thread id printed is CLG_(current_tid) and the frames come from
 * CLG_(current_call_stack), not from fields of ti.
 * NOTE(review): presumably CLG_(forall_threads) makes ti the current
 * thread before calling this -- confirm.
 */
static void dump_state_of_thread_togdb(thread_info* ti)
{
    /* static buffers: reused across calls */
    static Char buf[512];
    static FullCost sum = 0, tmp = 0;
    Int t, p, i;
    BBCC *from, *to;
    call_entry* ce;

    t = CLG_(current_tid);
    CLG_(init_cost_lz)( CLG_(sets).full, &sum );
    /* Save lastdump_cost in tmp and restore it after add_diff_cost, so
     * that it has the same value after this block as before.
     * NOTE(review): presumably add_diff_cost modifies its "old" cost
     * argument -- confirm. */
    CLG_(copy_cost_lz)( CLG_(sets).full, &tmp, ti->lastdump_cost );
    CLG_(add_diff_cost)( CLG_(sets).full, sum, ti->lastdump_cost,
			 ti->states.entry[0]->cost);
    CLG_(copy_cost)( CLG_(sets).full, ti->lastdump_cost, tmp );
    CLG_(sprint_mappingcost)(buf, CLG_(dumpmap), sum);
    VG_(gdb_printf)("events-%d: %s\n", t, buf);
    VG_(gdb_printf)("frames-%d: %d\n", t, CLG_(current_call_stack).sp);

    ce = 0;
    for(i = 0; i < CLG_(current_call_stack).sp; i++) {
      ce = CLG_(get_call_entry)(i);
      /* if this frame is skipped, we don't have counters */
      if (!ce->jcc) continue;

      /* caller side of the call represented by this frame */
      from = ce->jcc->from;
      VG_(gdb_printf)("function-%d-%d: %s\n",t, i, from->cxt->fn[0]->name);
      VG_(gdb_printf)("calls-%d-%d: %llu\n",t, i, ce->jcc->call_counter);

      /* FIXME: EventSets! */
      /* sum = cost of the jcc, plus the difference computed against
       * enter_cost; enter_cost is saved in tmp and restored afterwards,
       * same pattern as for lastdump_cost above. */
      CLG_(copy_cost)( CLG_(sets).full, sum, ce->jcc->cost );
      CLG_(copy_cost)( CLG_(sets).full, tmp, ce->enter_cost );
      CLG_(add_diff_cost)( CLG_(sets).full, sum,
			  ce->enter_cost, CLG_(current_state).cost );
      CLG_(copy_cost)( CLG_(sets).full, ce->enter_cost, tmp );

      p = VG_(sprintf)(buf, "events-%d-%d: ",t, i);
      CLG_(sprint_mappingcost)(buf + p, CLG_(dumpmap), sum );
      VG_(gdb_printf)("%s\n", buf);
    }
    /* ce is the last frame visited (if any) and i is the index one past
     * it: print the callee of that frame as the final function entry. */
    if (ce && ce->jcc) {
      to = ce->jcc->to;
      VG_(gdb_printf)("function-%d-%d: %s\n",t, i, to->cxt->fn[0]->name );
    }
}
1447
1448/* Dump current state */
1449static void dump_state_togdb(void)
1450{
1451    static Char buf[512];
1452    thread_info** th;
1453    int t, p;
1454    Int orig_tid = CLG_(current_tid);
1455
1456    VG_(gdb_printf)("instrumentation: %s\n",
1457		    CLG_(instrument_state) ? "on":"off");
1458    if (!CLG_(instrument_state)) return;
1459
1460    VG_(gdb_printf)("executed-bbs: %llu\n", CLG_(stat).bb_executions);
1461    VG_(gdb_printf)("executed-calls: %llu\n", CLG_(stat).call_counter);
1462    VG_(gdb_printf)("distinct-bbs: %d\n", CLG_(stat).distinct_bbs);
1463    VG_(gdb_printf)("distinct-calls: %d\n", CLG_(stat).distinct_jccs);
1464    VG_(gdb_printf)("distinct-functions: %d\n", CLG_(stat).distinct_fns);
1465    VG_(gdb_printf)("distinct-contexts: %d\n", CLG_(stat).distinct_contexts);
1466
1467    /* "events:" line. Given here because it will be dynamic in the future */
1468    p = VG_(sprintf)(buf, "events: ");
1469    CLG_(sprint_eventmapping)(buf+p, CLG_(dumpmap));
1470    VG_(gdb_printf)("%s\n", buf);
1471    /* "part:" line (number of last part. Is 0 at start */
1472    VG_(gdb_printf)("part: %d\n", CLG_(get_dump_counter)());
1473
1474    /* threads */
1475    th = CLG_(get_threads)();
1476    p = VG_(sprintf)(buf, "threads:");
1477    for(t=1;t<VG_N_THREADS;t++) {
1478	if (!th[t]) continue;
1479	p += VG_(sprintf)(buf+p, " %d", t);
1480    }
1481    VG_(gdb_printf)("%s\n", buf);
1482    VG_(gdb_printf)("current-tid: %d\n", orig_tid);
1483    CLG_(forall_threads)(dump_state_of_thread_togdb);
1484}
1485
1486
1487static void print_monitor_help ( void )
1488{
1489   VG_(gdb_printf) ("\n");
1490   VG_(gdb_printf) ("callgrind monitor commands:\n");
1491   VG_(gdb_printf) ("  dump [<dump_hint>]\n");
1492   VG_(gdb_printf) ("        dump counters\n");
1493   VG_(gdb_printf) ("  zero\n");
1494   VG_(gdb_printf) ("        zero counters\n");
1495   VG_(gdb_printf) ("  status\n");
1496   VG_(gdb_printf) ("        print status\n");
1497   VG_(gdb_printf) ("  instrumentation [on|off]\n");
1498   VG_(gdb_printf) ("        get/set (if on/off given) instrumentation state\n");
1499   VG_(gdb_printf) ("\n");
1500}
1501
1502/* return True if request recognised, False otherwise */
1503static Bool handle_gdb_monitor_command (ThreadId tid, Char *req)
1504{
1505   Char* wcmd;
1506   Char s[VG_(strlen(req))]; /* copy for strtok_r */
1507   Char *ssaveptr;
1508
1509   VG_(strcpy) (s, req);
1510
1511   wcmd = VG_(strtok_r) (s, " ", &ssaveptr);
1512   switch (VG_(keyword_id) ("help dump zero status instrumentation",
1513                            wcmd, kwd_report_duplicated_matches)) {
1514   case -2: /* multiple matches */
1515      return True;
1516   case -1: /* not found */
1517      return False;
1518   case  0: /* help */
1519      print_monitor_help();
1520      return True;
1521   case  1: { /* dump */
1522      CLG_(dump_profile)(req, False);
1523      return True;
1524   }
1525   case  2: { /* zero */
1526      CLG_(zero_all_cost)(False);
1527      return True;
1528   }
1529
1530   case 3: { /* status */
1531     Char* arg = VG_(strtok_r) (0, " ", &ssaveptr);
1532     if (arg && (VG_(strcmp)(arg, "internal") == 0)) {
1533       /* internal interface to callgrind_control */
1534       dump_state_togdb();
1535       return True;
1536     }
1537
1538     if (!CLG_(instrument_state)) {
1539       VG_(gdb_printf)("No status available as instrumentation is switched off\n");
1540     } else {
1541       // Status information to be improved ...
1542       thread_info** th = CLG_(get_threads)();
1543       Int t, tcount = 0;
1544       for(t=1;t<VG_N_THREADS;t++)
1545	 if (th[t]) tcount++;
1546       VG_(gdb_printf)("%d thread(s) running.\n", tcount);
1547     }
1548     return True;
1549   }
1550
1551   case 4: { /* instrumentation */
1552     Char* arg = VG_(strtok_r) (0, " ", &ssaveptr);
1553     if (!arg) {
1554       VG_(gdb_printf)("instrumentation: %s\n",
1555		       CLG_(instrument_state) ? "on":"off");
1556     }
1557     else
1558       CLG_(set_instrument_state)("Command", VG_(strcmp)(arg,"off")!=0);
1559     return True;
1560   }
1561
1562   default:
1563      tl_assert(0);
1564      return False;
1565   }
1566}
1567
1568static
1569Bool CLG_(handle_client_request)(ThreadId tid, UWord *args, UWord *ret)
1570{
1571   if (!VG_IS_TOOL_USERREQ('C','T',args[0])
1572       && VG_USERREQ__GDB_MONITOR_COMMAND   != args[0])
1573      return False;
1574
1575   switch(args[0]) {
1576   case VG_USERREQ__DUMP_STATS:
1577      CLG_(dump_profile)("Client Request", True);
1578      *ret = 0;                 /* meaningless */
1579      break;
1580
1581   case VG_USERREQ__DUMP_STATS_AT:
1582     {
1583       Char buf[512];
1584       VG_(sprintf)(buf,"Client Request: %s", (Char*)args[1]);
1585       CLG_(dump_profile)(buf, True);
1586       *ret = 0;                 /* meaningless */
1587     }
1588     break;
1589
1590   case VG_USERREQ__ZERO_STATS:
1591     CLG_(zero_all_cost)(True);
1592      *ret = 0;                 /* meaningless */
1593      break;
1594
1595   case VG_USERREQ__TOGGLE_COLLECT:
1596     CLG_(current_state).collect = !CLG_(current_state).collect;
1597     CLG_DEBUG(2, "Client Request: toggled collection state to %s\n",
1598	      CLG_(current_state).collect ? "ON" : "OFF");
1599     *ret = 0;                 /* meaningless */
1600     break;
1601
1602   case VG_USERREQ__START_INSTRUMENTATION:
1603     CLG_(set_instrument_state)("Client Request", True);
1604     *ret = 0;                 /* meaningless */
1605     break;
1606
1607   case VG_USERREQ__STOP_INSTRUMENTATION:
1608     CLG_(set_instrument_state)("Client Request", False);
1609     *ret = 0;                 /* meaningless */
1610     break;
1611
1612   case VG_USERREQ__GDB_MONITOR_COMMAND: {
1613      Bool handled = handle_gdb_monitor_command (tid, (Char*)args[1]);
1614      if (handled)
1615         *ret = 1;
1616      else
1617         *ret = 0;
1618      return handled;
1619   }
1620   default:
1621      return False;
1622   }
1623
1624   return True;
1625}
1626
1627
/* Syscall Timing */

/* struct timeval syscalltime[VG_N_THREADS]; */
/* Per-thread timestamp taken at syscall entry, indexed by thread id.
 * It is written by CLG_(pre_syscalltime) and read back by
 * CLG_(post_syscalltime) to compute the time spent in the syscall.
 * With CLG_MICROSYSTIME the value is in microseconds (from
 * gettimeofday); otherwise it is the value of
 * VG_(read_millisecond_timer)().
 */
#if CLG_MICROSYSTIME
#include <sys/time.h>
#include <sys/syscall.h>
extern Int VG_(do_syscall) ( UInt, ... );

ULong syscalltime[VG_N_THREADS];
#else
UInt syscalltime[VG_N_THREADS];
#endif
1640
1641static
1642void CLG_(pre_syscalltime)(ThreadId tid, UInt syscallno,
1643                           UWord* args, UInt nArgs)
1644{
1645  if (CLG_(clo).collect_systime) {
1646#if CLG_MICROSYSTIME
1647    struct vki_timeval tv_now;
1648    VG_(do_syscall)(__NR_gettimeofday, (UInt)&tv_now, (UInt)NULL);
1649    syscalltime[tid] = tv_now.tv_sec * 1000000ULL + tv_now.tv_usec;
1650#else
1651    syscalltime[tid] = VG_(read_millisecond_timer)();
1652#endif
1653  }
1654}
1655
1656static
1657void CLG_(post_syscalltime)(ThreadId tid, UInt syscallno,
1658                            UWord* args, UInt nArgs, SysRes res)
1659{
1660  if (CLG_(clo).collect_systime &&
1661      CLG_(current_state).bbcc) {
1662      Int o;
1663#if CLG_MICROSYSTIME
1664    struct vki_timeval tv_now;
1665    ULong diff;
1666
1667    VG_(do_syscall)(__NR_gettimeofday, (UInt)&tv_now, (UInt)NULL);
1668    diff = (tv_now.tv_sec * 1000000ULL + tv_now.tv_usec) - syscalltime[tid];
1669#else
1670    UInt diff = VG_(read_millisecond_timer)() - syscalltime[tid];
1671#endif
1672
1673    /* offset o is for "SysCount", o+1 for "SysTime" */
1674    o = fullOffset(EG_SYS);
1675    CLG_ASSERT(o>=0);
1676    CLG_DEBUG(0,"   Time (Off %d) for Syscall %d: %ull\n", o, syscallno, diff);
1677
1678    CLG_(current_state).cost[o] ++;
1679    CLG_(current_state).cost[o+1] += diff;
1680    if (!CLG_(current_state).bbcc->skipped)
1681      CLG_(init_cost_lz)(CLG_(sets).full,
1682			&(CLG_(current_state).bbcc->skipped));
1683    CLG_(current_state).bbcc->skipped[o] ++;
1684    CLG_(current_state).bbcc->skipped[o+1] += diff;
1685  }
1686}
1687
1688static UInt ULong_width(ULong n)
1689{
1690   UInt w = 0;
1691   while (n > 0) {
1692      n = n / 10;
1693      w++;
1694   }
1695   if (w == 0) w = 1;
1696   return w + (w-1)/3;   // add space for commas
1697}
1698
1699static
1700void branchsim_printstat(int l1, int l2, int l3)
1701{
1702    static Char buf1[128], buf2[128], buf3[128], fmt[128];
1703    FullCost total;
1704    ULong Bc_total_b, Bc_total_mp, Bi_total_b, Bi_total_mp;
1705    ULong B_total_b, B_total_mp;
1706
1707    total = CLG_(total_cost);
1708    Bc_total_b  = total[ fullOffset(EG_BC)   ];
1709    Bc_total_mp = total[ fullOffset(EG_BC)+1 ];
1710    Bi_total_b  = total[ fullOffset(EG_BI)   ];
1711    Bi_total_mp = total[ fullOffset(EG_BI)+1 ];
1712
1713    /* Make format string, getting width right for numbers */
1714    VG_(sprintf)(fmt, "%%s %%,%dllu  (%%,%dllu cond + %%,%dllu ind)\n",
1715                 l1, l2, l3);
1716
1717    if (0 == Bc_total_b)  Bc_total_b = 1;
1718    if (0 == Bi_total_b)  Bi_total_b = 1;
1719    B_total_b  = Bc_total_b  + Bi_total_b;
1720    B_total_mp = Bc_total_mp + Bi_total_mp;
1721
1722    VG_(umsg)("\n");
1723    VG_(umsg)(fmt, "Branches:     ",
1724              B_total_b, Bc_total_b, Bi_total_b);
1725
1726    VG_(umsg)(fmt, "Mispredicts:  ",
1727              B_total_mp, Bc_total_mp, Bi_total_mp);
1728
1729    VG_(percentify)(B_total_mp,  B_total_b,  1, l1+1, buf1);
1730    VG_(percentify)(Bc_total_mp, Bc_total_b, 1, l2+1, buf2);
1731    VG_(percentify)(Bi_total_mp, Bi_total_b, 1, l3+1, buf3);
1732
1733    VG_(umsg)("Mispred rate:  %s (%s     + %s   )\n", buf1, buf2,buf3);
1734}
1735
1736
1737static
1738void finish(void)
1739{
1740  Char buf[32+COSTS_LEN], fmt[128];
1741  Int l1, l2, l3;
1742  FullCost total;
1743
1744  CLG_DEBUG(0, "finish()\n");
1745
1746  (*CLG_(cachesim).finish)();
1747
1748  /* pop all remaining items from CallStack for correct sum
1749   */
1750  CLG_(forall_threads)(unwind_thread);
1751
1752  CLG_(dump_profile)(0, False);
1753
1754  if (VG_(clo_verbosity) == 0) return;
1755
1756  /* Hash table stats */
1757  if (VG_(clo_stats)) {
1758    int BB_lookups =
1759      CLG_(stat).full_debug_BBs +
1760      CLG_(stat).fn_name_debug_BBs +
1761      CLG_(stat).file_line_debug_BBs +
1762      CLG_(stat).no_debug_BBs;
1763
1764    VG_(message)(Vg_DebugMsg, "\n");
1765    VG_(message)(Vg_DebugMsg, "Distinct objects: %d\n",
1766		 CLG_(stat).distinct_objs);
1767    VG_(message)(Vg_DebugMsg, "Distinct files:   %d\n",
1768		 CLG_(stat).distinct_files);
1769    VG_(message)(Vg_DebugMsg, "Distinct fns:     %d\n",
1770		 CLG_(stat).distinct_fns);
1771    VG_(message)(Vg_DebugMsg, "Distinct contexts:%d\n",
1772		 CLG_(stat).distinct_contexts);
1773    VG_(message)(Vg_DebugMsg, "Distinct BBs:     %d\n",
1774		 CLG_(stat).distinct_bbs);
1775    VG_(message)(Vg_DebugMsg, "Cost entries:     %d (Chunks %d)\n",
1776		 CLG_(costarray_entries), CLG_(costarray_chunks));
1777    VG_(message)(Vg_DebugMsg, "Distinct BBCCs:   %d\n",
1778		 CLG_(stat).distinct_bbccs);
1779    VG_(message)(Vg_DebugMsg, "Distinct JCCs:    %d\n",
1780		 CLG_(stat).distinct_jccs);
1781    VG_(message)(Vg_DebugMsg, "Distinct skips:   %d\n",
1782		 CLG_(stat).distinct_skips);
1783    VG_(message)(Vg_DebugMsg, "BB lookups:       %d\n",
1784		 BB_lookups);
1785    if (BB_lookups>0) {
1786      VG_(message)(Vg_DebugMsg, "With full      debug info:%3d%% (%d)\n",
1787		   CLG_(stat).full_debug_BBs    * 100 / BB_lookups,
1788		   CLG_(stat).full_debug_BBs);
1789      VG_(message)(Vg_DebugMsg, "With file/line debug info:%3d%% (%d)\n",
1790		   CLG_(stat).file_line_debug_BBs * 100 / BB_lookups,
1791		   CLG_(stat).file_line_debug_BBs);
1792      VG_(message)(Vg_DebugMsg, "With fn name   debug info:%3d%% (%d)\n",
1793		   CLG_(stat).fn_name_debug_BBs * 100 / BB_lookups,
1794		   CLG_(stat).fn_name_debug_BBs);
1795      VG_(message)(Vg_DebugMsg, "With no        debug info:%3d%% (%d)\n",
1796		   CLG_(stat).no_debug_BBs      * 100 / BB_lookups,
1797		   CLG_(stat).no_debug_BBs);
1798    }
1799    VG_(message)(Vg_DebugMsg, "BBCC Clones:       %d\n",
1800		 CLG_(stat).bbcc_clones);
1801    VG_(message)(Vg_DebugMsg, "BBs Retranslated:  %d\n",
1802		 CLG_(stat).bb_retranslations);
1803    VG_(message)(Vg_DebugMsg, "Distinct instrs:   %d\n",
1804		 CLG_(stat).distinct_instrs);
1805    VG_(message)(Vg_DebugMsg, "");
1806
1807    VG_(message)(Vg_DebugMsg, "LRU Contxt Misses: %d\n",
1808		 CLG_(stat).cxt_lru_misses);
1809    VG_(message)(Vg_DebugMsg, "LRU BBCC Misses:   %d\n",
1810		 CLG_(stat).bbcc_lru_misses);
1811    VG_(message)(Vg_DebugMsg, "LRU JCC Misses:    %d\n",
1812		 CLG_(stat).jcc_lru_misses);
1813    VG_(message)(Vg_DebugMsg, "BBs Executed:      %llu\n",
1814		 CLG_(stat).bb_executions);
1815    VG_(message)(Vg_DebugMsg, "Calls:             %llu\n",
1816		 CLG_(stat).call_counter);
1817    VG_(message)(Vg_DebugMsg, "CondJMP followed:  %llu\n",
1818		 CLG_(stat).jcnd_counter);
1819    VG_(message)(Vg_DebugMsg, "Boring JMPs:       %llu\n",
1820		 CLG_(stat).jump_counter);
1821    VG_(message)(Vg_DebugMsg, "Recursive calls:   %llu\n",
1822		 CLG_(stat).rec_call_counter);
1823    VG_(message)(Vg_DebugMsg, "Returns:           %llu\n",
1824		 CLG_(stat).ret_counter);
1825
1826    VG_(message)(Vg_DebugMsg, "");
1827  }
1828
1829  CLG_(sprint_eventmapping)(buf, CLG_(dumpmap));
1830  VG_(message)(Vg_UserMsg, "Events    : %s\n", buf);
1831  CLG_(sprint_mappingcost)(buf, CLG_(dumpmap), CLG_(total_cost));
1832  VG_(message)(Vg_UserMsg, "Collected : %s\n", buf);
1833  VG_(message)(Vg_UserMsg, "\n");
1834
1835  /* determine value widths for statistics */
1836  total = CLG_(total_cost);
1837  l1 = ULong_width( total[fullOffset(EG_IR)] );
1838  l2 = l3 = 0;
1839  if (CLG_(clo).simulate_cache) {
1840      l2 = ULong_width( total[fullOffset(EG_DR)] );
1841      l3 = ULong_width( total[fullOffset(EG_DW)] );
1842  }
1843  if (CLG_(clo).simulate_branch) {
1844      int l2b = ULong_width( total[fullOffset(EG_BC)] );
1845      int l3b = ULong_width( total[fullOffset(EG_BI)] );
1846      if (l2b > l2) l2 = l2b;
1847      if (l3b > l3) l3 = l3b;
1848  }
1849
1850  /* Make format string, getting width right for numbers */
1851  VG_(sprintf)(fmt, "%%s %%,%dllu\n", l1);
1852
1853  /* Always print this */
1854  VG_(umsg)(fmt, "I   refs:     ", total[fullOffset(EG_IR)] );
1855
1856  if (CLG_(clo).simulate_cache)
1857      (*CLG_(cachesim).printstat)(l1, l2, l3);
1858
1859  if (CLG_(clo).simulate_branch)
1860      branchsim_printstat(l1, l2, l3);
1861
1862}
1863
1864
1865void CLG_(fini)(Int exitcode)
1866{
1867  finish();
1868}
1869
1870
1871/*--------------------------------------------------------------------*/
1872/*--- Setup                                                        ---*/
1873/*--------------------------------------------------------------------*/
1874
1875static void clg_start_client_code_callback ( ThreadId tid, ULong blocks_done )
1876{
1877   static ULong last_blocks_done = 0;
1878
1879   if (0)
1880      VG_(printf)("%d R %llu\n", (Int)tid, blocks_done);
1881
1882   /* throttle calls to CLG_(run_thread) by number of BBs executed */
1883   if (blocks_done - last_blocks_done < 5000) return;
1884   last_blocks_done = blocks_done;
1885
1886   CLG_(run_thread)( tid );
1887}
1888
1889static
1890void CLG_(post_clo_init)(void)
1891{
1892   VG_(clo_vex_control).iropt_unroll_thresh = 0;
1893   VG_(clo_vex_control).guest_chase_thresh = 0;
1894
1895   CLG_DEBUG(1, "  dump threads: %s\n", CLG_(clo).separate_threads ? "Yes":"No");
1896   CLG_DEBUG(1, "  call sep. : %d\n", CLG_(clo).separate_callers);
1897   CLG_DEBUG(1, "  rec. sep. : %d\n", CLG_(clo).separate_recursions);
1898
1899   if (!CLG_(clo).dump_line && !CLG_(clo).dump_instr && !CLG_(clo).dump_bb) {
1900       VG_(message)(Vg_UserMsg, "Using source line as position.\n");
1901       CLG_(clo).dump_line = True;
1902   }
1903
1904   CLG_(init_dumps)();
1905
1906   (*CLG_(cachesim).post_clo_init)();
1907
1908   CLG_(init_eventsets)();
1909   CLG_(init_statistics)(& CLG_(stat));
1910   CLG_(init_cost_lz)( CLG_(sets).full, &CLG_(total_cost) );
1911
1912   /* initialize hash tables */
1913   CLG_(init_obj_table)();
1914   CLG_(init_cxt_table)();
1915   CLG_(init_bb_hash)();
1916
1917   CLG_(init_threads)();
1918   CLG_(run_thread)(1);
1919
1920   CLG_(instrument_state) = CLG_(clo).instrument_atstart;
1921
1922   if (VG_(clo_verbosity > 0)) {
1923      VG_(message)(Vg_UserMsg,
1924                   "For interactive control, run 'callgrind_control -h'.\n");
1925   }
1926}
1927
1928static
1929void CLG_(pre_clo_init)(void)
1930{
1931    VG_(details_name)            ("Callgrind");
1932    VG_(details_version)         (NULL);
1933    VG_(details_description)     ("a call-graph generating cache profiler");
1934    VG_(details_copyright_author)("Copyright (C) 2002-2012, and GNU GPL'd, "
1935				  "by Josef Weidendorfer et al.");
1936    VG_(details_bug_reports_to)  (VG_BUGS_TO);
1937    VG_(details_avg_translation_sizeB) ( 500 );
1938
1939    VG_(basic_tool_funcs)        (CLG_(post_clo_init),
1940                                  CLG_(instrument),
1941                                  CLG_(fini));
1942
1943    VG_(needs_superblock_discards)(clg_discard_superblock_info);
1944
1945
1946    VG_(needs_command_line_options)(CLG_(process_cmd_line_option),
1947				    CLG_(print_usage),
1948				    CLG_(print_debug_usage));
1949
1950    VG_(needs_client_requests)(CLG_(handle_client_request));
1951    VG_(needs_syscall_wrapper)(CLG_(pre_syscalltime),
1952			       CLG_(post_syscalltime));
1953
1954    VG_(track_start_client_code)  ( & clg_start_client_code_callback );
1955    VG_(track_pre_deliver_signal) ( & CLG_(pre_signal) );
1956    VG_(track_post_deliver_signal)( & CLG_(post_signal) );
1957
1958    CLG_(set_clo_defaults)();
1959}
1960
1961VG_DETERMINE_INTERFACE_VERSION(CLG_(pre_clo_init))
1962
1963/*--------------------------------------------------------------------*/
1964/*--- end                                                   main.c ---*/
1965/*--------------------------------------------------------------------*/
1966