/*--------------------------------------------------------------------*/
/*--- Callgrind                                                    ---*/
/*---                                                       main.c ---*/
/*--------------------------------------------------------------------*/

/*
   This file is part of Callgrind, a Valgrind tool for call graph
   profiling programs.

   Copyright (C) 2002-2017, Josef Weidendorfer (Josef.Weidendorfer@gmx.de)

   This tool is derived from and contains code from Cachegrind
   Copyright (C) 2002-2017 Nicholas Nethercote (njn@valgrind.org)

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307, USA.

   The GNU General Public License is contained in the file COPYING.
*/

#include "config.h"
#include "callgrind.h"
#include "global.h"

#include "pub_tool_threadstate.h"
#include "pub_tool_gdbserver.h"
#include "pub_tool_transtab.h"       // VG_(discard_translations_safely)

#include "cg_branchpred.c"

/*------------------------------------------------------------*/
/*--- Global variables                                     ---*/
/*------------------------------------------------------------*/

/* for all threads */
CommandLineOptions CLG_(clo);
Statistics CLG_(stat);
Bool CLG_(instrument_state) = True; /* Instrumentation on ? */

/* thread and signal handler specific */
exec_state CLG_(current_state);

/* min of L1 and LL cache line sizes.  This only gets set to a
   non-zero value if we are doing cache simulation. */
Int CLG_(min_line_size) = 0;


/*------------------------------------------------------------*/
/*--- Statistics                                           ---*/
/*------------------------------------------------------------*/

static void CLG_(init_statistics)(Statistics* s)
{
  s->call_counter        = 0;
  s->jcnd_counter        = 0;
  s->jump_counter        = 0;
  s->rec_call_counter    = 0;
  s->ret_counter         = 0;
  s->bb_executions       = 0;

  s->context_counter     = 0;
  s->bb_retranslations   = 0;

  s->distinct_objs       = 0;
  s->distinct_files      = 0;
  s->distinct_fns        = 0;
  s->distinct_contexts   = 0;
  s->distinct_bbs        = 0;
  s->distinct_bbccs      = 0;
  s->distinct_instrs     = 0;
  s->distinct_skips      = 0;

  s->bb_hash_resizes     = 0;
  s->bbcc_hash_resizes   = 0;
  s->jcc_hash_resizes    = 0;
  s->cxt_hash_resizes    = 0;
  s->fn_array_resizes    = 0;
  s->call_stack_resizes  = 0;
  s->fn_stack_resizes    = 0;

  s->full_debug_BBs      = 0;
  s->file_line_debug_BBs = 0;
  s->fn_name_debug_BBs   = 0;
  s->no_debug_BBs        = 0;
  s->bbcc_lru_misses     = 0;
  s->jcc_lru_misses      = 0;
  s->cxt_lru_misses      = 0;
  s->bbcc_clones         = 0;
}

/*------------------------------------------------------------*/
/*--- Simple callbacks (not cache simulator)               ---*/
/*------------------------------------------------------------*/

VG_REGPARM(1)
static void log_global_event(InstrInfo* ii)
{
    ULong* cost_Bus;

    CLG_DEBUG(6, "log_global_event:  Ir  %#lx/%u\n",
              CLG_(bb_base) + ii->instr_offset, ii->instr_size);

    if (!CLG_(current_state).collect) return;

    CLG_ASSERT( (ii->eventset->mask & (1u<<EG_BUS))>0 );

    CLG_(current_state).cost[ fullOffset(EG_BUS) ]++;

    if (CLG_(current_state).nonskipped)
        cost_Bus = CLG_(current_state).nonskipped->skipped + fullOffset(EG_BUS);
    else
        cost_Bus = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_BUS];
    cost_Bus[0]++;
}


/* For branches, we consult two different predictors, one which
   predicts taken/untaken for conditional branches, and the other
   which predicts the branch target address for indirect branches
   (jump-to-register style ones). */
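
/* Counter layout used by the two branch helpers below (illustrative):
   within an instruction's Bc/Bi cost slice, index [0] counts executed
   branches and index [1] counts mispredictions.  E.g. a conditional
   branch executed 100 times with 3 mispredictions ends up as
   cost_Bc[0]==100, cost_Bc[1]==3 (hypothetical numbers).  The same
   pair is also accumulated into CLG_(current_state).cost at
   fullOffset(EG_BC) resp. fullOffset(EG_BI). */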

static VG_REGPARM(2)
void log_cond_branch(InstrInfo* ii, Word taken)
{
    Bool miss;
    Int fullOffset_Bc;
    ULong* cost_Bc;

    CLG_DEBUG(6, "log_cond_branch:  Ir %#lx, taken %ld\n",
              CLG_(bb_base) + ii->instr_offset, taken);

    miss = 1 & do_cond_branch_predict(CLG_(bb_base) + ii->instr_offset, taken);

    if (!CLG_(current_state).collect) return;

    CLG_ASSERT( (ii->eventset->mask & (1u<<EG_BC))>0 );

    if (CLG_(current_state).nonskipped)
        cost_Bc = CLG_(current_state).nonskipped->skipped + fullOffset(EG_BC);
    else
        cost_Bc = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_BC];

    fullOffset_Bc = fullOffset(EG_BC);
    CLG_(current_state).cost[ fullOffset_Bc ]++;
    cost_Bc[0]++;
    if (miss) {
        CLG_(current_state).cost[ fullOffset_Bc+1 ]++;
        cost_Bc[1]++;
    }
}

static VG_REGPARM(2)
void log_ind_branch(InstrInfo* ii, UWord actual_dst)
{
    Bool miss;
    Int fullOffset_Bi;
    ULong* cost_Bi;

    CLG_DEBUG(6, "log_ind_branch:  Ir  %#lx, dst %#lx\n",
              CLG_(bb_base) + ii->instr_offset, actual_dst);

    miss = 1 & do_ind_branch_predict(CLG_(bb_base) + ii->instr_offset, actual_dst);

    if (!CLG_(current_state).collect) return;

    CLG_ASSERT( (ii->eventset->mask & (1u<<EG_BI))>0 );

    if (CLG_(current_state).nonskipped)
        cost_Bi = CLG_(current_state).nonskipped->skipped + fullOffset(EG_BI);
    else
        cost_Bi = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_BI];

    fullOffset_Bi = fullOffset(EG_BI);
    CLG_(current_state).cost[ fullOffset_Bi ]++;
    cost_Bi[0]++;
    if (miss) {
        CLG_(current_state).cost[ fullOffset_Bi+1 ]++;
        cost_Bi[1]++;
    }
}

/*------------------------------------------------------------*/
/*--- Instrumentation structures and event queue handling  ---*/
/*------------------------------------------------------------*/

/* Maintain an ordered list of memory events which are outstanding, in
   the sense that no IR has yet been generated to do the relevant
   helper calls.  The BB is scanned top to bottom and memory events
   are added to the end of the list, merging with the most recent
   notified event where possible (Dw immediately following Dr and
   having the same size and EA can be merged).

   This merging is done so that for architectures which have
   load-op-store instructions (x86, amd64), the insn is treated as if
   it makes just one memory reference (a modify), rather than two (a
   read followed by a write at the same address).

   At various points the list will need to be flushed, that is, IR
   generated from it.  That must happen before any possible exit from
   the block (the end, or an IRStmt_Exit).  Flushing also takes place
   when there is no space to add a new event.

   If we require the simulation statistics to be up to date with
   respect to possible memory exceptions, then the list would have to
   be flushed before each memory reference.  That would however lose
   performance by inhibiting event-merging during flushing.

   Flushing the list consists of walking it start to end and emitting
   instrumentation IR for each event, in the order in which they
   appear.  It may be possible to emit a single call for two adjacent
   events in order to reduce the number of helper function calls made.
   For example, it could well be profitable to handle two adjacent Ir
   events with a single helper call.  */
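
/* A sketch of the merging for a load-op-store insn (illustrative; the
   exact IR depends on the VEX front end).  For x86 "incl (%eax)" the
   scan would queue
       Ir            (from the insn's IMark)
       Dr 4, EA=t1
       Dw 4, EA=t1
   and, as the Dw immediately follows a Dr of the same size and EA,
   the two merge into
       Ir
       Dm 4, EA=t1
   so a flush emits one combined helper call (Dm is handled like Dw)
   instead of separate read and write notifications. */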

typedef
   IRExpr
   IRAtom;

typedef
   enum {
      Ev_Ir,  // Instruction read
      Ev_Dr,  // Data read
      Ev_Dw,  // Data write
      Ev_Dm,  // Data modify (read then write)
      Ev_Bc,  // branch conditional
      Ev_Bi,  // branch indirect (to unknown destination)
      Ev_G    // Global bus event
   }
   EventTag;

typedef
   struct {
      EventTag   tag;
      InstrInfo* inode;
      union {
         struct {
         } Ir;
         struct {
            IRAtom* ea;
            Int     szB;
         } Dr;
         struct {
            IRAtom* ea;
            Int     szB;
         } Dw;
         struct {
            IRAtom* ea;
            Int     szB;
         } Dm;
         struct {
            IRAtom* taken; /* :: Ity_I1 */
         } Bc;
         struct {
            IRAtom* dst;
         } Bi;
         struct {
         } G;
      } Ev;
   }
   Event;

static void init_Event ( Event* ev ) {
   VG_(memset)(ev, 0, sizeof(Event));
}

static IRAtom* get_Event_dea ( Event* ev ) {
   switch (ev->tag) {
      case Ev_Dr: return ev->Ev.Dr.ea;
      case Ev_Dw: return ev->Ev.Dw.ea;
      case Ev_Dm: return ev->Ev.Dm.ea;
      default:    tl_assert(0);
   }
}

static Int get_Event_dszB ( Event* ev ) {
   switch (ev->tag) {
      case Ev_Dr: return ev->Ev.Dr.szB;
      case Ev_Dw: return ev->Ev.Dw.szB;
      case Ev_Dm: return ev->Ev.Dm.szB;
      default:    tl_assert(0);
   }
}


/* Up to this many unnotified events are allowed.  Number is
   arbitrary.  Larger numbers allow more event merging to occur, but
   potentially induce more spilling due to extending live ranges of
   address temporaries. */
#define N_EVENTS 16


/* A struct which holds all the running state during instrumentation.
   Mostly to avoid passing loads of parameters everywhere. */
typedef struct {
    /* The current outstanding-memory-event list. */
    Event events[N_EVENTS];
    Int   events_used;

    /* The array of InstrInfo's is part of BB struct. */
    BB* bb;

    /* BB seen before (ie. re-instrumentation) */
    Bool seen_before;

    /* Number of InstrInfo bins 'used' so far. */
    UInt ii_index;

    // current offset of guest instructions from BB start
    UInt instr_offset;

    /* The output SB being constructed. */
    IRSB* sbOut;
} ClgState;


static void showEvent ( Event* ev )
{
   switch (ev->tag) {
      case Ev_Ir:
         VG_(printf)("Ir (InstrInfo %p) at +%u\n",
                     ev->inode, ev->inode->instr_offset);
         break;
      case Ev_Dr:
         VG_(printf)("Dr (InstrInfo %p) at +%u %d EA=",
                     ev->inode, ev->inode->instr_offset, ev->Ev.Dr.szB);
         ppIRExpr(ev->Ev.Dr.ea);
         VG_(printf)("\n");
         break;
      case Ev_Dw:
         VG_(printf)("Dw (InstrInfo %p) at +%u %d EA=",
                     ev->inode, ev->inode->instr_offset, ev->Ev.Dw.szB);
         ppIRExpr(ev->Ev.Dw.ea);
         VG_(printf)("\n");
         break;
      case Ev_Dm:
         VG_(printf)("Dm (InstrInfo %p) at +%u %d EA=",
                     ev->inode, ev->inode->instr_offset, ev->Ev.Dm.szB);
         ppIRExpr(ev->Ev.Dm.ea);
         VG_(printf)("\n");
         break;
      case Ev_Bc:
         VG_(printf)("Bc %p   GA=", ev->inode);
         ppIRExpr(ev->Ev.Bc.taken);
         VG_(printf)("\n");
         break;
      case Ev_Bi:
         VG_(printf)("Bi %p  DST=", ev->inode);
         ppIRExpr(ev->Ev.Bi.dst);
         VG_(printf)("\n");
         break;
      case Ev_G:
         VG_(printf)("G  %p\n", ev->inode);
         break;
      default:
         tl_assert(0);
         break;
   }
}

/* Generate code for all outstanding memory events, and mark the queue
   empty.  Code is generated into cgs->sbOut, and this activity
   'consumes' slots in cgs->bb. */

static void flushEvents ( ClgState* clgs )
{
   Int        i, regparms, inew;
   const HChar* helperName;
   void*      helperAddr;
   IRExpr**   argv;
   IRExpr*    i_node_expr;
   IRDirty*   di;
   Event*     ev;
   Event*     ev2;
   Event*     ev3;

   if (!clgs->seen_before) {
       // extend event sets as needed
       // available sets: D0 Dr
       for(i=0; i<clgs->events_used; i++) {
           ev  = &clgs->events[i];
           switch(ev->tag) {
           case Ev_Ir:
               // an Ir event is always first for a guest instruction
               CLG_ASSERT(ev->inode->eventset == 0);
               ev->inode->eventset = CLG_(sets).base;
               break;
           case Ev_Dr:
               // extend event set by Dr counters
               ev->inode->eventset = CLG_(add_event_group)(ev->inode->eventset,
                                                           EG_DR);
               break;
           case Ev_Dw:
           case Ev_Dm:
               // extend event set by Dw counters
               ev->inode->eventset = CLG_(add_event_group)(ev->inode->eventset,
                                                           EG_DW);
               break;
           case Ev_Bc:
               // extend event set by Bc counters
               ev->inode->eventset = CLG_(add_event_group)(ev->inode->eventset,
                                                           EG_BC);
               break;
           case Ev_Bi:
               // extend event set by Bi counters
               ev->inode->eventset = CLG_(add_event_group)(ev->inode->eventset,
                                                           EG_BI);
               break;
           case Ev_G:
               // extend event set by Bus counter
               ev->inode->eventset = CLG_(add_event_group)(ev->inode->eventset,
                                                           EG_BUS);
               break;
           default:
               tl_assert(0);
           }
       }
   }

   for(i = 0; i < clgs->events_used; i = inew) {

      helperName = NULL;
      helperAddr = NULL;
      argv       = NULL;
      regparms   = 0;

      /* generate IR to notify event i and possibly the ones
         immediately following it. */
      tl_assert(i >= 0 && i < clgs->events_used);

      ev  = &clgs->events[i];
      ev2 = ( i < clgs->events_used-1 ? &clgs->events[i+1] : NULL );
      ev3 = ( i < clgs->events_used-2 ? &clgs->events[i+2] : NULL );

      CLG_DEBUGIF(5) {
         VG_(printf)("   flush ");
         showEvent( ev );
      }

      i_node_expr = mkIRExpr_HWord( (HWord)ev->inode );

      /* Decide on helper fn to call and args to pass it, and advance
         i appropriately.
         Dm events have the same effect as Dw events. */
      switch (ev->tag) {
         case Ev_Ir:
            /* Merge an Ir with a following Dr. */
            if (ev2 && ev2->tag == Ev_Dr) {
               /* Why is this true?  It's because we're merging an Ir
                  with a following Dr.  The Ir derives from the
                  instruction's IMark and the Dr from data
                  references which follow it.  In short it holds
                  because each insn starts with an IMark, hence an
                  Ev_Ir, and so these Dr must pertain to the
                  immediately preceding Ir.  Same applies to analogous
                  assertions in the subsequent cases. */
               tl_assert(ev2->inode == ev->inode);
               helperName = CLG_(cachesim).log_1I1Dr_name;
               helperAddr = CLG_(cachesim).log_1I1Dr;
               argv = mkIRExprVec_3( i_node_expr,
                                     get_Event_dea(ev2),
                                     mkIRExpr_HWord( get_Event_dszB(ev2) ) );
               regparms = 3;
               inew = i+2;
            }
            /* Merge an Ir with a following Dw/Dm. */
            else
            if (ev2 && (ev2->tag == Ev_Dw || ev2->tag == Ev_Dm)) {
               tl_assert(ev2->inode == ev->inode);
               helperName = CLG_(cachesim).log_1I1Dw_name;
               helperAddr = CLG_(cachesim).log_1I1Dw;
               argv = mkIRExprVec_3( i_node_expr,
                                     get_Event_dea(ev2),
                                     mkIRExpr_HWord( get_Event_dszB(ev2) ) );
               regparms = 3;
               inew = i+2;
            }
            /* Merge an Ir with two following Irs. */
            else
            if (ev2 && ev3 && ev2->tag == Ev_Ir && ev3->tag == Ev_Ir) {
               helperName = CLG_(cachesim).log_3I0D_name;
               helperAddr = CLG_(cachesim).log_3I0D;
               argv = mkIRExprVec_3( i_node_expr,
                                     mkIRExpr_HWord( (HWord)ev2->inode ),
                                     mkIRExpr_HWord( (HWord)ev3->inode ) );
               regparms = 3;
               inew = i+3;
            }
            /* Merge an Ir with one following Ir. */
            else
            if (ev2 && ev2->tag == Ev_Ir) {
               helperName = CLG_(cachesim).log_2I0D_name;
               helperAddr = CLG_(cachesim).log_2I0D;
               argv = mkIRExprVec_2( i_node_expr,
                                     mkIRExpr_HWord( (HWord)ev2->inode ) );
               regparms = 2;
               inew = i+2;
            }
            /* No merging possible; emit as-is. */
            else {
               helperName = CLG_(cachesim).log_1I0D_name;
               helperAddr = CLG_(cachesim).log_1I0D;
               argv = mkIRExprVec_1( i_node_expr );
               regparms = 1;
               inew = i+1;
            }
            break;
         case Ev_Dr:
            /* Data read or modify */
            helperName = CLG_(cachesim).log_0I1Dr_name;
            helperAddr = CLG_(cachesim).log_0I1Dr;
            argv = mkIRExprVec_3( i_node_expr,
                                  get_Event_dea(ev),
                                  mkIRExpr_HWord( get_Event_dszB(ev) ) );
            regparms = 3;
            inew = i+1;
            break;
         case Ev_Dw:
         case Ev_Dm:
            /* Data write */
            helperName = CLG_(cachesim).log_0I1Dw_name;
            helperAddr = CLG_(cachesim).log_0I1Dw;
            argv = mkIRExprVec_3( i_node_expr,
                                  get_Event_dea(ev),
                                  mkIRExpr_HWord( get_Event_dszB(ev) ) );
            regparms = 3;
            inew = i+1;
            break;
         case Ev_Bc:
            /* Conditional branch */
            helperName = "log_cond_branch";
            helperAddr = &log_cond_branch;
            argv = mkIRExprVec_2( i_node_expr, ev->Ev.Bc.taken );
            regparms = 2;
            inew = i+1;
            break;
         case Ev_Bi:
            /* Branch to an unknown destination */
            helperName = "log_ind_branch";
            helperAddr = &log_ind_branch;
            argv = mkIRExprVec_2( i_node_expr, ev->Ev.Bi.dst );
            regparms = 2;
            inew = i+1;
            break;
         case Ev_G:
            /* Global bus event (CAS, LOCK-prefix, LL-SC, etc) */
            helperName = "log_global_event";
            helperAddr = &log_global_event;
            argv = mkIRExprVec_1( i_node_expr );
            regparms = 1;
            inew = i+1;
            break;
         default:
            tl_assert(0);
      }

      CLG_DEBUGIF(5) {
          if (inew > i+1) {
              VG_(printf)("   merge ");
              showEvent( ev2 );
          }
          if (inew > i+2) {
              VG_(printf)("   merge ");
              showEvent( ev3 );
          }
          if (helperAddr)
              VG_(printf)("   call  %s (%p)\n",
                          helperName, helperAddr);
      }

      /* helper could be unset depending on the simulator used */
      if (helperAddr == 0) continue;

      /* Add the helper. */
      tl_assert(helperName);
      tl_assert(helperAddr);
      tl_assert(argv);
      di = unsafeIRDirty_0_N( regparms,
                              helperName, VG_(fnptr_to_fnentry)( helperAddr ),
                              argv );
      addStmtToIRSB( clgs->sbOut, IRStmt_Dirty(di) );
   }

   clgs->events_used = 0;
}

static void addEvent_Ir ( ClgState* clgs, InstrInfo* inode )
{
   Event* evt;
   tl_assert(clgs->seen_before || (inode->eventset == 0));
   if (!CLG_(clo).simulate_cache) return;

   if (clgs->events_used == N_EVENTS)
      flushEvents(clgs);
   tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS);
   evt = &clgs->events[clgs->events_used];
   init_Event(evt);
   evt->tag      = Ev_Ir;
   evt->inode    = inode;
   clgs->events_used++;
}

static
void addEvent_Dr ( ClgState* clgs, InstrInfo* inode, Int datasize, IRAtom* ea )
{
   Event* evt;
   tl_assert(isIRAtom(ea));
   tl_assert(datasize >= 1);
   if (!CLG_(clo).simulate_cache) return;
   tl_assert(datasize <= CLG_(min_line_size));

   if (clgs->events_used == N_EVENTS)
      flushEvents(clgs);
   tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS);
   evt = &clgs->events[clgs->events_used];
   init_Event(evt);
   evt->tag       = Ev_Dr;
   evt->inode     = inode;
   evt->Ev.Dr.szB = datasize;
   evt->Ev.Dr.ea  = ea;
   clgs->events_used++;
}

static
void addEvent_Dw ( ClgState* clgs, InstrInfo* inode, Int datasize, IRAtom* ea )
{
   Event* evt;
   tl_assert(isIRAtom(ea));
   tl_assert(datasize >= 1);
   if (!CLG_(clo).simulate_cache) return;
   tl_assert(datasize <= CLG_(min_line_size));

   /* Is it possible to merge this write with the preceding read? */
   if (clgs->events_used > 0) {
      Event* lastEvt = &clgs->events[clgs->events_used-1];
      if (   lastEvt->tag       == Ev_Dr
          && lastEvt->Ev.Dr.szB == datasize
          && lastEvt->inode     == inode
          && eqIRAtom(lastEvt->Ev.Dr.ea, ea))
      {
         lastEvt->tag   = Ev_Dm;
         return;
      }
   }

   /* No.  Add as normal. */
   if (clgs->events_used == N_EVENTS)
      flushEvents(clgs);
   tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS);
   evt = &clgs->events[clgs->events_used];
   init_Event(evt);
   evt->tag       = Ev_Dw;
   evt->inode     = inode;
   evt->Ev.Dw.szB = datasize;
   evt->Ev.Dw.ea  = ea;
   clgs->events_used++;
}

static
void addEvent_D_guarded ( ClgState* clgs, InstrInfo* inode,
                          Int datasize, IRAtom* ea, IRAtom* guard,
                          Bool isWrite )
{
   tl_assert(isIRAtom(ea));
   tl_assert(guard);
   tl_assert(isIRAtom(guard));
   tl_assert(datasize >= 1);
   if (!CLG_(clo).simulate_cache) return;
   tl_assert(datasize <= CLG_(min_line_size));

   /* Adding guarded memory actions and merging them with the existing
      queue is too complex.  Simply flush the queue and add this
      action immediately.  Since guarded loads and stores are pretty
      rare, this is not thought likely to cause any noticeable
      performance loss as a result of the loss of event-merging
      opportunities. */
   tl_assert(clgs->events_used >= 0);
   flushEvents(clgs);
   tl_assert(clgs->events_used == 0);
   /* Same as case Ev_Dw / case Ev_Dr in flushEvents, except with guard */
   IRExpr*      i_node_expr;
   const HChar* helperName;
   void*        helperAddr;
   IRExpr**     argv;
   Int          regparms;
   IRDirty*     di;
   i_node_expr = mkIRExpr_HWord( (HWord)inode );
   helperName  = isWrite ? CLG_(cachesim).log_0I1Dw_name
                         : CLG_(cachesim).log_0I1Dr_name;
   helperAddr  = isWrite ? CLG_(cachesim).log_0I1Dw
                         : CLG_(cachesim).log_0I1Dr;
   argv        = mkIRExprVec_3( i_node_expr,
                                ea, mkIRExpr_HWord( datasize ) );
   regparms    = 3;
   di          = unsafeIRDirty_0_N(
                    regparms,
                    helperName, VG_(fnptr_to_fnentry)( helperAddr ),
                    argv );
   di->guard = guard;
   addStmtToIRSB( clgs->sbOut, IRStmt_Dirty(di) );
}

static
void addEvent_Bc ( ClgState* clgs, InstrInfo* inode, IRAtom* guard )
{
   Event* evt;
   tl_assert(isIRAtom(guard));
   tl_assert(typeOfIRExpr(clgs->sbOut->tyenv, guard)
             == (sizeof(HWord)==4 ? Ity_I32 : Ity_I64));
   if (!CLG_(clo).simulate_branch) return;

   if (clgs->events_used == N_EVENTS)
      flushEvents(clgs);
   tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS);
   evt = &clgs->events[clgs->events_used];
   init_Event(evt);
   evt->tag         = Ev_Bc;
   evt->inode       = inode;
   evt->Ev.Bc.taken = guard;
   clgs->events_used++;
}

static
void addEvent_Bi ( ClgState* clgs, InstrInfo* inode, IRAtom* whereTo )
{
   Event* evt;
   tl_assert(isIRAtom(whereTo));
   tl_assert(typeOfIRExpr(clgs->sbOut->tyenv, whereTo)
             == (sizeof(HWord)==4 ? Ity_I32 : Ity_I64));
   if (!CLG_(clo).simulate_branch) return;

   if (clgs->events_used == N_EVENTS)
      flushEvents(clgs);
   tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS);
   evt = &clgs->events[clgs->events_used];
   init_Event(evt);
   evt->tag       = Ev_Bi;
   evt->inode     = inode;
   evt->Ev.Bi.dst = whereTo;
   clgs->events_used++;
}

static
void addEvent_G ( ClgState* clgs, InstrInfo* inode )
{
   Event* evt;
   if (!CLG_(clo).collect_bus) return;

   if (clgs->events_used == N_EVENTS)
      flushEvents(clgs);
   tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS);
   evt = &clgs->events[clgs->events_used];
   init_Event(evt);
   evt->tag       = Ev_G;
   evt->inode     = inode;
   clgs->events_used++;
}

/* Initialise or check (if already seen before) an InstrInfo for the next insn.
   We can only set instr_offset/instr_size here.  The required event set and
   the resulting cost offset depend on the events (Ir/Dr/Dw/Dm) of the guest
   instruction.  The event set is extended as required on flush of the event
   queue (once any Dm events have been determined); cost offsets are
   determined at the end of BB instrumentation. */
static
InstrInfo* next_InstrInfo ( ClgState* clgs, UInt instr_size )
{
   InstrInfo* ii;
   tl_assert(clgs->ii_index >= 0);
   tl_assert(clgs->ii_index < clgs->bb->instr_count);
   ii = &clgs->bb->instr[ clgs->ii_index ];

   if (clgs->seen_before) {
       CLG_ASSERT(ii->instr_offset == clgs->instr_offset);
       CLG_ASSERT(ii->instr_size == instr_size);
   }
   else {
       ii->instr_offset = clgs->instr_offset;
       ii->instr_size = instr_size;
       ii->cost_offset = 0;
       ii->eventset = 0;
   }

   clgs->ii_index++;
   clgs->instr_offset += instr_size;
   CLG_(stat).distinct_instrs++;

   return ii;
}

// return total number of cost values needed for this BB
static
UInt update_cost_offsets( ClgState* clgs )
{
    Int i;
    InstrInfo* ii;
    UInt cost_offset = 0;

    CLG_ASSERT(clgs->bb->instr_count == clgs->ii_index);
    for(i=0; i<clgs->ii_index; i++) {
        ii = &clgs->bb->instr[i];
        if (clgs->seen_before) {
            CLG_ASSERT(ii->cost_offset == cost_offset);
        } else
            ii->cost_offset = cost_offset;
        cost_offset += ii->eventset ? ii->eventset->size : 0;
    }

    return cost_offset;
}
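
/* Worked example for the offsets computed above (hypothetical sizes):
   for a BB with three instructions whose final event sets have sizes
   2, 0 and 4, the loop assigns
       instr[0].cost_offset = 0
       instr[1].cost_offset = 2
       instr[2].cost_offset = 2    (instr[1] contributes no counters)
   and returns 6, the total number of cost values to allocate for one
   BBCC of this BB. */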

/*------------------------------------------------------------*/
/*--- Instrumentation                                      ---*/
/*------------------------------------------------------------*/

#if defined(VG_BIGENDIAN)
# define CLGEndness Iend_BE
#elif defined(VG_LITTLEENDIAN)
# define CLGEndness Iend_LE
#else
# error "Unknown endianness"
#endif

static
Addr IRConst2Addr(IRConst* con)
{
    Addr addr;

    if (sizeof(Addr) == 4) {
        CLG_ASSERT( con->tag == Ico_U32 );
        addr = con->Ico.U32;
    }
    else if (sizeof(Addr) == 8) {
        CLG_ASSERT( con->tag == Ico_U64 );
        addr = con->Ico.U64;
    }
    else
        VG_(tool_panic)("Callgrind: invalid Addr type");

    return addr;
}

/* First pass over a BB to instrument, counting instructions and jumps.
 * This is needed to know the size of the BB struct to allocate.
 *
 * Called from CLG_(get_bb)
 */
void CLG_(collectBlockInfo)(IRSB* sbIn,
                            /*INOUT*/ UInt* instrs,
                            /*INOUT*/ UInt* cjmps,
                            /*INOUT*/ Bool* cjmp_inverted)
{
    Int i;
    IRStmt* st;
    Addr instrAddr = 0, jumpDst;
    UInt instrLen = 0;
    Bool toNextInstr = False;

    // Ist_Exit has to be ignored in preamble code, before first IMark:
    // preamble code is added by VEX for self modifying code, and has
    // nothing to do with client code
    Bool inPreamble = True;

    if (!sbIn) return;

    for (i = 0; i < sbIn->stmts_used; i++) {
        st = sbIn->stmts[i];
        if (Ist_IMark == st->tag) {
            inPreamble = False;

            instrAddr = st->Ist.IMark.addr;
            instrLen  = st->Ist.IMark.len;

            (*instrs)++;
            toNextInstr = False;
        }
        if (inPreamble) continue;
        if (Ist_Exit == st->tag) {
            jumpDst = IRConst2Addr(st->Ist.Exit.dst);
            toNextInstr = (jumpDst == instrAddr + instrLen);

            (*cjmps)++;
        }
    }

    /* If the last instruction of the BB conditionally jumps to the next
     * instruction (= the first instruction of the next BB in memory),
     * the condition was inverted by VEX.
     */
    *cjmp_inverted = toNextInstr;
}
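
/* Illustrative case for the heuristic above (hypothetical guest code):
       0x1000:  cmp  %eax,%ebx
       0x1002:  jne  0x2000
       0x1004:  ...             <- first insn of the next BB in memory
   If VEX inverts the branch, the generated Ist_Exit tests the negated
   condition and jumps to 0x1004, i.e. to instrAddr + instrLen of the
   branch insn; this is exactly the pattern toNextInstr detects. */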

static
void addConstMemStoreStmt( IRSB* bbOut, UWord addr, UInt val, IRType hWordTy)
{
    addStmtToIRSB( bbOut,
                   IRStmt_Store(CLGEndness,
                                IRExpr_Const(hWordTy == Ity_I32 ?
                                             IRConst_U32( addr ) :
                                             IRConst_U64( addr )),
                                IRExpr_Const(IRConst_U32(val)) ));
}
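
/* The statement built above is a plain constant-address store; in IR
   print-outs it shows up roughly as
       STle(0x38D4A0) = 0x2:I32
   (illustrative; the address and value here are hypothetical), where
   the constant address is &CLG_(current_state).jmps_passed. */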


/* add helper call to setup_bbcc, with pointer to BB struct as argument
 *
 * precondition for setup_bbcc:
 * - jmps_passed has number of cond.jumps passed in last executed BB
 * - current_bbcc has a pointer to the BBCC of the last executed BB
 *   Thus, if bbcc_jmpkind is != -1 (JmpNone),
 *     current_bbcc->bb->jmp_addr
 *   gives the address of the jump source.
 *
 * the setup does 2 things:
 * - trace call:
 *   * unwind own call stack, i.e. sync our ESP with the real ESP
 *     (this is for ESP manipulation: longjmps, C++ exception handling, RET)
 *   * for CALLs or JMPs crossing objects, record the call arg and
 *     push a frame on our own call stack
 *
 * - prepare for cache log functions:
 *   set current_bbcc to the BBCC that gets the costs for this BB execution
 *   attached
 */
static
void addBBSetupCall(ClgState* clgs)
{
   IRDirty* di;
   IRExpr  *arg1, **argv;

   arg1 = mkIRExpr_HWord( (HWord)clgs->bb );
   argv = mkIRExprVec_1(arg1);
   di = unsafeIRDirty_0_N( 1, "setup_bbcc",
                              VG_(fnptr_to_fnentry)( & CLG_(setup_bbcc) ),
                              argv);
   addStmtToIRSB( clgs->sbOut, IRStmt_Dirty(di) );
}
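
/* Thus every instrumented SB begins with an unconditional dirty call
   (in IR print-outs it appears roughly as "DIRTY ... setup_bbcc(bb)")
   that passes the BB struct pointer, and runs before any of the event
   helpers emitted for this SB. */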


static
IRSB* CLG_(instrument)( VgCallbackClosure* closure,
                        IRSB* sbIn,
                        const VexGuestLayout* layout,
                        const VexGuestExtents* vge,
                        const VexArchInfo* archinfo_host,
                        IRType gWordTy, IRType hWordTy )
{
   Int        i;
   IRStmt*    st;
   Addr       origAddr;
   InstrInfo* curr_inode = NULL;
   ClgState   clgs;
   UInt       cJumps = 0;
   IRTypeEnv* tyenv = sbIn->tyenv;

   if (gWordTy != hWordTy) {
      /* We don't currently support this case. */
      VG_(tool_panic)("host/guest word size mismatch");
   }

   // No instrumentation if it is switched off
   if (! CLG_(instrument_state)) {
       CLG_DEBUG(5, "instrument(BB %#lx) [Instrumentation OFF]\n",
                 (Addr)closure->readdr);
       return sbIn;
   }

   CLG_DEBUG(3, "+ instrument(BB %#lx)\n", (Addr)closure->readdr);

   /* Set up SB for instrumented IR */
   clgs.sbOut = deepCopyIRSBExceptStmts(sbIn);

   // Copy verbatim any IR preamble preceding the first IMark
   i = 0;
   while (i < sbIn->stmts_used && sbIn->stmts[i]->tag != Ist_IMark) {
      addStmtToIRSB( clgs.sbOut, sbIn->stmts[i] );
      i++;
   }

   // Get the first statement, and origAddr from it
   CLG_ASSERT(sbIn->stmts_used > 0);
   CLG_ASSERT(i < sbIn->stmts_used);
   st = sbIn->stmts[i];
   CLG_ASSERT(Ist_IMark == st->tag);

   origAddr = st->Ist.IMark.addr + st->Ist.IMark.delta;
   CLG_ASSERT(origAddr == st->Ist.IMark.addr
                          + st->Ist.IMark.delta);  // XXX: check no overflow

   /* Get BB struct (creating if necessary).
    * JS: The hash table is keyed with orig_addr_noredir -- important!
    * JW: Why? If it is because of different chasing of the redirection,
    *     this is not needed, as chasing is switched off in callgrind
    */
   clgs.bb = CLG_(get_bb)(origAddr, sbIn, &(clgs.seen_before));

   addBBSetupCall(&clgs);

   // Set up running state
   clgs.events_used = 0;
   clgs.ii_index = 0;
   clgs.instr_offset = 0;

   for (/*use current i*/; i < sbIn->stmts_used; i++) {

      st = sbIn->stmts[i];
      CLG_ASSERT(isFlatIRStmt(st));

      switch (st->tag) {
         case Ist_NoOp:
         case Ist_AbiHint:
         case Ist_Put:
         case Ist_PutI:
         case Ist_MBE:
            break;

         case Ist_IMark: {
            Addr   cia   = st->Ist.IMark.addr + st->Ist.IMark.delta;
            UInt   isize = st->Ist.IMark.len;
            CLG_ASSERT(clgs.instr_offset == cia - origAddr);
            // If Vex fails to decode an instruction, the size will be zero.
            // Pretend otherwise.
            if (isize == 0) isize = VG_MIN_INSTR_SZB;

            // Sanity-check size.
            tl_assert( (VG_MIN_INSTR_SZB <= isize && isize <= VG_MAX_INSTR_SZB)
                     || VG_CLREQ_SZB == isize );

            // Init the inode, record it as the current one.
            // Subsequent Dr/Dw/Dm events from the same instruction will
            // also use it.
            curr_inode = next_InstrInfo (&clgs, isize);

            addEvent_Ir( &clgs, curr_inode );
            break;
         }

         case Ist_WrTmp: {
            IRExpr* data = st->Ist.WrTmp.data;
            if (data->tag == Iex_Load) {
               IRExpr* aexpr = data->Iex.Load.addr;
               // Note also, endianness info is ignored.  I guess
               // that's not interesting.
               addEvent_Dr( &clgs, curr_inode,
                            sizeofIRType(data->Iex.Load.ty), aexpr );
            }
            break;
         }

         case Ist_Store: {
            IRExpr* data  = st->Ist.Store.data;
            IRExpr* aexpr = st->Ist.Store.addr;
            addEvent_Dw( &clgs, curr_inode,
                         sizeofIRType(typeOfIRExpr(sbIn->tyenv, data)), aexpr );
            break;
         }

         case Ist_StoreG: {
            IRStoreG* sg   = st->Ist.StoreG.details;
            IRExpr*   data = sg->data;
            IRExpr*   addr = sg->addr;
            IRType    type = typeOfIRExpr(tyenv, data);
            tl_assert(type != Ity_INVALID);
            addEvent_D_guarded( &clgs, curr_inode,
                                sizeofIRType(type), addr, sg->guard,
                                True/*isWrite*/ );
            break;
         }

         case Ist_LoadG: {
            IRLoadG* lg       = st->Ist.LoadG.details;
            IRType   type     = Ity_INVALID; /* loaded type */
            IRType   typeWide = Ity_INVALID; /* after implicit widening */
            IRExpr*  addr     = lg->addr;
            typeOfIRLoadGOp(lg->cvt, &typeWide, &type);
            tl_assert(type != Ity_INVALID);
            addEvent_D_guarded( &clgs, curr_inode,
                                sizeofIRType(type), addr, lg->guard,
                                False/*!isWrite*/ );
            break;
         }

         case Ist_Dirty: {
            Int      dataSize;
            IRDirty* d = st->Ist.Dirty.details;
            if (d->mFx != Ifx_None) {
               /* This dirty helper accesses memory.  Collect the details. */
               tl_assert(d->mAddr != NULL);
               tl_assert(d->mSize != 0);
               dataSize = d->mSize;
               // Large (eg. 28B, 108B, 512B on x86) data-sized
               // instructions will be done inaccurately, but they're
               // very rare and this avoids errors from hitting more
               // than two cache lines in the simulation.
               if (CLG_(clo).simulate_cache && dataSize > CLG_(min_line_size))
                  dataSize = CLG_(min_line_size);
               if (d->mFx == Ifx_Read || d->mFx == Ifx_Modify)
                  addEvent_Dr( &clgs, curr_inode, dataSize, d->mAddr );
               if (d->mFx == Ifx_Write || d->mFx == Ifx_Modify)
                  addEvent_Dw( &clgs, curr_inode, dataSize, d->mAddr );
            } else {
               tl_assert(d->mAddr == NULL);
               tl_assert(d->mSize == 0);
            }
            break;
         }

         case Ist_CAS: {
            /* We treat it as a read and a write of the location.  I
               think that is the same behaviour as it was before IRCAS
               was introduced, since prior to that point, the Vex
               front ends would translate a lock-prefixed instruction
               into a (normal) read followed by a (normal) write. */
            Int    dataSize;
            IRCAS* cas = st->Ist.CAS.details;
            CLG_ASSERT(cas->addr && isIRAtom(cas->addr));
            CLG_ASSERT(cas->dataLo);
            dataSize = sizeofIRType(typeOfIRExpr(sbIn->tyenv, cas->dataLo));
            if (cas->dataHi != NULL)
               dataSize *= 2; /* since this is a doubleword-cas */
            addEvent_Dr( &clgs, curr_inode, dataSize, cas->addr );
            addEvent_Dw( &clgs, curr_inode, dataSize, cas->addr );
            addEvent_G(  &clgs, curr_inode );
            break;
         }

         case Ist_LLSC: {
            IRType dataTy;
            if (st->Ist.LLSC.storedata == NULL) {
               /* LL */
               dataTy = typeOfIRTemp(sbIn->tyenv, st->Ist.LLSC.result);
               addEvent_Dr( &clgs, curr_inode,
                            sizeofIRType(dataTy), st->Ist.LLSC.addr );
               /* flush events before LL, should help SC to succeed */
               flushEvents( &clgs );
            } else {
               /* SC */
               dataTy = typeOfIRExpr(sbIn->tyenv, st->Ist.LLSC.storedata);
               addEvent_Dw( &clgs, curr_inode,
                            sizeofIRType(dataTy), st->Ist.LLSC.addr );
               /* I don't know whether the global-bus-lock cost should
                  be attributed to the LL or the SC, but it doesn't
                  really matter since they always have to be used in
                  pairs anyway.  Hence put it (quite arbitrarily) on
                  the SC. */
               addEvent_G(  &clgs, curr_inode );
            }
            break;
         }

         case Ist_Exit: {
            Bool guest_exit, inverted;

            /* VEX code generation sometimes inverts conditional branches.
             * As Callgrind counts (conditional) jumps, it has to correct
             * inversions. The heuristic is the following:
             * (1) Callgrind switches off SB chasing and unrolling, and
             *     therefore assumes that the only candidate for inversion
             *     is the last conditional branch in an SB.
             * (2) inversion is assumed if the branch jumps to the address of
             *     the next guest instruction in memory.
             * This heuristic is precalculated in CLG_(collectBlockInfo)().
             *
             * Branching behavior is also used for branch prediction. Note that
             * the above heuristic is different from what Cachegrind does.
             * Cachegrind uses (2) for all branches.
             */
            if (cJumps+1 == clgs.bb->cjmp_count)
                inverted = clgs.bb->cjmp_inverted;
            else
                inverted = False;

            // call branch predictor only if this is a branch in guest code
            guest_exit = (st->Ist.Exit.jk == Ijk_Boring) ||
                         (st->Ist.Exit.jk == Ijk_Call) ||
                         (st->Ist.Exit.jk == Ijk_Ret);

            if (guest_exit) {
                /* Stuff to widen the guard expression to a host word, so
                   we can pass it to the branch predictor simulation
                   functions easily. */
                IRType   tyW    = hWordTy;
                IROp     widen  = tyW==Ity_I32  ? Iop_1Uto32  : Iop_1Uto64;
                IROp     opXOR  = tyW==Ity_I32  ? Iop_Xor32   : Iop_Xor64;
                IRTemp   guard1 = newIRTemp(clgs.sbOut->tyenv, Ity_I1);
                IRTemp   guardW = newIRTemp(clgs.sbOut->tyenv, tyW);
                IRTemp   guard  = newIRTemp(clgs.sbOut->tyenv, tyW);
                IRExpr*  one    = tyW==Ity_I32 ? IRExpr_Const(IRConst_U32(1))
                                               : IRExpr_Const(IRConst_U64(1));

                /* Widen the guard expression. */
                addStmtToIRSB( clgs.sbOut,
                               IRStmt_WrTmp( guard1, st->Ist.Exit.guard ));
                addStmtToIRSB( clgs.sbOut,
                               IRStmt_WrTmp( guardW,
                                             IRExpr_Unop(widen,
                                                         IRExpr_RdTmp(guard1))) );
                /* If the exit is inverted, invert the sense of the guard. */
                addStmtToIRSB(
                        clgs.sbOut,
                        IRStmt_WrTmp(
                                guard,
                                inverted ? IRExpr_Binop(opXOR, IRExpr_RdTmp(guardW), one)
                                    : IRExpr_RdTmp(guardW)
                                    ));
                /* And post the event. */
                addEvent_Bc( &clgs, curr_inode, IRExpr_RdTmp(guard) );
            }

            /* We may never reach the next statement, so need to flush
               all outstanding transactions now. */
            flushEvents( &clgs );

            CLG_ASSERT(clgs.ii_index > 0);
            if (!clgs.seen_before) {
              ClgJumpKind jk;

              if      (st->Ist.Exit.jk == Ijk_Call) jk = jk_Call;
              else if (st->Ist.Exit.jk == Ijk_Ret)  jk = jk_Return;
              else {
                if (IRConst2Addr(st->Ist.Exit.dst) ==
                    origAddr + curr_inode->instr_offset + curr_inode->instr_size)
                  jk = jk_None;
                else
                  jk = jk_Jump;
              }

              clgs.bb->jmp[cJumps].instr = clgs.ii_index-1;
              clgs.bb->jmp[cJumps].jmpkind = jk;
            }

            /* Update global variable jmps_passed before the jump.
             * A correction is needed if VEX inverted the last jump condition.
             */
            UInt val = inverted ? cJumps+1 : cJumps;
            addConstMemStoreStmt( clgs.sbOut,
                                  (UWord) &CLG_(current_state).jmps_passed,
                                  val, hWordTy);
            cJumps++;

            break;
         }

         default:
            tl_assert(0);
            break;
      }

      /* Copy the original statement */
      addStmtToIRSB( clgs.sbOut, st );

      CLG_DEBUGIF(5) {
         VG_(printf)("   pass  ");
         ppIRStmt(st);
         VG_(printf)("\n");
      }
   }

   /* Deal with branches to unknown destinations.  Except ignore ones
      which are function returns as we assume the return stack
      predictor never mispredicts. */
   if ((sbIn->jumpkind == Ijk_Boring) || (sbIn->jumpkind == Ijk_Call)) {
      if (0) { ppIRExpr( sbIn->next ); VG_(printf)("\n"); }
      switch (sbIn->next->tag) {
         case Iex_Const:
            break; /* boring - branch to known address */
         case Iex_RdTmp:
            /* looks like an indirect branch (branch to unknown) */
            addEvent_Bi( &clgs, curr_inode, sbIn->next );
            break;
         default:
            /* shouldn't happen - if the incoming IR is properly
               flattened, should only have tmp and const cases to
               consider. */
            tl_assert(0);
      }
   }

   /* At the end of the bb.  Flush outstandings. */
   flushEvents( &clgs );

   /* Update global variable jmps_passed at end of SB.
    * As CLG_(current_state).jmps_passed is reset to 0 in setup_bbcc,
    * this can be omitted if there is no conditional jump in this SB.
    * A correction is needed if VEX inverted the last jump condition.
    */
   if (cJumps > 0) {
      UInt jmps_passed = cJumps;
      if (clgs.bb->cjmp_inverted) jmps_passed--;
      addConstMemStoreStmt( clgs.sbOut,
                            (UWord) &CLG_(current_state).jmps_passed,
                            jmps_passed, hWordTy);
   }
   CLG_ASSERT(clgs.bb->cjmp_count == cJumps);
   CLG_ASSERT(clgs.bb->instr_count == clgs.ii_index);

   /* Info for final exit from BB */
   {
     ClgJumpKind jk;

     if      (sbIn->jumpkind == Ijk_Call) jk = jk_Call;
     else if (sbIn->jumpkind == Ijk_Ret)  jk = jk_Return;
     else {
       jk = jk_Jump;
       if ((sbIn->next->tag == Iex_Const) &&
           (IRConst2Addr(sbIn->next->Iex.Const.con) ==
            origAddr + clgs.instr_offset))
         jk = jk_None;
     }
     clgs.bb->jmp[cJumps].jmpkind = jk;
     /* Instruction index of the call/ret at BB end
      * (it is wrong for fall-through, but does not matter) */
     clgs.bb->jmp[cJumps].instr = clgs.ii_index-1;
   }

   /* swap information of last exit with final exit if inverted */
   if (clgs.bb->cjmp_inverted) {
     ClgJumpKind jk;
     UInt instr;

     jk = clgs.bb->jmp[cJumps].jmpkind;
     clgs.bb->jmp[cJumps].jmpkind = clgs.bb->jmp[cJumps-1].jmpkind;
     clgs.bb->jmp[cJumps-1].jmpkind = jk;
     instr = clgs.bb->jmp[cJumps].instr;
     clgs.bb->jmp[cJumps].instr = clgs.bb->jmp[cJumps-1].instr;
     clgs.bb->jmp[cJumps-1].instr = instr;
   }

   if (clgs.seen_before) {
       CLG_ASSERT(clgs.bb->cost_count == update_cost_offsets(&clgs));
       CLG_ASSERT(clgs.bb->instr_len == clgs.instr_offset);
   }
   else {
       clgs.bb->cost_count = update_cost_offsets(&clgs);
       clgs.bb->instr_len = clgs.instr_offset;
   }

   CLG_DEBUG(3, "- instrument(BB %#lx): byteLen %u, CJumps %u, CostLen %u\n",
             origAddr, clgs.bb->instr_len,
             clgs.bb->cjmp_count, clgs.bb->cost_count);
   if (cJumps > 0) {
       CLG_DEBUG(3, "                     [ ");
       for (i = 0; i < cJumps; i++)
           CLG_DEBUG(3, "%u ", clgs.bb->jmp[i].instr);
       CLG_DEBUG(3, "], last inverted: %s \n",
                 clgs.bb->cjmp_inverted ? "yes" : "no");
   }

   return clgs.sbOut;
}
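
/* Overall shape of an instrumented SB (a rough, illustrative sketch;
   the actual helpers and their merging depend on the active simulators):
       <IR preamble, copied verbatim>
       DIRTY setup_bbcc(bb)
       ------ IMark(0x..., len, 0) ------
       DIRTY log_...(inode, ...)          <- emitted by flushEvents
       <original guest statements>
       STle(&jmps_passed) = <n>           <- before each Exit / at SB end
       if (guard) goto ...                <- original Ist_Exit
*/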

/*--------------------------------------------------------------------*/
/*--- Discarding BB info                                           ---*/
/*--------------------------------------------------------------------*/

// Called when a translation is removed from the translation cache for
// any reason at all: to free up space, because the guest code was
// unmapped or modified, or for any arbitrary reason.
static
void clg_discard_superblock_info ( Addr orig_addr, VexGuestExtents vge )
{
   tl_assert(vge.n_used > 0);

   if (0)
      VG_(printf)( "discard_superblock_info: %p, %p, %llu\n",
                   (void*)orig_addr,
                   (void*)vge.base[0], (ULong)vge.len[0]);

   // Get BB info, remove from table, free BB info.  Simple!
   // When created, the BB is keyed by the first instruction address
   // (not orig_addr, but the eventually redirected address). Thus, we
   // use the first instruction address in vge.
   CLG_(delete_bb)(vge.base[0]);
}

/*------------------------------------------------------------*/
/*--- CLG_(fini)() and related functions                    ---*/
/*------------------------------------------------------------*/



static void zero_thread_cost(thread_info* t)
{
  Int i;

  for(i = 0; i < CLG_(current_call_stack).sp; i++) {
    if (!CLG_(current_call_stack).entry[i].jcc) continue;

    /* reset call counters to current for active calls */
    CLG_(copy_cost)( CLG_(sets).full,
                     CLG_(current_call_stack).entry[i].enter_cost,
                     CLG_(current_state).cost );
    CLG_(current_call_stack).entry[i].jcc->call_counter = 0;
  }

  CLG_(forall_bbccs)(CLG_(zero_bbcc));

  /* set counter for last dump */
  CLG_(copy_cost)( CLG_(sets).full,
                   t->lastdump_cost, CLG_(current_state).cost );
}

void CLG_(zero_all_cost)(Bool only_current_thread)
{
  if (VG_(clo_verbosity) > 1)
    VG_(message)(Vg_DebugMsg, "  Zeroing costs...\n");

  if (only_current_thread)
    zero_thread_cost(CLG_(get_current_thread)());
  else
    CLG_(forall_threads)(zero_thread_cost);

  if (VG_(clo_verbosity) > 1)
    VG_(message)(Vg_DebugMsg, "  ...done\n");
}

static
void unwind_thread(thread_info* t)
{
  /* unwind signal handlers */
  while(CLG_(current_state).sig != 0)
    CLG_(post_signal)(CLG_(current_tid),CLG_(current_state).sig);

  /* unwind regular call stack */
  while(CLG_(current_call_stack).sp > 0)
    CLG_(pop_call_stack)();

  /* reset context and function stack for context generation */
  CLG_(init_exec_state)( &CLG_(current_state) );
  CLG_(current_fn_stack).top = CLG_(current_fn_stack).bottom;
}

static
void zero_state_cost(thread_info* t)
{
    CLG_(zero_cost)( CLG_(sets).full, CLG_(current_state).cost );
}

void CLG_(set_instrument_state)(const HChar* reason, Bool state)
{
  if (CLG_(instrument_state) == state) {
    CLG_DEBUG(2, "%s: instrumentation already %s\n",
              reason, state ? "ON" : "OFF");
    return;
  }
  CLG_(instrument_state) = state;
  CLG_DEBUG(2, "%s: Switching instrumentation %s ...\n",
            reason, state ? "ON" : "OFF");

  VG_(discard_translations_safely)( (Addr)0x1000, ~(SizeT)0xfff, "callgrind");

  /* reset internal state: call stacks, simulator */
  CLG_(forall_threads)(unwind_thread);
  CLG_(forall_threads)(zero_state_cost);
  (*CLG_(cachesim).clear)();

  if (VG_(clo_verbosity) > 1)
    VG_(message)(Vg_DebugMsg, "%s: instrumentation switched %s\n",
                 reason, state ? "ON" : "OFF");
}

/* helper for dump_state_togdb */
static void dump_state_of_thread_togdb(thread_info* ti)
{
    static FullCost sum = 0, tmp = 0;
    Int t, i;
    BBCC *from, *to;
    call_entry* ce;
    HChar *mcost;

    t = CLG_(current_tid);
    CLG_(init_cost_lz)( CLG_(sets).full, &sum );
    CLG_(copy_cost_lz)( CLG_(sets).full, &tmp, ti->lastdump_cost );
    CLG_(add_diff_cost)( CLG_(sets).full, sum, ti->lastdump_cost,
                         ti->states.entry[0]->cost);
    CLG_(copy_cost)( CLG_(sets).full, ti->lastdump_cost, tmp );
    mcost = CLG_(mappingcost_as_string)(CLG_(dumpmap), sum);
    VG_(gdb_printf)("events-%d: %s\n", t, mcost);
    VG_(free)(mcost);
    VG_(gdb_printf)("frames-%d: %d\n", t, CLG_(current_call_stack).sp);

    ce = 0;
    for(i = 0; i < CLG_(current_call_stack).sp; i++) {
      ce = CLG_(get_call_entry)(i);
      /* if this frame is skipped, we don't have counters */
      if (!ce->jcc) continue;

      from = ce->jcc->from;
      VG_(gdb_printf)("function-%d-%d: %s\n", t, i, from->cxt->fn[0]->name);
      VG_(gdb_printf)("calls-%d-%d: %llu\n", t, i, ce->jcc->call_counter);

      /* FIXME: EventSets! */
      CLG_(copy_cost)( CLG_(sets).full, sum, ce->jcc->cost );
      CLG_(copy_cost)( CLG_(sets).full, tmp, ce->enter_cost );
      CLG_(add_diff_cost)( CLG_(sets).full, sum,
                           ce->enter_cost, CLG_(current_state).cost );
      CLG_(copy_cost)( CLG_(sets).full, ce->enter_cost, tmp );

      mcost = CLG_(mappingcost_as_string)(CLG_(dumpmap), sum);
      VG_(gdb_printf)("events-%d-%d: %s\n", t, i, mcost);
      VG_(free)(mcost);
    }
    if (ce && ce->jcc) {
      to = ce->jcc->to;
      VG_(gdb_printf)("function-%d-%d: %s\n", t, i, to->cxt->fn[0]->name );
    }
}

/* Dump current state */
static void dump_state_togdb(void)
{
    thread_info** th;
    int t;
    Int orig_tid = CLG_(current_tid);

    VG_(gdb_printf)("instrumentation: %s\n",
                    CLG_(instrument_state) ? "on" : "off");
    if (!CLG_(instrument_state)) return;

    VG_(gdb_printf)("executed-bbs: %llu\n", CLG_(stat).bb_executions);
    VG_(gdb_printf)("executed-calls: %llu\n", CLG_(stat).call_counter);
    VG_(gdb_printf)("distinct-bbs: %d\n", CLG_(stat).distinct_bbs);
    VG_(gdb_printf)("distinct-calls: %d\n", CLG_(stat).distinct_jccs);
    VG_(gdb_printf)("distinct-functions: %d\n", CLG_(stat).distinct_fns);
    VG_(gdb_printf)("distinct-contexts: %d\n", CLG_(stat).distinct_contexts);

    /* "events:" line. Given here because it will be dynamic in the future */
    HChar *evmap = CLG_(eventmapping_as_string)(CLG_(dumpmap));
    VG_(gdb_printf)("events: %s\n", evmap);
    VG_(free)(evmap);
1544    /* "part:" line (number of last part. Is 0 at start */
1545    VG_(gdb_printf)("part: %d\n", CLG_(get_dump_counter)());
1546
1547    /* threads */
1548    th = CLG_(get_threads)();
1549    VG_(gdb_printf)("threads:");
1550    for(t=1;t<VG_N_THREADS;t++) {
1551	if (!th[t]) continue;
1552	VG_(gdb_printf)(" %d", t);
1553    }
1554    VG_(gdb_printf)("\n");
1555    VG_(gdb_printf)("current-tid: %d\n", orig_tid);
1556    CLG_(forall_threads)(dump_state_of_thread_togdb);
1557}
1558
1559
1560static void print_monitor_help ( void )
1561{
1562   VG_(gdb_printf) ("\n");
1563   VG_(gdb_printf) ("callgrind monitor commands:\n");
1564   VG_(gdb_printf) ("  dump [<dump_hint>]\n");
1565   VG_(gdb_printf) ("        dump counters\n");
1566   VG_(gdb_printf) ("  zero\n");
1567   VG_(gdb_printf) ("        zero counters\n");
1568   VG_(gdb_printf) ("  status\n");
1569   VG_(gdb_printf) ("        print status\n");
1570   VG_(gdb_printf) ("  instrumentation [on|off]\n");
1571   VG_(gdb_printf) ("        get/set (if on/off given) instrumentation state\n");
1572   VG_(gdb_printf) ("\n");
1573}
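
/* Example of a monitor session via vgdb (illustrative transcript; the
   thread count shown is hypothetical):
       (gdb) monitor status
       8 thread(s) running.
       (gdb) monitor instrumentation
       instrumentation: on
       (gdb) monitor dump
   "monitor status internal" emits the machine-readable state dump
   consumed by callgrind_control. */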
1574
/* return True if request recognised, False otherwise */
static Bool handle_gdb_monitor_command (ThreadId tid, const HChar *req)
{
   HChar* wcmd;
   HChar s[VG_(strlen)(req) + 1]; /* copy for strtok_r */
   HChar *ssaveptr;

   VG_(strcpy) (s, req);

   wcmd = VG_(strtok_r) (s, " ", &ssaveptr);
   switch (VG_(keyword_id) ("help dump zero status instrumentation",
                            wcmd, kwd_report_duplicated_matches)) {
   case -2: /* multiple matches */
      return True;
   case -1: /* not found */
      return False;
   case  0: /* help */
      print_monitor_help();
      return True;
   case  1: { /* dump */
      CLG_(dump_profile)(req, False);
      return True;
   }
   case  2: { /* zero */
      CLG_(zero_all_cost)(False);
      return True;
   }

   case 3: { /* status */
     HChar* arg = VG_(strtok_r) (0, " ", &ssaveptr);
     if (arg && (VG_(strcmp)(arg, "internal") == 0)) {
       /* internal interface to callgrind_control */
       dump_state_togdb();
       return True;
     }

     if (!CLG_(instrument_state)) {
       VG_(gdb_printf)("No status available as instrumentation is switched off\n");
     } else {
       // Status information to be improved ...
       thread_info** th = CLG_(get_threads)();
       Int t, tcount = 0;
       for(t=1;t<VG_N_THREADS;t++)
         if (th[t]) tcount++;
       VG_(gdb_printf)("%d thread(s) running.\n", tcount);
     }
     return True;
   }

   case 4: { /* instrumentation */
     HChar* arg = VG_(strtok_r) (0, " ", &ssaveptr);
     if (!arg) {
       VG_(gdb_printf)("instrumentation: %s\n",
                       CLG_(instrument_state) ? "on":"off");
     }
     else
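       /* Any argument other than "off" switches instrumentation on. */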
       CLG_(set_instrument_state)("Command", VG_(strcmp)(arg,"off")!=0);
     return True;
   }

   default:
      tl_assert(0);
      return False;
   }
}

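/* Client requests are typically issued with the macros from callgrind.h,
   which expand to the VG_USERREQ__* requests handled below. A client-side
   sketch:

     #include <valgrind/callgrind.h>

     CALLGRIND_ZERO_STATS;      // VG_USERREQ__ZERO_STATS
     // ... region of interest ...
     CALLGRIND_DUMP_STATS;      // VG_USERREQ__DUMP_STATS
*/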
static
Bool CLG_(handle_client_request)(ThreadId tid, UWord *args, UWord *ret)
{
   if (!VG_IS_TOOL_USERREQ('C','T',args[0])
       && VG_USERREQ__GDB_MONITOR_COMMAND   != args[0])
      return False;

   switch(args[0]) {
   case VG_USERREQ__DUMP_STATS:
      CLG_(dump_profile)("Client Request", True);
      *ret = 0;                 /* meaningless */
      break;

   case VG_USERREQ__DUMP_STATS_AT:
     {
       const HChar *arg = (HChar*)args[1];
       HChar buf[30 + VG_(strlen)(arg)];    // large enough
       VG_(sprintf)(buf,"Client Request: %s", arg);
       CLG_(dump_profile)(buf, True);
       *ret = 0;                 /* meaningless */
     }
     break;

   case VG_USERREQ__ZERO_STATS:
      CLG_(zero_all_cost)(True);
      *ret = 0;                 /* meaningless */
      break;

   case VG_USERREQ__TOGGLE_COLLECT:
     CLG_(current_state).collect = !CLG_(current_state).collect;
     CLG_DEBUG(2, "Client Request: toggled collection state to %s\n",
               CLG_(current_state).collect ? "ON" : "OFF");
     *ret = 0;                 /* meaningless */
     break;

   case VG_USERREQ__START_INSTRUMENTATION:
     CLG_(set_instrument_state)("Client Request", True);
     *ret = 0;                 /* meaningless */
     break;

   case VG_USERREQ__STOP_INSTRUMENTATION:
     CLG_(set_instrument_state)("Client Request", False);
     *ret = 0;                 /* meaningless */
     break;

   case VG_USERREQ__GDB_MONITOR_COMMAND: {
      Bool handled = handle_gdb_monitor_command (tid, (HChar*)args[1]);
      if (handled)
         *ret = 1;
      else
         *ret = 0;
      return handled;
   }
   default:
      return False;
   }

   return True;
}


/* Syscall Timing */

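/* Per-thread timestamp taken at syscall entry: microseconds when
   CLG_MICROSYSTIME is defined, milliseconds otherwise. */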
/* struct timeval syscalltime[VG_N_THREADS]; */
#if CLG_MICROSYSTIME
ULong *syscalltime;
#else
UInt *syscalltime;
#endif

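/* Record the time at syscall entry; paired with CLG_(post_syscalltime)
   below. */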
static
void CLG_(pre_syscalltime)(ThreadId tid, UInt syscallno,
                           UWord* args, UInt nArgs)
{
  if (CLG_(clo).collect_systime) {
#if CLG_MICROSYSTIME
    struct vki_timeval tv_now;
    VG_(gettimeofday)(&tv_now, NULL);
    syscalltime[tid] = tv_now.tv_sec * 1000000ULL + tv_now.tv_usec;
#else
    syscalltime[tid] = VG_(read_millisecond_timer)();
#endif
  }
}

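/* At syscall exit, charge the time elapsed since syscall entry to the
   system call events of the current cost center. */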
static
void CLG_(post_syscalltime)(ThreadId tid, UInt syscallno,
                            UWord* args, UInt nArgs, SysRes res)
{
  if (CLG_(clo).collect_systime &&
      CLG_(current_state).bbcc) {
    Int o;
#if CLG_MICROSYSTIME
    struct vki_timeval tv_now;
    ULong diff;

    VG_(gettimeofday)(&tv_now, NULL);
    diff = (tv_now.tv_sec * 1000000ULL + tv_now.tv_usec) - syscalltime[tid];
#else
    UInt diff = VG_(read_millisecond_timer)() - syscalltime[tid];
#endif

    /* offset o is for "SysCount", o+1 for "SysTime" */
    o = fullOffset(EG_SYS);
    CLG_ASSERT(o>=0);
    CLG_DEBUG(0,"   Time (Off %d) for Syscall %u: %llu\n", o, syscallno,
              (ULong)diff);

    CLG_(current_state).cost[o] ++;
    CLG_(current_state).cost[o+1] += diff;
    if (!CLG_(current_state).bbcc->skipped)
      CLG_(init_cost_lz)(CLG_(sets).full,
                         &(CLG_(current_state).bbcc->skipped));
    CLG_(current_state).bbcc->skipped[o] ++;
    CLG_(current_state).bbcc->skipped[o+1] += diff;
  }
}

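/* Width in characters of n when printed with thousands separators,
   e.g. n = 1234567 needs 7 digits + 2 commas = 9 characters. */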
static UInt ULong_width(ULong n)
{
   UInt w = 0;
   while (n > 0) {
      n = n / 10;
      w++;
   }
   if (w == 0) w = 1;
   return w + (w-1)/3;   // add space for commas
}

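/* Print totals and mispredict rates for the branch simulation, analogous
   to the cache simulator's printstat. */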
static
void branchsim_printstat(int l1, int l2, int l3)
{
    static HChar fmt[128];    // large enough
    FullCost total;
    ULong Bc_total_b, Bc_total_mp, Bi_total_b, Bi_total_mp;
    ULong B_total_b, B_total_mp;

    total = CLG_(total_cost);
    Bc_total_b  = total[ fullOffset(EG_BC)   ];
    Bc_total_mp = total[ fullOffset(EG_BC)+1 ];
    Bi_total_b  = total[ fullOffset(EG_BI)   ];
    Bi_total_mp = total[ fullOffset(EG_BI)+1 ];

    /* Make format string, getting width right for numbers */
    VG_(sprintf)(fmt, "%%s %%,%dllu  (%%,%dllu cond + %%,%dllu ind)\n",
                 l1, l2, l3);

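    /* avoid division by zero in the mispredict rates below */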
    if (0 == Bc_total_b)  Bc_total_b = 1;
    if (0 == Bi_total_b)  Bi_total_b = 1;
    B_total_b  = Bc_total_b  + Bi_total_b;
    B_total_mp = Bc_total_mp + Bi_total_mp;

    VG_(umsg)("\n");
    VG_(umsg)(fmt, "Branches:     ",
              B_total_b, Bc_total_b, Bi_total_b);

    VG_(umsg)(fmt, "Mispredicts:  ",
              B_total_mp, Bc_total_mp, Bi_total_mp);

    VG_(umsg)("Mispred rate:  %*.1f%% (%*.1f%%     + %*.1f%%   )\n",
              l1, B_total_mp  * 100.0 / B_total_b,
              l2, Bc_total_mp * 100.0 / Bc_total_b,
              l3, Bi_total_mp * 100.0 / Bi_total_b);
}

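/* Print Callgrind's internal statistics; registered with the core via
   VG_(needs_print_stats) below. */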
static
void clg_print_stats(void)
{
   int BB_lookups =
     CLG_(stat).full_debug_BBs +
     CLG_(stat).fn_name_debug_BBs +
     CLG_(stat).file_line_debug_BBs +
     CLG_(stat).no_debug_BBs;

   /* Hash table stats */
   VG_(message)(Vg_DebugMsg, "Distinct objects: %d\n",
                CLG_(stat).distinct_objs);
   VG_(message)(Vg_DebugMsg, "Distinct files:   %d\n",
                CLG_(stat).distinct_files);
   VG_(message)(Vg_DebugMsg, "Distinct fns:     %d\n",
                CLG_(stat).distinct_fns);
   VG_(message)(Vg_DebugMsg, "Distinct contexts:%d\n",
                CLG_(stat).distinct_contexts);
   VG_(message)(Vg_DebugMsg, "Distinct BBs:     %d\n",
                CLG_(stat).distinct_bbs);
   VG_(message)(Vg_DebugMsg, "Cost entries:     %u (Chunks %u)\n",
                CLG_(costarray_entries), CLG_(costarray_chunks));
   VG_(message)(Vg_DebugMsg, "Distinct BBCCs:   %d\n",
                CLG_(stat).distinct_bbccs);
   VG_(message)(Vg_DebugMsg, "Distinct JCCs:    %d\n",
                CLG_(stat).distinct_jccs);
   VG_(message)(Vg_DebugMsg, "Distinct skips:   %d\n",
                CLG_(stat).distinct_skips);
   VG_(message)(Vg_DebugMsg, "BB lookups:       %d\n",
                BB_lookups);
   if (BB_lookups>0) {
      VG_(message)(Vg_DebugMsg, "With full      debug info:%3d%% (%d)\n",
                   CLG_(stat).full_debug_BBs    * 100 / BB_lookups,
                   CLG_(stat).full_debug_BBs);
      VG_(message)(Vg_DebugMsg, "With file/line debug info:%3d%% (%d)\n",
                   CLG_(stat).file_line_debug_BBs * 100 / BB_lookups,
                   CLG_(stat).file_line_debug_BBs);
      VG_(message)(Vg_DebugMsg, "With fn name   debug info:%3d%% (%d)\n",
                   CLG_(stat).fn_name_debug_BBs * 100 / BB_lookups,
                   CLG_(stat).fn_name_debug_BBs);
      VG_(message)(Vg_DebugMsg, "With no        debug info:%3d%% (%d)\n",
                   CLG_(stat).no_debug_BBs      * 100 / BB_lookups,
                   CLG_(stat).no_debug_BBs);
   }
   VG_(message)(Vg_DebugMsg, "BBCC Clones:       %d\n",
                CLG_(stat).bbcc_clones);
   VG_(message)(Vg_DebugMsg, "BBs Retranslated:  %d\n",
                CLG_(stat).bb_retranslations);
   VG_(message)(Vg_DebugMsg, "Distinct instrs:   %d\n",
                CLG_(stat).distinct_instrs);

   VG_(message)(Vg_DebugMsg, "LRU Contxt Misses: %d\n",
                CLG_(stat).cxt_lru_misses);
   VG_(message)(Vg_DebugMsg, "LRU BBCC Misses:   %d\n",
                CLG_(stat).bbcc_lru_misses);
   VG_(message)(Vg_DebugMsg, "LRU JCC Misses:    %d\n",
                CLG_(stat).jcc_lru_misses);
   VG_(message)(Vg_DebugMsg, "BBs Executed:      %llu\n",
                CLG_(stat).bb_executions);
   VG_(message)(Vg_DebugMsg, "Calls:             %llu\n",
                CLG_(stat).call_counter);
   VG_(message)(Vg_DebugMsg, "CondJMP followed:  %llu\n",
                CLG_(stat).jcnd_counter);
   VG_(message)(Vg_DebugMsg, "Boring JMPs:       %llu\n",
                CLG_(stat).jump_counter);
   VG_(message)(Vg_DebugMsg, "Recursive calls:   %llu\n",
                CLG_(stat).rec_call_counter);
   VG_(message)(Vg_DebugMsg, "Returns:           %llu\n",
                CLG_(stat).ret_counter);
}

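/* Final dump and summary output; called from CLG_(fini) below. */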
static
void finish(void)
{
  HChar fmt[128];    // large enough
  Int l1, l2, l3;
  FullCost total;

  CLG_DEBUG(0, "finish()\n");

  (*CLG_(cachesim).finish)();

  /* Pop all remaining items from the call stack to get correct sums. */
  CLG_(forall_threads)(unwind_thread);

  CLG_(dump_profile)(0, False);

  if (VG_(clo_verbosity) == 0) return;

  if (VG_(clo_stats)) {
    VG_(message)(Vg_DebugMsg, "\n");
    clg_print_stats();
    VG_(message)(Vg_DebugMsg, "\n");
  }

  HChar *evmap = CLG_(eventmapping_as_string)(CLG_(dumpmap));
  VG_(message)(Vg_UserMsg, "Events    : %s\n", evmap);
  VG_(free)(evmap);
  HChar *mcost = CLG_(mappingcost_as_string)(CLG_(dumpmap), CLG_(total_cost));
  VG_(message)(Vg_UserMsg, "Collected : %s\n", mcost);
  VG_(free)(mcost);
  VG_(message)(Vg_UserMsg, "\n");

  /* determine value widths for statistics */
  total = CLG_(total_cost);
  l1 = ULong_width( total[fullOffset(EG_IR)] );
  l2 = l3 = 0;
  if (CLG_(clo).simulate_cache) {
      l2 = ULong_width( total[fullOffset(EG_DR)] );
      l3 = ULong_width( total[fullOffset(EG_DW)] );
  }
  if (CLG_(clo).simulate_branch) {
      int l2b = ULong_width( total[fullOffset(EG_BC)] );
      int l3b = ULong_width( total[fullOffset(EG_BI)] );
      if (l2b > l2) l2 = l2b;
      if (l3b > l3) l3 = l3b;
  }

  /* Make format string, getting width right for numbers */
  VG_(sprintf)(fmt, "%%s %%,%dllu\n", l1);

  /* Always print this */
  VG_(umsg)(fmt, "I   refs:     ", total[fullOffset(EG_IR)] );

  if (CLG_(clo).simulate_cache)
      (*CLG_(cachesim).printstat)(l1, l2, l3);

  if (CLG_(clo).simulate_branch)
      branchsim_printstat(l1, l2, l3);
}


void CLG_(fini)(Int exitcode)
{
  finish();
}


/*--------------------------------------------------------------------*/
/*--- Setup                                                        ---*/
/*--------------------------------------------------------------------*/

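/* Called by the core each time client code is (re)started; registered
   below via VG_(track_start_client_code). */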
static void clg_start_client_code_callback ( ThreadId tid, ULong blocks_done )
{
   static ULong last_blocks_done = 0;

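   /* debugging aid, normally disabled */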
   if (0)
      VG_(printf)("%d R %llu\n", (Int)tid, blocks_done);

   /* throttle calls to CLG_(run_thread) by number of BBs executed */
   if (blocks_done - last_blocks_done < 5000) return;
   last_blocks_done = blocks_done;

   CLG_(run_thread)( tid );
}

static
void CLG_(post_clo_init)(void)
{
   if (VG_(clo_vex_control).iropt_register_updates_default
       != VexRegUpdSpAtMemAccess) {
      CLG_DEBUG(1, " Using user specified value for "
                "--vex-iropt-register-updates\n");
   } else {
      CLG_DEBUG(1,
                " Using default --vex-iropt-register-updates="
                "sp-at-mem-access\n");
   }

   if (CLG_(clo).collect_systime) {
      VG_(needs_syscall_wrapper)(CLG_(pre_syscalltime),
                                 CLG_(post_syscalltime));
      syscalltime = CLG_MALLOC("cl.main.pci.1",
                               VG_N_THREADS * sizeof syscalltime[0]);
      for (UInt i = 0; i < VG_N_THREADS; ++i) {
         syscalltime[i] = 0;
      }
   }

   if (VG_(clo_px_file_backed) != VexRegUpdSpAtMemAccess) {
      CLG_DEBUG(1, " Using user specified value for "
                "--px-file-backed\n");
   } else {
      CLG_DEBUG(1,
                " Using default --px-file-backed="
                "sp-at-mem-access\n");
   }

   if (VG_(clo_vex_control).iropt_unroll_thresh != 0) {
      VG_(message)(Vg_UserMsg,
                   "callgrind only works with --vex-iropt-unroll-thresh=0\n"
                   "=> resetting it back to 0\n");
      VG_(clo_vex_control).iropt_unroll_thresh = 0;   // cannot be overridden.
   }
   if (VG_(clo_vex_control).guest_chase_thresh != 0) {
      VG_(message)(Vg_UserMsg,
                   "callgrind only works with --vex-guest-chase-thresh=0\n"
                   "=> resetting it back to 0\n");
      VG_(clo_vex_control).guest_chase_thresh = 0; // cannot be overridden.
   }

   CLG_DEBUG(1, "  dump threads: %s\n", CLG_(clo).separate_threads ? "Yes":"No");
   CLG_DEBUG(1, "  call sep. : %d\n", CLG_(clo).separate_callers);
   CLG_DEBUG(1, "  rec. sep. : %d\n", CLG_(clo).separate_recursions);

   if (!CLG_(clo).dump_line && !CLG_(clo).dump_instr && !CLG_(clo).dump_bb) {
       VG_(message)(Vg_UserMsg, "Using source line as position.\n");
       CLG_(clo).dump_line = True;
   }

   CLG_(init_dumps)();

   (*CLG_(cachesim).post_clo_init)();

   CLG_(init_eventsets)();
   CLG_(init_statistics)(& CLG_(stat));
   CLG_(init_cost_lz)( CLG_(sets).full, &CLG_(total_cost) );

   /* initialize hash tables */
   CLG_(init_obj_table)();
   CLG_(init_cxt_table)();
   CLG_(init_bb_hash)();

   CLG_(init_threads)();
   CLG_(run_thread)(1);

   CLG_(instrument_state) = CLG_(clo).instrument_atstart;

   if (VG_(clo_verbosity) > 0) {
      VG_(message)(Vg_UserMsg,
                   "For interactive control, run 'callgrind_control%s%s -h'.\n",
                   (VG_(arg_vgdb_prefix) ? " " : ""),
                   (VG_(arg_vgdb_prefix) ? VG_(arg_vgdb_prefix) : ""));
   }
}

static
void CLG_(pre_clo_init)(void)
{
    VG_(details_name)            ("Callgrind");
    VG_(details_version)         (NULL);
    VG_(details_description)     ("a call-graph generating cache profiler");
    VG_(details_copyright_author)("Copyright (C) 2002-2017, and GNU GPL'd, "
                                  "by Josef Weidendorfer et al.");
    VG_(details_bug_reports_to)  (VG_BUGS_TO);
    VG_(details_avg_translation_sizeB) ( 500 );

    VG_(clo_vex_control).iropt_register_updates_default
       = VG_(clo_px_file_backed)
       = VexRegUpdSpAtMemAccess; // overridable by the user.

    VG_(clo_vex_control).iropt_unroll_thresh = 0;   // cannot be overridden.
    VG_(clo_vex_control).guest_chase_thresh = 0;    // cannot be overridden.

    VG_(basic_tool_funcs)        (CLG_(post_clo_init),
                                  CLG_(instrument),
                                  CLG_(fini));

    VG_(needs_superblock_discards)(clg_discard_superblock_info);

    VG_(needs_command_line_options)(CLG_(process_cmd_line_option),
                                    CLG_(print_usage),
                                    CLG_(print_debug_usage));

    VG_(needs_client_requests)(CLG_(handle_client_request));
    VG_(needs_print_stats)    (clg_print_stats);

    VG_(track_start_client_code)  ( & clg_start_client_code_callback );
    VG_(track_pre_deliver_signal) ( & CLG_(pre_signal) );
    VG_(track_post_deliver_signal)( & CLG_(post_signal) );

    CLG_(set_clo_defaults)();
}

VG_DETERMINE_INTERFACE_VERSION(CLG_(pre_clo_init))

/*--------------------------------------------------------------------*/
/*--- end                                                   main.c ---*/
/*--------------------------------------------------------------------*/