1
2/*--------------------------------------------------------------------*/
3/*--- Thread scheduling.                               scheduler.c ---*/
4/*--------------------------------------------------------------------*/
5
6/*
7   This file is part of Valgrind, a dynamic binary instrumentation
8   framework.
9
10   Copyright (C) 2000-2013 Julian Seward
11      jseward@acm.org
12
13   This program is free software; you can redistribute it and/or
14   modify it under the terms of the GNU General Public License as
15   published by the Free Software Foundation; either version 2 of the
16   License, or (at your option) any later version.
17
18   This program is distributed in the hope that it will be useful, but
19   WITHOUT ANY WARRANTY; without even the implied warranty of
20   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
21   General Public License for more details.
22
23   You should have received a copy of the GNU General Public License
24   along with this program; if not, write to the Free Software
25   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
26   02111-1307, USA.
27
28   The GNU General Public License is contained in the file COPYING.
29*/
30
31/*
32   Overview
33
34   Valgrind tries to emulate the kernel's threading as closely as
35   possible.  The client does all threading via the normal syscalls
36   (on Linux: clone, etc).  Valgrind emulates this by creating exactly
37   the same process structure as would be created without Valgrind.
38   There are no extra threads.
39
40   The main difference is that Valgrind only allows one client thread
41   to run at once.  This is controlled with the CPU Big Lock,
42   "the_BigLock".  Any time a thread wants to run client code or
43   manipulate any shared state (which is anything other than its own
44   ThreadState entry), it must hold the_BigLock.
45
46   When a thread is about to block in a blocking syscall, it releases
47   the_BigLock, and re-takes it when it becomes runnable again (either
48   because the syscall finished, or we took a signal).
49
50   VG_(scheduler) therefore runs in each thread.  It returns only when
51   the thread is exiting, either because it exited itself, or it was
52   told to exit by another thread.
53
54   This file is almost entirely OS-independent.  The details of how
55   the OS handles threading and signalling are abstracted away and
56   implemented elsewhere.  [Some of the functions have worked their
57   way back for the moment, until we do an OS port in earnest...]
58*/
59
60
61#include "pub_core_basics.h"
62#include "pub_core_debuglog.h"
63#include "pub_core_vki.h"
64#include "pub_core_vkiscnums.h"    // __NR_sched_yield
65#include "pub_core_libcsetjmp.h"   // to keep _threadstate.h happy
66#include "pub_core_threadstate.h"
67#include "pub_core_aspacemgr.h"
68#include "pub_core_clreq.h"         // for VG_USERREQ__*
69#include "pub_core_dispatch.h"
70#include "pub_core_errormgr.h"      // For VG_(get_n_errs_found)()
71#include "pub_core_gdbserver.h"     // for VG_(gdbserver) and VG_(gdbserver_activity)
72#include "pub_core_libcbase.h"
73#include "pub_core_libcassert.h"
74#include "pub_core_libcprint.h"
75#include "pub_core_libcproc.h"
76#include "pub_core_libcsignal.h"
77#if defined(VGO_darwin)
78#include "pub_core_mach.h"
79#endif
80#include "pub_core_machine.h"
81#include "pub_core_mallocfree.h"
82#include "pub_core_options.h"
83#include "pub_core_replacemalloc.h"
84#include "pub_core_sbprofile.h"
85#include "pub_core_signals.h"
86#include "pub_core_stacks.h"
87#include "pub_core_stacktrace.h"    // For VG_(get_and_pp_StackTrace)()
88#include "pub_core_syscall.h"
89#include "pub_core_syswrap.h"
90#include "pub_core_tooliface.h"
91#include "pub_core_translate.h"     // For VG_(translate)()
92#include "pub_core_transtab.h"
93#include "pub_core_debuginfo.h"     // VG_(di_notify_pdb_debuginfo)
94#include "priv_sched-lock.h"
95#include "pub_core_scheduler.h"     // self
96#include "pub_core_redir.h"
97#include "libvex_emnote.h"          // VexEmNote
98
99
100/* ---------------------------------------------------------------------
101   Types and globals for the scheduler.
102   ------------------------------------------------------------------ */
103
104/* ThreadId and ThreadState are defined elsewhere*/
105
106/* Defines the thread-scheduling timeslice, in terms of the number of
107   basic blocks we attempt to run each thread for.  Smaller values
108   give finer interleaving but much increased scheduling overheads. */
109#define SCHEDULING_QUANTUM   100000
110
111/* If False, a fault is Valgrind-internal (ie, a bug) */
112Bool VG_(in_generated_code) = False;
113
114/* 64-bit counter for the number of basic blocks done. */
115static ULong bbs_done = 0;
116
117/* Counter to see if vgdb activity is to be verified.
118   When nr of bbs done reaches vgdb_next_poll, scheduler will
119   poll for gdbserver activity. VG_(force_vgdb_poll) and
120   VG_(disable_vgdb_poll) allows the valgrind core (e.g. m_gdbserver)
121   to control when the next poll will be done. */
122static ULong vgdb_next_poll;
123
124/* Forwards */
125static void do_client_request ( ThreadId tid );
126static void scheduler_sanity ( ThreadId tid );
127static void mostly_clear_thread_record ( ThreadId tid );
128
129/* Stats. */
130static ULong n_scheduling_events_MINOR = 0;
131static ULong n_scheduling_events_MAJOR = 0;
132
133/* Stats: number of XIndirs, and number that missed in the fast
134   cache. */
135static ULong stats__n_xindirs = 0;
136static ULong stats__n_xindir_misses = 0;
137
138/* And 32-bit temp bins for the above, so that 32-bit platforms don't
139   have to do 64 bit incs on the hot path through
140   VG_(cp_disp_xindir). */
141/*global*/ UInt VG_(stats__n_xindirs_32) = 0;
142/*global*/ UInt VG_(stats__n_xindir_misses_32) = 0;
143
144/* Sanity checking counts. */
145static UInt sanity_fast_count = 0;
146static UInt sanity_slow_count = 0;
147
148void VG_(print_scheduler_stats)(void)
149{
150   VG_(message)(Vg_DebugMsg,
151      "scheduler: %'llu event checks.\n", bbs_done );
152   VG_(message)(Vg_DebugMsg,
153                "scheduler: %'llu indir transfers, %'llu misses (1 in %llu)\n",
154                stats__n_xindirs, stats__n_xindir_misses,
155                stats__n_xindirs / (stats__n_xindir_misses
156                                    ? stats__n_xindir_misses : 1));
157   VG_(message)(Vg_DebugMsg,
158      "scheduler: %'llu/%'llu major/minor sched events.\n",
159      n_scheduling_events_MAJOR, n_scheduling_events_MINOR);
160   VG_(message)(Vg_DebugMsg,
161                "   sanity: %d cheap, %d expensive checks.\n",
162                sanity_fast_count, sanity_slow_count );
163}
164
165/*
166 * Mutual exclusion object used to serialize threads.
167 */
168static struct sched_lock *the_BigLock;
169
170
171/* ---------------------------------------------------------------------
172   Helper functions for the scheduler.
173   ------------------------------------------------------------------ */
174
175static
176void print_sched_event ( ThreadId tid, const HChar* what )
177{
178   VG_(message)(Vg_DebugMsg, "  SCHED[%d]: %s\n", tid, what );
179}
180
181/* For showing SB profiles, if the user asks to see them. */
182static
183void maybe_show_sb_profile ( void )
184{
185   /* DO NOT MAKE NON-STATIC */
186   static ULong bbs_done_lastcheck = 0;
187   /* */
188   vg_assert(VG_(clo_profyle_interval) > 0);
189   Long delta = (Long)(bbs_done - bbs_done_lastcheck);
190   vg_assert(delta >= 0);
191   if ((ULong)delta >= VG_(clo_profyle_interval)) {
192      bbs_done_lastcheck = bbs_done;
193      VG_(get_and_show_SB_profile)(bbs_done);
194   }
195}
196
197static
198const HChar* name_of_sched_event ( UInt event )
199{
200   switch (event) {
201      case VEX_TRC_JMP_INVALICACHE:    return "INVALICACHE";
202      case VEX_TRC_JMP_FLUSHDCACHE:    return "FLUSHDCACHE";
203      case VEX_TRC_JMP_NOREDIR:        return "NOREDIR";
204      case VEX_TRC_JMP_SIGILL:         return "SIGILL";
205      case VEX_TRC_JMP_SIGTRAP:        return "SIGTRAP";
206      case VEX_TRC_JMP_SIGSEGV:        return "SIGSEGV";
207      case VEX_TRC_JMP_SIGBUS:         return "SIGBUS";
208      case VEX_TRC_JMP_SIGFPE_INTOVF:
209      case VEX_TRC_JMP_SIGFPE_INTDIV:  return "SIGFPE";
210      case VEX_TRC_JMP_EMWARN:         return "EMWARN";
211      case VEX_TRC_JMP_EMFAIL:         return "EMFAIL";
212      case VEX_TRC_JMP_CLIENTREQ:      return "CLIENTREQ";
213      case VEX_TRC_JMP_YIELD:          return "YIELD";
214      case VEX_TRC_JMP_NODECODE:       return "NODECODE";
215      case VEX_TRC_JMP_MAPFAIL:        return "MAPFAIL";
216      case VEX_TRC_JMP_SYS_SYSCALL:    return "SYSCALL";
217      case VEX_TRC_JMP_SYS_INT32:      return "INT32";
218      case VEX_TRC_JMP_SYS_INT128:     return "INT128";
219      case VEX_TRC_JMP_SYS_INT129:     return "INT129";
220      case VEX_TRC_JMP_SYS_INT130:     return "INT130";
221      case VEX_TRC_JMP_SYS_SYSENTER:   return "SYSENTER";
222      case VEX_TRC_JMP_BORING:         return "VEX_BORING";
223
224      case VG_TRC_BORING:              return "VG_BORING";
225      case VG_TRC_INNER_FASTMISS:      return "FASTMISS";
226      case VG_TRC_INNER_COUNTERZERO:   return "COUNTERZERO";
227      case VG_TRC_FAULT_SIGNAL:        return "FAULTSIGNAL";
228      case VG_TRC_INVARIANT_FAILED:    return "INVFAILED";
229      case VG_TRC_CHAIN_ME_TO_SLOW_EP: return "CHAIN_ME_SLOW";
230      case VG_TRC_CHAIN_ME_TO_FAST_EP: return "CHAIN_ME_FAST";
231      default:                         return "??UNKNOWN??";
232  }
233}
234
235/* Allocate a completely empty ThreadState record. */
236ThreadId VG_(alloc_ThreadState) ( void )
237{
238   Int i;
239   for (i = 1; i < VG_N_THREADS; i++) {
240      if (VG_(threads)[i].status == VgTs_Empty) {
241	 VG_(threads)[i].status = VgTs_Init;
242	 VG_(threads)[i].exitreason = VgSrc_None;
243         if (VG_(threads)[i].thread_name)
244            VG_(arena_free)(VG_AR_CORE, VG_(threads)[i].thread_name);
245         VG_(threads)[i].thread_name = NULL;
246         return i;
247      }
248   }
249   VG_(printf)("vg_alloc_ThreadState: no free slots available\n");
250   VG_(printf)("Increase VG_N_THREADS, rebuild and try again.\n");
251   VG_(core_panic)("VG_N_THREADS is too low");
252   /*NOTREACHED*/
253}
254
255/*
256   Mark a thread as Runnable.  This will block until the_BigLock is
257   available, so that we get exclusive access to all the shared
258   structures and the CPU.  Up until we get the_BigLock, we must not
259   touch any shared state.
260
261   When this returns, we'll actually be running.
262 */
263void VG_(acquire_BigLock)(ThreadId tid, const HChar* who)
264{
265   ThreadState *tst;
266
267#if 0
268   if (VG_(clo_trace_sched)) {
269      HChar buf[100];
270      vg_assert(VG_(strlen)(who) <= 100-50);
271      VG_(sprintf)(buf, "waiting for lock (%s)", who);
272      print_sched_event(tid, buf);
273   }
274#endif
275
276   /* First, acquire the_BigLock.  We can't do anything else safely
277      prior to this point.  Even doing debug printing prior to this
278      point is, technically, wrong. */
279   VG_(acquire_BigLock_LL)(NULL);
280
281   tst = VG_(get_ThreadState)(tid);
282
283   vg_assert(tst->status != VgTs_Runnable);
284
285   tst->status = VgTs_Runnable;
286
287   if (VG_(running_tid) != VG_INVALID_THREADID)
288      VG_(printf)("tid %d found %d running\n", tid, VG_(running_tid));
289   vg_assert(VG_(running_tid) == VG_INVALID_THREADID);
290   VG_(running_tid) = tid;
291
292   { Addr gsp = VG_(get_SP)(tid);
293      if (NULL != VG_(tdict).track_new_mem_stack_w_ECU)
294         VG_(unknown_SP_update_w_ECU)(gsp, gsp, 0/*unknown origin*/);
295      else
296         VG_(unknown_SP_update)(gsp, gsp);
297   }
298
299   if (VG_(clo_trace_sched)) {
300      HChar buf[150];
301      vg_assert(VG_(strlen)(who) <= 150-50);
302      VG_(sprintf)(buf, " acquired lock (%s)", who);
303      print_sched_event(tid, buf);
304   }
305}
306
307/*
308   Set a thread into a sleeping state, and give up exclusive access to
309   the CPU.  On return, the thread must be prepared to block until it
310   is ready to run again (generally this means blocking in a syscall,
311   but it may mean that we remain in a Runnable state and we're just
312   yielding the CPU to another thread).
313 */
314void VG_(release_BigLock)(ThreadId tid, ThreadStatus sleepstate,
315                          const HChar* who)
316{
317   ThreadState *tst = VG_(get_ThreadState)(tid);
318
319   vg_assert(tst->status == VgTs_Runnable);
320
321   vg_assert(sleepstate == VgTs_WaitSys ||
322	     sleepstate == VgTs_Yielding);
323
324   tst->status = sleepstate;
325
326   vg_assert(VG_(running_tid) == tid);
327   VG_(running_tid) = VG_INVALID_THREADID;
328
329   if (VG_(clo_trace_sched)) {
330      HChar buf[200];
331      vg_assert(VG_(strlen)(who) <= 200-100);
332      VG_(sprintf)(buf, "releasing lock (%s) -> %s",
333                        who, VG_(name_of_ThreadStatus)(sleepstate));
334      print_sched_event(tid, buf);
335   }
336
337   /* Release the_BigLock; this will reschedule any runnable
338      thread. */
339   VG_(release_BigLock_LL)(NULL);
340}
341
342static void init_BigLock(void)
343{
344   vg_assert(!the_BigLock);
345   the_BigLock = ML_(create_sched_lock)();
346}
347
348static void deinit_BigLock(void)
349{
350   ML_(destroy_sched_lock)(the_BigLock);
351   the_BigLock = NULL;
352}
353
354/* See pub_core_scheduler.h for description */
355void VG_(acquire_BigLock_LL) ( const HChar* who )
356{
357   ML_(acquire_sched_lock)(the_BigLock);
358}
359
360/* See pub_core_scheduler.h for description */
361void VG_(release_BigLock_LL) ( const HChar* who )
362{
363   ML_(release_sched_lock)(the_BigLock);
364}
365
366Bool VG_(owns_BigLock_LL) ( ThreadId tid )
367{
368   return (ML_(get_sched_lock_owner)(the_BigLock)
369           == VG_(threads)[tid].os_state.lwpid);
370}
371
372
373/* Clear out the ThreadState and release the semaphore. Leaves the
374   ThreadState in VgTs_Zombie state, so that it doesn't get
375   reallocated until the caller is really ready. */
376void VG_(exit_thread)(ThreadId tid)
377{
378   vg_assert(VG_(is_valid_tid)(tid));
379   vg_assert(VG_(is_running_thread)(tid));
380   vg_assert(VG_(is_exiting)(tid));
381
382   mostly_clear_thread_record(tid);
383   VG_(running_tid) = VG_INVALID_THREADID;
384
385   /* There should still be a valid exitreason for this thread */
386   vg_assert(VG_(threads)[tid].exitreason != VgSrc_None);
387
388   if (VG_(clo_trace_sched))
389      print_sched_event(tid, "release lock in VG_(exit_thread)");
390
391   VG_(release_BigLock_LL)(NULL);
392}
393
394/* If 'tid' is blocked in a syscall, send it SIGVGKILL so as to get it
395   out of the syscall and onto doing the next thing, whatever that is.
396   If it isn't blocked in a syscall, has no effect on the thread. */
397void VG_(get_thread_out_of_syscall)(ThreadId tid)
398{
399   vg_assert(VG_(is_valid_tid)(tid));
400   vg_assert(!VG_(is_running_thread)(tid));
401
402   if (VG_(threads)[tid].status == VgTs_WaitSys) {
403      if (VG_(clo_trace_signals)) {
404	 VG_(message)(Vg_DebugMsg,
405                      "get_thread_out_of_syscall zaps tid %d lwp %d\n",
406		      tid, VG_(threads)[tid].os_state.lwpid);
407      }
408#     if defined(VGO_darwin)
409      {
410         // GrP fixme use mach primitives on darwin?
411         // GrP fixme thread_abort_safely?
412         // GrP fixme race for thread with WaitSys set but not in syscall yet?
413         extern kern_return_t thread_abort(mach_port_t);
414         thread_abort(VG_(threads)[tid].os_state.lwpid);
415      }
416#     else
417      {
418         __attribute__((unused))
419         Int r = VG_(tkill)(VG_(threads)[tid].os_state.lwpid, VG_SIGVGKILL);
420         /* JRS 2009-Mar-20: should we assert for r==0 (tkill succeeded)?
421            I'm really not sure.  Here's a race scenario which argues
422            that we shoudn't; but equally I'm not sure the scenario is
423            even possible, because of constraints caused by the question
424            of who holds the BigLock when.
425
426            Target thread tid does sys_read on a socket and blocks.  This
427            function gets called, and we observe correctly that tid's
428            status is WaitSys but then for whatever reason this function
429            goes very slowly for a while.  Then data arrives from
430            wherever, tid's sys_read returns, tid exits.  Then we do
431            tkill on tid, but tid no longer exists; tkill returns an
432            error code and the assert fails. */
433         /* vg_assert(r == 0); */
434      }
435#     endif
436   }
437}
438
439/*
440   Yield the CPU for a short time to let some other thread run.
441 */
442void VG_(vg_yield)(void)
443{
444   ThreadId tid = VG_(running_tid);
445
446   vg_assert(tid != VG_INVALID_THREADID);
447   vg_assert(VG_(threads)[tid].os_state.lwpid == VG_(gettid)());
448
449   VG_(release_BigLock)(tid, VgTs_Yielding, "VG_(vg_yield)");
450
451   /*
452      Tell the kernel we're yielding.
453    */
454   VG_(do_syscall0)(__NR_sched_yield);
455
456   VG_(acquire_BigLock)(tid, "VG_(vg_yield)");
457}
458
459
460/* Set the standard set of blocked signals, used whenever we're not
461   running a client syscall. */
462static void block_signals(void)
463{
464   vki_sigset_t mask;
465
466   VG_(sigfillset)(&mask);
467
468   /* Don't block these because they're synchronous */
469   VG_(sigdelset)(&mask, VKI_SIGSEGV);
470   VG_(sigdelset)(&mask, VKI_SIGBUS);
471   VG_(sigdelset)(&mask, VKI_SIGFPE);
472   VG_(sigdelset)(&mask, VKI_SIGILL);
473   VG_(sigdelset)(&mask, VKI_SIGTRAP);
474
475   /* Can't block these anyway */
476   VG_(sigdelset)(&mask, VKI_SIGSTOP);
477   VG_(sigdelset)(&mask, VKI_SIGKILL);
478
479   VG_(sigprocmask)(VKI_SIG_SETMASK, &mask, NULL);
480}
481
482static void os_state_clear(ThreadState *tst)
483{
484   tst->os_state.lwpid       = 0;
485   tst->os_state.threadgroup = 0;
486#  if defined(VGO_linux)
487   /* no other fields to clear */
488#  elif defined(VGO_darwin)
489   tst->os_state.post_mach_trap_fn = NULL;
490   tst->os_state.pthread           = 0;
491   tst->os_state.func_arg          = 0;
492   VG_(memset)(&tst->os_state.child_go, 0, sizeof(tst->os_state.child_go));
493   VG_(memset)(&tst->os_state.child_done, 0, sizeof(tst->os_state.child_done));
494   tst->os_state.wq_jmpbuf_valid   = False;
495   tst->os_state.remote_port       = 0;
496   tst->os_state.msgh_id           = 0;
497   VG_(memset)(&tst->os_state.mach_args, 0, sizeof(tst->os_state.mach_args));
498#  else
499#    error "Unknown OS"
500#  endif
501}
502
503static void os_state_init(ThreadState *tst)
504{
505   tst->os_state.valgrind_stack_base    = 0;
506   tst->os_state.valgrind_stack_init_SP = 0;
507   os_state_clear(tst);
508}
509
510static
511void mostly_clear_thread_record ( ThreadId tid )
512{
513   vki_sigset_t savedmask;
514
515   vg_assert(tid >= 0 && tid < VG_N_THREADS);
516   VG_(cleanup_thread)(&VG_(threads)[tid].arch);
517   VG_(threads)[tid].tid = tid;
518
519   /* Leave the thread in Zombie, so that it doesn't get reallocated
520      until the caller is finally done with the thread stack. */
521   VG_(threads)[tid].status               = VgTs_Zombie;
522
523   VG_(sigemptyset)(&VG_(threads)[tid].sig_mask);
524   VG_(sigemptyset)(&VG_(threads)[tid].tmp_sig_mask);
525
526   os_state_clear(&VG_(threads)[tid]);
527
528   /* start with no altstack */
529   VG_(threads)[tid].altstack.ss_sp = (void *)0xdeadbeef;
530   VG_(threads)[tid].altstack.ss_size = 0;
531   VG_(threads)[tid].altstack.ss_flags = VKI_SS_DISABLE;
532
533   VG_(clear_out_queued_signals)(tid, &savedmask);
534
535   VG_(threads)[tid].sched_jmpbuf_valid = False;
536}
537
538/*
539   Called in the child after fork.  If the parent has multiple
540   threads, then we've inherited a VG_(threads) array describing them,
541   but only the thread which called fork() is actually alive in the
542   child.  This functions needs to clean up all those other thread
543   structures.
544
545   Whichever tid in the parent which called fork() becomes the
546   master_tid in the child.  That's because the only living slot in
547   VG_(threads) in the child after fork is VG_(threads)[tid], and it
548   would be too hard to try to re-number the thread and relocate the
549   thread state down to VG_(threads)[1].
550
551   This function also needs to reinitialize the_BigLock, since
552   otherwise we may end up sharing its state with the parent, which
553   would be deeply confusing.
554*/
555static void sched_fork_cleanup(ThreadId me)
556{
557   ThreadId tid;
558   vg_assert(VG_(running_tid) == me);
559
560#  if defined(VGO_darwin)
561   // GrP fixme hack reset Mach ports
562   VG_(mach_init)();
563#  endif
564
565   VG_(threads)[me].os_state.lwpid = VG_(gettid)();
566   VG_(threads)[me].os_state.threadgroup = VG_(getpid)();
567
568   /* clear out all the unused thread slots */
569   for (tid = 1; tid < VG_N_THREADS; tid++) {
570      if (tid != me) {
571         mostly_clear_thread_record(tid);
572	 VG_(threads)[tid].status = VgTs_Empty;
573         VG_(clear_syscallInfo)(tid);
574      }
575   }
576
577   /* re-init and take the sema */
578   deinit_BigLock();
579   init_BigLock();
580   VG_(acquire_BigLock_LL)(NULL);
581}
582
583
584/* First phase of initialisation of the scheduler.  Initialise the
585   bigLock, zeroise the VG_(threads) structure and decide on the
586   ThreadId of the root thread.
587*/
588ThreadId VG_(scheduler_init_phase1) ( void )
589{
590   Int i;
591   ThreadId tid_main;
592
593   VG_(debugLog)(1,"sched","sched_init_phase1\n");
594
595   if (VG_(clo_fair_sched) != disable_fair_sched
596       && !ML_(set_sched_lock_impl)(sched_lock_ticket)
597       && VG_(clo_fair_sched) == enable_fair_sched)
598   {
599      VG_(printf)("Error: fair scheduling is not supported on this system.\n");
600      VG_(exit)(1);
601   }
602
603   if (VG_(clo_verbosity) > 1) {
604      VG_(message)(Vg_DebugMsg,
605                   "Scheduler: using %s scheduler lock implementation.\n",
606                   ML_(get_sched_lock_name)());
607   }
608
609   init_BigLock();
610
611   for (i = 0 /* NB; not 1 */; i < VG_N_THREADS; i++) {
612      /* Paranoia .. completely zero it out. */
613      VG_(memset)( & VG_(threads)[i], 0, sizeof( VG_(threads)[i] ) );
614
615      VG_(threads)[i].sig_queue = NULL;
616
617      os_state_init(&VG_(threads)[i]);
618      mostly_clear_thread_record(i);
619
620      VG_(threads)[i].status                    = VgTs_Empty;
621      VG_(threads)[i].client_stack_szB          = 0;
622      VG_(threads)[i].client_stack_highest_word = (Addr)NULL;
623      VG_(threads)[i].err_disablement_level     = 0;
624      VG_(threads)[i].thread_name               = NULL;
625   }
626
627   tid_main = VG_(alloc_ThreadState)();
628
629   /* Bleh.  Unfortunately there are various places in the system that
630      assume that the main thread has a ThreadId of 1.
631      - Helgrind (possibly)
632      - stack overflow message in default_action() in m_signals.c
633      - definitely a lot more places
634   */
635   vg_assert(tid_main == 1);
636
637   return tid_main;
638}
639
640
641/* Second phase of initialisation of the scheduler.  Given the root
642   ThreadId computed by first phase of initialisation, fill in stack
643   details and acquire bigLock.  Initialise the scheduler.  This is
644   called at startup.  The caller subsequently initialises the guest
645   state components of this main thread.
646*/
647void VG_(scheduler_init_phase2) ( ThreadId tid_main,
648                                  Addr     clstack_end,
649                                  SizeT    clstack_size )
650{
651   VG_(debugLog)(1,"sched","sched_init_phase2: tid_main=%d, "
652                   "cls_end=0x%lx, cls_sz=%ld\n",
653                   tid_main, clstack_end, clstack_size);
654
655   vg_assert(VG_IS_PAGE_ALIGNED(clstack_end+1));
656   vg_assert(VG_IS_PAGE_ALIGNED(clstack_size));
657
658   VG_(threads)[tid_main].client_stack_highest_word
659      = clstack_end + 1 - sizeof(UWord);
660   VG_(threads)[tid_main].client_stack_szB
661      = clstack_size;
662
663   VG_(atfork)(NULL, NULL, sched_fork_cleanup);
664}
665
666
667/* ---------------------------------------------------------------------
668   Helpers for running translations.
669   ------------------------------------------------------------------ */
670
671/* Use gcc's built-in setjmp/longjmp.  longjmp must not restore signal
672   mask state, but does need to pass "val" through.  jumped must be a
673   volatile UWord. */
674#define SCHEDSETJMP(tid, jumped, stmt)					\
675   do {									\
676      ThreadState * volatile _qq_tst = VG_(get_ThreadState)(tid);	\
677									\
678      (jumped) = VG_MINIMAL_SETJMP(_qq_tst->sched_jmpbuf);              \
679      if ((jumped) == ((UWord)0)) {                                     \
680	 vg_assert(!_qq_tst->sched_jmpbuf_valid);			\
681	 _qq_tst->sched_jmpbuf_valid = True;				\
682	 stmt;								\
683      }	else if (VG_(clo_trace_sched))					\
684	 VG_(printf)("SCHEDSETJMP(line %d) tid %d, jumped=%ld\n",       \
685                     __LINE__, tid, jumped);                            \
686      vg_assert(_qq_tst->sched_jmpbuf_valid);				\
687      _qq_tst->sched_jmpbuf_valid = False;				\
688   } while(0)
689
690
691/* Do various guest state alignment checks prior to running a thread.
692   Specifically, check that what we have matches Vex's guest state
693   layout requirements.  See libvex.h for details, but in short the
694   requirements are: There must be no holes in between the primary
695   guest state, its two copies, and the spill area.  In short, all 4
696   areas must have a 16-aligned size and be 16-aligned, and placed
697   back-to-back. */
698static void do_pre_run_checks ( ThreadState* tst )
699{
700   Addr a_vex     = (Addr) & tst->arch.vex;
701   Addr a_vexsh1  = (Addr) & tst->arch.vex_shadow1;
702   Addr a_vexsh2  = (Addr) & tst->arch.vex_shadow2;
703   Addr a_spill   = (Addr) & tst->arch.vex_spill;
704   UInt sz_vex    = (UInt) sizeof tst->arch.vex;
705   UInt sz_vexsh1 = (UInt) sizeof tst->arch.vex_shadow1;
706   UInt sz_vexsh2 = (UInt) sizeof tst->arch.vex_shadow2;
707   UInt sz_spill  = (UInt) sizeof tst->arch.vex_spill;
708
709   if (0)
710   VG_(printf)("gst %p %d, sh1 %p %d, "
711               "sh2 %p %d, spill %p %d\n",
712               (void*)a_vex, sz_vex,
713               (void*)a_vexsh1, sz_vexsh1,
714               (void*)a_vexsh2, sz_vexsh2,
715               (void*)a_spill, sz_spill );
716
717   vg_assert(VG_IS_16_ALIGNED(sz_vex));
718   vg_assert(VG_IS_16_ALIGNED(sz_vexsh1));
719   vg_assert(VG_IS_16_ALIGNED(sz_vexsh2));
720   vg_assert(VG_IS_16_ALIGNED(sz_spill));
721
722   vg_assert(VG_IS_16_ALIGNED(a_vex));
723   vg_assert(VG_IS_16_ALIGNED(a_vexsh1));
724   vg_assert(VG_IS_16_ALIGNED(a_vexsh2));
725   vg_assert(VG_IS_16_ALIGNED(a_spill));
726
727   /* Check that the guest state and its two shadows have the same
728      size, and that there are no holes in between.  The latter is
729      important because Memcheck assumes that it can reliably access
730      the shadows by indexing off a pointer to the start of the
731      primary guest state area. */
732   vg_assert(sz_vex == sz_vexsh1);
733   vg_assert(sz_vex == sz_vexsh2);
734   vg_assert(a_vex + 1 * sz_vex == a_vexsh1);
735   vg_assert(a_vex + 2 * sz_vex == a_vexsh2);
736   /* Also check there's no hole between the second shadow area and
737      the spill area. */
738   vg_assert(sz_spill == LibVEX_N_SPILL_BYTES);
739   vg_assert(a_vex + 3 * sz_vex == a_spill);
740
741#  if defined(VGA_x86)
742   /* x86 XMM regs must form an array, ie, have no holes in
743      between. */
744   vg_assert(
745      (offsetof(VexGuestX86State,guest_XMM7)
746       - offsetof(VexGuestX86State,guest_XMM0))
747      == (8/*#regs*/-1) * 16/*bytes per reg*/
748   );
749   vg_assert(VG_IS_16_ALIGNED(offsetof(VexGuestX86State,guest_XMM0)));
750   vg_assert(VG_IS_8_ALIGNED(offsetof(VexGuestX86State,guest_FPREG)));
751   vg_assert(8 == offsetof(VexGuestX86State,guest_EAX));
752   vg_assert(VG_IS_4_ALIGNED(offsetof(VexGuestX86State,guest_EAX)));
753   vg_assert(VG_IS_4_ALIGNED(offsetof(VexGuestX86State,guest_EIP)));
754#  endif
755
756#  if defined(VGA_amd64)
757   /* amd64 YMM regs must form an array, ie, have no holes in
758      between. */
759   vg_assert(
760      (offsetof(VexGuestAMD64State,guest_YMM16)
761       - offsetof(VexGuestAMD64State,guest_YMM0))
762      == (17/*#regs*/-1) * 32/*bytes per reg*/
763   );
764   vg_assert(VG_IS_16_ALIGNED(offsetof(VexGuestAMD64State,guest_YMM0)));
765   vg_assert(VG_IS_8_ALIGNED(offsetof(VexGuestAMD64State,guest_FPREG)));
766   vg_assert(16 == offsetof(VexGuestAMD64State,guest_RAX));
767   vg_assert(VG_IS_8_ALIGNED(offsetof(VexGuestAMD64State,guest_RAX)));
768   vg_assert(VG_IS_8_ALIGNED(offsetof(VexGuestAMD64State,guest_RIP)));
769#  endif
770
771#  if defined(VGA_ppc32) || defined(VGA_ppc64)
772   /* ppc guest_state vector regs must be 16 byte aligned for
773      loads/stores.  This is important! */
774   vg_assert(VG_IS_16_ALIGNED(& tst->arch.vex.guest_VSR0));
775   vg_assert(VG_IS_16_ALIGNED(& tst->arch.vex_shadow1.guest_VSR0));
776   vg_assert(VG_IS_16_ALIGNED(& tst->arch.vex_shadow2.guest_VSR0));
777   /* be extra paranoid .. */
778   vg_assert(VG_IS_16_ALIGNED(& tst->arch.vex.guest_VSR1));
779   vg_assert(VG_IS_16_ALIGNED(& tst->arch.vex_shadow1.guest_VSR1));
780   vg_assert(VG_IS_16_ALIGNED(& tst->arch.vex_shadow2.guest_VSR1));
781#  endif
782
783#  if defined(VGA_arm)
784   /* arm guest_state VFP regs must be 8 byte aligned for
785      loads/stores.  Let's use 16 just to be on the safe side. */
786   vg_assert(VG_IS_16_ALIGNED(& tst->arch.vex.guest_D0));
787   vg_assert(VG_IS_16_ALIGNED(& tst->arch.vex_shadow1.guest_D0));
788   vg_assert(VG_IS_16_ALIGNED(& tst->arch.vex_shadow2.guest_D0));
789   /* be extra paranoid .. */
790   vg_assert(VG_IS_8_ALIGNED(& tst->arch.vex.guest_D1));
791   vg_assert(VG_IS_8_ALIGNED(& tst->arch.vex_shadow1.guest_D1));
792   vg_assert(VG_IS_8_ALIGNED(& tst->arch.vex_shadow2.guest_D1));
793#  endif
794
795#  if defined(VGA_arm64)
796   vg_assert(VG_IS_8_ALIGNED(& tst->arch.vex.guest_X0));
797   vg_assert(VG_IS_8_ALIGNED(& tst->arch.vex_shadow1.guest_X0));
798   vg_assert(VG_IS_8_ALIGNED(& tst->arch.vex_shadow2.guest_X0));
799   vg_assert(VG_IS_16_ALIGNED(& tst->arch.vex.guest_Q0));
800   vg_assert(VG_IS_16_ALIGNED(& tst->arch.vex_shadow1.guest_Q0));
801   vg_assert(VG_IS_16_ALIGNED(& tst->arch.vex_shadow2.guest_Q0));
802#  endif
803
804#  if defined(VGA_s390x)
805   /* no special requirements */
806#  endif
807
808#  if defined(VGA_mips32) || defined(VGA_mips64)
809   /* no special requirements */
810#  endif
811}
812
813// NO_VGDB_POLL value ensures vgdb is not polled, while
814// VGDB_POLL_ASAP ensures that the next scheduler call
815// will cause a poll.
816#define NO_VGDB_POLL    0xffffffffffffffffULL
817#define VGDB_POLL_ASAP  0x0ULL
818
819void VG_(disable_vgdb_poll) (void )
820{
821   vgdb_next_poll = NO_VGDB_POLL;
822}
823void VG_(force_vgdb_poll) ( void )
824{
825   vgdb_next_poll = VGDB_POLL_ASAP;
826}
827
828/* Run the thread tid for a while, and return a VG_TRC_* value
829   indicating why VG_(disp_run_translations) stopped, and possibly an
830   auxiliary word.  Also, only allow the thread to run for at most
831   *dispatchCtrP events.  If (as is the normal case) use_alt_host_addr
832   is False, we are running ordinary redir'd translations, and we
833   should therefore start by looking up the guest next IP in TT.  If
834   it is True then we ignore the guest next IP and just run from
835   alt_host_addr, which presumably points at host code for a no-redir
836   translation.
837
838   Return results are placed in two_words.  two_words[0] is set to the
839   TRC.  In the case where that is VG_TRC_CHAIN_ME_TO_{SLOW,FAST}_EP,
840   the address to patch is placed in two_words[1].
841*/
842static
843void run_thread_for_a_while ( /*OUT*/HWord* two_words,
844                              /*MOD*/Int*   dispatchCtrP,
845                              ThreadId      tid,
846                              HWord         alt_host_addr,
847                              Bool          use_alt_host_addr )
848{
849   volatile HWord        jumped         = 0;
850   volatile ThreadState* tst            = NULL; /* stop gcc complaining */
851   volatile Int          done_this_time = 0;
852   volatile HWord        host_code_addr = 0;
853
854   /* Paranoia */
855   vg_assert(VG_(is_valid_tid)(tid));
856   vg_assert(VG_(is_running_thread)(tid));
857   vg_assert(!VG_(is_exiting)(tid));
858   vg_assert(*dispatchCtrP > 0);
859
860   tst = VG_(get_ThreadState)(tid);
861   do_pre_run_checks( (ThreadState*)tst );
862   /* end Paranoia */
863
864   /* Futz with the XIndir stats counters. */
865   vg_assert(VG_(stats__n_xindirs_32) == 0);
866   vg_assert(VG_(stats__n_xindir_misses_32) == 0);
867
868   /* Clear return area. */
869   two_words[0] = two_words[1] = 0;
870
871   /* Figure out where we're starting from. */
872   if (use_alt_host_addr) {
873      /* unusual case -- no-redir translation */
874      host_code_addr = alt_host_addr;
875   } else {
876      /* normal case -- redir translation */
877      UInt cno = (UInt)VG_TT_FAST_HASH((Addr)tst->arch.vex.VG_INSTR_PTR);
878      if (LIKELY(VG_(tt_fast)[cno].guest == (Addr)tst->arch.vex.VG_INSTR_PTR))
879         host_code_addr = VG_(tt_fast)[cno].host;
880      else {
881         AddrH res   = 0;
882         /* not found in VG_(tt_fast). Searching here the transtab
883            improves the performance compared to returning directly
884            to the scheduler. */
885         Bool  found = VG_(search_transtab)(&res, NULL, NULL,
886                                            (Addr)tst->arch.vex.VG_INSTR_PTR,
887                                            True/*upd cache*/
888                                            );
889         if (LIKELY(found)) {
890            host_code_addr = res;
891         } else {
892            /* At this point, we know that we intended to start at a
893               normal redir translation, but it was not found.  In
894               which case we can return now claiming it's not
895               findable. */
896            two_words[0] = VG_TRC_INNER_FASTMISS; /* hmm, is that right? */
897            return;
898         }
899      }
900   }
901   /* We have either a no-redir or a redir translation. */
902   vg_assert(host_code_addr != 0); /* implausible */
903
904   /* there should be no undealt-with signals */
905   //vg_assert(VG_(threads)[tid].siginfo.si_signo == 0);
906
907   /* Set up event counter stuff for the run. */
908   tst->arch.vex.host_EvC_COUNTER = *dispatchCtrP;
909   tst->arch.vex.host_EvC_FAILADDR
910      = (HWord)VG_(fnptr_to_fnentry)( &VG_(disp_cp_evcheck_fail) );
911
912   if (0) {
913      vki_sigset_t m;
914      Int i, err = VG_(sigprocmask)(VKI_SIG_SETMASK, NULL, &m);
915      vg_assert(err == 0);
916      VG_(printf)("tid %d: entering code with unblocked signals: ", tid);
917      for (i = 1; i <= _VKI_NSIG; i++)
918         if (!VG_(sigismember)(&m, i))
919            VG_(printf)("%d ", i);
920      VG_(printf)("\n");
921   }
922
923   /* Set up return-value area. */
924
925   // Tell the tool this thread is about to run client code
926   VG_TRACK( start_client_code, tid, bbs_done );
927
928   vg_assert(VG_(in_generated_code) == False);
929   VG_(in_generated_code) = True;
930
931   SCHEDSETJMP(
932      tid,
933      jumped,
934      VG_(disp_run_translations)(
935         two_words,
936         (void*)&tst->arch.vex,
937         host_code_addr
938      )
939   );
940
941   vg_assert(VG_(in_generated_code) == True);
942   VG_(in_generated_code) = False;
943
944   if (jumped != (HWord)0) {
945      /* We get here if the client took a fault that caused our signal
946         handler to longjmp. */
947      vg_assert(two_words[0] == 0 && two_words[1] == 0); // correct?
948      two_words[0] = VG_TRC_FAULT_SIGNAL;
949      two_words[1] = 0;
950      block_signals();
951   }
952
953   /* Merge the 32-bit XIndir/miss counters into the 64 bit versions,
954      and zero out the 32-bit ones in preparation for the next run of
955      generated code. */
956   stats__n_xindirs += (ULong)VG_(stats__n_xindirs_32);
957   VG_(stats__n_xindirs_32) = 0;
958   stats__n_xindir_misses += (ULong)VG_(stats__n_xindir_misses_32);
959   VG_(stats__n_xindir_misses_32) = 0;
960
961   /* Inspect the event counter. */
962   vg_assert((Int)tst->arch.vex.host_EvC_COUNTER >= -1);
963   vg_assert(tst->arch.vex.host_EvC_FAILADDR
964             == (HWord)VG_(fnptr_to_fnentry)( &VG_(disp_cp_evcheck_fail)) );
965
966   done_this_time = *dispatchCtrP - ((Int)tst->arch.vex.host_EvC_COUNTER + 1);
967
968   vg_assert(done_this_time >= 0);
969   bbs_done += (ULong)done_this_time;
970
971   *dispatchCtrP -= done_this_time;
972   vg_assert(*dispatchCtrP >= 0);
973
974   // Tell the tool this thread has stopped running client code
975   VG_TRACK( stop_client_code, tid, bbs_done );
976
977   if (bbs_done >= vgdb_next_poll) {
978      if (VG_(clo_vgdb_poll))
979         vgdb_next_poll = bbs_done + (ULong)VG_(clo_vgdb_poll);
980      else
981         /* value was changed due to gdbserver invocation via ptrace */
982         vgdb_next_poll = NO_VGDB_POLL;
983      if (VG_(gdbserver_activity) (tid))
984         VG_(gdbserver) (tid);
985   }
986
987   /* TRC value and possible auxiliary patch-address word are already
988      in two_words[0] and [1] respectively, as a result of the call to
989      VG_(run_innerloop). */
990   /* Stay sane .. */
991   if (two_words[0] == VG_TRC_CHAIN_ME_TO_SLOW_EP
992       || two_words[0] == VG_TRC_CHAIN_ME_TO_FAST_EP) {
993      vg_assert(two_words[1] != 0); /* we have a legit patch addr */
994   } else {
995      vg_assert(two_words[1] == 0); /* nobody messed with it */
996   }
997}
998
999
1000/* ---------------------------------------------------------------------
1001   The scheduler proper.
1002   ------------------------------------------------------------------ */
1003
1004static void handle_tt_miss ( ThreadId tid )
1005{
1006   Bool found;
1007   Addr ip = VG_(get_IP)(tid);
1008
1009   /* Trivial event.  Miss in the fast-cache.  Do a full
1010      lookup for it. */
1011   found = VG_(search_transtab)( NULL, NULL, NULL,
1012                                 ip, True/*upd_fast_cache*/ );
1013   if (UNLIKELY(!found)) {
1014      /* Not found; we need to request a translation. */
1015      if (VG_(translate)( tid, ip, /*debug*/False, 0/*not verbose*/,
1016                          bbs_done, True/*allow redirection*/ )) {
1017         found = VG_(search_transtab)( NULL, NULL, NULL,
1018                                       ip, True );
1019         vg_assert2(found, "handle_tt_miss: missing tt_fast entry");
1020
1021      } else {
1022	 // If VG_(translate)() fails, it's because it had to throw a
1023	 // signal because the client jumped to a bad address.  That
1024	 // means that either a signal has been set up for delivery,
1025	 // or the thread has been marked for termination.  Either
1026	 // way, we just need to go back into the scheduler loop.
1027      }
1028   }
1029}
1030
1031static
1032void handle_chain_me ( ThreadId tid, void* place_to_chain, Bool toFastEP )
1033{
1034   Bool found          = False;
1035   Addr ip             = VG_(get_IP)(tid);
1036   UInt to_sNo         = (UInt)-1;
1037   UInt to_tteNo       = (UInt)-1;
1038
1039   found = VG_(search_transtab)( NULL, &to_sNo, &to_tteNo,
1040                                 ip, False/*dont_upd_fast_cache*/ );
1041   if (!found) {
1042      /* Not found; we need to request a translation. */
1043      if (VG_(translate)( tid, ip, /*debug*/False, 0/*not verbose*/,
1044                          bbs_done, True/*allow redirection*/ )) {
1045         found = VG_(search_transtab)( NULL, &to_sNo, &to_tteNo,
1046                                       ip, False );
1047         vg_assert2(found, "handle_chain_me: missing tt_fast entry");
1048      } else {
1049	 // If VG_(translate)() fails, it's because it had to throw a
1050	 // signal because the client jumped to a bad address.  That
1051	 // means that either a signal has been set up for delivery,
1052	 // or the thread has been marked for termination.  Either
1053	 // way, we just need to go back into the scheduler loop.
1054        return;
1055      }
1056   }
1057   vg_assert(found);
1058   vg_assert(to_sNo != -1);
1059   vg_assert(to_tteNo != -1);
1060
1061   /* So, finally we know where to patch through to.  Do the patching
1062      and update the various admin tables that allow it to be undone
1063      in the case that the destination block gets deleted. */
1064   VG_(tt_tc_do_chaining)( place_to_chain,
1065                           to_sNo, to_tteNo, toFastEP );
1066}
1067
1068static void handle_syscall(ThreadId tid, UInt trc)
1069{
1070   ThreadState * volatile tst = VG_(get_ThreadState)(tid);
1071   volatile UWord jumped;
1072
1073   /* Syscall may or may not block; either way, it will be
1074      complete by the time this call returns, and we'll be
1075      runnable again.  We could take a signal while the
1076      syscall runs. */
1077
1078   if (VG_(clo_sanity_level >= 3))
1079      VG_(am_do_sync_check)("(BEFORE SYSCALL)",__FILE__,__LINE__);
1080
1081   SCHEDSETJMP(tid, jumped, VG_(client_syscall)(tid, trc));
1082
1083   if (VG_(clo_sanity_level >= 3))
1084      VG_(am_do_sync_check)("(AFTER SYSCALL)",__FILE__,__LINE__);
1085
1086   if (!VG_(is_running_thread)(tid))
1087      VG_(printf)("tid %d not running; VG_(running_tid)=%d, tid %d status %d\n",
1088		  tid, VG_(running_tid), tid, tst->status);
1089   vg_assert(VG_(is_running_thread)(tid));
1090
1091   if (jumped != (UWord)0) {
1092      block_signals();
1093      VG_(poll_signals)(tid);
1094   }
1095}
1096
1097/* tid just requested a jump to the noredir version of its current
1098   program counter.  So make up that translation if needed, run it,
1099   and return the resulting thread return code in two_words[]. */
1100static
1101void handle_noredir_jump ( /*OUT*/HWord* two_words,
1102                           /*MOD*/Int*   dispatchCtrP,
1103                           ThreadId tid )
1104{
1105   /* Clear return area. */
1106   two_words[0] = two_words[1] = 0;
1107
1108   AddrH hcode = 0;
1109   Addr  ip    = VG_(get_IP)(tid);
1110
1111   Bool  found = VG_(search_unredir_transtab)( &hcode, ip );
1112   if (!found) {
1113      /* Not found; we need to request a translation. */
1114      if (VG_(translate)( tid, ip, /*debug*/False, 0/*not verbose*/, bbs_done,
1115                          False/*NO REDIRECTION*/ )) {
1116
1117         found = VG_(search_unredir_transtab)( &hcode, ip );
1118         vg_assert2(found, "unredir translation missing after creation?!");
1119      } else {
1120	 // If VG_(translate)() fails, it's because it had to throw a
1121	 // signal because the client jumped to a bad address.  That
1122	 // means that either a signal has been set up for delivery,
1123	 // or the thread has been marked for termination.  Either
1124	 // way, we just need to go back into the scheduler loop.
1125         two_words[0] = VG_TRC_BORING;
1126         return;
1127      }
1128
1129   }
1130
1131   vg_assert(found);
1132   vg_assert(hcode != 0);
1133
1134   /* Otherwise run it and return the resulting VG_TRC_* value. */
1135   vg_assert(*dispatchCtrP > 0); /* so as to guarantee progress */
1136   run_thread_for_a_while( two_words, dispatchCtrP, tid,
1137                           hcode, True/*use hcode*/ );
1138}
1139
1140
1141/*
1142   Run a thread until it wants to exit.
1143
1144   We assume that the caller has already called VG_(acquire_BigLock) for
1145   us, so we own the VCPU.  Also, all signals are blocked.
1146 */
1147VgSchedReturnCode VG_(scheduler) ( ThreadId tid )
1148{
1149   /* Holds the remaining size of this thread's "timeslice". */
1150   Int dispatch_ctr = 0;
1151
1152   ThreadState *tst = VG_(get_ThreadState)(tid);
1153   static Bool vgdb_startup_action_done = False;
1154
1155   if (VG_(clo_trace_sched))
1156      print_sched_event(tid, "entering VG_(scheduler)");
1157
1158   /* Do vgdb initialization (but once). Only the first (main) task
1159      starting up will do the below.
1160      Initialize gdbserver earlier than at the first
1161      thread VG_(scheduler) is causing problems:
1162      * at the end of VG_(scheduler_init_phase2) :
1163        The main thread is in VgTs_Init state, but in a not yet
1164        consistent state => the thread cannot be reported to gdb
1165        (e.g. causes an assert in LibVEX_GuestX86_get_eflags when giving
1166        back the guest registers to gdb).
1167      * at end of valgrind_main, just
1168        before VG_(main_thread_wrapper_NORETURN)(1) :
1169        The main thread is still in VgTs_Init state but in a
1170        more advanced state. However, the thread state is not yet
1171        completely initialized : a.o., the os_state is not yet fully
1172        set => the thread is then not properly reported to gdb,
1173        which is then confused (causing e.g. a duplicate thread be
1174        shown, without thread id).
1175      * it would be possible to initialize gdbserver "lower" in the
1176        call stack (e.g. in VG_(main_thread_wrapper_NORETURN)) but
1177        these are platform dependent and the place at which
1178        the thread state is completely initialized is not
1179        specific anymore to the main thread (so a similar "do it only
1180        once" would be needed).
1181
1182        => a "once only" initialization here is the best compromise. */
1183   if (!vgdb_startup_action_done) {
1184      vg_assert(tid == 1); // it must be the main thread.
1185      vgdb_startup_action_done = True;
1186      if (VG_(clo_vgdb) != Vg_VgdbNo) {
1187         /* If we have to poll, ensures we do an initial poll at first
1188            scheduler call. Otherwise, ensure no poll (unless interrupted
1189            by ptrace). */
1190         if (VG_(clo_vgdb_poll))
1191            VG_(force_vgdb_poll) ();
1192         else
1193            VG_(disable_vgdb_poll) ();
1194
1195         vg_assert (VG_(dyn_vgdb_error) == VG_(clo_vgdb_error));
1196         /* As we are initializing, VG_(dyn_vgdb_error) can't have been
1197            changed yet. */
1198
1199         VG_(gdbserver_prerun_action) (1);
1200      } else {
1201         VG_(disable_vgdb_poll) ();
1202      }
1203   }
1204
1205   /* set the proper running signal mask */
1206   block_signals();
1207
1208   vg_assert(VG_(is_running_thread)(tid));
1209
1210   dispatch_ctr = SCHEDULING_QUANTUM;
1211
1212   while (!VG_(is_exiting)(tid)) {
1213
1214      vg_assert(dispatch_ctr >= 0);
1215      if (dispatch_ctr == 0) {
1216
1217	 /* Our slice is done, so yield the CPU to another thread.  On
1218            Linux, this doesn't sleep between sleeping and running,
1219            since that would take too much time. */
1220
1221	 /* 4 July 06: it seems that a zero-length nsleep is needed to
1222            cause async thread cancellation (canceller.c) to terminate
1223            in finite time; else it is in some kind of race/starvation
1224            situation and completion is arbitrarily delayed (although
1225            this is not a deadlock).
1226
1227            Unfortunately these sleeps cause MPI jobs not to terminate
1228            sometimes (some kind of livelock).  So sleeping once
1229            every N opportunities appears to work. */
1230
1231	 /* 3 Aug 06: doing sys__nsleep works but crashes some apps.
1232            sys_yield also helps the problem, whilst not crashing apps. */
1233
1234	 VG_(release_BigLock)(tid, VgTs_Yielding,
1235                                   "VG_(scheduler):timeslice");
1236	 /* ------------ now we don't have The Lock ------------ */
1237
1238	 VG_(acquire_BigLock)(tid, "VG_(scheduler):timeslice");
1239	 /* ------------ now we do have The Lock ------------ */
1240
1241	 /* OK, do some relatively expensive housekeeping stuff */
1242	 scheduler_sanity(tid);
1243	 VG_(sanity_check_general)(False);
1244
1245	 /* Look for any pending signals for this thread, and set them up
1246	    for delivery */
1247	 VG_(poll_signals)(tid);
1248
1249	 if (VG_(is_exiting)(tid))
1250	    break;		/* poll_signals picked up a fatal signal */
1251
1252	 /* For stats purposes only. */
1253	 n_scheduling_events_MAJOR++;
1254
1255	 /* Figure out how many bbs to ask vg_run_innerloop to do.  Note
1256	    that it decrements the counter before testing it for zero, so
1257	    that if tst->dispatch_ctr is set to N you get at most N-1
1258	    iterations.  Also this means that tst->dispatch_ctr must
1259	    exceed zero before entering the innerloop.  Also also, the
1260	    decrement is done before the bb is actually run, so you
1261	    always get at least one decrement even if nothing happens. */
1262         // FIXME is this right?
1263         dispatch_ctr = SCHEDULING_QUANTUM;
1264
1265	 /* paranoia ... */
1266	 vg_assert(tst->tid == tid);
1267	 vg_assert(tst->os_state.lwpid == VG_(gettid)());
1268      }
1269
1270      /* For stats purposes only. */
1271      n_scheduling_events_MINOR++;
1272
1273      if (0)
1274         VG_(message)(Vg_DebugMsg, "thread %d: running for %d bbs\n",
1275                                   tid, dispatch_ctr - 1 );
1276
1277      HWord trc[2]; /* "two_words" */
1278      run_thread_for_a_while( &trc[0],
1279                              &dispatch_ctr,
1280                              tid, 0/*ignored*/, False );
1281
1282      if (VG_(clo_trace_sched) && VG_(clo_verbosity) > 2) {
1283	 HChar buf[50];
1284	 VG_(sprintf)(buf, "TRC: %s", name_of_sched_event(trc[0]));
1285	 print_sched_event(tid, buf);
1286      }
1287
1288      if (trc[0] == VEX_TRC_JMP_NOREDIR) {
1289         /* If we got a request to run a no-redir version of
1290            something, do so now -- handle_noredir_jump just (creates
1291            and) runs that one translation.  The flip side is that the
1292            noredir translation can't itself return another noredir
1293            request -- that would be nonsensical.  It can, however,
1294            return VG_TRC_BORING, which just means keep going as
1295            normal. */
1296         /* Note that the fact that we need to continue with a
1297            no-redir jump is not recorded anywhere else in this
1298            thread's state.  So we *must* execute the block right now
1299            -- we can't fail to execute it and later resume with it,
1300            because by then we'll have forgotten the fact that it
1301            should be run as no-redir, but will get run as a normal
1302            potentially-redir'd, hence screwing up.  This really ought
1303            to be cleaned up, by noting in the guest state that the
1304            next block to be executed should be no-redir.  Then we can
1305            suspend and resume at any point, which isn't the case at
1306            the moment. */
1307         handle_noredir_jump( &trc[0],
1308                              &dispatch_ctr,
1309                              tid );
1310         vg_assert(trc[0] != VEX_TRC_JMP_NOREDIR);
1311
1312         /* This can't be allowed to happen, since it means the block
1313            didn't execute, and we have no way to resume-as-noredir
1314            after we get more timeslice.  But I don't think it ever
1315            can, since handle_noredir_jump will assert if the counter
1316            is zero on entry. */
1317         vg_assert(trc[0] != VG_TRC_INNER_COUNTERZERO);
1318
1319         /* A no-redir translation can't return with a chain-me
1320            request, since chaining in the no-redir cache is too
1321            complex. */
1322         vg_assert(trc[0] != VG_TRC_CHAIN_ME_TO_SLOW_EP
1323                   && trc[0] != VG_TRC_CHAIN_ME_TO_FAST_EP);
1324      }
1325
1326      switch (trc[0]) {
1327      case VEX_TRC_JMP_BORING:
1328         /* assisted dispatch, no event.  Used by no-redir
1329            translations to force return to the scheduler. */
1330      case VG_TRC_BORING:
1331         /* no special event, just keep going. */
1332         break;
1333
1334      case VG_TRC_INNER_FASTMISS:
1335	 vg_assert(dispatch_ctr > 0);
1336	 handle_tt_miss(tid);
1337	 break;
1338
1339      case VG_TRC_CHAIN_ME_TO_SLOW_EP: {
1340         if (0) VG_(printf)("sched: CHAIN_TO_SLOW_EP: %p\n", (void*)trc[1] );
1341         handle_chain_me(tid, (void*)trc[1], False);
1342         break;
1343      }
1344
1345      case VG_TRC_CHAIN_ME_TO_FAST_EP: {
1346         if (0) VG_(printf)("sched: CHAIN_TO_FAST_EP: %p\n", (void*)trc[1] );
1347         handle_chain_me(tid, (void*)trc[1], True);
1348         break;
1349      }
1350
1351      case VEX_TRC_JMP_CLIENTREQ:
1352	 do_client_request(tid);
1353	 break;
1354
1355      case VEX_TRC_JMP_SYS_INT128:  /* x86-linux */
1356      case VEX_TRC_JMP_SYS_INT129:  /* x86-darwin */
1357      case VEX_TRC_JMP_SYS_INT130:  /* x86-darwin */
1358      case VEX_TRC_JMP_SYS_SYSCALL: /* amd64-linux, ppc32-linux, amd64-darwin */
1359	 handle_syscall(tid, trc[0]);
1360	 if (VG_(clo_sanity_level) > 2)
1361	    VG_(sanity_check_general)(True); /* sanity-check every syscall */
1362	 break;
1363
1364      case VEX_TRC_JMP_YIELD:
1365	 /* Explicit yield, because this thread is in a spin-lock
1366	    or something.  Only let the thread run for a short while
1367            longer.  Because swapping to another thread is expensive,
1368            we're prepared to let this thread eat a little more CPU
1369            before swapping to another.  That means that short term
1370            spins waiting for hardware to poke memory won't cause a
1371            thread swap. */
1372	 if (dispatch_ctr > 1000)
1373            dispatch_ctr = 1000;
1374	 break;
1375
1376      case VG_TRC_INNER_COUNTERZERO:
1377	 /* Timeslice is out.  Let a new thread be scheduled. */
1378	 vg_assert(dispatch_ctr == 0);
1379	 break;
1380
1381      case VG_TRC_FAULT_SIGNAL:
1382	 /* Everything should be set up (either we're exiting, or
1383	    about to start in a signal handler). */
1384	 break;
1385
1386      case VEX_TRC_JMP_MAPFAIL:
1387         /* Failure of arch-specific address translation (x86/amd64
1388            segment override use) */
1389         /* jrs 2005 03 11: is this correct? */
1390         VG_(synth_fault)(tid);
1391         break;
1392
1393      case VEX_TRC_JMP_EMWARN: {
1394         static Int  counts[EmNote_NUMBER];
1395         static Bool counts_initted = False;
1396         VexEmNote ew;
1397         const HChar* what;
1398         Bool      show;
1399         Int       q;
1400         if (!counts_initted) {
1401            counts_initted = True;
1402            for (q = 0; q < EmNote_NUMBER; q++)
1403               counts[q] = 0;
1404         }
1405         ew   = (VexEmNote)VG_(threads)[tid].arch.vex.guest_EMNOTE;
1406         what = (ew < 0 || ew >= EmNote_NUMBER)
1407                   ? "unknown (?!)"
1408                   : LibVEX_EmNote_string(ew);
1409         show = (ew < 0 || ew >= EmNote_NUMBER)
1410                   ? True
1411                   : counts[ew]++ < 3;
1412         if (show && VG_(clo_show_emwarns) && !VG_(clo_xml)) {
1413            VG_(message)( Vg_UserMsg,
1414                          "Emulation warning: unsupported action:\n");
1415            VG_(message)( Vg_UserMsg, "  %s\n", what);
1416            VG_(get_and_pp_StackTrace)( tid, VG_(clo_backtrace_size) );
1417         }
1418         break;
1419      }
1420
1421      case VEX_TRC_JMP_EMFAIL: {
1422         VexEmNote ew;
1423         const HChar* what;
1424         ew   = (VexEmNote)VG_(threads)[tid].arch.vex.guest_EMNOTE;
1425         what = (ew < 0 || ew >= EmNote_NUMBER)
1426                   ? "unknown (?!)"
1427                   : LibVEX_EmNote_string(ew);
1428         VG_(message)( Vg_UserMsg,
1429                       "Emulation fatal error -- Valgrind cannot continue:\n");
1430         VG_(message)( Vg_UserMsg, "  %s\n", what);
1431         VG_(get_and_pp_StackTrace)( tid, VG_(clo_backtrace_size) );
1432         VG_(message)(Vg_UserMsg, "\n");
1433         VG_(message)(Vg_UserMsg, "Valgrind has to exit now.  Sorry.\n");
1434         VG_(message)(Vg_UserMsg, "\n");
1435         VG_(exit)(1);
1436         break;
1437      }
1438
1439      case VEX_TRC_JMP_SIGILL:
1440         VG_(synth_sigill)(tid, VG_(get_IP)(tid));
1441         break;
1442
1443      case VEX_TRC_JMP_SIGTRAP:
1444         VG_(synth_sigtrap)(tid);
1445         break;
1446
1447      case VEX_TRC_JMP_SIGSEGV:
1448         VG_(synth_fault)(tid);
1449         break;
1450
1451      case VEX_TRC_JMP_SIGBUS:
1452         VG_(synth_sigbus)(tid);
1453         break;
1454
1455      case VEX_TRC_JMP_SIGFPE_INTDIV:
1456         VG_(synth_sigfpe)(tid, VKI_FPE_INTDIV);
1457         break;
1458
1459      case VEX_TRC_JMP_SIGFPE_INTOVF:
1460         VG_(synth_sigfpe)(tid, VKI_FPE_INTOVF);
1461         break;
1462
1463      case VEX_TRC_JMP_NODECODE: {
1464         Addr addr = VG_(get_IP)(tid);
1465
1466         if (VG_(clo_sigill_diag)) {
1467            VG_(umsg)(
1468               "valgrind: Unrecognised instruction at address %#lx.\n", addr);
1469            VG_(get_and_pp_StackTrace)(tid, VG_(clo_backtrace_size));
1470#        define M(a) VG_(umsg)(a "\n");
1471         M("Your program just tried to execute an instruction that Valgrind" );
1472         M("did not recognise.  There are two possible reasons for this."    );
1473         M("1. Your program has a bug and erroneously jumped to a non-code"  );
1474         M("   location.  If you are running Memcheck and you just saw a"    );
1475         M("   warning about a bad jump, it's probably your program's fault.");
1476         M("2. The instruction is legitimate but Valgrind doesn't handle it,");
1477         M("   i.e. it's Valgrind's fault.  If you think this is the case or");
1478         M("   you are not sure, please let us know and we'll try to fix it.");
1479         M("Either way, Valgrind will now raise a SIGILL signal which will"  );
1480         M("probably kill your program."                                     );
1481#        undef M
1482         }
1483#        if defined(VGA_s390x)
1484         /* Now that the complaint is out we need to adjust the guest_IA. The
1485            reason is that -- after raising the exception -- execution will
1486            continue with the insn that follows the invalid insn. As the first
1487            2 bits of the invalid insn determine its length in the usual way,
1488            we can compute the address of the next insn here and adjust the
1489            guest_IA accordingly. This adjustment is essential and tested by
1490            none/tests/s390x/op_exception.c (which would loop forever
1491            otherwise) */
1492         UChar byte = ((UChar *)addr)[0];
1493         UInt  insn_length = ((((byte >> 6) + 1) >> 1) + 1) << 1;
1494         Addr  next_insn_addr = addr + insn_length;
1495         VG_(set_IP)(tid, next_insn_addr);
1496#        endif
1497         VG_(synth_sigill)(tid, addr);
1498         break;
1499      }
1500
1501      case VEX_TRC_JMP_INVALICACHE:
1502         VG_(discard_translations)(
1503            (Addr64)VG_(threads)[tid].arch.vex.guest_CMSTART,
1504            VG_(threads)[tid].arch.vex.guest_CMLEN,
1505            "scheduler(VEX_TRC_JMP_INVALICACHE)"
1506         );
1507         if (0)
1508            VG_(printf)("dump translations done.\n");
1509         break;
1510
1511      case VEX_TRC_JMP_FLUSHDCACHE: {
1512         void* start = (void*)VG_(threads)[tid].arch.vex.guest_CMSTART;
1513         SizeT len   = VG_(threads)[tid].arch.vex.guest_CMLEN;
1514         VG_(debugLog)(2, "sched", "flush_dcache(%p, %lu)\n", start, len);
1515         VG_(flush_dcache)(start, len);
1516         break;
1517      }
1518
1519      case VG_TRC_INVARIANT_FAILED:
1520         /* This typically happens if, after running generated code,
1521            it is detected that host CPU settings (eg, FPU/Vector
1522            control words) are not as they should be.  Vex's code
1523            generation specifies the state such control words should
1524            be in on entry to Vex-generated code, and they should be
1525            unchanged on exit from it.  Failure of this assertion
1526            usually means a bug in Vex's code generation. */
1527         //{ UInt xx;
1528         //  __asm__ __volatile__ (
1529         //     "\t.word 0xEEF12A10\n"  // fmrx r2,fpscr
1530         //     "\tmov %0, r2" : "=r"(xx) : : "r2" );
1531         //  VG_(printf)("QQQQ new fpscr = %08x\n", xx);
1532         //}
1533         vg_assert2(0, "VG_(scheduler), phase 3: "
1534                       "run_innerloop detected host "
1535                       "state invariant failure", trc);
1536
1537      case VEX_TRC_JMP_SYS_SYSENTER:
1538         /* Do whatever simulation is appropriate for an x86 sysenter
1539            instruction.  Note that it is critical to set this thread's
1540            guest_EIP to point at the code to execute after the
1541            sysenter, since Vex-generated code will not have set it --
1542            vex does not know what it should be.  Vex sets the next
1543            address to zero, so if you don't set guest_EIP, the thread
1544            will jump to zero afterwards and probably die as a result. */
1545#        if defined(VGP_x86_linux)
1546         vg_assert2(0, "VG_(scheduler), phase 3: "
1547                       "sysenter_x86 on x86-linux is not supported");
1548#        elif defined(VGP_x86_darwin)
1549         /* return address in client edx */
1550         VG_(threads)[tid].arch.vex.guest_EIP
1551            = VG_(threads)[tid].arch.vex.guest_EDX;
1552         handle_syscall(tid, trc[0]);
1553#        else
1554         vg_assert2(0, "VG_(scheduler), phase 3: "
1555                       "sysenter_x86 on non-x86 platform?!?!");
1556#        endif
1557         break;
1558
1559      default:
1560	 vg_assert2(0, "VG_(scheduler), phase 3: "
1561                       "unexpected thread return code (%u)", trc[0]);
1562	 /* NOTREACHED */
1563	 break;
1564
1565      } /* switch (trc) */
1566
1567      if (UNLIKELY(VG_(clo_profyle_sbs)) && VG_(clo_profyle_interval) > 0)
1568         maybe_show_sb_profile();
1569   }
1570
1571   if (VG_(clo_trace_sched))
1572      print_sched_event(tid, "exiting VG_(scheduler)");
1573
1574   vg_assert(VG_(is_exiting)(tid));
1575
1576   return tst->exitreason;
1577}
1578
1579
1580/*
1581   This causes all threads to forceably exit.  They aren't actually
1582   dead by the time this returns; you need to call
1583   VG_(reap_threads)() to wait for them.
1584 */
1585void VG_(nuke_all_threads_except) ( ThreadId me, VgSchedReturnCode src )
1586{
1587   ThreadId tid;
1588
1589   vg_assert(VG_(is_running_thread)(me));
1590
1591   for (tid = 1; tid < VG_N_THREADS; tid++) {
1592      if (tid == me
1593          || VG_(threads)[tid].status == VgTs_Empty)
1594         continue;
1595      if (0)
1596         VG_(printf)(
1597            "VG_(nuke_all_threads_except): nuking tid %d\n", tid);
1598
1599      VG_(threads)[tid].exitreason = src;
1600      if (src == VgSrc_FatalSig)
1601         VG_(threads)[tid].os_state.fatalsig = VKI_SIGKILL;
1602      VG_(get_thread_out_of_syscall)(tid);
1603   }
1604}
1605
1606
1607/* ---------------------------------------------------------------------
1608   Specifying shadow register values
1609   ------------------------------------------------------------------ */
1610
1611#if defined(VGA_x86)
1612#  define VG_CLREQ_ARGS       guest_EAX
1613#  define VG_CLREQ_RET        guest_EDX
1614#elif defined(VGA_amd64)
1615#  define VG_CLREQ_ARGS       guest_RAX
1616#  define VG_CLREQ_RET        guest_RDX
1617#elif defined(VGA_ppc32) || defined(VGA_ppc64)
1618#  define VG_CLREQ_ARGS       guest_GPR4
1619#  define VG_CLREQ_RET        guest_GPR3
1620#elif defined(VGA_arm)
1621#  define VG_CLREQ_ARGS       guest_R4
1622#  define VG_CLREQ_RET        guest_R3
1623#elif defined(VGA_arm64)
1624#  define VG_CLREQ_ARGS       guest_X4
1625#  define VG_CLREQ_RET        guest_X3
1626#elif defined (VGA_s390x)
1627#  define VG_CLREQ_ARGS       guest_r2
1628#  define VG_CLREQ_RET        guest_r3
1629#elif defined(VGA_mips32) || defined(VGA_mips64)
1630#  define VG_CLREQ_ARGS       guest_r12
1631#  define VG_CLREQ_RET        guest_r11
1632#else
1633#  error Unknown arch
1634#endif
1635
1636#define CLREQ_ARGS(regs)   ((regs).vex.VG_CLREQ_ARGS)
1637#define CLREQ_RET(regs)    ((regs).vex.VG_CLREQ_RET)
1638#define O_CLREQ_RET        (offsetof(VexGuestArchState, VG_CLREQ_RET))
1639
1640// These macros write a value to a client's thread register, and tell the
1641// tool that it's happened (if necessary).
1642
1643#define SET_CLREQ_RETVAL(zztid, zzval) \
1644   do { CLREQ_RET(VG_(threads)[zztid].arch) = (zzval); \
1645        VG_TRACK( post_reg_write, \
1646                  Vg_CoreClientReq, zztid, O_CLREQ_RET, sizeof(UWord)); \
1647   } while (0)
1648
1649#define SET_CLCALL_RETVAL(zztid, zzval, f) \
1650   do { CLREQ_RET(VG_(threads)[zztid].arch) = (zzval); \
1651        VG_TRACK( post_reg_write_clientcall_return, \
1652                  zztid, O_CLREQ_RET, sizeof(UWord), f); \
1653   } while (0)
1654
1655
1656/* ---------------------------------------------------------------------
1657   Handle client requests.
1658   ------------------------------------------------------------------ */
1659
1660// OS-specific(?) client requests
1661static Bool os_client_request(ThreadId tid, UWord *args)
1662{
1663   Bool handled = True;
1664
1665   vg_assert(VG_(is_running_thread)(tid));
1666
1667   switch(args[0]) {
1668   case VG_USERREQ__LIBC_FREERES_DONE:
1669      /* This is equivalent to an exit() syscall, but we don't set the
1670	 exitcode (since it might already be set) */
1671      if (0 || VG_(clo_trace_syscalls) || VG_(clo_trace_sched))
1672         VG_(message)(Vg_DebugMsg,
1673                      "__libc_freeres() done; really quitting!\n");
1674      VG_(threads)[tid].exitreason = VgSrc_ExitThread;
1675      break;
1676
1677   default:
1678      handled = False;
1679      break;
1680   }
1681
1682   return handled;
1683}
1684
1685
1686/* Write out a client message, possibly including a back trace. Return
1687   the number of characters written. In case of XML output, the format
1688   string as well as any arguments it requires will be XML'ified.
1689   I.e. special characters such as the angle brackets will be translated
1690   into proper escape sequences. */
1691static
1692Int print_client_message( ThreadId tid, const HChar *format,
1693                          va_list *vargsp, Bool include_backtrace)
1694{
1695   Int count;
1696
1697   if (VG_(clo_xml)) {
1698      /* Translate the format string as follows:
1699         <  -->  &lt;
1700         >  -->  &gt;
1701         &  -->  &amp;
1702         %s -->  %pS
1703         Yes, yes, it's simplified but in synch with
1704         myvprintf_str_XML_simplistic and VG_(debugLog_vprintf).
1705      */
1706
1707      /* Allocate a buffer that is for sure large enough. */
1708      HChar xml_format[VG_(strlen)(format) * 5 + 1];
1709
1710      const HChar *p;
1711      HChar *q = xml_format;
1712
1713      for (p = format; *p; ++p) {
1714         switch (*p) {
1715         case '<': VG_(strcpy)(q, "&lt;");  q += 4; break;
1716         case '>': VG_(strcpy)(q, "&gt;");  q += 4; break;
1717         case '&': VG_(strcpy)(q, "&amp;"); q += 5; break;
1718         case '%':
1719            /* Careful: make sure %%s stays %%s */
1720            *q++ = *p++;
1721            if (*p == 's') {
1722              *q++ = 'p';
1723              *q++ = 'S';
1724            } else {
1725              *q++ = *p;
1726            }
1727            break;
1728
1729         default:
1730            *q++ = *p;
1731            break;
1732         }
1733      }
1734      *q = '\0';
1735
1736      VG_(printf_xml)( "<clientmsg>\n" );
1737      VG_(printf_xml)( "  <tid>%d</tid>\n", tid );
1738      VG_(printf_xml)( "  <text>" );
1739      count = VG_(vprintf_xml)( xml_format, *vargsp );
1740      VG_(printf_xml)( "  </text>\n" );
1741   } else {
1742      count = VG_(vmessage)( Vg_ClientMsg, format, *vargsp );
1743      VG_(message_flush)();
1744   }
1745
1746   if (include_backtrace)
1747      VG_(get_and_pp_StackTrace)( tid, VG_(clo_backtrace_size) );
1748
1749   if (VG_(clo_xml))
1750      VG_(printf_xml)( "</clientmsg>\n" );
1751
1752   return count;
1753}
1754
1755
1756/* Do a client request for the thread tid.  After the request, tid may
1757   or may not still be runnable; if not, the scheduler will have to
1758   choose a new thread to run.
1759*/
1760static
1761void do_client_request ( ThreadId tid )
1762{
1763   UWord* arg = (UWord*)(CLREQ_ARGS(VG_(threads)[tid].arch));
1764   UWord req_no = arg[0];
1765
1766   if (0)
1767      VG_(printf)("req no = 0x%llx, arg = %p\n", (ULong)req_no, arg);
1768   switch (req_no) {
1769
1770      case VG_USERREQ__CLIENT_CALL0: {
1771         UWord (*f)(ThreadId) = (void*)arg[1];
1772	 if (f == NULL)
1773	    VG_(message)(Vg_DebugMsg, "VG_USERREQ__CLIENT_CALL0: func=%p\n", f);
1774	 else
1775	    SET_CLCALL_RETVAL(tid, f ( tid ), (Addr)f);
1776         break;
1777      }
1778      case VG_USERREQ__CLIENT_CALL1: {
1779         UWord (*f)(ThreadId, UWord) = (void*)arg[1];
1780	 if (f == NULL)
1781	    VG_(message)(Vg_DebugMsg, "VG_USERREQ__CLIENT_CALL1: func=%p\n", f);
1782	 else
1783	    SET_CLCALL_RETVAL(tid, f ( tid, arg[2] ), (Addr)f );
1784         break;
1785      }
1786      case VG_USERREQ__CLIENT_CALL2: {
1787         UWord (*f)(ThreadId, UWord, UWord) = (void*)arg[1];
1788	 if (f == NULL)
1789	    VG_(message)(Vg_DebugMsg, "VG_USERREQ__CLIENT_CALL2: func=%p\n", f);
1790	 else
1791	    SET_CLCALL_RETVAL(tid, f ( tid, arg[2], arg[3] ), (Addr)f );
1792         break;
1793      }
1794      case VG_USERREQ__CLIENT_CALL3: {
1795         UWord (*f)(ThreadId, UWord, UWord, UWord) = (void*)arg[1];
1796	 if (f == NULL)
1797	    VG_(message)(Vg_DebugMsg, "VG_USERREQ__CLIENT_CALL3: func=%p\n", f);
1798	 else
1799	    SET_CLCALL_RETVAL(tid, f ( tid, arg[2], arg[3], arg[4] ), (Addr)f );
1800         break;
1801      }
1802
1803      // Nb: this looks like a circular definition, because it kind of is.
1804      // See comment in valgrind.h to understand what's going on.
1805      case VG_USERREQ__RUNNING_ON_VALGRIND:
1806         SET_CLREQ_RETVAL(tid, RUNNING_ON_VALGRIND+1);
1807         break;
1808
1809      case VG_USERREQ__PRINTF: {
1810         const HChar* format = (HChar *)arg[1];
1811         /* JRS 2010-Jan-28: this is DEPRECATED; use the
1812            _VALIST_BY_REF version instead */
1813         if (sizeof(va_list) != sizeof(UWord))
1814            goto va_list_casting_error_NORETURN;
1815         union {
1816            va_list vargs;
1817            unsigned long uw;
1818         } u;
1819         u.uw = (unsigned long)arg[2];
1820         Int count =
1821            print_client_message( tid, format, &u.vargs,
1822                                  /* include_backtrace */ False );
1823         SET_CLREQ_RETVAL( tid, count );
1824         break;
1825      }
1826
1827      case VG_USERREQ__PRINTF_BACKTRACE: {
1828         const HChar* format = (HChar *)arg[1];
1829         /* JRS 2010-Jan-28: this is DEPRECATED; use the
1830            _VALIST_BY_REF version instead */
1831         if (sizeof(va_list) != sizeof(UWord))
1832            goto va_list_casting_error_NORETURN;
1833         union {
1834            va_list vargs;
1835            unsigned long uw;
1836         } u;
1837         u.uw = (unsigned long)arg[2];
1838         Int count =
1839            print_client_message( tid, format, &u.vargs,
1840                                  /* include_backtrace */ True );
1841         SET_CLREQ_RETVAL( tid, count );
1842         break;
1843      }
1844
1845      case VG_USERREQ__PRINTF_VALIST_BY_REF: {
1846         const HChar* format = (HChar *)arg[1];
1847         va_list* vargsp = (va_list*)arg[2];
1848         Int count =
1849            print_client_message( tid, format, vargsp,
1850                                  /* include_backtrace */ False );
1851
1852         SET_CLREQ_RETVAL( tid, count );
1853         break;
1854      }
1855
1856      case VG_USERREQ__PRINTF_BACKTRACE_VALIST_BY_REF: {
1857         const HChar* format = (HChar *)arg[1];
1858         va_list* vargsp = (va_list*)arg[2];
1859         Int count =
1860            print_client_message( tid, format, vargsp,
1861                                  /* include_backtrace */ True );
1862         SET_CLREQ_RETVAL( tid, count );
1863         break;
1864      }
1865
1866      case VG_USERREQ__INTERNAL_PRINTF_VALIST_BY_REF: {
1867         va_list* vargsp = (va_list*)arg[2];
1868         Int count =
1869            VG_(vmessage)( Vg_DebugMsg, (HChar *)arg[1], *vargsp );
1870         VG_(message_flush)();
1871         SET_CLREQ_RETVAL( tid, count );
1872         break;
1873      }
1874
1875      case VG_USERREQ__ADD_IFUNC_TARGET: {
1876         VG_(redir_add_ifunc_target)( arg[1], arg[2] );
1877         SET_CLREQ_RETVAL( tid, 0);
1878         break; }
1879
1880      case VG_USERREQ__STACK_REGISTER: {
1881         UWord sid = VG_(register_stack)((Addr)arg[1], (Addr)arg[2]);
1882         SET_CLREQ_RETVAL( tid, sid );
1883         break; }
1884
1885      case VG_USERREQ__STACK_DEREGISTER: {
1886         VG_(deregister_stack)(arg[1]);
1887         SET_CLREQ_RETVAL( tid, 0 );     /* return value is meaningless */
1888         break; }
1889
1890      case VG_USERREQ__STACK_CHANGE: {
1891         VG_(change_stack)(arg[1], (Addr)arg[2], (Addr)arg[3]);
1892         SET_CLREQ_RETVAL( tid, 0 );     /* return value is meaningless */
1893         break; }
1894
1895      case VG_USERREQ__GET_MALLOCFUNCS: {
1896	 struct vg_mallocfunc_info *info = (struct vg_mallocfunc_info *)arg[1];
1897
1898	 info->tl_malloc               = VG_(tdict).tool_malloc;
1899	 info->tl_calloc               = VG_(tdict).tool_calloc;
1900	 info->tl_realloc              = VG_(tdict).tool_realloc;
1901	 info->tl_memalign             = VG_(tdict).tool_memalign;
1902	 info->tl___builtin_new        = VG_(tdict).tool___builtin_new;
1903	 info->tl___builtin_vec_new    = VG_(tdict).tool___builtin_vec_new;
1904	 info->tl_free                 = VG_(tdict).tool_free;
1905	 info->tl___builtin_delete     = VG_(tdict).tool___builtin_delete;
1906	 info->tl___builtin_vec_delete = VG_(tdict).tool___builtin_vec_delete;
1907         info->tl_malloc_usable_size   = VG_(tdict).tool_malloc_usable_size;
1908
1909	 info->mallinfo                = VG_(mallinfo);
1910	 info->clo_trace_malloc        = VG_(clo_trace_malloc);
1911
1912         SET_CLREQ_RETVAL( tid, 0 );     /* return value is meaningless */
1913
1914	 break;
1915      }
1916
1917      /* Requests from the client program */
1918
1919      case VG_USERREQ__DISCARD_TRANSLATIONS:
1920         if (VG_(clo_verbosity) > 2)
1921            VG_(printf)( "client request: DISCARD_TRANSLATIONS,"
1922                         " addr %p,  len %lu\n",
1923                         (void*)arg[1], arg[2] );
1924
1925         VG_(discard_translations)(
1926            arg[1], arg[2], "scheduler(VG_USERREQ__DISCARD_TRANSLATIONS)"
1927         );
1928
1929         SET_CLREQ_RETVAL( tid, 0 );     /* return value is meaningless */
1930	 break;
1931
1932      case VG_USERREQ__COUNT_ERRORS:
1933         SET_CLREQ_RETVAL( tid, VG_(get_n_errs_found)() );
1934         break;
1935
1936      case VG_USERREQ__LOAD_PDB_DEBUGINFO:
1937         VG_(di_notify_pdb_debuginfo)( arg[1], arg[2], arg[3], arg[4] );
1938         SET_CLREQ_RETVAL( tid, 0 );     /* return value is meaningless */
1939         break;
1940
1941      case VG_USERREQ__MAP_IP_TO_SRCLOC: {
1942         Addr   ip    = arg[1];
1943         HChar* buf64 = (HChar*)arg[2];
1944
1945         VG_(memset)(buf64, 0, 64);
1946         UInt linenum = 0;
1947         Bool ok = VG_(get_filename_linenum)(
1948                      ip, &buf64[0], 50, NULL, 0, NULL, &linenum
1949                   );
1950         if (ok) {
1951            /* Find the terminating zero in the first 50 bytes. */
1952            UInt i;
1953            for (i = 0; i < 50; i++) {
1954               if (buf64[i] == 0)
1955                  break;
1956            }
1957            /* We must find a zero somewhere in 0 .. 49.  Else
1958               VG_(get_filename_linenum) is not properly zero
1959               terminating. */
1960            vg_assert(i < 50);
1961            VG_(sprintf)(&buf64[i], ":%u", linenum);
1962         } else {
1963            buf64[0] = 0;
1964         }
1965
1966         SET_CLREQ_RETVAL( tid, 0 ); /* return value is meaningless */
1967         break;
1968      }
1969
1970      case VG_USERREQ__CHANGE_ERR_DISABLEMENT: {
1971         Word delta = arg[1];
1972         vg_assert(delta == 1 || delta == -1);
1973         ThreadState* tst = VG_(get_ThreadState)(tid);
1974         vg_assert(tst);
1975         if (delta == 1 && tst->err_disablement_level < 0xFFFFFFFF) {
1976            tst->err_disablement_level++;
1977         }
1978         else
1979         if (delta == -1 && tst->err_disablement_level > 0) {
1980            tst->err_disablement_level--;
1981         }
1982         SET_CLREQ_RETVAL( tid, 0 ); /* return value is meaningless */
1983         break;
1984      }
1985
1986      case VG_USERREQ__GDB_MONITOR_COMMAND: {
1987         UWord ret;
1988         ret = (UWord) VG_(client_monitor_command) ((HChar*)arg[1]);
1989         SET_CLREQ_RETVAL(tid, ret);
1990         break;
1991      }
1992
1993      case VG_USERREQ__MALLOCLIKE_BLOCK:
1994      case VG_USERREQ__RESIZEINPLACE_BLOCK:
1995      case VG_USERREQ__FREELIKE_BLOCK:
1996         // Ignore them if the addr is NULL;  otherwise pass onto the tool.
1997         if (!arg[1]) {
1998            SET_CLREQ_RETVAL( tid, 0 );     /* return value is meaningless */
1999            break;
2000         } else {
2001            goto my_default;
2002         }
2003
2004      case VG_USERREQ__VEX_INIT_FOR_IRI:
2005         LibVEX_InitIRI ( (IRICB *)arg[1] );
2006         break;
2007
2008      default:
2009       my_default:
2010	 if (os_client_request(tid, arg)) {
2011	    // do nothing, os_client_request() handled it
2012         } else if (VG_(needs).client_requests) {
2013	    UWord ret;
2014
2015            if (VG_(clo_verbosity) > 2)
2016               VG_(printf)("client request: code %lx,  addr %p,  len %lu\n",
2017                           arg[0], (void*)arg[1], arg[2] );
2018
2019	    if ( VG_TDICT_CALL(tool_handle_client_request, tid, arg, &ret) )
2020	       SET_CLREQ_RETVAL(tid, ret);
2021         } else {
2022	    static Bool whined = False;
2023
2024	    if (!whined && VG_(clo_verbosity) > 2) {
2025               // Allow for requests in core, but defined by tools, which
2026               // have 0 and 0 in their two high bytes.
2027               HChar c1 = (arg[0] >> 24) & 0xff;
2028               HChar c2 = (arg[0] >> 16) & 0xff;
2029               if (c1 == 0) c1 = '_';
2030               if (c2 == 0) c2 = '_';
2031	       VG_(message)(Vg_UserMsg, "Warning:\n"
2032                   "  unhandled client request: 0x%lx (%c%c+0x%lx).  Perhaps\n"
2033		   "  VG_(needs).client_requests should be set?\n",
2034			    arg[0], c1, c2, arg[0] & 0xffff);
2035	       whined = True;
2036	    }
2037         }
2038         break;
2039   }
2040   return;
2041
2042   /*NOTREACHED*/
2043  va_list_casting_error_NORETURN:
2044   VG_(umsg)(
2045      "Valgrind: fatal error - cannot continue: use of the deprecated\n"
2046      "client requests VG_USERREQ__PRINTF or VG_USERREQ__PRINTF_BACKTRACE\n"
2047      "on a platform where they cannot be supported.  Please use the\n"
2048      "equivalent _VALIST_BY_REF versions instead.\n"
2049      "\n"
2050      "This is a binary-incompatible change in Valgrind's client request\n"
2051      "mechanism.  It is unfortunate, but difficult to avoid.  End-users\n"
2052      "are expected to almost never see this message.  The only case in\n"
2053      "which you might see this message is if your code uses the macros\n"
2054      "VALGRIND_PRINTF or VALGRIND_PRINTF_BACKTRACE.  If so, you will need\n"
2055      "to recompile such code, using the header files from this version of\n"
2056      "Valgrind, and not any previous version.\n"
2057      "\n"
2058      "If you see this mesage in any other circumstances, it is probably\n"
2059      "a bug in Valgrind.  In this case, please file a bug report at\n"
2060      "\n"
2061      "   http://www.valgrind.org/support/bug_reports.html\n"
2062      "\n"
2063      "Will now abort.\n"
2064   );
2065   vg_assert(0);
2066}
2067
2068
2069/* ---------------------------------------------------------------------
2070   Sanity checking (permanently engaged)
2071   ------------------------------------------------------------------ */
2072
2073/* Internal consistency checks on the sched structures. */
2074static
2075void scheduler_sanity ( ThreadId tid )
2076{
2077   Bool bad = False;
2078   Int lwpid = VG_(gettid)();
2079
2080   if (!VG_(is_running_thread)(tid)) {
2081      VG_(message)(Vg_DebugMsg,
2082		   "Thread %d is supposed to be running, "
2083                   "but doesn't own the_BigLock (owned by %d)\n",
2084		   tid, VG_(running_tid));
2085      bad = True;
2086   }
2087
2088   if (lwpid != VG_(threads)[tid].os_state.lwpid) {
2089      VG_(message)(Vg_DebugMsg,
2090                   "Thread %d supposed to be in LWP %d, but we're actually %d\n",
2091                   tid, VG_(threads)[tid].os_state.lwpid, VG_(gettid)());
2092      bad = True;
2093   }
2094
2095   if (lwpid != ML_(get_sched_lock_owner)(the_BigLock)) {
2096      VG_(message)(Vg_DebugMsg,
2097                   "Thread (LWPID) %d doesn't own the_BigLock\n",
2098                   tid);
2099      bad = True;
2100   }
2101
2102   if (0) {
2103      /* Periodically show the state of all threads, for debugging
2104         purposes. */
2105      static UInt lasttime = 0;
2106      UInt now;
2107      now = VG_(read_millisecond_timer)();
2108      if ((!bad) && (lasttime + 4000/*ms*/ <= now)) {
2109         lasttime = now;
2110         VG_(printf)("\n------------ Sched State at %d ms ------------\n",
2111                     (Int)now);
2112         VG_(show_sched_status)(True,  // host_stacktrace
2113                                True,  // valgrind_stack_usage
2114                                True); // exited_threads);
2115      }
2116   }
2117
2118   /* core_panic also shows the sched status, which is why we don't
2119      show it above if bad==True. */
2120   if (bad)
2121      VG_(core_panic)("scheduler_sanity: failed");
2122}
2123
2124void VG_(sanity_check_general) ( Bool force_expensive )
2125{
2126   ThreadId tid;
2127
2128   static UInt next_slow_check_at = 1;
2129   static UInt slow_check_interval = 25;
2130
2131   if (VG_(clo_sanity_level) < 1) return;
2132
2133   /* --- First do all the tests that we can do quickly. ---*/
2134
2135   sanity_fast_count++;
2136
2137   /* Check stuff pertaining to the memory check system. */
2138
2139   /* Check that nobody has spuriously claimed that the first or
2140      last 16 pages of memory have become accessible [...] */
2141   if (VG_(needs).sanity_checks) {
2142      vg_assert(VG_TDICT_CALL(tool_cheap_sanity_check));
2143   }
2144
2145   /* --- Now some more expensive checks. ---*/
2146
2147   /* Once every now and again, check some more expensive stuff.
2148      Gradually increase the interval between such checks so as not to
2149      burden long-running programs too much. */
2150   if ( force_expensive
2151        || VG_(clo_sanity_level) > 1
2152        || (VG_(clo_sanity_level) == 1
2153            && sanity_fast_count == next_slow_check_at)) {
2154
2155      if (0) VG_(printf)("SLOW at %d\n", sanity_fast_count-1);
2156
2157      next_slow_check_at = sanity_fast_count - 1 + slow_check_interval;
2158      slow_check_interval++;
2159      sanity_slow_count++;
2160
2161      if (VG_(needs).sanity_checks) {
2162          vg_assert(VG_TDICT_CALL(tool_expensive_sanity_check));
2163      }
2164
2165      /* Look for stack overruns.  Visit all threads. */
2166      for (tid = 1; tid < VG_N_THREADS; tid++) {
2167	 SizeT    remains;
2168         VgStack* stack;
2169
2170	 if (VG_(threads)[tid].status == VgTs_Empty ||
2171	     VG_(threads)[tid].status == VgTs_Zombie)
2172	    continue;
2173
2174         stack
2175            = (VgStack*)
2176              VG_(get_ThreadState)(tid)->os_state.valgrind_stack_base;
2177         SizeT limit
2178            = 4096; // Let's say.  Checking more causes lots of L2 misses.
2179	 remains
2180            = VG_(am_get_VgStack_unused_szB)(stack, limit);
2181	 if (remains < limit)
2182	    VG_(message)(Vg_DebugMsg,
2183                         "WARNING: Thread %d is within %ld bytes "
2184                         "of running out of stack!\n",
2185		         tid, remains);
2186      }
2187   }
2188
2189   if (VG_(clo_sanity_level) > 1) {
2190      /* Check sanity of the low-level memory manager.  Note that bugs
2191         in the client's code can cause this to fail, so we don't do
2192         this check unless specially asked for.  And because it's
2193         potentially very expensive. */
2194      VG_(sanity_check_malloc_all)();
2195   }
2196}
2197
2198/*--------------------------------------------------------------------*/
2199/*--- end                                                          ---*/
2200/*--------------------------------------------------------------------*/
2201