
/*--------------------------------------------------------------------*/
/*--- Thread scheduling.                               scheduler.c ---*/
/*--------------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2000-2011 Julian Seward
      jseward@acm.org

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307, USA.

   The GNU General Public License is contained in the file COPYING.
*/

/*
   Overview

   Valgrind tries to emulate the kernel's threading as closely as
   possible.  The client does all threading via the normal syscalls
   (on Linux: clone, etc).  Valgrind emulates this by creating exactly
   the same process structure as would be created without Valgrind.
   There are no extra threads.

   The main difference is that Valgrind only allows one client thread
   to run at once.  This is controlled with the CPU Big Lock,
   "the_BigLock".  Any time a thread wants to run client code or
   manipulate any shared state (which is anything other than its own
   ThreadState entry), it must hold the_BigLock.

   When a thread is about to block in a blocking syscall, it releases
   the_BigLock, and re-takes it when it becomes runnable again (either
   because the syscall finished, or we took a signal).

   VG_(scheduler) therefore runs in each thread.  It returns only when
   the thread is exiting, either because it exited itself, or it was
   told to exit by another thread.

   This file is almost entirely OS-independent.  The details of how
   the OS handles threading and signalling are abstracted away and
   implemented elsewhere.  [Some of the functions have worked their
   way back for the moment, until we do an OS port in earnest...]
 */
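
/* Illustrative sketch only (not part of the scheduler): the locking
   discipline described above, as followed by code that is about to
   block in the kernel.  VG_(vg_yield)() below is a real instance of
   the same pattern, using VgTs_Yielding instead of VgTs_WaitSys.

      VG_(release_BigLock)(tid, VgTs_WaitSys, "about to block");
      // ... blocked in the kernel; other threads may run ...
      VG_(acquire_BigLock)(tid, "syscall done");
      // ... we hold the lock again and may touch shared state ...
*/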

#include "pub_core_basics.h"
#include "pub_core_debuglog.h"
#include "pub_core_vki.h"
#include "pub_core_vkiscnums.h"    // __NR_sched_yield
#include "pub_core_libcsetjmp.h"   // to keep _threadstate.h happy
#include "pub_core_threadstate.h"
#include "pub_core_aspacemgr.h"
#include "pub_core_clreq.h"         // for VG_USERREQ__*
#include "pub_core_dispatch.h"
#include "pub_core_errormgr.h"      // For VG_(get_n_errs_found)()
#include "pub_core_gdbserver.h"     // for VG_(gdbserver) and VG_(gdbserver_activity)
#include "pub_core_libcbase.h"
#include "pub_core_libcassert.h"
#include "pub_core_libcprint.h"
#include "pub_core_libcproc.h"
#include "pub_core_libcsignal.h"
#if defined(VGO_darwin)
#include "pub_core_mach.h"
#endif
#include "pub_core_machine.h"
#include "pub_core_mallocfree.h"
#include "pub_core_options.h"
#include "pub_core_replacemalloc.h"
#include "pub_core_signals.h"
#include "pub_core_stacks.h"
#include "pub_core_stacktrace.h"    // For VG_(get_and_pp_StackTrace)()
#include "pub_core_syscall.h"
#include "pub_core_syswrap.h"
#include "pub_core_tooliface.h"
#include "pub_core_translate.h"     // For VG_(translate)()
#include "pub_core_transtab.h"
#include "pub_core_debuginfo.h"     // VG_(di_notify_pdb_debuginfo)
#include "priv_sema.h"
#include "pub_core_scheduler.h"     // self
#include "pub_core_redir.h"


/* ---------------------------------------------------------------------
   Types and globals for the scheduler.
   ------------------------------------------------------------------ */

/* ThreadId and ThreadState are defined elsewhere. */

/* Defines the thread-scheduling timeslice, in terms of the number of
   basic blocks we attempt to run each thread for.  Smaller values
   give finer interleaving but much increased scheduling overheads. */
#define SCHEDULING_QUANTUM   100000

/* If False, a fault is Valgrind-internal (ie, a bug) */
Bool VG_(in_generated_code) = False;

/* Counts downwards in VG_(run_innerloop). */
UInt VG_(dispatch_ctr);

/* 64-bit counter for the number of basic blocks done. */
static ULong bbs_done = 0;

/* Counter used to decide when to poll for vgdb activity.
   When the number of bbs done reaches vgdb_next_poll, the scheduler
   polls for gdbserver activity.  VG_(force_vgdb_poll) and
   VG_(disable_vgdb_poll) allow the valgrind core (e.g. m_gdbserver)
   to control when the next poll will be done. */
static ULong vgdb_next_poll;

/* Forwards */
static void do_client_request ( ThreadId tid );
static void scheduler_sanity ( ThreadId tid );
static void mostly_clear_thread_record ( ThreadId tid );

/* Stats. */
static ULong n_scheduling_events_MINOR = 0;
static ULong n_scheduling_events_MAJOR = 0;

/* Sanity checking counts. */
static UInt sanity_fast_count = 0;
static UInt sanity_slow_count = 0;

void VG_(print_scheduler_stats)(void)
{
   VG_(message)(Vg_DebugMsg,
      "scheduler: %'llu jumps (bb entries).\n", bbs_done );
   VG_(message)(Vg_DebugMsg,
      "scheduler: %'llu/%'llu major/minor sched events.\n",
      n_scheduling_events_MAJOR, n_scheduling_events_MINOR);
   VG_(message)(Vg_DebugMsg,
                "   sanity: %d cheap, %d expensive checks.\n",
                sanity_fast_count, sanity_slow_count );
}

/* CPU semaphore, so that threads can run exclusively */
static vg_sema_t the_BigLock;

// Base address of the NaCl sandbox.
UWord nacl_head;

// Path to NaCl nexe.
char *nacl_file;

/* ---------------------------------------------------------------------
   Helper functions for the scheduler.
   ------------------------------------------------------------------ */

static
void print_sched_event ( ThreadId tid, Char* what )
{
   VG_(message)(Vg_DebugMsg, "  SCHED[%d]: %s\n", tid, what );
}

/* For showing SB counts, if the user asks to see them. */
#define SHOW_SBCOUNT_EVERY (20ULL * 1000 * 1000)
static ULong bbs_done_lastcheck = 0;

static
void maybe_show_sb_counts ( void )
{
   Long delta = bbs_done - bbs_done_lastcheck;
   vg_assert(delta >= 0);
   if (UNLIKELY(delta >= SHOW_SBCOUNT_EVERY)) {
      VG_(umsg)("%'lld superblocks executed\n", bbs_done);
      bbs_done_lastcheck = bbs_done;
   }
}

static
HChar* name_of_sched_event ( UInt event )
{
   switch (event) {
      case VEX_TRC_JMP_SYS_SYSCALL:   return "SYSCALL";
      case VEX_TRC_JMP_SYS_INT32:     return "INT32";
      case VEX_TRC_JMP_SYS_INT128:    return "INT128";
      case VEX_TRC_JMP_SYS_INT129:    return "INT129";
      case VEX_TRC_JMP_SYS_INT130:    return "INT130";
      case VEX_TRC_JMP_SYS_SYSENTER:  return "SYSENTER";
      case VEX_TRC_JMP_CLIENTREQ:     return "CLIENTREQ";
      case VEX_TRC_JMP_YIELD:         return "YIELD";
      case VEX_TRC_JMP_YIELD_NOREDIR: return "YIELD_NOREDIR";
      case VEX_TRC_JMP_NODECODE:      return "NODECODE";
      case VEX_TRC_JMP_MAPFAIL:       return "MAPFAIL";
      case VEX_TRC_JMP_NOREDIR:       return "NOREDIR";
      case VEX_TRC_JMP_EMWARN:        return "EMWARN";
      case VEX_TRC_JMP_TINVAL:        return "TINVAL";
      case VG_TRC_INVARIANT_FAILED:   return "INVFAILED";
      case VG_TRC_INNER_COUNTERZERO:  return "COUNTERZERO";
      case VG_TRC_INNER_FASTMISS:     return "FASTMISS";
      case VG_TRC_FAULT_SIGNAL:       return "FAULTSIGNAL";
      default:                        return "??UNKNOWN??";
   }
}

/* Allocate a completely empty ThreadState record. */
ThreadId VG_(alloc_ThreadState) ( void )
{
   Int i;
   for (i = 1; i < VG_N_THREADS; i++) {
      if (VG_(threads)[i].status == VgTs_Empty) {
         VG_(threads)[i].status = VgTs_Init;
         VG_(threads)[i].exitreason = VgSrc_None;
         return i;
      }
   }
   VG_(printf)("vg_alloc_ThreadState: no free slots available\n");
   VG_(printf)("Increase VG_N_THREADS, rebuild and try again.\n");
   VG_(core_panic)("VG_N_THREADS is too low");
   /*NOTREACHED*/
}

/*
   Mark a thread as Runnable.  This will block until the_BigLock is
   available, so that we get exclusive access to all the shared
   structures and the CPU.  Up until we get the_BigLock, we must not
   touch any shared state.

   When this returns, we'll actually be running.
 */
void VG_(acquire_BigLock)(ThreadId tid, HChar* who)
{
   ThreadState *tst;

#if 0
   if (VG_(clo_trace_sched)) {
      HChar buf[100];
      vg_assert(VG_(strlen)(who) <= 100-50);
      VG_(sprintf)(buf, "waiting for lock (%s)", who);
      print_sched_event(tid, buf);
   }
#endif

   /* First, acquire the_BigLock.  We can't do anything else safely
      prior to this point.  Even doing debug printing prior to this
      point is, technically, wrong. */
   ML_(sema_down)(&the_BigLock, False/*not LL*/);

   tst = VG_(get_ThreadState)(tid);

   vg_assert(tst->status != VgTs_Runnable);

   tst->status = VgTs_Runnable;

   if (VG_(running_tid) != VG_INVALID_THREADID)
      VG_(printf)("tid %d found %d running\n", tid, VG_(running_tid));
   vg_assert(VG_(running_tid) == VG_INVALID_THREADID);
   VG_(running_tid) = tid;

   { Addr gsp = VG_(get_SP)(tid);
     VG_(unknown_SP_update)(gsp, gsp, 0/*unknown origin*/);
   }

   if (VG_(clo_trace_sched)) {
      HChar buf[150];
      vg_assert(VG_(strlen)(who) <= 150-50);
      VG_(sprintf)(buf, " acquired lock (%s)", who);
      print_sched_event(tid, buf);
   }
}

/*
   Set a thread into a sleeping state, and give up exclusive access to
   the CPU.  On return, the thread must be prepared to block until it
   is ready to run again (generally this means blocking in a syscall,
   but it may mean that we remain in a Runnable state and we're just
   yielding the CPU to another thread).
 */
void VG_(release_BigLock)(ThreadId tid, ThreadStatus sleepstate, HChar* who)
{
   ThreadState *tst = VG_(get_ThreadState)(tid);

   vg_assert(tst->status == VgTs_Runnable);

   vg_assert(sleepstate == VgTs_WaitSys ||
             sleepstate == VgTs_Yielding);

   tst->status = sleepstate;

   vg_assert(VG_(running_tid) == tid);
   VG_(running_tid) = VG_INVALID_THREADID;

   if (VG_(clo_trace_sched)) {
      Char buf[200];
      vg_assert(VG_(strlen)(who) <= 200-100);
      VG_(sprintf)(buf, "releasing lock (%s) -> %s",
                        who, VG_(name_of_ThreadStatus)(sleepstate));
      print_sched_event(tid, buf);
   }

   /* Release the_BigLock; this will reschedule any runnable
      thread. */
   ML_(sema_up)(&the_BigLock, False/*not LL*/);
}

/* See pub_core_scheduler.h for description */
void VG_(acquire_BigLock_LL) ( HChar* who )
{
   ML_(sema_down)(&the_BigLock, True/*LL*/);
}

/* See pub_core_scheduler.h for description */
void VG_(release_BigLock_LL) ( HChar* who )
{
   ML_(sema_up)(&the_BigLock, True/*LL*/);
}


/* Clear out the ThreadState and release the semaphore.  Leaves the
   ThreadState in VgTs_Zombie state, so that it doesn't get
   reallocated until the caller is really ready. */
void VG_(exit_thread)(ThreadId tid)
{
   vg_assert(VG_(is_valid_tid)(tid));
   vg_assert(VG_(is_running_thread)(tid));
   vg_assert(VG_(is_exiting)(tid));

   mostly_clear_thread_record(tid);
   VG_(running_tid) = VG_INVALID_THREADID;

   /* There should still be a valid exitreason for this thread */
   vg_assert(VG_(threads)[tid].exitreason != VgSrc_None);

   if (VG_(clo_trace_sched))
      print_sched_event(tid, "release lock in VG_(exit_thread)");

   ML_(sema_up)(&the_BigLock, False/*not LL*/);
}

/* If 'tid' is blocked in a syscall, send it SIGVGKILL so as to get it
   out of the syscall and on to doing the next thing, whatever that is.
   If it isn't blocked in a syscall, this has no effect on the thread. */
void VG_(get_thread_out_of_syscall)(ThreadId tid)
{
   vg_assert(VG_(is_valid_tid)(tid));
   vg_assert(!VG_(is_running_thread)(tid));

   if (VG_(threads)[tid].status == VgTs_WaitSys) {
      if (VG_(clo_trace_signals)) {
         VG_(message)(Vg_DebugMsg,
                      "get_thread_out_of_syscall zaps tid %d lwp %d\n",
                      tid, VG_(threads)[tid].os_state.lwpid);
      }
#     if defined(VGO_darwin)
      {
         // GrP fixme use mach primitives on darwin?
         // GrP fixme thread_abort_safely?
         // GrP fixme race for thread with WaitSys set but not in syscall yet?
         extern kern_return_t thread_abort(mach_port_t);
         thread_abort(VG_(threads)[tid].os_state.lwpid);
      }
#     else
      {
         __attribute__((unused))
         Int r = VG_(tkill)(VG_(threads)[tid].os_state.lwpid, VG_SIGVGKILL);
         /* JRS 2009-Mar-20: should we assert for r==0 (tkill succeeded)?
            I'm really not sure.  Here's a race scenario which argues
            that we shouldn't; but equally I'm not sure the scenario is
            even possible, because of constraints caused by the question
            of who holds the BigLock when.

            Target thread tid does sys_read on a socket and blocks.  This
            function gets called, and we observe correctly that tid's
            status is WaitSys but then for whatever reason this function
            goes very slowly for a while.  Then data arrives from
            wherever, tid's sys_read returns, tid exits.  Then we do
            tkill on tid, but tid no longer exists; tkill returns an
            error code and the assert fails. */
         /* vg_assert(r == 0); */
      }
#     endif
   }
}

/*
   Yield the CPU for a short time to let some other thread run.
 */
void VG_(vg_yield)(void)
{
   ThreadId tid = VG_(running_tid);

   vg_assert(tid != VG_INVALID_THREADID);
   vg_assert(VG_(threads)[tid].os_state.lwpid == VG_(gettid)());

   VG_(release_BigLock)(tid, VgTs_Yielding, "VG_(vg_yield)");

   /*
      Tell the kernel we're yielding.
    */
   VG_(do_syscall0)(__NR_sched_yield);

   VG_(acquire_BigLock)(tid, "VG_(vg_yield)");
}


/* Set the standard set of blocked signals, used whenever we're not
   running a client syscall. */
static void block_signals(void)
{
   vki_sigset_t mask;

   VG_(sigfillset)(&mask);

   /* Don't block these because they're synchronous */
   VG_(sigdelset)(&mask, VKI_SIGSEGV);
   VG_(sigdelset)(&mask, VKI_SIGBUS);
   VG_(sigdelset)(&mask, VKI_SIGFPE);
   VG_(sigdelset)(&mask, VKI_SIGILL);
   VG_(sigdelset)(&mask, VKI_SIGTRAP);

   /* Can't block these anyway */
   VG_(sigdelset)(&mask, VKI_SIGSTOP);
   VG_(sigdelset)(&mask, VKI_SIGKILL);

   VG_(sigprocmask)(VKI_SIG_SETMASK, &mask, NULL);
}

static void os_state_clear(ThreadState *tst)
{
   tst->os_state.lwpid       = 0;
   tst->os_state.threadgroup = 0;
#  if defined(VGO_linux)
   /* no other fields to clear */
#  elif defined(VGO_darwin)
   tst->os_state.post_mach_trap_fn = NULL;
   tst->os_state.pthread           = 0;
   tst->os_state.func_arg          = 0;
   VG_(memset)(&tst->os_state.child_go, 0, sizeof(tst->os_state.child_go));
   VG_(memset)(&tst->os_state.child_done, 0, sizeof(tst->os_state.child_done));
   tst->os_state.wq_jmpbuf_valid   = False;
   tst->os_state.remote_port       = 0;
   tst->os_state.msgh_id           = 0;
   VG_(memset)(&tst->os_state.mach_args, 0, sizeof(tst->os_state.mach_args));
#  else
#    error "Unknown OS"
#  endif
}

static void os_state_init(ThreadState *tst)
{
   tst->os_state.valgrind_stack_base    = 0;
   tst->os_state.valgrind_stack_init_SP = 0;
   os_state_clear(tst);
}

static
void mostly_clear_thread_record ( ThreadId tid )
{
   vki_sigset_t savedmask;

   vg_assert(tid >= 0 && tid < VG_N_THREADS);
   VG_(cleanup_thread)(&VG_(threads)[tid].arch);
   VG_(threads)[tid].tid = tid;

   /* Leave the thread in Zombie, so that it doesn't get reallocated
      until the caller is finally done with the thread stack. */
   VG_(threads)[tid].status               = VgTs_Zombie;

   VG_(sigemptyset)(&VG_(threads)[tid].sig_mask);
   VG_(sigemptyset)(&VG_(threads)[tid].tmp_sig_mask);

   os_state_clear(&VG_(threads)[tid]);

   /* start with no altstack */
   VG_(threads)[tid].altstack.ss_sp = (void *)0xdeadbeef;
   VG_(threads)[tid].altstack.ss_size = 0;
   VG_(threads)[tid].altstack.ss_flags = VKI_SS_DISABLE;

   VG_(clear_out_queued_signals)(tid, &savedmask);

   VG_(threads)[tid].sched_jmpbuf_valid = False;
}

/*
   Called in the child after fork.  If the parent has multiple
   threads, then we've inherited a VG_(threads) array describing them,
   but only the thread which called fork() is actually alive in the
   child.  This function needs to clean up all those other thread
   structures.

   Whichever tid in the parent called fork() becomes the
   master_tid in the child.  That's because the only living slot in
   VG_(threads) in the child after fork is VG_(threads)[tid], and it
   would be too hard to try to re-number the thread and relocate the
   thread state down to VG_(threads)[1].

   This function also needs to reinitialize the_BigLock, since
   otherwise we may end up sharing its state with the parent, which
   would be deeply confusing.
*/
static void sched_fork_cleanup(ThreadId me)
{
   ThreadId tid;
   vg_assert(VG_(running_tid) == me);

#  if defined(VGO_darwin)
   // GrP fixme hack reset Mach ports
   VG_(mach_init)();
#  endif

   VG_(threads)[me].os_state.lwpid = VG_(gettid)();
   VG_(threads)[me].os_state.threadgroup = VG_(getpid)();

   /* clear out all the unused thread slots */
   for (tid = 1; tid < VG_N_THREADS; tid++) {
      if (tid != me) {
         mostly_clear_thread_record(tid);
         VG_(threads)[tid].status = VgTs_Empty;
         VG_(clear_syscallInfo)(tid);
      }
   }

   /* re-init and take the sema */
   ML_(sema_deinit)(&the_BigLock);
   ML_(sema_init)(&the_BigLock);
   ML_(sema_down)(&the_BigLock, False/*not LL*/);
}


/* First phase of initialisation of the scheduler.  Initialise the
   bigLock, zeroise the VG_(threads) structure and decide on the
   ThreadId of the root thread.
*/
ThreadId VG_(scheduler_init_phase1) ( void )
{
   Int i;
   ThreadId tid_main;

   VG_(debugLog)(1,"sched","sched_init_phase1\n");

   ML_(sema_init)(&the_BigLock);

   for (i = 0 /* NB; not 1 */; i < VG_N_THREADS; i++) {
      /* Paranoia .. completely zero it out. */
      VG_(memset)( & VG_(threads)[i], 0, sizeof( VG_(threads)[i] ) );

      VG_(threads)[i].sig_queue = NULL;

      os_state_init(&VG_(threads)[i]);
      mostly_clear_thread_record(i);

      VG_(threads)[i].status                    = VgTs_Empty;
      VG_(threads)[i].client_stack_szB          = 0;
      VG_(threads)[i].client_stack_highest_word = (Addr)NULL;
      VG_(threads)[i].err_disablement_level     = 0;
   }

   tid_main = VG_(alloc_ThreadState)();

   /* Bleh.  Unfortunately there are various places in the system that
      assume that the main thread has a ThreadId of 1.
      - Helgrind (possibly)
      - stack overflow message in default_action() in m_signals.c
      - definitely a lot more places
   */
   vg_assert(tid_main == 1);

   return tid_main;
}


/* Second phase of initialisation of the scheduler.  Given the root
   ThreadId computed by first phase of initialisation, fill in stack
   details and acquire bigLock.  Initialise the scheduler.  This is
   called at startup.  The caller subsequently initialises the guest
   state components of this main thread.
*/
void VG_(scheduler_init_phase2) ( ThreadId tid_main,
                                  Addr     clstack_end,
                                  SizeT    clstack_size )
{
   VG_(debugLog)(1,"sched","sched_init_phase2: tid_main=%d, "
                   "cls_end=0x%lx, cls_sz=%ld\n",
                   tid_main, clstack_end, clstack_size);

   vg_assert(VG_IS_PAGE_ALIGNED(clstack_end+1));
   vg_assert(VG_IS_PAGE_ALIGNED(clstack_size));

   VG_(threads)[tid_main].client_stack_highest_word
      = clstack_end + 1 - sizeof(UWord);
   VG_(threads)[tid_main].client_stack_szB
      = clstack_size;

   VG_(atfork)(NULL, NULL, sched_fork_cleanup);
}


/* ---------------------------------------------------------------------
   Helpers for running translations.
   ------------------------------------------------------------------ */

/* Use gcc's built-in setjmp/longjmp.  longjmp must not restore signal
   mask state, but does need to pass "val" through.  jumped must be a
   volatile UWord. */
#define SCHEDSETJMP(tid, jumped, stmt)                                  \
   do {                                                                 \
      ThreadState * volatile _qq_tst = VG_(get_ThreadState)(tid);       \
                                                                        \
      (jumped) = VG_MINIMAL_SETJMP(_qq_tst->sched_jmpbuf);              \
      if ((jumped) == ((UWord)0)) {                                     \
         vg_assert(!_qq_tst->sched_jmpbuf_valid);                       \
         _qq_tst->sched_jmpbuf_valid = True;                            \
         stmt;                                                          \
      } else if (VG_(clo_trace_sched))                                  \
         VG_(printf)("SCHEDSETJMP(line %d) tid %d, jumped=%ld\n",       \
                     __LINE__, tid, jumped);                            \
      vg_assert(_qq_tst->sched_jmpbuf_valid);                           \
      _qq_tst->sched_jmpbuf_valid = False;                              \
   } while(0)
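
/* A sketch of the typical call pattern, mirroring the real uses in
   run_thread_for_a_while() and handle_syscall() below:

      volatile UWord jumped;
      SCHEDSETJMP(tid, jumped, VG_(client_syscall)(tid, trc));
      if (jumped != (UWord)0) {
         // 'stmt' was cut short by a longjmp from the signal
         // handler; recover accordingly.
      }
*/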


/* Do various guest state alignment checks prior to running a thread.
   Specifically, check that what we have matches Vex's guest state
   layout requirements.  See libvex.h for details, but in short the
   requirements are: There must be no holes in between the primary
   guest state, its two copies, and the spill area.  In short, all 4
   areas must have a 16-aligned size and be 16-aligned, and placed
   back-to-back. */
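/* Diagrammatically (this mirrors the assertions below):

      a_vex              a_vexsh1           a_vexsh2           a_spill
      |                  |                  |                  |
      +------------------+------------------+------------------+-------+
      |  vex (sz_vex)    | shadow1 (sz_vex) | shadow2 (sz_vex) | spill |
      +------------------+------------------+------------------+-------+

   i.e. a_vexsh1 == a_vex + sz_vex, a_vexsh2 == a_vex + 2*sz_vex,
   a_spill == a_vex + 3*sz_vex, and sz_spill == LibVEX_N_SPILL_BYTES. */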
static void do_pre_run_checks ( ThreadState* tst )
{
   Addr a_vex     = (Addr) & tst->arch.vex;
   Addr a_vexsh1  = (Addr) & tst->arch.vex_shadow1;
   Addr a_vexsh2  = (Addr) & tst->arch.vex_shadow2;
   Addr a_spill   = (Addr) & tst->arch.vex_spill;
   UInt sz_vex    = (UInt) sizeof tst->arch.vex;
   UInt sz_vexsh1 = (UInt) sizeof tst->arch.vex_shadow1;
   UInt sz_vexsh2 = (UInt) sizeof tst->arch.vex_shadow2;
   UInt sz_spill  = (UInt) sizeof tst->arch.vex_spill;

   if (0)
   VG_(printf)("gst %p %d, sh1 %p %d, "
               "sh2 %p %d, spill %p %d\n",
               (void*)a_vex, sz_vex,
               (void*)a_vexsh1, sz_vexsh1,
               (void*)a_vexsh2, sz_vexsh2,
               (void*)a_spill, sz_spill );

   vg_assert(VG_IS_16_ALIGNED(sz_vex));
   vg_assert(VG_IS_16_ALIGNED(sz_vexsh1));
   vg_assert(VG_IS_16_ALIGNED(sz_vexsh2));
   vg_assert(VG_IS_16_ALIGNED(sz_spill));

   vg_assert(VG_IS_16_ALIGNED(a_vex));
   vg_assert(VG_IS_16_ALIGNED(a_vexsh1));
   vg_assert(VG_IS_16_ALIGNED(a_vexsh2));
   vg_assert(VG_IS_16_ALIGNED(a_spill));

   /* Check that the guest state and its two shadows have the same
      size, and that there are no holes in between.  The latter is
      important because Memcheck assumes that it can reliably access
      the shadows by indexing off a pointer to the start of the
      primary guest state area. */
   vg_assert(sz_vex == sz_vexsh1);
   vg_assert(sz_vex == sz_vexsh2);
   vg_assert(a_vex + 1 * sz_vex == a_vexsh1);
   vg_assert(a_vex + 2 * sz_vex == a_vexsh2);
   /* Also check there's no hole between the second shadow area and
      the spill area. */
   vg_assert(sz_spill == LibVEX_N_SPILL_BYTES);
   vg_assert(a_vex + 3 * sz_vex == a_spill);

#  if defined(VGA_amd64)
   /* x86/amd64 XMM regs must form an array, ie, have no
      holes in between. */
   vg_assert(
      (offsetof(VexGuestAMD64State,guest_XMM16)
       - offsetof(VexGuestAMD64State,guest_XMM0))
      == (17/*#regs*/-1) * 16/*bytes per reg*/
   );
#  endif

#  if defined(VGA_ppc32) || defined(VGA_ppc64)
   /* ppc guest_state vector regs must be 16 byte aligned for
      loads/stores.  This is important! */
   vg_assert(VG_IS_16_ALIGNED(& tst->arch.vex.guest_VSR0));
   vg_assert(VG_IS_16_ALIGNED(& tst->arch.vex_shadow1.guest_VSR0));
   vg_assert(VG_IS_16_ALIGNED(& tst->arch.vex_shadow2.guest_VSR0));
   /* be extra paranoid .. */
   vg_assert(VG_IS_16_ALIGNED(& tst->arch.vex.guest_VSR1));
   vg_assert(VG_IS_16_ALIGNED(& tst->arch.vex_shadow1.guest_VSR1));
   vg_assert(VG_IS_16_ALIGNED(& tst->arch.vex_shadow2.guest_VSR1));
#  endif

#  if defined(VGA_arm)
   /* arm guest_state VFP regs must be 8 byte aligned for
      loads/stores. */
   vg_assert(VG_IS_8_ALIGNED(& tst->arch.vex.guest_D0));
   vg_assert(VG_IS_8_ALIGNED(& tst->arch.vex_shadow1.guest_D0));
   vg_assert(VG_IS_8_ALIGNED(& tst->arch.vex_shadow2.guest_D0));
   /* be extra paranoid .. */
   vg_assert(VG_IS_8_ALIGNED(& tst->arch.vex.guest_D1));
   vg_assert(VG_IS_8_ALIGNED(& tst->arch.vex_shadow1.guest_D1));
   vg_assert(VG_IS_8_ALIGNED(& tst->arch.vex_shadow2.guest_D1));
#  endif

#  if defined(VGA_s390x)
   /* no special requirements */
#  endif
}

// NO_VGDB_POLL value ensures vgdb is not polled, while
// VGDB_POLL_ASAP ensures that the next scheduler call
// will cause a poll.
#define NO_VGDB_POLL    0xffffffffffffffffULL
#define VGDB_POLL_ASAP  0x0ULL

void VG_(disable_vgdb_poll) (void )
{
   vgdb_next_poll = NO_VGDB_POLL;
}
void VG_(force_vgdb_poll) ( void )
{
   vgdb_next_poll = VGDB_POLL_ASAP;
}

/* Run the thread tid for a while, and return a VG_TRC_* value
   indicating why VG_(run_innerloop) stopped. */
static UInt run_thread_for_a_while ( ThreadId tid )
{
   volatile UWord        jumped;
   volatile ThreadState* tst = NULL; /* stop gcc complaining */
   volatile UInt         trc;
   volatile Int          dispatch_ctr_SAVED;
   volatile Int          done_this_time;

   /* Paranoia */
   vg_assert(VG_(is_valid_tid)(tid));
   vg_assert(VG_(is_running_thread)(tid));
   vg_assert(!VG_(is_exiting)(tid));

   tst = VG_(get_ThreadState)(tid);
   do_pre_run_checks( (ThreadState*)tst );
   /* end Paranoia */

   trc = 0;
   dispatch_ctr_SAVED = VG_(dispatch_ctr);

   /* there should be no undealt-with signals */
   //vg_assert(VG_(threads)[tid].siginfo.si_signo == 0);

   if (0) {
      vki_sigset_t m;
      Int i, err = VG_(sigprocmask)(VKI_SIG_SETMASK, NULL, &m);
      vg_assert(err == 0);
      VG_(printf)("tid %d: entering code with unblocked signals: ", tid);
      for (i = 1; i <= _VKI_NSIG; i++)
         if (!VG_(sigismember)(&m, i))
            VG_(printf)("%d ", i);
      VG_(printf)("\n");
   }

   // Tell the tool this thread is about to run client code
   VG_TRACK( start_client_code, tid, bbs_done );

   vg_assert(VG_(in_generated_code) == False);
   VG_(in_generated_code) = True;

   SCHEDSETJMP(
      tid,
      jumped,
      trc = (UInt)VG_(run_innerloop)( (void*)&tst->arch.vex,
                                      VG_(clo_profile_flags) > 0 ? 1 : 0 )
   );

   vg_assert(VG_(in_generated_code) == True);
   VG_(in_generated_code) = False;

   if (jumped != (UWord)0) {
      /* We get here if the client took a fault that caused our signal
         handler to longjmp. */
      vg_assert(trc == 0);
      trc = VG_TRC_FAULT_SIGNAL;
      block_signals();
   }

   done_this_time = (Int)dispatch_ctr_SAVED - (Int)VG_(dispatch_ctr) - 0;

   vg_assert(done_this_time >= 0);
   bbs_done += (ULong)done_this_time;

   // Tell the tool this thread has stopped running client code
   VG_TRACK( stop_client_code, tid, bbs_done );

   if (bbs_done >= vgdb_next_poll) {
      if (VG_(clo_vgdb_poll))
         vgdb_next_poll = bbs_done + (ULong)VG_(clo_vgdb_poll);
      else
         /* value was changed due to gdbserver invocation via ptrace */
         vgdb_next_poll = NO_VGDB_POLL;
      if (VG_(gdbserver_activity) (tid))
         VG_(gdbserver) (tid);
   }

   return trc;
}


/* Run a no-redir translation just once, and return the resulting
   VG_TRC_* value. */
static UInt run_noredir_translation ( Addr hcode, ThreadId tid )
{
   volatile UWord        jumped;
   volatile ThreadState* tst;
   volatile UWord        argblock[4];
   volatile UInt         retval;

   /* Paranoia */
   vg_assert(VG_(is_valid_tid)(tid));
   vg_assert(VG_(is_running_thread)(tid));
   vg_assert(!VG_(is_exiting)(tid));

   tst = VG_(get_ThreadState)(tid);
   do_pre_run_checks( (ThreadState*)tst );
   /* end Paranoia */

#  if defined(VGA_ppc32) || defined(VGA_ppc64)
   /* I don't think we need to clear this thread's guest_RESVN here,
      because we can only get here if run_thread_for_a_while() has
      been used immediately before, on this same thread. */
#  endif

   /* There can be 3 outcomes from VG_(run_a_noredir_translation):

      - a signal occurred and the sighandler longjmp'd.  Then both [2]
        and [3] are unchanged - hence zero.

      - translation ran normally, set [2] (next guest IP) and set [3]
        to whatever [1] was beforehand, indicating a normal (boring)
        jump to the next block.

      - translation ran normally, set [2] (next guest IP) and set [3]
        to something different from [1] beforehand, which indicates a
        TRC_ value.
   */
   argblock[0] = (UWord)hcode;
   argblock[1] = (UWord)&VG_(threads)[tid].arch.vex;
   argblock[2] = 0; /* next guest IP is written here */
   argblock[3] = 0; /* guest state ptr afterwards is written here */

   // Tell the tool this thread is about to run client code
   VG_TRACK( start_client_code, tid, bbs_done );

   vg_assert(VG_(in_generated_code) == False);
   VG_(in_generated_code) = True;

   SCHEDSETJMP(
      tid,
      jumped,
      VG_(run_a_noredir_translation)( &argblock[0] )
   );

   VG_(in_generated_code) = False;

   if (jumped != (UWord)0) {
      /* We get here if the client took a fault that caused our signal
         handler to longjmp. */
      vg_assert(argblock[2] == 0); /* next guest IP was not written */
      vg_assert(argblock[3] == 0); /* trc was not written */
      block_signals();
      retval = VG_TRC_FAULT_SIGNAL;
   } else {
      /* store away the guest program counter */
      VG_(set_IP)( tid, argblock[2] );
      if (argblock[3] == argblock[1])
         /* the guest state pointer afterwards was unchanged */
         retval = VG_TRC_BORING;
      else
         retval = (UInt)argblock[3];
   }

   bbs_done++;

   // Tell the tool this thread has stopped running client code
   VG_TRACK( stop_client_code, tid, bbs_done );

   return retval;
}


/* ---------------------------------------------------------------------
   The scheduler proper.
   ------------------------------------------------------------------ */

static void handle_tt_miss ( ThreadId tid )
{
   Bool found;
   Addr ip = VG_(get_IP)(tid);

   /* Trivial event.  Miss in the fast-cache.  Do a full
      lookup for it. */
   found = VG_(search_transtab)( NULL, ip, True/*upd_fast_cache*/ );
   if (UNLIKELY(!found)) {
      /* Not found; we need to request a translation. */
      if (VG_(translate)( tid, ip, /*debug*/False, 0/*not verbose*/,
                          bbs_done, True/*allow redirection*/ )) {
         found = VG_(search_transtab)( NULL, ip, True );
         vg_assert2(found, "VG_TRC_INNER_FASTMISS: missing tt_fast entry");

      } else {
         // If VG_(translate)() fails, it's because it had to throw a
         // signal because the client jumped to a bad address.  That
         // means that either a signal has been set up for delivery,
         // or the thread has been marked for termination.  Either
         // way, we just need to go back into the scheduler loop.
      }
   }
}

static void handle_syscall(ThreadId tid, UInt trc)
{
   ThreadState * volatile tst = VG_(get_ThreadState)(tid);
   volatile UWord jumped;

   /* Syscall may or may not block; either way, it will be
      complete by the time this call returns, and we'll be
      runnable again.  We could take a signal while the
      syscall runs. */

   if (VG_(clo_sanity_level) >= 3)
      VG_(am_do_sync_check)("(BEFORE SYSCALL)",__FILE__,__LINE__);

   SCHEDSETJMP(tid, jumped, VG_(client_syscall)(tid, trc));

   if (VG_(clo_sanity_level) >= 3)
      VG_(am_do_sync_check)("(AFTER SYSCALL)",__FILE__,__LINE__);

   if (!VG_(is_running_thread)(tid))
      VG_(printf)("tid %d not running; VG_(running_tid)=%d, tid %d status %d\n",
                  tid, VG_(running_tid), tid, tst->status);
   vg_assert(VG_(is_running_thread)(tid));

   if (jumped != (UWord)0) {
      block_signals();
      VG_(poll_signals)(tid);
   }
}

/* tid just requested a jump to the noredir version of its current
   program counter.  So make up that translation if needed, run it,
   and return the resulting thread return code. */
static UInt/*trc*/ handle_noredir_jump ( ThreadId tid )
{
   AddrH hcode = 0;
   Addr  ip    = VG_(get_IP)(tid);

   Bool  found = VG_(search_unredir_transtab)( &hcode, ip );
   if (!found) {
      /* Not found; we need to request a translation. */
      if (VG_(translate)( tid, ip, /*debug*/False, 0/*not verbose*/, bbs_done,
                          False/*NO REDIRECTION*/ )) {

         found = VG_(search_unredir_transtab)( &hcode, ip );
         vg_assert2(found, "unredir translation missing after creation?!");

      } else {
         // If VG_(translate)() fails, it's because it had to throw a
         // signal because the client jumped to a bad address.  That
         // means that either a signal has been set up for delivery,
         // or the thread has been marked for termination.  Either
         // way, we just need to go back into the scheduler loop.
         return VG_TRC_BORING;
      }

   }

   vg_assert(found);
   vg_assert(hcode != 0);

   /* Otherwise run it and return the resulting VG_TRC_* value. */
   return run_noredir_translation( hcode, tid );
}


/*
   Run a thread until it wants to exit.

   We assume that the caller has already called VG_(acquire_BigLock) for
   us, so we own the VCPU.  Also, all signals are blocked.
 */
VgSchedReturnCode VG_(scheduler) ( ThreadId tid )
{
   UInt     trc = VG_TRC_BORING;
   ThreadState *tst = VG_(get_ThreadState)(tid);
   static Bool vgdb_startup_action_done = False;

   if (VG_(clo_trace_sched))
      print_sched_event(tid, "entering VG_(scheduler)");

   /* Do vgdb initialization (but only once).  Only the first (main)
      task starting up will do the below.
      Initializing gdbserver earlier than at the first call to
      VG_(scheduler) causes problems:
      * at the end of VG_(scheduler_init_phase2):
        The main thread is in VgTs_Init state, but in a not yet
        consistent state => the thread cannot be reported to gdb
        (e.g. it causes an assert in LibVEX_GuestX86_get_eflags when
        giving the guest registers back to gdb).
      * at the end of valgrind_main, just
        before VG_(main_thread_wrapper_NORETURN)(1):
        The main thread is still in VgTs_Init state, but in a more
        advanced state.  However, the thread state is not yet
        completely initialized: among others, the os_state is not yet
        fully set => the thread is then not properly reported to gdb,
        which gets confused (causing e.g. a duplicate thread to be
        shown, without a thread id).
      * it would be possible to initialize gdbserver "lower" in the
        call stack (e.g. in VG_(main_thread_wrapper_NORETURN)), but
        those places are platform dependent, and the place at which
        the thread state is completely initialized is no longer
        specific to the main thread (so a similar "do it only once"
        guard would be needed).

      => a "once only" initialization here is the best compromise. */
   if (!vgdb_startup_action_done) {
      vg_assert(tid == 1); // it must be the main thread.
      vgdb_startup_action_done = True;
      if (VG_(clo_vgdb) != Vg_VgdbNo) {
         /* If we have to poll, ensure we do an initial poll at the
            first scheduler call.  Otherwise, ensure no poll (unless
            interrupted via ptrace). */
         if (VG_(clo_vgdb_poll))
            VG_(force_vgdb_poll) ();
         else
            VG_(disable_vgdb_poll) ();

         vg_assert (VG_(dyn_vgdb_error) == VG_(clo_vgdb_error));
         /* As we are initializing, VG_(dyn_vgdb_error) can't have been
            changed yet. */

         VG_(gdbserver_prerun_action) (1);
      } else {
         VG_(disable_vgdb_poll) ();
      }
   }

   /* set the proper running signal mask */
   block_signals();

   vg_assert(VG_(is_running_thread)(tid));

   VG_(dispatch_ctr) = SCHEDULING_QUANTUM + 1;

   while (!VG_(is_exiting)(tid)) {

      if (VG_(dispatch_ctr) == 1) {

         /* Our slice is done, so yield the CPU to another thread.  On
            Linux, this doesn't sleep between sleeping and running,
            since that would take too much time. */

         /* 4 July 06: it seems that a zero-length nsleep is needed to
            cause async thread cancellation (canceller.c) to terminate
            in finite time; else it is in some kind of race/starvation
            situation and completion is arbitrarily delayed (although
            this is not a deadlock).

            Unfortunately these sleeps cause MPI jobs not to terminate
            sometimes (some kind of livelock).  So sleeping once
            every N opportunities appears to work. */

         /* 3 Aug 06: doing sys__nsleep works but crashes some apps.
            sys_yield also helps the problem, whilst not crashing apps. */

         VG_(release_BigLock)(tid, VgTs_Yielding,
                                   "VG_(scheduler):timeslice");
         /* ------------ now we don't have The Lock ------------ */

         VG_(do_syscall0)(__NR_sched_yield);

         VG_(acquire_BigLock)(tid, "VG_(scheduler):timeslice");
         /* ------------ now we do have The Lock ------------ */

         /* OK, do some relatively expensive housekeeping stuff */
         scheduler_sanity(tid);
         VG_(sanity_check_general)(False);

         /* Look for any pending signals for this thread, and set them up
            for delivery */
         VG_(poll_signals)(tid);

         if (VG_(is_exiting)(tid))
            break;   /* poll_signals picked up a fatal signal */

         /* For stats purposes only. */
         n_scheduling_events_MAJOR++;

         /* Figure out how many bbs to ask VG_(run_innerloop) to do.
            Note that it decrements the counter before testing it for
            zero, so that if VG_(dispatch_ctr) is set to N you get at
            most N-1 iterations.  Also this means that VG_(dispatch_ctr)
            must exceed zero before entering the innerloop.  Also also,
            the decrement is done before the bb is actually run, so you
            always get at least one decrement even if nothing happens. */
         VG_(dispatch_ctr) = SCHEDULING_QUANTUM + 1;

         /* paranoia ... */
         vg_assert(tst->tid == tid);
         vg_assert(tst->os_state.lwpid == VG_(gettid)());
      }

      /* For stats purposes only. */
      n_scheduling_events_MINOR++;

      if (0)
         VG_(message)(Vg_DebugMsg, "thread %d: running for %d bbs\n",
                                   tid, VG_(dispatch_ctr) - 1 );

      if (trc == VEX_TRC_JMP_YIELD_NOREDIR) {
         trc = handle_noredir_jump(tid);
      } else {
         trc = run_thread_for_a_while ( tid );
      }

      if (VG_(clo_trace_sched) && VG_(clo_verbosity) > 2) {
         Char buf[50];
         VG_(sprintf)(buf, "TRC: %s", name_of_sched_event(trc));
         print_sched_event(tid, buf);
      }

      if (trc == VEX_TRC_JMP_NOREDIR) {
         /* If we got a request to run a no-redir version of
            something, do so now -- handle_noredir_jump just (creates
            and) runs that one translation.  The flip side is that the
            noredir translation can't itself return another noredir
            request -- that would be nonsensical.  It can, however,
            return VG_TRC_BORING, which just means keep going as
            normal. */
         trc = handle_noredir_jump(tid);
         vg_assert(trc != VEX_TRC_JMP_NOREDIR);
      }

      switch (trc) {
      case VG_TRC_BORING:
         /* no special event, just keep going. */
         break;

      case VG_TRC_INNER_FASTMISS:
         vg_assert(VG_(dispatch_ctr) > 1);
         handle_tt_miss(tid);
         break;

      case VEX_TRC_JMP_CLIENTREQ:
         do_client_request(tid);
         break;

      case VEX_TRC_JMP_SYS_INT128:  /* x86-linux */
      case VEX_TRC_JMP_SYS_INT129:  /* x86-darwin */
      case VEX_TRC_JMP_SYS_INT130:  /* x86-darwin */
      case VEX_TRC_JMP_SYS_SYSCALL: /* amd64-linux, ppc32-linux, amd64-darwin */
         handle_syscall(tid, trc);
         if (VG_(clo_sanity_level) > 2)
            VG_(sanity_check_general)(True); /* sanity-check every syscall */
         break;

      case VEX_TRC_JMP_YIELD:
         /* Explicit yield, because this thread is in a spin-lock
            or something.  Only let the thread run for a short while
            longer.  Because swapping to another thread is expensive,
            we're prepared to let this thread eat a little more CPU
            before swapping to another.  That means that short term
            spins waiting for hardware to poke memory won't cause a
            thread swap. */
         if (VG_(dispatch_ctr) > 2000)
            VG_(dispatch_ctr) = 2000;
         break;

      case VEX_TRC_JMP_YIELD_NOREDIR:
         VG_(dispatch_ctr) = 1;
         break;

      case VG_TRC_INNER_COUNTERZERO:
         /* Timeslice is out.  Let a new thread be scheduled. */
         vg_assert(VG_(dispatch_ctr) == 1);
         break;

      case VG_TRC_FAULT_SIGNAL:
         /* Everything should be set up (either we're exiting, or
            about to start in a signal handler). */
         break;

      case VEX_TRC_JMP_MAPFAIL:
         /* Failure of arch-specific address translation (x86/amd64
            segment override use) */
         /* jrs 2005 03 11: is this correct? */
         VG_(synth_fault)(tid);
         break;

      case VEX_TRC_JMP_EMWARN: {
         static Int  counts[EmWarn_NUMBER];
         static Bool counts_initted = False;
         VexEmWarn ew;
         HChar*    what;
         Bool      show;
         Int       q;
         if (!counts_initted) {
            counts_initted = True;
            for (q = 0; q < EmWarn_NUMBER; q++)
               counts[q] = 0;
         }
         ew   = (VexEmWarn)VG_(threads)[tid].arch.vex.guest_EMWARN;
         what = (ew < 0 || ew >= EmWarn_NUMBER)
                   ? "unknown (?!)"
                   : LibVEX_EmWarn_string(ew);
         show = (ew < 0 || ew >= EmWarn_NUMBER)
                   ? True
                   : counts[ew]++ < 3;
         if (show && VG_(clo_show_emwarns) && !VG_(clo_xml)) {
            VG_(message)( Vg_UserMsg,
                          "Emulation warning: unsupported action:\n");
            VG_(message)( Vg_UserMsg, "  %s\n", what);
            VG_(get_and_pp_StackTrace)( tid, VG_(clo_backtrace_size) );
         }
         break;
      }

      case VEX_TRC_JMP_EMFAIL: {
         VexEmWarn ew;
         HChar*    what;
         ew   = (VexEmWarn)VG_(threads)[tid].arch.vex.guest_EMWARN;
         what = (ew < 0 || ew >= EmWarn_NUMBER)
                   ? "unknown (?!)"
                   : LibVEX_EmWarn_string(ew);
         VG_(message)( Vg_UserMsg,
                       "Emulation fatal error -- Valgrind cannot continue:\n");
         VG_(message)( Vg_UserMsg, "  %s\n", what);
         VG_(get_and_pp_StackTrace)( tid, VG_(clo_backtrace_size) );
         VG_(message)(Vg_UserMsg, "\n");
         VG_(message)(Vg_UserMsg, "Valgrind has to exit now.  Sorry.\n");
         VG_(message)(Vg_UserMsg, "\n");
         VG_(exit)(1);
         break;
      }

      case VEX_TRC_JMP_SIGTRAP:
         VG_(synth_sigtrap)(tid);
         break;

      case VEX_TRC_JMP_SIGSEGV:
         VG_(synth_fault)(tid);
         break;

      case VEX_TRC_JMP_SIGBUS:
         VG_(synth_sigbus)(tid);
         break;

      case VEX_TRC_JMP_NODECODE:
         VG_(umsg)(
            "valgrind: Unrecognised instruction at address %#lx.\n",
            VG_(get_IP)(tid));
         VG_(get_and_pp_StackTrace)(tid, 50);
#define M(a) VG_(umsg)(a "\n");
   M("Your program just tried to execute an instruction that Valgrind" );
   M("did not recognise.  There are two possible reasons for this."    );
   M("1. Your program has a bug and erroneously jumped to a non-code"  );
   M("   location.  If you are running Memcheck and you just saw a"    );
   M("   warning about a bad jump, it's probably your program's fault.");
   M("2. The instruction is legitimate but Valgrind doesn't handle it,");
   M("   i.e. it's Valgrind's fault.  If you think this is the case or");
   M("   you are not sure, please let us know and we'll try to fix it.");
   M("Either way, Valgrind will now raise a SIGILL signal which will"  );
   M("probably kill your program."                                     );
#undef M
         VG_(synth_sigill)(tid, VG_(get_IP)(tid));
         break;

      case VEX_TRC_JMP_TINVAL:
         VG_(discard_translations)(
            (Addr64)VG_(threads)[tid].arch.vex.guest_TISTART,
            VG_(threads)[tid].arch.vex.guest_TILEN,
            "scheduler(VEX_TRC_JMP_TINVAL)"
         );
         if (0)
            VG_(printf)("dump translations done.\n");
         break;

      case VG_TRC_INVARIANT_FAILED:
         /* This typically happens if, after running generated code,
            it is detected that host CPU settings (eg, FPU/Vector
            control words) are not as they should be.  Vex's code
            generation specifies the state such control words should
            be in on entry to Vex-generated code, and they should be
            unchanged on exit from it.  Failure of this assertion
            usually means a bug in Vex's code generation. */
         //{ UInt xx;
         //  __asm__ __volatile__ (
         //     "\t.word 0xEEF12A10\n"  // fmrx r2,fpscr
         //     "\tmov %0, r2" : "=r"(xx) : : "r2" );
         //  VG_(printf)("QQQQ new fpscr = %08x\n", xx);
         //}
         vg_assert2(0, "VG_(scheduler), phase 3: "
                       "run_innerloop detected host "
                       "state invariant failure", trc);

      case VEX_TRC_JMP_SYS_SYSENTER:
         /* Do whatever simulation is appropriate for an x86 sysenter
            instruction.  Note that it is critical to set this thread's
            guest_EIP to point at the code to execute after the
            sysenter, since Vex-generated code will not have set it --
            vex does not know what it should be.  Vex sets the next
            address to zero, so if you don't set guest_EIP, the thread
            will jump to zero afterwards and probably die as a result. */
#        if defined(VGP_x86_linux)
         vg_assert2(0, "VG_(scheduler), phase 3: "
                       "sysenter_x86 on x86-linux is not supported");
#        elif defined(VGP_x86_darwin)
         /* return address in client edx */
         VG_(threads)[tid].arch.vex.guest_EIP
            = VG_(threads)[tid].arch.vex.guest_EDX;
         handle_syscall(tid, trc);
#        else
         vg_assert2(0, "VG_(scheduler), phase 3: "
                       "sysenter_x86 on non-x86 platform?!?!");
#        endif
         break;

      default:
         vg_assert2(0, "VG_(scheduler), phase 3: "
                       "unexpected thread return code (%u)", trc);
         /* NOTREACHED */
         break;

      } /* switch (trc) */

      if (0)
         maybe_show_sb_counts();
   }

   if (VG_(clo_trace_sched))
      print_sched_event(tid, "exiting VG_(scheduler)");

   vg_assert(VG_(is_exiting)(tid));

   return tst->exitreason;
}


/*
   This causes all threads to exit forcibly.  They aren't actually
   dead by the time this returns; you need to call
   VG_(reap_threads)() to wait for them.
 */
void VG_(nuke_all_threads_except) ( ThreadId me, VgSchedReturnCode src )
{
   ThreadId tid;

   vg_assert(VG_(is_running_thread)(me));

   for (tid = 1; tid < VG_N_THREADS; tid++) {
      if (tid == me
          || VG_(threads)[tid].status == VgTs_Empty)
         continue;
      if (0)
         VG_(printf)(
            "VG_(nuke_all_threads_except): nuking tid %d\n", tid);

      VG_(threads)[tid].exitreason = src;
      if (src == VgSrc_FatalSig)
         VG_(threads)[tid].os_state.fatalsig = VKI_SIGKILL;
      VG_(get_thread_out_of_syscall)(tid);
   }
}


/* ---------------------------------------------------------------------
   Specifying shadow register values
   ------------------------------------------------------------------ */

#if defined(VGA_x86)
#  define VG_CLREQ_ARGS       guest_EAX
#  define VG_CLREQ_RET        guest_EDX
#elif defined(VGA_amd64)
#  define VG_CLREQ_ARGS       guest_RAX
#  define VG_CLREQ_RET        guest_RDX
#elif defined(VGA_ppc32) || defined(VGA_ppc64)
#  define VG_CLREQ_ARGS       guest_GPR4
#  define VG_CLREQ_RET        guest_GPR3
#elif defined(VGA_arm)
#  define VG_CLREQ_ARGS       guest_R4
#  define VG_CLREQ_RET        guest_R3
#elif defined (VGA_s390x)
#  define VG_CLREQ_ARGS       guest_r2
#  define VG_CLREQ_RET        guest_r3
#else
#  error Unknown arch
#endif

#define CLREQ_ARGS(regs)   ((regs).vex.VG_CLREQ_ARGS)
#define CLREQ_RET(regs)    ((regs).vex.VG_CLREQ_RET)
#define O_CLREQ_RET        (offsetof(VexGuestArchState, VG_CLREQ_RET))

// These macros write a value to a client's thread register, and tell the
// tool that it's happened (if necessary).

#define SET_CLREQ_RETVAL(zztid, zzval) \
   do { CLREQ_RET(VG_(threads)[zztid].arch) = (zzval); \
        VG_TRACK( post_reg_write, \
                  Vg_CoreClientReq, zztid, O_CLREQ_RET, sizeof(UWord)); \
   } while (0)

#define SET_CLCALL_RETVAL(zztid, zzval, f) \
   do { CLREQ_RET(VG_(threads)[zztid].arch) = (zzval); \
        VG_TRACK( post_reg_write_clientcall_return, \
                  zztid, O_CLREQ_RET, sizeof(UWord), f); \
   } while (0)
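
/* Illustrative only: the round trip for a client request.  The client
   executes the magic preamble from valgrind.h (RUNNING_ON_VALGRIND and
   friends), generated code returns VEX_TRC_JMP_CLIENTREQ to the
   scheduler, and do_client_request() below reads the argument block
   via CLREQ_ARGS and writes the reply with SET_CLREQ_RETVAL.  A
   hypothetical new core-side handler would follow the same pattern as
   the existing cases in do_client_request():

      case VG_USERREQ__MY_REQUEST: {          // hypothetical request id
         UWord result = arg[1] + arg[2];      // compute some reply
         SET_CLREQ_RETVAL(tid, result);       // reply lands in the
                                              // client's return register
         break;
      }
*/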
1418
1419
1420/* ---------------------------------------------------------------------
1421   Handle client requests.
1422   ------------------------------------------------------------------ */
1423
1424// OS-specific(?) client requests
1425static Bool os_client_request(ThreadId tid, UWord *args)
1426{
1427   Bool handled = True;
1428
1429   vg_assert(VG_(is_running_thread)(tid));
1430
1431   switch(args[0]) {
1432   case VG_USERREQ__LIBC_FREERES_DONE:
1433      /* This is equivalent to an exit() syscall, but we don't set the
1434	 exitcode (since it might already be set) */
1435      if (0 || VG_(clo_trace_syscalls) || VG_(clo_trace_sched))
1436         VG_(message)(Vg_DebugMsg,
1437                      "__libc_freeres() done; really quitting!\n");
1438      VG_(threads)[tid].exitreason = VgSrc_ExitThread;
1439      break;
1440
1441   default:
1442      handled = False;
1443      break;
1444   }
1445
1446   return handled;
1447}
1448
1449
1450/* Do a client request for the thread tid.  After the request, tid may
1451   or may not still be runnable; if not, the scheduler will have to
1452   choose a new thread to run.
1453*/
1454static
1455void do_client_request ( ThreadId tid )
1456{
1457   UWord* arg = (UWord*)(CLREQ_ARGS(VG_(threads)[tid].arch));
1458   UWord req_no = arg[0];
1459
1460   if (0)
1461      VG_(printf)("req no = 0x%llx, arg = %p\n", (ULong)req_no, arg);
1462   switch (req_no) {
1463
1464      case VG_USERREQ__CLIENT_CALL0: {
1465         UWord (*f)(ThreadId) = (void*)arg[1];
1466	 if (f == NULL)
1467	    VG_(message)(Vg_DebugMsg, "VG_USERREQ__CLIENT_CALL0: func=%p\n", f);
1468	 else
1469	    SET_CLCALL_RETVAL(tid, f ( tid ), (Addr)f);
1470         break;
1471      }
1472      case VG_USERREQ__CLIENT_CALL1: {
1473         UWord (*f)(ThreadId, UWord) = (void*)arg[1];
1474	 if (f == NULL)
1475	    VG_(message)(Vg_DebugMsg, "VG_USERREQ__CLIENT_CALL1: func=%p\n", f);
1476	 else
1477	    SET_CLCALL_RETVAL(tid, f ( tid, arg[2] ), (Addr)f );
1478         break;
1479      }
1480      case VG_USERREQ__CLIENT_CALL2: {
1481         UWord (*f)(ThreadId, UWord, UWord) = (void*)arg[1];
1482	 if (f == NULL)
1483	    VG_(message)(Vg_DebugMsg, "VG_USERREQ__CLIENT_CALL2: func=%p\n", f);
1484	 else
1485	    SET_CLCALL_RETVAL(tid, f ( tid, arg[2], arg[3] ), (Addr)f );
1486         break;
1487      }
1488      case VG_USERREQ__CLIENT_CALL3: {
1489         UWord (*f)(ThreadId, UWord, UWord, UWord) = (void*)arg[1];
1490	 if (f == NULL)
1491	    VG_(message)(Vg_DebugMsg, "VG_USERREQ__CLIENT_CALL3: func=%p\n", f);
1492	 else
1493	    SET_CLCALL_RETVAL(tid, f ( tid, arg[2], arg[3], arg[4] ), (Addr)f );
1494         break;
1495      }
1496
1497      // Nb: this looks like a circular definition, because it kind of is.
1498      // See comment in valgrind.h to understand what's going on.
1499      case VG_USERREQ__RUNNING_ON_VALGRIND:
1500         SET_CLREQ_RETVAL(tid, RUNNING_ON_VALGRIND+1);
1501         break;

      case VG_USERREQ__PRINTF: {
         /* JRS 2010-Jan-28: this is DEPRECATED; use the
            _VALIST_BY_REF version instead */
         if (sizeof(va_list) != sizeof(UWord))
            goto va_list_casting_error_NORETURN;
         union {
            va_list vargs;
            unsigned long uw;
         } u;
         u.uw = (unsigned long)arg[2];
         Int count =
            VG_(vmessage)( Vg_ClientMsg, (char *)arg[1], u.vargs );
         VG_(message_flush)();
         SET_CLREQ_RETVAL( tid, count );
         break;
      }

      case VG_USERREQ__PRINTF_BACKTRACE: {
         /* JRS 2010-Jan-28: this is DEPRECATED; use the
            _VALIST_BY_REF version instead */
         if (sizeof(va_list) != sizeof(UWord))
            goto va_list_casting_error_NORETURN;
         union {
            va_list vargs;
            unsigned long uw;
         } u;
         u.uw = (unsigned long)arg[2];
         Int count =
            VG_(vmessage)( Vg_ClientMsg, (char *)arg[1], u.vargs );
         VG_(message_flush)();
         VG_(get_and_pp_StackTrace)( tid, VG_(clo_backtrace_size) );
         SET_CLREQ_RETVAL( tid, count );
         break;
      }
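      /* Why the sizeof checks above can fail (sketch, assuming the
         SysV amd64 ABI): there, va_list is an array of one 24-byte
         struct, roughly

            typedef struct {
               unsigned int gp_offset, fp_offset;
               void *overflow_arg_area, *reg_save_area;
            } __va_elem;                       // illustrative name
            typedef __va_elem va_list[1];      // sizeof == 24, not 8

         so a va_list cannot round-trip through the single UWord in
         arg[2].  Hence the _VALIST_BY_REF requests below, which pass
         a va_list* instead. */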

      case VG_USERREQ__PRINTF_VALIST_BY_REF: {
         va_list* vargsp = (va_list*)arg[2];
         Int count =
            VG_(vmessage)( Vg_ClientMsg, (char *)arg[1], *vargsp );
         VG_(message_flush)();
         SET_CLREQ_RETVAL( tid, count );
         break;
      }
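      /* Client-side sketch: this is the request behind the current
         VALGRIND_PRINTF macro; the client passes the *address* of
         its va_list, which is portable across ABIs:

            VALGRIND_PRINTF("checked %d buffers\n", n_buffers);

         The text appears on Valgrind's log stream tagged as client
         output, and the request returns the character count. */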

      case VG_USERREQ__PRINTF_BACKTRACE_VALIST_BY_REF: {
         va_list* vargsp = (va_list*)arg[2];
         Int count =
            VG_(vmessage)( Vg_ClientMsg, (char *)arg[1], *vargsp );
         VG_(message_flush)();
         VG_(get_and_pp_StackTrace)( tid, VG_(clo_backtrace_size) );
         SET_CLREQ_RETVAL( tid, count );
         break;
      }

      case VG_USERREQ__INTERNAL_PRINTF_VALIST_BY_REF: {
         va_list* vargsp = (va_list*)arg[2];
         Int count =
            VG_(vmessage)( Vg_DebugMsg, (char *)arg[1], *vargsp );
         VG_(message_flush)();
         SET_CLREQ_RETVAL( tid, count );
         break;
      }

      case VG_USERREQ__ADD_IFUNC_TARGET: {
         VG_(redir_add_ifunc_target)( arg[1], arg[2] );
         SET_CLREQ_RETVAL( tid, 0);
         break; }

      case VG_USERREQ__STACK_REGISTER: {
         UWord sid = VG_(register_stack)((Addr)arg[1], (Addr)arg[2]);
         SET_CLREQ_RETVAL( tid, sid );
         break; }

      case VG_USERREQ__STACK_DEREGISTER: {
         VG_(deregister_stack)(arg[1]);
         SET_CLREQ_RETVAL( tid, 0 );     /* return value is meaningless */
         break; }

      case VG_USERREQ__STACK_CHANGE: {
         VG_(change_stack)(arg[1], (Addr)arg[2], (Addr)arg[3]);
         SET_CLREQ_RETVAL( tid, 0 );     /* return value is meaningless */
         break; }
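      /* Client-side sketch for the three stack requests above, e.g.
         a user-level thread library announcing a stack it carved out
         itself (macro names per valgrind.h; STK_SZB is illustrative):

            char* stk = malloc(STK_SZB);
            unsigned id = VALGRIND_STACK_REGISTER(stk, stk + STK_SZB);
            ...
            VALGRIND_STACK_DEREGISTER(id);

         This keeps the stack-pointer heuristics from misfiring when
         the client switches SP to memory Valgrind would not
         otherwise treat as a stack. */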

      case VG_USERREQ__GET_MALLOCFUNCS: {
         struct vg_mallocfunc_info *info = (struct vg_mallocfunc_info *)arg[1];

         info->tl_malloc               = VG_(tdict).tool_malloc;
         info->tl_calloc               = VG_(tdict).tool_calloc;
         info->tl_realloc              = VG_(tdict).tool_realloc;
         info->tl_memalign             = VG_(tdict).tool_memalign;
         info->tl___builtin_new        = VG_(tdict).tool___builtin_new;
         info->tl___builtin_vec_new    = VG_(tdict).tool___builtin_vec_new;
         info->tl_free                 = VG_(tdict).tool_free;
         info->tl___builtin_delete     = VG_(tdict).tool___builtin_delete;
         info->tl___builtin_vec_delete = VG_(tdict).tool___builtin_vec_delete;
         info->tl_malloc_usable_size   = VG_(tdict).tool_malloc_usable_size;

         info->mallinfo                = VG_(mallinfo);
         info->clo_trace_malloc        = VG_(clo_trace_malloc);

         SET_CLREQ_RETVAL( tid, 0 );     /* return value is meaningless */

         break;
      }

      /* Requests from the client program */

      case VG_USERREQ__DISCARD_TRANSLATIONS:
         if (VG_(clo_verbosity) > 2)
            VG_(printf)( "client request: DISCARD_TRANSLATIONS,"
                         " addr %p,  len %lu\n",
                         (void*)arg[1], arg[2] );

         VG_(discard_translations)(
            arg[1], arg[2], "scheduler(VG_USERREQ__DISCARD_TRANSLATIONS)"
         );

         SET_CLREQ_RETVAL( tid, 0 );     /* return value is meaningless */
         break;
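      /* Client-side sketch: JITs and dynamic code patchers must
         issue this before overwriting code Valgrind may already have
         translated (buffer names illustrative):

            VALGRIND_DISCARD_TRANSLATIONS(codebuf, codebuf_used);

         Otherwise the dispatcher could keep running stale
         translations of the old bytes. */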

      case VG_USERREQ__COUNT_ERRORS:
         SET_CLREQ_RETVAL( tid, VG_(get_n_errs_found)() );
         break;
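      /* Client-side sketch: surfaced as VALGRIND_COUNT_ERRORS, which
         the regression tests use to assert on the running total:

            unsigned long n = VALGRIND_COUNT_ERRORS;
      */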

      case VG_USERREQ__LOAD_PDB_DEBUGINFO:
         VG_(di_notify_pdb_debuginfo)( arg[1], arg[2], arg[3], arg[4] );
         SET_CLREQ_RETVAL( tid, 0 );     /* return value is meaningless */
         break;

      case VG_USERREQ__MAP_IP_TO_SRCLOC: {
         Addr   ip    = arg[1];
         UChar* buf64 = (UChar*)arg[2];

         VG_(memset)(buf64, 0, 64);
         UInt linenum = 0;
         Bool ok = VG_(get_filename_linenum)(
                      ip, &buf64[0], 50, NULL, 0, NULL, &linenum
                   );
         if (ok) {
            /* Find the terminating zero in the first 50 bytes. */
            UInt i;
            for (i = 0; i < 50; i++) {
               if (buf64[i] == 0)
                  break;
            }
            /* We must find a zero somewhere in 0 .. 49; otherwise
               VG_(get_filename_linenum) did not zero-terminate the
               string properly. */
            vg_assert(i < 50);
            VG_(sprintf)(&buf64[i], ":%u", linenum);
         } else {
            buf64[0] = 0;
         }

         SET_CLREQ_RETVAL( tid, 0 ); /* return value is meaningless */
         break;
      }
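      /* Result layout (follows from the code above): on success
         buf64 holds "filename:lineno", e.g. "foo.c:123".  The
         filename is clipped to at most 49 characters, so ':' plus a
         32-bit line number (up to 10 digits) and the trailing NUL
         always fit inside the 64-byte buffer. */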

      case VG_USERREQ__CHANGE_ERR_DISABLEMENT: {
         Word delta = arg[1];
         vg_assert(delta == 1 || delta == -1);
         ThreadState* tst = VG_(get_ThreadState)(tid);
         vg_assert(tst);
         if (delta == 1 && tst->err_disablement_level < 0xFFFFFFFF) {
            tst->err_disablement_level++;
         }
         else if (delta == -1 && tst->err_disablement_level > 0) {
            tst->err_disablement_level--;
         }
         SET_CLREQ_RETVAL( tid, 0 ); /* return value is meaningless */
         break;
      }
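      /* Client-side sketch: the disablement level is driven by the
         paired valgrind.h macros, and it nests:

            VALGRIND_DISABLE_ERROR_REPORTING;  // level 0 -> 1
            VALGRIND_DISABLE_ERROR_REPORTING;  // level 1 -> 2: still off
            ...
            VALGRIND_ENABLE_ERROR_REPORTING;   // level 2 -> 1: still off
            VALGRIND_ENABLE_ERROR_REPORTING;   // level 1 -> 0: back on

         Errors are reported only while the thread's level is zero. */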

      case VG_USERREQ__MALLOCLIKE_BLOCK:
      case VG_USERREQ__RESIZEINPLACE_BLOCK:
      case VG_USERREQ__FREELIKE_BLOCK:
         // Ignore them if the addr is NULL; otherwise pass them on to the tool.
         if (!arg[1]) {
            SET_CLREQ_RETVAL( tid, 0 );     /* return value is meaningless */
            break;
         } else {
            goto my_default;
         }
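      /* Client-side sketch: custom allocators describe their blocks
         with, e.g. (see valgrind.h for the exact parameter lists):

            VALGRIND_MALLOCLIKE_BLOCK(p, szB, /*rzB*/0, /*is_zeroed*/0);
            ...
            VALGRIND_FREELIKE_BLOCK(p, /*rzB*/0);

         Filtering NULL addresses here saves every tool from having
         to do it itself. */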

      case VG_USERREQ__NACL_MEM_START: {
         Addr mem_start = arg[1];
         nacl_head = mem_start;
         VG_(printf)("*********************** NaCl mem_start: %p\n",
                     (void*)mem_start);

         // At this point all segments in the sandbox belong to nacl_file (the
         // first untrusted binary loaded by sel_ldr) and have correct
         // permissions.  Read its debug info.
         NSegment* seg = VG_(am_find_nsegment)(mem_start);
         Int fnIdx = -1;
         while (seg) {
            if (seg->kind == SkFileC) {
               if (fnIdx == seg->fnIdx || fnIdx == -1) {
                  fnIdx = seg->fnIdx;
                  VG_(printf)("Segment at %p belongs to the loader\n",
                              (void*)seg->start);
                  VG_(di_notify_mmap)(seg->start, False,
                                      /*glider: don't use fd*/-1);
               }
            }
            seg = VG_(am_next_nsegment)((NSegment*)seg, True);
         }
         goto my_default;
      }

      case VG_USERREQ__NACL_FILE: {
         VG_(printf)("*********************** NaCl nacl_file: %s\n",
                     (char*)arg[1]);
         nacl_file = (char*) arg[1];
         goto my_default;
      }

      case VG_USERREQ__NACL_MMAP: {
         // Simulate an mmap().
         UWord vma = arg[1];         // Base VMA of the mapping.
         UWord size = arg[2];        // Size of the mapping.
         UWord file_offset = arg[3]; // File offset.
         UWord access = arg[4];      // Access flags.
         UWord clone_vma = arg[5];   // Another mapping of the same file;
                                     // used only to find the file name.
         if (!access)
            access = VKI_PROT_READ | VKI_PROT_EXEC;
         VG_(printf)("*********************** NaCl nacl_mmap: %lx %lx %lx %lx\n",
                     vma, size, file_offset, clone_vma);

         char* file_name = NULL;
         if (clone_vma) {
            NSegment* seg = VG_(am_find_nsegment)(clone_vma);
            file_name = VG_(am_get_filename)(seg);
            VG_(printf)("*********************** NaCl DSO file_name: %s\n",
                        file_name);
         }

         UWord vma_end = vma + size;
         UWord vma_aligned = VG_PGROUNDDN(vma);
         UWord vma_end_aligned = VG_PGROUNDUP(vma_end);
         size = vma_end_aligned - vma_aligned;
         file_offset -= vma - vma_aligned;
         VG_(am_notify_fake_client_mmap)(
            vma_aligned, size, access, 0,
            file_name ? file_name
                      : (VG_(clo_nacl_file) ? VG_(clo_nacl_file) : nacl_file),
            file_offset);
         // If file_name == NULL, then this is the main (sel_ldr-mapped) nexe,
         // and it has incorrect permissions at this point.  In that case, wait
         // for NACL_MEM_START to read the debug info.
         if (file_name)
            VG_(di_notify_mmap)(vma_aligned, False, /*glider: don't use fd*/-1);
         goto my_default;
      }
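      /* Worked example of the rounding above, assuming 4KB pages and
         illustrative numbers: vma=0x10021, size=0x2000 gives
         vma_end=0x12021, vma_aligned=0x10000, vma_end_aligned=0x13000;
         the notified mapping is therefore 0x3000 bytes at 0x10000,
         and file_offset is pulled back by 0x21 so it stays consistent
         with the aligned base. */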


      default:
       my_default:
         if (os_client_request(tid, arg)) {
            // do nothing, os_client_request() handled it
         } else if (VG_(needs).client_requests) {
            UWord ret;

            if (VG_(clo_verbosity) > 2)
               VG_(printf)("client request: code %lx,  addr %p,  len %lu\n",
                           arg[0], (void*)arg[1], arg[2] );

            if ( VG_TDICT_CALL(tool_handle_client_request, tid, arg, &ret) )
               SET_CLREQ_RETVAL(tid, ret);
         } else {
            static Bool whined = False;

            if (!whined && VG_(clo_verbosity) > 2) {
               // Allow for requests in core, but defined by tools, which
               // have 0 and 0 in their two high bytes.
               Char c1 = (arg[0] >> 24) & 0xff;
               Char c2 = (arg[0] >> 16) & 0xff;
               if (c1 == 0) c1 = '_';
               if (c2 == 0) c2 = '_';
               VG_(message)(Vg_UserMsg, "Warning:\n"
                   "  unhandled client request: 0x%lx (%c%c+0x%lx).  Perhaps\n"
                   "  VG_(needs).client_requests should be set?\n",
                   arg[0], c1, c2, arg[0] & 0xffff);
               whined = True;
            }
         }
         break;
   }
   return;

   /*NOTREACHED*/
  va_list_casting_error_NORETURN:
   VG_(umsg)(
      "Valgrind: fatal error - cannot continue: use of the deprecated\n"
      "client requests VG_USERREQ__PRINTF or VG_USERREQ__PRINTF_BACKTRACE\n"
      "on a platform where they cannot be supported.  Please use the\n"
      "equivalent _VALIST_BY_REF versions instead.\n"
      "\n"
      "This is a binary-incompatible change in Valgrind's client request\n"
      "mechanism.  It is unfortunate, but difficult to avoid.  End-users\n"
      "are expected to almost never see this message.  The only case in\n"
      "which you might see this message is if your code uses the macros\n"
      "VALGRIND_PRINTF or VALGRIND_PRINTF_BACKTRACE.  If so, you will need\n"
      "to recompile such code, using the header files from this version of\n"
      "Valgrind, and not any previous version.\n"
      "\n"
      "If you see this message in any other circumstances, it is probably\n"
      "a bug in Valgrind.  In this case, please file a bug report at\n"
      "\n"
      "   http://www.valgrind.org/support/bug_reports.html\n"
      "\n"
      "Will now abort.\n"
   );
   vg_assert(0);
}


/* ---------------------------------------------------------------------
   Sanity checking (permanently engaged)
   ------------------------------------------------------------------ */

/* Internal consistency checks on the sched structures. */
static
void scheduler_sanity ( ThreadId tid )
{
   Bool bad = False;
   static UInt lasttime = 0;
   UInt now;
   Int lwpid = VG_(gettid)();

   if (!VG_(is_running_thread)(tid)) {
      VG_(message)(Vg_DebugMsg,
                   "Thread %d is supposed to be running, "
                   "but doesn't own the_BigLock (owned by %d)\n",
                   tid, VG_(running_tid));
      bad = True;
   }

   if (lwpid != VG_(threads)[tid].os_state.lwpid) {
      VG_(message)(Vg_DebugMsg,
                   "Thread %d is supposed to be in LWP %d, "
                   "but we're actually in %d\n",
                   tid, VG_(threads)[tid].os_state.lwpid, lwpid);
      bad = True;
   }

#if !defined(VGO_darwin)
   // GrP fixme
   if (lwpid != the_BigLock.owner_lwpid) {
      VG_(message)(Vg_DebugMsg,
                   "Thread %d (LWP %d) doesn't own the_BigLock\n",
                   tid, lwpid);
      bad = True;
   }
#endif

   /* Periodically show the state of all threads, for debugging
      purposes. */
   now = VG_(read_millisecond_timer)();
   if (0 && (!bad) && (lasttime + 4000/*ms*/ <= now)) {
      lasttime = now;
      VG_(printf)("\n------------ Sched State at %d ms ------------\n",
                  (Int)now);
      VG_(show_sched_status)();
   }

   /* core_panic also shows the sched status, which is why we don't
      show it above if bad==True. */
   if (bad)
      VG_(core_panic)("scheduler_sanity: failed");
}

void VG_(sanity_check_general) ( Bool force_expensive )
{
   ThreadId tid;

   static UInt next_slow_check_at = 1;
   static UInt slow_check_interval = 25;

   if (VG_(clo_sanity_level) < 1) return;

   /* --- First do all the tests that we can do quickly. ---*/

   sanity_fast_count++;

   /* Check stuff pertaining to the memory check system. */

   /* Check that nobody has spuriously claimed that the first or
      last 16 pages of memory have become accessible [...] */
   if (VG_(needs).sanity_checks) {
      vg_assert(VG_TDICT_CALL(tool_cheap_sanity_check));
   }

   /* --- Now some more expensive checks. ---*/

   /* Once every now and again, check some more expensive stuff.
      Gradually increase the interval between such checks so as not to
      burden long-running programs too much. */
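   /* With the initial values above (first slow check at fast-count 1,
      interval 25), slow checks land at fast-counts 1, 25, 50, 76,
      103, ...: each pass sets next_slow_check_at = count - 1 +
      interval and then bumps the interval, so the gap between checks
      grows by one each time. */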
   if ( force_expensive
        || VG_(clo_sanity_level) > 1
        || (VG_(clo_sanity_level) == 1
            && sanity_fast_count == next_slow_check_at)) {

      if (0) VG_(printf)("SLOW at %d\n", sanity_fast_count-1);

      next_slow_check_at = sanity_fast_count - 1 + slow_check_interval;
      slow_check_interval++;
      sanity_slow_count++;

      if (VG_(needs).sanity_checks) {
         vg_assert(VG_TDICT_CALL(tool_expensive_sanity_check));
      }

      /* Look for stack overruns.  Visit all threads. */
      for (tid = 1; tid < VG_N_THREADS; tid++) {
         SizeT    remains;
         VgStack* stack;

         if (VG_(threads)[tid].status == VgTs_Empty ||
             VG_(threads)[tid].status == VgTs_Zombie)
            continue;

         stack
            = (VgStack*)
              VG_(get_ThreadState)(tid)->os_state.valgrind_stack_base;
         SizeT limit
            = 4096; // Let's say.  Checking more causes lots of L2 misses.
         remains
            = VG_(am_get_VgStack_unused_szB)(stack, limit);
         if (remains < limit)
            VG_(message)(Vg_DebugMsg,
                         "WARNING: Thread %d is within %ld bytes "
                         "of running out of stack!\n",
                         tid, remains);
      }
   }

   if (VG_(clo_sanity_level) > 1) {
      /* Check the sanity of the low-level memory manager.  Note that
         bugs in the client's code can cause this to fail, so we don't
         do this check unless specifically asked for; it is also
         potentially very expensive. */
      VG_(sanity_check_malloc_all)();
   }
}

/*--------------------------------------------------------------------*/
/*--- end                                                          ---*/
/*--------------------------------------------------------------------*/
