1
2/*--------------------------------------------------------------------*/
3/*--- Handle system calls.                          syswrap-main.c ---*/
4/*--------------------------------------------------------------------*/
5
6/*
7   This file is part of Valgrind, a dynamic binary instrumentation
8   framework.
9
10   Copyright (C) 2000-2011 Julian Seward
11      jseward@acm.org
12
13   This program is free software; you can redistribute it and/or
14   modify it under the terms of the GNU General Public License as
15   published by the Free Software Foundation; either version 2 of the
16   License, or (at your option) any later version.
17
18   This program is distributed in the hope that it will be useful, but
19   WITHOUT ANY WARRANTY; without even the implied warranty of
20   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
21   General Public License for more details.
22
23   You should have received a copy of the GNU General Public License
24   along with this program; if not, write to the Free Software
25   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
26   02111-1307, USA.
27
28   The GNU General Public License is contained in the file COPYING.
29*/
30
31#include "libvex_guest_offsets.h"
32#include "libvex_trc_values.h"
33#include "pub_core_basics.h"
34#include "pub_core_aspacemgr.h"
35#include "pub_core_vki.h"
36#include "pub_core_vkiscnums.h"
37#include "pub_core_libcsetjmp.h"    // to keep _threadstate.h happy
38#include "pub_core_threadstate.h"
39#include "pub_core_libcbase.h"
40#include "pub_core_libcassert.h"
41#include "pub_core_libcprint.h"
42#include "pub_core_libcproc.h"      // For VG_(getpid)()
43#include "pub_core_libcsignal.h"
44#include "pub_core_scheduler.h"     // For VG_({acquire,release}_BigLock),
45                                    //   and VG_(vg_yield)
46#include "pub_core_stacktrace.h"    // For VG_(get_and_pp_StackTrace)()
47#include "pub_core_tooliface.h"
48#include "pub_core_options.h"
49#include "pub_core_signals.h"       // For VG_SIGVGKILL, VG_(poll_signals)
50#include "pub_core_syscall.h"
51#include "pub_core_machine.h"
52#include "pub_core_syswrap.h"
53
54#include "priv_types_n_macros.h"
55#include "priv_syswrap-main.h"
56
57#if defined(VGO_darwin)
58#include "priv_syswrap-darwin.h"
59#endif
60
61/* Useful info which needs to be recorded somewhere:
62   Use of registers in syscalls is:
63
64          NUM   ARG1 ARG2 ARG3 ARG4 ARG5 ARG6 ARG7 ARG8 RESULT
65   LINUX:
66   x86    eax   ebx  ecx  edx  esi  edi  ebp  n/a  n/a  eax       (== NUM)
67   amd64  rax   rdi  rsi  rdx  r10  r8   r9   n/a  n/a  rax       (== NUM)
68   ppc32  r0    r3   r4   r5   r6   r7   r8   n/a  n/a  r3+CR0.SO (== ARG1)
69   ppc64  r0    r3   r4   r5   r6   r7   r8   n/a  n/a  r3+CR0.SO (== ARG1)
70   arm    r7    r0   r1   r2   r3   r4   r5   n/a  n/a  r0        (== ARG1)
71
72   On s390x the svc instruction is used for system calls. The system call
73   number is encoded in the instruction (8 bit immediate field). Since Linux
74   2.6 it is also allowed to use svc 0 with the system call number in r1.
75   This was introduced for system calls >255, but works for all. It is
76   also possible to see the svc 0 together with an EXecute instruction, that
77   fills in the immediate field.
78   s390x r1/SVC r2   r3   r4   r5   r6   r7   n/a  n/a  r2        (== ARG1)
79
80   DARWIN:
81   x86    eax +4   +8   +12  +16  +20  +24  +28  +32  edx:eax, eflags.c
82   amd64  rax rdi  rsi  rdx  rcx  r8   r9   +8   +16  rdx:rax, rflags.c
83
84   For x86-darwin, "+N" denotes "in memory at N(%esp)"; ditto
85   amd64-darwin.  Apparently 0(%esp) is some kind of return address
86   (perhaps for syscalls done with "sysenter"?)  I don't think it is
87   relevant for syscalls done with "int $0x80/1/2".
88*/
89
90/* This is the top level of the system-call handler module.  All
91   system calls are channelled through here, doing two things:
92
93   * notify the tool of the events (mem/reg reads, writes) happening
94
95   * perform the syscall, usually by passing it along to the kernel
96     unmodified.
97
98   A magical piece of assembly code, do_syscall_for_client_WRK, in
99   syscall-$PLATFORM.S does the tricky bit of passing a syscall to the
100   kernel, whilst having the simulator retain control.
101*/
102
103/* The main function is VG_(client_syscall).  The simulation calls it
104   whenever a client thread wants to do a syscall.  The following is a
105   sketch of what it does.
106
107   * Ensures the root thread's stack is suitably mapped.  Tedious and
108     arcane.  See big big comment in VG_(client_syscall).
109
110   * First, it rounds up the syscall number and args (which is a
111     platform dependent activity) and puts them in a struct ("args")
112     and also a copy in "orig_args".
113
114     The pre/post wrappers refer to these structs and so no longer
115     need magic macros to access any specific registers.  This struct
116     is stored in thread-specific storage.
117
118
119   * The pre-wrapper is called, passing it a pointer to struct
120     "args".
121
122
123   * The pre-wrapper examines the args and pokes the tool
124     appropriately.  It may modify the args; this is why "orig_args"
125     is also stored.
126
127     The pre-wrapper may choose to 'do' the syscall itself, and
128     concludes one of three outcomes:
129
130       Success(N)    -- syscall is already complete, with success;
131                        result is N
132
133       Fail(N)       -- syscall is already complete, with failure;
134                        error code is N
135
136       HandToKernel  -- (the usual case): this needs to be given to
137                        the kernel to be done, using the values in
138                        the possibly-modified "args" struct.
139
140     In addition, the pre-wrapper may set some flags:
141
142       MayBlock   -- only applicable when outcome==HandToKernel
143
144       PostOnFail -- only applicable when outcome==HandToKernel or Fail
145
146
147   * If the pre-outcome is HandToKernel, the syscall is duly handed
148     off to the kernel (perhaps involving some thread switchery, but
149     that's not important).  This reduces the possible set of outcomes
150     to either Success(N) or Fail(N).
151
152
153   * The outcome (Success(N) or Fail(N)) is written back to the guest
154     register(s).  This is platform specific:
155
156     x86:    Success(N) ==>  eax = N
157             Fail(N)    ==>  eax = -N
158
159     ditto amd64
160
161     ppc32:  Success(N) ==>  r3 = N, CR0.SO = 0
162             Fail(N) ==>     r3 = N, CR0.SO = 1
163
164     Darwin:
165     x86:    Success(N) ==>  edx:eax = N, cc = 0
166             Fail(N)    ==>  edx:eax = N, cc = 1
167
168     s390x:  Success(N) ==>  r2 = N
169             Fail(N)    ==>  r2 = -N
170
171   * The post wrapper is called if:
172
173     - it exists, and
174     - outcome==Success or (outcome==Fail and PostOnFail is set)
175
176     The post wrapper is passed the adulterated syscall args (struct
177     "args"), and the syscall outcome (viz, Success(N) or Fail(N)).
178
179   There are several other complications, primarily to do with
180   syscalls getting interrupted, explained in comments in the code.
181*/
182
183/* CAVEATS for writing wrappers.  It is important to follow these!
184
185   The macros defined in priv_types_n_macros.h are designed to help
186   decouple the wrapper logic from the actual representation of
187   syscall args/results, since these wrappers are designed to work on
188   multiple platforms.
189
190   Sometimes a PRE wrapper will complete the syscall itself, without
191   handing it to the kernel.  It will use one of SET_STATUS_Success,
192   SET_STATUS_Failure or SET_STATUS_from_SysRes to set the return
193   value.  It is critical to appreciate that use of the macro does not
194   immediately cause the underlying guest state to be updated -- that
195   is done by the driver logic in this file, when the wrapper returns.
196
197   As a result, PRE wrappers of the following form will malfunction:
198
199   PRE(fooble)
200   {
201      ... do stuff ...
202      SET_STATUS_Somehow(...)
203
204      // do something that assumes guest state is up to date
205   }
206
207   In particular, direct or indirect calls to VG_(poll_signals) after
208   setting STATUS can cause the guest state to be read (in order to
209   build signal frames).  Do not do this.  If you want a signal poll
210   after the syscall goes through, do "*flags |= SfPollAfter" and the
211   driver logic will do it for you.
212
213   -----------
214
215   Another critical requirement following introduction of new address
216   space manager (JRS, 20050923):
217
218   In a situation where the mappedness of memory has changed, aspacem
219   should be notified BEFORE the tool.  Hence the following is
220   correct:
221
222      Bool d = VG_(am_notify_munmap)(s->start, s->end+1 - s->start);
223      VG_TRACK( die_mem_munmap, s->start, s->end+1 - s->start );
224      if (d)
225         VG_(discard_translations)(s->start, s->end+1 - s->start);
226
227   whilst this is wrong:
228
229      VG_TRACK( die_mem_munmap, s->start, s->end+1 - s->start );
230      Bool d = VG_(am_notify_munmap)(s->start, s->end+1 - s->start);
231      if (d)
232         VG_(discard_translations)(s->start, s->end+1 - s->start);
233
234   The reason is that the tool may itself ask aspacem for more shadow
235   memory as a result of the VG_TRACK call.  In such a situation it is
236   critical that aspacem's segment array is up to date -- hence the
237   need to notify aspacem first.
238
239   -----------
240
241   Also .. take care to call VG_(discard_translations) whenever
242   memory with execute permissions is unmapped.
243*/
244
245
246/* ---------------------------------------------------------------------
247   Do potentially blocking syscall for the client, and mess with
248   signal masks at the same time.
249   ------------------------------------------------------------------ */
250
251/* Perform a syscall on behalf of a client thread, using a specific
252   signal mask.  On completion, the signal mask is set to restore_mask
253   (which presumably blocks almost everything).  If a signal happens
254   during the syscall, the handler should call
255   VG_(fixup_guest_state_after_syscall_interrupted) to adjust the
256   thread's context to do the right thing.
257
258   The _WRK function is handwritten assembly, implemented per-platform
259   in coregrind/m_syswrap/syscall-$PLAT.S.  It has some very magic
260   properties.  See comments at the top of
261   VG_(fixup_guest_state_after_syscall_interrupted) below for details.
262
   These functions are required to return zero on success (even if
   the syscall itself failed), and nonzero if the sigprocmask-swizzling
   calls failed.  We don't actually care about the failure values from
   sigprocmask, although most of the assembly implementations do
   attempt to return them, using the convention 0 for success, or
   0x8000 | error-code for failure.
269*/
#if defined(VGO_linux)
/* Linux: one worker is declared.  It is given the syscall number, a
   pointer to the guest state, the signal mask to use during the
   syscall, the mask to install afterwards, and the size in bytes of a
   signal set. */
extern
UWord ML_(do_syscall_for_client_WRK)( Word syscallno,
                                      void* guest_state,
                                      const vki_sigset_t *syscall_mask,
                                      const vki_sigset_t *restore_mask,
                                      Word sigsetSzB );
#elif defined(VGO_darwin)
/* Darwin: three workers with identical signatures are declared, one per
   syscall class (unix, mach, mdep).  do_syscall_for_client picks the
   one matching VG_DARWIN_SYSNO_CLASS(syscallno).  The sigsetSzB
   parameter is unused on this OS. */
extern
UWord ML_(do_syscall_for_client_unix_WRK)( Word syscallno,
                                           void* guest_state,
                                           const vki_sigset_t *syscall_mask,
                                           const vki_sigset_t *restore_mask,
                                           Word sigsetSzB ); /* unused */
extern
UWord ML_(do_syscall_for_client_mach_WRK)( Word syscallno,
                                           void* guest_state,
                                           const vki_sigset_t *syscall_mask,
                                           const vki_sigset_t *restore_mask,
                                           Word sigsetSzB ); /* unused */
extern
UWord ML_(do_syscall_for_client_mdep_WRK)( Word syscallno,
                                           void* guest_state,
                                           const vki_sigset_t *syscall_mask,
                                           const vki_sigset_t *restore_mask,
                                           Word sigsetSzB ); /* unused */
#else
#  error "Unknown OS"
#endif
299
300
301static
302void do_syscall_for_client ( Int syscallno,
303                             ThreadState* tst,
304                             const vki_sigset_t* syscall_mask )
305{
306   vki_sigset_t saved;
307   UWord err;
308#  if defined(VGO_linux)
309   err = ML_(do_syscall_for_client_WRK)(
310            syscallno, &tst->arch.vex,
311            syscall_mask, &saved, sizeof(vki_sigset_t)
312         );
313#  elif defined(VGO_darwin)
314   switch (VG_DARWIN_SYSNO_CLASS(syscallno)) {
315      case VG_DARWIN_SYSCALL_CLASS_UNIX:
316         err = ML_(do_syscall_for_client_unix_WRK)(
317                  VG_DARWIN_SYSNO_FOR_KERNEL(syscallno), &tst->arch.vex,
318                  syscall_mask, &saved, 0/*unused:sigsetSzB*/
319               );
320         break;
321      case VG_DARWIN_SYSCALL_CLASS_MACH:
322         err = ML_(do_syscall_for_client_mach_WRK)(
323                  VG_DARWIN_SYSNO_FOR_KERNEL(syscallno), &tst->arch.vex,
324                  syscall_mask, &saved, 0/*unused:sigsetSzB*/
325               );
326         break;
327      case VG_DARWIN_SYSCALL_CLASS_MDEP:
328         err = ML_(do_syscall_for_client_mdep_WRK)(
329                  VG_DARWIN_SYSNO_FOR_KERNEL(syscallno), &tst->arch.vex,
330                  syscall_mask, &saved, 0/*unused:sigsetSzB*/
331               );
332         break;
333      default:
334         vg_assert(0);
335         /*NOTREACHED*/
336         break;
337   }
338#  else
339#    error "Unknown OS"
340#  endif
341   vg_assert2(
342      err == 0,
343      "ML_(do_syscall_for_client_WRK): sigprocmask error %d",
344      (Int)(err & 0xFFF)
345   );
346}
347
348
349/* ---------------------------------------------------------------------
350   Impedance matchers and misc helpers
351   ------------------------------------------------------------------ */
352
353static
354Bool eq_SyscallArgs ( SyscallArgs* a1, SyscallArgs* a2 )
355{
356   return a1->sysno == a2->sysno
357          && a1->arg1 == a2->arg1
358          && a1->arg2 == a2->arg2
359          && a1->arg3 == a2->arg3
360          && a1->arg4 == a2->arg4
361          && a1->arg5 == a2->arg5
362          && a1->arg6 == a2->arg6
363          && a1->arg7 == a2->arg7
364          && a1->arg8 == a2->arg8;
365}
366
367static
368Bool eq_SyscallStatus ( SyscallStatus* s1, SyscallStatus* s2 )
369{
370   /* was: return s1->what == s2->what && sr_EQ( s1->sres, s2->sres ); */
371   if (s1->what == s2->what && sr_EQ( s1->sres, s2->sres ))
372      return True;
373#  if defined(VGO_darwin)
374   /* Darwin-specific debugging guff */
375   vg_assert(s1->what == s2->what);
376   VG_(printf)("eq_SyscallStatus:\n");
377   VG_(printf)("  {%lu %lu %u}\n", s1->sres._wLO, s1->sres._wHI, s1->sres._mode);
378   VG_(printf)("  {%lu %lu %u}\n", s2->sres._wLO, s2->sres._wHI, s2->sres._mode);
379   vg_assert(0);
380#  endif
381   return False;
382}
383
384/* Convert between SysRes and SyscallStatus, to the extent possible. */
385
386static
387SyscallStatus convert_SysRes_to_SyscallStatus ( SysRes res )
388{
389   SyscallStatus status;
390   status.what = SsComplete;
391   status.sres = res;
392   return status;
393}
394
395
396/* Impedance matchers.  These convert syscall arg or result data from
397   the platform-specific in-guest-state format to the canonical
398   formats, and back. */
399
/* Extract the syscall number and arguments from the guest state into
   the canonical SyscallArgs form.

   canonical    (out) receives sysno and arg1..arg8; every field is
                written on every platform.
   gst_vanilla  (in)  the guest state, reinterpreted here as the
                platform-specific guest state type.
   trc          (in)  only consulted on Darwin.  On x86-darwin it
                selects which syscall class is encoded into sysno; on
                amd64-darwin it is merely asserted to be
                VEX_TRC_JMP_SYS_SYSCALL.

   On the Linux platforms the arguments come purely from registers and
   arg7/arg8 are set to zero.  On Darwin some or all arguments are read
   from the guest stack, and the indirect "syscall()" form is unpacked
   by shifting the arguments down by one. */
static
void getSyscallArgsFromGuestState ( /*OUT*/SyscallArgs*       canonical,
                                    /*IN*/ VexGuestArchState* gst_vanilla,
                                    /*IN*/ UInt trc )
{
#if defined(VGP_x86_linux)
   /* Number in EAX; args in EBX, ECX, EDX, ESI, EDI, EBP. */
   VexGuestX86State* gst = (VexGuestX86State*)gst_vanilla;
   canonical->sysno = gst->guest_EAX;
   canonical->arg1  = gst->guest_EBX;
   canonical->arg2  = gst->guest_ECX;
   canonical->arg3  = gst->guest_EDX;
   canonical->arg4  = gst->guest_ESI;
   canonical->arg5  = gst->guest_EDI;
   canonical->arg6  = gst->guest_EBP;
   canonical->arg7  = 0;
   canonical->arg8  = 0;

#elif defined(VGP_amd64_linux)
   /* Number in RAX; args in RDI, RSI, RDX, R10, R8, R9. */
   VexGuestAMD64State* gst = (VexGuestAMD64State*)gst_vanilla;
   canonical->sysno = gst->guest_RAX;
   canonical->arg1  = gst->guest_RDI;
   canonical->arg2  = gst->guest_RSI;
   canonical->arg3  = gst->guest_RDX;
   canonical->arg4  = gst->guest_R10;
   canonical->arg5  = gst->guest_R8;
   canonical->arg6  = gst->guest_R9;
   canonical->arg7  = 0;
   canonical->arg8  = 0;

#elif defined(VGP_ppc32_linux)
   /* Number in GPR0; args in GPR3..GPR8. */
   VexGuestPPC32State* gst = (VexGuestPPC32State*)gst_vanilla;
   canonical->sysno = gst->guest_GPR0;
   canonical->arg1  = gst->guest_GPR3;
   canonical->arg2  = gst->guest_GPR4;
   canonical->arg3  = gst->guest_GPR5;
   canonical->arg4  = gst->guest_GPR6;
   canonical->arg5  = gst->guest_GPR7;
   canonical->arg6  = gst->guest_GPR8;
   canonical->arg7  = 0;
   canonical->arg8  = 0;

#elif defined(VGP_ppc64_linux)
   /* Same register assignment as ppc32: GPR0, then GPR3..GPR8. */
   VexGuestPPC64State* gst = (VexGuestPPC64State*)gst_vanilla;
   canonical->sysno = gst->guest_GPR0;
   canonical->arg1  = gst->guest_GPR3;
   canonical->arg2  = gst->guest_GPR4;
   canonical->arg3  = gst->guest_GPR5;
   canonical->arg4  = gst->guest_GPR6;
   canonical->arg5  = gst->guest_GPR7;
   canonical->arg6  = gst->guest_GPR8;
   canonical->arg7  = 0;
   canonical->arg8  = 0;

#elif defined(VGP_arm_linux)
   /* Number in R7; args in R0..R5. */
   VexGuestARMState* gst = (VexGuestARMState*)gst_vanilla;
   canonical->sysno = gst->guest_R7;
   canonical->arg1  = gst->guest_R0;
   canonical->arg2  = gst->guest_R1;
   canonical->arg3  = gst->guest_R2;
   canonical->arg4  = gst->guest_R3;
   canonical->arg5  = gst->guest_R4;
   canonical->arg6  = gst->guest_R5;
   canonical->arg7  = 0;
   canonical->arg8  = 0;

#elif defined(VGP_x86_darwin)
   /* Number in EAX; all eight args are read from the guest stack,
      which is dereferenced directly via guest_ESP. */
   VexGuestX86State* gst = (VexGuestX86State*)gst_vanilla;
   UWord *stack = (UWord *)gst->guest_ESP;
   // GrP fixme hope syscalls aren't called with really shallow stacks...
   canonical->sysno = gst->guest_EAX;
   if (canonical->sysno != 0) {
      // stack[0] is return address
      canonical->arg1  = stack[1];
      canonical->arg2  = stack[2];
      canonical->arg3  = stack[3];
      canonical->arg4  = stack[4];
      canonical->arg5  = stack[5];
      canonical->arg6  = stack[6];
      canonical->arg7  = stack[7];
      canonical->arg8  = stack[8];
   } else {
      /* EAX == 0 is treated as the indirect syscall() form: the real
         syscall number is the first stack argument and the real
         arguments follow it, one slot further up. */
      // GrP fixme hack handle syscall()
      // GrP fixme what about __syscall() ?
      // stack[0] is return address
      // DDD: the tool can't see that the params have been shifted!  Can
      //      lead to incorrect checking, I think, because the PRRAn/PSARn
      //      macros will mention the pre-shifted args.
      canonical->sysno = stack[1];
      vg_assert(canonical->sysno != 0);
      canonical->arg1  = stack[2];
      canonical->arg2  = stack[3];
      canonical->arg3  = stack[4];
      canonical->arg4  = stack[5];
      canonical->arg5  = stack[6];
      canonical->arg6  = stack[7];
      canonical->arg7  = stack[8];
      canonical->arg8  = stack[9];

      PRINT("SYSCALL[%d,?](%s) syscall(%s, ...); please stand by...\n",
            VG_(getpid)(), /*tid,*/
            VG_SYSNUM_STRING(0), VG_SYSNUM_STRING(canonical->sysno));
   }

   // Here we determine what kind of syscall it was by looking at the
   // interrupt kind, and then encode the syscall number using the 64-bit
   // encoding for Valgrind's internal use.
   //
   // DDD: Would it be better to stash the JMP kind into the Darwin
   // thread state rather than passing in the trc?
   switch (trc) {
   case VEX_TRC_JMP_SYS_INT128:
      // int $0x80 = Unix, 64-bit result
      vg_assert(canonical->sysno >= 0);
      canonical->sysno = VG_DARWIN_SYSCALL_CONSTRUCT_UNIX(canonical->sysno);
      break;
   case VEX_TRC_JMP_SYS_SYSENTER:
      // syscall = Unix, 32-bit result
      // OR        Mach, 32-bit result
      /* The sign of the raw number distinguishes the two: non-negative
         is Unix, negative is Mach (encoded using the negated value). */
      if (canonical->sysno >= 0) {
         // GrP fixme hack:  0xffff == I386_SYSCALL_NUMBER_MASK
         canonical->sysno = VG_DARWIN_SYSCALL_CONSTRUCT_UNIX(canonical->sysno
                                                             & 0xffff);
      } else {
         canonical->sysno = VG_DARWIN_SYSCALL_CONSTRUCT_MACH(-canonical->sysno);
      }
      break;
   case VEX_TRC_JMP_SYS_INT129:
      // int $0x81 = Mach, 32-bit result
      vg_assert(canonical->sysno < 0);
      canonical->sysno = VG_DARWIN_SYSCALL_CONSTRUCT_MACH(-canonical->sysno);
      break;
   case VEX_TRC_JMP_SYS_INT130:
      // int $0x82 = mdep, 32-bit result
      vg_assert(canonical->sysno >= 0);
      canonical->sysno = VG_DARWIN_SYSCALL_CONSTRUCT_MDEP(canonical->sysno);
      break;
   default:
      /* Any other trc value is not expected here. */
      vg_assert(0);
      break;
   }

#elif defined(VGP_amd64_darwin)
   /* Number in RAX; first six args in registers, arg7/arg8 read from
      the guest stack via guest_RSP. */
   VexGuestAMD64State* gst = (VexGuestAMD64State*)gst_vanilla;
   UWord *stack = (UWord *)gst->guest_RSP;

   vg_assert(trc == VEX_TRC_JMP_SYS_SYSCALL);

   // GrP fixme hope syscalls aren't called with really shallow stacks...
   canonical->sysno = gst->guest_RAX;
   if (canonical->sysno != __NR_syscall) {
      // stack[0] is return address
      canonical->arg1  = gst->guest_RDI;
      canonical->arg2  = gst->guest_RSI;
      canonical->arg3  = gst->guest_RDX;
      canonical->arg4  = gst->guest_R10;  // not rcx with syscall insn
      canonical->arg5  = gst->guest_R8;
      canonical->arg6  = gst->guest_R9;
      canonical->arg7  = stack[1];
      canonical->arg8  = stack[2];
   } else {
      /* Indirect syscall() form: RDI holds the real syscall number
         (encoded here as a Unix-class number) and every argument is
         taken from one position later than in the direct form. */
      // GrP fixme hack handle syscall()
      // GrP fixme what about __syscall() ?
      // stack[0] is return address
      // DDD: the tool can't see that the params have been shifted!  Can
      //      lead to incorrect checking, I think, because the PRRAn/PSARn
      //      macros will mention the pre-shifted args.
      canonical->sysno = VG_DARWIN_SYSCALL_CONSTRUCT_UNIX(gst->guest_RDI);
      vg_assert(canonical->sysno != __NR_syscall);
      canonical->arg1  = gst->guest_RSI;
      canonical->arg2  = gst->guest_RDX;
      canonical->arg3  = gst->guest_R10;  // not rcx with syscall insn
      canonical->arg4  = gst->guest_R8;
      canonical->arg5  = gst->guest_R9;
      canonical->arg6  = stack[1];
      canonical->arg7  = stack[2];
      canonical->arg8  = stack[3];

      PRINT("SYSCALL[%d,?](%s) syscall(%s, ...); please stand by...\n",
            VG_(getpid)(), /*tid,*/
            VG_SYSNUM_STRING(0), VG_SYSNUM_STRING(canonical->sysno));
   }

   // no canonical->sysno adjustment needed

#elif defined(VGP_s390x_linux)
   /* Number comes from the guest_SYSNO field; args in r2..r7. */
   VexGuestS390XState* gst = (VexGuestS390XState*)gst_vanilla;
   canonical->sysno = gst->guest_SYSNO;
   canonical->arg1  = gst->guest_r2;
   canonical->arg2  = gst->guest_r3;
   canonical->arg3  = gst->guest_r4;
   canonical->arg4  = gst->guest_r5;
   canonical->arg5  = gst->guest_r6;
   canonical->arg6  = gst->guest_r7;
   canonical->arg7  = 0;
   canonical->arg8  = 0;
#else
#  error "getSyscallArgsFromGuestState: unknown arch"
#endif
}
599
/* Write canonical syscall arguments back into the guest state.

   canonical    (in)  the syscall number and arguments to store.
   gst_vanilla  (out) the guest state, reinterpreted here as the
                platform-specific guest state type.

   On the Linux platforms only sysno and arg1..arg6 are stored, each
   into a guest register; arg7 and arg8 are ignored.  On Darwin the
   syscall number is first converted with VG_DARWIN_SYSNO_FOR_KERNEL,
   and some or all of the arguments are stored into the guest stack
   rather than into registers. */
static
void putSyscallArgsIntoGuestState ( /*IN*/ SyscallArgs*       canonical,
                                    /*OUT*/VexGuestArchState* gst_vanilla )
{
#if defined(VGP_x86_linux)
   VexGuestX86State* gst = (VexGuestX86State*)gst_vanilla;
   gst->guest_EAX = canonical->sysno;
   gst->guest_EBX = canonical->arg1;
   gst->guest_ECX = canonical->arg2;
   gst->guest_EDX = canonical->arg3;
   gst->guest_ESI = canonical->arg4;
   gst->guest_EDI = canonical->arg5;
   gst->guest_EBP = canonical->arg6;

#elif defined(VGP_amd64_linux)
   VexGuestAMD64State* gst = (VexGuestAMD64State*)gst_vanilla;
   gst->guest_RAX = canonical->sysno;
   gst->guest_RDI = canonical->arg1;
   gst->guest_RSI = canonical->arg2;
   gst->guest_RDX = canonical->arg3;
   gst->guest_R10 = canonical->arg4;
   gst->guest_R8  = canonical->arg5;
   gst->guest_R9  = canonical->arg6;

#elif defined(VGP_ppc32_linux)
   VexGuestPPC32State* gst = (VexGuestPPC32State*)gst_vanilla;
   gst->guest_GPR0 = canonical->sysno;
   gst->guest_GPR3 = canonical->arg1;
   gst->guest_GPR4 = canonical->arg2;
   gst->guest_GPR5 = canonical->arg3;
   gst->guest_GPR6 = canonical->arg4;
   gst->guest_GPR7 = canonical->arg5;
   gst->guest_GPR8 = canonical->arg6;

#elif defined(VGP_ppc64_linux)
   VexGuestPPC64State* gst = (VexGuestPPC64State*)gst_vanilla;
   gst->guest_GPR0 = canonical->sysno;
   gst->guest_GPR3 = canonical->arg1;
   gst->guest_GPR4 = canonical->arg2;
   gst->guest_GPR5 = canonical->arg3;
   gst->guest_GPR6 = canonical->arg4;
   gst->guest_GPR7 = canonical->arg5;
   gst->guest_GPR8 = canonical->arg6;

#elif defined(VGP_arm_linux)
   VexGuestARMState* gst = (VexGuestARMState*)gst_vanilla;
   gst->guest_R7 = canonical->sysno;
   gst->guest_R0 = canonical->arg1;
   gst->guest_R1 = canonical->arg2;
   gst->guest_R2 = canonical->arg3;
   gst->guest_R3 = canonical->arg4;
   gst->guest_R4 = canonical->arg5;
   gst->guest_R5 = canonical->arg6;

#elif defined(VGP_x86_darwin)
   /* All eight args go to the guest stack, which is written directly
      via guest_ESP; only the syscall number lives in a register. */
   VexGuestX86State* gst = (VexGuestX86State*)gst_vanilla;
   UWord *stack = (UWord *)gst->guest_ESP;

   gst->guest_EAX = VG_DARWIN_SYSNO_FOR_KERNEL(canonical->sysno);

   // GrP fixme? gst->guest_TEMP_EFLAG_C = 0;
   // stack[0] is return address
   stack[1] = canonical->arg1;
   stack[2] = canonical->arg2;
   stack[3] = canonical->arg3;
   stack[4] = canonical->arg4;
   stack[5] = canonical->arg5;
   stack[6] = canonical->arg6;
   stack[7] = canonical->arg7;
   stack[8] = canonical->arg8;

#elif defined(VGP_amd64_darwin)
   /* First six args go to registers; arg7/arg8 go to the guest stack,
      written directly via guest_RSP. */
   VexGuestAMD64State* gst = (VexGuestAMD64State*)gst_vanilla;
   UWord *stack = (UWord *)gst->guest_RSP;

   gst->guest_RAX = VG_DARWIN_SYSNO_FOR_KERNEL(canonical->sysno);
   // GrP fixme? gst->guest_TEMP_EFLAG_C = 0;

   // stack[0] is return address
   gst->guest_RDI = canonical->arg1;
   gst->guest_RSI = canonical->arg2;
   gst->guest_RDX = canonical->arg3;
   /* NOTE(review): arg4 is stored in RCX here, whereas
      getSyscallArgsFromGuestState reads arg4 from R10 on this
      platform.  Confirm that this asymmetry is intended. */
   gst->guest_RCX = canonical->arg4;
   gst->guest_R8  = canonical->arg5;
   gst->guest_R9  = canonical->arg6;
   stack[1]       = canonical->arg7;
   stack[2]       = canonical->arg8;

#elif defined(VGP_s390x_linux)
   VexGuestS390XState* gst = (VexGuestS390XState*)gst_vanilla;
   gst->guest_SYSNO  = canonical->sysno;
   gst->guest_r2     = canonical->arg1;
   gst->guest_r3     = canonical->arg2;
   gst->guest_r4     = canonical->arg3;
   gst->guest_r5     = canonical->arg4;
   gst->guest_r6     = canonical->arg5;
   gst->guest_r7     = canonical->arg6;

#else
#  error "putSyscallArgsIntoGuestState: unknown arch"
#endif
}
702
/* Read the syscall result out of the guest state and convert it to
   canonical SyscallStatus form.

   canonical    (out) 'sres' receives the SysRes built by the
                platform's VG_(mk_SysRes_*) constructor; 'what' is
                always set to SsComplete.
   gst_vanilla  (in)  the guest state, reinterpreted here as the
                platform-specific guest state type.

   Which guest registers and flags feed the constructor differs per
   platform; see the individual cases below. */
static
void getSyscallStatusFromGuestState ( /*OUT*/SyscallStatus*     canonical,
                                      /*IN*/ VexGuestArchState* gst_vanilla )
{
#  if defined(VGP_x86_linux)
   /* Result is taken from EAX alone. */
   VexGuestX86State* gst = (VexGuestX86State*)gst_vanilla;
   canonical->sres = VG_(mk_SysRes_x86_linux)( gst->guest_EAX );
   canonical->what = SsComplete;

#  elif defined(VGP_amd64_linux)
   /* Result is taken from RAX alone. */
   VexGuestAMD64State* gst = (VexGuestAMD64State*)gst_vanilla;
   canonical->sres = VG_(mk_SysRes_amd64_linux)( gst->guest_RAX );
   canonical->what = SsComplete;

#  elif defined(VGP_ppc32_linux)
   /* Result is GPR3 plus bit 28 of the condition register (named
      cr0so here). */
   VexGuestPPC32State* gst   = (VexGuestPPC32State*)gst_vanilla;
   UInt                cr    = LibVEX_GuestPPC32_get_CR( gst );
   UInt                cr0so = (cr >> 28) & 1;
   canonical->sres = VG_(mk_SysRes_ppc32_linux)( gst->guest_GPR3, cr0so );
   canonical->what = SsComplete;

#  elif defined(VGP_ppc64_linux)
   /* Same scheme as ppc32: GPR3 plus bit 28 of the condition
      register. */
   VexGuestPPC64State* gst   = (VexGuestPPC64State*)gst_vanilla;
   UInt                cr    = LibVEX_GuestPPC64_get_CR( gst );
   UInt                cr0so = (cr >> 28) & 1;
   canonical->sres = VG_(mk_SysRes_ppc64_linux)( gst->guest_GPR3, cr0so );
   canonical->what = SsComplete;

#  elif defined(VGP_arm_linux)
   /* Result is taken from R0 alone. */
   VexGuestARMState* gst = (VexGuestARMState*)gst_vanilla;
   canonical->sres = VG_(mk_SysRes_arm_linux)( gst->guest_R0 );
   canonical->what = SsComplete;

#  elif defined(VGP_x86_darwin)
   /* duplicates logic in m_signals.VG_UCONTEXT_SYSCALL_SYSRES */
   /* The syscall class recorded in guest_SC_CLASS decides how much of
      the guest state makes up the result.  Only the Unix class uses
      the low bit of eflags as an error indicator and EDX as a high
      word; Mach and mdep results are EAX only, with no error flag. */
   VexGuestX86State* gst = (VexGuestX86State*)gst_vanilla;
   UInt carry = 1 & LibVEX_GuestX86_get_eflags(gst);
   UInt err = 0;
   UInt wLO = 0;
   UInt wHI = 0;
   switch (gst->guest_SC_CLASS) {
      case VG_DARWIN_SYSCALL_CLASS_UNIX:
         // int $0x80 = Unix, 64-bit result
         err = carry;
         wLO = gst->guest_EAX;
         wHI = gst->guest_EDX;
         break;
      case VG_DARWIN_SYSCALL_CLASS_MACH:
         // int $0x81 = Mach, 32-bit result
         wLO = gst->guest_EAX;
         break;
      case VG_DARWIN_SYSCALL_CLASS_MDEP:
         // int $0x82 = mdep, 32-bit result
         wLO = gst->guest_EAX;
         break;
      default:
         /* Any other class value is not expected here. */
         vg_assert(0);
         break;
   }
   canonical->sres = VG_(mk_SysRes_x86_darwin)(
                        gst->guest_SC_CLASS, err ? True : False,
                        wHI, wLO
                     );
   canonical->what = SsComplete;

#  elif defined(VGP_amd64_darwin)
   /* duplicates logic in m_signals.VG_UCONTEXT_SYSCALL_SYSRES */
   /* Same structure as the x86-darwin case, using the 64-bit
      registers: Unix results are RDX:RAX plus the low bit of rflags
      as error indicator; Mach and mdep results are RAX only. */
   VexGuestAMD64State* gst = (VexGuestAMD64State*)gst_vanilla;
   ULong carry = 1 & LibVEX_GuestAMD64_get_rflags(gst);
   ULong err = 0;
   ULong wLO = 0;
   ULong wHI = 0;
   switch (gst->guest_SC_CLASS) {
      case VG_DARWIN_SYSCALL_CLASS_UNIX:
         // syscall = Unix, 128-bit result
         err = carry;
         wLO = gst->guest_RAX;
         wHI = gst->guest_RDX;
         break;
      case VG_DARWIN_SYSCALL_CLASS_MACH:
         // syscall = Mach, 64-bit result
         wLO = gst->guest_RAX;
         break;
      case VG_DARWIN_SYSCALL_CLASS_MDEP:
         // syscall = mdep, 64-bit result
         wLO = gst->guest_RAX;
         break;
      default:
         /* Any other class value is not expected here. */
         vg_assert(0);
         break;
   }
   canonical->sres = VG_(mk_SysRes_amd64_darwin)(
                        gst->guest_SC_CLASS, err ? True : False,
                        wHI, wLO
                     );
   canonical->what = SsComplete;

#  elif defined(VGP_s390x_linux)
   /* Result is taken from r2 alone. */
   VexGuestS390XState* gst   = (VexGuestS390XState*)gst_vanilla;
   canonical->sres = VG_(mk_SysRes_s390x_linux)( gst->guest_r2 );
   canonical->what = SsComplete;

#  else
#    error "getSyscallStatusFromGuestState: unknown arch"
#  endif
}
809
810static
811void putSyscallStatusIntoGuestState ( /*IN*/ ThreadId tid,
812                                      /*IN*/ SyscallStatus*     canonical,
813                                      /*OUT*/VexGuestArchState* gst_vanilla )
814{
815#  if defined(VGP_x86_linux)
816   VexGuestX86State* gst = (VexGuestX86State*)gst_vanilla;
817   vg_assert(canonical->what == SsComplete);
818   if (sr_isError(canonical->sres)) {
819      /* This isn't exactly right, in that really a Failure with res
820         not in the range 1 .. 4095 is unrepresentable in the
821         Linux-x86 scheme.  Oh well. */
822      gst->guest_EAX = - (Int)sr_Err(canonical->sres);
823   } else {
824      gst->guest_EAX = sr_Res(canonical->sres);
825   }
826   VG_TRACK( post_reg_write, Vg_CoreSysCall, tid,
827             OFFSET_x86_EAX, sizeof(UWord) );
828
829#  elif defined(VGP_amd64_linux)
830   VexGuestAMD64State* gst = (VexGuestAMD64State*)gst_vanilla;
831   vg_assert(canonical->what == SsComplete);
832   if (sr_isError(canonical->sres)) {
833      /* This isn't exactly right, in that really a Failure with res
834         not in the range 1 .. 4095 is unrepresentable in the
835         Linux-amd64 scheme.  Oh well. */
836      gst->guest_RAX = - (Long)sr_Err(canonical->sres);
837   } else {
838      gst->guest_RAX = sr_Res(canonical->sres);
839   }
840   VG_TRACK( post_reg_write, Vg_CoreSysCall, tid,
841             OFFSET_amd64_RAX, sizeof(UWord) );
842
843#  elif defined(VGP_ppc32_linux)
844   VexGuestPPC32State* gst = (VexGuestPPC32State*)gst_vanilla;
845   UInt old_cr = LibVEX_GuestPPC32_get_CR(gst);
846   vg_assert(canonical->what == SsComplete);
847   if (sr_isError(canonical->sres)) {
848      /* set CR0.SO */
849      LibVEX_GuestPPC32_put_CR( old_cr | (1<<28), gst );
850      gst->guest_GPR3 = sr_Err(canonical->sres);
851   } else {
852      /* clear CR0.SO */
853      LibVEX_GuestPPC32_put_CR( old_cr & ~(1<<28), gst );
854      gst->guest_GPR3 = sr_Res(canonical->sres);
855   }
856   VG_TRACK( post_reg_write, Vg_CoreSysCall, tid,
857             OFFSET_ppc32_GPR3, sizeof(UWord) );
858   VG_TRACK( post_reg_write, Vg_CoreSysCall, tid,
859             OFFSET_ppc32_CR0_0, sizeof(UChar) );
860
861#  elif defined(VGP_ppc64_linux)
862   VexGuestPPC64State* gst = (VexGuestPPC64State*)gst_vanilla;
863   UInt old_cr = LibVEX_GuestPPC64_get_CR(gst);
864   vg_assert(canonical->what == SsComplete);
865   if (sr_isError(canonical->sres)) {
866      /* set CR0.SO */
867      LibVEX_GuestPPC64_put_CR( old_cr | (1<<28), gst );
868      gst->guest_GPR3 = sr_Err(canonical->sres);
869   } else {
870      /* clear CR0.SO */
871      LibVEX_GuestPPC64_put_CR( old_cr & ~(1<<28), gst );
872      gst->guest_GPR3 = sr_Res(canonical->sres);
873   }
874   VG_TRACK( post_reg_write, Vg_CoreSysCall, tid,
875             OFFSET_ppc64_GPR3, sizeof(UWord) );
876   VG_TRACK( post_reg_write, Vg_CoreSysCall, tid,
877             OFFSET_ppc64_CR0_0, sizeof(UChar) );
878
879#  elif defined(VGP_arm_linux)
880   VexGuestARMState* gst = (VexGuestARMState*)gst_vanilla;
881   vg_assert(canonical->what == SsComplete);
882   if (sr_isError(canonical->sres)) {
883      /* This isn't exactly right, in that really a Failure with res
884         not in the range 1 .. 4095 is unrepresentable in the
885         Linux-arm scheme.  Oh well. */
886      gst->guest_R0 = - (Int)sr_Err(canonical->sres);
887   } else {
888      gst->guest_R0 = sr_Res(canonical->sres);
889   }
890   VG_TRACK( post_reg_write, Vg_CoreSysCall, tid,
891             OFFSET_arm_R0, sizeof(UWord) );
892
893#elif defined(VGP_x86_darwin)
894   VexGuestX86State* gst = (VexGuestX86State*)gst_vanilla;
895   SysRes sres = canonical->sres;
896   vg_assert(canonical->what == SsComplete);
897   /* Unfortunately here we have to break abstraction and look
898      directly inside 'res', in order to decide what to do. */
899   switch (sres._mode) {
900      case SysRes_MACH: // int $0x81 = Mach, 32-bit result
901      case SysRes_MDEP: // int $0x82 = mdep, 32-bit result
902         gst->guest_EAX = sres._wLO;
903         VG_TRACK( post_reg_write, Vg_CoreSysCall, tid,
904                   OFFSET_x86_EAX, sizeof(UInt) );
905         break;
906      case SysRes_UNIX_OK:  // int $0x80 = Unix, 64-bit result
907      case SysRes_UNIX_ERR: // int $0x80 = Unix, 64-bit error
908         gst->guest_EAX = sres._wLO;
909         VG_TRACK( post_reg_write, Vg_CoreSysCall, tid,
910                   OFFSET_x86_EAX, sizeof(UInt) );
911         gst->guest_EDX = sres._wHI;
912         VG_TRACK( post_reg_write, Vg_CoreSysCall, tid,
913                   OFFSET_x86_EDX, sizeof(UInt) );
914         LibVEX_GuestX86_put_eflag_c( sres._mode==SysRes_UNIX_ERR ? 1 : 0,
915                                      gst );
916         // GrP fixme sets defined for entire eflags, not just bit c
917         // DDD: this breaks exp-ptrcheck.
918         VG_TRACK( post_reg_write, Vg_CoreSysCall, tid,
919                   offsetof(VexGuestX86State, guest_CC_DEP1), sizeof(UInt) );
920         break;
921      default:
922         vg_assert(0);
923         break;
924   }
925
926#elif defined(VGP_amd64_darwin)
927   VexGuestAMD64State* gst = (VexGuestAMD64State*)gst_vanilla;
928   SysRes sres = canonical->sres;
929   vg_assert(canonical->what == SsComplete);
930   /* Unfortunately here we have to break abstraction and look
931      directly inside 'res', in order to decide what to do. */
932   switch (sres._mode) {
933      case SysRes_MACH: // syscall = Mach, 64-bit result
934      case SysRes_MDEP: // syscall = mdep, 64-bit result
935         gst->guest_RAX = sres._wLO;
936         VG_TRACK( post_reg_write, Vg_CoreSysCall, tid,
937                   OFFSET_amd64_RAX, sizeof(ULong) );
938         break;
939      case SysRes_UNIX_OK:  // syscall = Unix, 128-bit result
940      case SysRes_UNIX_ERR: // syscall = Unix, 128-bit error
941         gst->guest_RAX = sres._wLO;
942         VG_TRACK( post_reg_write, Vg_CoreSysCall, tid,
943                   OFFSET_amd64_RAX, sizeof(ULong) );
944         gst->guest_RDX = sres._wHI;
945         VG_TRACK( post_reg_write, Vg_CoreSysCall, tid,
946                   OFFSET_amd64_RDX, sizeof(ULong) );
947         LibVEX_GuestAMD64_put_rflag_c( sres._mode==SysRes_UNIX_ERR ? 1 : 0,
948                                        gst );
949         // GrP fixme sets defined for entire rflags, not just bit c
950         // DDD: this breaks exp-ptrcheck.
951         VG_TRACK( post_reg_write, Vg_CoreSysCall, tid,
952                   offsetof(VexGuestAMD64State, guest_CC_DEP1), sizeof(ULong) );
953         break;
954      default:
955         vg_assert(0);
956         break;
957   }
958
959#  elif defined(VGP_s390x_linux)
960   VexGuestS390XState* gst = (VexGuestS390XState*)gst_vanilla;
961   vg_assert(canonical->what == SsComplete);
962   if (sr_isError(canonical->sres)) {
963      gst->guest_r2 = - (Long)sr_Err(canonical->sres);
964   } else {
965      gst->guest_r2 = sr_Res(canonical->sres);
966   }
967
968#  else
969#    error "putSyscallStatusIntoGuestState: unknown arch"
970#  endif
971}
972
973
974/* Tell me the offsets in the guest state of the syscall params, so
975   that the scalar argument checkers don't have to have this info
976   hardwired. */
977
978static
979void getSyscallArgLayout ( /*OUT*/SyscallArgLayout* layout )
980{
981#if defined(VGP_x86_linux)
982   layout->o_sysno  = OFFSET_x86_EAX;
983   layout->o_arg1   = OFFSET_x86_EBX;
984   layout->o_arg2   = OFFSET_x86_ECX;
985   layout->o_arg3   = OFFSET_x86_EDX;
986   layout->o_arg4   = OFFSET_x86_ESI;
987   layout->o_arg5   = OFFSET_x86_EDI;
988   layout->o_arg6   = OFFSET_x86_EBP;
989   layout->uu_arg7  = -1; /* impossible value */
990   layout->uu_arg8  = -1; /* impossible value */
991
992#elif defined(VGP_amd64_linux)
993   layout->o_sysno  = OFFSET_amd64_RAX;
994   layout->o_arg1   = OFFSET_amd64_RDI;
995   layout->o_arg2   = OFFSET_amd64_RSI;
996   layout->o_arg3   = OFFSET_amd64_RDX;
997   layout->o_arg4   = OFFSET_amd64_R10;
998   layout->o_arg5   = OFFSET_amd64_R8;
999   layout->o_arg6   = OFFSET_amd64_R9;
1000   layout->uu_arg7  = -1; /* impossible value */
1001   layout->uu_arg8  = -1; /* impossible value */
1002
1003#elif defined(VGP_ppc32_linux)
1004   layout->o_sysno  = OFFSET_ppc32_GPR0;
1005   layout->o_arg1   = OFFSET_ppc32_GPR3;
1006   layout->o_arg2   = OFFSET_ppc32_GPR4;
1007   layout->o_arg3   = OFFSET_ppc32_GPR5;
1008   layout->o_arg4   = OFFSET_ppc32_GPR6;
1009   layout->o_arg5   = OFFSET_ppc32_GPR7;
1010   layout->o_arg6   = OFFSET_ppc32_GPR8;
1011   layout->uu_arg7  = -1; /* impossible value */
1012   layout->uu_arg8  = -1; /* impossible value */
1013
1014#elif defined(VGP_ppc64_linux)
1015   layout->o_sysno  = OFFSET_ppc64_GPR0;
1016   layout->o_arg1   = OFFSET_ppc64_GPR3;
1017   layout->o_arg2   = OFFSET_ppc64_GPR4;
1018   layout->o_arg3   = OFFSET_ppc64_GPR5;
1019   layout->o_arg4   = OFFSET_ppc64_GPR6;
1020   layout->o_arg5   = OFFSET_ppc64_GPR7;
1021   layout->o_arg6   = OFFSET_ppc64_GPR8;
1022   layout->uu_arg7  = -1; /* impossible value */
1023   layout->uu_arg8  = -1; /* impossible value */
1024
1025#elif defined(VGP_arm_linux)
1026   layout->o_sysno  = OFFSET_arm_R7;
1027   layout->o_arg1   = OFFSET_arm_R0;
1028   layout->o_arg2   = OFFSET_arm_R1;
1029   layout->o_arg3   = OFFSET_arm_R2;
1030   layout->o_arg4   = OFFSET_arm_R3;
1031   layout->o_arg5   = OFFSET_arm_R4;
1032   layout->o_arg6   = OFFSET_arm_R5;
1033   layout->uu_arg7  = -1; /* impossible value */
1034   layout->uu_arg8  = -1; /* impossible value */
1035
1036#elif defined(VGP_x86_darwin)
1037   layout->o_sysno  = OFFSET_x86_EAX;
1038   // syscall parameters are on stack in C convention
1039   layout->s_arg1   = sizeof(UWord) * 1;
1040   layout->s_arg2   = sizeof(UWord) * 2;
1041   layout->s_arg3   = sizeof(UWord) * 3;
1042   layout->s_arg4   = sizeof(UWord) * 4;
1043   layout->s_arg5   = sizeof(UWord) * 5;
1044   layout->s_arg6   = sizeof(UWord) * 6;
1045   layout->s_arg7   = sizeof(UWord) * 7;
1046   layout->s_arg8   = sizeof(UWord) * 8;
1047
1048#elif defined(VGP_amd64_darwin)
1049   layout->o_sysno  = OFFSET_amd64_RAX;
1050   layout->o_arg1   = OFFSET_amd64_RDI;
1051   layout->o_arg2   = OFFSET_amd64_RSI;
1052   layout->o_arg3   = OFFSET_amd64_RDX;
1053   layout->o_arg4   = OFFSET_amd64_RCX;
1054   layout->o_arg5   = OFFSET_amd64_R8;
1055   layout->o_arg6   = OFFSET_amd64_R9;
1056   layout->s_arg7   = sizeof(UWord) * 1;
1057   layout->s_arg8   = sizeof(UWord) * 2;
1058
1059#elif defined(VGP_s390x_linux)
1060   layout->o_sysno  = OFFSET_s390x_SYSNO;
1061   layout->o_arg1   = OFFSET_s390x_r2;
1062   layout->o_arg2   = OFFSET_s390x_r3;
1063   layout->o_arg3   = OFFSET_s390x_r4;
1064   layout->o_arg4   = OFFSET_s390x_r5;
1065   layout->o_arg5   = OFFSET_s390x_r6;
1066   layout->o_arg6   = OFFSET_s390x_r7;
1067   layout->uu_arg7  = -1; /* impossible value */
1068   layout->uu_arg8  = -1; /* impossible value */
1069#else
1070#  error "getSyscallLayout: unknown arch"
1071#endif
1072}
1073
1074
1075/* ---------------------------------------------------------------------
1076   The main driver logic
1077   ------------------------------------------------------------------ */
1078
1079/* Finding the handlers for a given syscall, or faking up one
1080   when no handler is found. */
1081
1082static
1083void bad_before ( ThreadId              tid,
1084                  SyscallArgLayout*     layout,
1085                  /*MOD*/SyscallArgs*   args,
1086                  /*OUT*/SyscallStatus* status,
1087                  /*OUT*/UWord*         flags )
1088{
1089   VG_(dmsg)("WARNING: unhandled syscall: %s\n",
1090      VG_SYSNUM_STRING_EXTRA(args->sysno));
1091   if (VG_(clo_verbosity) > 1) {
1092      VG_(get_and_pp_StackTrace)(tid, VG_(clo_backtrace_size));
1093   }
1094   VG_(dmsg)("You may be able to write your own handler.\n");
1095   VG_(dmsg)("Read the file README_MISSING_SYSCALL_OR_IOCTL.\n");
1096   VG_(dmsg)("Nevertheless we consider this a bug.  Please report\n");
1097   VG_(dmsg)("it at http://valgrind.org/support/bug_reports.html.\n");
1098
1099   SET_STATUS_Failure(VKI_ENOSYS);
1100}
1101
1102static SyscallTableEntry bad_sys =
1103   { bad_before, NULL };
1104
1105static const SyscallTableEntry* get_syscall_entry ( Int syscallno )
1106{
1107   const SyscallTableEntry* sys = NULL;
1108
1109#  if defined(VGO_linux)
1110   sys = ML_(get_linux_syscall_entry)( syscallno );
1111
1112#  elif defined(VGO_darwin)
1113   Int idx = VG_DARWIN_SYSNO_INDEX(syscallno);
1114
1115   switch (VG_DARWIN_SYSNO_CLASS(syscallno)) {
1116   case VG_DARWIN_SYSCALL_CLASS_UNIX:
1117      if (idx >= 0 && idx < ML_(syscall_table_size) &&
1118          ML_(syscall_table)[idx].before != NULL)
1119         sys = &ML_(syscall_table)[idx];
1120         break;
1121   case VG_DARWIN_SYSCALL_CLASS_MACH:
1122      if (idx >= 0 && idx < ML_(mach_trap_table_size) &&
1123          ML_(mach_trap_table)[idx].before != NULL)
1124         sys = &ML_(mach_trap_table)[idx];
1125         break;
1126   case VG_DARWIN_SYSCALL_CLASS_MDEP:
1127      if (idx >= 0 && idx < ML_(mdep_trap_table_size) &&
1128          ML_(mdep_trap_table)[idx].before != NULL)
1129         sys = &ML_(mdep_trap_table)[idx];
1130         break;
1131   default:
1132      vg_assert(0);
1133      break;
1134   }
1135
1136#  else
1137#    error Unknown OS
1138#  endif
1139
1140   return sys == NULL  ? &bad_sys  : sys;
1141}
1142
1143
1144/* Add and remove signals from mask so that we end up telling the
1145   kernel the state we actually want rather than what the client
1146   wants. */
1147static void sanitize_client_sigmask(vki_sigset_t *mask)
1148{
1149   VG_(sigdelset)(mask, VKI_SIGKILL);
1150   VG_(sigdelset)(mask, VKI_SIGSTOP);
1151   VG_(sigdelset)(mask, VG_SIGVGKILL); /* never block */
1152}
1153
1154typedef
1155   struct {
1156      SyscallArgs   orig_args;
1157      SyscallArgs   args;
1158      SyscallStatus status;
1159      UWord         flags;
1160   }
1161   SyscallInfo;
1162
/* One syscall record per thread slot, indexed by thread id.  Entries
   are reset by VG_(clear_syscallInfo). */
SyscallInfo syscallInfo[VG_N_THREADS];
1164
1165
1166/* The scheduler needs to be able to zero out these records after a
1167   fork, hence this is exported from m_syswrap. */
1168void VG_(clear_syscallInfo) ( Int tid )
1169{
1170   vg_assert(tid >= 0 && tid < VG_N_THREADS);
1171   VG_(memset)( & syscallInfo[tid], 0, sizeof( syscallInfo[tid] ));
1172   syscallInfo[tid].status.what = SsIdle;
1173}
1174
1175static void ensure_initialised ( void )
1176{
1177   Int i;
1178   static Bool init_done = False;
1179   if (init_done)
1180      return;
1181   init_done = True;
1182   for (i = 0; i < VG_N_THREADS; i++) {
1183      VG_(clear_syscallInfo)( i );
1184   }
1185}
1186
/* --- This is the main function of this file. --- */

/* Run one client syscall for thread 'tid', from start to finish.

   tid - the thread making the syscall; must be a valid, currently
         running thread with 1 <= tid < VG_N_THREADS (asserted).
   trc - handed unchanged to getSyscallArgsFromGuestState (presumably
         the scheduler's return code describing how the syscall was
         entered -- confirm against that function).

   Outline:
   1. Linux, tid 1 only: make sure the stack is mapped down to
      SP - redzone.
   2. Fetch the syscall args from the guest state into
      syscallInfo[tid] and keep an unmodified copy.
   3. Look up the handlers, fetch the arg layout, notify the tool
      (if it wants syscall wrapping) and run the pre-handler.
   4. If the pre-handler completed the syscall, just sanity check.
      Otherwise hand it to the kernel: asynchronously (dropping the
      BigLock around the call) when SfMayBlock is set, synchronously
      otherwise.
   5. Unless SfNoWriteResult is set, write the result into the guest
      state; then run VG_(post_syscall). */
void VG_(client_syscall) ( ThreadId tid, UInt trc )
{
   Word                     sysno;
   ThreadState*             tst;
   const SyscallTableEntry* ent;
   SyscallArgLayout         layout;
   SyscallInfo*             sci;

   ensure_initialised();

   vg_assert(VG_(is_valid_tid)(tid));
   vg_assert(tid >= 1 && tid < VG_N_THREADS);
   vg_assert(VG_(is_running_thread)(tid));

   tst = VG_(get_ThreadState)(tid);

   /* BEGIN ensure root thread's stack is suitably mapped */
   /* In some rare circumstances, we may do the syscall without the
      bottom page of the stack being mapped, because the stack pointer
      was moved down just a few instructions before the syscall
      instruction, and there have been no memory references since
      then, that would cause a call to VG_(extend_stack) to have
      happened.

      In native execution that's OK: the kernel automagically extends
      the stack's mapped area down to cover the stack pointer (or sp -
      redzone, really).  In simulated normal execution that's OK too,
      since any signals we get from accessing below the mapped area of
      the (guest's) stack lead us to VG_(extend_stack), where we
      simulate the kernel's stack extension logic.  But that leaves
      the problem of entering a syscall with the SP unmapped.  Because
      the kernel doesn't know that the segment immediately above SP is
      supposed to be a grow-down segment, it causes the syscall to
      fail, and thereby causes a divergence between native behaviour
      (syscall succeeds) and simulated behaviour (syscall fails).

      This is quite a rare failure mode.  It has only been seen
      affecting calls to sys_readlink on amd64-linux, and even then it
      requires a certain code sequence around the syscall to trigger
      it.  Here is one:

      extern int my_readlink ( const char* path );
      asm(
      ".text\n"
      ".globl my_readlink\n"
      "my_readlink:\n"
      "\tsubq    $0x1008,%rsp\n"
      "\tmovq    %rdi,%rdi\n"              // path is in rdi
      "\tmovq    %rsp,%rsi\n"              // &buf[0] -> rsi
      "\tmovl    $0x1000,%edx\n"           // sizeof(buf) in rdx
      "\tmovl    $"__NR_READLINK",%eax\n"  // syscall number
      "\tsyscall\n"
      "\taddq    $0x1008,%rsp\n"
      "\tret\n"
      ".previous\n"
      );

      For more details, see bug #156404
      (https://bugs.kde.org/show_bug.cgi?id=156404).

      The fix is actually very simple.  We simply need to call
      VG_(extend_stack) for this thread, handing it the lowest
      possible valid address for stack (sp - redzone), to ensure the
      pages all the way down to that address, are mapped.  Because
      this is a potentially expensive and frequent operation, we
      filter in two ways:

      First, only the main thread (tid=1) has a growdown stack.  So
      ignore all others.  It is conceivable, although highly unlikely,
      that the main thread exits, and later another thread is
      allocated tid=1, but that's harmless, I believe;
      VG_(extend_stack) will do nothing when applied to a non-root
      thread.

      Secondly, first call VG_(am_find_nsegment) directly, to see if
      the page holding (sp - redzone) is mapped correctly.  If so, do
      nothing.  This is almost always the case.  VG_(extend_stack)
      calls VG_(am_find_nsegment) twice, so this optimisation -- and
      that's all it is -- more or less halves the number of calls to
      VG_(am_find_nsegment) required.

      TODO: the test "seg->kind == SkAnonC" is really inadequate,
      because although it tests whether the segment is mapped
      _somehow_, it doesn't check that it has the right permissions
      (r,w, maybe x) ?  We could test that here, but it will also be
      necessary to fix the corresponding test in VG_(extend_stack).

      All this guff is of course Linux-specific.  Hence the ifdef.
   */
#  if defined(VGO_linux)
   if (tid == 1/*ROOT THREAD*/) {
      Addr     stackMin   = VG_(get_SP)(tid) - VG_STACK_REDZONE_SZB;
      NSegment const* seg = VG_(am_find_nsegment)(stackMin);
      if (seg && seg->kind == SkAnonC) {
         /* stackMin is already mapped.  Nothing to do. */
      } else {
         /* The result is deliberately ignored (note the void cast). */
         (void)VG_(extend_stack)( stackMin,
                                  tst->client_stack_szB );
      }
   }
#  endif
   /* END ensure root thread's stack is suitably mapped */

   /* First off, get the syscall args and number.  This is a
      platform-dependent action. */

   sci = & syscallInfo[tid];
   /* No syscall may already be in progress for this thread. */
   vg_assert(sci->status.what == SsIdle);

   getSyscallArgsFromGuestState( &sci->orig_args, &tst->arch.vex, trc );

   /* Copy .orig_args to .args.  The pre-handler may modify .args, but
      we want to keep the originals too, just in case. */
   sci->args = sci->orig_args;

   /* Keep the syscall number in a local.  It is used below for the
      handler lookup, for tracing, and when the call is handed to the
      kernel. */
   sysno = sci->orig_args.sysno;

   /* It's sometimes useful, as a crude debugging hack, to get a
      stack trace at each (or selected) syscalls.  Disabled by the
      constant 0 in the condition. */
   if (0 && sysno == __NR_ioctl) {
      VG_(umsg)("\nioctl:\n");
      VG_(get_and_pp_StackTrace)(tid, 10);
      VG_(umsg)("\n");
   }

#  if defined(VGO_darwin)
   /* Record syscall class.  But why?  Because the syscall might be
      interrupted by a signal, and in the signal handler (which will
      be m_signals.async_signalhandler) we will need to build a SysRes
      reflecting the syscall return result.  In order to do that we
      need to know the syscall class.  Hence stash it in the guest
      state of this thread.  This madness is not needed on Linux
      because it only has a single syscall return convention and so
      there is no ambiguity involved in converting the post-signal
      machine state into a SysRes. */
   tst->arch.vex.guest_SC_CLASS = VG_DARWIN_SYSNO_CLASS(sysno);
#  endif

   /* The default what-to-do-next thing is hand the syscall to the
      kernel, so we pre-set that here.  Set .sres to something
      harmless looking (is irrelevant because .what is not
      SsComplete.) */
   sci->status.what = SsHandToKernel;
   sci->status.sres = VG_(mk_SysRes_Error)(0);
   sci->flags       = 0;

   /* Fetch the syscall's handlers.  If no handlers exist for this
      syscall, we are given dummy handlers which force an immediate
      return with ENOSYS. */
   ent = get_syscall_entry(sysno);

   /* Fetch the layout information, which tells us where in the guest
      state the syscall args reside.  This is a platform-dependent
      action.  This info is needed so that the scalar syscall argument
      checks (PRE_REG_READ calls) know which bits of the guest state
      they need to inspect. */
   getSyscallArgLayout( &layout );

   /* Make sure the tmp signal mask matches the real signal mask;
      sigsuspend may change this. */
   vg_assert(VG_(iseqsigset)(&tst->sig_mask, &tst->tmp_sig_mask));

   /* Right, we're finally ready to Party.  Call the pre-handler and
      see what we get back.  At this point:

        sci->status.what  is SsHandToKernel (the default pre-set
                          above; the pre-handler may change it to
                          SsComplete).
        sci->orig_args    contains the original args.
        sci->args         is the same as sci->orig_args.
        sci->flags        is zero.
   */

   PRINT("SYSCALL[%d,%d](%s) ",
      VG_(getpid)(), tid, VG_SYSNUM_STRING(sysno));

   /* Do any pre-syscall actions */
   if (VG_(needs).syscall_wrapper) {
      /* Hand the tool the original (unmodified) args as a flat
         array of 8 words. */
      UWord tmpv[8];
      tmpv[0] = sci->orig_args.arg1;
      tmpv[1] = sci->orig_args.arg2;
      tmpv[2] = sci->orig_args.arg3;
      tmpv[3] = sci->orig_args.arg4;
      tmpv[4] = sci->orig_args.arg5;
      tmpv[5] = sci->orig_args.arg6;
      tmpv[6] = sci->orig_args.arg7;
      tmpv[7] = sci->orig_args.arg8;
      VG_TDICT_CALL(tool_pre_syscall, tid, sysno,
                    &tmpv[0], sizeof(tmpv)/sizeof(tmpv[0]));
   }

   vg_assert(ent);
   vg_assert(ent->before);
   (ent->before)( tid,
                  &layout,
                  &sci->args, &sci->status, &sci->flags );

   /* The pre-handler may have modified:
         sci->args
         sci->status
         sci->flags
      All else remains unchanged.
      Although the args may be modified, pre handlers are not allowed
      to change the syscall number.
   */
   /* Now we proceed according to what the pre-handler decided. */
   vg_assert(sci->status.what == SsHandToKernel
             || sci->status.what == SsComplete);
   vg_assert(sci->args.sysno == sci->orig_args.sysno);

   if (sci->status.what == SsComplete && !sr_isError(sci->status.sres)) {
      /* The pre-handler completed the syscall itself, declaring
         success. */
      if (sci->flags & SfNoWriteResult) {
         PRINT(" --> [pre-success] NoWriteResult");
      } else {
         PRINT(" --> [pre-success] Success(0x%llx:0x%llx)",
               (ULong)sr_ResHI(sci->status.sres),
               (ULong)sr_Res(sci->status.sres));
      }
      /* In this case the allowable flags are to ask for a signal-poll
         and/or a yield after the call, and/or to suppress writing the
         result into the guest state (SfNoWriteResult).  Changing the
         args isn't allowed. */
      vg_assert(0 == (sci->flags
                      & ~(SfPollAfter | SfYieldAfter | SfNoWriteResult)));
      vg_assert(eq_SyscallArgs(&sci->args, &sci->orig_args));
   }

   else
   if (sci->status.what == SsComplete && sr_isError(sci->status.sres)) {
      /* The pre-handler decided to fail syscall itself. */
      PRINT(" --> [pre-fail] Failure(0x%llx)", (ULong)sr_Err(sci->status.sres));
      /* In this case, the pre-handler is also allowed to ask for the
         post-handler to be run anyway.  The flags accepted here are
         SfMayBlock, SfPostOnFail and SfPollAfter.  Changing the args
         is not allowed. */
      vg_assert(0 == (sci->flags & ~(SfMayBlock | SfPostOnFail | SfPollAfter)));
      vg_assert(eq_SyscallArgs(&sci->args, &sci->orig_args));
   }

   else
   if (sci->status.what != SsHandToKernel) {
      /* huh?! */
      vg_assert(0);
   }

   else /* (sci->status.what == SsHandToKernel) */ {
      /* Ok, this is the usual case -- and the complicated one.  There
         are two subcases: sync and async.  async is the general case
         and is to be used when there is any possibility that the
         syscall might block [a fact that the pre-handler must tell us
         via the sci->flags field.]  Because the tidying-away /
         context-switch overhead of the async case could be large, if
         we are sure that the syscall will not block, we fast-track it
         by doing it directly in this thread, which is a lot
         simpler. */

      /* Check that the given flags are allowable: MayBlock, PollAfter
         and PostOnFail are ok. */
      vg_assert(0 == (sci->flags & ~(SfMayBlock | SfPostOnFail | SfPollAfter)));

      if (sci->flags & SfMayBlock) {

         /* Syscall may block, so run it asynchronously */
         vki_sigset_t mask;

         PRINT(" --> [async] ... \n");

         /* The kernel gets the thread's own signal mask, minus the
            signals that sanitize_client_sigmask removes. */
         mask = tst->sig_mask;
         sanitize_client_sigmask(&mask);

         /* Gack.  More impedance matching.  Copy the possibly
            modified syscall args back into the guest state. */
         /* JRS 2009-Mar-16: if the syscall args are possibly modified,
            then this assertion is senseless:
              vg_assert(eq_SyscallArgs(&sci->args, &sci->orig_args));
            The case that exposed it was sys_posix_spawn on Darwin,
            which heavily modifies its arguments but then lets the call
            go through anyway, with SfMayBlock set, hence we end up
            here. */
         putSyscallArgsIntoGuestState( &sci->args, &tst->arch.vex );

         /* Drop the bigLock */
         VG_(release_BigLock)(tid, VgTs_WaitSys, "VG_(client_syscall)[async]");
         /* Urr.  We're now in a race against other threads trying to
            acquire the bigLock.  I guess that doesn't matter provided
            that do_syscall_for_client only touches thread-local
            state. */

         /* Do the call, which operates directly on the guest state,
            not on our abstracted copies of the args/result. */
         do_syscall_for_client(sysno, tst, &mask);

         /* do_syscall_for_client may not return if the syscall was
            interrupted by a signal.  In that case, flow of control is
            first to m_signals.async_sighandler, which calls
            VG_(fixup_guest_state_after_syscall_interrupted), which
            fixes up the guest state, and possibly calls
            VG_(post_syscall).  Once that's done, control drops back
            to the scheduler.  */

         /* Darwin: do_syscall_for_client may not return if the
            syscall was workq_ops(WQOPS_THREAD_RETURN) and the kernel
            responded by starting the thread at wqthread_hijack(reuse=1)
            (to run another workqueue item). In that case, wqthread_hijack
            calls ML_(wqthread_continue), which is similar to
            VG_(fixup_guest_state_after_syscall_interrupted). */

         /* Reacquire the lock */
         VG_(acquire_BigLock)(tid, "VG_(client_syscall)[async]");

         /* Even more impedance matching.  Extract the syscall status
            from the guest state. */
         getSyscallStatusFromGuestState( &sci->status, &tst->arch.vex );
         vg_assert(sci->status.what == SsComplete);

         /* Be decorative, if required. */
         if (VG_(clo_trace_syscalls)) {
            Bool failed = sr_isError(sci->status.sres);
            if (failed) {
               PRINT("SYSCALL[%d,%d](%s) ... [async] --> Failure(0x%llx)",
                     VG_(getpid)(), tid, VG_SYSNUM_STRING(sysno),
                     (ULong)sr_Err(sci->status.sres));
            } else {
               PRINT("SYSCALL[%d,%d](%s) ... [async] --> "
                     "Success(0x%llx:0x%llx)",
                     VG_(getpid)(), tid, VG_SYSNUM_STRING(sysno),
                     (ULong)sr_ResHI(sci->status.sres),
                     (ULong)sr_Res(sci->status.sres) );
            }
         }

      } else {

         /* run the syscall directly */
         /* The pre-handler may have modified the syscall args, but
            since we're passing values in ->args directly to the
            kernel, there's no point in flushing them back to the
            guest state.  Indeed doing so could be construed as
            incorrect. */
         SysRes sres
            = VG_(do_syscall)(sysno, sci->args.arg1, sci->args.arg2,
                                     sci->args.arg3, sci->args.arg4,
                                     sci->args.arg5, sci->args.arg6,
                                     sci->args.arg7, sci->args.arg8 );
         sci->status = convert_SysRes_to_SyscallStatus(sres);

         /* Be decorative, if required. */
         if (VG_(clo_trace_syscalls)) {
            Bool failed = sr_isError(sci->status.sres);
            if (failed) {
               PRINT("[sync] --> Failure(0x%llx)",
                     (ULong)sr_Err(sci->status.sres) );
            } else {
               PRINT("[sync] --> Success(0x%llx:0x%llx)",
                     (ULong)sr_ResHI(sci->status.sres),
                     (ULong)sr_Res(sci->status.sres) );
            }
         }
      }
   }

   /* Whichever path was taken, the syscall must now be complete. */
   vg_assert(sci->status.what == SsComplete);

   vg_assert(VG_(is_running_thread)(tid));

   /* Dump the syscall result back in the guest state.  This is
      a platform-specific action.  Skipped when the pre-handler set
      SfNoWriteResult. */
   if (!(sci->flags & SfNoWriteResult))
      putSyscallStatusIntoGuestState( tid, &sci->status, &tst->arch.vex );

   /* Situation now:
      - the guest state is now correctly modified following the syscall
      - modified args, original args and syscall status are still
        available in the syscallInfo[] entry for this syscall.

      Now go on to do the post-syscall actions (read on down ..)
   */
   PRINT(" ");
   VG_(post_syscall)(tid);
   PRINT("\n");
}
1569
1570
1571/* Perform post syscall actions.  The expected state on entry is
1572   precisely as at the end of VG_(client_syscall), that is:
1573
1574   - guest state up to date following the syscall
1575   - modified args, original args and syscall status are still
1576     available in the syscallInfo[] entry for this syscall.
1577   - syscall status matches what's in the guest state.
1578
1579   There are two ways to get here: the normal way -- being called by
1580   VG_(client_syscall), and the unusual way, from
1581   VG_(fixup_guest_state_after_syscall_interrupted).
1582   Darwin: there's a third way, ML_(wqthread_continue).
1583*/
1584void VG_(post_syscall) (ThreadId tid)
1585{
1586   SyscallInfo*             sci;
1587   const SyscallTableEntry* ent;
1588   SyscallStatus            test_status;
1589   ThreadState*             tst;
1590   Word sysno;
1591
1592   /* Preliminaries */
1593   vg_assert(VG_(is_valid_tid)(tid));
1594   vg_assert(tid >= 1 && tid < VG_N_THREADS);
1595   vg_assert(VG_(is_running_thread)(tid));
1596
1597   tst = VG_(get_ThreadState)(tid);
1598   sci = & syscallInfo[tid];
1599
1600   /* m_signals.sigvgkill_handler might call here even when not in
1601      a syscall. */
1602   if (sci->status.what == SsIdle || sci->status.what == SsHandToKernel) {
1603      sci->status.what = SsIdle;
1604      return;
1605   }
1606
1607   /* Validate current syscallInfo entry.  In particular we require
1608      that the current .status matches what's actually in the guest
1609      state.  At least in the normal case where we have actually
1610      previously written the result into the guest state. */
1611   vg_assert(sci->status.what == SsComplete);
1612
1613   getSyscallStatusFromGuestState( &test_status, &tst->arch.vex );
1614   if (!(sci->flags & SfNoWriteResult))
1615      vg_assert(eq_SyscallStatus( &sci->status, &test_status ));
1616   /* Failure of the above assertion on Darwin can indicate a problem
1617      in the syscall wrappers that pre-fail or pre-succeed the
1618      syscall, by calling SET_STATUS_Success or SET_STATUS_Failure,
1619      when they really should call SET_STATUS_from_SysRes.  The former
1620      create a UNIX-class syscall result on Darwin, which may not be
1621      correct for the syscall; if that's the case then this assertion
1622      fires.  See PRE(thread_fast_set_cthread_self) for an example.  On
1623      non-Darwin platforms this assertion is should never fail, and this
1624      comment is completely irrelevant. */
1625   /* Ok, looks sane */
1626
1627   /* Get the system call number.  Because the pre-handler isn't
1628      allowed to mess with it, it should be the same for both the
1629      original and potentially-modified args. */
1630   vg_assert(sci->args.sysno == sci->orig_args.sysno);
1631   sysno = sci->args.sysno;
1632   ent = get_syscall_entry(sysno);
1633
1634   /* pre: status == Complete (asserted above) */
1635   /* Consider either success or failure.  Now run the post handler if:
1636      - it exists, and
1637      - Success or (Failure and PostOnFail is set)
1638   */
1639   if (ent->after
1640       && ((!sr_isError(sci->status.sres))
1641           || (sr_isError(sci->status.sres)
1642               && (sci->flags & SfPostOnFail) ))) {
1643
1644      (ent->after)( tid, &sci->args, &sci->status );
1645   }
1646
1647   /* Because the post handler might have changed the status (eg, the
1648      post-handler for sys_open can change the result from success to
1649      failure if the kernel supplied a fd that it doesn't like), once
1650      again dump the syscall result back in the guest state.*/
1651   if (!(sci->flags & SfNoWriteResult))
1652      putSyscallStatusIntoGuestState( tid, &sci->status, &tst->arch.vex );
1653
1654   /* Do any post-syscall actions required by the tool. */
1655   if (VG_(needs).syscall_wrapper) {
1656      UWord tmpv[8];
1657      tmpv[0] = sci->orig_args.arg1;
1658      tmpv[1] = sci->orig_args.arg2;
1659      tmpv[2] = sci->orig_args.arg3;
1660      tmpv[3] = sci->orig_args.arg4;
1661      tmpv[4] = sci->orig_args.arg5;
1662      tmpv[5] = sci->orig_args.arg6;
1663      tmpv[6] = sci->orig_args.arg7;
1664      tmpv[7] = sci->orig_args.arg8;
1665      VG_TDICT_CALL(tool_post_syscall, tid,
1666                    sysno,
1667                    &tmpv[0], sizeof(tmpv)/sizeof(tmpv[0]),
1668                    sci->status.sres);
1669   }
1670
1671   /* The syscall is done. */
1672   vg_assert(sci->status.what == SsComplete);
1673   sci->status.what = SsIdle;
1674
1675   /* The pre/post wrappers may have concluded that pending signals
1676      might have been created, and will have set SfPollAfter to
1677      request a poll for them once the syscall is done. */
1678   if (sci->flags & SfPollAfter)
1679      VG_(poll_signals)(tid);
1680
1681   /* Similarly, the wrappers might have asked for a yield
1682      afterwards. */
1683   if (sci->flags & SfYieldAfter)
1684      VG_(vg_yield)();
1685}
1686
1687
1688/* ---------------------------------------------------------------------
1689   Dealing with syscalls which get interrupted by a signal:
1690   VG_(fixup_guest_state_after_syscall_interrupted)
1691   ------------------------------------------------------------------ */
1692
/* Syscalls done on behalf of the client are finally handed off to the
   kernel in VG_(client_syscall) above, either by calling
   do_syscall_for_client (the async case), or by calling
   VG_(do_syscall) (the sync case).

   If the syscall is not interrupted by a signal (it may block and
   later unblock, but that's irrelevant here) then those functions
   eventually return and so control is passed to VG_(post_syscall).
   NB: not sure if the sync case can actually get interrupted, as it
   operates with all signals masked.

   However, the syscall may get interrupted by an async-signal.  In
   that case do_syscall_for_client/VG_(do_syscall) do not return.
   Instead we wind up in m_signals.async_sighandler.  We need to fix
   up the guest state so that, from the guest's point of view, the
   syscall was interrupted.  So async_sighandler calls here, and this
   does the fixup.  Note that from here we wind up calling
   VG_(post_syscall) too.
*/
1712
1713
/* These are addresses within ML_(do_syscall_for_client_WRK).  See
   syscall-$PLAT.S for details.

   VG_(fixup_guest_state_after_syscall_interrupted) compares the
   (real) IP at the time of the signal against these markers to work
   out how far the syscall sequence had got: in [setup, restart),
   exactly at restart, in [complete, committed), or in
   [committed, finished).
*/
#if defined(VGO_linux)
  extern const Addr ML_(blksys_setup);
  extern const Addr ML_(blksys_restart);
  extern const Addr ML_(blksys_complete);
  extern const Addr ML_(blksys_committed);
  extern const Addr ML_(blksys_finished);
#elif defined(VGO_darwin)
  /* Darwin requires extra ugliness: there is a separate set of five
     markers for each of the _MACH, _MDEP and _UNIX suffixes. */
  extern const Addr ML_(blksys_setup_MACH);
  extern const Addr ML_(blksys_restart_MACH);
  extern const Addr ML_(blksys_complete_MACH);
  extern const Addr ML_(blksys_committed_MACH);
  extern const Addr ML_(blksys_finished_MACH);
  extern const Addr ML_(blksys_setup_MDEP);
  extern const Addr ML_(blksys_restart_MDEP);
  extern const Addr ML_(blksys_complete_MDEP);
  extern const Addr ML_(blksys_committed_MDEP);
  extern const Addr ML_(blksys_finished_MDEP);
  extern const Addr ML_(blksys_setup_UNIX);
  extern const Addr ML_(blksys_restart_UNIX);
  extern const Addr ML_(blksys_complete_UNIX);
  extern const Addr ML_(blksys_committed_UNIX);
  extern const Addr ML_(blksys_finished_UNIX);
#else
# error "Unknown OS"
#endif
1743
1744
1745/* Back up guest state to restart a system call. */
1746
1747void ML_(fixup_guest_state_to_restart_syscall) ( ThreadArchState* arch )
1748{
1749#if defined(VGP_x86_linux)
1750   arch->vex.guest_EIP -= 2;             // sizeof(int $0x80)
1751
1752   /* Make sure our caller is actually sane, and we're really backing
1753      back over a syscall.
1754
1755      int $0x80 == CD 80
1756   */
1757   {
1758      UChar *p = (UChar *)arch->vex.guest_EIP;
1759
1760      if (p[0] != 0xcd || p[1] != 0x80)
1761         VG_(message)(Vg_DebugMsg,
1762                      "?! restarting over syscall at %#x %02x %02x\n",
1763                      arch->vex.guest_EIP, p[0], p[1]);
1764
1765      vg_assert(p[0] == 0xcd && p[1] == 0x80);
1766   }
1767
1768#elif defined(VGP_amd64_linux)
1769   arch->vex.guest_RIP -= 2;             // sizeof(syscall)
1770
1771   /* Make sure our caller is actually sane, and we're really backing
1772      back over a syscall.
1773
1774      syscall == 0F 05
1775   */
1776   {
1777      UChar *p = (UChar *)arch->vex.guest_RIP;
1778
1779      if (p[0] != 0x0F || p[1] != 0x05)
1780         VG_(message)(Vg_DebugMsg,
1781                      "?! restarting over syscall at %#llx %02x %02x\n",
1782                      arch->vex.guest_RIP, p[0], p[1]);
1783
1784      vg_assert(p[0] == 0x0F && p[1] == 0x05);
1785   }
1786
1787#elif defined(VGP_ppc32_linux) || defined(VGP_ppc64_linux)
1788   arch->vex.guest_CIA -= 4;             // sizeof(ppc32 instr)
1789
1790   /* Make sure our caller is actually sane, and we're really backing
1791      back over a syscall.
1792
1793      sc == 44 00 00 02
1794   */
1795   {
1796      UChar *p = (UChar *)arch->vex.guest_CIA;
1797
1798      if (p[0] != 0x44 || p[1] != 0x0 || p[2] != 0x0 || p[3] != 0x02)
1799         VG_(message)(Vg_DebugMsg,
1800                      "?! restarting over syscall at %#llx %02x %02x %02x %02x\n",
1801                      arch->vex.guest_CIA + 0ULL, p[0], p[1], p[2], p[3]);
1802
1803      vg_assert(p[0] == 0x44 && p[1] == 0x0 && p[2] == 0x0 && p[3] == 0x2);
1804   }
1805
1806#elif defined(VGP_arm_linux)
1807   if (arch->vex.guest_R15T & 1) {
1808      // Thumb mode.  SVC is a encoded as
1809      //   1101 1111 imm8
1810      // where imm8 is the SVC number, and we only accept 0.
1811      arch->vex.guest_R15T -= 2;   // sizeof(thumb 16 bit insn)
1812      UChar* p     = (UChar*)(arch->vex.guest_R15T - 1);
1813      Bool   valid = p[0] == 0 && p[1] == 0xDF;
1814      if (!valid) {
1815         VG_(message)(Vg_DebugMsg,
1816                      "?! restarting over (Thumb) syscall that is not syscall "
1817                      "at %#llx %02x %02x\n",
1818                      arch->vex.guest_R15T - 1ULL, p[0], p[1]);
1819      }
1820      vg_assert(valid);
1821      // FIXME: NOTE, this really isn't right.  We need to back up
1822      // ITSTATE to what it was before the SVC instruction, but we
1823      // don't know what it was.  At least assert that it is now
1824      // zero, because if it is nonzero then it must also have
1825      // been nonzero for the SVC itself, which means it was
1826      // conditional.  Urk.
1827      vg_assert(arch->vex.guest_ITSTATE == 0);
1828   } else {
1829      // ARM mode.  SVC is encoded as
1830      //   cond 1111 imm24
1831      // where imm24 is the SVC number, and we only accept 0.
1832      arch->vex.guest_R15T -= 4;   // sizeof(arm instr)
1833      UChar* p     = (UChar*)arch->vex.guest_R15T;
1834      Bool   valid = p[0] == 0 && p[1] == 0 && p[2] == 0
1835                     && (p[3] & 0xF) == 0xF;
1836      if (!valid) {
1837         VG_(message)(Vg_DebugMsg,
1838                      "?! restarting over (ARM) syscall that is not syscall "
1839                      "at %#llx %02x %02x %02x %02x\n",
1840                      arch->vex.guest_R15T + 0ULL, p[0], p[1], p[2], p[3]);
1841      }
1842      vg_assert(valid);
1843   }
1844
1845#elif defined(VGP_x86_darwin)
1846   arch->vex.guest_EIP = arch->vex.guest_IP_AT_SYSCALL;
1847
1848   /* Make sure our caller is actually sane, and we're really backing
1849      back over a syscall.
1850
1851      int $0x80 == CD 80
1852      int $0x81 == CD 81
1853      int $0x82 == CD 82
1854      sysenter  == 0F 34
1855   */
1856   {
1857       UChar *p = (UChar *)arch->vex.guest_EIP;
1858       Bool  ok = (p[0] == 0xCD && p[1] == 0x80)
1859                  || (p[0] == 0xCD && p[1] == 0x81)
1860                  || (p[0] == 0xCD && p[1] == 0x82)
1861                  || (p[0] == 0x0F && p[1] == 0x34);
1862       if (!ok)
1863           VG_(message)(Vg_DebugMsg,
1864                        "?! restarting over syscall at %#x %02x %02x\n",
1865                        arch->vex.guest_EIP, p[0], p[1]);
1866       vg_assert(ok);
1867   }
1868
1869#elif defined(VGP_amd64_darwin)
1870   // DDD: #warning GrP fixme amd64 restart unimplemented
1871   vg_assert(0);
1872
1873#elif defined(VGP_s390x_linux)
1874   arch->vex.guest_IA -= 2;             // sizeof(syscall)
1875
1876   /* Make sure our caller is actually sane, and we're really backing
1877      back over a syscall.
1878
1879      syscall == 0A <num>
1880   */
1881   {
1882      UChar *p = (UChar *)arch->vex.guest_IA;
1883      if (p[0] != 0x0A)
1884         VG_(message)(Vg_DebugMsg,
1885                      "?! restarting over syscall at %#llx %02x %02x\n",
1886                      arch->vex.guest_IA, p[0], p[1]);
1887
1888      vg_assert(p[0] == 0x0A);
1889   }
1890#else
1891#  error "ML_(fixup_guest_state_to_restart_syscall): unknown plat"
1892#endif
1893}
1894
1895/*
1896   Fix up the guest state when a syscall is interrupted by a signal
1897   and so has been forced to return 'sysret'.
1898
1899   To do this, we determine the precise state of the syscall by
1900   looking at the (real) IP at the time the signal happened.  The
1901   syscall sequence looks like:
1902
1903     1. unblock signals
1904     2. perform syscall
1905     3. save result to guest state (EAX, RAX, R3+CR0.SO)
1906     4. re-block signals
1907
1908   If a signal
1909   happens at      Then     Why?
1910   [1-2)           restart  nothing has happened (restart syscall)
1911   [2]             restart  syscall hasn't started, or kernel wants to restart
1912   [2-3)           save     syscall complete, but results not saved
1913   [3-4)           syscall complete, results saved
1914
1915   Sometimes we never want to restart an interrupted syscall (because
1916   sigaction says not to), so we only restart if "restart" is True.
1917
1918   This will also call VG_(post_syscall) if the syscall has actually
1919   completed (either because it was interrupted, or because it
1920   actually finished).  It will not call VG_(post_syscall) if the
1921   syscall is set up for restart, which means that the pre-wrapper may
1922   get called multiple times.
1923*/
1924
1925void
1926VG_(fixup_guest_state_after_syscall_interrupted)( ThreadId tid,
1927                                                  Addr     ip,
1928                                                  SysRes   sres,
1929                                                  Bool     restart)
1930{
1931   /* Note that we don't know the syscall number here, since (1) in
1932      general there's no reliable way to get hold of it short of
1933      stashing it in the guest state before the syscall, and (2) in
1934      any case we don't need to know it for the actions done by this
1935      routine.
1936
1937      Furthermore, 'sres' is only used in the case where the syscall
1938      is complete, but the result has not been committed to the guest
1939      state yet.  In any other situation it will be meaningless and
1940      therefore ignored. */
1941
1942   ThreadState*     tst;
1943   SyscallStatus    canonical;
1944   ThreadArchState* th_regs;
1945   SyscallInfo*     sci;
1946
1947   /* Compute some Booleans indicating which range we're in. */
1948   Bool outside_range,
1949        in_setup_to_restart,      // [1,2) in the .S files
1950        at_restart,               // [2]   in the .S files
1951        in_complete_to_committed, // [3,4) in the .S files
1952        in_committed_to_finished; // [4,5) in the .S files
1953
1954#  if defined(VGO_linux)
1955   outside_range
1956      = ip < ML_(blksys_setup) || ip >= ML_(blksys_finished);
1957   in_setup_to_restart
1958      = ip >= ML_(blksys_setup) && ip < ML_(blksys_restart);
1959   at_restart
1960      = ip == ML_(blksys_restart);
1961   in_complete_to_committed
1962      = ip >= ML_(blksys_complete) && ip < ML_(blksys_committed);
1963   in_committed_to_finished
1964      = ip >= ML_(blksys_committed) && ip < ML_(blksys_finished);
1965#  elif defined(VGO_darwin)
1966   outside_range
1967      =  (ip < ML_(blksys_setup_MACH) || ip >= ML_(blksys_finished_MACH))
1968      && (ip < ML_(blksys_setup_MDEP) || ip >= ML_(blksys_finished_MDEP))
1969      && (ip < ML_(blksys_setup_UNIX) || ip >= ML_(blksys_finished_UNIX));
1970   in_setup_to_restart
1971      =  (ip >= ML_(blksys_setup_MACH) && ip < ML_(blksys_restart_MACH))
1972      || (ip >= ML_(blksys_setup_MDEP) && ip < ML_(blksys_restart_MDEP))
1973      || (ip >= ML_(blksys_setup_UNIX) && ip < ML_(blksys_restart_UNIX));
1974   at_restart
1975      =  (ip == ML_(blksys_restart_MACH))
1976      || (ip == ML_(blksys_restart_MDEP))
1977      || (ip == ML_(blksys_restart_UNIX));
1978   in_complete_to_committed
1979      =  (ip >= ML_(blksys_complete_MACH) && ip < ML_(blksys_committed_MACH))
1980      || (ip >= ML_(blksys_complete_MDEP) && ip < ML_(blksys_committed_MDEP))
1981      || (ip >= ML_(blksys_complete_UNIX) && ip < ML_(blksys_committed_UNIX));
1982   in_committed_to_finished
1983      =  (ip >= ML_(blksys_committed_MACH) && ip < ML_(blksys_finished_MACH))
1984      || (ip >= ML_(blksys_committed_MDEP) && ip < ML_(blksys_finished_MDEP))
1985      || (ip >= ML_(blksys_committed_UNIX) && ip < ML_(blksys_finished_UNIX));
1986   /* Wasn't that just So Much Fun?  Does your head hurt yet?  Mine does. */
1987#  else
1988#    error "Unknown OS"
1989#  endif
1990
1991   if (VG_(clo_trace_signals))
1992      VG_(message)( Vg_DebugMsg,
1993                    "interrupted_syscall: tid=%d, ip=0x%llx, "
1994                    "restart=%s, sres.isErr=%s, sres.val=%lld\n",
1995                    (Int)tid,
1996                    (ULong)ip,
1997                    restart ? "True" : "False",
1998                    sr_isError(sres) ? "True" : "False",
1999                    (Long)(sr_isError(sres) ? sr_Err(sres) : sr_Res(sres)) );
2000
2001   vg_assert(VG_(is_valid_tid)(tid));
2002   vg_assert(tid >= 1 && tid < VG_N_THREADS);
2003   vg_assert(VG_(is_running_thread)(tid));
2004
2005   tst     = VG_(get_ThreadState)(tid);
2006   th_regs = &tst->arch;
2007   sci     = & syscallInfo[tid];
2008
2009   /* Figure out what the state of the syscall was by examining the
2010      (real) IP at the time of the signal, and act accordingly. */
2011   if (outside_range) {
2012      if (VG_(clo_trace_signals))
2013         VG_(message)( Vg_DebugMsg,
2014                       "  not in syscall at all: hmm, very suspicious\n" );
2015      /* Looks like we weren't in a syscall at all.  Hmm. */
2016      vg_assert(sci->status.what != SsIdle);
2017      return;
2018   }
2019
2020   /* We should not be here unless this thread had first started up
2021      the machinery for a syscall by calling VG_(client_syscall).
2022      Hence: */
2023   vg_assert(sci->status.what != SsIdle);
2024
2025   /* now, do one of four fixup actions, depending on where the IP has
2026      got to. */
2027
2028   if (in_setup_to_restart) {
2029      /* syscall hasn't even started; go around again */
2030      if (VG_(clo_trace_signals))
2031         VG_(message)( Vg_DebugMsg, "  not started: restarting\n");
2032      vg_assert(sci->status.what == SsHandToKernel);
2033      ML_(fixup_guest_state_to_restart_syscall)(th_regs);
2034   }
2035
2036   else
2037   if (at_restart) {
2038      /* We're either about to run the syscall, or it was interrupted
2039         and the kernel restarted it.  Restart if asked, otherwise
2040         EINTR it. */
2041      if (restart) {
2042         if (VG_(clo_trace_signals))
2043            VG_(message)( Vg_DebugMsg, "  at syscall instr: restarting\n");
2044         ML_(fixup_guest_state_to_restart_syscall)(th_regs);
2045      } else {
2046         if (VG_(clo_trace_signals))
2047            VG_(message)( Vg_DebugMsg, "  at syscall instr: returning EINTR\n");
2048         canonical = convert_SysRes_to_SyscallStatus(
2049                        VG_(mk_SysRes_Error)( VKI_EINTR )
2050                     );
2051         if (!(sci->flags & SfNoWriteResult))
2052            putSyscallStatusIntoGuestState( tid, &canonical, &th_regs->vex );
2053         sci->status = canonical;
2054         VG_(post_syscall)(tid);
2055      }
2056   }
2057
2058   else
2059   if (in_complete_to_committed) {
2060      /* Syscall complete, but result hasn't been written back yet.
2061         Write the SysRes we were supplied with back to the guest
2062         state. */
2063      if (VG_(clo_trace_signals))
2064         VG_(message)( Vg_DebugMsg,
2065                       "  completed, but uncommitted: committing\n");
2066      canonical = convert_SysRes_to_SyscallStatus( sres );
2067      if (!(sci->flags & SfNoWriteResult))
2068         putSyscallStatusIntoGuestState( tid, &canonical, &th_regs->vex );
2069      sci->status = canonical;
2070      VG_(post_syscall)(tid);
2071   }
2072
2073   else
2074   if (in_committed_to_finished) {
2075      /* Result committed, but the signal mask has not been restored;
2076         we expect our caller (the signal handler) will have fixed
2077         this up. */
2078      if (VG_(clo_trace_signals))
2079         VG_(message)( Vg_DebugMsg,
2080                       "  completed and committed: nothing to do\n");
2081      getSyscallStatusFromGuestState( &sci->status, &th_regs->vex );
2082      vg_assert(sci->status.what == SsComplete);
2083      VG_(post_syscall)(tid);
2084   }
2085
2086   else
2087      VG_(core_panic)("?? strange syscall interrupt state?");
2088
2089   /* In all cases, the syscall is now finished (even if we called
2090      ML_(fixup_guest_state_to_restart_syscall), since that just
2091      re-positions the guest's IP for another go at it).  So we need
2092      to record that fact. */
2093   sci->status.what = SsIdle;
2094}
2095
2096
#if defined(VGO_darwin)
// Clean up after workq_ops(WQOPS_THREAD_RETURN) jumped to wqthread_hijack.
// This is similar to VG_(fixup_guest_state_after_syscall_interrupted).
// This longjmps back to the scheduler.
//
// In order: take the BigLock for 'tid', check that a syscall is in
// progress for it, record a successful (0) result without writing it
// to the guest state, run VG_(post_syscall), and finally longjmp to
// tst->sched_jmpbuf.  Control never returns to the caller.
void ML_(wqthread_continue_NORETURN)(ThreadId tid)
{
   ThreadState*     tst;
   SyscallInfo*     sci;

   VG_(acquire_BigLock)(tid, "wqthread_continue_NORETURN");

   PRINT("SYSCALL[%d,%d](%s) workq_ops() starting new workqueue item\n",
         VG_(getpid)(), tid, VG_SYSNUM_STRING(__NR_workq_ops));

   vg_assert(VG_(is_valid_tid)(tid));
   vg_assert(tid >= 1 && tid < VG_N_THREADS);
   vg_assert(VG_(is_running_thread)(tid));

   tst     = VG_(get_ThreadState)(tid);
   sci     = & syscallInfo[tid];
   // We must be in the middle of a syscall for this thread.
   vg_assert(sci->status.what != SsIdle);
   vg_assert(tst->os_state.wq_jmpbuf_valid);  // check this BEFORE post_syscall

   // Pretend the syscall completed normally, but don't touch the thread state.
   // SfNoWriteResult makes VG_(post_syscall) skip both its guest-state
   // consistency check and its write-back of the result.
   sci->status = convert_SysRes_to_SyscallStatus( VG_(mk_SysRes_Success)(0) );
   sci->flags |= SfNoWriteResult;
   VG_(post_syscall)(tid);

   // VG_(post_syscall) sets the entry to SsIdle itself before it
   // returns; this sets it again explicitly.
   sci->status.what = SsIdle;

   // Jump back into the scheduler; the jump buffer must be valid.
   vg_assert(tst->sched_jmpbuf_valid);
   VG_MINIMAL_LONGJMP(tst->sched_jmpbuf);

   /* NOTREACHED */
   vg_assert(0);
}
#endif
2134
2135
2136/* ---------------------------------------------------------------------
2137   A place to store the where-to-call-when-really-done pointer
2138   ------------------------------------------------------------------ */
2139
// When the final thread is done, where shall I call to shut down the
// system cleanly?  Is set once at startup (in m_main) and never
// changes after that.  Is basically a pointer to the exit
// continuation.  This is all just a nasty hack to avoid calling
// directly from m_syswrap to m_main at exit, since that would cause
// m_main to become part of a module cycle, which is silly.
//
// The pointed-to function takes a ThreadId and a VgSchedReturnCode.
// The pointer is NULL until it has been set.
void (* VG_(address_of_m_main_shutdown_actions_NORETURN) )
       (ThreadId,VgSchedReturnCode)
   = NULL;
2149
2150/*--------------------------------------------------------------------*/
2151/*--- end                                                          ---*/
2152/*--------------------------------------------------------------------*/
2153