1/* -*- mode: C; c-basic-offset: 3; -*- */
2
3/*--------------------------------------------------------------------*/
4/*--- Wrappers for generic Unix system calls                       ---*/
5/*---                                            syswrap-generic.c ---*/
6/*--------------------------------------------------------------------*/
7
8/*
9   This file is part of Valgrind, a dynamic binary instrumentation
10   framework.
11
12   Copyright (C) 2000-2015 Julian Seward
13      jseward@acm.org
14
15   This program is free software; you can redistribute it and/or
16   modify it under the terms of the GNU General Public License as
17   published by the Free Software Foundation; either version 2 of the
18   License, or (at your option) any later version.
19
20   This program is distributed in the hope that it will be useful, but
21   WITHOUT ANY WARRANTY; without even the implied warranty of
22   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
23   General Public License for more details.
24
25   You should have received a copy of the GNU General Public License
26   along with this program; if not, write to the Free Software
27   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
28   02111-1307, USA.
29
30   The GNU General Public License is contained in the file COPYING.
31*/
32
33#if defined(VGO_linux) || defined(VGO_darwin) || defined(VGO_solaris)
34
35#include "pub_core_basics.h"
36#include "pub_core_vki.h"
37#include "pub_core_vkiscnums.h"
38#include "pub_core_threadstate.h"
39#include "pub_core_debuginfo.h"     // VG_(di_notify_*)
40#include "pub_core_aspacemgr.h"
41#include "pub_core_transtab.h"      // VG_(discard_translations)
42#include "pub_core_xarray.h"
43#include "pub_core_clientstate.h"   // VG_(brk_base), VG_(brk_limit)
44#include "pub_core_debuglog.h"
45#include "pub_core_errormgr.h"
46#include "pub_core_gdbserver.h"     // VG_(gdbserver)
47#include "pub_core_libcbase.h"
48#include "pub_core_libcassert.h"
49#include "pub_core_libcfile.h"
50#include "pub_core_libcprint.h"
51#include "pub_core_libcproc.h"
52#include "pub_core_libcsignal.h"
53#include "pub_core_machine.h"       // VG_(get_SP)
54#include "pub_core_mallocfree.h"
55#include "pub_core_options.h"
56#include "pub_core_scheduler.h"
57#include "pub_core_signals.h"
58#include "pub_core_stacktrace.h"    // For VG_(get_and_pp_StackTrace)()
59#include "pub_core_syscall.h"
60#include "pub_core_syswrap.h"
61#include "pub_core_tooliface.h"
62#include "pub_core_ume.h"
63#include "pub_core_stacks.h"
64
65#include "priv_types_n_macros.h"
66#include "priv_syswrap-generic.h"
67
68#include "config.h"
69
70
71void ML_(guess_and_register_stack) (Addr sp, ThreadState* tst)
72{
73   Bool debug = False;
74   NSegment const* seg;
75
76   /* We don't really know where the client stack is, because its
77      allocated by the client.  The best we can do is look at the
78      memory mappings and try to derive some useful information.  We
79      assume that sp starts near its highest possible value, and can
80      only go down to the start of the mmaped segment. */
81   seg = VG_(am_find_nsegment)(sp);
82   if (seg &&
83       VG_(am_is_valid_for_client)(sp, 1, VKI_PROT_READ | VKI_PROT_WRITE)) {
84      tst->client_stack_highest_byte = (Addr)VG_PGROUNDUP(sp)-1;
85      tst->client_stack_szB = tst->client_stack_highest_byte - seg->start + 1;
86
87      VG_(register_stack)(seg->start, tst->client_stack_highest_byte);
88
89      if (debug)
90	 VG_(printf)("tid %u: guessed client stack range [%#lx-%#lx]\n",
91		     tst->tid, seg->start, tst->client_stack_highest_byte);
92   } else {
93      VG_(message)(Vg_UserMsg,
94                   "!? New thread %u starts with SP(%#lx) unmapped\n",
95		   tst->tid, sp);
96      tst->client_stack_highest_byte = 0;
97      tst->client_stack_szB  = 0;
98   }
99}
100
101/* Returns True iff address range is something the client can
102   plausibly mess with: all of it is either already belongs to the
103   client or is free or a reservation. */
104
105Bool ML_(valid_client_addr)(Addr start, SizeT size, ThreadId tid,
106                                   const HChar *syscallname)
107{
108   Bool ret;
109
110   if (size == 0)
111      return True;
112
113   ret = VG_(am_is_valid_for_client_or_free_or_resvn)
114            (start,size,VKI_PROT_NONE);
115
116   if (0)
117      VG_(printf)("%s: test=%#lx-%#lx ret=%d\n",
118		  syscallname, start, start+size-1, (Int)ret);
119
120   if (!ret && syscallname != NULL) {
121      VG_(message)(Vg_UserMsg, "Warning: client syscall %s tried "
122                               "to modify addresses %#lx-%#lx\n",
123                               syscallname, start, start+size-1);
124      if (VG_(clo_verbosity) > 1) {
125         VG_(get_and_pp_StackTrace)(tid, VG_(clo_backtrace_size));
126      }
127   }
128
129   return ret;
130}
131
132
133Bool ML_(client_signal_OK)(Int sigNo)
134{
135   /* signal 0 is OK for kill */
136   Bool ret = sigNo >= 0 && sigNo <= VG_SIGVGRTUSERMAX;
137
138   //VG_(printf)("client_signal_OK(%d) -> %d\n", sigNo, ret);
139
140   return ret;
141}
142
143
144/* Handy small function to help stop wrappers from segfaulting when
145   presented with bogus client addresses.  Is not used for generating
146   user-visible errors. */
147
148Bool ML_(safe_to_deref) ( const void *start, SizeT size )
149{
150   return VG_(am_is_valid_for_client)( (Addr)start, size, VKI_PROT_READ );
151}
152
153
154/* ---------------------------------------------------------------------
155   Doing mmap, mremap
156   ------------------------------------------------------------------ */
157
158/* AFAICT from kernel sources (mm/mprotect.c) and general experimentation,
159   munmap, mprotect (and mremap??) work at the page level.  So addresses
160   and lengths must be adjusted for this. */
161
162/* Mash around start and length so that the area exactly covers
163   an integral number of pages.  If we don't do that, memcheck's
164   idea of addressible memory diverges from that of the
165   kernel's, which causes the leak detector to crash. */
166static
167void page_align_addr_and_len( Addr* a, SizeT* len)
168{
169   Addr ra;
170
171   ra = VG_PGROUNDDN(*a);
172   *len = VG_PGROUNDUP(*a + *len) - ra;
173   *a = ra;
174}
175
176static void notify_core_of_mmap(Addr a, SizeT len, UInt prot,
177                                UInt flags, Int fd, Off64T offset)
178{
179   Bool d;
180
181   /* 'a' is the return value from a real kernel mmap, hence: */
182   vg_assert(VG_IS_PAGE_ALIGNED(a));
183   /* whereas len is whatever the syscall supplied.  So: */
184   len = VG_PGROUNDUP(len);
185
186   d = VG_(am_notify_client_mmap)( a, len, prot, flags, fd, offset );
187
188   if (d)
189      VG_(discard_translations)( a, (ULong)len,
190                                 "notify_core_of_mmap" );
191}
192
193static void notify_tool_of_mmap(Addr a, SizeT len, UInt prot, ULong di_handle)
194{
195   Bool rr, ww, xx;
196
197   /* 'a' is the return value from a real kernel mmap, hence: */
198   vg_assert(VG_IS_PAGE_ALIGNED(a));
199   /* whereas len is whatever the syscall supplied.  So: */
200   len = VG_PGROUNDUP(len);
201
202   rr = toBool(prot & VKI_PROT_READ);
203   ww = toBool(prot & VKI_PROT_WRITE);
204   xx = toBool(prot & VKI_PROT_EXEC);
205
206   VG_TRACK( new_mem_mmap, a, len, rr, ww, xx, di_handle );
207}
208
209
210/* When a client mmap has been successfully done, this function must
211   be called.  It notifies both aspacem and the tool of the new
212   mapping.
213
214   JRS 2008-Aug-14: But notice this is *very* obscure.  The only place
215   it is called from is POST(sys_io_setup).  In particular,
216   ML_(generic_PRE_sys_mmap), in m_syswrap, is the "normal case" handler for
217   client mmap.  But it doesn't call this function; instead it does the
218   relevant notifications itself.  Here, we just pass di_handle=0 to
219   notify_tool_of_mmap as we have no better information.  But really this
220   function should be done away with; problem is I don't understand what
221   POST(sys_io_setup) does or how it works.
222
223   [However, this function is used lots for Darwin, because
224    ML_(generic_PRE_sys_mmap) cannot be used for Darwin.]
225 */
226void
227ML_(notify_core_and_tool_of_mmap) ( Addr a, SizeT len, UInt prot,
228                                    UInt flags, Int fd, Off64T offset )
229{
230   // XXX: unlike the other notify_core_and_tool* functions, this one doesn't
231   // do anything with debug info (ie. it doesn't call VG_(di_notify_mmap)).
232   // Should it?  --njn
233   notify_core_of_mmap(a, len, prot, flags, fd, offset);
234   notify_tool_of_mmap(a, len, prot, 0/*di_handle*/);
235}
236
237void
238ML_(notify_core_and_tool_of_munmap) ( Addr a, SizeT len )
239{
240   Bool d;
241
242   page_align_addr_and_len(&a, &len);
243   d = VG_(am_notify_munmap)(a, len);
244   VG_TRACK( die_mem_munmap, a, len );
245   VG_(di_notify_munmap)( a, len );
246   if (d)
247      VG_(discard_translations)( a, (ULong)len,
248                                 "ML_(notify_core_and_tool_of_munmap)" );
249}
250
251void
252ML_(notify_core_and_tool_of_mprotect) ( Addr a, SizeT len, Int prot )
253{
254   Bool rr = toBool(prot & VKI_PROT_READ);
255   Bool ww = toBool(prot & VKI_PROT_WRITE);
256   Bool xx = toBool(prot & VKI_PROT_EXEC);
257   Bool d;
258
259   page_align_addr_and_len(&a, &len);
260   d = VG_(am_notify_mprotect)(a, len, prot);
261   VG_TRACK( change_mem_mprotect, a, len, rr, ww, xx );
262   VG_(di_notify_mprotect)( a, len, prot );
263   if (d)
264      VG_(discard_translations)( a, (ULong)len,
265                                 "ML_(notify_core_and_tool_of_mprotect)" );
266}
267
268
269
270#if HAVE_MREMAP
271/* Expand (or shrink) an existing mapping, potentially moving it at
272   the same time (controlled by the MREMAP_MAYMOVE flag).  Nightmare.
273*/
274static
275SysRes do_mremap( Addr old_addr, SizeT old_len,
276                  Addr new_addr, SizeT new_len,
277                  UWord flags, ThreadId tid )
278{
279#  define MIN_SIZET(_aa,_bb) (_aa) < (_bb) ? (_aa) : (_bb)
280
281   Bool      ok, d;
282   NSegment const* old_seg;
283   Addr      advised;
284   Bool      f_fixed   = toBool(flags & VKI_MREMAP_FIXED);
285   Bool      f_maymove = toBool(flags & VKI_MREMAP_MAYMOVE);
286
287   if (0)
288      VG_(printf)("do_remap (old %#lx %lu) (new %#lx %lu) %s %s\n",
289                  old_addr,old_len,new_addr,new_len,
290                  flags & VKI_MREMAP_MAYMOVE ? "MAYMOVE" : "",
291                  flags & VKI_MREMAP_FIXED ? "FIXED" : "");
292   if (0)
293      VG_(am_show_nsegments)(0, "do_remap: before");
294
295   if (flags & ~(VKI_MREMAP_FIXED | VKI_MREMAP_MAYMOVE))
296      goto eINVAL;
297
298   if (!VG_IS_PAGE_ALIGNED(old_addr))
299      goto eINVAL;
300
301   old_len = VG_PGROUNDUP(old_len);
302   new_len = VG_PGROUNDUP(new_len);
303
304   if (new_len == 0)
305      goto eINVAL;
306
307   /* kernel doesn't reject this, but we do. */
308   if (old_len == 0)
309      goto eINVAL;
310
311   /* reject wraparounds */
312   if (old_addr + old_len < old_addr)
313      goto eINVAL;
314   if (f_fixed == True && new_addr + new_len < new_len)
315      goto eINVAL;
316
317   /* kernel rejects all fixed, no-move requests (which are
318      meaningless). */
319   if (f_fixed == True && f_maymove == False)
320      goto eINVAL;
321
322   /* Stay away from non-client areas. */
323   if (!ML_(valid_client_addr)(old_addr, old_len, tid, "mremap(old_addr)"))
324      goto eINVAL;
325
326   /* In all remaining cases, if the old range does not fall within a
327      single segment, fail. */
328   old_seg = VG_(am_find_nsegment)( old_addr );
329   if (old_addr < old_seg->start || old_addr+old_len-1 > old_seg->end)
330      goto eINVAL;
331   if (old_seg->kind != SkAnonC && old_seg->kind != SkFileC &&
332       old_seg->kind != SkShmC)
333      goto eINVAL;
334
335   vg_assert(old_len > 0);
336   vg_assert(new_len > 0);
337   vg_assert(VG_IS_PAGE_ALIGNED(old_len));
338   vg_assert(VG_IS_PAGE_ALIGNED(new_len));
339   vg_assert(VG_IS_PAGE_ALIGNED(old_addr));
340
341   /* There are 3 remaining cases:
342
343      * maymove == False
344
345        new space has to be at old address, so:
346            - shrink    -> unmap end
347            - same size -> do nothing
348            - grow      -> if can grow in-place, do so, else fail
349
350      * maymove == True, fixed == False
351
352        new space can be anywhere, so:
353            - shrink    -> unmap end
354            - same size -> do nothing
355            - grow      -> if can grow in-place, do so, else
356                           move to anywhere large enough, else fail
357
358      * maymove == True, fixed == True
359
360        new space must be at new address, so:
361
362            - if new address is not page aligned, fail
363            - if new address range overlaps old one, fail
364            - if new address range cannot be allocated, fail
365            - else move to new address range with new size
366            - else fail
367   */
368
369   if (f_maymove == False) {
370      /* new space has to be at old address */
371      if (new_len < old_len)
372         goto shrink_in_place;
373      if (new_len > old_len)
374         goto grow_in_place_or_fail;
375      goto same_in_place;
376   }
377
378   if (f_maymove == True && f_fixed == False) {
379      /* new space can be anywhere */
380      if (new_len < old_len)
381         goto shrink_in_place;
382      if (new_len > old_len)
383         goto grow_in_place_or_move_anywhere_or_fail;
384      goto same_in_place;
385   }
386
387   if (f_maymove == True && f_fixed == True) {
388      /* new space can only be at the new address */
389      if (!VG_IS_PAGE_ALIGNED(new_addr))
390         goto eINVAL;
391      if (new_addr+new_len-1 < old_addr || new_addr > old_addr+old_len-1) {
392         /* no overlap */
393      } else {
394         goto eINVAL;
395      }
396      if (new_addr == 0)
397         goto eINVAL;
398         /* VG_(am_get_advisory_client_simple) interprets zero to mean
399            non-fixed, which is not what we want */
400      advised = VG_(am_get_advisory_client_simple)(new_addr, new_len, &ok);
401      if (!ok || advised != new_addr)
402         goto eNOMEM;
403      ok = VG_(am_relocate_nooverlap_client)
404              ( &d, old_addr, old_len, new_addr, new_len );
405      if (ok) {
406         VG_TRACK( copy_mem_remap, old_addr, new_addr,
407                                   MIN_SIZET(old_len,new_len) );
408         if (new_len > old_len)
409            VG_TRACK( new_mem_mmap, new_addr+old_len, new_len-old_len,
410                      old_seg->hasR, old_seg->hasW, old_seg->hasX,
411                      0/*di_handle*/ );
412         VG_TRACK(die_mem_munmap, old_addr, old_len);
413         if (d) {
414            VG_(discard_translations)( old_addr, old_len, "do_remap(1)" );
415            VG_(discard_translations)( new_addr, new_len, "do_remap(2)" );
416         }
417         return VG_(mk_SysRes_Success)( new_addr );
418      }
419      goto eNOMEM;
420   }
421
422   /* end of the 3 cases */
423   /*NOTREACHED*/ vg_assert(0);
424
425  grow_in_place_or_move_anywhere_or_fail:
426   {
427   /* try growing it in-place */
428   Addr   needA = old_addr + old_len;
429   SSizeT needL = new_len - old_len;
430
431   vg_assert(needL > 0);
432   vg_assert(needA > 0);
433
434   advised = VG_(am_get_advisory_client_simple)( needA, needL, &ok );
435   if (ok) {
436      /* Fixes bug #129866. */
437      ok = VG_(am_covered_by_single_free_segment) ( needA, needL );
438   }
439   if (ok && advised == needA) {
440      const NSegment *new_seg = VG_(am_extend_map_client)( old_addr, needL );
441      if (new_seg) {
442         VG_TRACK( new_mem_mmap, needA, needL,
443                                 new_seg->hasR,
444                                 new_seg->hasW, new_seg->hasX,
445                                 0/*di_handle*/ );
446         return VG_(mk_SysRes_Success)( old_addr );
447      }
448   }
449
450   /* that failed.  Look elsewhere. */
451   advised = VG_(am_get_advisory_client_simple)( 0, new_len, &ok );
452   if (ok) {
453      Bool oldR = old_seg->hasR;
454      Bool oldW = old_seg->hasW;
455      Bool oldX = old_seg->hasX;
456      /* assert new area does not overlap old */
457      vg_assert(advised+new_len-1 < old_addr
458                || advised > old_addr+old_len-1);
459      ok = VG_(am_relocate_nooverlap_client)
460              ( &d, old_addr, old_len, advised, new_len );
461      if (ok) {
462         VG_TRACK( copy_mem_remap, old_addr, advised,
463                                   MIN_SIZET(old_len,new_len) );
464         if (new_len > old_len)
465            VG_TRACK( new_mem_mmap, advised+old_len, new_len-old_len,
466                      oldR, oldW, oldX, 0/*di_handle*/ );
467         VG_TRACK(die_mem_munmap, old_addr, old_len);
468         if (d) {
469            VG_(discard_translations)( old_addr, old_len, "do_remap(4)" );
470            VG_(discard_translations)( advised, new_len, "do_remap(5)" );
471         }
472         return VG_(mk_SysRes_Success)( advised );
473      }
474   }
475   goto eNOMEM;
476   }
477   /*NOTREACHED*/ vg_assert(0);
478
479  grow_in_place_or_fail:
480   {
481   Addr  needA = old_addr + old_len;
482   SizeT needL = new_len - old_len;
483
484   vg_assert(needA > 0);
485
486   advised = VG_(am_get_advisory_client_simple)( needA, needL, &ok );
487   if (ok) {
488      /* Fixes bug #129866. */
489      ok = VG_(am_covered_by_single_free_segment) ( needA, needL );
490   }
491   if (!ok || advised != needA)
492      goto eNOMEM;
493   const NSegment *new_seg = VG_(am_extend_map_client)( old_addr, needL );
494   if (!new_seg)
495      goto eNOMEM;
496   VG_TRACK( new_mem_mmap, needA, needL,
497                           new_seg->hasR, new_seg->hasW, new_seg->hasX,
498                           0/*di_handle*/ );
499
500   return VG_(mk_SysRes_Success)( old_addr );
501   }
502   /*NOTREACHED*/ vg_assert(0);
503
504  shrink_in_place:
505   {
506   SysRes sres = VG_(am_munmap_client)( &d, old_addr+new_len, old_len-new_len );
507   if (sr_isError(sres))
508      return sres;
509   VG_TRACK( die_mem_munmap, old_addr+new_len, old_len-new_len );
510   if (d)
511      VG_(discard_translations)( old_addr+new_len, old_len-new_len,
512                                 "do_remap(7)" );
513   return VG_(mk_SysRes_Success)( old_addr );
514   }
515   /*NOTREACHED*/ vg_assert(0);
516
517  same_in_place:
518   return VG_(mk_SysRes_Success)( old_addr );
519   /*NOTREACHED*/ vg_assert(0);
520
521  eINVAL:
522   return VG_(mk_SysRes_Error)( VKI_EINVAL );
523  eNOMEM:
524   return VG_(mk_SysRes_Error)( VKI_ENOMEM );
525
526#  undef MIN_SIZET
527}
528#endif /* HAVE_MREMAP */
529
530
531/* ---------------------------------------------------------------------
532   File-descriptor tracking
533   ------------------------------------------------------------------ */
534
535/* One of these is allocated for each open file descriptor.  */
536typedef struct OpenFd
537{
538   Int fd;                        /* The file descriptor */
539   HChar *pathname;               /* NULL if not a regular file or unknown */
540   ExeContext *where;             /* NULL if inherited from parent */
541   struct OpenFd *next, *prev;
542} OpenFd;
543
544/* List of allocated file descriptors. */
545static OpenFd *allocated_fds = NULL;
546
547/* Count of open file descriptors. */
548static Int fd_count = 0;
549
550
551/* Note the fact that a file descriptor was just closed. */
552void ML_(record_fd_close)(Int fd)
553{
554   OpenFd *i = allocated_fds;
555
556   if (fd >= VG_(fd_hard_limit))
557      return;			/* Valgrind internal */
558
559   while(i) {
560      if(i->fd == fd) {
561         if(i->prev)
562            i->prev->next = i->next;
563         else
564            allocated_fds = i->next;
565         if(i->next)
566            i->next->prev = i->prev;
567         if(i->pathname)
568            VG_(free) (i->pathname);
569         VG_(free) (i);
570         fd_count--;
571         break;
572      }
573      i = i->next;
574   }
575}
576
577/* Note the fact that a file descriptor was just opened.  If the
578   tid is -1, this indicates an inherited fd.  If the pathname is NULL,
579   this either indicates a non-standard file (i.e. a pipe or socket or
580   some such thing) or that we don't know the filename.  If the fd is
581   already open, then we're probably doing a dup2() to an existing fd,
582   so just overwrite the existing one. */
583void ML_(record_fd_open_with_given_name)(ThreadId tid, Int fd,
584                                         const HChar *pathname)
585{
586   OpenFd *i;
587
588   if (fd >= VG_(fd_hard_limit))
589      return;			/* Valgrind internal */
590
591   /* Check to see if this fd is already open. */
592   i = allocated_fds;
593   while (i) {
594      if (i->fd == fd) {
595         if (i->pathname) VG_(free)(i->pathname);
596         break;
597      }
598      i = i->next;
599   }
600
601   /* Not already one: allocate an OpenFd */
602   if (i == NULL) {
603      i = VG_(malloc)("syswrap.rfdowgn.1", sizeof(OpenFd));
604
605      i->prev = NULL;
606      i->next = allocated_fds;
607      if(allocated_fds) allocated_fds->prev = i;
608      allocated_fds = i;
609      fd_count++;
610   }
611
612   i->fd = fd;
613   i->pathname = VG_(strdup)("syswrap.rfdowgn.2", pathname);
614   i->where = (tid == -1) ? NULL : VG_(record_ExeContext)(tid, 0/*first_ip_delta*/);
615}
616
617// Record opening of an fd, and find its name.
618void ML_(record_fd_open_named)(ThreadId tid, Int fd)
619{
620   const HChar* buf;
621   const HChar* name;
622   if (VG_(resolve_filename)(fd, &buf))
623      name = buf;
624   else
625      name = NULL;
626
627   ML_(record_fd_open_with_given_name)(tid, fd, name);
628}
629
630// Record opening of a nameless fd.
631void ML_(record_fd_open_nameless)(ThreadId tid, Int fd)
632{
633   ML_(record_fd_open_with_given_name)(tid, fd, NULL);
634}
635
636// Return if a given file descriptor is already recorded.
637Bool ML_(fd_recorded)(Int fd)
638{
639   OpenFd *i = allocated_fds;
640   while (i) {
641      if (i->fd == fd)
642         return True;
643      i = i->next;
644   }
645   return False;
646}
647
648/* Returned string must not be modified nor free'd. */
649const HChar *ML_(find_fd_recorded_by_fd)(Int fd)
650{
651   OpenFd *i = allocated_fds;
652
653   while (i) {
654      if (i->fd == fd)
655         return i->pathname;
656      i = i->next;
657   }
658
659   return NULL;
660}
661
662static
663HChar *unix_to_name(struct vki_sockaddr_un *sa, UInt len, HChar *name)
664{
665   if (sa == NULL || len == 0 || sa->sun_path[0] == '\0') {
666      VG_(sprintf)(name, "<unknown>");
667   } else {
668      VG_(sprintf)(name, "%s", sa->sun_path);
669   }
670
671   return name;
672}
673
674static
675HChar *inet_to_name(struct vki_sockaddr_in *sa, UInt len, HChar *name)
676{
677   if (sa == NULL || len == 0) {
678      VG_(sprintf)(name, "<unknown>");
679   } else if (sa->sin_port == 0) {
680      VG_(sprintf)(name, "<unbound>");
681   } else {
682      UInt addr = VG_(ntohl)(sa->sin_addr.s_addr);
683      VG_(sprintf)(name, "%u.%u.%u.%u:%u",
684                   (addr>>24) & 0xFF, (addr>>16) & 0xFF,
685                   (addr>>8) & 0xFF, addr & 0xFF,
686                   VG_(ntohs)(sa->sin_port));
687   }
688
689   return name;
690}
691
692static
693void inet6_format(HChar *s, const UChar ip[16])
694{
695   static const unsigned char V4mappedprefix[12] = {0,0,0,0,0,0,0,0,0,0,0xff,0xff};
696
697   if (!VG_(memcmp)(ip, V4mappedprefix, 12)) {
698      const struct vki_in_addr *sin_addr =
699          (const struct vki_in_addr *)(ip + 12);
700      UInt addr = VG_(ntohl)(sin_addr->s_addr);
701
702      VG_(sprintf)(s, "::ffff:%u.%u.%u.%u",
703                   (addr>>24) & 0xFF, (addr>>16) & 0xFF,
704                   (addr>>8) & 0xFF, addr & 0xFF);
705   } else {
706      Bool compressing = False;
707      Bool compressed = False;
708      Int len = 0;
709      Int i;
710
711      for (i = 0; i < 16; i += 2) {
712         UInt word = ((UInt)ip[i] << 8) | (UInt)ip[i+1];
713         if (word == 0 && !compressed) {
714            compressing = True;
715         } else {
716            if (compressing) {
717               compressing = False;
718               compressed = True;
719               s[len++] = ':';
720            }
721            if (i > 0) {
722               s[len++] = ':';
723            }
724            len += VG_(sprintf)(s + len, "%x", word);
725         }
726      }
727
728      if (compressing) {
729         s[len++] = ':';
730         s[len++] = ':';
731      }
732
733      s[len++] = 0;
734   }
735
736   return;
737}
738
739static
740HChar *inet6_to_name(struct vki_sockaddr_in6 *sa, UInt len, HChar *name)
741{
742   if (sa == NULL || len == 0) {
743      VG_(sprintf)(name, "<unknown>");
744   } else if (sa->sin6_port == 0) {
745      VG_(sprintf)(name, "<unbound>");
746   } else {
747      HChar addr[100];    // large enough
748      inet6_format(addr, (void *)&(sa->sin6_addr));
749      VG_(sprintf)(name, "[%s]:%u", addr, VG_(ntohs)(sa->sin6_port));
750   }
751
752   return name;
753}
754
755/*
756 * Try get some details about a socket.
757 */
758static void
759getsockdetails(Int fd)
760{
761   union u {
762      struct vki_sockaddr a;
763      struct vki_sockaddr_in in;
764      struct vki_sockaddr_in6 in6;
765      struct vki_sockaddr_un un;
766   } laddr;
767   Int llen;
768
769   llen = sizeof(laddr);
770   VG_(memset)(&laddr, 0, llen);
771
772   if(VG_(getsockname)(fd, (struct vki_sockaddr *)&(laddr.a), &llen) != -1) {
773      switch(laddr.a.sa_family) {
774      case VKI_AF_INET: {
775         HChar lname[32];   // large enough
776         HChar pname[32];   // large enough
777         struct vki_sockaddr_in paddr;
778         Int plen = sizeof(struct vki_sockaddr_in);
779
780         if (VG_(getpeername)(fd, (struct vki_sockaddr *)&paddr, &plen) != -1) {
781            VG_(message)(Vg_UserMsg, "Open AF_INET socket %d: %s <-> %s\n", fd,
782                         inet_to_name(&(laddr.in), llen, lname),
783                         inet_to_name(&paddr, plen, pname));
784         } else {
785            VG_(message)(Vg_UserMsg, "Open AF_INET socket %d: %s <-> unbound\n",
786                         fd, inet_to_name(&(laddr.in), llen, lname));
787         }
788         return;
789         }
790      case VKI_AF_INET6: {
791         HChar lname[128];  // large enough
792         HChar pname[128];  // large enough
793         struct vki_sockaddr_in6 paddr;
794         Int plen = sizeof(struct vki_sockaddr_in6);
795
796         if (VG_(getpeername)(fd, (struct vki_sockaddr *)&paddr, &plen) != -1) {
797            VG_(message)(Vg_UserMsg, "Open AF_INET6 socket %d: %s <-> %s\n", fd,
798                         inet6_to_name(&(laddr.in6), llen, lname),
799                         inet6_to_name(&paddr, plen, pname));
800         } else {
801            VG_(message)(Vg_UserMsg, "Open AF_INET6 socket %d: %s <-> unbound\n",
802                         fd, inet6_to_name(&(laddr.in6), llen, lname));
803         }
804         return;
805         }
806      case VKI_AF_UNIX: {
807         static char lname[256];
808         VG_(message)(Vg_UserMsg, "Open AF_UNIX socket %d: %s\n", fd,
809                      unix_to_name(&(laddr.un), llen, lname));
810         return;
811         }
812      default:
813         VG_(message)(Vg_UserMsg, "Open pf-%d socket %d:\n",
814                      laddr.a.sa_family, fd);
815         return;
816      }
817   }
818
819   VG_(message)(Vg_UserMsg, "Open socket %d:\n", fd);
820}
821
822
823/* Dump out a summary, and a more detailed list, of open file descriptors. */
824void VG_(show_open_fds) (const HChar* when)
825{
826   OpenFd *i = allocated_fds;
827
828   VG_(message)(Vg_UserMsg, "FILE DESCRIPTORS: %d open %s.\n", fd_count, when);
829
830   while (i) {
831      if (i->pathname) {
832         VG_(message)(Vg_UserMsg, "Open file descriptor %d: %s\n", i->fd,
833                      i->pathname);
834      } else {
835         Int val;
836         Int len = sizeof(val);
837
838         if (VG_(getsockopt)(i->fd, VKI_SOL_SOCKET, VKI_SO_TYPE, &val, &len)
839             == -1) {
840            VG_(message)(Vg_UserMsg, "Open file descriptor %d:\n", i->fd);
841         } else {
842            getsockdetails(i->fd);
843         }
844      }
845
846      if(i->where) {
847         VG_(pp_ExeContext)(i->where);
848         VG_(message)(Vg_UserMsg, "\n");
849      } else {
850         VG_(message)(Vg_UserMsg, "   <inherited from parent>\n");
851         VG_(message)(Vg_UserMsg, "\n");
852      }
853
854      i = i->next;
855   }
856
857   VG_(message)(Vg_UserMsg, "\n");
858}
859
860/* If /proc/self/fd doesn't exist (e.g. you've got a Linux kernel that doesn't
861   have /proc support compiled in, or a non-Linux kernel), then we need to
862   find out what file descriptors we inherited from our parent process the
863   hard way - by checking each fd in turn. */
864static
865void init_preopened_fds_without_proc_self_fd(void)
866{
867   struct vki_rlimit lim;
868   UInt count;
869   Int i;
870
871   if (VG_(getrlimit) (VKI_RLIMIT_NOFILE, &lim) == -1) {
872      /* Hmm.  getrlimit() failed.  Now we're screwed, so just choose
873         an arbitrarily high number.  1024 happens to be the limit in
874         the 2.4 Linux kernels. */
875      count = 1024;
876   } else {
877      count = lim.rlim_cur;
878   }
879
880   for (i = 0; i < count; i++)
881      if (VG_(fcntl)(i, VKI_F_GETFL, 0) != -1)
882         ML_(record_fd_open_named)(-1, i);
883}
884
885/* Initialize the list of open file descriptors with the file descriptors
886   we inherited from out parent process. */
887
888void VG_(init_preopened_fds)(void)
889{
890// DDD: should probably use HAVE_PROC here or similar, instead.
891#if defined(VGO_linux)
892   Int ret;
893   struct vki_dirent64 d;
894   SysRes f;
895
896   f = VG_(open)("/proc/self/fd", VKI_O_RDONLY, 0);
897   if (sr_isError(f)) {
898      init_preopened_fds_without_proc_self_fd();
899      return;
900   }
901
902   while ((ret = VG_(getdents64)(sr_Res(f), &d, sizeof(d))) != 0) {
903      if (ret == -1)
904         goto out;
905
906      if (VG_(strcmp)(d.d_name, ".") && VG_(strcmp)(d.d_name, "..")) {
907         HChar* s;
908         Int fno = VG_(strtoll10)(d.d_name, &s);
909         if (*s == '\0') {
910            if (fno != sr_Res(f))
911               if (VG_(clo_track_fds))
912                  ML_(record_fd_open_named)(-1, fno);
913         } else {
914            VG_(message)(Vg_DebugMsg,
915               "Warning: invalid file name in /proc/self/fd: %s\n",
916               d.d_name);
917         }
918      }
919
920      VG_(lseek)(sr_Res(f), d.d_off, VKI_SEEK_SET);
921   }
922
923  out:
924   VG_(close)(sr_Res(f));
925
926#elif defined(VGO_darwin)
927   init_preopened_fds_without_proc_self_fd();
928
929#elif defined(VGO_solaris)
930   Int ret;
931   Char buf[VKI_MAXGETDENTS_SIZE];
932   SysRes f;
933
934   f = VG_(open)("/proc/self/fd", VKI_O_RDONLY, 0);
935   if (sr_isError(f)) {
936      init_preopened_fds_without_proc_self_fd();
937      return;
938   }
939
940   while ((ret = VG_(getdents64)(sr_Res(f), (struct vki_dirent64 *) buf,
941                                 sizeof(buf))) > 0) {
942      Int i = 0;
943      while (i < ret) {
944         /* Proceed one entry. */
945         struct vki_dirent64 *d = (struct vki_dirent64 *) (buf + i);
946         if (VG_(strcmp)(d->d_name, ".") && VG_(strcmp)(d->d_name, "..")) {
947            HChar *s;
948            Int fno = VG_(strtoll10)(d->d_name, &s);
949            if (*s == '\0') {
950               if (fno != sr_Res(f))
951                  if (VG_(clo_track_fds))
952                     ML_(record_fd_open_named)(-1, fno);
953            } else {
954               VG_(message)(Vg_DebugMsg,
955                     "Warning: invalid file name in /proc/self/fd: %s\n",
956                     d->d_name);
957            }
958         }
959
960         /* Move on the next entry. */
961         i += d->d_reclen;
962      }
963   }
964
965   VG_(close)(sr_Res(f));
966
967#else
968#  error Unknown OS
969#endif
970}
971
972static
973HChar *strdupcat ( const HChar* cc, const HChar *s1, const HChar *s2,
974                   ArenaId aid )
975{
976   UInt len = VG_(strlen) ( s1 ) + VG_(strlen) ( s2 ) + 1;
977   HChar *result = VG_(arena_malloc) ( aid, cc, len );
978   VG_(strcpy) ( result, s1 );
979   VG_(strcat) ( result, s2 );
980   return result;
981}
982
983static
984void pre_mem_read_sendmsg ( ThreadId tid, Bool read,
985                            const HChar *msg, Addr base, SizeT size )
986{
987   HChar *outmsg = strdupcat ( "di.syswrap.pmrs.1",
988                               "sendmsg", msg, VG_AR_CORE );
989   PRE_MEM_READ( outmsg, base, size );
990   VG_(free) ( outmsg );
991}
992
993static
994void pre_mem_write_recvmsg ( ThreadId tid, Bool read,
995                             const HChar *msg, Addr base, SizeT size )
996{
997   HChar *outmsg = strdupcat ( "di.syswrap.pmwr.1",
998                               "recvmsg", msg, VG_AR_CORE );
999   if ( read )
1000      PRE_MEM_READ( outmsg, base, size );
1001   else
1002      PRE_MEM_WRITE( outmsg, base, size );
1003   VG_(free) ( outmsg );
1004}
1005
1006static
1007void post_mem_write_recvmsg ( ThreadId tid, Bool read,
1008                              const HChar *fieldName, Addr base, SizeT size )
1009{
1010   if ( !read )
1011      POST_MEM_WRITE( base, size );
1012}
1013
1014static
1015void msghdr_foreachfield (
1016        ThreadId tid,
1017        const HChar *name,
1018        struct vki_msghdr *msg,
1019        UInt length,
1020        void (*foreach_func)( ThreadId, Bool, const HChar *, Addr, SizeT ),
1021        Bool rekv /* "recv" apparently shadows some header decl on OSX108 */
1022     )
1023{
1024   HChar *fieldName;
1025
1026   if ( !msg )
1027      return;
1028
1029   fieldName = VG_(malloc) ( "di.syswrap.mfef", VG_(strlen)(name) + 32 );
1030
1031   VG_(sprintf) ( fieldName, "(%s)", name );
1032
1033   foreach_func ( tid, True, fieldName, (Addr)&msg->msg_name, sizeof( msg->msg_name ) );
1034   foreach_func ( tid, True, fieldName, (Addr)&msg->msg_namelen, sizeof( msg->msg_namelen ) );
1035   foreach_func ( tid, True, fieldName, (Addr)&msg->msg_iov, sizeof( msg->msg_iov ) );
1036   foreach_func ( tid, True, fieldName, (Addr)&msg->msg_iovlen, sizeof( msg->msg_iovlen ) );
1037   foreach_func ( tid, True, fieldName, (Addr)&msg->msg_control, sizeof( msg->msg_control ) );
1038   foreach_func ( tid, True, fieldName, (Addr)&msg->msg_controllen, sizeof( msg->msg_controllen ) );
1039
1040   /* msg_flags is completely ignored for send_mesg, recv_mesg doesn't read
1041      the field, but does write to it. */
1042   if ( rekv )
1043      foreach_func ( tid, False, fieldName, (Addr)&msg->msg_flags, sizeof( msg->msg_flags ) );
1044
1045   if ( ML_(safe_to_deref)(&msg->msg_name, sizeof (void *))
1046        && msg->msg_name ) {
1047      VG_(sprintf) ( fieldName, "(%s.msg_name)", name );
1048      foreach_func ( tid, False, fieldName,
1049                     (Addr)msg->msg_name, msg->msg_namelen );
1050   }
1051
1052   if ( ML_(safe_to_deref)(&msg->msg_iov, sizeof (void *))
1053        && msg->msg_iov ) {
1054      struct vki_iovec *iov = msg->msg_iov;
1055      UInt i;
1056
1057      VG_(sprintf) ( fieldName, "(%s.msg_iov)", name );
1058
1059      foreach_func ( tid, True, fieldName,
1060                     (Addr)iov, msg->msg_iovlen * sizeof( struct vki_iovec ) );
1061
1062      for ( i = 0; i < msg->msg_iovlen; ++i, ++iov ) {
1063         UInt iov_len = iov->iov_len <= length ? iov->iov_len : length;
1064         VG_(sprintf) ( fieldName, "(%s.msg_iov[%u])", name, i );
1065         foreach_func ( tid, False, fieldName,
1066                        (Addr)iov->iov_base, iov_len );
1067         length = length - iov_len;
1068      }
1069   }
1070
1071   if ( ML_(safe_to_deref) (&msg->msg_control, sizeof (void *))
1072        && msg->msg_control )
1073   {
1074      VG_(sprintf) ( fieldName, "(%s.msg_control)", name );
1075      foreach_func ( tid, False, fieldName,
1076                     (Addr)msg->msg_control, msg->msg_controllen );
1077   }
1078
1079   VG_(free) ( fieldName );
1080}
1081
1082static void check_cmsg_for_fds(ThreadId tid, struct vki_msghdr *msg)
1083{
1084   struct vki_cmsghdr *cm = VKI_CMSG_FIRSTHDR(msg);
1085
1086   while (cm) {
1087      if (cm->cmsg_level == VKI_SOL_SOCKET &&
1088          cm->cmsg_type == VKI_SCM_RIGHTS ) {
1089         Int *fds = (Int *) VKI_CMSG_DATA(cm);
1090         Int fdc = (cm->cmsg_len - VKI_CMSG_ALIGN(sizeof(struct vki_cmsghdr)))
1091                         / sizeof(int);
1092         Int i;
1093
1094         for (i = 0; i < fdc; i++)
1095            if(VG_(clo_track_fds))
1096               // XXX: must we check the range on these fds with
1097               //      ML_(fd_allowed)()?
1098               ML_(record_fd_open_named)(tid, fds[i]);
1099      }
1100
1101      cm = VKI_CMSG_NXTHDR(msg, cm);
1102   }
1103}
1104
1105/* GrP kernel ignores sa_len (at least on Darwin); this checks the rest */
1106static
1107void pre_mem_read_sockaddr ( ThreadId tid,
1108                             const HChar *description,
1109                             struct vki_sockaddr *sa, UInt salen )
1110{
1111   HChar *outmsg;
1112   struct vki_sockaddr_un*  saun = (struct vki_sockaddr_un *)sa;
1113   struct vki_sockaddr_in*  sin  = (struct vki_sockaddr_in *)sa;
1114   struct vki_sockaddr_in6* sin6 = (struct vki_sockaddr_in6 *)sa;
1115#  ifdef VKI_AF_BLUETOOTH
1116   struct vki_sockaddr_rc*  rc   = (struct vki_sockaddr_rc *)sa;
1117#  endif
1118#  ifdef VKI_AF_NETLINK
1119   struct vki_sockaddr_nl*  nl   = (struct vki_sockaddr_nl *)sa;
1120#  endif
1121
1122   /* NULL/zero-length sockaddrs are legal */
1123   if ( sa == NULL || salen == 0 ) return;
1124
1125   outmsg = VG_(malloc) ( "di.syswrap.pmr_sockaddr.1",
1126                          VG_(strlen)( description ) + 30 );
1127
1128   VG_(sprintf) ( outmsg, description, "sa_family" );
1129   PRE_MEM_READ( outmsg, (Addr) &sa->sa_family, sizeof(vki_sa_family_t));
1130
1131   switch (sa->sa_family) {
1132
1133      case VKI_AF_UNIX:
1134         VG_(sprintf) ( outmsg, description, "sun_path" );
1135         PRE_MEM_RASCIIZ( outmsg, (Addr) saun->sun_path );
1136         // GrP fixme max of sun_len-2? what about nul char?
1137         break;
1138
1139      case VKI_AF_INET:
1140         VG_(sprintf) ( outmsg, description, "sin_port" );
1141         PRE_MEM_READ( outmsg, (Addr) &sin->sin_port, sizeof (sin->sin_port) );
1142         VG_(sprintf) ( outmsg, description, "sin_addr" );
1143         PRE_MEM_READ( outmsg, (Addr) &sin->sin_addr, sizeof (sin->sin_addr) );
1144         break;
1145
1146      case VKI_AF_INET6:
1147         VG_(sprintf) ( outmsg, description, "sin6_port" );
1148         PRE_MEM_READ( outmsg,
1149            (Addr) &sin6->sin6_port, sizeof (sin6->sin6_port) );
1150         VG_(sprintf) ( outmsg, description, "sin6_flowinfo" );
1151         PRE_MEM_READ( outmsg,
1152            (Addr) &sin6->sin6_flowinfo, sizeof (sin6->sin6_flowinfo) );
1153         VG_(sprintf) ( outmsg, description, "sin6_addr" );
1154         PRE_MEM_READ( outmsg,
1155            (Addr) &sin6->sin6_addr, sizeof (sin6->sin6_addr) );
1156         VG_(sprintf) ( outmsg, description, "sin6_scope_id" );
1157         PRE_MEM_READ( outmsg,
1158            (Addr) &sin6->sin6_scope_id, sizeof (sin6->sin6_scope_id) );
1159         break;
1160
1161#     ifdef VKI_AF_BLUETOOTH
1162      case VKI_AF_BLUETOOTH:
1163         VG_(sprintf) ( outmsg, description, "rc_bdaddr" );
1164         PRE_MEM_READ( outmsg, (Addr) &rc->rc_bdaddr, sizeof (rc->rc_bdaddr) );
1165         VG_(sprintf) ( outmsg, description, "rc_channel" );
1166         PRE_MEM_READ( outmsg, (Addr) &rc->rc_channel, sizeof (rc->rc_channel) );
1167         break;
1168#     endif
1169
1170#     ifdef VKI_AF_NETLINK
1171      case VKI_AF_NETLINK:
1172         VG_(sprintf)(outmsg, description, "nl_pid");
1173         PRE_MEM_READ(outmsg, (Addr)&nl->nl_pid, sizeof(nl->nl_pid));
1174         VG_(sprintf)(outmsg, description, "nl_groups");
1175         PRE_MEM_READ(outmsg, (Addr)&nl->nl_groups, sizeof(nl->nl_groups));
1176         break;
1177#     endif
1178
1179#     ifdef VKI_AF_UNSPEC
1180      case VKI_AF_UNSPEC:
1181         break;
1182#     endif
1183
1184      default:
1185         /* No specific information about this address family.
1186            Let's just check the full data following the family.
1187            Note that this can give false positive if this (unknown)
1188            struct sockaddr_???? has padding bytes between its elements. */
1189         VG_(sprintf) ( outmsg, description, "sa_data" );
1190         PRE_MEM_READ( outmsg, (Addr)&sa->sa_family + sizeof(sa->sa_family),
1191                       salen -  sizeof(sa->sa_family));
1192         break;
1193   }
1194
1195   VG_(free) ( outmsg );
1196}
1197
1198/* Dereference a pointer to a UInt. */
1199static UInt deref_UInt ( ThreadId tid, Addr a, const HChar* s )
1200{
1201   UInt* a_p = (UInt*)a;
1202   PRE_MEM_READ( s, (Addr)a_p, sizeof(UInt) );
1203   if (a_p == NULL)
1204      return 0;
1205   else
1206      return *a_p;
1207}
1208
1209void ML_(buf_and_len_pre_check) ( ThreadId tid, Addr buf_p, Addr buflen_p,
1210                                  const HChar* buf_s, const HChar* buflen_s )
1211{
1212   if (VG_(tdict).track_pre_mem_write) {
1213      UInt buflen_in = deref_UInt( tid, buflen_p, buflen_s);
1214      if (buflen_in > 0) {
1215         VG_(tdict).track_pre_mem_write(
1216            Vg_CoreSysCall, tid, buf_s, buf_p, buflen_in );
1217      }
1218   }
1219}
1220
1221void ML_(buf_and_len_post_check) ( ThreadId tid, SysRes res,
1222                                   Addr buf_p, Addr buflen_p, const HChar* s )
1223{
1224   if (!sr_isError(res) && VG_(tdict).track_post_mem_write) {
1225      UInt buflen_out = deref_UInt( tid, buflen_p, s);
1226      if (buflen_out > 0 && buf_p != (Addr)NULL) {
1227         VG_(tdict).track_post_mem_write( Vg_CoreSysCall, tid, buf_p, buflen_out );
1228      }
1229   }
1230}
1231
1232/* ---------------------------------------------------------------------
1233   Data seg end, for brk()
1234   ------------------------------------------------------------------ */
1235
1236/*   +--------+------------+
1237     | anon   |    resvn   |
1238     +--------+------------+
1239
1240     ^     ^  ^
1241     |     |  boundary is page aligned
1242     |     VG_(brk_limit) -- no alignment constraint
1243     VG_(brk_base) -- page aligned -- does not move
1244
1245     Both the anon part and the reservation part are always at least
1246     one page.
1247*/
1248
1249/* Set the new data segment end to NEWBRK.  If this succeeds, return
1250   NEWBRK, else return the current data segment end. */
1251
1252static Addr do_brk ( Addr newbrk, ThreadId tid )
1253{
1254   NSegment const* aseg;
1255   Addr newbrkP;
1256   SizeT delta;
1257   Bool debug = False;
1258
1259   if (debug)
1260      VG_(printf)("\ndo_brk: brk_base=%#lx brk_limit=%#lx newbrk=%#lx\n",
1261		  VG_(brk_base), VG_(brk_limit), newbrk);
1262
1263   if (0) VG_(am_show_nsegments)(0, "in_brk");
1264
1265   if (newbrk < VG_(brk_base))
1266      /* Clearly impossible. */
1267      goto bad;
1268
1269   if (newbrk < VG_(brk_limit)) {
1270      /* shrinking the data segment.  Be lazy and don't munmap the
1271         excess area. */
1272      NSegment const * seg = VG_(am_find_nsegment)(newbrk);
1273      vg_assert(seg);
1274
1275      if (seg->hasT)
1276         VG_(discard_translations)( newbrk, VG_(brk_limit) - newbrk,
1277                                    "do_brk(shrink)" );
1278      /* Since we're being lazy and not unmapping pages, we have to
1279         zero out the area, so that if the area later comes back into
1280         circulation, it will be filled with zeroes, as if it really
1281         had been unmapped and later remapped.  Be a bit paranoid and
1282         try hard to ensure we're not going to segfault by doing the
1283         write - check both ends of the range are in the same segment
1284         and that segment is writable. */
1285      NSegment const * seg2;
1286
1287      seg2 = VG_(am_find_nsegment)( VG_(brk_limit) - 1 );
1288      vg_assert(seg2);
1289
1290      if (seg == seg2 && seg->hasW)
1291         VG_(memset)( (void*)newbrk, 0, VG_(brk_limit) - newbrk );
1292
1293      VG_(brk_limit) = newbrk;
1294      return newbrk;
1295   }
1296
1297   /* otherwise we're expanding the brk segment. */
1298   if (VG_(brk_limit) > VG_(brk_base))
1299      aseg = VG_(am_find_nsegment)( VG_(brk_limit)-1 );
1300   else
1301      aseg = VG_(am_find_nsegment)( VG_(brk_limit) );
1302
1303   /* These should be assured by setup_client_dataseg in m_main. */
1304   vg_assert(aseg);
1305   vg_assert(aseg->kind == SkAnonC);
1306
1307   if (newbrk <= aseg->end + 1) {
1308      /* still fits within the anon segment. */
1309      VG_(brk_limit) = newbrk;
1310      return newbrk;
1311   }
1312
1313   newbrkP = VG_PGROUNDUP(newbrk);
1314   delta = newbrkP - (aseg->end + 1);
1315   vg_assert(delta > 0);
1316   vg_assert(VG_IS_PAGE_ALIGNED(delta));
1317
1318   Bool overflow;
1319   if (! VG_(am_extend_into_adjacent_reservation_client)( aseg->start, delta,
1320                                                          &overflow)) {
1321      if (overflow)
1322         VG_(umsg)("brk segment overflow in thread #%u: can't grow to %#lx\n",
1323                   tid, newbrkP);
1324      else
1325         VG_(umsg)("Cannot map memory to grow brk segment in thread #%u "
1326                   "to %#lx\n", tid, newbrkP);
1327      goto bad;
1328   }
1329
1330   VG_(brk_limit) = newbrk;
1331   return newbrk;
1332
1333  bad:
1334   return VG_(brk_limit);
1335}
1336
1337
1338/* ---------------------------------------------------------------------
1339   Vet file descriptors for sanity
1340   ------------------------------------------------------------------ */
1341/*
1342> - what does the "Bool soft" parameter mean?
1343
1344(Tom Hughes, 3 Oct 05):
1345
1346Whether or not to consider a file descriptor invalid if it is above
1347the current soft limit.
1348
1349Basically if we are testing whether a newly created file descriptor is
1350valid (in a post handler) then we set soft to true, and if we are
1351testing whether a file descriptor that is about to be used (in a pre
1352handler) is valid [viz, an already-existing fd] then we set it to false.
1353
1354The point is that if the (virtual) soft limit is lowered then any
1355existing descriptors can still be read/written/closed etc (so long as
1356they are below the valgrind reserved descriptors) but no new
1357descriptors can be created above the new soft limit.
1358
1359(jrs 4 Oct 05: in which case, I've renamed it "isNewFd")
1360*/
1361
1362/* Return true if we're allowed to use or create this fd */
1363Bool ML_(fd_allowed)(Int fd, const HChar *syscallname, ThreadId tid,
1364                     Bool isNewFd)
1365{
1366   Bool allowed = True;
1367
1368   /* hard limits always apply */
1369   if (fd < 0 || fd >= VG_(fd_hard_limit))
1370      allowed = False;
1371
1372   /* hijacking the output fds is never allowed */
1373   if (fd == VG_(log_output_sink).fd || fd == VG_(xml_output_sink).fd)
1374      allowed = False;
1375
1376   /* if creating a new fd (rather than using an existing one), the
1377      soft limit must also be observed */
1378   if (isNewFd && fd >= VG_(fd_soft_limit))
1379      allowed = False;
1380
1381   /* this looks like it ought to be included, but causes problems: */
1382   /*
1383   if (fd == 2 && VG_(debugLog_getLevel)() > 0)
1384      allowed = False;
1385   */
1386   /* The difficulty is as follows: consider a program P which expects
1387      to be able to mess with (redirect) its own stderr (fd 2).
1388      Usually to deal with P we would issue command line flags to send
1389      logging somewhere other than stderr, so as not to disrupt P.
1390      The problem is that -d unilaterally hijacks stderr with no
1391      consultation with P.  And so, if this check is enabled, P will
1392      work OK normally but fail if -d is issued.
1393
1394      Basically -d is a hack and you take your chances when using it.
1395      It's very useful for low level debugging -- particularly at
1396      startup -- and having its presence change the behaviour of the
1397      client is exactly what we don't want.  */
1398
1399   /* croak? */
1400   if ((!allowed) && VG_(showing_core_errors)() ) {
1401      VG_(message)(Vg_UserMsg,
1402         "Warning: invalid file descriptor %d in syscall %s()\n",
1403         fd, syscallname);
1404      if (fd == VG_(log_output_sink).fd && VG_(log_output_sink).fd >= 0)
1405	 VG_(message)(Vg_UserMsg,
1406            "   Use --log-fd=<number> to select an alternative log fd.\n");
1407      if (fd == VG_(xml_output_sink).fd && VG_(xml_output_sink).fd >= 0)
1408	 VG_(message)(Vg_UserMsg,
1409            "   Use --xml-fd=<number> to select an alternative XML "
1410            "output fd.\n");
1411      // DDD: consider always printing this stack trace, it's useful.
1412      // Also consider also making this a proper core error, ie.
1413      // suppressible and all that.
1414      if (VG_(clo_verbosity) > 1) {
1415         VG_(get_and_pp_StackTrace)(tid, VG_(clo_backtrace_size));
1416      }
1417   }
1418
1419   return allowed;
1420}
1421
1422
1423/* ---------------------------------------------------------------------
1424   Deal with a bunch of socket-related syscalls
1425   ------------------------------------------------------------------ */
1426
1427/* ------ */
1428
1429void
1430ML_(generic_PRE_sys_socketpair) ( ThreadId tid,
1431                                  UWord arg0, UWord arg1,
1432                                  UWord arg2, UWord arg3 )
1433{
1434   /* int socketpair(int d, int type, int protocol, int sv[2]); */
1435   PRE_MEM_WRITE( "socketcall.socketpair(sv)",
1436                  arg3, 2*sizeof(int) );
1437}
1438
1439SysRes
1440ML_(generic_POST_sys_socketpair) ( ThreadId tid,
1441                                   SysRes res,
1442                                   UWord arg0, UWord arg1,
1443                                   UWord arg2, UWord arg3 )
1444{
1445   SysRes r = res;
1446   Int fd1 = ((Int*)arg3)[0];
1447   Int fd2 = ((Int*)arg3)[1];
1448   vg_assert(!sr_isError(res)); /* guaranteed by caller */
1449   POST_MEM_WRITE( arg3, 2*sizeof(int) );
1450   if (!ML_(fd_allowed)(fd1, "socketcall.socketpair", tid, True) ||
1451       !ML_(fd_allowed)(fd2, "socketcall.socketpair", tid, True)) {
1452      VG_(close)(fd1);
1453      VG_(close)(fd2);
1454      r = VG_(mk_SysRes_Error)( VKI_EMFILE );
1455   } else {
1456      POST_MEM_WRITE( arg3, 2*sizeof(int) );
1457      if (VG_(clo_track_fds)) {
1458         ML_(record_fd_open_nameless)(tid, fd1);
1459         ML_(record_fd_open_nameless)(tid, fd2);
1460      }
1461   }
1462   return r;
1463}
1464
1465/* ------ */
1466
1467SysRes
1468ML_(generic_POST_sys_socket) ( ThreadId tid, SysRes res )
1469{
1470   SysRes r = res;
1471   vg_assert(!sr_isError(res)); /* guaranteed by caller */
1472   if (!ML_(fd_allowed)(sr_Res(res), "socket", tid, True)) {
1473      VG_(close)(sr_Res(res));
1474      r = VG_(mk_SysRes_Error)( VKI_EMFILE );
1475   } else {
1476      if (VG_(clo_track_fds))
1477         ML_(record_fd_open_nameless)(tid, sr_Res(res));
1478   }
1479   return r;
1480}
1481
1482/* ------ */
1483
1484void
1485ML_(generic_PRE_sys_bind) ( ThreadId tid,
1486                            UWord arg0, UWord arg1, UWord arg2 )
1487{
1488   /* int bind(int sockfd, struct sockaddr *my_addr,
1489               int addrlen); */
1490   pre_mem_read_sockaddr(
1491      tid, "socketcall.bind(my_addr.%s)",
1492      (struct vki_sockaddr *) arg1, arg2
1493   );
1494}
1495
1496/* ------ */
1497
1498void
1499ML_(generic_PRE_sys_accept) ( ThreadId tid,
1500                              UWord arg0, UWord arg1, UWord arg2 )
1501{
1502   /* int accept(int s, struct sockaddr *addr, int *addrlen); */
1503   Addr addr_p     = arg1;
1504   Addr addrlen_p  = arg2;
1505   if (addr_p != (Addr)NULL)
1506      ML_(buf_and_len_pre_check) ( tid, addr_p, addrlen_p,
1507                                   "socketcall.accept(addr)",
1508                                   "socketcall.accept(addrlen_in)" );
1509}
1510
1511SysRes
1512ML_(generic_POST_sys_accept) ( ThreadId tid,
1513                               SysRes res,
1514                               UWord arg0, UWord arg1, UWord arg2 )
1515{
1516   SysRes r = res;
1517   vg_assert(!sr_isError(res)); /* guaranteed by caller */
1518   if (!ML_(fd_allowed)(sr_Res(res), "accept", tid, True)) {
1519      VG_(close)(sr_Res(res));
1520      r = VG_(mk_SysRes_Error)( VKI_EMFILE );
1521   } else {
1522      Addr addr_p     = arg1;
1523      Addr addrlen_p  = arg2;
1524      if (addr_p != (Addr)NULL)
1525         ML_(buf_and_len_post_check) ( tid, res, addr_p, addrlen_p,
1526                                       "socketcall.accept(addrlen_out)" );
1527      if (VG_(clo_track_fds))
1528          ML_(record_fd_open_nameless)(tid, sr_Res(res));
1529   }
1530   return r;
1531}
1532
1533/* ------ */
1534
1535void
1536ML_(generic_PRE_sys_sendto) ( ThreadId tid,
1537                              UWord arg0, UWord arg1, UWord arg2,
1538                              UWord arg3, UWord arg4, UWord arg5 )
1539{
1540   /* int sendto(int s, const void *msg, int len,
1541                 unsigned int flags,
1542                 const struct sockaddr *to, int tolen); */
1543   PRE_MEM_READ( "socketcall.sendto(msg)",
1544                 arg1, /* msg */
1545                 arg2  /* len */ );
1546   pre_mem_read_sockaddr(
1547      tid, "socketcall.sendto(to.%s)",
1548      (struct vki_sockaddr *) arg4, arg5
1549   );
1550}
1551
1552/* ------ */
1553
1554void
1555ML_(generic_PRE_sys_send) ( ThreadId tid,
1556                            UWord arg0, UWord arg1, UWord arg2 )
1557{
1558   /* int send(int s, const void *msg, size_t len, int flags); */
1559   PRE_MEM_READ( "socketcall.send(msg)",
1560                  arg1, /* msg */
1561                  arg2  /* len */ );
1562
1563}
1564
1565/* ------ */
1566
1567void
1568ML_(generic_PRE_sys_recvfrom) ( ThreadId tid,
1569                                UWord arg0, UWord arg1, UWord arg2,
1570                                UWord arg3, UWord arg4, UWord arg5 )
1571{
1572   /* int recvfrom(int s, void *buf, int len, unsigned int flags,
1573                   struct sockaddr *from, int *fromlen); */
1574   Addr buf_p      = arg1;
1575   Int  len        = arg2;
1576   Addr from_p     = arg4;
1577   Addr fromlen_p  = arg5;
1578   PRE_MEM_WRITE( "socketcall.recvfrom(buf)", buf_p, len );
1579   if (from_p != (Addr)NULL)
1580      ML_(buf_and_len_pre_check) ( tid, from_p, fromlen_p,
1581                                   "socketcall.recvfrom(from)",
1582                                   "socketcall.recvfrom(fromlen_in)" );
1583}
1584
1585void
1586ML_(generic_POST_sys_recvfrom) ( ThreadId tid,
1587                                 SysRes res,
1588                                 UWord arg0, UWord arg1, UWord arg2,
1589                                 UWord arg3, UWord arg4, UWord arg5 )
1590{
1591   Addr buf_p      = arg1;
1592   Int  len        = arg2;
1593   Addr from_p     = arg4;
1594   Addr fromlen_p  = arg5;
1595
1596   vg_assert(!sr_isError(res)); /* guaranteed by caller */
1597   if (from_p != (Addr)NULL)
1598      ML_(buf_and_len_post_check) ( tid, res, from_p, fromlen_p,
1599                                    "socketcall.recvfrom(fromlen_out)" );
1600   POST_MEM_WRITE( buf_p, len );
1601}
1602
1603/* ------ */
1604
1605void
1606ML_(generic_PRE_sys_recv) ( ThreadId tid,
1607                            UWord arg0, UWord arg1, UWord arg2 )
1608{
1609   /* int recv(int s, void *buf, int len, unsigned int flags); */
1610   /* man 2 recv says:
1611      The  recv call is normally used only on a connected socket
1612      (see connect(2)) and is identical to recvfrom with a  NULL
1613      from parameter.
1614   */
1615   PRE_MEM_WRITE( "socketcall.recv(buf)",
1616                  arg1, /* buf */
1617                  arg2  /* len */ );
1618}
1619
1620void
1621ML_(generic_POST_sys_recv) ( ThreadId tid,
1622                             UWord res,
1623                             UWord arg0, UWord arg1, UWord arg2 )
1624{
1625   if (res >= 0 && arg1 != 0) {
1626      POST_MEM_WRITE( arg1, /* buf */
1627                      arg2  /* len */ );
1628   }
1629}
1630
1631/* ------ */
1632
1633void
1634ML_(generic_PRE_sys_connect) ( ThreadId tid,
1635                               UWord arg0, UWord arg1, UWord arg2 )
1636{
1637   /* int connect(int sockfd,
1638                  struct sockaddr *serv_addr, int addrlen ); */
1639   pre_mem_read_sockaddr( tid,
1640                          "socketcall.connect(serv_addr.%s)",
1641                          (struct vki_sockaddr *) arg1, arg2);
1642}
1643
1644/* ------ */
1645
1646void
1647ML_(generic_PRE_sys_setsockopt) ( ThreadId tid,
1648                                  UWord arg0, UWord arg1, UWord arg2,
1649                                  UWord arg3, UWord arg4 )
1650{
1651   /* int setsockopt(int s, int level, int optname,
1652                     const void *optval, int optlen); */
1653   PRE_MEM_READ( "socketcall.setsockopt(optval)",
1654                 arg3, /* optval */
1655                 arg4  /* optlen */ );
1656}
1657
1658/* ------ */
1659
1660void
1661ML_(generic_PRE_sys_getsockname) ( ThreadId tid,
1662                                   UWord arg0, UWord arg1, UWord arg2 )
1663{
1664   /* int getsockname(int s, struct sockaddr* name, int* namelen) */
1665   Addr name_p     = arg1;
1666   Addr namelen_p  = arg2;
1667   /* Nb: name_p cannot be NULL */
1668   ML_(buf_and_len_pre_check) ( tid, name_p, namelen_p,
1669                                "socketcall.getsockname(name)",
1670                                "socketcall.getsockname(namelen_in)" );
1671}
1672
1673void
1674ML_(generic_POST_sys_getsockname) ( ThreadId tid,
1675                                    SysRes res,
1676                                    UWord arg0, UWord arg1, UWord arg2 )
1677{
1678   Addr name_p     = arg1;
1679   Addr namelen_p  = arg2;
1680   vg_assert(!sr_isError(res)); /* guaranteed by caller */
1681   ML_(buf_and_len_post_check) ( tid, res, name_p, namelen_p,
1682                                 "socketcall.getsockname(namelen_out)" );
1683}
1684
1685/* ------ */
1686
1687void
1688ML_(generic_PRE_sys_getpeername) ( ThreadId tid,
1689                                   UWord arg0, UWord arg1, UWord arg2 )
1690{
1691   /* int getpeername(int s, struct sockaddr* name, int* namelen) */
1692   Addr name_p     = arg1;
1693   Addr namelen_p  = arg2;
1694   /* Nb: name_p cannot be NULL */
1695   ML_(buf_and_len_pre_check) ( tid, name_p, namelen_p,
1696                                "socketcall.getpeername(name)",
1697                                "socketcall.getpeername(namelen_in)" );
1698}
1699
1700void
1701ML_(generic_POST_sys_getpeername) ( ThreadId tid,
1702                                    SysRes res,
1703                                    UWord arg0, UWord arg1, UWord arg2 )
1704{
1705   Addr name_p     = arg1;
1706   Addr namelen_p  = arg2;
1707   vg_assert(!sr_isError(res)); /* guaranteed by caller */
1708   ML_(buf_and_len_post_check) ( tid, res, name_p, namelen_p,
1709                                 "socketcall.getpeername(namelen_out)" );
1710}
1711
1712/* ------ */
1713
1714void
1715ML_(generic_PRE_sys_sendmsg) ( ThreadId tid, const HChar *name,
1716                               struct vki_msghdr *msg )
1717{
1718   msghdr_foreachfield ( tid, name, msg, ~0, pre_mem_read_sendmsg, False );
1719}
1720
1721/* ------ */
1722
1723void
1724ML_(generic_PRE_sys_recvmsg) ( ThreadId tid, const HChar *name,
1725                               struct vki_msghdr *msg )
1726{
1727   msghdr_foreachfield ( tid, name, msg, ~0, pre_mem_write_recvmsg, True );
1728}
1729
1730void
1731ML_(generic_POST_sys_recvmsg) ( ThreadId tid, const HChar *name,
1732                                struct vki_msghdr *msg, UInt length )
1733{
1734   msghdr_foreachfield( tid, name, msg, length, post_mem_write_recvmsg, True );
1735   check_cmsg_for_fds( tid, msg );
1736}
1737
1738
1739/* ---------------------------------------------------------------------
1740   Deal with a bunch of IPC related syscalls
1741   ------------------------------------------------------------------ */
1742
1743/* ------ */
1744
1745void
1746ML_(generic_PRE_sys_semop) ( ThreadId tid,
1747                             UWord arg0, UWord arg1, UWord arg2 )
1748{
1749   /* int semop(int semid, struct sembuf *sops, unsigned nsops); */
1750   PRE_MEM_READ( "semop(sops)", arg1, arg2 * sizeof(struct vki_sembuf) );
1751}
1752
1753/* ------ */
1754
1755void
1756ML_(generic_PRE_sys_semtimedop) ( ThreadId tid,
1757                                  UWord arg0, UWord arg1,
1758                                  UWord arg2, UWord arg3 )
1759{
1760   /* int semtimedop(int semid, struct sembuf *sops, unsigned nsops,
1761                     struct timespec *timeout); */
1762   PRE_MEM_READ( "semtimedop(sops)", arg1, arg2 * sizeof(struct vki_sembuf) );
1763   if (arg3 != 0)
1764      PRE_MEM_READ( "semtimedop(timeout)", arg3, sizeof(struct vki_timespec) );
1765}
1766
1767/* ------ */
1768
1769static
1770UInt get_sem_count( Int semid )
1771{
1772   struct vki_semid_ds buf;
1773   union vki_semun arg;
1774   SysRes res;
1775
1776   /* Doesn't actually seem to be necessary, but gcc-4.4.0 20081017
1777      (experimental) otherwise complains that the use in the return
1778      statement below is uninitialised. */
1779   buf.sem_nsems = 0;
1780
1781   arg.buf = &buf;
1782
1783#  if defined(__NR_semctl)
1784   res = VG_(do_syscall4)(__NR_semctl, semid, 0, VKI_IPC_STAT, *(UWord *)&arg);
1785#  elif defined(__NR_semsys) /* Solaris */
1786   res = VG_(do_syscall5)(__NR_semsys, VKI_SEMCTL, semid, 0, VKI_IPC_STAT,
1787                          *(UWord *)&arg);
1788#  else
1789   res = VG_(do_syscall5)(__NR_ipc, 3 /* IPCOP_semctl */, semid, 0,
1790                          VKI_IPC_STAT, (UWord)&arg);
1791#  endif
1792   if (sr_isError(res))
1793      return 0;
1794
1795   return buf.sem_nsems;
1796}
1797
1798void
1799ML_(generic_PRE_sys_semctl) ( ThreadId tid,
1800                              UWord arg0, UWord arg1,
1801                              UWord arg2, UWord arg3 )
1802{
1803   /* int semctl(int semid, int semnum, int cmd, ...); */
1804   union vki_semun arg = *(union vki_semun *)&arg3;
1805   UInt nsems;
1806   switch (arg2 /* cmd */) {
1807#if defined(VKI_IPC_INFO)
1808   case VKI_IPC_INFO:
1809   case VKI_SEM_INFO:
1810   case VKI_IPC_INFO|VKI_IPC_64:
1811   case VKI_SEM_INFO|VKI_IPC_64:
1812      PRE_MEM_WRITE( "semctl(IPC_INFO, arg.buf)",
1813                     (Addr)arg.buf, sizeof(struct vki_seminfo) );
1814      break;
1815#endif
1816
1817   case VKI_IPC_STAT:
1818#if defined(VKI_SEM_STAT)
1819   case VKI_SEM_STAT:
1820#endif
1821      PRE_MEM_WRITE( "semctl(IPC_STAT, arg.buf)",
1822                     (Addr)arg.buf, sizeof(struct vki_semid_ds) );
1823      break;
1824
1825#if defined(VKI_IPC_64)
1826   case VKI_IPC_STAT|VKI_IPC_64:
1827#if defined(VKI_SEM_STAT)
1828   case VKI_SEM_STAT|VKI_IPC_64:
1829#endif
1830#endif
1831#if defined(VKI_IPC_STAT64)
1832   case VKI_IPC_STAT64:
1833#endif
1834#if defined(VKI_IPC_64) || defined(VKI_IPC_STAT64)
1835      PRE_MEM_WRITE( "semctl(IPC_STAT, arg.buf)",
1836                     (Addr)arg.buf, sizeof(struct vki_semid64_ds) );
1837      break;
1838#endif
1839
1840   case VKI_IPC_SET:
1841      PRE_MEM_READ( "semctl(IPC_SET, arg.buf)",
1842                    (Addr)arg.buf, sizeof(struct vki_semid_ds) );
1843      break;
1844
1845#if defined(VKI_IPC_64)
1846   case VKI_IPC_SET|VKI_IPC_64:
1847#endif
1848#if defined(VKI_IPC_SET64)
1849   case VKI_IPC_SET64:
1850#endif
1851#if defined(VKI_IPC64) || defined(VKI_IPC_SET64)
1852      PRE_MEM_READ( "semctl(IPC_SET, arg.buf)",
1853                    (Addr)arg.buf, sizeof(struct vki_semid64_ds) );
1854      break;
1855#endif
1856
1857   case VKI_GETALL:
1858#if defined(VKI_IPC_64)
1859   case VKI_GETALL|VKI_IPC_64:
1860#endif
1861      nsems = get_sem_count( arg0 );
1862      PRE_MEM_WRITE( "semctl(IPC_GETALL, arg.array)",
1863                     (Addr)arg.array, sizeof(unsigned short) * nsems );
1864      break;
1865
1866   case VKI_SETALL:
1867#if defined(VKI_IPC_64)
1868   case VKI_SETALL|VKI_IPC_64:
1869#endif
1870      nsems = get_sem_count( arg0 );
1871      PRE_MEM_READ( "semctl(IPC_SETALL, arg.array)",
1872                    (Addr)arg.array, sizeof(unsigned short) * nsems );
1873      break;
1874   }
1875}
1876
1877void
1878ML_(generic_POST_sys_semctl) ( ThreadId tid,
1879                               UWord res,
1880                               UWord arg0, UWord arg1,
1881                               UWord arg2, UWord arg3 )
1882{
1883   union vki_semun arg = *(union vki_semun *)&arg3;
1884   UInt nsems;
1885   switch (arg2 /* cmd */) {
1886#if defined(VKI_IPC_INFO)
1887   case VKI_IPC_INFO:
1888   case VKI_SEM_INFO:
1889   case VKI_IPC_INFO|VKI_IPC_64:
1890   case VKI_SEM_INFO|VKI_IPC_64:
1891      POST_MEM_WRITE( (Addr)arg.buf, sizeof(struct vki_seminfo) );
1892      break;
1893#endif
1894
1895   case VKI_IPC_STAT:
1896#if defined(VKI_SEM_STAT)
1897   case VKI_SEM_STAT:
1898#endif
1899      POST_MEM_WRITE( (Addr)arg.buf, sizeof(struct vki_semid_ds) );
1900      break;
1901
1902#if defined(VKI_IPC_64)
1903   case VKI_IPC_STAT|VKI_IPC_64:
1904   case VKI_SEM_STAT|VKI_IPC_64:
1905#endif
1906#if defined(VKI_IPC_STAT64)
1907   case VKI_IPC_STAT64:
1908#endif
1909#if defined(VKI_IPC_64) || defined(VKI_IPC_STAT64)
1910      POST_MEM_WRITE( (Addr)arg.buf, sizeof(struct vki_semid64_ds) );
1911      break;
1912#endif
1913
1914   case VKI_GETALL:
1915#if defined(VKI_IPC_64)
1916   case VKI_GETALL|VKI_IPC_64:
1917#endif
1918      nsems = get_sem_count( arg0 );
1919      POST_MEM_WRITE( (Addr)arg.array, sizeof(unsigned short) * nsems );
1920      break;
1921   }
1922}
1923
1924/* ------ */
1925
1926/* ------ */
1927
1928static
1929SizeT get_shm_size ( Int shmid )
1930{
1931#if defined(__NR_shmctl)
1932#  ifdef VKI_IPC_64
1933   struct vki_shmid64_ds buf;
1934#    if defined(VGP_amd64_linux) || defined(VGP_arm64_linux)
1935     /* See bug 222545 comment 7 */
1936     SysRes __res = VG_(do_syscall3)(__NR_shmctl, shmid,
1937                                     VKI_IPC_STAT, (UWord)&buf);
1938#    else
1939     SysRes __res = VG_(do_syscall3)(__NR_shmctl, shmid,
1940                                     VKI_IPC_STAT|VKI_IPC_64, (UWord)&buf);
1941#    endif
1942#  else /* !def VKI_IPC_64 */
1943   struct vki_shmid_ds buf;
1944   SysRes __res = VG_(do_syscall3)(__NR_shmctl, shmid, VKI_IPC_STAT, (UWord)&buf);
1945#  endif /* def VKI_IPC_64 */
1946#elif defined(__NR_shmsys) /* Solaris */
1947   struct vki_shmid_ds buf;
1948   SysRes __res = VG_(do_syscall4)(__NR_shmsys, VKI_SHMCTL, shmid, VKI_IPC_STAT,
1949                         (UWord)&buf);
1950#else
1951   struct vki_shmid_ds buf;
1952   SysRes __res = VG_(do_syscall5)(__NR_ipc, 24 /* IPCOP_shmctl */, shmid,
1953                                 VKI_IPC_STAT, 0, (UWord)&buf);
1954#endif
1955   if (sr_isError(__res))
1956      return 0;
1957
1958   return (SizeT) buf.shm_segsz;
1959}
1960
1961UWord
1962ML_(generic_PRE_sys_shmat) ( ThreadId tid,
1963                             UWord arg0, UWord arg1, UWord arg2 )
1964{
1965   /* void *shmat(int shmid, const void *shmaddr, int shmflg); */
1966   SizeT  segmentSize = get_shm_size ( arg0 );
1967   UWord tmp;
1968   Bool  ok;
1969   if (arg1 == 0) {
1970      /* arm-linux only: work around the fact that
1971         VG_(am_get_advisory_client_simple) produces something that is
1972         VKI_PAGE_SIZE aligned, whereas what we want is something
1973         VKI_SHMLBA aligned, and VKI_SHMLBA >= VKI_PAGE_SIZE.  Hence
1974         increase the request size by VKI_SHMLBA - VKI_PAGE_SIZE and
1975         then round the result up to the next VKI_SHMLBA boundary.
1976         See bug 222545 comment 15.  So far, arm-linux is the only
1977         platform where this is known to be necessary. */
1978      vg_assert(VKI_SHMLBA >= VKI_PAGE_SIZE);
1979      if (VKI_SHMLBA > VKI_PAGE_SIZE) {
1980         segmentSize += VKI_SHMLBA - VKI_PAGE_SIZE;
1981      }
1982      tmp = VG_(am_get_advisory_client_simple)(0, segmentSize, &ok);
1983      if (ok) {
1984         if (VKI_SHMLBA > VKI_PAGE_SIZE) {
1985            arg1 = VG_ROUNDUP(tmp, VKI_SHMLBA);
1986         } else {
1987            arg1 = tmp;
1988         }
1989      }
1990   }
1991   else if (!ML_(valid_client_addr)(arg1, segmentSize, tid, "shmat"))
1992      arg1 = 0;
1993   return arg1;
1994}
1995
1996void
1997ML_(generic_POST_sys_shmat) ( ThreadId tid,
1998                              UWord res,
1999                              UWord arg0, UWord arg1, UWord arg2 )
2000{
2001   SizeT segmentSize = VG_PGROUNDUP(get_shm_size(arg0));
2002   if ( segmentSize > 0 ) {
2003      UInt prot = VKI_PROT_READ|VKI_PROT_WRITE;
2004      Bool d;
2005
2006      if (arg2 & VKI_SHM_RDONLY)
2007         prot &= ~VKI_PROT_WRITE;
2008      /* It isn't exactly correct to pass 0 for the fd and offset
2009         here.  The kernel seems to think the corresponding section
2010         does have dev/ino numbers:
2011
2012         04e52000-04ec8000 rw-s 00000000 00:06 1966090  /SYSV00000000 (deleted)
2013
2014         However there is no obvious way to find them.  In order to
2015         cope with the discrepancy, aspacem's sync checker omits the
2016         dev/ino correspondence check in cases where V does not know
2017         the dev/ino. */
2018      d = VG_(am_notify_client_shmat)( res, segmentSize, prot );
2019
2020      /* we don't distinguish whether it's read-only or
2021       * read-write -- it doesn't matter really. */
2022      VG_TRACK( new_mem_mmap, res, segmentSize, True, True, False,
2023                              0/*di_handle*/ );
2024      if (d)
2025         VG_(discard_translations)( (Addr)res,
2026                                    (ULong)VG_PGROUNDUP(segmentSize),
2027                                    "ML_(generic_POST_sys_shmat)" );
2028   }
2029}
2030
2031/* ------ */
2032
2033Bool
2034ML_(generic_PRE_sys_shmdt) ( ThreadId tid, UWord arg0 )
2035{
2036   /* int shmdt(const void *shmaddr); */
2037   return ML_(valid_client_addr)(arg0, 1, tid, "shmdt");
2038}
2039
2040void
2041ML_(generic_POST_sys_shmdt) ( ThreadId tid, UWord res, UWord arg0 )
2042{
2043   NSegment const* s = VG_(am_find_nsegment)(arg0);
2044
2045   if (s != NULL) {
2046      Addr  s_start = s->start;
2047      SizeT s_len   = s->end+1 - s->start;
2048      Bool  d;
2049
2050      vg_assert(s->kind == SkShmC);
2051      vg_assert(s->start == arg0);
2052
2053      d = VG_(am_notify_munmap)(s_start, s_len);
2054      s = NULL; /* s is now invalid */
2055      VG_TRACK( die_mem_munmap, s_start, s_len );
2056      if (d)
2057         VG_(discard_translations)( s_start,
2058                                    (ULong)s_len,
2059                                    "ML_(generic_POST_sys_shmdt)" );
2060   }
2061}
2062/* ------ */
2063
2064void
2065ML_(generic_PRE_sys_shmctl) ( ThreadId tid,
2066                              UWord arg0, UWord arg1, UWord arg2 )
2067{
2068   /* int shmctl(int shmid, int cmd, struct shmid_ds *buf); */
2069   switch (arg1 /* cmd */) {
2070#if defined(VKI_IPC_INFO)
2071   case VKI_IPC_INFO:
2072      PRE_MEM_WRITE( "shmctl(IPC_INFO, buf)",
2073                     arg2, sizeof(struct vki_shminfo) );
2074      break;
2075#if defined(VKI_IPC_64)
2076   case VKI_IPC_INFO|VKI_IPC_64:
2077      PRE_MEM_WRITE( "shmctl(IPC_INFO, buf)",
2078                     arg2, sizeof(struct vki_shminfo64) );
2079      break;
2080#endif
2081#endif
2082
2083#if defined(VKI_SHM_INFO)
2084   case VKI_SHM_INFO:
2085#if defined(VKI_IPC_64)
2086   case VKI_SHM_INFO|VKI_IPC_64:
2087#endif
2088      PRE_MEM_WRITE( "shmctl(SHM_INFO, buf)",
2089                     arg2, sizeof(struct vki_shm_info) );
2090      break;
2091#endif
2092
2093   case VKI_IPC_STAT:
2094#if defined(VKI_SHM_STAT)
2095   case VKI_SHM_STAT:
2096#endif
2097      PRE_MEM_WRITE( "shmctl(IPC_STAT, buf)",
2098                     arg2, sizeof(struct vki_shmid_ds) );
2099      break;
2100
2101#if defined(VKI_IPC_64)
2102   case VKI_IPC_STAT|VKI_IPC_64:
2103   case VKI_SHM_STAT|VKI_IPC_64:
2104      PRE_MEM_WRITE( "shmctl(IPC_STAT, arg.buf)",
2105                     arg2, sizeof(struct vki_shmid64_ds) );
2106      break;
2107#endif
2108
2109   case VKI_IPC_SET:
2110      PRE_MEM_READ( "shmctl(IPC_SET, arg.buf)",
2111                    arg2, sizeof(struct vki_shmid_ds) );
2112      break;
2113
2114#if defined(VKI_IPC_64)
2115   case VKI_IPC_SET|VKI_IPC_64:
2116      PRE_MEM_READ( "shmctl(IPC_SET, arg.buf)",
2117                    arg2, sizeof(struct vki_shmid64_ds) );
2118      break;
2119#endif
2120   }
2121}
2122
2123void
2124ML_(generic_POST_sys_shmctl) ( ThreadId tid,
2125                               UWord res,
2126                               UWord arg0, UWord arg1, UWord arg2 )
2127{
2128   switch (arg1 /* cmd */) {
2129#if defined(VKI_IPC_INFO)
2130   case VKI_IPC_INFO:
2131      POST_MEM_WRITE( arg2, sizeof(struct vki_shminfo) );
2132      break;
2133   case VKI_IPC_INFO|VKI_IPC_64:
2134      POST_MEM_WRITE( arg2, sizeof(struct vki_shminfo64) );
2135      break;
2136#endif
2137
2138#if defined(VKI_SHM_INFO)
2139   case VKI_SHM_INFO:
2140   case VKI_SHM_INFO|VKI_IPC_64:
2141      POST_MEM_WRITE( arg2, sizeof(struct vki_shm_info) );
2142      break;
2143#endif
2144
2145   case VKI_IPC_STAT:
2146#if defined(VKI_SHM_STAT)
2147   case VKI_SHM_STAT:
2148#endif
2149      POST_MEM_WRITE( arg2, sizeof(struct vki_shmid_ds) );
2150      break;
2151
2152#if defined(VKI_IPC_64)
2153   case VKI_IPC_STAT|VKI_IPC_64:
2154   case VKI_SHM_STAT|VKI_IPC_64:
2155      POST_MEM_WRITE( arg2, sizeof(struct vki_shmid64_ds) );
2156      break;
2157#endif
2158
2159
2160   }
2161}
2162
2163/* ---------------------------------------------------------------------
2164   Generic handler for mmap
2165   ------------------------------------------------------------------ */
2166
2167/*
2168 * Although mmap is specified by POSIX and the argument are generally
2169 * consistent across platforms the precise details of the low level
2170 * argument passing conventions differ. For example:
2171 *
2172 * - On x86-linux there is mmap (aka old_mmap) which takes the
2173 *   arguments in a memory block and the offset in bytes; and
2174 *   mmap2 (aka sys_mmap2) which takes the arguments in the normal
2175 *   way and the offset in pages.
2176 *
2177 * - On ppc32-linux there is mmap (aka sys_mmap) which takes the
2178 *   arguments in the normal way and the offset in bytes; and
2179 *   mmap2 (aka sys_mmap2) which takes the arguments in the normal
2180 *   way and the offset in pages.
2181 *
2182 * - On amd64-linux everything is simple and there is just the one
2183 *   call, mmap (aka sys_mmap)  which takes the arguments in the
2184 *   normal way and the offset in bytes.
2185 *
2186 * - On s390x-linux there is mmap (aka old_mmap) which takes the
2187 *   arguments in a memory block and the offset in bytes. mmap2
2188 *   is also available (but not exported via unistd.h) with
2189 *   arguments in a memory block and the offset in pages.
2190 *
2191 * To cope with all this we provide a generic handler function here
2192 * and then each platform implements one or more system call handlers
2193 * which call this generic routine after extracting and normalising
2194 * the arguments.
2195 */
2196
2197SysRes
2198ML_(generic_PRE_sys_mmap) ( ThreadId tid,
2199                            UWord arg1, UWord arg2, UWord arg3,
2200                            UWord arg4, UWord arg5, Off64T arg6 )
2201{
2202   Addr       advised;
2203   SysRes     sres;
2204   MapRequest mreq;
2205   Bool       mreq_ok;
2206
2207#  if defined(VGO_darwin)
2208   // Nb: we can't use this on Darwin, it has races:
2209   // * needs to RETRY if advisory succeeds but map fails
2210   //   (could have been some other thread in a nonblocking call)
2211   // * needs to not use fixed-position mmap() on Darwin
2212   //   (mmap will cheerfully smash whatever's already there, which might
2213   //   be a new mapping from some other thread in a nonblocking call)
2214   VG_(core_panic)("can't use ML_(generic_PRE_sys_mmap) on Darwin");
2215#  endif
2216
2217   if (arg2 == 0) {
2218      /* SuSV3 says: If len is zero, mmap() shall fail and no mapping
2219         shall be established. */
2220      return VG_(mk_SysRes_Error)( VKI_EINVAL );
2221   }
2222
2223   if (!VG_IS_PAGE_ALIGNED(arg1)) {
2224      /* zap any misaligned addresses. */
2225      /* SuSV3 says misaligned addresses only cause the MAP_FIXED case
2226         to fail.   Here, we catch them all. */
2227      return VG_(mk_SysRes_Error)( VKI_EINVAL );
2228   }
2229
2230   if (!VG_IS_PAGE_ALIGNED(arg6)) {
2231      /* zap any misaligned offsets. */
2232      /* SuSV3 says: The off argument is constrained to be aligned and
2233         sized according to the value returned by sysconf() when
2234         passed _SC_PAGESIZE or _SC_PAGE_SIZE. */
2235      return VG_(mk_SysRes_Error)( VKI_EINVAL );
2236   }
2237
2238   /* Figure out what kind of allocation constraints there are
2239      (fixed/hint/any), and ask aspacem what we should do. */
2240   mreq.start = arg1;
2241   mreq.len   = arg2;
2242   if (arg4 & VKI_MAP_FIXED) {
2243      mreq.rkind = MFixed;
2244   } else
2245#if defined(VKI_MAP_ALIGN) /* Solaris specific */
2246   if (arg4 & VKI_MAP_ALIGN) {
2247      mreq.rkind = MAlign;
2248      if (mreq.start == 0) {
2249         mreq.start = VKI_PAGE_SIZE;
2250      }
2251      /* VKI_MAP_FIXED and VKI_MAP_ALIGN don't like each other. */
2252      arg4 &= ~VKI_MAP_ALIGN;
2253   } else
2254#endif
2255   if (arg1 != 0) {
2256      mreq.rkind = MHint;
2257   } else {
2258      mreq.rkind = MAny;
2259   }
2260
2261   /* Enquire ... */
2262   advised = VG_(am_get_advisory)( &mreq, True/*client*/, &mreq_ok );
2263   if (!mreq_ok) {
2264      /* Our request was bounced, so we'd better fail. */
2265      return VG_(mk_SysRes_Error)( VKI_EINVAL );
2266   }
2267
2268#  if defined(VKI_MAP_32BIT)
2269   /* MAP_32BIT is royally unportable, so if the client asks for it, try our
2270      best to make it work (but without complexifying aspacemgr).
2271      If the user requested MAP_32BIT, the mmap-ed space must be in the
2272      first 2GB of the address space. So, return ENOMEM if aspacemgr
2273      advisory is above the first 2GB. If MAP_FIXED is also requested,
2274      MAP_32BIT has to be ignored.
2275      Assumption about aspacemgr behaviour: aspacemgr scans the address space
2276      from low addresses to find a free segment. No special effort is done
2277      to keep the first 2GB 'free' for this MAP_32BIT. So, this will often
2278      fail once the program has already allocated significant memory. */
2279   if ((arg4 & VKI_MAP_32BIT) && !(arg4 & VKI_MAP_FIXED)) {
2280      if (advised + arg2 >= 0x80000000)
2281         return VG_(mk_SysRes_Error)( VKI_ENOMEM );
2282   }
2283#  endif
2284
2285   /* Otherwise we're OK (so far).  Install aspacem's choice of
2286      address, and let the mmap go through.  */
2287   sres = VG_(am_do_mmap_NO_NOTIFY)(advised, arg2, arg3,
2288                                    arg4 | VKI_MAP_FIXED,
2289                                    arg5, arg6);
2290
2291#  if defined(VKI_MAP_32BIT)
2292   /* No recovery trial if the advisory was not accepted. */
2293   if ((arg4 & VKI_MAP_32BIT) && !(arg4 & VKI_MAP_FIXED)
2294       && sr_isError(sres)) {
2295      return VG_(mk_SysRes_Error)( VKI_ENOMEM );
2296   }
2297#  endif
2298
2299   /* A refinement: it may be that the kernel refused aspacem's choice
2300      of address.  If we were originally asked for a hinted mapping,
2301      there is still a last chance: try again at any address.
2302      Hence: */
2303   if (mreq.rkind == MHint && sr_isError(sres)) {
2304      mreq.start = 0;
2305      mreq.len   = arg2;
2306      mreq.rkind = MAny;
2307      advised = VG_(am_get_advisory)( &mreq, True/*client*/, &mreq_ok );
2308      if (!mreq_ok) {
2309         /* Our request was bounced, so we'd better fail. */
2310         return VG_(mk_SysRes_Error)( VKI_EINVAL );
2311      }
2312      /* and try again with the kernel */
2313      sres = VG_(am_do_mmap_NO_NOTIFY)(advised, arg2, arg3,
2314                                       arg4 | VKI_MAP_FIXED,
2315                                       arg5, arg6);
2316   }
2317
2318   /* Yet another refinement : sometimes valgrind chooses an address
2319      which is not acceptable by the kernel. This at least happens
2320      when mmap-ing huge pages, using the flag MAP_HUGETLB.
2321      valgrind aspacem does not know about huge pages, and modifying
2322      it to handle huge pages is not straightforward (e.g. need
2323      to understand special file system mount options).
2324      So, let's just redo an mmap, without giving any constraint to
2325      the kernel. If that succeeds, check with aspacem that the returned
2326      address is acceptable.
2327      This will give a similar effect as if the user would have
2328      hinted that address.
2329      The aspacem state will be correctly updated afterwards.
2330      We however cannot do this last refinement when the user asked
2331      for a fixed mapping, as the user asked a specific address. */
2332   if (sr_isError(sres) && !(arg4 & VKI_MAP_FIXED)) {
2333      advised = 0;
2334      /* try mmap with NULL address and without VKI_MAP_FIXED
2335         to let the kernel decide. */
2336      sres = VG_(am_do_mmap_NO_NOTIFY)(advised, arg2, arg3,
2337                                       arg4,
2338                                       arg5, arg6);
2339      if (!sr_isError(sres)) {
2340         /* The kernel is supposed to know what it is doing, but let's
2341            do a last sanity check anyway, as if the chosen address had
2342            been initially hinted by the client. The whole point of this
2343            last try was to allow mmap of huge pages to succeed without
2344            making aspacem understand them, on the other hand the kernel
2345            does not know about valgrind reservations, so this mapping
2346            can end up in free space and reservations. */
2347         mreq.start = (Addr)sr_Res(sres);
2348         mreq.len   = arg2;
2349         mreq.rkind = MHint;
2350         advised = VG_(am_get_advisory)( &mreq, True/*client*/, &mreq_ok );
2351         vg_assert(mreq_ok && advised == mreq.start);
2352      }
2353   }
2354
2355   if (!sr_isError(sres)) {
2356      ULong di_handle;
2357      /* Notify aspacem. */
2358      notify_core_of_mmap(
2359         (Addr)sr_Res(sres), /* addr kernel actually assigned */
2360         arg2, /* length */
2361         arg3, /* prot */
2362         arg4, /* the original flags value */
2363         arg5, /* fd */
2364         arg6  /* offset */
2365      );
2366      /* Load symbols? */
2367      di_handle = VG_(di_notify_mmap)( (Addr)sr_Res(sres),
2368                                       False/*allow_SkFileV*/, (Int)arg5 );
2369      /* Notify the tool. */
2370      notify_tool_of_mmap(
2371         (Addr)sr_Res(sres), /* addr kernel actually assigned */
2372         arg2, /* length */
2373         arg3, /* prot */
2374         di_handle /* so the tool can refer to the read debuginfo later,
2375                      if it wants. */
2376      );
2377   }
2378
2379   /* Stay sane */
2380   if (!sr_isError(sres) && (arg4 & VKI_MAP_FIXED))
2381      vg_assert(sr_Res(sres) == arg1);
2382
2383   return sres;
2384}
2385
2386
2387/* ---------------------------------------------------------------------
2388   The Main Entertainment ... syscall wrappers
2389   ------------------------------------------------------------------ */
2390
2391/* Note: the PRE() and POST() wrappers are for the actual functions
2392   implementing the system calls in the OS kernel.  These mostly have
2393   names like sys_write();  a few have names like old_mmap().  See the
2394   comment for ML_(syscall_table)[] for important info about the __NR_foo
2395   constants and their relationship to the sys_foo() functions.
2396
2397   Some notes about names used for syscalls and args:
2398   - For the --trace-syscalls=yes output, we use the sys_foo() name to avoid
2399     ambiguity.
2400
2401   - For error messages, we generally use a somewhat generic name
2402     for the syscall (eg. "write" rather than "sys_write").  This should be
2403     good enough for the average user to understand what is happening,
2404     without confusing them with names like "sys_write".
2405
2406   - Also, for error messages the arg names are mostly taken from the man
2407     pages (even though many of those man pages are really for glibc
2408     functions of the same name), rather than from the OS kernel source,
2409     for the same reason -- a user presented with a "bogus foo(bar)" arg
2410     will most likely look at the "foo" man page to see which is the "bar"
2411     arg.
2412
2413   Note that we use our own vki_* types.  The one exception is in
2414   PRE_REG_READn calls, where pointer types haven't been changed, because
2415   they don't need to be -- eg. for "foo*" to be used, the type foo need not
2416   be visible.
2417
2418   XXX: some of these are arch-specific, and should be factored out.
2419*/
2420
2421#define PRE(name)      DEFN_PRE_TEMPLATE(generic, name)
2422#define POST(name)     DEFN_POST_TEMPLATE(generic, name)
2423
2424// Macros to support 64-bit syscall args split into two 32 bit values
2425#if defined(VG_LITTLEENDIAN)
2426#define MERGE64(lo,hi)   ( ((ULong)(lo)) | (((ULong)(hi)) << 32) )
2427#define MERGE64_FIRST(name) name##_low
2428#define MERGE64_SECOND(name) name##_high
2429#elif defined(VG_BIGENDIAN)
2430#define MERGE64(hi,lo)   ( ((ULong)(lo)) | (((ULong)(hi)) << 32) )
2431#define MERGE64_FIRST(name) name##_high
2432#define MERGE64_SECOND(name) name##_low
2433#else
2434#error Unknown endianness
2435#endif
2436
2437PRE(sys_exit)
2438{
2439   ThreadState* tst;
2440   /* simple; just make this thread exit */
2441   PRINT("exit( %ld )", SARG1);
2442   PRE_REG_READ1(void, "exit", int, status);
2443   tst = VG_(get_ThreadState)(tid);
2444   /* Set the thread's status to be exiting, then claim that the
2445      syscall succeeded. */
2446   tst->exitreason = VgSrc_ExitThread;
2447   tst->os_state.exitcode = ARG1;
2448   SET_STATUS_Success(0);
2449}
2450
2451PRE(sys_ni_syscall)
2452{
2453   PRINT("unimplemented (by the kernel) syscall: %s! (ni_syscall)\n",
2454      VG_SYSNUM_STRING(SYSNO));
2455   PRE_REG_READ0(long, "ni_syscall");
2456   SET_STATUS_Failure( VKI_ENOSYS );
2457}
2458
2459PRE(sys_iopl)
2460{
2461   PRINT("sys_iopl ( %lu )", ARG1);
2462   PRE_REG_READ1(long, "iopl", unsigned long, level);
2463}
2464
2465PRE(sys_fsync)
2466{
2467   *flags |= SfMayBlock;
2468   PRINT("sys_fsync ( %lu )", ARG1);
2469   PRE_REG_READ1(long, "fsync", unsigned int, fd);
2470}
2471
2472PRE(sys_fdatasync)
2473{
2474   *flags |= SfMayBlock;
2475   PRINT("sys_fdatasync ( %lu )", ARG1);
2476   PRE_REG_READ1(long, "fdatasync", unsigned int, fd);
2477}
2478
2479PRE(sys_msync)
2480{
2481   *flags |= SfMayBlock;
2482   PRINT("sys_msync ( %#lx, %lu, %#lx )", ARG1, ARG2, ARG3);
2483   PRE_REG_READ3(long, "msync",
2484                 unsigned long, start, vki_size_t, length, int, flags);
2485   PRE_MEM_READ( "msync(start)", ARG1, ARG2 );
2486}
2487
2488// Nb: getpmsg() and putpmsg() are special additional syscalls used in early
2489// versions of LiS (Linux Streams).  They are not part of the kernel.
2490// Therefore, we have to provide this type ourself, rather than getting it
2491// from the kernel sources.
2492struct vki_pmsg_strbuf {
2493   int     maxlen;         /* no. of bytes in buffer */
2494   int     len;            /* no. of bytes returned */
2495   vki_caddr_t buf;        /* pointer to data */
2496};
2497PRE(sys_getpmsg)
2498{
2499   /* LiS getpmsg from http://www.gcom.com/home/linux/lis/ */
2500   struct vki_pmsg_strbuf *ctrl;
2501   struct vki_pmsg_strbuf *data;
2502   *flags |= SfMayBlock;
2503   PRINT("sys_getpmsg ( %ld, %#lx, %#lx, %#lx, %#lx )", SARG1, ARG2, ARG3,
2504         ARG4, ARG5);
2505   PRE_REG_READ5(int, "getpmsg",
2506                 int, fd, struct strbuf *, ctrl, struct strbuf *, data,
2507                 int *, bandp, int *, flagsp);
2508   ctrl = (struct vki_pmsg_strbuf *)ARG2;
2509   data = (struct vki_pmsg_strbuf *)ARG3;
2510   if (ctrl && ctrl->maxlen > 0)
2511      PRE_MEM_WRITE( "getpmsg(ctrl)", (Addr)ctrl->buf, ctrl->maxlen);
2512   if (data && data->maxlen > 0)
2513      PRE_MEM_WRITE( "getpmsg(data)", (Addr)data->buf, data->maxlen);
2514   if (ARG4)
2515      PRE_MEM_WRITE( "getpmsg(bandp)", (Addr)ARG4, sizeof(int));
2516   if (ARG5)
2517      PRE_MEM_WRITE( "getpmsg(flagsp)", (Addr)ARG5, sizeof(int));
2518}
2519POST(sys_getpmsg)
2520{
2521   struct vki_pmsg_strbuf *ctrl;
2522   struct vki_pmsg_strbuf *data;
2523   vg_assert(SUCCESS);
2524   ctrl = (struct vki_pmsg_strbuf *)ARG2;
2525   data = (struct vki_pmsg_strbuf *)ARG3;
2526   if (RES == 0 && ctrl && ctrl->len > 0) {
2527      POST_MEM_WRITE( (Addr)ctrl->buf, ctrl->len);
2528   }
2529   if (RES == 0 && data && data->len > 0) {
2530      POST_MEM_WRITE( (Addr)data->buf, data->len);
2531   }
2532}
2533
2534PRE(sys_putpmsg)
2535{
2536   /* LiS putpmsg from http://www.gcom.com/home/linux/lis/ */
2537   struct vki_pmsg_strbuf *ctrl;
2538   struct vki_pmsg_strbuf *data;
2539   *flags |= SfMayBlock;
2540   PRINT("sys_putpmsg ( %ld, %#lx, %#lx, %ld, %ld )", SARG1, ARG2, ARG3,
2541         SARG4, SARG5);
2542   PRE_REG_READ5(int, "putpmsg",
2543                 int, fd, struct strbuf *, ctrl, struct strbuf *, data,
2544                 int, band, int, flags);
2545   ctrl = (struct vki_pmsg_strbuf *)ARG2;
2546   data = (struct vki_pmsg_strbuf *)ARG3;
2547   if (ctrl && ctrl->len > 0)
2548      PRE_MEM_READ( "putpmsg(ctrl)", (Addr)ctrl->buf, ctrl->len);
2549   if (data && data->len > 0)
2550      PRE_MEM_READ( "putpmsg(data)", (Addr)data->buf, data->len);
2551}
2552
2553PRE(sys_getitimer)
2554{
2555   struct vki_itimerval *value = (struct vki_itimerval*)ARG2;
2556   PRINT("sys_getitimer ( %ld, %#lx )", SARG1, ARG2);
2557   PRE_REG_READ2(long, "getitimer", int, which, struct itimerval *, value);
2558
2559   PRE_timeval_WRITE( "getitimer(&value->it_interval)", &(value->it_interval));
2560   PRE_timeval_WRITE( "getitimer(&value->it_value)",    &(value->it_value));
2561}
2562
2563POST(sys_getitimer)
2564{
2565   if (ARG2 != (Addr)NULL) {
2566      struct vki_itimerval *value = (struct vki_itimerval*)ARG2;
2567      POST_timeval_WRITE( &(value->it_interval) );
2568      POST_timeval_WRITE( &(value->it_value) );
2569   }
2570}
2571
2572PRE(sys_setitimer)
2573{
2574   PRINT("sys_setitimer ( %ld, %#lx, %#lx )", SARG1, ARG2, ARG3);
2575   PRE_REG_READ3(long, "setitimer",
2576                 int, which,
2577                 struct itimerval *, value, struct itimerval *, ovalue);
2578   if (ARG2 != (Addr)NULL) {
2579      struct vki_itimerval *value = (struct vki_itimerval*)ARG2;
2580      PRE_timeval_READ( "setitimer(&value->it_interval)",
2581                         &(value->it_interval));
2582      PRE_timeval_READ( "setitimer(&value->it_value)",
2583                         &(value->it_value));
2584   }
2585   if (ARG3 != (Addr)NULL) {
2586      struct vki_itimerval *ovalue = (struct vki_itimerval*)ARG3;
2587      PRE_timeval_WRITE( "setitimer(&ovalue->it_interval)",
2588                         &(ovalue->it_interval));
2589      PRE_timeval_WRITE( "setitimer(&ovalue->it_value)",
2590                         &(ovalue->it_value));
2591   }
2592}
2593
2594POST(sys_setitimer)
2595{
2596   if (ARG3 != (Addr)NULL) {
2597      struct vki_itimerval *ovalue = (struct vki_itimerval*)ARG3;
2598      POST_timeval_WRITE( &(ovalue->it_interval) );
2599      POST_timeval_WRITE( &(ovalue->it_value) );
2600   }
2601}
2602
2603PRE(sys_chroot)
2604{
2605   PRINT("sys_chroot ( %#lx )", ARG1);
2606   PRE_REG_READ1(long, "chroot", const char *, path);
2607   PRE_MEM_RASCIIZ( "chroot(path)", ARG1 );
2608}
2609
2610PRE(sys_madvise)
2611{
2612   *flags |= SfMayBlock;
2613   PRINT("sys_madvise ( %#lx, %lu, %ld )", ARG1, ARG2, SARG3);
2614   PRE_REG_READ3(long, "madvise",
2615                 unsigned long, start, vki_size_t, length, int, advice);
2616}
2617
2618#if HAVE_MREMAP
2619PRE(sys_mremap)
2620{
2621   // Nb: this is different to the glibc version described in the man pages,
2622   // which lacks the fifth 'new_address' argument.
2623   if (ARG4 & VKI_MREMAP_FIXED) {
2624      PRINT("sys_mremap ( %#lx, %lu, %lu, %#lx, %#lx )",
2625            ARG1, ARG2, ARG3, ARG4, ARG5);
2626      PRE_REG_READ5(unsigned long, "mremap",
2627                    unsigned long, old_addr, unsigned long, old_size,
2628                    unsigned long, new_size, unsigned long, flags,
2629                    unsigned long, new_addr);
2630   } else {
2631      PRINT("sys_mremap ( %#lx, %lu, %lu, 0x%lx )",
2632            ARG1, ARG2, ARG3, ARG4);
2633      PRE_REG_READ4(unsigned long, "mremap",
2634                    unsigned long, old_addr, unsigned long, old_size,
2635                    unsigned long, new_size, unsigned long, flags);
2636   }
2637   SET_STATUS_from_SysRes(
2638      do_mremap((Addr)ARG1, ARG2, (Addr)ARG5, ARG3, ARG4, tid)
2639   );
2640}
2641#endif /* HAVE_MREMAP */
2642
2643PRE(sys_nice)
2644{
2645   PRINT("sys_nice ( %ld )", SARG1);
2646   PRE_REG_READ1(long, "nice", int, inc);
2647}
2648
2649PRE(sys_mlock)
2650{
2651   *flags |= SfMayBlock;
2652   PRINT("sys_mlock ( %#lx, %lu )", ARG1, ARG2);
2653   PRE_REG_READ2(long, "mlock", unsigned long, addr, vki_size_t, len);
2654}
2655
2656PRE(sys_munlock)
2657{
2658   *flags |= SfMayBlock;
2659   PRINT("sys_munlock ( %#lx, %lu )", ARG1, ARG2);
2660   PRE_REG_READ2(long, "munlock", unsigned long, addr, vki_size_t, len);
2661}
2662
2663PRE(sys_mlockall)
2664{
2665   *flags |= SfMayBlock;
2666   PRINT("sys_mlockall ( %lx )", ARG1);
2667   PRE_REG_READ1(long, "mlockall", int, flags);
2668}
2669
2670PRE(sys_setpriority)
2671{
2672   PRINT("sys_setpriority ( %ld, %ld, %ld )", SARG1, SARG2, SARG3);
2673   PRE_REG_READ3(long, "setpriority", int, which, int, who, int, prio);
2674}
2675
2676PRE(sys_getpriority)
2677{
2678   PRINT("sys_getpriority ( %ld, %ld )", SARG1, SARG2);
2679   PRE_REG_READ2(long, "getpriority", int, which, int, who);
2680}
2681
2682PRE(sys_pwrite64)
2683{
2684   *flags |= SfMayBlock;
2685#if VG_WORDSIZE == 4
2686   PRINT("sys_pwrite64 ( %lu, %#lx, %lu, %lld )",
2687         ARG1, ARG2, ARG3, (Long)MERGE64(ARG4,ARG5));
2688   PRE_REG_READ5(ssize_t, "pwrite64",
2689                 unsigned int, fd, const char *, buf, vki_size_t, count,
2690                 vki_u32, MERGE64_FIRST(offset), vki_u32, MERGE64_SECOND(offset));
2691#elif VG_WORDSIZE == 8
2692   PRINT("sys_pwrite64 ( %lu, %#lx, %lu, %ld )",
2693         ARG1, ARG2, ARG3, SARG4);
2694   PRE_REG_READ4(ssize_t, "pwrite64",
2695                 unsigned int, fd, const char *, buf, vki_size_t, count,
2696                 Word, offset);
2697#else
2698#  error Unexpected word size
2699#endif
2700   PRE_MEM_READ( "pwrite64(buf)", ARG2, ARG3 );
2701}
2702
2703PRE(sys_sync)
2704{
2705   *flags |= SfMayBlock;
2706   PRINT("sys_sync ( )");
2707   PRE_REG_READ0(long, "sync");
2708}
2709
2710PRE(sys_fstatfs)
2711{
2712   FUSE_COMPATIBLE_MAY_BLOCK();
2713   PRINT("sys_fstatfs ( %lu, %#lx )", ARG1, ARG2);
2714   PRE_REG_READ2(long, "fstatfs",
2715                 unsigned int, fd, struct statfs *, buf);
2716   PRE_MEM_WRITE( "fstatfs(buf)", ARG2, sizeof(struct vki_statfs) );
2717}
2718
2719POST(sys_fstatfs)
2720{
2721   POST_MEM_WRITE( ARG2, sizeof(struct vki_statfs) );
2722}
2723
2724PRE(sys_fstatfs64)
2725{
2726   FUSE_COMPATIBLE_MAY_BLOCK();
2727   PRINT("sys_fstatfs64 ( %lu, %lu, %#lx )", ARG1, ARG2, ARG3);
2728   PRE_REG_READ3(long, "fstatfs64",
2729                 unsigned int, fd, vki_size_t, size, struct statfs64 *, buf);
2730   PRE_MEM_WRITE( "fstatfs64(buf)", ARG3, ARG2 );
2731}
2732POST(sys_fstatfs64)
2733{
2734   POST_MEM_WRITE( ARG3, ARG2 );
2735}
2736
2737PRE(sys_getsid)
2738{
2739   PRINT("sys_getsid ( %ld )", SARG1);
2740   PRE_REG_READ1(long, "getsid", vki_pid_t, pid);
2741}
2742
2743PRE(sys_pread64)
2744{
2745   *flags |= SfMayBlock;
2746#if VG_WORDSIZE == 4
2747   PRINT("sys_pread64 ( %lu, %#lx, %lu, %lld )",
2748         ARG1, ARG2, ARG3, (Long)MERGE64(ARG4,ARG5));
2749   PRE_REG_READ5(ssize_t, "pread64",
2750                 unsigned int, fd, char *, buf, vki_size_t, count,
2751                 vki_u32, MERGE64_FIRST(offset), vki_u32, MERGE64_SECOND(offset));
2752#elif VG_WORDSIZE == 8
2753   PRINT("sys_pread64 ( %lu, %#lx, %lu, %ld )",
2754         ARG1, ARG2, ARG3, SARG4);
2755   PRE_REG_READ4(ssize_t, "pread64",
2756                 unsigned int, fd, char *, buf, vki_size_t, count,
2757                 Word, offset);
2758#else
2759#  error Unexpected word size
2760#endif
2761   PRE_MEM_WRITE( "pread64(buf)", ARG2, ARG3 );
2762}
2763POST(sys_pread64)
2764{
2765   vg_assert(SUCCESS);
2766   if (RES > 0) {
2767      POST_MEM_WRITE( ARG2, RES );
2768   }
2769}
2770
2771PRE(sys_mknod)
2772{
2773   FUSE_COMPATIBLE_MAY_BLOCK();
2774   PRINT("sys_mknod ( %#lx(%s), %#lx, %#lx )", ARG1, (HChar*)ARG1, ARG2, ARG3 );
2775   PRE_REG_READ3(long, "mknod",
2776                 const char *, pathname, int, mode, unsigned, dev);
2777   PRE_MEM_RASCIIZ( "mknod(pathname)", ARG1 );
2778}
2779
2780PRE(sys_flock)
2781{
2782   *flags |= SfMayBlock;
2783   PRINT("sys_flock ( %lu, %lu )", ARG1, ARG2 );
2784   PRE_REG_READ2(long, "flock", unsigned int, fd, unsigned int, operation);
2785}
2786
2787// Pre_read a char** argument.
2788void ML_(pre_argv_envp)(Addr a, ThreadId tid, const HChar *s1, const HChar *s2)
2789{
2790   while (True) {
2791      Addr a_deref;
2792      Addr* a_p = (Addr*)a;
2793      PRE_MEM_READ( s1, (Addr)a_p, sizeof(Addr) );
2794      a_deref = *a_p;
2795      if (0 == a_deref)
2796         break;
2797      PRE_MEM_RASCIIZ( s2, a_deref );
2798      a += sizeof(char*);
2799   }
2800}
2801
2802static Bool i_am_the_only_thread ( void )
2803{
2804   Int c = VG_(count_living_threads)();
2805   vg_assert(c >= 1); /* stay sane */
2806   return c == 1;
2807}
2808
2809/* Wait until all other threads disappear. */
2810void VG_(reap_threads)(ThreadId self)
2811{
2812   while (!i_am_the_only_thread()) {
2813      /* Let other thread(s) run */
2814      VG_(vg_yield)();
2815      VG_(poll_signals)(self);
2816   }
2817   vg_assert(i_am_the_only_thread());
2818}
2819
2820// XXX: prototype here seemingly doesn't match the prototype for i386-linux,
2821// but it seems to work nonetheless...
2822PRE(sys_execve)
2823{
2824   HChar*       path = NULL;       /* path to executable */
2825   HChar**      envp = NULL;
2826   HChar**      argv = NULL;
2827   HChar**      arg2copy;
2828   HChar*       launcher_basename = NULL;
2829   ThreadState* tst;
2830   Int          i, j, tot_args;
2831   SysRes       res;
2832   Bool         setuid_allowed, trace_this_child;
2833
2834   PRINT("sys_execve ( %#lx(%s), %#lx, %#lx )", ARG1, (char*)ARG1, ARG2, ARG3);
2835   PRE_REG_READ3(vki_off_t, "execve",
2836                 char *, filename, char **, argv, char **, envp);
2837   PRE_MEM_RASCIIZ( "execve(filename)", ARG1 );
2838   if (ARG2 != 0)
2839      ML_(pre_argv_envp)( ARG2, tid, "execve(argv)", "execve(argv[i])" );
2840   if (ARG3 != 0)
2841      ML_(pre_argv_envp)( ARG3, tid, "execve(envp)", "execve(envp[i])" );
2842
2843   vg_assert(VG_(is_valid_tid)(tid));
2844   tst = VG_(get_ThreadState)(tid);
2845
2846   /* Erk.  If the exec fails, then the following will have made a
2847      mess of things which makes it hard for us to continue.  The
2848      right thing to do is piece everything together again in
2849      POST(execve), but that's close to impossible.  Instead, we make
2850      an effort to check that the execve will work before actually
2851      doing it. */
2852
2853   /* Check that the name at least begins in client-accessible storage. */
2854   if (ARG1 == 0 /* obviously bogus */
2855       || !VG_(am_is_valid_for_client)( ARG1, 1, VKI_PROT_READ )) {
2856      SET_STATUS_Failure( VKI_EFAULT );
2857      return;
2858   }
2859
2860   // debug-only printing
2861   if (0) {
2862      VG_(printf)("ARG1 = %p(%s)\n", (void*)ARG1, (HChar*)ARG1);
2863      if (ARG2) {
2864         VG_(printf)("ARG2 = ");
2865         Int q;
2866         HChar** vec = (HChar**)ARG2;
2867         for (q = 0; vec[q]; q++)
2868            VG_(printf)("%p(%s) ", vec[q], vec[q]);
2869         VG_(printf)("\n");
2870      } else {
2871         VG_(printf)("ARG2 = null\n");
2872      }
2873   }
2874
2875   // Decide whether or not we want to follow along
2876   { // Make 'child_argv' be a pointer to the child's arg vector
2877     // (skipping the exe name)
2878     const HChar** child_argv = (const HChar**)ARG2;
2879     if (child_argv && child_argv[0] == NULL)
2880        child_argv = NULL;
2881     trace_this_child = VG_(should_we_trace_this_child)( (HChar*)ARG1, child_argv );
2882   }
2883
2884   // Do the important checks:  it is a file, is executable, permissions are
2885   // ok, etc.  We allow setuid executables to run only in the case when
2886   // we are not simulating them, that is, they to be run natively.
2887   setuid_allowed = trace_this_child  ? False  : True;
2888   res = VG_(pre_exec_check)((const HChar *)ARG1, NULL, setuid_allowed);
2889   if (sr_isError(res)) {
2890      SET_STATUS_Failure( sr_Err(res) );
2891      return;
2892   }
2893
2894   /* If we're tracing the child, and the launcher name looks bogus
2895      (possibly because launcher.c couldn't figure it out, see
2896      comments therein) then we have no option but to fail. */
2897   if (trace_this_child
2898       && (VG_(name_of_launcher) == NULL
2899           || VG_(name_of_launcher)[0] != '/')) {
2900      SET_STATUS_Failure( VKI_ECHILD ); /* "No child processes" */
2901      return;
2902   }
2903
2904   /* After this point, we can't recover if the execve fails. */
2905   VG_(debugLog)(1, "syswrap", "Exec of %s\n", (HChar*)ARG1);
2906
2907
2908   // Terminate gdbserver if it is active.
2909   if (VG_(clo_vgdb)  != Vg_VgdbNo) {
2910      // If the child will not be traced, we need to terminate gdbserver
2911      // to cleanup the gdbserver resources (e.g. the FIFO files).
2912      // If child will be traced, we also terminate gdbserver: the new
2913      // Valgrind will start a fresh gdbserver after exec.
2914      VG_(gdbserver) (0);
2915   }
2916
2917   /* Resistance is futile.  Nuke all other threads.  POSIX mandates
2918      this. (Really, nuke them all, since the new process will make
2919      its own new thread.) */
2920   VG_(nuke_all_threads_except)( tid, VgSrc_ExitThread );
2921   VG_(reap_threads)(tid);
2922
2923   // Set up the child's exe path.
2924   //
2925   if (trace_this_child) {
2926
2927      // We want to exec the launcher.  Get its pre-remembered path.
2928      path = VG_(name_of_launcher);
2929      // VG_(name_of_launcher) should have been acquired by m_main at
2930      // startup.
2931      vg_assert(path);
2932
2933      launcher_basename = VG_(strrchr)(path, '/');
2934      if (launcher_basename == NULL || launcher_basename[1] == 0) {
2935         launcher_basename = path;  // hmm, tres dubious
2936      } else {
2937         launcher_basename++;
2938      }
2939
2940   } else {
2941      path = (HChar*)ARG1;
2942   }
2943
2944   // Set up the child's environment.
2945   //
2946   // Remove the valgrind-specific stuff from the environment so the
2947   // child doesn't get vgpreload_core.so, vgpreload_<tool>.so, etc.
2948   // This is done unconditionally, since if we are tracing the child,
2949   // the child valgrind will set up the appropriate client environment.
2950   // Nb: we make a copy of the environment before trying to mangle it
2951   // as it might be in read-only memory (this was bug #101881).
2952   //
2953   // Then, if tracing the child, set VALGRIND_LIB for it.
2954   //
2955   if (ARG3 == 0) {
2956      envp = NULL;
2957   } else {
2958      envp = VG_(env_clone)( (HChar**)ARG3 );
2959      if (envp == NULL) goto hosed;
2960      VG_(env_remove_valgrind_env_stuff)( envp, True /*ro_strings*/, NULL );
2961   }
2962
2963   if (trace_this_child) {
2964      // Set VALGRIND_LIB in ARG3 (the environment)
2965      VG_(env_setenv)( &envp, VALGRIND_LIB, VG_(libdir));
2966   }
2967
2968   // Set up the child's args.  If not tracing it, they are
2969   // simply ARG2.  Otherwise, they are
2970   //
2971   // [launcher_basename] ++ VG_(args_for_valgrind) ++ [ARG1] ++ ARG2[1..]
2972   //
2973   // except that the first VG_(args_for_valgrind_noexecpass) args
2974   // are omitted.
2975   //
2976   if (!trace_this_child) {
2977      argv = (HChar**)ARG2;
2978   } else {
2979      vg_assert( VG_(args_for_valgrind) );
2980      vg_assert( VG_(args_for_valgrind_noexecpass) >= 0 );
2981      vg_assert( VG_(args_for_valgrind_noexecpass)
2982                   <= VG_(sizeXA)( VG_(args_for_valgrind) ) );
2983      /* how many args in total will there be? */
2984      // launcher basename
2985      tot_args = 1;
2986      // V's args
2987      tot_args += VG_(sizeXA)( VG_(args_for_valgrind) );
2988      tot_args -= VG_(args_for_valgrind_noexecpass);
2989      // name of client exe
2990      tot_args++;
2991      // args for client exe, skipping [0]
2992      arg2copy = (HChar**)ARG2;
2993      if (arg2copy && arg2copy[0]) {
2994         for (i = 1; arg2copy[i]; i++)
2995            tot_args++;
2996      }
2997      // allocate
2998      argv = VG_(malloc)( "di.syswrap.pre_sys_execve.1",
2999                          (tot_args+1) * sizeof(HChar*) );
3000      // copy
3001      j = 0;
3002      argv[j++] = launcher_basename;
3003      for (i = 0; i < VG_(sizeXA)( VG_(args_for_valgrind) ); i++) {
3004         if (i < VG_(args_for_valgrind_noexecpass))
3005            continue;
3006         argv[j++] = * (HChar**) VG_(indexXA)( VG_(args_for_valgrind), i );
3007      }
3008      argv[j++] = (HChar*)ARG1;
3009      if (arg2copy && arg2copy[0])
3010         for (i = 1; arg2copy[i]; i++)
3011            argv[j++] = arg2copy[i];
3012      argv[j++] = NULL;
3013      // check
3014      vg_assert(j == tot_args+1);
3015   }
3016
3017   /* restore the DATA rlimit for the child */
3018   VG_(setrlimit)(VKI_RLIMIT_DATA, &VG_(client_rlimit_data));
3019
3020   /*
3021      Set the signal state up for exec.
3022
3023      We need to set the real signal state to make sure the exec'd
3024      process gets SIG_IGN properly.
3025
3026      Also set our real sigmask to match the client's sigmask so that
3027      the exec'd child will get the right mask.  First we need to
3028      clear out any pending signals so they they don't get delivered,
3029      which would confuse things.
3030
3031      XXX This is a bug - the signals should remain pending, and be
3032      delivered to the new process after exec.  There's also a
3033      race-condition, since if someone delivers us a signal between
3034      the sigprocmask and the execve, we'll still get the signal. Oh
3035      well.
3036   */
3037   {
3038      vki_sigset_t allsigs;
3039      vki_siginfo_t info;
3040
3041      /* What this loop does: it queries SCSS (the signal state that
3042         the client _thinks_ the kernel is in) by calling
3043         VG_(do_sys_sigaction), and modifies the real kernel signal
3044         state accordingly. */
3045      for (i = 1; i < VG_(max_signal); i++) {
3046         vki_sigaction_fromK_t sa_f;
3047         vki_sigaction_toK_t   sa_t;
3048         VG_(do_sys_sigaction)(i, NULL, &sa_f);
3049         VG_(convert_sigaction_fromK_to_toK)(&sa_f, &sa_t);
3050         if (sa_t.ksa_handler == VKI_SIG_IGN)
3051            VG_(sigaction)(i, &sa_t, NULL);
3052         else {
3053            sa_t.ksa_handler = VKI_SIG_DFL;
3054            VG_(sigaction)(i, &sa_t, NULL);
3055         }
3056      }
3057
3058      VG_(sigfillset)(&allsigs);
3059      while(VG_(sigtimedwait_zero)(&allsigs, &info) > 0)
3060         ;
3061
3062      VG_(sigprocmask)(VKI_SIG_SETMASK, &tst->sig_mask, NULL);
3063   }
3064
3065   if (0) {
3066      HChar **cpp;
3067      VG_(printf)("exec: %s\n", path);
3068      for (cpp = argv; cpp && *cpp; cpp++)
3069         VG_(printf)("argv: %s\n", *cpp);
3070      if (0)
3071         for (cpp = envp; cpp && *cpp; cpp++)
3072            VG_(printf)("env: %s\n", *cpp);
3073   }
3074
3075   SET_STATUS_from_SysRes(
3076      VG_(do_syscall3)(__NR_execve, (UWord)path, (UWord)argv, (UWord)envp)
3077   );
3078
3079   /* If we got here, then the execve failed.  We've already made way
3080      too much of a mess to continue, so we have to abort. */
3081  hosed:
3082   vg_assert(FAILURE);
3083   VG_(message)(Vg_UserMsg, "execve(%#lx(%s), %#lx, %#lx) failed, errno %lu\n",
3084                ARG1, (HChar*)ARG1, ARG2, ARG3, ERR);
3085   VG_(message)(Vg_UserMsg, "EXEC FAILED: I can't recover from "
3086                            "execve() failing, so I'm dying.\n");
3087   VG_(message)(Vg_UserMsg, "Add more stringent tests in PRE(sys_execve), "
3088                            "or work out how to recover.\n");
3089   VG_(exit)(101);
3090}
3091
3092PRE(sys_access)
3093{
3094   PRINT("sys_access ( %#lx(%s), %ld )", ARG1, (HChar*)ARG1, SARG2);
3095   PRE_REG_READ2(long, "access", const char *, pathname, int, mode);
3096   PRE_MEM_RASCIIZ( "access(pathname)", ARG1 );
3097}
3098
3099PRE(sys_alarm)
3100{
3101   PRINT("sys_alarm ( %lu )", ARG1);
3102   PRE_REG_READ1(unsigned long, "alarm", unsigned int, seconds);
3103}
3104
3105PRE(sys_brk)
3106{
3107   Addr brk_limit = VG_(brk_limit);
3108   Addr brk_new;
3109
3110   /* libc   says: int   brk(void *end_data_segment);
3111      kernel says: void* brk(void* end_data_segment);  (more or less)
3112
3113      libc returns 0 on success, and -1 (and sets errno) on failure.
3114      Nb: if you ask to shrink the dataseg end below what it
3115      currently is, that always succeeds, even if the dataseg end
3116      doesn't actually change (eg. brk(0)).  Unless it seg faults.
3117
3118      Kernel returns the new dataseg end.  If the brk() failed, this
3119      will be unchanged from the old one.  That's why calling (kernel)
3120      brk(0) gives the current dataseg end (libc brk() just returns
3121      zero in that case).
3122
3123      Both will seg fault if you shrink it back into a text segment.
3124   */
3125   PRINT("sys_brk ( %#lx )", ARG1);
3126   PRE_REG_READ1(unsigned long, "brk", unsigned long, end_data_segment);
3127
3128   brk_new = do_brk(ARG1, tid);
3129   SET_STATUS_Success( brk_new );
3130
3131   if (brk_new == ARG1) {
3132      /* brk() succeeded */
3133      if (brk_new < brk_limit) {
3134         /* successfully shrunk the data segment. */
3135         VG_TRACK( die_mem_brk, (Addr)ARG1,
3136		   brk_limit-ARG1 );
3137      } else
3138      if (brk_new > brk_limit) {
3139         /* successfully grew the data segment */
3140         VG_TRACK( new_mem_brk, brk_limit,
3141                   ARG1-brk_limit, tid );
3142      }
3143   } else {
3144      /* brk() failed */
3145      vg_assert(brk_limit == brk_new);
3146   }
3147}
3148
3149PRE(sys_chdir)
3150{
3151   FUSE_COMPATIBLE_MAY_BLOCK();
3152   PRINT("sys_chdir ( %#lx(%s) )", ARG1,(char*)ARG1);
3153   PRE_REG_READ1(long, "chdir", const char *, path);
3154   PRE_MEM_RASCIIZ( "chdir(path)", ARG1 );
3155}
3156
3157PRE(sys_chmod)
3158{
3159   FUSE_COMPATIBLE_MAY_BLOCK();
3160   PRINT("sys_chmod ( %#lx(%s), %lu )", ARG1, (HChar*)ARG1, ARG2);
3161   PRE_REG_READ2(long, "chmod", const char *, path, vki_mode_t, mode);
3162   PRE_MEM_RASCIIZ( "chmod(path)", ARG1 );
3163}
3164
3165PRE(sys_chown)
3166{
3167   FUSE_COMPATIBLE_MAY_BLOCK();
3168   PRINT("sys_chown ( %#lx(%s), 0x%lx, 0x%lx )", ARG1,(char*)ARG1,ARG2,ARG3);
3169   PRE_REG_READ3(long, "chown",
3170                 const char *, path, vki_uid_t, owner, vki_gid_t, group);
3171   PRE_MEM_RASCIIZ( "chown(path)", ARG1 );
3172}
3173
3174PRE(sys_lchown)
3175{
3176   FUSE_COMPATIBLE_MAY_BLOCK();
3177   PRINT("sys_lchown ( %#lx(%s), 0x%lx, 0x%lx )", ARG1,(char*)ARG1,ARG2,ARG3);
3178   PRE_REG_READ3(long, "lchown",
3179                 const char *, path, vki_uid_t, owner, vki_gid_t, group);
3180   PRE_MEM_RASCIIZ( "lchown(path)", ARG1 );
3181}
3182
3183PRE(sys_close)
3184{
3185   FUSE_COMPATIBLE_MAY_BLOCK();
3186   PRINT("sys_close ( %lu )", ARG1);
3187   PRE_REG_READ1(long, "close", unsigned int, fd);
3188
3189   /* Detect and negate attempts by the client to close Valgrind's log fd */
3190   if ( (!ML_(fd_allowed)(ARG1, "close", tid, False))
3191        /* If doing -d style logging (which is to fd=2), don't
3192           allow that to be closed either. */
3193        || (ARG1 == 2/*stderr*/ && VG_(debugLog_getLevel)() > 0) )
3194      SET_STATUS_Failure( VKI_EBADF );
3195}
3196
3197POST(sys_close)
3198{
3199   if (VG_(clo_track_fds)) ML_(record_fd_close)(ARG1);
3200}
3201
3202PRE(sys_dup)
3203{
3204   PRINT("sys_dup ( %lu )", ARG1);
3205   PRE_REG_READ1(long, "dup", unsigned int, oldfd);
3206}
3207
3208POST(sys_dup)
3209{
3210   vg_assert(SUCCESS);
3211   if (!ML_(fd_allowed)(RES, "dup", tid, True)) {
3212      VG_(close)(RES);
3213      SET_STATUS_Failure( VKI_EMFILE );
3214   } else {
3215      if (VG_(clo_track_fds))
3216         ML_(record_fd_open_named)(tid, RES);
3217   }
3218}
3219
3220PRE(sys_dup2)
3221{
3222   PRINT("sys_dup2 ( %lu, %lu )", ARG1, ARG2);
3223   PRE_REG_READ2(long, "dup2", unsigned int, oldfd, unsigned int, newfd);
3224   if (!ML_(fd_allowed)(ARG2, "dup2", tid, True))
3225      SET_STATUS_Failure( VKI_EBADF );
3226}
3227
3228POST(sys_dup2)
3229{
3230   vg_assert(SUCCESS);
3231   if (VG_(clo_track_fds))
3232      ML_(record_fd_open_named)(tid, RES);
3233}
3234
3235PRE(sys_fchdir)
3236{
3237   FUSE_COMPATIBLE_MAY_BLOCK();
3238   PRINT("sys_fchdir ( %lu )", ARG1);
3239   PRE_REG_READ1(long, "fchdir", unsigned int, fd);
3240}
3241
3242PRE(sys_fchown)
3243{
3244   FUSE_COMPATIBLE_MAY_BLOCK();
3245   PRINT("sys_fchown ( %lu, %lu, %lu )", ARG1, ARG2, ARG3);
3246   PRE_REG_READ3(long, "fchown",
3247                 unsigned int, fd, vki_uid_t, owner, vki_gid_t, group);
3248}
3249
3250PRE(sys_fchmod)
3251{
3252   FUSE_COMPATIBLE_MAY_BLOCK();
3253   PRINT("sys_fchmod ( %lu, %lu )", ARG1, ARG2);
3254   PRE_REG_READ2(long, "fchmod", unsigned int, fildes, vki_mode_t, mode);
3255}
3256
3257PRE(sys_newfstat)
3258{
3259   FUSE_COMPATIBLE_MAY_BLOCK();
3260   PRINT("sys_newfstat ( %lu, %#lx )", ARG1, ARG2);
3261   PRE_REG_READ2(long, "fstat", unsigned int, fd, struct stat *, buf);
3262   PRE_MEM_WRITE( "fstat(buf)", ARG2, sizeof(struct vki_stat) );
3263}
3264
3265POST(sys_newfstat)
3266{
3267   POST_MEM_WRITE( ARG2, sizeof(struct vki_stat) );
3268}
3269
3270#if !defined(VGO_solaris)
3271static vki_sigset_t fork_saved_mask;
3272
3273// In Linux, the sys_fork() function varies across architectures, but we
3274// ignore the various args it gets, and so it looks arch-neutral.  Hmm.
3275PRE(sys_fork)
3276{
3277   Bool is_child;
3278   Int child_pid;
3279   vki_sigset_t mask;
3280
3281   PRINT("sys_fork ( )");
3282   PRE_REG_READ0(long, "fork");
3283
3284   /* Block all signals during fork, so that we can fix things up in
3285      the child without being interrupted. */
3286   VG_(sigfillset)(&mask);
3287   VG_(sigprocmask)(VKI_SIG_SETMASK, &mask, &fork_saved_mask);
3288
3289   VG_(do_atfork_pre)(tid);
3290
3291   SET_STATUS_from_SysRes( VG_(do_syscall0)(__NR_fork) );
3292
3293   if (!SUCCESS) return;
3294
3295#if defined(VGO_linux)
3296   // RES is 0 for child, non-0 (the child's PID) for parent.
3297   is_child = ( RES == 0 ? True : False );
3298   child_pid = ( is_child ? -1 : RES );
3299#elif defined(VGO_darwin)
3300   // RES is the child's pid.  RESHI is 1 for child, 0 for parent.
3301   is_child = RESHI;
3302   child_pid = RES;
3303#else
3304#  error Unknown OS
3305#endif
3306
3307   if (is_child) {
3308      VG_(do_atfork_child)(tid);
3309
3310      /* restore signal mask */
3311      VG_(sigprocmask)(VKI_SIG_SETMASK, &fork_saved_mask, NULL);
3312
3313      /* If --child-silent-after-fork=yes was specified, set the
3314         output file descriptors to 'impossible' values.  This is
3315         noticed by send_bytes_to_logging_sink in m_libcprint.c, which
3316         duly stops writing any further output. */
3317      if (VG_(clo_child_silent_after_fork)) {
3318         if (!VG_(log_output_sink).is_socket)
3319            VG_(log_output_sink).fd = -1;
3320         if (!VG_(xml_output_sink).is_socket)
3321            VG_(xml_output_sink).fd = -1;
3322      }
3323
3324   } else {
3325      VG_(do_atfork_parent)(tid);
3326
3327      PRINT("   fork: process %d created child %d\n", VG_(getpid)(), child_pid);
3328
3329      /* restore signal mask */
3330      VG_(sigprocmask)(VKI_SIG_SETMASK, &fork_saved_mask, NULL);
3331   }
3332}
3333#endif // !defined(VGO_solaris)
3334
3335PRE(sys_ftruncate)
3336{
3337   *flags |= SfMayBlock;
3338   PRINT("sys_ftruncate ( %lu, %lu )", ARG1, ARG2);
3339   PRE_REG_READ2(long, "ftruncate", unsigned int, fd, unsigned long, length);
3340}
3341
3342PRE(sys_truncate)
3343{
3344   *flags |= SfMayBlock;
3345   PRINT("sys_truncate ( %#lx(%s), %lu )", ARG1, (HChar*)ARG1, ARG2);
3346   PRE_REG_READ2(long, "truncate",
3347                 const char *, path, unsigned long, length);
3348   PRE_MEM_RASCIIZ( "truncate(path)", ARG1 );
3349}
3350
3351PRE(sys_ftruncate64)
3352{
3353   *flags |= SfMayBlock;
3354#if VG_WORDSIZE == 4
3355   PRINT("sys_ftruncate64 ( %lu, %llu )", ARG1, MERGE64(ARG2,ARG3));
3356   PRE_REG_READ3(long, "ftruncate64",
3357                 unsigned int, fd,
3358                 UWord, MERGE64_FIRST(length), UWord, MERGE64_SECOND(length));
3359#else
3360   PRINT("sys_ftruncate64 ( %lu, %lu )", ARG1, ARG2);
3361   PRE_REG_READ2(long, "ftruncate64",
3362                 unsigned int,fd, UWord,length);
3363#endif
3364}
3365
3366PRE(sys_truncate64)
3367{
3368   *flags |= SfMayBlock;
3369#if VG_WORDSIZE == 4
3370   PRINT("sys_truncate64 ( %#lx, %lld )", ARG1, (Long)MERGE64(ARG2, ARG3));
3371   PRE_REG_READ3(long, "truncate64",
3372                 const char *, path,
3373                 UWord, MERGE64_FIRST(length), UWord, MERGE64_SECOND(length));
3374#else
3375   PRINT("sys_truncate64 ( %#lx, %lld )", ARG1, (Long)ARG2);
3376   PRE_REG_READ2(long, "truncate64",
3377                 const char *,path, UWord,length);
3378#endif
3379   PRE_MEM_RASCIIZ( "truncate64(path)", ARG1 );
3380}
3381
3382PRE(sys_getdents)
3383{
3384   *flags |= SfMayBlock;
3385   PRINT("sys_getdents ( %lu, %#lx, %lu )", ARG1, ARG2, ARG3);
3386   PRE_REG_READ3(long, "getdents",
3387                 unsigned int, fd, struct vki_dirent *, dirp,
3388                 unsigned int, count);
3389   PRE_MEM_WRITE( "getdents(dirp)", ARG2, ARG3 );
3390}
3391
3392POST(sys_getdents)
3393{
3394   vg_assert(SUCCESS);
3395   if (RES > 0)
3396      POST_MEM_WRITE( ARG2, RES );
3397}
3398
3399PRE(sys_getdents64)
3400{
3401   *flags |= SfMayBlock;
3402   PRINT("sys_getdents64 ( %lu, %#lx, %lu )",ARG1, ARG2, ARG3);
3403   PRE_REG_READ3(long, "getdents64",
3404                 unsigned int, fd, struct vki_dirent64 *, dirp,
3405                 unsigned int, count);
3406   PRE_MEM_WRITE( "getdents64(dirp)", ARG2, ARG3 );
3407}
3408
3409POST(sys_getdents64)
3410{
3411   vg_assert(SUCCESS);
3412   if (RES > 0)
3413      POST_MEM_WRITE( ARG2, RES );
3414}
3415
3416PRE(sys_getgroups)
3417{
3418   PRINT("sys_getgroups ( %ld, %#lx )", SARG1, ARG2);
3419   PRE_REG_READ2(long, "getgroups", int, size, vki_gid_t *, list);
3420   if (ARG1 > 0)
3421      PRE_MEM_WRITE( "getgroups(list)", ARG2, ARG1 * sizeof(vki_gid_t) );
3422}
3423
3424POST(sys_getgroups)
3425{
3426   vg_assert(SUCCESS);
3427   if (ARG1 > 0 && RES > 0)
3428      POST_MEM_WRITE( ARG2, RES * sizeof(vki_gid_t) );
3429}
3430
3431PRE(sys_getcwd)
3432{
3433   // Comment from linux/fs/dcache.c:
3434   //   NOTE! The user-level library version returns a character pointer.
3435   //   The kernel system call just returns the length of the buffer filled
3436   //   (which includes the ending '\0' character), or a negative error
3437   //   value.
3438   // Is this Linux-specific?  If so it should be moved to syswrap-linux.c.
3439   PRINT("sys_getcwd ( %#lx, %llu )", ARG1,(ULong)ARG2);
3440   PRE_REG_READ2(long, "getcwd", char *, buf, unsigned long, size);
3441   PRE_MEM_WRITE( "getcwd(buf)", ARG1, ARG2 );
3442}
3443
3444POST(sys_getcwd)
3445{
3446   vg_assert(SUCCESS);
3447   if (RES != (Addr)NULL)
3448      POST_MEM_WRITE( ARG1, RES );
3449}
3450
3451PRE(sys_geteuid)
3452{
3453   PRINT("sys_geteuid ( )");
3454   PRE_REG_READ0(long, "geteuid");
3455}
3456
3457PRE(sys_getegid)
3458{
3459   PRINT("sys_getegid ( )");
3460   PRE_REG_READ0(long, "getegid");
3461}
3462
3463PRE(sys_getgid)
3464{
3465   PRINT("sys_getgid ( )");
3466   PRE_REG_READ0(long, "getgid");
3467}
3468
3469PRE(sys_getpid)
3470{
3471   PRINT("sys_getpid ()");
3472   PRE_REG_READ0(long, "getpid");
3473}
3474
3475PRE(sys_getpgid)
3476{
3477   PRINT("sys_getpgid ( %ld )", SARG1);
3478   PRE_REG_READ1(long, "getpgid", vki_pid_t, pid);
3479}
3480
3481PRE(sys_getpgrp)
3482{
3483   PRINT("sys_getpgrp ()");
3484   PRE_REG_READ0(long, "getpgrp");
3485}
3486
3487PRE(sys_getppid)
3488{
3489   PRINT("sys_getppid ()");
3490   PRE_REG_READ0(long, "getppid");
3491}
3492
3493static void common_post_getrlimit(ThreadId tid, UWord a1, UWord a2)
3494{
3495   POST_MEM_WRITE( a2, sizeof(struct vki_rlimit) );
3496
3497#ifdef _RLIMIT_POSIX_FLAG
3498   // Darwin will sometimes set _RLIMIT_POSIX_FLAG on getrlimit calls.
3499   // Unset it here to make the switch case below work correctly.
3500   a1 &= ~_RLIMIT_POSIX_FLAG;
3501#endif
3502
3503   switch (a1) {
3504   case VKI_RLIMIT_NOFILE:
3505      ((struct vki_rlimit *)a2)->rlim_cur = VG_(fd_soft_limit);
3506      ((struct vki_rlimit *)a2)->rlim_max = VG_(fd_hard_limit);
3507      break;
3508
3509   case VKI_RLIMIT_DATA:
3510      *((struct vki_rlimit *)a2) = VG_(client_rlimit_data);
3511      break;
3512
3513   case VKI_RLIMIT_STACK:
3514      *((struct vki_rlimit *)a2) = VG_(client_rlimit_stack);
3515      break;
3516   }
3517}
3518
3519PRE(sys_old_getrlimit)
3520{
3521   PRINT("sys_old_getrlimit ( %lu, %#lx )", ARG1, ARG2);
3522   PRE_REG_READ2(long, "old_getrlimit",
3523                 unsigned int, resource, struct rlimit *, rlim);
3524   PRE_MEM_WRITE( "old_getrlimit(rlim)", ARG2, sizeof(struct vki_rlimit) );
3525}
3526
3527POST(sys_old_getrlimit)
3528{
3529   common_post_getrlimit(tid, ARG1, ARG2);
3530}
3531
3532PRE(sys_getrlimit)
3533{
3534   PRINT("sys_getrlimit ( %lu, %#lx )", ARG1, ARG2);
3535   PRE_REG_READ2(long, "getrlimit",
3536                 unsigned int, resource, struct rlimit *, rlim);
3537   PRE_MEM_WRITE( "getrlimit(rlim)", ARG2, sizeof(struct vki_rlimit) );
3538}
3539
3540POST(sys_getrlimit)
3541{
3542   common_post_getrlimit(tid, ARG1, ARG2);
3543}
3544
3545PRE(sys_getrusage)
3546{
3547   PRINT("sys_getrusage ( %ld, %#lx )", SARG1, ARG2);
3548   PRE_REG_READ2(long, "getrusage", int, who, struct rusage *, usage);
3549   PRE_MEM_WRITE( "getrusage(usage)", ARG2, sizeof(struct vki_rusage) );
3550}
3551
3552POST(sys_getrusage)
3553{
3554   vg_assert(SUCCESS);
3555   if (RES == 0)
3556      POST_MEM_WRITE( ARG2, sizeof(struct vki_rusage) );
3557}
3558
3559PRE(sys_gettimeofday)
3560{
3561   PRINT("sys_gettimeofday ( %#lx, %#lx )", ARG1,ARG2);
3562   PRE_REG_READ2(long, "gettimeofday",
3563                 struct timeval *, tv, struct timezone *, tz);
3564   // GrP fixme does darwin write to *tz anymore?
3565   if (ARG1 != 0)
3566      PRE_timeval_WRITE( "gettimeofday(tv)", ARG1 );
3567   if (ARG2 != 0)
3568      PRE_MEM_WRITE( "gettimeofday(tz)", ARG2, sizeof(struct vki_timezone) );
3569}
3570
3571POST(sys_gettimeofday)
3572{
3573   vg_assert(SUCCESS);
3574   if (RES == 0) {
3575      if (ARG1 != 0)
3576         POST_timeval_WRITE( ARG1 );
3577      if (ARG2 != 0)
3578	 POST_MEM_WRITE( ARG2, sizeof(struct vki_timezone) );
3579   }
3580}
3581
3582PRE(sys_settimeofday)
3583{
3584   PRINT("sys_settimeofday ( %#lx, %#lx )", ARG1,ARG2);
3585   PRE_REG_READ2(long, "settimeofday",
3586                 struct timeval *, tv, struct timezone *, tz);
3587   if (ARG1 != 0)
3588      PRE_timeval_READ( "settimeofday(tv)", ARG1 );
3589   if (ARG2 != 0) {
3590      PRE_MEM_READ( "settimeofday(tz)", ARG2, sizeof(struct vki_timezone) );
3591      /* maybe should warn if tz->tz_dsttime is non-zero? */
3592   }
3593}
3594
3595PRE(sys_getuid)
3596{
3597   PRINT("sys_getuid ( )");
3598   PRE_REG_READ0(long, "getuid");
3599}
3600
3601void ML_(PRE_unknown_ioctl)(ThreadId tid, UWord request, UWord arg)
3602{
3603   /* We don't have any specific information on it, so
3604      try to do something reasonable based on direction and
3605      size bits.  The encoding scheme is described in
3606      /usr/include/asm/ioctl.h or /usr/include/sys/ioccom.h .
3607
3608      According to Simon Hausmann, _IOC_READ means the kernel
3609      writes a value to the ioctl value passed from the user
3610      space and the other way around with _IOC_WRITE. */
3611
3612#if defined(VGO_solaris)
3613   /* Majority of Solaris ioctl requests does not honour direction hints. */
3614   UInt dir  = _VKI_IOC_NONE;
3615#else
3616   UInt dir  = _VKI_IOC_DIR(request);
3617#endif
3618   UInt size = _VKI_IOC_SIZE(request);
3619
3620   if (SimHintiS(SimHint_lax_ioctls, VG_(clo_sim_hints))) {
3621      /*
3622       * Be very lax about ioctl handling; the only
3623       * assumption is that the size is correct. Doesn't
3624       * require the full buffer to be initialized when
3625       * writing.  Without this, using some device
3626       * drivers with a large number of strange ioctl
3627       * commands becomes very tiresome.
3628       */
3629   } else if (/* size == 0 || */ dir == _VKI_IOC_NONE) {
3630      static UWord unknown_ioctl[10];
3631      static Int moans = sizeof(unknown_ioctl) / sizeof(unknown_ioctl[0]);
3632
3633      if (moans > 0 && !VG_(clo_xml)) {
3634         /* Check if have not already moaned for this request. */
3635         UInt i;
3636         for (i = 0; i < sizeof(unknown_ioctl)/sizeof(unknown_ioctl[0]); i++) {
3637            if (unknown_ioctl[i] == request)
3638               break;
3639            if (unknown_ioctl[i] == 0) {
3640               unknown_ioctl[i] = request;
3641               moans--;
3642               VG_(umsg)("Warning: noted but unhandled ioctl 0x%lx"
3643                         " with no size/direction hints.\n", request);
3644               VG_(umsg)("   This could cause spurious value errors to appear.\n");
3645               VG_(umsg)("   See README_MISSING_SYSCALL_OR_IOCTL for "
3646                         "guidance on writing a proper wrapper.\n" );
3647               //VG_(get_and_pp_StackTrace)(tid, VG_(clo_backtrace_size));
3648               return;
3649            }
3650         }
3651      }
3652   } else {
3653      //VG_(message)(Vg_UserMsg, "UNKNOWN ioctl %#lx\n", request);
3654      //VG_(get_and_pp_StackTrace)(tid, VG_(clo_backtrace_size));
3655      if ((dir & _VKI_IOC_WRITE) && size > 0)
3656         PRE_MEM_READ( "ioctl(generic)", arg, size);
3657      if ((dir & _VKI_IOC_READ) && size > 0)
3658         PRE_MEM_WRITE( "ioctl(generic)", arg, size);
3659   }
3660}
3661
3662void ML_(POST_unknown_ioctl)(ThreadId tid, UInt res, UWord request, UWord arg)
3663{
3664   /* We don't have any specific information on it, so
3665      try to do something reasonable based on direction and
3666      size bits.  The encoding scheme is described in
3667      /usr/include/asm/ioctl.h or /usr/include/sys/ioccom.h .
3668
3669      According to Simon Hausmann, _IOC_READ means the kernel
3670      writes a value to the ioctl value passed from the user
3671      space and the other way around with _IOC_WRITE. */
3672
3673   UInt dir  = _VKI_IOC_DIR(request);
3674   UInt size = _VKI_IOC_SIZE(request);
3675   if (size > 0 && (dir & _VKI_IOC_READ)
3676       && res == 0
3677       && arg != (Addr)NULL)
3678   {
3679      POST_MEM_WRITE(arg, size);
3680   }
3681}
3682
3683/*
3684   If we're sending a SIGKILL to one of our own threads, then simulate
3685   it rather than really sending the signal, so that the target thread
3686   gets a chance to clean up.  Returns True if we did the killing (or
3687   no killing is necessary), and False if the caller should use the
3688   normal kill syscall.
3689
3690   "pid" is any pid argument which can be passed to kill; group kills
3691   (< -1, 0), and owner kills (-1) are ignored, on the grounds that
3692   they'll most likely hit all the threads and we won't need to worry
3693   about cleanup.  In truth, we can't fully emulate these multicast
3694   kills.
3695
3696   "tgid" is a thread group id.  If it is not -1, then the target
3697   thread must be in that thread group.
3698 */
3699Bool ML_(do_sigkill)(Int pid, Int tgid)
3700{
3701   ThreadState *tst;
3702   ThreadId tid;
3703
3704   if (pid <= 0)
3705      return False;
3706
3707   tid = VG_(lwpid_to_vgtid)(pid);
3708   if (tid == VG_INVALID_THREADID)
3709      return False;		/* none of our threads */
3710
3711   tst = VG_(get_ThreadState)(tid);
3712   if (tst == NULL || tst->status == VgTs_Empty)
3713      return False;		/* hm, shouldn't happen */
3714
3715   if (tgid != -1 && tst->os_state.threadgroup != tgid)
3716      return False;		/* not the right thread group */
3717
3718   /* Check to see that the target isn't already exiting. */
3719   if (!VG_(is_exiting)(tid)) {
3720      if (VG_(clo_trace_signals))
3721	 VG_(message)(Vg_DebugMsg,
3722                      "Thread %u being killed with SIGKILL\n",
3723                      tst->tid);
3724
3725      tst->exitreason = VgSrc_FatalSig;
3726      tst->os_state.fatalsig = VKI_SIGKILL;
3727
3728      if (!VG_(is_running_thread)(tid))
3729	 VG_(get_thread_out_of_syscall)(tid);
3730   }
3731
3732   return True;
3733}
3734
3735PRE(sys_kill)
3736{
3737   PRINT("sys_kill ( %ld, %ld )", SARG1, SARG2);
3738   PRE_REG_READ2(long, "kill", int, pid, int, signal);
3739   if (!ML_(client_signal_OK)(ARG2)) {
3740      SET_STATUS_Failure( VKI_EINVAL );
3741      return;
3742   }
3743
3744   /* If we're sending SIGKILL, check to see if the target is one of
3745      our threads and handle it specially. */
3746   if (ARG2 == VKI_SIGKILL && ML_(do_sigkill)(ARG1, -1))
3747      SET_STATUS_Success(0);
3748   else
3749      /* re syscall3: Darwin has a 3rd arg, which is a flag (boolean)
3750         affecting how posix-compliant the call is.  I guess it is
3751         harmless to pass the 3rd arg on other platforms; hence pass
3752         it on all. */
3753      SET_STATUS_from_SysRes( VG_(do_syscall3)(SYSNO, ARG1, ARG2, ARG3) );
3754
3755   if (VG_(clo_trace_signals))
3756      VG_(message)(Vg_DebugMsg, "kill: sent signal %ld to pid %ld\n",
3757		   SARG2, SARG1);
3758
3759   /* This kill might have given us a pending signal.  Ask for a check once
3760      the syscall is done. */
3761   *flags |= SfPollAfter;
3762}
3763
3764PRE(sys_link)
3765{
3766   *flags |= SfMayBlock;
3767   PRINT("sys_link ( %#lx(%s), %#lx(%s) )", ARG1,(char*)ARG1,ARG2,(char*)ARG2);
3768   PRE_REG_READ2(long, "link", const char *, oldpath, const char *, newpath);
3769   PRE_MEM_RASCIIZ( "link(oldpath)", ARG1);
3770   PRE_MEM_RASCIIZ( "link(newpath)", ARG2);
3771}
3772
3773PRE(sys_newlstat)
3774{
3775   PRINT("sys_newlstat ( %#lx(%s), %#lx )", ARG1,(char*)ARG1,ARG2);
3776   PRE_REG_READ2(long, "lstat", char *, file_name, struct stat *, buf);
3777   PRE_MEM_RASCIIZ( "lstat(file_name)", ARG1 );
3778   PRE_MEM_WRITE( "lstat(buf)", ARG2, sizeof(struct vki_stat) );
3779}
3780
3781POST(sys_newlstat)
3782{
3783   vg_assert(SUCCESS);
3784   POST_MEM_WRITE( ARG2, sizeof(struct vki_stat) );
3785}
3786
3787PRE(sys_mkdir)
3788{
3789   *flags |= SfMayBlock;
3790   PRINT("sys_mkdir ( %#lx(%s), %ld )", ARG1, (HChar*)ARG1, SARG2);
3791   PRE_REG_READ2(long, "mkdir", const char *, pathname, int, mode);
3792   PRE_MEM_RASCIIZ( "mkdir(pathname)", ARG1 );
3793}
3794
3795PRE(sys_mprotect)
3796{
3797   PRINT("sys_mprotect ( %#lx, %lu, %lu )", ARG1, ARG2, ARG3);
3798   PRE_REG_READ3(long, "mprotect",
3799                 unsigned long, addr, vki_size_t, len, unsigned long, prot);
3800
3801   if (!ML_(valid_client_addr)(ARG1, ARG2, tid, "mprotect")) {
3802      SET_STATUS_Failure( VKI_ENOMEM );
3803   }
3804#if defined(VKI_PROT_GROWSDOWN)
3805   else
3806   if (ARG3 & (VKI_PROT_GROWSDOWN|VKI_PROT_GROWSUP)) {
3807      /* Deal with mprotects on growable stack areas.
3808
3809         The critical files to understand all this are mm/mprotect.c
3810         in the kernel and sysdeps/unix/sysv/linux/dl-execstack.c in
3811         glibc.
3812
3813         The kernel provides PROT_GROWSDOWN and PROT_GROWSUP which
3814         round the start/end address of mprotect to the start/end of
3815         the underlying vma and glibc uses that as an easy way to
3816         change the protection of the stack by calling mprotect on the
3817         last page of the stack with PROT_GROWSDOWN set.
3818
3819         The sanity check provided by the kernel is that the vma must
3820         have the VM_GROWSDOWN/VM_GROWSUP flag set as appropriate.  */
3821      UInt grows = ARG3 & (VKI_PROT_GROWSDOWN|VKI_PROT_GROWSUP);
3822      NSegment const *aseg = VG_(am_find_nsegment)(ARG1);
3823      NSegment const *rseg;
3824
3825      vg_assert(aseg);
3826
3827      if (grows == VKI_PROT_GROWSDOWN) {
3828         rseg = VG_(am_next_nsegment)( aseg, False/*backwards*/ );
3829         if (rseg &&
3830             rseg->kind == SkResvn &&
3831             rseg->smode == SmUpper &&
3832             rseg->end+1 == aseg->start) {
3833            Addr end = ARG1 + ARG2;
3834            ARG1 = aseg->start;
3835            ARG2 = end - aseg->start;
3836            ARG3 &= ~VKI_PROT_GROWSDOWN;
3837         } else {
3838            SET_STATUS_Failure( VKI_EINVAL );
3839         }
3840      } else if (grows == VKI_PROT_GROWSUP) {
3841         rseg = VG_(am_next_nsegment)( aseg, True/*forwards*/ );
3842         if (rseg &&
3843             rseg->kind == SkResvn &&
3844             rseg->smode == SmLower &&
3845             aseg->end+1 == rseg->start) {
3846            ARG2 = aseg->end - ARG1 + 1;
3847            ARG3 &= ~VKI_PROT_GROWSUP;
3848         } else {
3849            SET_STATUS_Failure( VKI_EINVAL );
3850         }
3851      } else {
3852         /* both GROWSUP and GROWSDOWN */
3853         SET_STATUS_Failure( VKI_EINVAL );
3854      }
3855   }
3856#endif   // defined(VKI_PROT_GROWSDOWN)
3857}
3858
3859POST(sys_mprotect)
3860{
3861   Addr a    = ARG1;
3862   SizeT len = ARG2;
3863   Int  prot = ARG3;
3864
3865   ML_(notify_core_and_tool_of_mprotect)(a, len, prot);
3866}
3867
3868PRE(sys_munmap)
3869{
3870   if (0) VG_(printf)("  munmap( %#lx )\n", ARG1);
3871   PRINT("sys_munmap ( %#lx, %llu )", ARG1,(ULong)ARG2);
3872   PRE_REG_READ2(long, "munmap", unsigned long, start, vki_size_t, length);
3873
3874   if (!ML_(valid_client_addr)(ARG1, ARG2, tid, "munmap"))
3875      SET_STATUS_Failure( VKI_EINVAL );
3876}
3877
3878POST(sys_munmap)
3879{
3880   Addr  a   = ARG1;
3881   SizeT len = ARG2;
3882
3883   ML_(notify_core_and_tool_of_munmap)( a, len );
3884}
3885
3886PRE(sys_mincore)
3887{
3888   PRINT("sys_mincore ( %#lx, %llu, %#lx )", ARG1,(ULong)ARG2,ARG3);
3889   PRE_REG_READ3(long, "mincore",
3890                 unsigned long, start, vki_size_t, length,
3891                 unsigned char *, vec);
3892   PRE_MEM_WRITE( "mincore(vec)", ARG3, VG_PGROUNDUP(ARG2) / VKI_PAGE_SIZE );
3893}
3894POST(sys_mincore)
3895{
3896   POST_MEM_WRITE( ARG3, VG_PGROUNDUP(ARG2) / VKI_PAGE_SIZE );
3897}
3898
3899PRE(sys_nanosleep)
3900{
3901   *flags |= SfMayBlock|SfPostOnFail;
3902   PRINT("sys_nanosleep ( %#lx, %#lx )", ARG1,ARG2);
3903   PRE_REG_READ2(long, "nanosleep",
3904                 struct timespec *, req, struct timespec *, rem);
3905   PRE_MEM_READ( "nanosleep(req)", ARG1, sizeof(struct vki_timespec) );
3906   if (ARG2 != 0)
3907      PRE_MEM_WRITE( "nanosleep(rem)", ARG2, sizeof(struct vki_timespec) );
3908}
3909
3910POST(sys_nanosleep)
3911{
3912   vg_assert(SUCCESS || FAILURE);
3913   if (ARG2 != 0 && FAILURE && ERR == VKI_EINTR)
3914      POST_MEM_WRITE( ARG2, sizeof(struct vki_timespec) );
3915}
3916
3917#if defined(VGO_linux) || defined(VGO_solaris)
3918/* Handles the case where the open is of /proc/self/auxv or
3919   /proc/<pid>/auxv, and just gives out a copy of the fd for the
3920   fake file we cooked up at startup (in m_main).  Also, seeks the
3921   cloned fd back to the start.
3922   Returns True if auxv open was handled (status is set). */
3923Bool ML_(handle_auxv_open)(SyscallStatus *status, const HChar *filename,
3924                           int flags)
3925{
3926   HChar  name[30];   // large enough
3927
3928   if (!ML_(safe_to_deref)((const void *) filename, 1))
3929      return False;
3930
3931   /* Opening /proc/<pid>/auxv or /proc/self/auxv? */
3932   VG_(sprintf)(name, "/proc/%d/auxv", VG_(getpid)());
3933   if (!VG_STREQ(filename, name) && !VG_STREQ(filename, "/proc/self/auxv"))
3934      return False;
3935
3936   /* Allow to open the file only for reading. */
3937   if (flags & (VKI_O_WRONLY | VKI_O_RDWR)) {
3938      SET_STATUS_Failure(VKI_EACCES);
3939      return True;
3940   }
3941
3942#  if defined(VGO_solaris)
3943   VG_(sprintf)(name, "/proc/self/fd/%d", VG_(cl_auxv_fd));
3944   SysRes sres = VG_(open)(name, flags, 0);
3945   SET_STATUS_from_SysRes(sres);
3946#  else
3947   SysRes sres = VG_(dup)(VG_(cl_auxv_fd));
3948   SET_STATUS_from_SysRes(sres);
3949   if (!sr_isError(sres)) {
3950      OffT off = VG_(lseek)(sr_Res(sres), 0, VKI_SEEK_SET);
3951      if (off < 0)
3952         SET_STATUS_Failure(VKI_EMFILE);
3953   }
3954#  endif
3955
3956   return True;
3957}
3958#endif // defined(VGO_linux) || defined(VGO_solaris)
3959
3960PRE(sys_open)
3961{
3962   if (ARG2 & VKI_O_CREAT) {
3963      // 3-arg version
3964      PRINT("sys_open ( %#lx(%s), %ld, %ld )",ARG1, (HChar*)ARG1, SARG2, SARG3);
3965      PRE_REG_READ3(long, "open",
3966                    const char *, filename, int, flags, int, mode);
3967   } else {
3968      // 2-arg version
3969      PRINT("sys_open ( %#lx(%s), %ld )",ARG1, (HChar*)ARG1, SARG2);
3970      PRE_REG_READ2(long, "open",
3971                    const char *, filename, int, flags);
3972   }
3973   PRE_MEM_RASCIIZ( "open(filename)", ARG1 );
3974
3975#if defined(VGO_linux)
3976   /* Handle the case where the open is of /proc/self/cmdline or
3977      /proc/<pid>/cmdline, and just give it a copy of the fd for the
3978      fake file we cooked up at startup (in m_main).  Also, seek the
3979      cloned fd back to the start. */
3980   {
3981      HChar  name[30];   // large enough
3982      HChar* arg1s = (HChar*) ARG1;
3983      SysRes sres;
3984
3985      VG_(sprintf)(name, "/proc/%d/cmdline", VG_(getpid)());
3986      if (ML_(safe_to_deref)( arg1s, 1 ) &&
3987          (VG_STREQ(arg1s, name) || VG_STREQ(arg1s, "/proc/self/cmdline"))
3988         )
3989      {
3990         sres = VG_(dup)( VG_(cl_cmdline_fd) );
3991         SET_STATUS_from_SysRes( sres );
3992         if (!sr_isError(sres)) {
3993            OffT off = VG_(lseek)( sr_Res(sres), 0, VKI_SEEK_SET );
3994            if (off < 0)
3995               SET_STATUS_Failure( VKI_EMFILE );
3996         }
3997         return;
3998      }
3999   }
4000
4001   /* Handle also the case of /proc/self/auxv or /proc/<pid>/auxv. */
4002   if (ML_(handle_auxv_open)(status, (const HChar *)ARG1, ARG2))
4003      return;
4004#endif // defined(VGO_linux)
4005
4006   /* Otherwise handle normally */
4007   *flags |= SfMayBlock;
4008}
4009
4010POST(sys_open)
4011{
4012   vg_assert(SUCCESS);
4013   if (!ML_(fd_allowed)(RES, "open", tid, True)) {
4014      VG_(close)(RES);
4015      SET_STATUS_Failure( VKI_EMFILE );
4016   } else {
4017      if (VG_(clo_track_fds))
4018         ML_(record_fd_open_with_given_name)(tid, RES, (HChar*)ARG1);
4019   }
4020}
4021
4022PRE(sys_read)
4023{
4024   *flags |= SfMayBlock;
4025   PRINT("sys_read ( %lu, %#lx, %lu )", ARG1, ARG2, ARG3);
4026   PRE_REG_READ3(ssize_t, "read",
4027                 unsigned int, fd, char *, buf, vki_size_t, count);
4028
4029   if (!ML_(fd_allowed)(ARG1, "read", tid, False))
4030      SET_STATUS_Failure( VKI_EBADF );
4031   else
4032      PRE_MEM_WRITE( "read(buf)", ARG2, ARG3 );
4033}
4034
4035POST(sys_read)
4036{
4037   vg_assert(SUCCESS);
4038   POST_MEM_WRITE( ARG2, RES );
4039}
4040
4041PRE(sys_write)
4042{
4043   Bool ok;
4044   *flags |= SfMayBlock;
4045   PRINT("sys_write ( %lu, %#lx, %lu )", ARG1, ARG2, ARG3);
4046   PRE_REG_READ3(ssize_t, "write",
4047                 unsigned int, fd, const char *, buf, vki_size_t, count);
4048   /* check to see if it is allowed.  If not, try for an exemption from
4049      --sim-hints=enable-outer (used for self hosting). */
4050   ok = ML_(fd_allowed)(ARG1, "write", tid, False);
4051   if (!ok && ARG1 == 2/*stderr*/
4052           && SimHintiS(SimHint_enable_outer, VG_(clo_sim_hints)))
4053      ok = True;
4054#if defined(VGO_solaris)
4055   if (!ok && VG_(vfork_fildes_addr) != NULL &&
4056       *VG_(vfork_fildes_addr) >= 0 && *VG_(vfork_fildes_addr) == ARG1)
4057      ok = True;
4058#endif
4059   if (!ok)
4060      SET_STATUS_Failure( VKI_EBADF );
4061   else
4062      PRE_MEM_READ( "write(buf)", ARG2, ARG3 );
4063}
4064
4065PRE(sys_creat)
4066{
4067   *flags |= SfMayBlock;
4068   PRINT("sys_creat ( %#lx(%s), %ld )", ARG1, (HChar*)ARG1, SARG2);
4069   PRE_REG_READ2(long, "creat", const char *, pathname, int, mode);
4070   PRE_MEM_RASCIIZ( "creat(pathname)", ARG1 );
4071}
4072
4073POST(sys_creat)
4074{
4075   vg_assert(SUCCESS);
4076   if (!ML_(fd_allowed)(RES, "creat", tid, True)) {
4077      VG_(close)(RES);
4078      SET_STATUS_Failure( VKI_EMFILE );
4079   } else {
4080      if (VG_(clo_track_fds))
4081         ML_(record_fd_open_with_given_name)(tid, RES, (HChar*)ARG1);
4082   }
4083}
4084
4085PRE(sys_poll)
4086{
4087   /* struct pollfd {
4088        int fd;           -- file descriptor
4089        short events;     -- requested events
4090        short revents;    -- returned events
4091      };
4092      int poll(struct pollfd *ufds, unsigned int nfds, int timeout)
4093   */
4094   UInt i;
4095   struct vki_pollfd* ufds = (struct vki_pollfd *)ARG1;
4096   *flags |= SfMayBlock;
4097   PRINT("sys_poll ( %#lx, %lu, %ld )\n", ARG1, ARG2, SARG3);
4098   PRE_REG_READ3(long, "poll",
4099                 struct vki_pollfd *, ufds, unsigned int, nfds, long, timeout);
4100
4101   for (i = 0; i < ARG2; i++) {
4102      PRE_MEM_READ( "poll(ufds.fd)",
4103                    (Addr)(&ufds[i].fd), sizeof(ufds[i].fd) );
4104      PRE_MEM_READ( "poll(ufds.events)",
4105                    (Addr)(&ufds[i].events), sizeof(ufds[i].events) );
4106      PRE_MEM_WRITE( "poll(ufds.revents)",
4107                     (Addr)(&ufds[i].revents), sizeof(ufds[i].revents) );
4108   }
4109}
4110
4111POST(sys_poll)
4112{
4113   if (RES >= 0) {
4114      UInt i;
4115      struct vki_pollfd* ufds = (struct vki_pollfd *)ARG1;
4116      for (i = 0; i < ARG2; i++)
4117	 POST_MEM_WRITE( (Addr)(&ufds[i].revents), sizeof(ufds[i].revents) );
4118   }
4119}
4120
4121PRE(sys_readlink)
4122{
4123   FUSE_COMPATIBLE_MAY_BLOCK();
4124   Word saved = SYSNO;
4125
4126   PRINT("sys_readlink ( %#lx(%s), %#lx, %llu )", ARG1,(char*)ARG1,ARG2,(ULong)ARG3);
4127   PRE_REG_READ3(long, "readlink",
4128                 const char *, path, char *, buf, int, bufsiz);
4129   PRE_MEM_RASCIIZ( "readlink(path)", ARG1 );
4130   PRE_MEM_WRITE( "readlink(buf)", ARG2,ARG3 );
4131
4132   {
4133#if defined(VGO_linux)
4134      /*
4135       * Handle the case where readlink is looking at /proc/self/exe or
4136       * /proc/<pid>/exe.
4137       */
4138      HChar  name[30];   // large enough
4139      HChar* arg1s = (HChar*) ARG1;
4140      VG_(sprintf)(name, "/proc/%d/exe", VG_(getpid)());
4141      if (ML_(safe_to_deref)(arg1s, 1) &&
4142          (VG_STREQ(arg1s, name) || VG_STREQ(arg1s, "/proc/self/exe"))
4143         )
4144      {
4145         VG_(sprintf)(name, "/proc/self/fd/%d", VG_(cl_exec_fd));
4146         SET_STATUS_from_SysRes( VG_(do_syscall3)(saved, (UWord)name,
4147                                                         ARG2, ARG3));
4148      } else
4149#elif defined(VGO_solaris)
4150      /* Same for Solaris, but /proc/self/path/a.out and
4151         /proc/<pid>/path/a.out. */
4152      HChar  name[30];   // large enough
4153      HChar* arg1s = (HChar*) ARG1;
4154      VG_(sprintf)(name, "/proc/%d/path/a.out", VG_(getpid)());
4155      if (ML_(safe_to_deref)(arg1s, 1) &&
4156          (VG_STREQ(arg1s, name) || VG_STREQ(arg1s, "/proc/self/path/a.out"))
4157         )
4158      {
4159         VG_(sprintf)(name, "/proc/self/path/%d", VG_(cl_exec_fd));
4160         SET_STATUS_from_SysRes( VG_(do_syscall3)(saved, (UWord)name,
4161                                                         ARG2, ARG3));
4162      } else
4163#endif
4164      {
4165         /* Normal case */
4166         SET_STATUS_from_SysRes( VG_(do_syscall3)(saved, ARG1, ARG2, ARG3));
4167      }
4168   }
4169
4170   if (SUCCESS && RES > 0)
4171      POST_MEM_WRITE( ARG2, RES );
4172}
4173
4174PRE(sys_readv)
4175{
4176   Int i;
4177   struct vki_iovec * vec;
4178   *flags |= SfMayBlock;
4179   PRINT("sys_readv ( %lu, %#lx, %lu )", ARG1, ARG2, ARG3);
4180   PRE_REG_READ3(ssize_t, "readv",
4181                 unsigned long, fd, const struct iovec *, vector,
4182                 unsigned long, count);
4183   if (!ML_(fd_allowed)(ARG1, "readv", tid, False)) {
4184      SET_STATUS_Failure( VKI_EBADF );
4185   } else {
4186      if ((Int)ARG3 >= 0)
4187         PRE_MEM_READ( "readv(vector)", ARG2, ARG3 * sizeof(struct vki_iovec) );
4188
4189      if (ARG2 != 0) {
4190         /* ToDo: don't do any of the following if the vector is invalid */
4191         vec = (struct vki_iovec *)ARG2;
4192         for (i = 0; i < (Int)ARG3; i++)
4193            PRE_MEM_WRITE( "readv(vector[...])",
4194                           (Addr)vec[i].iov_base, vec[i].iov_len );
4195      }
4196   }
4197}
4198
4199POST(sys_readv)
4200{
4201   vg_assert(SUCCESS);
4202   if (RES > 0) {
4203      Int i;
4204      struct vki_iovec * vec = (struct vki_iovec *)ARG2;
4205      Int remains = RES;
4206
4207      /* RES holds the number of bytes read. */
4208      for (i = 0; i < (Int)ARG3; i++) {
4209	 Int nReadThisBuf = vec[i].iov_len;
4210	 if (nReadThisBuf > remains) nReadThisBuf = remains;
4211	 POST_MEM_WRITE( (Addr)vec[i].iov_base, nReadThisBuf );
4212	 remains -= nReadThisBuf;
4213	 if (remains < 0) VG_(core_panic)("readv: remains < 0");
4214      }
4215   }
4216}
4217
4218PRE(sys_rename)
4219{
4220   FUSE_COMPATIBLE_MAY_BLOCK();
4221   PRINT("sys_rename ( %#lx(%s), %#lx(%s) )", ARG1,(char*)ARG1,ARG2,(char*)ARG2);
4222   PRE_REG_READ2(long, "rename", const char *, oldpath, const char *, newpath);
4223   PRE_MEM_RASCIIZ( "rename(oldpath)", ARG1 );
4224   PRE_MEM_RASCIIZ( "rename(newpath)", ARG2 );
4225}
4226
4227PRE(sys_rmdir)
4228{
4229   *flags |= SfMayBlock;
4230   PRINT("sys_rmdir ( %#lx(%s) )", ARG1,(char*)ARG1);
4231   PRE_REG_READ1(long, "rmdir", const char *, pathname);
4232   PRE_MEM_RASCIIZ( "rmdir(pathname)", ARG1 );
4233}
4234
4235PRE(sys_select)
4236{
4237   *flags |= SfMayBlock;
4238   PRINT("sys_select ( %ld, %#lx, %#lx, %#lx, %#lx )", SARG1, ARG2, ARG3,
4239         ARG4, ARG5);
4240   PRE_REG_READ5(long, "select",
4241                 int, n, vki_fd_set *, readfds, vki_fd_set *, writefds,
4242                 vki_fd_set *, exceptfds, struct vki_timeval *, timeout);
4243   // XXX: this possibly understates how much memory is read.
4244   if (ARG2 != 0)
4245      PRE_MEM_READ( "select(readfds)",
4246		     ARG2, ARG1/8 /* __FD_SETSIZE/8 */ );
4247   if (ARG3 != 0)
4248      PRE_MEM_READ( "select(writefds)",
4249		     ARG3, ARG1/8 /* __FD_SETSIZE/8 */ );
4250   if (ARG4 != 0)
4251      PRE_MEM_READ( "select(exceptfds)",
4252		     ARG4, ARG1/8 /* __FD_SETSIZE/8 */ );
4253   if (ARG5 != 0)
4254      PRE_timeval_READ( "select(timeout)", ARG5 );
4255}
4256
4257PRE(sys_setgid)
4258{
4259   PRINT("sys_setgid ( %lu )", ARG1);
4260   PRE_REG_READ1(long, "setgid", vki_gid_t, gid);
4261}
4262
4263PRE(sys_setsid)
4264{
4265   PRINT("sys_setsid ( )");
4266   PRE_REG_READ0(long, "setsid");
4267}
4268
4269PRE(sys_setgroups)
4270{
4271   PRINT("setgroups ( %llu, %#lx )", (ULong)ARG1, ARG2);
4272   PRE_REG_READ2(long, "setgroups", int, size, vki_gid_t *, list);
4273   if (ARG1 > 0)
4274      PRE_MEM_READ( "setgroups(list)", ARG2, ARG1 * sizeof(vki_gid_t) );
4275}
4276
4277PRE(sys_setpgid)
4278{
4279   PRINT("setpgid ( %ld, %ld )", SARG1, SARG2);
4280   PRE_REG_READ2(long, "setpgid", vki_pid_t, pid, vki_pid_t, pgid);
4281}
4282
4283PRE(sys_setregid)
4284{
4285   PRINT("sys_setregid ( %lu, %lu )", ARG1, ARG2);
4286   PRE_REG_READ2(long, "setregid", vki_gid_t, rgid, vki_gid_t, egid);
4287}
4288
4289PRE(sys_setreuid)
4290{
4291   PRINT("sys_setreuid ( 0x%lx, 0x%lx )", ARG1, ARG2);
4292   PRE_REG_READ2(long, "setreuid", vki_uid_t, ruid, vki_uid_t, euid);
4293}
4294
4295PRE(sys_setrlimit)
4296{
4297   UWord arg1 = ARG1;
4298   PRINT("sys_setrlimit ( %lu, %#lx )", ARG1, ARG2);
4299   PRE_REG_READ2(long, "setrlimit",
4300                 unsigned int, resource, struct rlimit *, rlim);
4301   PRE_MEM_READ( "setrlimit(rlim)", ARG2, sizeof(struct vki_rlimit) );
4302
4303#ifdef _RLIMIT_POSIX_FLAG
4304   // Darwin will sometimes set _RLIMIT_POSIX_FLAG on setrlimit calls.
4305   // Unset it here to make the if statements below work correctly.
4306   arg1 &= ~_RLIMIT_POSIX_FLAG;
4307#endif
4308
4309   if (!VG_(am_is_valid_for_client)(ARG2, sizeof(struct vki_rlimit),
4310                                    VKI_PROT_READ)) {
4311      SET_STATUS_Failure( VKI_EFAULT );
4312   }
4313   else if (((struct vki_rlimit *)ARG2)->rlim_cur
4314            > ((struct vki_rlimit *)ARG2)->rlim_max) {
4315      SET_STATUS_Failure( VKI_EINVAL );
4316   }
4317   else if (arg1 == VKI_RLIMIT_NOFILE) {
4318      if (((struct vki_rlimit *)ARG2)->rlim_cur > VG_(fd_hard_limit) ||
4319          ((struct vki_rlimit *)ARG2)->rlim_max != VG_(fd_hard_limit)) {
4320         SET_STATUS_Failure( VKI_EPERM );
4321      }
4322      else {
4323         VG_(fd_soft_limit) = ((struct vki_rlimit *)ARG2)->rlim_cur;
4324         SET_STATUS_Success( 0 );
4325      }
4326   }
4327   else if (arg1 == VKI_RLIMIT_DATA) {
4328      if (((struct vki_rlimit *)ARG2)->rlim_cur > VG_(client_rlimit_data).rlim_max ||
4329          ((struct vki_rlimit *)ARG2)->rlim_max > VG_(client_rlimit_data).rlim_max) {
4330         SET_STATUS_Failure( VKI_EPERM );
4331      }
4332      else {
4333         VG_(client_rlimit_data) = *(struct vki_rlimit *)ARG2;
4334         SET_STATUS_Success( 0 );
4335      }
4336   }
4337   else if (arg1 == VKI_RLIMIT_STACK && tid == 1) {
4338      if (((struct vki_rlimit *)ARG2)->rlim_cur > VG_(client_rlimit_stack).rlim_max ||
4339          ((struct vki_rlimit *)ARG2)->rlim_max > VG_(client_rlimit_stack).rlim_max) {
4340         SET_STATUS_Failure( VKI_EPERM );
4341      }
4342      else {
4343         /* Change the value of client_stack_szB to the rlim_cur value but
4344            only if it is smaller than the size of the allocated stack for the
4345            client.
4346            TODO: All platforms should set VG_(clstk_max_size) as part of their
4347                  setup_client_stack(). */
4348         if ((VG_(clstk_max_size) == 0)
4349             || (((struct vki_rlimit *) ARG2)->rlim_cur <= VG_(clstk_max_size)))
4350            VG_(threads)[tid].client_stack_szB = ((struct vki_rlimit *)ARG2)->rlim_cur;
4351
4352         VG_(client_rlimit_stack) = *(struct vki_rlimit *)ARG2;
4353         SET_STATUS_Success( 0 );
4354      }
4355   }
4356}
4357
4358PRE(sys_setuid)
4359{
4360   PRINT("sys_setuid ( %lu )", ARG1);
4361   PRE_REG_READ1(long, "setuid", vki_uid_t, uid);
4362}
4363
4364PRE(sys_newstat)
4365{
4366   FUSE_COMPATIBLE_MAY_BLOCK();
4367   PRINT("sys_newstat ( %#lx(%s), %#lx )", ARG1,(char*)ARG1,ARG2);
4368   PRE_REG_READ2(long, "stat", char *, file_name, struct stat *, buf);
4369   PRE_MEM_RASCIIZ( "stat(file_name)", ARG1 );
4370   PRE_MEM_WRITE( "stat(buf)", ARG2, sizeof(struct vki_stat) );
4371}
4372
4373POST(sys_newstat)
4374{
4375   POST_MEM_WRITE( ARG2, sizeof(struct vki_stat) );
4376}
4377
4378PRE(sys_statfs)
4379{
4380   FUSE_COMPATIBLE_MAY_BLOCK();
4381   PRINT("sys_statfs ( %#lx(%s), %#lx )",ARG1,(char*)ARG1,ARG2);
4382   PRE_REG_READ2(long, "statfs", const char *, path, struct statfs *, buf);
4383   PRE_MEM_RASCIIZ( "statfs(path)", ARG1 );
4384   PRE_MEM_WRITE( "statfs(buf)", ARG2, sizeof(struct vki_statfs) );
4385}
4386POST(sys_statfs)
4387{
4388   POST_MEM_WRITE( ARG2, sizeof(struct vki_statfs) );
4389}
4390
4391PRE(sys_statfs64)
4392{
4393   PRINT("sys_statfs64 ( %#lx(%s), %llu, %#lx )",ARG1,(char*)ARG1,(ULong)ARG2,ARG3);
4394   PRE_REG_READ3(long, "statfs64",
4395                 const char *, path, vki_size_t, size, struct statfs64 *, buf);
4396   PRE_MEM_RASCIIZ( "statfs64(path)", ARG1 );
4397   PRE_MEM_WRITE( "statfs64(buf)", ARG3, ARG2 );
4398}
4399POST(sys_statfs64)
4400{
4401   POST_MEM_WRITE( ARG3, ARG2 );
4402}
4403
4404PRE(sys_symlink)
4405{
4406   *flags |= SfMayBlock;
4407   PRINT("sys_symlink ( %#lx(%s), %#lx(%s) )",ARG1,(char*)ARG1,ARG2,(char*)ARG2);
4408   PRE_REG_READ2(long, "symlink", const char *, oldpath, const char *, newpath);
4409   PRE_MEM_RASCIIZ( "symlink(oldpath)", ARG1 );
4410   PRE_MEM_RASCIIZ( "symlink(newpath)", ARG2 );
4411}
4412
4413PRE(sys_time)
4414{
4415   /* time_t time(time_t *t); */
4416   PRINT("sys_time ( %#lx )",ARG1);
4417   PRE_REG_READ1(long, "time", int *, t);
4418   if (ARG1 != 0) {
4419      PRE_MEM_WRITE( "time(t)", ARG1, sizeof(vki_time_t) );
4420   }
4421}
4422
4423POST(sys_time)
4424{
4425   if (ARG1 != 0) {
4426      POST_MEM_WRITE( ARG1, sizeof(vki_time_t) );
4427   }
4428}
4429
4430PRE(sys_times)
4431{
4432   PRINT("sys_times ( %#lx )", ARG1);
4433   PRE_REG_READ1(long, "times", struct tms *, buf);
4434   if (ARG1 != 0) {
4435      PRE_MEM_WRITE( "times(buf)", ARG1, sizeof(struct vki_tms) );
4436   }
4437}
4438
4439POST(sys_times)
4440{
4441   if (ARG1 != 0) {
4442      POST_MEM_WRITE( ARG1, sizeof(struct vki_tms) );
4443   }
4444}
4445
4446PRE(sys_umask)
4447{
4448   PRINT("sys_umask ( %ld )", SARG1);
4449   PRE_REG_READ1(long, "umask", int, mask);
4450}
4451
4452PRE(sys_unlink)
4453{
4454   *flags |= SfMayBlock;
4455   PRINT("sys_unlink ( %#lx(%s) )", ARG1,(char*)ARG1);
4456   PRE_REG_READ1(long, "unlink", const char *, pathname);
4457   PRE_MEM_RASCIIZ( "unlink(pathname)", ARG1 );
4458}
4459
4460PRE(sys_newuname)
4461{
4462   PRINT("sys_newuname ( %#lx )", ARG1);
4463   PRE_REG_READ1(long, "uname", struct new_utsname *, buf);
4464   PRE_MEM_WRITE( "uname(buf)", ARG1, sizeof(struct vki_new_utsname) );
4465}
4466
4467POST(sys_newuname)
4468{
4469   if (ARG1 != 0) {
4470      POST_MEM_WRITE( ARG1, sizeof(struct vki_new_utsname) );
4471   }
4472}
4473
4474PRE(sys_waitpid)
4475{
4476   *flags |= SfMayBlock;
4477   PRINT("sys_waitpid ( %ld, %#lx, %ld )", SARG1, ARG2, SARG3);
4478   PRE_REG_READ3(long, "waitpid",
4479                 vki_pid_t, pid, unsigned int *, status, int, options);
4480
4481   if (ARG2 != (Addr)NULL)
4482      PRE_MEM_WRITE( "waitpid(status)", ARG2, sizeof(int) );
4483}
4484
4485POST(sys_waitpid)
4486{
4487   if (ARG2 != (Addr)NULL)
4488      POST_MEM_WRITE( ARG2, sizeof(int) );
4489}
4490
4491PRE(sys_wait4)
4492{
4493   *flags |= SfMayBlock;
4494   PRINT("sys_wait4 ( %ld, %#lx, %ld, %#lx )", SARG1, ARG2, SARG3, ARG4);
4495
4496   PRE_REG_READ4(long, "wait4",
4497                 vki_pid_t, pid, unsigned int *, status, int, options,
4498                 struct rusage *, rusage);
4499   if (ARG2 != (Addr)NULL)
4500      PRE_MEM_WRITE( "wait4(status)", ARG2, sizeof(int) );
4501   if (ARG4 != (Addr)NULL)
4502      PRE_MEM_WRITE( "wait4(rusage)", ARG4, sizeof(struct vki_rusage) );
4503}
4504
4505POST(sys_wait4)
4506{
4507   if (ARG2 != (Addr)NULL)
4508      POST_MEM_WRITE( ARG2, sizeof(int) );
4509   if (ARG4 != (Addr)NULL)
4510      POST_MEM_WRITE( ARG4, sizeof(struct vki_rusage) );
4511}
4512
4513PRE(sys_writev)
4514{
4515   Int i;
4516   struct vki_iovec * vec;
4517   *flags |= SfMayBlock;
4518   PRINT("sys_writev ( %lu, %#lx, %lu )", ARG1, ARG2, ARG3);
4519   PRE_REG_READ3(ssize_t, "writev",
4520                 unsigned long, fd, const struct iovec *, vector,
4521                 unsigned long, count);
4522   if (!ML_(fd_allowed)(ARG1, "writev", tid, False)) {
4523      SET_STATUS_Failure( VKI_EBADF );
4524   } else {
4525      if ((Int)ARG3 >= 0)
4526         PRE_MEM_READ( "writev(vector)",
4527                       ARG2, ARG3 * sizeof(struct vki_iovec) );
4528      if (ARG2 != 0) {
4529         /* ToDo: don't do any of the following if the vector is invalid */
4530         vec = (struct vki_iovec *)ARG2;
4531         for (i = 0; i < (Int)ARG3; i++)
4532            PRE_MEM_READ( "writev(vector[...])",
4533                           (Addr)vec[i].iov_base, vec[i].iov_len );
4534      }
4535   }
4536}
4537
4538PRE(sys_utimes)
4539{
4540   FUSE_COMPATIBLE_MAY_BLOCK();
4541   PRINT("sys_utimes ( %#lx(%s), %#lx )", ARG1,(char*)ARG1,ARG2);
4542   PRE_REG_READ2(long, "utimes", char *, filename, struct timeval *, tvp);
4543   PRE_MEM_RASCIIZ( "utimes(filename)", ARG1 );
4544   if (ARG2 != 0) {
4545      PRE_timeval_READ( "utimes(tvp[0])", ARG2 );
4546      PRE_timeval_READ( "utimes(tvp[1])", ARG2+sizeof(struct vki_timeval) );
4547   }
4548}
4549
4550PRE(sys_acct)
4551{
4552   PRINT("sys_acct ( %#lx(%s) )", ARG1,(char*)ARG1);
4553   PRE_REG_READ1(long, "acct", const char *, filename);
4554   PRE_MEM_RASCIIZ( "acct(filename)", ARG1 );
4555}
4556
4557PRE(sys_pause)
4558{
4559   *flags |= SfMayBlock;
4560   PRINT("sys_pause ( )");
4561   PRE_REG_READ0(long, "pause");
4562}
4563
4564PRE(sys_sigaltstack)
4565{
4566   PRINT("sigaltstack ( %#lx, %#lx )",ARG1,ARG2);
4567   PRE_REG_READ2(int, "sigaltstack",
4568                 const vki_stack_t *, ss, vki_stack_t *, oss);
4569   if (ARG1 != 0) {
4570      const vki_stack_t *ss = (vki_stack_t *)ARG1;
4571      PRE_MEM_READ( "sigaltstack(ss)", (Addr)&ss->ss_sp, sizeof(ss->ss_sp) );
4572      PRE_MEM_READ( "sigaltstack(ss)", (Addr)&ss->ss_flags, sizeof(ss->ss_flags) );
4573      PRE_MEM_READ( "sigaltstack(ss)", (Addr)&ss->ss_size, sizeof(ss->ss_size) );
4574   }
4575   if (ARG2 != 0) {
4576      PRE_MEM_WRITE( "sigaltstack(oss)", ARG2, sizeof(vki_stack_t) );
4577   }
4578
4579   /* Be safe. */
4580   if (ARG1 && !ML_(safe_to_deref((void*)ARG1, sizeof(vki_stack_t)))) {
4581      SET_STATUS_Failure(VKI_EFAULT);
4582      return;
4583   }
4584   if (ARG2 && !ML_(safe_to_deref((void*)ARG2, sizeof(vki_stack_t)))) {
4585      SET_STATUS_Failure(VKI_EFAULT);
4586      return;
4587   }
4588
4589   SET_STATUS_from_SysRes(
4590      VG_(do_sys_sigaltstack) (tid, (vki_stack_t*)ARG1,
4591                              (vki_stack_t*)ARG2)
4592   );
4593}
4594POST(sys_sigaltstack)
4595{
4596   vg_assert(SUCCESS);
4597   if (RES == 0 && ARG2 != 0)
4598      POST_MEM_WRITE( ARG2, sizeof(vki_stack_t));
4599}
4600
4601PRE(sys_sethostname)
4602{
4603   PRINT("sys_sethostname ( %#lx, %ld )", ARG1, SARG2);
4604   PRE_REG_READ2(long, "sethostname", char *, name, int, len);
4605   PRE_MEM_READ( "sethostname(name)", ARG1, ARG2 );
4606}
4607
4608#undef PRE
4609#undef POST
4610
4611#endif // defined(VGO_linux) || defined(VGO_darwin) || defined(VGO_solaris)
4612
4613/*--------------------------------------------------------------------*/
4614/*--- end                                                          ---*/
4615/*--------------------------------------------------------------------*/
4616