1/* -*- mode: C; c-basic-offset: 3; -*- */
2
3/*--------------------------------------------------------------------*/
4/*--- Wrappers for generic Unix system calls                       ---*/
5/*---                                            syswrap-generic.c ---*/
6/*--------------------------------------------------------------------*/
7
8/*
9   This file is part of Valgrind, a dynamic binary instrumentation
10   framework.
11
12   Copyright (C) 2000-2017 Julian Seward
13      jseward@acm.org
14
15   This program is free software; you can redistribute it and/or
16   modify it under the terms of the GNU General Public License as
17   published by the Free Software Foundation; either version 2 of the
18   License, or (at your option) any later version.
19
20   This program is distributed in the hope that it will be useful, but
21   WITHOUT ANY WARRANTY; without even the implied warranty of
22   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
23   General Public License for more details.
24
25   You should have received a copy of the GNU General Public License
26   along with this program; if not, write to the Free Software
27   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
28   02111-1307, USA.
29
30   The GNU General Public License is contained in the file COPYING.
31*/
32
33#if defined(VGO_linux) || defined(VGO_darwin) || defined(VGO_solaris)
34
35#include "pub_core_basics.h"
36#include "pub_core_vki.h"
37#include "pub_core_vkiscnums.h"
38#include "pub_core_threadstate.h"
39#include "pub_core_debuginfo.h"     // VG_(di_notify_*)
40#include "pub_core_aspacemgr.h"
41#include "pub_core_transtab.h"      // VG_(discard_translations)
42#include "pub_core_xarray.h"
43#include "pub_core_clientstate.h"   // VG_(brk_base), VG_(brk_limit)
44#include "pub_core_debuglog.h"
45#include "pub_core_errormgr.h"
46#include "pub_core_gdbserver.h"     // VG_(gdbserver)
47#include "pub_core_libcbase.h"
48#include "pub_core_libcassert.h"
49#include "pub_core_libcfile.h"
50#include "pub_core_libcprint.h"
51#include "pub_core_libcproc.h"
52#include "pub_core_libcsignal.h"
53#include "pub_core_machine.h"       // VG_(get_SP)
54#include "pub_core_mallocfree.h"
55#include "pub_core_options.h"
56#include "pub_core_scheduler.h"
57#include "pub_core_signals.h"
58#include "pub_core_stacktrace.h"    // For VG_(get_and_pp_StackTrace)()
59#include "pub_core_syscall.h"
60#include "pub_core_syswrap.h"
61#include "pub_core_tooliface.h"
62#include "pub_core_ume.h"
63#include "pub_core_stacks.h"
64
65#include "priv_types_n_macros.h"
66#include "priv_syswrap-generic.h"
67
68#include "config.h"
69
70
71void ML_(guess_and_register_stack) (Addr sp, ThreadState* tst)
72{
73   Bool debug = False;
74   NSegment const* seg;
75
76   /* We don't really know where the client stack is, because its
77      allocated by the client.  The best we can do is look at the
78      memory mappings and try to derive some useful information.  We
79      assume that sp starts near its highest possible value, and can
80      only go down to the start of the mmaped segment. */
81   seg = VG_(am_find_nsegment)(sp);
82   if (seg
83       && VG_(am_is_valid_for_client)(sp, 1, VKI_PROT_READ | VKI_PROT_WRITE)) {
84      tst->client_stack_highest_byte = (Addr)VG_PGROUNDUP(sp)-1;
85      tst->client_stack_szB = tst->client_stack_highest_byte - seg->start + 1;
86
87      tst->os_state.stk_id
88         = VG_(register_stack)(seg->start, tst->client_stack_highest_byte);
89
90      if (debug)
91	 VG_(printf)("tid %u: guessed client stack range [%#lx-%#lx]"
92                     " as stk_id %lu\n",
93		     tst->tid, seg->start, tst->client_stack_highest_byte,
94                     tst->os_state.stk_id);
95   } else {
96      VG_(message)(Vg_UserMsg,
97                   "!? New thread %u starts with SP(%#lx) unmapped\n",
98		   tst->tid, sp);
99      tst->client_stack_highest_byte = 0;
100      tst->client_stack_szB  = 0;
101   }
102}
103
104/* Returns True iff address range is something the client can
105   plausibly mess with: all of it is either already belongs to the
106   client or is free or a reservation. */
107
108Bool ML_(valid_client_addr)(Addr start, SizeT size, ThreadId tid,
109                                   const HChar *syscallname)
110{
111   Bool ret;
112
113   if (size == 0)
114      return True;
115
116   ret = VG_(am_is_valid_for_client_or_free_or_resvn)
117            (start,size,VKI_PROT_NONE);
118
119   if (0)
120      VG_(printf)("%s: test=%#lx-%#lx ret=%d\n",
121		  syscallname, start, start+size-1, (Int)ret);
122
123   if (!ret && syscallname != NULL) {
124      VG_(message)(Vg_UserMsg, "Warning: client syscall %s tried "
125                               "to modify addresses %#lx-%#lx\n",
126                               syscallname, start, start+size-1);
127      if (VG_(clo_verbosity) > 1) {
128         VG_(get_and_pp_StackTrace)(tid, VG_(clo_backtrace_size));
129      }
130   }
131
132   return ret;
133}
134
135
136Bool ML_(client_signal_OK)(Int sigNo)
137{
138   /* signal 0 is OK for kill */
139   Bool ret = sigNo >= 0 && sigNo <= VG_SIGVGRTUSERMAX;
140
141   //VG_(printf)("client_signal_OK(%d) -> %d\n", sigNo, ret);
142
143   return ret;
144}
145
146
147/* Handy small function to help stop wrappers from segfaulting when
148   presented with bogus client addresses.  Is not used for generating
149   user-visible errors. */
150
151Bool ML_(safe_to_deref) ( const void *start, SizeT size )
152{
153   return VG_(am_is_valid_for_client)( (Addr)start, size, VKI_PROT_READ );
154}
155
156
157/* ---------------------------------------------------------------------
158   Doing mmap, mremap
159   ------------------------------------------------------------------ */
160
161/* AFAICT from kernel sources (mm/mprotect.c) and general experimentation,
162   munmap, mprotect (and mremap??) work at the page level.  So addresses
163   and lengths must be adjusted for this. */
164
165/* Mash around start and length so that the area exactly covers
166   an integral number of pages.  If we don't do that, memcheck's
167   idea of addressible memory diverges from that of the
168   kernel's, which causes the leak detector to crash. */
169static
170void page_align_addr_and_len( Addr* a, SizeT* len)
171{
172   Addr ra;
173
174   ra = VG_PGROUNDDN(*a);
175   *len = VG_PGROUNDUP(*a + *len) - ra;
176   *a = ra;
177}
178
179static void notify_core_of_mmap(Addr a, SizeT len, UInt prot,
180                                UInt flags, Int fd, Off64T offset)
181{
182   Bool d;
183
184   /* 'a' is the return value from a real kernel mmap, hence: */
185   vg_assert(VG_IS_PAGE_ALIGNED(a));
186   /* whereas len is whatever the syscall supplied.  So: */
187   len = VG_PGROUNDUP(len);
188
189   d = VG_(am_notify_client_mmap)( a, len, prot, flags, fd, offset );
190
191   if (d)
192      VG_(discard_translations)( a, (ULong)len,
193                                 "notify_core_of_mmap" );
194}
195
196static void notify_tool_of_mmap(Addr a, SizeT len, UInt prot, ULong di_handle)
197{
198   Bool rr, ww, xx;
199
200   /* 'a' is the return value from a real kernel mmap, hence: */
201   vg_assert(VG_IS_PAGE_ALIGNED(a));
202   /* whereas len is whatever the syscall supplied.  So: */
203   len = VG_PGROUNDUP(len);
204
205   rr = toBool(prot & VKI_PROT_READ);
206   ww = toBool(prot & VKI_PROT_WRITE);
207   xx = toBool(prot & VKI_PROT_EXEC);
208
209   VG_TRACK( new_mem_mmap, a, len, rr, ww, xx, di_handle );
210}
211
212
213/* When a client mmap has been successfully done, this function must
214   be called.  It notifies both aspacem and the tool of the new
215   mapping.
216
217   JRS 2008-Aug-14: But notice this is *very* obscure.  The only place
218   it is called from is POST(sys_io_setup).  In particular,
219   ML_(generic_PRE_sys_mmap), in m_syswrap, is the "normal case" handler for
220   client mmap.  But it doesn't call this function; instead it does the
221   relevant notifications itself.  Here, we just pass di_handle=0 to
222   notify_tool_of_mmap as we have no better information.  But really this
223   function should be done away with; problem is I don't understand what
224   POST(sys_io_setup) does or how it works.
225
226   [However, this function is used lots for Darwin, because
227    ML_(generic_PRE_sys_mmap) cannot be used for Darwin.]
228 */
229void
230ML_(notify_core_and_tool_of_mmap) ( Addr a, SizeT len, UInt prot,
231                                    UInt flags, Int fd, Off64T offset )
232{
233   // XXX: unlike the other notify_core_and_tool* functions, this one doesn't
234   // do anything with debug info (ie. it doesn't call VG_(di_notify_mmap)).
235   // Should it?  --njn
236   notify_core_of_mmap(a, len, prot, flags, fd, offset);
237   notify_tool_of_mmap(a, len, prot, 0/*di_handle*/);
238}
239
240void
241ML_(notify_core_and_tool_of_munmap) ( Addr a, SizeT len )
242{
243   Bool d;
244
245   page_align_addr_and_len(&a, &len);
246   d = VG_(am_notify_munmap)(a, len);
247   VG_TRACK( die_mem_munmap, a, len );
248   VG_(di_notify_munmap)( a, len );
249   if (d)
250      VG_(discard_translations)( a, (ULong)len,
251                                 "ML_(notify_core_and_tool_of_munmap)" );
252}
253
254void
255ML_(notify_core_and_tool_of_mprotect) ( Addr a, SizeT len, Int prot )
256{
257   Bool rr = toBool(prot & VKI_PROT_READ);
258   Bool ww = toBool(prot & VKI_PROT_WRITE);
259   Bool xx = toBool(prot & VKI_PROT_EXEC);
260   Bool d;
261
262   page_align_addr_and_len(&a, &len);
263   d = VG_(am_notify_mprotect)(a, len, prot);
264   VG_TRACK( change_mem_mprotect, a, len, rr, ww, xx );
265   VG_(di_notify_mprotect)( a, len, prot );
266   if (d)
267      VG_(discard_translations)( a, (ULong)len,
268                                 "ML_(notify_core_and_tool_of_mprotect)" );
269}
270
271
272
273#if HAVE_MREMAP
274/* Expand (or shrink) an existing mapping, potentially moving it at
275   the same time (controlled by the MREMAP_MAYMOVE flag).  Nightmare.
276*/
277static
278SysRes do_mremap( Addr old_addr, SizeT old_len,
279                  Addr new_addr, SizeT new_len,
280                  UWord flags, ThreadId tid )
281{
282#  define MIN_SIZET(_aa,_bb) (_aa) < (_bb) ? (_aa) : (_bb)
283
284   Bool      ok, d;
285   NSegment const* old_seg;
286   Addr      advised;
287   Bool      f_fixed   = toBool(flags & VKI_MREMAP_FIXED);
288   Bool      f_maymove = toBool(flags & VKI_MREMAP_MAYMOVE);
289
290   if (0)
291      VG_(printf)("do_remap (old %#lx %lu) (new %#lx %lu) %s %s\n",
292                  old_addr,old_len,new_addr,new_len,
293                  flags & VKI_MREMAP_MAYMOVE ? "MAYMOVE" : "",
294                  flags & VKI_MREMAP_FIXED ? "FIXED" : "");
295   if (0)
296      VG_(am_show_nsegments)(0, "do_remap: before");
297
298   if (flags & ~(VKI_MREMAP_FIXED | VKI_MREMAP_MAYMOVE))
299      goto eINVAL;
300
301   if (!VG_IS_PAGE_ALIGNED(old_addr))
302      goto eINVAL;
303
304   old_len = VG_PGROUNDUP(old_len);
305   new_len = VG_PGROUNDUP(new_len);
306
307   if (new_len == 0)
308      goto eINVAL;
309
310   /* kernel doesn't reject this, but we do. */
311   if (old_len == 0)
312      goto eINVAL;
313
314   /* reject wraparounds */
315   if (old_addr + old_len < old_addr)
316      goto eINVAL;
317   if (f_fixed == True && new_addr + new_len < new_len)
318      goto eINVAL;
319
320   /* kernel rejects all fixed, no-move requests (which are
321      meaningless). */
322   if (f_fixed == True && f_maymove == False)
323      goto eINVAL;
324
325   /* Stay away from non-client areas. */
326   if (!ML_(valid_client_addr)(old_addr, old_len, tid, "mremap(old_addr)"))
327      goto eINVAL;
328
329   /* In all remaining cases, if the old range does not fall within a
330      single segment, fail. */
331   old_seg = VG_(am_find_nsegment)( old_addr );
332   if (old_addr < old_seg->start || old_addr+old_len-1 > old_seg->end)
333      goto eINVAL;
334   if (old_seg->kind != SkAnonC && old_seg->kind != SkFileC
335       && old_seg->kind != SkShmC)
336      goto eINVAL;
337
338   vg_assert(old_len > 0);
339   vg_assert(new_len > 0);
340   vg_assert(VG_IS_PAGE_ALIGNED(old_len));
341   vg_assert(VG_IS_PAGE_ALIGNED(new_len));
342   vg_assert(VG_IS_PAGE_ALIGNED(old_addr));
343
344   /* There are 3 remaining cases:
345
346      * maymove == False
347
348        new space has to be at old address, so:
349            - shrink    -> unmap end
350            - same size -> do nothing
351            - grow      -> if can grow in-place, do so, else fail
352
353      * maymove == True, fixed == False
354
355        new space can be anywhere, so:
356            - shrink    -> unmap end
357            - same size -> do nothing
358            - grow      -> if can grow in-place, do so, else
359                           move to anywhere large enough, else fail
360
361      * maymove == True, fixed == True
362
363        new space must be at new address, so:
364
365            - if new address is not page aligned, fail
366            - if new address range overlaps old one, fail
367            - if new address range cannot be allocated, fail
368            - else move to new address range with new size
369            - else fail
370   */
371
372   if (f_maymove == False) {
373      /* new space has to be at old address */
374      if (new_len < old_len)
375         goto shrink_in_place;
376      if (new_len > old_len)
377         goto grow_in_place_or_fail;
378      goto same_in_place;
379   }
380
381   if (f_maymove == True && f_fixed == False) {
382      /* new space can be anywhere */
383      if (new_len < old_len)
384         goto shrink_in_place;
385      if (new_len > old_len)
386         goto grow_in_place_or_move_anywhere_or_fail;
387      goto same_in_place;
388   }
389
390   if (f_maymove == True && f_fixed == True) {
391      /* new space can only be at the new address */
392      if (!VG_IS_PAGE_ALIGNED(new_addr))
393         goto eINVAL;
394      if (new_addr+new_len-1 < old_addr || new_addr > old_addr+old_len-1) {
395         /* no overlap */
396      } else {
397         goto eINVAL;
398      }
399      if (new_addr == 0)
400         goto eINVAL;
401         /* VG_(am_get_advisory_client_simple) interprets zero to mean
402            non-fixed, which is not what we want */
403      advised = VG_(am_get_advisory_client_simple)(new_addr, new_len, &ok);
404      if (!ok || advised != new_addr)
405         goto eNOMEM;
406      ok = VG_(am_relocate_nooverlap_client)
407              ( &d, old_addr, old_len, new_addr, new_len );
408      if (ok) {
409         VG_TRACK( copy_mem_remap, old_addr, new_addr,
410                                   MIN_SIZET(old_len,new_len) );
411         if (new_len > old_len)
412            VG_TRACK( new_mem_mmap, new_addr+old_len, new_len-old_len,
413                      old_seg->hasR, old_seg->hasW, old_seg->hasX,
414                      0/*di_handle*/ );
415         VG_TRACK(die_mem_munmap, old_addr, old_len);
416         if (d) {
417            VG_(discard_translations)( old_addr, old_len, "do_remap(1)" );
418            VG_(discard_translations)( new_addr, new_len, "do_remap(2)" );
419         }
420         return VG_(mk_SysRes_Success)( new_addr );
421      }
422      goto eNOMEM;
423   }
424
425   /* end of the 3 cases */
426   /*NOTREACHED*/ vg_assert(0);
427
428  grow_in_place_or_move_anywhere_or_fail:
429   {
430   /* try growing it in-place */
431   Addr   needA = old_addr + old_len;
432   SSizeT needL = new_len - old_len;
433
434   vg_assert(needL > 0);
435   vg_assert(needA > 0);
436
437   advised = VG_(am_get_advisory_client_simple)( needA, needL, &ok );
438   if (ok) {
439      /* Fixes bug #129866. */
440      ok = VG_(am_covered_by_single_free_segment) ( needA, needL );
441   }
442   if (ok && advised == needA) {
443      const NSegment *new_seg = VG_(am_extend_map_client)( old_addr, needL );
444      if (new_seg) {
445         VG_TRACK( new_mem_mmap, needA, needL,
446                                 new_seg->hasR,
447                                 new_seg->hasW, new_seg->hasX,
448                                 0/*di_handle*/ );
449         return VG_(mk_SysRes_Success)( old_addr );
450      }
451   }
452
453   /* that failed.  Look elsewhere. */
454   advised = VG_(am_get_advisory_client_simple)( 0, new_len, &ok );
455   if (ok) {
456      Bool oldR = old_seg->hasR;
457      Bool oldW = old_seg->hasW;
458      Bool oldX = old_seg->hasX;
459      /* assert new area does not overlap old */
460      vg_assert(advised+new_len-1 < old_addr
461                || advised > old_addr+old_len-1);
462      ok = VG_(am_relocate_nooverlap_client)
463              ( &d, old_addr, old_len, advised, new_len );
464      if (ok) {
465         VG_TRACK( copy_mem_remap, old_addr, advised,
466                                   MIN_SIZET(old_len,new_len) );
467         if (new_len > old_len)
468            VG_TRACK( new_mem_mmap, advised+old_len, new_len-old_len,
469                      oldR, oldW, oldX, 0/*di_handle*/ );
470         VG_TRACK(die_mem_munmap, old_addr, old_len);
471         if (d) {
472            VG_(discard_translations)( old_addr, old_len, "do_remap(4)" );
473            VG_(discard_translations)( advised, new_len, "do_remap(5)" );
474         }
475         return VG_(mk_SysRes_Success)( advised );
476      }
477   }
478   goto eNOMEM;
479   }
480   /*NOTREACHED*/ vg_assert(0);
481
482  grow_in_place_or_fail:
483   {
484   Addr  needA = old_addr + old_len;
485   SizeT needL = new_len - old_len;
486
487   vg_assert(needA > 0);
488
489   advised = VG_(am_get_advisory_client_simple)( needA, needL, &ok );
490   if (ok) {
491      /* Fixes bug #129866. */
492      ok = VG_(am_covered_by_single_free_segment) ( needA, needL );
493   }
494   if (!ok || advised != needA)
495      goto eNOMEM;
496   const NSegment *new_seg = VG_(am_extend_map_client)( old_addr, needL );
497   if (!new_seg)
498      goto eNOMEM;
499   VG_TRACK( new_mem_mmap, needA, needL,
500                           new_seg->hasR, new_seg->hasW, new_seg->hasX,
501                           0/*di_handle*/ );
502
503   return VG_(mk_SysRes_Success)( old_addr );
504   }
505   /*NOTREACHED*/ vg_assert(0);
506
507  shrink_in_place:
508   {
509   SysRes sres = VG_(am_munmap_client)( &d, old_addr+new_len, old_len-new_len );
510   if (sr_isError(sres))
511      return sres;
512   VG_TRACK( die_mem_munmap, old_addr+new_len, old_len-new_len );
513   if (d)
514      VG_(discard_translations)( old_addr+new_len, old_len-new_len,
515                                 "do_remap(7)" );
516   return VG_(mk_SysRes_Success)( old_addr );
517   }
518   /*NOTREACHED*/ vg_assert(0);
519
520  same_in_place:
521   return VG_(mk_SysRes_Success)( old_addr );
522   /*NOTREACHED*/ vg_assert(0);
523
524  eINVAL:
525   return VG_(mk_SysRes_Error)( VKI_EINVAL );
526  eNOMEM:
527   return VG_(mk_SysRes_Error)( VKI_ENOMEM );
528
529#  undef MIN_SIZET
530}
531#endif /* HAVE_MREMAP */
532
533
534/* ---------------------------------------------------------------------
535   File-descriptor tracking
536   ------------------------------------------------------------------ */
537
538/* One of these is allocated for each open file descriptor.  */
539typedef struct OpenFd
540{
541   Int fd;                        /* The file descriptor */
542   HChar *pathname;               /* NULL if not a regular file or unknown */
543   ExeContext *where;             /* NULL if inherited from parent */
544   struct OpenFd *next, *prev;
545} OpenFd;
546
547/* List of allocated file descriptors. */
548static OpenFd *allocated_fds = NULL;
549
550/* Count of open file descriptors. */
551static Int fd_count = 0;
552
553
554/* Note the fact that a file descriptor was just closed. */
555void ML_(record_fd_close)(Int fd)
556{
557   OpenFd *i = allocated_fds;
558
559   if (fd >= VG_(fd_hard_limit))
560      return;			/* Valgrind internal */
561
562   while(i) {
563      if(i->fd == fd) {
564         if(i->prev)
565            i->prev->next = i->next;
566         else
567            allocated_fds = i->next;
568         if(i->next)
569            i->next->prev = i->prev;
570         if(i->pathname)
571            VG_(free) (i->pathname);
572         VG_(free) (i);
573         fd_count--;
574         break;
575      }
576      i = i->next;
577   }
578}
579
580/* Note the fact that a file descriptor was just opened.  If the
581   tid is -1, this indicates an inherited fd.  If the pathname is NULL,
582   this either indicates a non-standard file (i.e. a pipe or socket or
583   some such thing) or that we don't know the filename.  If the fd is
584   already open, then we're probably doing a dup2() to an existing fd,
585   so just overwrite the existing one. */
586void ML_(record_fd_open_with_given_name)(ThreadId tid, Int fd,
587                                         const HChar *pathname)
588{
589   OpenFd *i;
590
591   if (fd >= VG_(fd_hard_limit))
592      return;			/* Valgrind internal */
593
594   /* Check to see if this fd is already open. */
595   i = allocated_fds;
596   while (i) {
597      if (i->fd == fd) {
598         if (i->pathname) VG_(free)(i->pathname);
599         break;
600      }
601      i = i->next;
602   }
603
604   /* Not already one: allocate an OpenFd */
605   if (i == NULL) {
606      i = VG_(malloc)("syswrap.rfdowgn.1", sizeof(OpenFd));
607
608      i->prev = NULL;
609      i->next = allocated_fds;
610      if(allocated_fds) allocated_fds->prev = i;
611      allocated_fds = i;
612      fd_count++;
613   }
614
615   i->fd = fd;
616   i->pathname = VG_(strdup)("syswrap.rfdowgn.2", pathname);
617   i->where = (tid == -1) ? NULL : VG_(record_ExeContext)(tid, 0/*first_ip_delta*/);
618}
619
620// Record opening of an fd, and find its name.
621void ML_(record_fd_open_named)(ThreadId tid, Int fd)
622{
623   const HChar* buf;
624   const HChar* name;
625   if (VG_(resolve_filename)(fd, &buf))
626      name = buf;
627   else
628      name = NULL;
629
630   ML_(record_fd_open_with_given_name)(tid, fd, name);
631}
632
633// Record opening of a nameless fd.
634void ML_(record_fd_open_nameless)(ThreadId tid, Int fd)
635{
636   ML_(record_fd_open_with_given_name)(tid, fd, NULL);
637}
638
639// Return if a given file descriptor is already recorded.
640Bool ML_(fd_recorded)(Int fd)
641{
642   OpenFd *i = allocated_fds;
643   while (i) {
644      if (i->fd == fd)
645         return True;
646      i = i->next;
647   }
648   return False;
649}
650
651/* Returned string must not be modified nor free'd. */
652const HChar *ML_(find_fd_recorded_by_fd)(Int fd)
653{
654   OpenFd *i = allocated_fds;
655
656   while (i) {
657      if (i->fd == fd)
658         return i->pathname;
659      i = i->next;
660   }
661
662   return NULL;
663}
664
665static
666HChar *unix_to_name(struct vki_sockaddr_un *sa, UInt len, HChar *name)
667{
668   if (sa == NULL || len == 0 || sa->sun_path[0] == '\0') {
669      VG_(sprintf)(name, "<unknown>");
670   } else {
671      VG_(sprintf)(name, "%s", sa->sun_path);
672   }
673
674   return name;
675}
676
677static
678HChar *inet_to_name(struct vki_sockaddr_in *sa, UInt len, HChar *name)
679{
680   if (sa == NULL || len == 0) {
681      VG_(sprintf)(name, "<unknown>");
682   } else if (sa->sin_port == 0) {
683      VG_(sprintf)(name, "<unbound>");
684   } else {
685      UInt addr = VG_(ntohl)(sa->sin_addr.s_addr);
686      VG_(sprintf)(name, "%u.%u.%u.%u:%u",
687                   (addr>>24) & 0xFF, (addr>>16) & 0xFF,
688                   (addr>>8) & 0xFF, addr & 0xFF,
689                   VG_(ntohs)(sa->sin_port));
690   }
691
692   return name;
693}
694
695static
696void inet6_format(HChar *s, const UChar ip[16])
697{
698   static const unsigned char V4mappedprefix[12] = {0,0,0,0,0,0,0,0,0,0,0xff,0xff};
699
700   if (!VG_(memcmp)(ip, V4mappedprefix, 12)) {
701      const struct vki_in_addr *sin_addr =
702          (const struct vki_in_addr *)(ip + 12);
703      UInt addr = VG_(ntohl)(sin_addr->s_addr);
704
705      VG_(sprintf)(s, "::ffff:%u.%u.%u.%u",
706                   (addr>>24) & 0xFF, (addr>>16) & 0xFF,
707                   (addr>>8) & 0xFF, addr & 0xFF);
708   } else {
709      Bool compressing = False;
710      Bool compressed = False;
711      Int len = 0;
712      Int i;
713
714      for (i = 0; i < 16; i += 2) {
715         UInt word = ((UInt)ip[i] << 8) | (UInt)ip[i+1];
716         if (word == 0 && !compressed) {
717            compressing = True;
718         } else {
719            if (compressing) {
720               compressing = False;
721               compressed = True;
722               s[len++] = ':';
723            }
724            if (i > 0) {
725               s[len++] = ':';
726            }
727            len += VG_(sprintf)(s + len, "%x", word);
728         }
729      }
730
731      if (compressing) {
732         s[len++] = ':';
733         s[len++] = ':';
734      }
735
736      s[len++] = 0;
737   }
738
739   return;
740}
741
742static
743HChar *inet6_to_name(struct vki_sockaddr_in6 *sa, UInt len, HChar *name)
744{
745   if (sa == NULL || len == 0) {
746      VG_(sprintf)(name, "<unknown>");
747   } else if (sa->sin6_port == 0) {
748      VG_(sprintf)(name, "<unbound>");
749   } else {
750      HChar addr[100];    // large enough
751      inet6_format(addr, (void *)&(sa->sin6_addr));
752      VG_(sprintf)(name, "[%s]:%u", addr, VG_(ntohs)(sa->sin6_port));
753   }
754
755   return name;
756}
757
758/*
759 * Try get some details about a socket.
760 */
761static void
762getsockdetails(Int fd)
763{
764   union u {
765      struct vki_sockaddr a;
766      struct vki_sockaddr_in in;
767      struct vki_sockaddr_in6 in6;
768      struct vki_sockaddr_un un;
769   } laddr;
770   Int llen;
771
772   llen = sizeof(laddr);
773   VG_(memset)(&laddr, 0, llen);
774
775   if(VG_(getsockname)(fd, (struct vki_sockaddr *)&(laddr.a), &llen) != -1) {
776      switch(laddr.a.sa_family) {
777      case VKI_AF_INET: {
778         HChar lname[32];   // large enough
779         HChar pname[32];   // large enough
780         struct vki_sockaddr_in paddr;
781         Int plen = sizeof(struct vki_sockaddr_in);
782
783         if (VG_(getpeername)(fd, (struct vki_sockaddr *)&paddr, &plen) != -1) {
784            VG_(message)(Vg_UserMsg, "Open AF_INET socket %d: %s <-> %s\n", fd,
785                         inet_to_name(&(laddr.in), llen, lname),
786                         inet_to_name(&paddr, plen, pname));
787         } else {
788            VG_(message)(Vg_UserMsg, "Open AF_INET socket %d: %s <-> unbound\n",
789                         fd, inet_to_name(&(laddr.in), llen, lname));
790         }
791         return;
792         }
793      case VKI_AF_INET6: {
794         HChar lname[128];  // large enough
795         HChar pname[128];  // large enough
796         struct vki_sockaddr_in6 paddr;
797         Int plen = sizeof(struct vki_sockaddr_in6);
798
799         if (VG_(getpeername)(fd, (struct vki_sockaddr *)&paddr, &plen) != -1) {
800            VG_(message)(Vg_UserMsg, "Open AF_INET6 socket %d: %s <-> %s\n", fd,
801                         inet6_to_name(&(laddr.in6), llen, lname),
802                         inet6_to_name(&paddr, plen, pname));
803         } else {
804            VG_(message)(Vg_UserMsg, "Open AF_INET6 socket %d: %s <-> unbound\n",
805                         fd, inet6_to_name(&(laddr.in6), llen, lname));
806         }
807         return;
808         }
809      case VKI_AF_UNIX: {
810         static char lname[256];
811         VG_(message)(Vg_UserMsg, "Open AF_UNIX socket %d: %s\n", fd,
812                      unix_to_name(&(laddr.un), llen, lname));
813         return;
814         }
815      default:
816         VG_(message)(Vg_UserMsg, "Open pf-%d socket %d:\n",
817                      laddr.a.sa_family, fd);
818         return;
819      }
820   }
821
822   VG_(message)(Vg_UserMsg, "Open socket %d:\n", fd);
823}
824
825
826/* Dump out a summary, and a more detailed list, of open file descriptors. */
827void VG_(show_open_fds) (const HChar* when)
828{
829   OpenFd *i = allocated_fds;
830
831   VG_(message)(Vg_UserMsg, "FILE DESCRIPTORS: %d open %s.\n", fd_count, when);
832
833   while (i) {
834      if (i->pathname) {
835         VG_(message)(Vg_UserMsg, "Open file descriptor %d: %s\n", i->fd,
836                      i->pathname);
837      } else {
838         Int val;
839         Int len = sizeof(val);
840
841         if (VG_(getsockopt)(i->fd, VKI_SOL_SOCKET, VKI_SO_TYPE, &val, &len)
842             == -1) {
843            VG_(message)(Vg_UserMsg, "Open file descriptor %d:\n", i->fd);
844         } else {
845            getsockdetails(i->fd);
846         }
847      }
848
849      if(i->where) {
850         VG_(pp_ExeContext)(i->where);
851         VG_(message)(Vg_UserMsg, "\n");
852      } else {
853         VG_(message)(Vg_UserMsg, "   <inherited from parent>\n");
854         VG_(message)(Vg_UserMsg, "\n");
855      }
856
857      i = i->next;
858   }
859
860   VG_(message)(Vg_UserMsg, "\n");
861}
862
863/* If /proc/self/fd doesn't exist (e.g. you've got a Linux kernel that doesn't
864   have /proc support compiled in, or a non-Linux kernel), then we need to
865   find out what file descriptors we inherited from our parent process the
866   hard way - by checking each fd in turn. */
867static
868void init_preopened_fds_without_proc_self_fd(void)
869{
870   struct vki_rlimit lim;
871   UInt count;
872   Int i;
873
874   if (VG_(getrlimit) (VKI_RLIMIT_NOFILE, &lim) == -1) {
875      /* Hmm.  getrlimit() failed.  Now we're screwed, so just choose
876         an arbitrarily high number.  1024 happens to be the limit in
877         the 2.4 Linux kernels. */
878      count = 1024;
879   } else {
880      count = lim.rlim_cur;
881   }
882
883   for (i = 0; i < count; i++)
884      if (VG_(fcntl)(i, VKI_F_GETFL, 0) != -1)
885         ML_(record_fd_open_named)(-1, i);
886}
887
888/* Initialize the list of open file descriptors with the file descriptors
889   we inherited from out parent process. */
890
891void VG_(init_preopened_fds)(void)
892{
893// DDD: should probably use HAVE_PROC here or similar, instead.
894#if defined(VGO_linux)
895   Int ret;
896   struct vki_dirent64 d;
897   SysRes f;
898
899   f = VG_(open)("/proc/self/fd", VKI_O_RDONLY, 0);
900   if (sr_isError(f)) {
901      init_preopened_fds_without_proc_self_fd();
902      return;
903   }
904
905   while ((ret = VG_(getdents64)(sr_Res(f), &d, sizeof(d))) != 0) {
906      if (ret == -1)
907         goto out;
908
909      if (VG_(strcmp)(d.d_name, ".") && VG_(strcmp)(d.d_name, "..")) {
910         HChar* s;
911         Int fno = VG_(strtoll10)(d.d_name, &s);
912         if (*s == '\0') {
913            if (fno != sr_Res(f))
914               if (VG_(clo_track_fds))
915                  ML_(record_fd_open_named)(-1, fno);
916         } else {
917            VG_(message)(Vg_DebugMsg,
918               "Warning: invalid file name in /proc/self/fd: %s\n",
919               d.d_name);
920         }
921      }
922
923      VG_(lseek)(sr_Res(f), d.d_off, VKI_SEEK_SET);
924   }
925
926  out:
927   VG_(close)(sr_Res(f));
928
929#elif defined(VGO_darwin)
930   init_preopened_fds_without_proc_self_fd();
931
932#elif defined(VGO_solaris)
933   Int ret;
934   Char buf[VKI_MAXGETDENTS_SIZE];
935   SysRes f;
936
937   f = VG_(open)("/proc/self/fd", VKI_O_RDONLY, 0);
938   if (sr_isError(f)) {
939      init_preopened_fds_without_proc_self_fd();
940      return;
941   }
942
943   while ((ret = VG_(getdents64)(sr_Res(f), (struct vki_dirent64 *) buf,
944                                 sizeof(buf))) > 0) {
945      Int i = 0;
946      while (i < ret) {
947         /* Proceed one entry. */
948         struct vki_dirent64 *d = (struct vki_dirent64 *) (buf + i);
949         if (VG_(strcmp)(d->d_name, ".") && VG_(strcmp)(d->d_name, "..")) {
950            HChar *s;
951            Int fno = VG_(strtoll10)(d->d_name, &s);
952            if (*s == '\0') {
953               if (fno != sr_Res(f))
954                  if (VG_(clo_track_fds))
955                     ML_(record_fd_open_named)(-1, fno);
956            } else {
957               VG_(message)(Vg_DebugMsg,
958                     "Warning: invalid file name in /proc/self/fd: %s\n",
959                     d->d_name);
960            }
961         }
962
963         /* Move on the next entry. */
964         i += d->d_reclen;
965      }
966   }
967
968   VG_(close)(sr_Res(f));
969
970#else
971#  error Unknown OS
972#endif
973}
974
975static
976void pre_mem_read_sendmsg ( ThreadId tid, Bool read,
977                            const HChar *msg, Addr base, SizeT size )
978{
979   HChar outmsg[VG_(strlen)(msg) + 10]; // large enough
980   VG_(sprintf)(outmsg, "sendmsg%s", msg);
981   PRE_MEM_READ( outmsg, base, size );
982}
983
984static
985void pre_mem_write_recvmsg ( ThreadId tid, Bool read,
986                             const HChar *msg, Addr base, SizeT size )
987{
988   HChar outmsg[VG_(strlen)(msg) + 10]; // large enough
989   VG_(sprintf)(outmsg, "recvmsg%s", msg);
990   if ( read )
991      PRE_MEM_READ( outmsg, base, size );
992   else
993      PRE_MEM_WRITE( outmsg, base, size );
994}
995
996static
997void post_mem_write_recvmsg ( ThreadId tid, Bool read,
998                              const HChar *fieldName, Addr base, SizeT size )
999{
1000   if ( !read )
1001      POST_MEM_WRITE( base, size );
1002}
1003
1004static
1005void msghdr_foreachfield (
1006        ThreadId tid,
1007        const HChar *name,
1008        struct vki_msghdr *msg,
1009        UInt length,
1010        void (*foreach_func)( ThreadId, Bool, const HChar *, Addr, SizeT ),
1011        Bool rekv /* "recv" apparently shadows some header decl on OSX108 */
1012     )
1013{
1014   HChar fieldName[VG_(strlen)(name) + 32]; // large enough.
1015   Addr a;
1016   SizeT s;
1017
1018   if ( !msg )
1019      return;
1020
1021   VG_(sprintf) ( fieldName, "(%s)", name );
1022
1023   /* FIELDPAIR helps the compiler do one call to foreach_func
1024      for consecutive (no holes) fields. */
1025#define FIELDPAIR(f1,f2) \
1026   if (offsetof(struct vki_msghdr, f1) + sizeof(msg->f1)                \
1027       == offsetof(struct vki_msghdr, f2))                              \
1028      s += sizeof(msg->f2);                                             \
1029   else {                                                               \
1030      foreach_func (tid, True, fieldName, a, s);                        \
1031      a = (Addr)&msg->f2;                                               \
1032      s = sizeof(msg->f2);                                              \
1033   }
1034
1035   a = (Addr)&msg->msg_name;
1036   s = sizeof(msg->msg_name);
1037   FIELDPAIR(msg_name,    msg_namelen);
1038   FIELDPAIR(msg_namelen, msg_iov);
1039   FIELDPAIR(msg_iov,     msg_iovlen);
1040   FIELDPAIR(msg_iovlen,  msg_control);
1041   FIELDPAIR(msg_control, msg_controllen);
1042   foreach_func ( tid, True, fieldName, a, s);
1043#undef FIELDPAIR
1044
1045   /* msg_flags is completely ignored for send_mesg, recv_mesg doesn't read
1046      the field, but does write to it. */
1047   if ( rekv )
1048      foreach_func ( tid, False, fieldName, (Addr)&msg->msg_flags, sizeof( msg->msg_flags ) );
1049
1050   if ( ML_(safe_to_deref)(&msg->msg_name, sizeof (void *))
1051        && msg->msg_name ) {
1052      VG_(sprintf) ( fieldName, "(%s.msg_name)", name );
1053      foreach_func ( tid, False, fieldName,
1054                     (Addr)msg->msg_name, msg->msg_namelen );
1055   }
1056
1057   if ( ML_(safe_to_deref)(&msg->msg_iov, sizeof (void *))
1058        && msg->msg_iov ) {
1059      struct vki_iovec *iov = msg->msg_iov;
1060      UInt i;
1061
1062      if (ML_(safe_to_deref)(&msg->msg_iovlen, sizeof (UInt))) {
1063         VG_(sprintf) ( fieldName, "(%s.msg_iov)", name );
1064         foreach_func ( tid, True, fieldName, (Addr)iov,
1065                        msg->msg_iovlen * sizeof( struct vki_iovec ) );
1066
1067         for ( i = 0; i < msg->msg_iovlen && length > 0; ++i, ++iov ) {
1068            if (ML_(safe_to_deref)(&iov->iov_len, sizeof (UInt))) {
1069               UInt iov_len = iov->iov_len <= length ? iov->iov_len : length;
1070               VG_(sprintf) ( fieldName, "(%s.msg_iov[%u])", name, i );
1071               foreach_func ( tid, False, fieldName,
1072                              (Addr)iov->iov_base, iov_len );
1073               length = length - iov_len;
1074            }
1075         }
1076      }
1077   }
1078
1079   if ( ML_(safe_to_deref) (&msg->msg_control, sizeof (void *))
1080        && msg->msg_control ) {
1081      VG_(sprintf) ( fieldName, "(%s.msg_control)", name );
1082      foreach_func ( tid, False, fieldName,
1083                     (Addr)msg->msg_control, msg->msg_controllen );
1084   }
1085
1086}
1087
1088static void check_cmsg_for_fds(ThreadId tid, struct vki_msghdr *msg)
1089{
1090   struct vki_cmsghdr *cm = VKI_CMSG_FIRSTHDR(msg);
1091
1092   while (cm) {
1093      if (cm->cmsg_level == VKI_SOL_SOCKET
1094          && cm->cmsg_type == VKI_SCM_RIGHTS ) {
1095         Int *fds = (Int *) VKI_CMSG_DATA(cm);
1096         Int fdc = (cm->cmsg_len - VKI_CMSG_ALIGN(sizeof(struct vki_cmsghdr)))
1097                         / sizeof(int);
1098         Int i;
1099
1100         for (i = 0; i < fdc; i++)
1101            if(VG_(clo_track_fds))
1102               // XXX: must we check the range on these fds with
1103               //      ML_(fd_allowed)()?
1104               ML_(record_fd_open_named)(tid, fds[i]);
1105      }
1106
1107      cm = VKI_CMSG_NXTHDR(msg, cm);
1108   }
1109}
1110
1111/* GrP kernel ignores sa_len (at least on Darwin); this checks the rest */
1112static
1113void pre_mem_read_sockaddr ( ThreadId tid,
1114                             const HChar *description,
1115                             struct vki_sockaddr *sa, UInt salen )
1116{
1117   HChar outmsg[VG_(strlen)( description ) + 30]; // large enough
1118   struct vki_sockaddr_un*  saun = (struct vki_sockaddr_un *)sa;
1119   struct vki_sockaddr_in*  sin  = (struct vki_sockaddr_in *)sa;
1120   struct vki_sockaddr_in6* sin6 = (struct vki_sockaddr_in6 *)sa;
1121#  ifdef VKI_AF_BLUETOOTH
1122   struct vki_sockaddr_rc*  rc   = (struct vki_sockaddr_rc *)sa;
1123#  endif
1124#  ifdef VKI_AF_NETLINK
1125   struct vki_sockaddr_nl*  nl   = (struct vki_sockaddr_nl *)sa;
1126#  endif
1127
1128   /* NULL/zero-length sockaddrs are legal */
1129   if ( sa == NULL || salen == 0 ) return;
1130
1131   VG_(sprintf) ( outmsg, description, "sa_family" );
1132   PRE_MEM_READ( outmsg, (Addr) &sa->sa_family, sizeof(vki_sa_family_t));
1133
1134   /* Don't do any extra checking if we cannot determine the sa_family. */
1135   if (! ML_(safe_to_deref) (&sa->sa_family, sizeof(vki_sa_family_t)))
1136      return;
1137
1138   switch (sa->sa_family) {
1139
1140      case VKI_AF_UNIX:
1141         if (ML_(safe_to_deref) (&saun->sun_path, sizeof (Addr))) {
1142            VG_(sprintf) ( outmsg, description, "sun_path" );
1143            PRE_MEM_RASCIIZ( outmsg, (Addr) saun->sun_path );
1144            // GrP fixme max of sun_len-2? what about nul char?
1145         }
1146         break;
1147
1148      case VKI_AF_INET:
1149         VG_(sprintf) ( outmsg, description, "sin_port" );
1150         PRE_MEM_READ( outmsg, (Addr) &sin->sin_port, sizeof (sin->sin_port) );
1151         VG_(sprintf) ( outmsg, description, "sin_addr" );
1152         PRE_MEM_READ( outmsg, (Addr) &sin->sin_addr, sizeof (sin->sin_addr) );
1153         break;
1154
1155      case VKI_AF_INET6:
1156         VG_(sprintf) ( outmsg, description, "sin6_port" );
1157         PRE_MEM_READ( outmsg,
1158            (Addr) &sin6->sin6_port, sizeof (sin6->sin6_port) );
1159         VG_(sprintf) ( outmsg, description, "sin6_flowinfo" );
1160         PRE_MEM_READ( outmsg,
1161            (Addr) &sin6->sin6_flowinfo, sizeof (sin6->sin6_flowinfo) );
1162         VG_(sprintf) ( outmsg, description, "sin6_addr" );
1163         PRE_MEM_READ( outmsg,
1164            (Addr) &sin6->sin6_addr, sizeof (sin6->sin6_addr) );
1165         VG_(sprintf) ( outmsg, description, "sin6_scope_id" );
1166         PRE_MEM_READ( outmsg,
1167            (Addr) &sin6->sin6_scope_id, sizeof (sin6->sin6_scope_id) );
1168         break;
1169
1170#     ifdef VKI_AF_BLUETOOTH
1171      case VKI_AF_BLUETOOTH:
1172         VG_(sprintf) ( outmsg, description, "rc_bdaddr" );
1173         PRE_MEM_READ( outmsg, (Addr) &rc->rc_bdaddr, sizeof (rc->rc_bdaddr) );
1174         VG_(sprintf) ( outmsg, description, "rc_channel" );
1175         PRE_MEM_READ( outmsg, (Addr) &rc->rc_channel, sizeof (rc->rc_channel) );
1176         break;
1177#     endif
1178
1179#     ifdef VKI_AF_NETLINK
1180      case VKI_AF_NETLINK:
1181         VG_(sprintf)(outmsg, description, "nl_pid");
1182         PRE_MEM_READ(outmsg, (Addr)&nl->nl_pid, sizeof(nl->nl_pid));
1183         VG_(sprintf)(outmsg, description, "nl_groups");
1184         PRE_MEM_READ(outmsg, (Addr)&nl->nl_groups, sizeof(nl->nl_groups));
1185         break;
1186#     endif
1187
1188#     ifdef VKI_AF_UNSPEC
1189      case VKI_AF_UNSPEC:
1190         break;
1191#     endif
1192
1193      default:
1194         /* No specific information about this address family.
1195            Let's just check the full data following the family.
1196            Note that this can give false positive if this (unknown)
1197            struct sockaddr_???? has padding bytes between its elements. */
1198         VG_(sprintf) ( outmsg, description, "sa_data" );
1199         PRE_MEM_READ( outmsg, (Addr)&sa->sa_family + sizeof(sa->sa_family),
1200                       salen -  sizeof(sa->sa_family));
1201         break;
1202   }
1203}
1204
1205/* Dereference a pointer to a UInt. */
1206static UInt deref_UInt ( ThreadId tid, Addr a, const HChar* s )
1207{
1208   UInt* a_p = (UInt*)a;
1209   PRE_MEM_READ( s, (Addr)a_p, sizeof(UInt) );
1210   if (a_p == NULL || ! ML_(safe_to_deref) (a_p, sizeof(UInt)))
1211      return 0;
1212   else
1213      return *a_p;
1214}
1215
1216void ML_(buf_and_len_pre_check) ( ThreadId tid, Addr buf_p, Addr buflen_p,
1217                                  const HChar* buf_s, const HChar* buflen_s )
1218{
1219   if (VG_(tdict).track_pre_mem_write) {
1220      UInt buflen_in = deref_UInt( tid, buflen_p, buflen_s);
1221      if (buflen_in > 0) {
1222         VG_(tdict).track_pre_mem_write(
1223            Vg_CoreSysCall, tid, buf_s, buf_p, buflen_in );
1224      }
1225   }
1226}
1227
1228void ML_(buf_and_len_post_check) ( ThreadId tid, SysRes res,
1229                                   Addr buf_p, Addr buflen_p, const HChar* s )
1230{
1231   if (!sr_isError(res) && VG_(tdict).track_post_mem_write) {
1232      UInt buflen_out = deref_UInt( tid, buflen_p, s);
1233      if (buflen_out > 0 && buf_p != (Addr)NULL) {
1234         VG_(tdict).track_post_mem_write( Vg_CoreSysCall, tid, buf_p, buflen_out );
1235      }
1236   }
1237}
1238
1239/* ---------------------------------------------------------------------
1240   Data seg end, for brk()
1241   ------------------------------------------------------------------ */
1242
1243/*   +--------+------------+
1244     | anon   |    resvn   |
1245     +--------+------------+
1246
1247     ^     ^  ^
1248     |     |  boundary is page aligned
1249     |     VG_(brk_limit) -- no alignment constraint
1250     VG_(brk_base) -- page aligned -- does not move
1251
1252     Both the anon part and the reservation part are always at least
1253     one page.
1254*/
1255
1256/* Set the new data segment end to NEWBRK.  If this succeeds, return
1257   NEWBRK, else return the current data segment end. */
1258
1259static Addr do_brk ( Addr newbrk, ThreadId tid )
1260{
1261   NSegment const* aseg;
1262   Addr newbrkP;
1263   SizeT delta;
1264   Bool debug = False;
1265
1266   if (debug)
1267      VG_(printf)("\ndo_brk: brk_base=%#lx brk_limit=%#lx newbrk=%#lx\n",
1268		  VG_(brk_base), VG_(brk_limit), newbrk);
1269
1270   if (0) VG_(am_show_nsegments)(0, "in_brk");
1271
1272   if (newbrk < VG_(brk_base))
1273      /* Clearly impossible. */
1274      goto bad;
1275
1276   if (newbrk < VG_(brk_limit)) {
1277      /* shrinking the data segment.  Be lazy and don't munmap the
1278         excess area. */
1279      NSegment const * seg = VG_(am_find_nsegment)(newbrk);
1280      vg_assert(seg);
1281
1282      if (seg->hasT)
1283         VG_(discard_translations)( newbrk, VG_(brk_limit) - newbrk,
1284                                    "do_brk(shrink)" );
1285      /* Since we're being lazy and not unmapping pages, we have to
1286         zero out the area, so that if the area later comes back into
1287         circulation, it will be filled with zeroes, as if it really
1288         had been unmapped and later remapped.  Be a bit paranoid and
1289         try hard to ensure we're not going to segfault by doing the
1290         write - check both ends of the range are in the same segment
1291         and that segment is writable. */
1292      NSegment const * seg2;
1293
1294      seg2 = VG_(am_find_nsegment)( VG_(brk_limit) - 1 );
1295      vg_assert(seg2);
1296
1297      if (seg == seg2 && seg->hasW)
1298         VG_(memset)( (void*)newbrk, 0, VG_(brk_limit) - newbrk );
1299
1300      VG_(brk_limit) = newbrk;
1301      return newbrk;
1302   }
1303
1304   /* otherwise we're expanding the brk segment. */
1305   if (VG_(brk_limit) > VG_(brk_base))
1306      aseg = VG_(am_find_nsegment)( VG_(brk_limit)-1 );
1307   else
1308      aseg = VG_(am_find_nsegment)( VG_(brk_limit) );
1309
1310   /* These should be assured by setup_client_dataseg in m_main. */
1311   vg_assert(aseg);
1312   vg_assert(aseg->kind == SkAnonC);
1313
1314   if (newbrk <= aseg->end + 1) {
1315      /* still fits within the anon segment. */
1316      VG_(brk_limit) = newbrk;
1317      return newbrk;
1318   }
1319
1320   newbrkP = VG_PGROUNDUP(newbrk);
1321   delta = newbrkP - (aseg->end + 1);
1322   vg_assert(delta > 0);
1323   vg_assert(VG_IS_PAGE_ALIGNED(delta));
1324
1325   Bool overflow = False;
1326   if (! VG_(am_extend_into_adjacent_reservation_client)( aseg->start, delta,
1327                                                          &overflow)) {
1328      if (overflow) {
1329         static Bool alreadyComplained = False;
1330         if (!alreadyComplained) {
1331            alreadyComplained = True;
1332            if (VG_(clo_verbosity) > 0) {
1333               VG_(umsg)("brk segment overflow in thread #%u: "
1334                         "can't grow to %#lx\n",
1335                         tid, newbrkP);
1336               VG_(umsg)("(see section Limitations in user manual)\n");
1337               VG_(umsg)("NOTE: further instances of this message "
1338                         "will not be shown\n");
1339            }
1340         }
1341      } else {
1342         if (VG_(clo_verbosity) > 0) {
1343            VG_(umsg)("Cannot map memory to grow brk segment in thread #%u "
1344                      "to %#lx\n", tid, newbrkP);
1345            VG_(umsg)("(see section Limitations in user manual)\n");
1346         }
1347      }
1348      goto bad;
1349   }
1350
1351   VG_(brk_limit) = newbrk;
1352   return newbrk;
1353
1354  bad:
1355   return VG_(brk_limit);
1356}
1357
1358
1359/* ---------------------------------------------------------------------
1360   Vet file descriptors for sanity
1361   ------------------------------------------------------------------ */
1362/*
1363> - what does the "Bool soft" parameter mean?
1364
1365(Tom Hughes, 3 Oct 05):
1366
1367Whether or not to consider a file descriptor invalid if it is above
1368the current soft limit.
1369
1370Basically if we are testing whether a newly created file descriptor is
1371valid (in a post handler) then we set soft to true, and if we are
1372testing whether a file descriptor that is about to be used (in a pre
1373handler) is valid [viz, an already-existing fd] then we set it to false.
1374
1375The point is that if the (virtual) soft limit is lowered then any
1376existing descriptors can still be read/written/closed etc (so long as
1377they are below the valgrind reserved descriptors) but no new
1378descriptors can be created above the new soft limit.
1379
1380(jrs 4 Oct 05: in which case, I've renamed it "isNewFd")
1381*/
1382
1383/* Return true if we're allowed to use or create this fd */
1384Bool ML_(fd_allowed)(Int fd, const HChar *syscallname, ThreadId tid,
1385                     Bool isNewFd)
1386{
1387   Bool allowed = True;
1388
1389   /* hard limits always apply */
1390   if (fd < 0 || fd >= VG_(fd_hard_limit))
1391      allowed = False;
1392
1393   /* hijacking the output fds is never allowed */
1394   if (fd == VG_(log_output_sink).fd || fd == VG_(xml_output_sink).fd)
1395      allowed = False;
1396
1397   /* if creating a new fd (rather than using an existing one), the
1398      soft limit must also be observed */
1399   if (isNewFd && fd >= VG_(fd_soft_limit))
1400      allowed = False;
1401
1402   /* this looks like it ought to be included, but causes problems: */
1403   /*
1404   if (fd == 2 && VG_(debugLog_getLevel)() > 0)
1405      allowed = False;
1406   */
1407   /* The difficulty is as follows: consider a program P which expects
1408      to be able to mess with (redirect) its own stderr (fd 2).
1409      Usually to deal with P we would issue command line flags to send
1410      logging somewhere other than stderr, so as not to disrupt P.
1411      The problem is that -d unilaterally hijacks stderr with no
1412      consultation with P.  And so, if this check is enabled, P will
1413      work OK normally but fail if -d is issued.
1414
1415      Basically -d is a hack and you take your chances when using it.
1416      It's very useful for low level debugging -- particularly at
1417      startup -- and having its presence change the behaviour of the
1418      client is exactly what we don't want.  */
1419
1420   /* croak? */
1421   if ((!allowed) && VG_(showing_core_errors)() ) {
1422      VG_(message)(Vg_UserMsg,
1423         "Warning: invalid file descriptor %d in syscall %s()\n",
1424         fd, syscallname);
1425      if (fd == VG_(log_output_sink).fd && VG_(log_output_sink).fd >= 0)
1426	 VG_(message)(Vg_UserMsg,
1427            "   Use --log-fd=<number> to select an alternative log fd.\n");
1428      if (fd == VG_(xml_output_sink).fd && VG_(xml_output_sink).fd >= 0)
1429	 VG_(message)(Vg_UserMsg,
1430            "   Use --xml-fd=<number> to select an alternative XML "
1431            "output fd.\n");
1432      // DDD: consider always printing this stack trace, it's useful.
1433      // Also consider also making this a proper core error, ie.
1434      // suppressible and all that.
1435      if (VG_(clo_verbosity) > 1) {
1436         VG_(get_and_pp_StackTrace)(tid, VG_(clo_backtrace_size));
1437      }
1438   }
1439
1440   return allowed;
1441}
1442
1443
1444/* ---------------------------------------------------------------------
1445   Deal with a bunch of socket-related syscalls
1446   ------------------------------------------------------------------ */
1447
1448/* ------ */
1449
1450void
1451ML_(generic_PRE_sys_socketpair) ( ThreadId tid,
1452                                  UWord arg0, UWord arg1,
1453                                  UWord arg2, UWord arg3 )
1454{
1455   /* int socketpair(int d, int type, int protocol, int sv[2]); */
1456   PRE_MEM_WRITE( "socketcall.socketpair(sv)",
1457                  arg3, 2*sizeof(int) );
1458}
1459
1460SysRes
1461ML_(generic_POST_sys_socketpair) ( ThreadId tid,
1462                                   SysRes res,
1463                                   UWord arg0, UWord arg1,
1464                                   UWord arg2, UWord arg3 )
1465{
1466   SysRes r = res;
1467   Int fd1 = ((Int*)arg3)[0];
1468   Int fd2 = ((Int*)arg3)[1];
1469   vg_assert(!sr_isError(res)); /* guaranteed by caller */
1470   POST_MEM_WRITE( arg3, 2*sizeof(int) );
1471   if (!ML_(fd_allowed)(fd1, "socketcall.socketpair", tid, True) ||
1472       !ML_(fd_allowed)(fd2, "socketcall.socketpair", tid, True)) {
1473      VG_(close)(fd1);
1474      VG_(close)(fd2);
1475      r = VG_(mk_SysRes_Error)( VKI_EMFILE );
1476   } else {
1477      POST_MEM_WRITE( arg3, 2*sizeof(int) );
1478      if (VG_(clo_track_fds)) {
1479         ML_(record_fd_open_nameless)(tid, fd1);
1480         ML_(record_fd_open_nameless)(tid, fd2);
1481      }
1482   }
1483   return r;
1484}
1485
1486/* ------ */
1487
1488SysRes
1489ML_(generic_POST_sys_socket) ( ThreadId tid, SysRes res )
1490{
1491   SysRes r = res;
1492   vg_assert(!sr_isError(res)); /* guaranteed by caller */
1493   if (!ML_(fd_allowed)(sr_Res(res), "socket", tid, True)) {
1494      VG_(close)(sr_Res(res));
1495      r = VG_(mk_SysRes_Error)( VKI_EMFILE );
1496   } else {
1497      if (VG_(clo_track_fds))
1498         ML_(record_fd_open_nameless)(tid, sr_Res(res));
1499   }
1500   return r;
1501}
1502
1503/* ------ */
1504
1505void
1506ML_(generic_PRE_sys_bind) ( ThreadId tid,
1507                            UWord arg0, UWord arg1, UWord arg2 )
1508{
1509   /* int bind(int sockfd, struct sockaddr *my_addr,
1510               int addrlen); */
1511   pre_mem_read_sockaddr(
1512      tid, "socketcall.bind(my_addr.%s)",
1513      (struct vki_sockaddr *) arg1, arg2
1514   );
1515}
1516
1517/* ------ */
1518
1519void
1520ML_(generic_PRE_sys_accept) ( ThreadId tid,
1521                              UWord arg0, UWord arg1, UWord arg2 )
1522{
1523   /* int accept(int s, struct sockaddr *addr, int *addrlen); */
1524   Addr addr_p     = arg1;
1525   Addr addrlen_p  = arg2;
1526   if (addr_p != (Addr)NULL)
1527      ML_(buf_and_len_pre_check) ( tid, addr_p, addrlen_p,
1528                                   "socketcall.accept(addr)",
1529                                   "socketcall.accept(addrlen_in)" );
1530}
1531
1532SysRes
1533ML_(generic_POST_sys_accept) ( ThreadId tid,
1534                               SysRes res,
1535                               UWord arg0, UWord arg1, UWord arg2 )
1536{
1537   SysRes r = res;
1538   vg_assert(!sr_isError(res)); /* guaranteed by caller */
1539   if (!ML_(fd_allowed)(sr_Res(res), "accept", tid, True)) {
1540      VG_(close)(sr_Res(res));
1541      r = VG_(mk_SysRes_Error)( VKI_EMFILE );
1542   } else {
1543      Addr addr_p     = arg1;
1544      Addr addrlen_p  = arg2;
1545      if (addr_p != (Addr)NULL)
1546         ML_(buf_and_len_post_check) ( tid, res, addr_p, addrlen_p,
1547                                       "socketcall.accept(addrlen_out)" );
1548      if (VG_(clo_track_fds))
1549          ML_(record_fd_open_nameless)(tid, sr_Res(res));
1550   }
1551   return r;
1552}
1553
1554/* ------ */
1555
1556void
1557ML_(generic_PRE_sys_sendto) ( ThreadId tid,
1558                              UWord arg0, UWord arg1, UWord arg2,
1559                              UWord arg3, UWord arg4, UWord arg5 )
1560{
1561   /* int sendto(int s, const void *msg, int len,
1562                 unsigned int flags,
1563                 const struct sockaddr *to, int tolen); */
1564   PRE_MEM_READ( "socketcall.sendto(msg)",
1565                 arg1, /* msg */
1566                 arg2  /* len */ );
1567   pre_mem_read_sockaddr(
1568      tid, "socketcall.sendto(to.%s)",
1569      (struct vki_sockaddr *) arg4, arg5
1570   );
1571}
1572
1573/* ------ */
1574
1575void
1576ML_(generic_PRE_sys_send) ( ThreadId tid,
1577                            UWord arg0, UWord arg1, UWord arg2 )
1578{
1579   /* int send(int s, const void *msg, size_t len, int flags); */
1580   PRE_MEM_READ( "socketcall.send(msg)",
1581                  arg1, /* msg */
1582                  arg2  /* len */ );
1583
1584}
1585
1586/* ------ */
1587
1588void
1589ML_(generic_PRE_sys_recvfrom) ( ThreadId tid,
1590                                UWord arg0, UWord arg1, UWord arg2,
1591                                UWord arg3, UWord arg4, UWord arg5 )
1592{
1593   /* int recvfrom(int s, void *buf, int len, unsigned int flags,
1594                   struct sockaddr *from, int *fromlen); */
1595   Addr buf_p      = arg1;
1596   Int  len        = arg2;
1597   Addr from_p     = arg4;
1598   Addr fromlen_p  = arg5;
1599   PRE_MEM_WRITE( "socketcall.recvfrom(buf)", buf_p, len );
1600   if (from_p != (Addr)NULL)
1601      ML_(buf_and_len_pre_check) ( tid, from_p, fromlen_p,
1602                                   "socketcall.recvfrom(from)",
1603                                   "socketcall.recvfrom(fromlen_in)" );
1604}
1605
1606void
1607ML_(generic_POST_sys_recvfrom) ( ThreadId tid,
1608                                 SysRes res,
1609                                 UWord arg0, UWord arg1, UWord arg2,
1610                                 UWord arg3, UWord arg4, UWord arg5 )
1611{
1612   Addr buf_p      = arg1;
1613   Int  len        = arg2;
1614   Addr from_p     = arg4;
1615   Addr fromlen_p  = arg5;
1616
1617   vg_assert(!sr_isError(res)); /* guaranteed by caller */
1618   if (from_p != (Addr)NULL)
1619      ML_(buf_and_len_post_check) ( tid, res, from_p, fromlen_p,
1620                                    "socketcall.recvfrom(fromlen_out)" );
1621   POST_MEM_WRITE( buf_p, len );
1622}
1623
1624/* ------ */
1625
1626void
1627ML_(generic_PRE_sys_recv) ( ThreadId tid,
1628                            UWord arg0, UWord arg1, UWord arg2 )
1629{
1630   /* int recv(int s, void *buf, int len, unsigned int flags); */
1631   /* man 2 recv says:
1632      The  recv call is normally used only on a connected socket
1633      (see connect(2)) and is identical to recvfrom with a  NULL
1634      from parameter.
1635   */
1636   PRE_MEM_WRITE( "socketcall.recv(buf)",
1637                  arg1, /* buf */
1638                  arg2  /* len */ );
1639}
1640
1641void
1642ML_(generic_POST_sys_recv) ( ThreadId tid,
1643                             UWord res,
1644                             UWord arg0, UWord arg1, UWord arg2 )
1645{
1646   if (res >= 0 && arg1 != 0) {
1647      POST_MEM_WRITE( arg1, /* buf */
1648                      arg2  /* len */ );
1649   }
1650}
1651
1652/* ------ */
1653
1654void
1655ML_(generic_PRE_sys_connect) ( ThreadId tid,
1656                               UWord arg0, UWord arg1, UWord arg2 )
1657{
1658   /* int connect(int sockfd,
1659                  struct sockaddr *serv_addr, int addrlen ); */
1660   pre_mem_read_sockaddr( tid,
1661                          "socketcall.connect(serv_addr.%s)",
1662                          (struct vki_sockaddr *) arg1, arg2);
1663}
1664
1665/* ------ */
1666
1667void
1668ML_(generic_PRE_sys_setsockopt) ( ThreadId tid,
1669                                  UWord arg0, UWord arg1, UWord arg2,
1670                                  UWord arg3, UWord arg4 )
1671{
1672   /* int setsockopt(int s, int level, int optname,
1673                     const void *optval, int optlen); */
1674   PRE_MEM_READ( "socketcall.setsockopt(optval)",
1675                 arg3, /* optval */
1676                 arg4  /* optlen */ );
1677}
1678
1679/* ------ */
1680
1681void
1682ML_(generic_PRE_sys_getsockname) ( ThreadId tid,
1683                                   UWord arg0, UWord arg1, UWord arg2 )
1684{
1685   /* int getsockname(int s, struct sockaddr* name, int* namelen) */
1686   Addr name_p     = arg1;
1687   Addr namelen_p  = arg2;
1688   /* Nb: name_p cannot be NULL */
1689   ML_(buf_and_len_pre_check) ( tid, name_p, namelen_p,
1690                                "socketcall.getsockname(name)",
1691                                "socketcall.getsockname(namelen_in)" );
1692}
1693
1694void
1695ML_(generic_POST_sys_getsockname) ( ThreadId tid,
1696                                    SysRes res,
1697                                    UWord arg0, UWord arg1, UWord arg2 )
1698{
1699   Addr name_p     = arg1;
1700   Addr namelen_p  = arg2;
1701   vg_assert(!sr_isError(res)); /* guaranteed by caller */
1702   ML_(buf_and_len_post_check) ( tid, res, name_p, namelen_p,
1703                                 "socketcall.getsockname(namelen_out)" );
1704}
1705
1706/* ------ */
1707
1708void
1709ML_(generic_PRE_sys_getpeername) ( ThreadId tid,
1710                                   UWord arg0, UWord arg1, UWord arg2 )
1711{
1712   /* int getpeername(int s, struct sockaddr* name, int* namelen) */
1713   Addr name_p     = arg1;
1714   Addr namelen_p  = arg2;
1715   /* Nb: name_p cannot be NULL */
1716   ML_(buf_and_len_pre_check) ( tid, name_p, namelen_p,
1717                                "socketcall.getpeername(name)",
1718                                "socketcall.getpeername(namelen_in)" );
1719}
1720
1721void
1722ML_(generic_POST_sys_getpeername) ( ThreadId tid,
1723                                    SysRes res,
1724                                    UWord arg0, UWord arg1, UWord arg2 )
1725{
1726   Addr name_p     = arg1;
1727   Addr namelen_p  = arg2;
1728   vg_assert(!sr_isError(res)); /* guaranteed by caller */
1729   ML_(buf_and_len_post_check) ( tid, res, name_p, namelen_p,
1730                                 "socketcall.getpeername(namelen_out)" );
1731}
1732
1733/* ------ */
1734
1735void
1736ML_(generic_PRE_sys_sendmsg) ( ThreadId tid, const HChar *name,
1737                               struct vki_msghdr *msg )
1738{
1739   msghdr_foreachfield ( tid, name, msg, ~0, pre_mem_read_sendmsg, False );
1740}
1741
1742/* ------ */
1743
1744void
1745ML_(generic_PRE_sys_recvmsg) ( ThreadId tid, const HChar *name,
1746                               struct vki_msghdr *msg )
1747{
1748   msghdr_foreachfield ( tid, name, msg, ~0, pre_mem_write_recvmsg, True );
1749}
1750
1751void
1752ML_(generic_POST_sys_recvmsg) ( ThreadId tid, const HChar *name,
1753                                struct vki_msghdr *msg, UInt length )
1754{
1755   msghdr_foreachfield( tid, name, msg, length, post_mem_write_recvmsg, True );
1756   check_cmsg_for_fds( tid, msg );
1757}
1758
1759
1760/* ---------------------------------------------------------------------
1761   Deal with a bunch of IPC related syscalls
1762   ------------------------------------------------------------------ */
1763
1764/* ------ */
1765
1766void
1767ML_(generic_PRE_sys_semop) ( ThreadId tid,
1768                             UWord arg0, UWord arg1, UWord arg2 )
1769{
1770   /* int semop(int semid, struct sembuf *sops, unsigned nsops); */
1771   PRE_MEM_READ( "semop(sops)", arg1, arg2 * sizeof(struct vki_sembuf) );
1772}
1773
1774/* ------ */
1775
1776void
1777ML_(generic_PRE_sys_semtimedop) ( ThreadId tid,
1778                                  UWord arg0, UWord arg1,
1779                                  UWord arg2, UWord arg3 )
1780{
1781   /* int semtimedop(int semid, struct sembuf *sops, unsigned nsops,
1782                     struct timespec *timeout); */
1783   PRE_MEM_READ( "semtimedop(sops)", arg1, arg2 * sizeof(struct vki_sembuf) );
1784   if (arg3 != 0)
1785      PRE_MEM_READ( "semtimedop(timeout)", arg3, sizeof(struct vki_timespec) );
1786}
1787
1788/* ------ */
1789
1790static
1791UInt get_sem_count( Int semid )
1792{
1793   struct vki_semid_ds buf;
1794   union vki_semun arg;
1795   SysRes res;
1796
1797   /* Doesn't actually seem to be necessary, but gcc-4.4.0 20081017
1798      (experimental) otherwise complains that the use in the return
1799      statement below is uninitialised. */
1800   buf.sem_nsems = 0;
1801
1802   arg.buf = &buf;
1803
1804#  if defined(__NR_semctl)
1805   res = VG_(do_syscall4)(__NR_semctl, semid, 0, VKI_IPC_STAT, *(UWord *)&arg);
1806#  elif defined(__NR_semsys) /* Solaris */
1807   res = VG_(do_syscall5)(__NR_semsys, VKI_SEMCTL, semid, 0, VKI_IPC_STAT,
1808                          *(UWord *)&arg);
1809#  else
1810   res = VG_(do_syscall5)(__NR_ipc, 3 /* IPCOP_semctl */, semid, 0,
1811                          VKI_IPC_STAT, (UWord)&arg);
1812#  endif
1813   if (sr_isError(res))
1814      return 0;
1815
1816   return buf.sem_nsems;
1817}
1818
1819void
1820ML_(generic_PRE_sys_semctl) ( ThreadId tid,
1821                              UWord arg0, UWord arg1,
1822                              UWord arg2, UWord arg3 )
1823{
1824   /* int semctl(int semid, int semnum, int cmd, ...); */
1825   union vki_semun arg = *(union vki_semun *)&arg3;
1826   UInt nsems;
1827   switch (arg2 /* cmd */) {
1828#if defined(VKI_IPC_INFO)
1829   case VKI_IPC_INFO:
1830   case VKI_SEM_INFO:
1831   case VKI_IPC_INFO|VKI_IPC_64:
1832   case VKI_SEM_INFO|VKI_IPC_64:
1833      PRE_MEM_WRITE( "semctl(IPC_INFO, arg.buf)",
1834                     (Addr)arg.buf, sizeof(struct vki_seminfo) );
1835      break;
1836#endif
1837
1838   case VKI_IPC_STAT:
1839#if defined(VKI_SEM_STAT)
1840   case VKI_SEM_STAT:
1841#endif
1842      PRE_MEM_WRITE( "semctl(IPC_STAT, arg.buf)",
1843                     (Addr)arg.buf, sizeof(struct vki_semid_ds) );
1844      break;
1845
1846#if defined(VKI_IPC_64)
1847   case VKI_IPC_STAT|VKI_IPC_64:
1848#if defined(VKI_SEM_STAT)
1849   case VKI_SEM_STAT|VKI_IPC_64:
1850#endif
1851#endif
1852#if defined(VKI_IPC_STAT64)
1853   case VKI_IPC_STAT64:
1854#endif
1855#if defined(VKI_IPC_64) || defined(VKI_IPC_STAT64)
1856      PRE_MEM_WRITE( "semctl(IPC_STAT, arg.buf)",
1857                     (Addr)arg.buf, sizeof(struct vki_semid64_ds) );
1858      break;
1859#endif
1860
1861   case VKI_IPC_SET:
1862      PRE_MEM_READ( "semctl(IPC_SET, arg.buf)",
1863                    (Addr)arg.buf, sizeof(struct vki_semid_ds) );
1864      break;
1865
1866#if defined(VKI_IPC_64)
1867   case VKI_IPC_SET|VKI_IPC_64:
1868#endif
1869#if defined(VKI_IPC_SET64)
1870   case VKI_IPC_SET64:
1871#endif
1872#if defined(VKI_IPC64) || defined(VKI_IPC_SET64)
1873      PRE_MEM_READ( "semctl(IPC_SET, arg.buf)",
1874                    (Addr)arg.buf, sizeof(struct vki_semid64_ds) );
1875      break;
1876#endif
1877
1878   case VKI_GETALL:
1879#if defined(VKI_IPC_64)
1880   case VKI_GETALL|VKI_IPC_64:
1881#endif
1882      nsems = get_sem_count( arg0 );
1883      PRE_MEM_WRITE( "semctl(IPC_GETALL, arg.array)",
1884                     (Addr)arg.array, sizeof(unsigned short) * nsems );
1885      break;
1886
1887   case VKI_SETALL:
1888#if defined(VKI_IPC_64)
1889   case VKI_SETALL|VKI_IPC_64:
1890#endif
1891      nsems = get_sem_count( arg0 );
1892      PRE_MEM_READ( "semctl(IPC_SETALL, arg.array)",
1893                    (Addr)arg.array, sizeof(unsigned short) * nsems );
1894      break;
1895   }
1896}
1897
1898void
1899ML_(generic_POST_sys_semctl) ( ThreadId tid,
1900                               UWord res,
1901                               UWord arg0, UWord arg1,
1902                               UWord arg2, UWord arg3 )
1903{
1904   union vki_semun arg = *(union vki_semun *)&arg3;
1905   UInt nsems;
1906   switch (arg2 /* cmd */) {
1907#if defined(VKI_IPC_INFO)
1908   case VKI_IPC_INFO:
1909   case VKI_SEM_INFO:
1910   case VKI_IPC_INFO|VKI_IPC_64:
1911   case VKI_SEM_INFO|VKI_IPC_64:
1912      POST_MEM_WRITE( (Addr)arg.buf, sizeof(struct vki_seminfo) );
1913      break;
1914#endif
1915
1916   case VKI_IPC_STAT:
1917#if defined(VKI_SEM_STAT)
1918   case VKI_SEM_STAT:
1919#endif
1920      POST_MEM_WRITE( (Addr)arg.buf, sizeof(struct vki_semid_ds) );
1921      break;
1922
1923#if defined(VKI_IPC_64)
1924   case VKI_IPC_STAT|VKI_IPC_64:
1925   case VKI_SEM_STAT|VKI_IPC_64:
1926#endif
1927#if defined(VKI_IPC_STAT64)
1928   case VKI_IPC_STAT64:
1929#endif
1930#if defined(VKI_IPC_64) || defined(VKI_IPC_STAT64)
1931      POST_MEM_WRITE( (Addr)arg.buf, sizeof(struct vki_semid64_ds) );
1932      break;
1933#endif
1934
1935   case VKI_GETALL:
1936#if defined(VKI_IPC_64)
1937   case VKI_GETALL|VKI_IPC_64:
1938#endif
1939      nsems = get_sem_count( arg0 );
1940      POST_MEM_WRITE( (Addr)arg.array, sizeof(unsigned short) * nsems );
1941      break;
1942   }
1943}
1944
1945/* ------ */
1946
1947/* ------ */
1948
1949static
1950SizeT get_shm_size ( Int shmid )
1951{
1952#if defined(__NR_shmctl)
1953#  ifdef VKI_IPC_64
1954   struct vki_shmid64_ds buf;
1955#    if defined(VGP_amd64_linux) || defined(VGP_arm64_linux)
1956     /* See bug 222545 comment 7 */
1957     SysRes __res = VG_(do_syscall3)(__NR_shmctl, shmid,
1958                                     VKI_IPC_STAT, (UWord)&buf);
1959#    else
1960     SysRes __res = VG_(do_syscall3)(__NR_shmctl, shmid,
1961                                     VKI_IPC_STAT|VKI_IPC_64, (UWord)&buf);
1962#    endif
1963#  else /* !def VKI_IPC_64 */
1964   struct vki_shmid_ds buf;
1965   SysRes __res = VG_(do_syscall3)(__NR_shmctl, shmid, VKI_IPC_STAT, (UWord)&buf);
1966#  endif /* def VKI_IPC_64 */
1967#elif defined(__NR_shmsys) /* Solaris */
1968   struct vki_shmid_ds buf;
1969   SysRes __res = VG_(do_syscall4)(__NR_shmsys, VKI_SHMCTL, shmid, VKI_IPC_STAT,
1970                         (UWord)&buf);
1971#else
1972   struct vki_shmid_ds buf;
1973   SysRes __res = VG_(do_syscall5)(__NR_ipc, 24 /* IPCOP_shmctl */, shmid,
1974                                 VKI_IPC_STAT, 0, (UWord)&buf);
1975#endif
1976   if (sr_isError(__res))
1977      return 0;
1978
1979   return (SizeT) buf.shm_segsz;
1980}
1981
1982UWord
1983ML_(generic_PRE_sys_shmat) ( ThreadId tid,
1984                             UWord arg0, UWord arg1, UWord arg2 )
1985{
1986   /* void *shmat(int shmid, const void *shmaddr, int shmflg); */
1987   SizeT  segmentSize = get_shm_size ( arg0 );
1988   UWord tmp;
1989   Bool  ok;
1990   if (arg1 == 0) {
1991      /* arm-linux only: work around the fact that
1992         VG_(am_get_advisory_client_simple) produces something that is
1993         VKI_PAGE_SIZE aligned, whereas what we want is something
1994         VKI_SHMLBA aligned, and VKI_SHMLBA >= VKI_PAGE_SIZE.  Hence
1995         increase the request size by VKI_SHMLBA - VKI_PAGE_SIZE and
1996         then round the result up to the next VKI_SHMLBA boundary.
1997         See bug 222545 comment 15.  So far, arm-linux is the only
1998         platform where this is known to be necessary. */
1999      vg_assert(VKI_SHMLBA >= VKI_PAGE_SIZE);
2000      if (VKI_SHMLBA > VKI_PAGE_SIZE) {
2001         segmentSize += VKI_SHMLBA - VKI_PAGE_SIZE;
2002      }
2003      tmp = VG_(am_get_advisory_client_simple)(0, segmentSize, &ok);
2004      if (ok) {
2005         if (VKI_SHMLBA > VKI_PAGE_SIZE) {
2006            arg1 = VG_ROUNDUP(tmp, VKI_SHMLBA);
2007         } else {
2008            arg1 = tmp;
2009         }
2010      }
2011   }
2012   else if (!ML_(valid_client_addr)(arg1, segmentSize, tid, "shmat"))
2013      arg1 = 0;
2014   return arg1;
2015}
2016
2017void
2018ML_(generic_POST_sys_shmat) ( ThreadId tid,
2019                              UWord res,
2020                              UWord arg0, UWord arg1, UWord arg2 )
2021{
2022   SizeT segmentSize = VG_PGROUNDUP(get_shm_size(arg0));
2023   if ( segmentSize > 0 ) {
2024      UInt prot = VKI_PROT_READ|VKI_PROT_WRITE;
2025      Bool d;
2026
2027      if (arg2 & VKI_SHM_RDONLY)
2028         prot &= ~VKI_PROT_WRITE;
2029      /* It isn't exactly correct to pass 0 for the fd and offset
2030         here.  The kernel seems to think the corresponding section
2031         does have dev/ino numbers:
2032
2033         04e52000-04ec8000 rw-s 00000000 00:06 1966090  /SYSV00000000 (deleted)
2034
2035         However there is no obvious way to find them.  In order to
2036         cope with the discrepancy, aspacem's sync checker omits the
2037         dev/ino correspondence check in cases where V does not know
2038         the dev/ino. */
2039      d = VG_(am_notify_client_shmat)( res, segmentSize, prot );
2040
2041      /* we don't distinguish whether it's read-only or
2042       * read-write -- it doesn't matter really. */
2043      VG_TRACK( new_mem_mmap, res, segmentSize, True, True, False,
2044                              0/*di_handle*/ );
2045      if (d)
2046         VG_(discard_translations)( (Addr)res,
2047                                    (ULong)VG_PGROUNDUP(segmentSize),
2048                                    "ML_(generic_POST_sys_shmat)" );
2049   }
2050}
2051
2052/* ------ */
2053
2054Bool
2055ML_(generic_PRE_sys_shmdt) ( ThreadId tid, UWord arg0 )
2056{
2057   /* int shmdt(const void *shmaddr); */
2058   return ML_(valid_client_addr)(arg0, 1, tid, "shmdt");
2059}
2060
2061void
2062ML_(generic_POST_sys_shmdt) ( ThreadId tid, UWord res, UWord arg0 )
2063{
2064   NSegment const* s = VG_(am_find_nsegment)(arg0);
2065
2066   if (s != NULL) {
2067      Addr  s_start = s->start;
2068      SizeT s_len   = s->end+1 - s->start;
2069      Bool  d;
2070
2071      vg_assert(s->kind == SkShmC);
2072      vg_assert(s->start == arg0);
2073
2074      d = VG_(am_notify_munmap)(s_start, s_len);
2075      s = NULL; /* s is now invalid */
2076      VG_TRACK( die_mem_munmap, s_start, s_len );
2077      if (d)
2078         VG_(discard_translations)( s_start,
2079                                    (ULong)s_len,
2080                                    "ML_(generic_POST_sys_shmdt)" );
2081   }
2082}
2083/* ------ */
2084
2085void
2086ML_(generic_PRE_sys_shmctl) ( ThreadId tid,
2087                              UWord arg0, UWord arg1, UWord arg2 )
2088{
2089   /* int shmctl(int shmid, int cmd, struct shmid_ds *buf); */
2090   switch (arg1 /* cmd */) {
2091#if defined(VKI_IPC_INFO)
2092   case VKI_IPC_INFO:
2093      PRE_MEM_WRITE( "shmctl(IPC_INFO, buf)",
2094                     arg2, sizeof(struct vki_shminfo) );
2095      break;
2096#if defined(VKI_IPC_64)
2097   case VKI_IPC_INFO|VKI_IPC_64:
2098      PRE_MEM_WRITE( "shmctl(IPC_INFO, buf)",
2099                     arg2, sizeof(struct vki_shminfo64) );
2100      break;
2101#endif
2102#endif
2103
2104#if defined(VKI_SHM_INFO)
2105   case VKI_SHM_INFO:
2106#if defined(VKI_IPC_64)
2107   case VKI_SHM_INFO|VKI_IPC_64:
2108#endif
2109      PRE_MEM_WRITE( "shmctl(SHM_INFO, buf)",
2110                     arg2, sizeof(struct vki_shm_info) );
2111      break;
2112#endif
2113
2114   case VKI_IPC_STAT:
2115#if defined(VKI_SHM_STAT)
2116   case VKI_SHM_STAT:
2117#endif
2118      PRE_MEM_WRITE( "shmctl(IPC_STAT, buf)",
2119                     arg2, sizeof(struct vki_shmid_ds) );
2120      break;
2121
2122#if defined(VKI_IPC_64)
2123   case VKI_IPC_STAT|VKI_IPC_64:
2124   case VKI_SHM_STAT|VKI_IPC_64:
2125      PRE_MEM_WRITE( "shmctl(IPC_STAT, arg.buf)",
2126                     arg2, sizeof(struct vki_shmid64_ds) );
2127      break;
2128#endif
2129
2130   case VKI_IPC_SET:
2131      PRE_MEM_READ( "shmctl(IPC_SET, arg.buf)",
2132                    arg2, sizeof(struct vki_shmid_ds) );
2133      break;
2134
2135#if defined(VKI_IPC_64)
2136   case VKI_IPC_SET|VKI_IPC_64:
2137      PRE_MEM_READ( "shmctl(IPC_SET, arg.buf)",
2138                    arg2, sizeof(struct vki_shmid64_ds) );
2139      break;
2140#endif
2141   }
2142}
2143
2144void
2145ML_(generic_POST_sys_shmctl) ( ThreadId tid,
2146                               UWord res,
2147                               UWord arg0, UWord arg1, UWord arg2 )
2148{
2149   switch (arg1 /* cmd */) {
2150#if defined(VKI_IPC_INFO)
2151   case VKI_IPC_INFO:
2152      POST_MEM_WRITE( arg2, sizeof(struct vki_shminfo) );
2153      break;
2154   case VKI_IPC_INFO|VKI_IPC_64:
2155      POST_MEM_WRITE( arg2, sizeof(struct vki_shminfo64) );
2156      break;
2157#endif
2158
2159#if defined(VKI_SHM_INFO)
2160   case VKI_SHM_INFO:
2161   case VKI_SHM_INFO|VKI_IPC_64:
2162      POST_MEM_WRITE( arg2, sizeof(struct vki_shm_info) );
2163      break;
2164#endif
2165
2166   case VKI_IPC_STAT:
2167#if defined(VKI_SHM_STAT)
2168   case VKI_SHM_STAT:
2169#endif
2170      POST_MEM_WRITE( arg2, sizeof(struct vki_shmid_ds) );
2171      break;
2172
2173#if defined(VKI_IPC_64)
2174   case VKI_IPC_STAT|VKI_IPC_64:
2175   case VKI_SHM_STAT|VKI_IPC_64:
2176      POST_MEM_WRITE( arg2, sizeof(struct vki_shmid64_ds) );
2177      break;
2178#endif
2179
2180
2181   }
2182}
2183
2184/* ---------------------------------------------------------------------
2185   Generic handler for mmap
2186   ------------------------------------------------------------------ */
2187
2188/*
2189 * Although mmap is specified by POSIX and the argument are generally
2190 * consistent across platforms the precise details of the low level
2191 * argument passing conventions differ. For example:
2192 *
2193 * - On x86-linux there is mmap (aka old_mmap) which takes the
2194 *   arguments in a memory block and the offset in bytes; and
2195 *   mmap2 (aka sys_mmap2) which takes the arguments in the normal
2196 *   way and the offset in pages.
2197 *
2198 * - On ppc32-linux there is mmap (aka sys_mmap) which takes the
2199 *   arguments in the normal way and the offset in bytes; and
2200 *   mmap2 (aka sys_mmap2) which takes the arguments in the normal
2201 *   way and the offset in pages.
2202 *
2203 * - On amd64-linux everything is simple and there is just the one
2204 *   call, mmap (aka sys_mmap)  which takes the arguments in the
2205 *   normal way and the offset in bytes.
2206 *
2207 * - On s390x-linux there is mmap (aka old_mmap) which takes the
2208 *   arguments in a memory block and the offset in bytes. mmap2
2209 *   is also available (but not exported via unistd.h) with
2210 *   arguments in a memory block and the offset in pages.
2211 *
2212 * To cope with all this we provide a generic handler function here
2213 * and then each platform implements one or more system call handlers
2214 * which call this generic routine after extracting and normalising
2215 * the arguments.
2216 */
2217
2218SysRes
2219ML_(generic_PRE_sys_mmap) ( ThreadId tid,
2220                            UWord arg1, UWord arg2, UWord arg3,
2221                            UWord arg4, UWord arg5, Off64T arg6 )
2222{
2223   Addr       advised;
2224   SysRes     sres;
2225   MapRequest mreq;
2226   Bool       mreq_ok;
2227
2228#  if defined(VGO_darwin)
2229   // Nb: we can't use this on Darwin, it has races:
2230   // * needs to RETRY if advisory succeeds but map fails
2231   //   (could have been some other thread in a nonblocking call)
2232   // * needs to not use fixed-position mmap() on Darwin
2233   //   (mmap will cheerfully smash whatever's already there, which might
2234   //   be a new mapping from some other thread in a nonblocking call)
2235   VG_(core_panic)("can't use ML_(generic_PRE_sys_mmap) on Darwin");
2236#  endif
2237
2238   if (arg2 == 0) {
2239      /* SuSV3 says: If len is zero, mmap() shall fail and no mapping
2240         shall be established. */
2241      return VG_(mk_SysRes_Error)( VKI_EINVAL );
2242   }
2243
2244   if (!VG_IS_PAGE_ALIGNED(arg1)) {
2245      /* zap any misaligned addresses. */
2246      /* SuSV3 says misaligned addresses only cause the MAP_FIXED case
2247         to fail.   Here, we catch them all. */
2248      return VG_(mk_SysRes_Error)( VKI_EINVAL );
2249   }
2250
2251   if (!VG_IS_PAGE_ALIGNED(arg6)) {
2252      /* zap any misaligned offsets. */
2253      /* SuSV3 says: The off argument is constrained to be aligned and
2254         sized according to the value returned by sysconf() when
2255         passed _SC_PAGESIZE or _SC_PAGE_SIZE. */
2256      return VG_(mk_SysRes_Error)( VKI_EINVAL );
2257   }
2258
2259   /* Figure out what kind of allocation constraints there are
2260      (fixed/hint/any), and ask aspacem what we should do. */
2261   mreq.start = arg1;
2262   mreq.len   = arg2;
2263   if (arg4 & VKI_MAP_FIXED) {
2264      mreq.rkind = MFixed;
2265   } else
2266#if defined(VKI_MAP_ALIGN) /* Solaris specific */
2267   if (arg4 & VKI_MAP_ALIGN) {
2268      mreq.rkind = MAlign;
2269      if (mreq.start == 0) {
2270         mreq.start = VKI_PAGE_SIZE;
2271      }
2272      /* VKI_MAP_FIXED and VKI_MAP_ALIGN don't like each other. */
2273      arg4 &= ~VKI_MAP_ALIGN;
2274   } else
2275#endif
2276   if (arg1 != 0) {
2277      mreq.rkind = MHint;
2278   } else {
2279      mreq.rkind = MAny;
2280   }
2281
2282   /* Enquire ... */
2283   advised = VG_(am_get_advisory)( &mreq, True/*client*/, &mreq_ok );
2284   if (!mreq_ok) {
2285      /* Our request was bounced, so we'd better fail. */
2286      return VG_(mk_SysRes_Error)( VKI_EINVAL );
2287   }
2288
2289#  if defined(VKI_MAP_32BIT)
2290   /* MAP_32BIT is royally unportable, so if the client asks for it, try our
2291      best to make it work (but without complexifying aspacemgr).
2292      If the user requested MAP_32BIT, the mmap-ed space must be in the
2293      first 2GB of the address space. So, return ENOMEM if aspacemgr
2294      advisory is above the first 2GB. If MAP_FIXED is also requested,
2295      MAP_32BIT has to be ignored.
2296      Assumption about aspacemgr behaviour: aspacemgr scans the address space
2297      from low addresses to find a free segment. No special effort is done
2298      to keep the first 2GB 'free' for this MAP_32BIT. So, this will often
2299      fail once the program has already allocated significant memory. */
2300   if ((arg4 & VKI_MAP_32BIT) && !(arg4 & VKI_MAP_FIXED)) {
2301      if (advised + arg2 >= 0x80000000)
2302         return VG_(mk_SysRes_Error)( VKI_ENOMEM );
2303   }
2304#  endif
2305
2306   /* Otherwise we're OK (so far).  Install aspacem's choice of
2307      address, and let the mmap go through.  */
2308   sres = VG_(am_do_mmap_NO_NOTIFY)(advised, arg2, arg3,
2309                                    arg4 | VKI_MAP_FIXED,
2310                                    arg5, arg6);
2311
2312#  if defined(VKI_MAP_32BIT)
2313   /* No recovery trial if the advisory was not accepted. */
2314   if ((arg4 & VKI_MAP_32BIT) && !(arg4 & VKI_MAP_FIXED)
2315       && sr_isError(sres)) {
2316      return VG_(mk_SysRes_Error)( VKI_ENOMEM );
2317   }
2318#  endif
2319
2320   /* A refinement: it may be that the kernel refused aspacem's choice
2321      of address.  If we were originally asked for a hinted mapping,
2322      there is still a last chance: try again at any address.
2323      Hence: */
2324   if (mreq.rkind == MHint && sr_isError(sres)) {
2325      mreq.start = 0;
2326      mreq.len   = arg2;
2327      mreq.rkind = MAny;
2328      advised = VG_(am_get_advisory)( &mreq, True/*client*/, &mreq_ok );
2329      if (!mreq_ok) {
2330         /* Our request was bounced, so we'd better fail. */
2331         return VG_(mk_SysRes_Error)( VKI_EINVAL );
2332      }
2333      /* and try again with the kernel */
2334      sres = VG_(am_do_mmap_NO_NOTIFY)(advised, arg2, arg3,
2335                                       arg4 | VKI_MAP_FIXED,
2336                                       arg5, arg6);
2337   }
2338
2339   /* Yet another refinement : sometimes valgrind chooses an address
2340      which is not acceptable by the kernel. This at least happens
2341      when mmap-ing huge pages, using the flag MAP_HUGETLB.
2342      valgrind aspacem does not know about huge pages, and modifying
2343      it to handle huge pages is not straightforward (e.g. need
2344      to understand special file system mount options).
2345      So, let's just redo an mmap, without giving any constraint to
2346      the kernel. If that succeeds, check with aspacem that the returned
2347      address is acceptable.
2348      This will give a similar effect as if the user would have
2349      hinted that address.
2350      The aspacem state will be correctly updated afterwards.
2351      We however cannot do this last refinement when the user asked
2352      for a fixed mapping, as the user asked a specific address. */
2353   if (sr_isError(sres) && !(arg4 & VKI_MAP_FIXED)) {
2354      advised = 0;
2355      /* try mmap with NULL address and without VKI_MAP_FIXED
2356         to let the kernel decide. */
2357      sres = VG_(am_do_mmap_NO_NOTIFY)(advised, arg2, arg3,
2358                                       arg4,
2359                                       arg5, arg6);
2360      if (!sr_isError(sres)) {
2361         /* The kernel is supposed to know what it is doing, but let's
2362            do a last sanity check anyway, as if the chosen address had
2363            been initially hinted by the client. The whole point of this
2364            last try was to allow mmap of huge pages to succeed without
2365            making aspacem understand them, on the other hand the kernel
2366            does not know about valgrind reservations, so this mapping
2367            can end up in free space and reservations. */
2368         mreq.start = (Addr)sr_Res(sres);
2369         mreq.len   = arg2;
2370         mreq.rkind = MHint;
2371         advised = VG_(am_get_advisory)( &mreq, True/*client*/, &mreq_ok );
2372         vg_assert(mreq_ok && advised == mreq.start);
2373      }
2374   }
2375
2376   if (!sr_isError(sres)) {
2377      ULong di_handle;
2378      /* Notify aspacem. */
2379      notify_core_of_mmap(
2380         (Addr)sr_Res(sres), /* addr kernel actually assigned */
2381         arg2, /* length */
2382         arg3, /* prot */
2383         arg4, /* the original flags value */
2384         arg5, /* fd */
2385         arg6  /* offset */
2386      );
2387      /* Load symbols? */
2388      di_handle = VG_(di_notify_mmap)( (Addr)sr_Res(sres),
2389                                       False/*allow_SkFileV*/, (Int)arg5 );
2390      /* Notify the tool. */
2391      notify_tool_of_mmap(
2392         (Addr)sr_Res(sres), /* addr kernel actually assigned */
2393         arg2, /* length */
2394         arg3, /* prot */
2395         di_handle /* so the tool can refer to the read debuginfo later,
2396                      if it wants. */
2397      );
2398   }
2399
2400   /* Stay sane */
2401   if (!sr_isError(sres) && (arg4 & VKI_MAP_FIXED))
2402      vg_assert(sr_Res(sres) == arg1);
2403
2404   return sres;
2405}
2406
2407
2408/* ---------------------------------------------------------------------
2409   The Main Entertainment ... syscall wrappers
2410   ------------------------------------------------------------------ */
2411
2412/* Note: the PRE() and POST() wrappers are for the actual functions
2413   implementing the system calls in the OS kernel.  These mostly have
2414   names like sys_write();  a few have names like old_mmap().  See the
2415   comment for ML_(syscall_table)[] for important info about the __NR_foo
2416   constants and their relationship to the sys_foo() functions.
2417
2418   Some notes about names used for syscalls and args:
2419   - For the --trace-syscalls=yes output, we use the sys_foo() name to avoid
2420     ambiguity.
2421
2422   - For error messages, we generally use a somewhat generic name
2423     for the syscall (eg. "write" rather than "sys_write").  This should be
2424     good enough for the average user to understand what is happening,
2425     without confusing them with names like "sys_write".
2426
2427   - Also, for error messages the arg names are mostly taken from the man
2428     pages (even though many of those man pages are really for glibc
2429     functions of the same name), rather than from the OS kernel source,
2430     for the same reason -- a user presented with a "bogus foo(bar)" arg
2431     will most likely look at the "foo" man page to see which is the "bar"
2432     arg.
2433
2434   Note that we use our own vki_* types.  The one exception is in
2435   PRE_REG_READn calls, where pointer types haven't been changed, because
2436   they don't need to be -- eg. for "foo*" to be used, the type foo need not
2437   be visible.
2438
2439   XXX: some of these are arch-specific, and should be factored out.
2440*/
2441
2442#define PRE(name)      DEFN_PRE_TEMPLATE(generic, name)
2443#define POST(name)     DEFN_POST_TEMPLATE(generic, name)
2444
2445PRE(sys_exit)
2446{
2447   ThreadState* tst;
2448   /* simple; just make this thread exit */
2449   PRINT("exit( %ld )", SARG1);
2450   PRE_REG_READ1(void, "exit", int, status);
2451   tst = VG_(get_ThreadState)(tid);
2452   /* Set the thread's status to be exiting, then claim that the
2453      syscall succeeded. */
2454   tst->exitreason = VgSrc_ExitThread;
2455   tst->os_state.exitcode = ARG1;
2456   SET_STATUS_Success(0);
2457}
2458
2459PRE(sys_ni_syscall)
2460{
2461   PRINT("unimplemented (by the kernel) syscall: %s! (ni_syscall)\n",
2462      VG_SYSNUM_STRING(SYSNO));
2463   PRE_REG_READ0(long, "ni_syscall");
2464   SET_STATUS_Failure( VKI_ENOSYS );
2465}
2466
2467PRE(sys_iopl)
2468{
2469   PRINT("sys_iopl ( %lu )", ARG1);
2470   PRE_REG_READ1(long, "iopl", unsigned long, level);
2471}
2472
2473PRE(sys_fsync)
2474{
2475   *flags |= SfMayBlock;
2476   PRINT("sys_fsync ( %lu )", ARG1);
2477   PRE_REG_READ1(long, "fsync", unsigned int, fd);
2478}
2479
2480PRE(sys_fdatasync)
2481{
2482   *flags |= SfMayBlock;
2483   PRINT("sys_fdatasync ( %lu )", ARG1);
2484   PRE_REG_READ1(long, "fdatasync", unsigned int, fd);
2485}
2486
2487PRE(sys_msync)
2488{
2489   *flags |= SfMayBlock;
2490   PRINT("sys_msync ( %#lx, %lu, %#lx )", ARG1, ARG2, ARG3);
2491   PRE_REG_READ3(long, "msync",
2492                 unsigned long, start, vki_size_t, length, int, flags);
2493   PRE_MEM_READ( "msync(start)", ARG1, ARG2 );
2494}
2495
2496// Nb: getpmsg() and putpmsg() are special additional syscalls used in early
2497// versions of LiS (Linux Streams).  They are not part of the kernel.
2498// Therefore, we have to provide this type ourself, rather than getting it
2499// from the kernel sources.
2500struct vki_pmsg_strbuf {
2501   int     maxlen;         /* no. of bytes in buffer */
2502   int     len;            /* no. of bytes returned */
2503   vki_caddr_t buf;        /* pointer to data */
2504};
2505PRE(sys_getpmsg)
2506{
2507   /* LiS getpmsg from http://www.gcom.com/home/linux/lis/ */
2508   struct vki_pmsg_strbuf *ctrl;
2509   struct vki_pmsg_strbuf *data;
2510   *flags |= SfMayBlock;
2511   PRINT("sys_getpmsg ( %ld, %#lx, %#lx, %#lx, %#lx )", SARG1, ARG2, ARG3,
2512         ARG4, ARG5);
2513   PRE_REG_READ5(int, "getpmsg",
2514                 int, fd, struct strbuf *, ctrl, struct strbuf *, data,
2515                 int *, bandp, int *, flagsp);
2516   ctrl = (struct vki_pmsg_strbuf *)ARG2;
2517   data = (struct vki_pmsg_strbuf *)ARG3;
2518   if (ctrl && ctrl->maxlen > 0)
2519      PRE_MEM_WRITE( "getpmsg(ctrl)", (Addr)ctrl->buf, ctrl->maxlen);
2520   if (data && data->maxlen > 0)
2521      PRE_MEM_WRITE( "getpmsg(data)", (Addr)data->buf, data->maxlen);
2522   if (ARG4)
2523      PRE_MEM_WRITE( "getpmsg(bandp)", (Addr)ARG4, sizeof(int));
2524   if (ARG5)
2525      PRE_MEM_WRITE( "getpmsg(flagsp)", (Addr)ARG5, sizeof(int));
2526}
2527POST(sys_getpmsg)
2528{
2529   struct vki_pmsg_strbuf *ctrl;
2530   struct vki_pmsg_strbuf *data;
2531   vg_assert(SUCCESS);
2532   ctrl = (struct vki_pmsg_strbuf *)ARG2;
2533   data = (struct vki_pmsg_strbuf *)ARG3;
2534   if (RES == 0 && ctrl && ctrl->len > 0) {
2535      POST_MEM_WRITE( (Addr)ctrl->buf, ctrl->len);
2536   }
2537   if (RES == 0 && data && data->len > 0) {
2538      POST_MEM_WRITE( (Addr)data->buf, data->len);
2539   }
2540}
2541
2542PRE(sys_putpmsg)
2543{
2544   /* LiS putpmsg from http://www.gcom.com/home/linux/lis/ */
2545   struct vki_pmsg_strbuf *ctrl;
2546   struct vki_pmsg_strbuf *data;
2547   *flags |= SfMayBlock;
2548   PRINT("sys_putpmsg ( %ld, %#lx, %#lx, %ld, %ld )", SARG1, ARG2, ARG3,
2549         SARG4, SARG5);
2550   PRE_REG_READ5(int, "putpmsg",
2551                 int, fd, struct strbuf *, ctrl, struct strbuf *, data,
2552                 int, band, int, flags);
2553   ctrl = (struct vki_pmsg_strbuf *)ARG2;
2554   data = (struct vki_pmsg_strbuf *)ARG3;
2555   if (ctrl && ctrl->len > 0)
2556      PRE_MEM_READ( "putpmsg(ctrl)", (Addr)ctrl->buf, ctrl->len);
2557   if (data && data->len > 0)
2558      PRE_MEM_READ( "putpmsg(data)", (Addr)data->buf, data->len);
2559}
2560
2561PRE(sys_getitimer)
2562{
2563   struct vki_itimerval *value = (struct vki_itimerval*)ARG2;
2564   PRINT("sys_getitimer ( %ld, %#lx )", SARG1, ARG2);
2565   PRE_REG_READ2(long, "getitimer", int, which, struct itimerval *, value);
2566
2567   PRE_timeval_WRITE( "getitimer(&value->it_interval)", &(value->it_interval));
2568   PRE_timeval_WRITE( "getitimer(&value->it_value)",    &(value->it_value));
2569}
2570
2571POST(sys_getitimer)
2572{
2573   if (ARG2 != (Addr)NULL) {
2574      struct vki_itimerval *value = (struct vki_itimerval*)ARG2;
2575      POST_timeval_WRITE( &(value->it_interval) );
2576      POST_timeval_WRITE( &(value->it_value) );
2577   }
2578}
2579
2580PRE(sys_setitimer)
2581{
2582   PRINT("sys_setitimer ( %ld, %#lx, %#lx )", SARG1, ARG2, ARG3);
2583   PRE_REG_READ3(long, "setitimer",
2584                 int, which,
2585                 struct itimerval *, value, struct itimerval *, ovalue);
2586   if (ARG2 != (Addr)NULL) {
2587      struct vki_itimerval *value = (struct vki_itimerval*)ARG2;
2588      PRE_timeval_READ( "setitimer(&value->it_interval)",
2589                         &(value->it_interval));
2590      PRE_timeval_READ( "setitimer(&value->it_value)",
2591                         &(value->it_value));
2592   }
2593   if (ARG3 != (Addr)NULL) {
2594      struct vki_itimerval *ovalue = (struct vki_itimerval*)ARG3;
2595      PRE_timeval_WRITE( "setitimer(&ovalue->it_interval)",
2596                         &(ovalue->it_interval));
2597      PRE_timeval_WRITE( "setitimer(&ovalue->it_value)",
2598                         &(ovalue->it_value));
2599   }
2600}
2601
2602POST(sys_setitimer)
2603{
2604   if (ARG3 != (Addr)NULL) {
2605      struct vki_itimerval *ovalue = (struct vki_itimerval*)ARG3;
2606      POST_timeval_WRITE( &(ovalue->it_interval) );
2607      POST_timeval_WRITE( &(ovalue->it_value) );
2608   }
2609}
2610
2611PRE(sys_chroot)
2612{
2613   PRINT("sys_chroot ( %#lx )", ARG1);
2614   PRE_REG_READ1(long, "chroot", const char *, path);
2615   PRE_MEM_RASCIIZ( "chroot(path)", ARG1 );
2616}
2617
2618PRE(sys_madvise)
2619{
2620   *flags |= SfMayBlock;
2621   PRINT("sys_madvise ( %#lx, %lu, %ld )", ARG1, ARG2, SARG3);
2622   PRE_REG_READ3(long, "madvise",
2623                 unsigned long, start, vki_size_t, length, int, advice);
2624}
2625
2626#if HAVE_MREMAP
2627PRE(sys_mremap)
2628{
2629   // Nb: this is different to the glibc version described in the man pages,
2630   // which lacks the fifth 'new_address' argument.
2631   if (ARG4 & VKI_MREMAP_FIXED) {
2632      PRINT("sys_mremap ( %#lx, %lu, %lu, %#lx, %#lx )",
2633            ARG1, ARG2, ARG3, ARG4, ARG5);
2634      PRE_REG_READ5(unsigned long, "mremap",
2635                    unsigned long, old_addr, unsigned long, old_size,
2636                    unsigned long, new_size, unsigned long, flags,
2637                    unsigned long, new_addr);
2638   } else {
2639      PRINT("sys_mremap ( %#lx, %lu, %lu, 0x%lx )",
2640            ARG1, ARG2, ARG3, ARG4);
2641      PRE_REG_READ4(unsigned long, "mremap",
2642                    unsigned long, old_addr, unsigned long, old_size,
2643                    unsigned long, new_size, unsigned long, flags);
2644   }
2645   SET_STATUS_from_SysRes(
2646      do_mremap((Addr)ARG1, ARG2, (Addr)ARG5, ARG3, ARG4, tid)
2647   );
2648}
2649#endif /* HAVE_MREMAP */
2650
2651PRE(sys_nice)
2652{
2653   PRINT("sys_nice ( %ld )", SARG1);
2654   PRE_REG_READ1(long, "nice", int, inc);
2655}
2656
2657PRE(sys_mlock)
2658{
2659   *flags |= SfMayBlock;
2660   PRINT("sys_mlock ( %#lx, %lu )", ARG1, ARG2);
2661   PRE_REG_READ2(long, "mlock", unsigned long, addr, vki_size_t, len);
2662}
2663
2664PRE(sys_munlock)
2665{
2666   *flags |= SfMayBlock;
2667   PRINT("sys_munlock ( %#lx, %lu )", ARG1, ARG2);
2668   PRE_REG_READ2(long, "munlock", unsigned long, addr, vki_size_t, len);
2669}
2670
2671PRE(sys_mlockall)
2672{
2673   *flags |= SfMayBlock;
2674   PRINT("sys_mlockall ( %lx )", ARG1);
2675   PRE_REG_READ1(long, "mlockall", int, flags);
2676}
2677
2678PRE(sys_setpriority)
2679{
2680   PRINT("sys_setpriority ( %ld, %ld, %ld )", SARG1, SARG2, SARG3);
2681   PRE_REG_READ3(long, "setpriority", int, which, int, who, int, prio);
2682}
2683
2684PRE(sys_getpriority)
2685{
2686   PRINT("sys_getpriority ( %ld, %ld )", SARG1, SARG2);
2687   PRE_REG_READ2(long, "getpriority", int, which, int, who);
2688}
2689
2690PRE(sys_pwrite64)
2691{
2692   *flags |= SfMayBlock;
2693#if VG_WORDSIZE == 4
2694   PRINT("sys_pwrite64 ( %lu, %#lx, %lu, %lld )",
2695         ARG1, ARG2, ARG3, (Long)MERGE64(ARG4,ARG5));
2696   PRE_REG_READ5(ssize_t, "pwrite64",
2697                 unsigned int, fd, const char *, buf, vki_size_t, count,
2698                 vki_u32, MERGE64_FIRST(offset), vki_u32, MERGE64_SECOND(offset));
2699#elif VG_WORDSIZE == 8
2700   PRINT("sys_pwrite64 ( %lu, %#lx, %lu, %ld )",
2701         ARG1, ARG2, ARG3, SARG4);
2702   PRE_REG_READ4(ssize_t, "pwrite64",
2703                 unsigned int, fd, const char *, buf, vki_size_t, count,
2704                 Word, offset);
2705#else
2706#  error Unexpected word size
2707#endif
2708   PRE_MEM_READ( "pwrite64(buf)", ARG2, ARG3 );
2709}
2710
2711PRE(sys_sync)
2712{
2713   *flags |= SfMayBlock;
2714   PRINT("sys_sync ( )");
2715   PRE_REG_READ0(long, "sync");
2716}
2717
2718PRE(sys_fstatfs)
2719{
2720   FUSE_COMPATIBLE_MAY_BLOCK();
2721   PRINT("sys_fstatfs ( %lu, %#lx )", ARG1, ARG2);
2722   PRE_REG_READ2(long, "fstatfs",
2723                 unsigned int, fd, struct statfs *, buf);
2724   PRE_MEM_WRITE( "fstatfs(buf)", ARG2, sizeof(struct vki_statfs) );
2725}
2726
2727POST(sys_fstatfs)
2728{
2729   POST_MEM_WRITE( ARG2, sizeof(struct vki_statfs) );
2730}
2731
2732PRE(sys_fstatfs64)
2733{
2734   FUSE_COMPATIBLE_MAY_BLOCK();
2735   PRINT("sys_fstatfs64 ( %lu, %lu, %#lx )", ARG1, ARG2, ARG3);
2736   PRE_REG_READ3(long, "fstatfs64",
2737                 unsigned int, fd, vki_size_t, size, struct statfs64 *, buf);
2738   PRE_MEM_WRITE( "fstatfs64(buf)", ARG3, ARG2 );
2739}
2740POST(sys_fstatfs64)
2741{
2742   POST_MEM_WRITE( ARG3, ARG2 );
2743}
2744
2745PRE(sys_getsid)
2746{
2747   PRINT("sys_getsid ( %ld )", SARG1);
2748   PRE_REG_READ1(long, "getsid", vki_pid_t, pid);
2749}
2750
2751PRE(sys_pread64)
2752{
2753   *flags |= SfMayBlock;
2754#if VG_WORDSIZE == 4
2755   PRINT("sys_pread64 ( %lu, %#lx, %lu, %lld )",
2756         ARG1, ARG2, ARG3, (Long)MERGE64(ARG4,ARG5));
2757   PRE_REG_READ5(ssize_t, "pread64",
2758                 unsigned int, fd, char *, buf, vki_size_t, count,
2759                 vki_u32, MERGE64_FIRST(offset), vki_u32, MERGE64_SECOND(offset));
2760#elif VG_WORDSIZE == 8
2761   PRINT("sys_pread64 ( %lu, %#lx, %lu, %ld )",
2762         ARG1, ARG2, ARG3, SARG4);
2763   PRE_REG_READ4(ssize_t, "pread64",
2764                 unsigned int, fd, char *, buf, vki_size_t, count,
2765                 Word, offset);
2766#else
2767#  error Unexpected word size
2768#endif
2769   PRE_MEM_WRITE( "pread64(buf)", ARG2, ARG3 );
2770}
2771POST(sys_pread64)
2772{
2773   vg_assert(SUCCESS);
2774   if (RES > 0) {
2775      POST_MEM_WRITE( ARG2, RES );
2776   }
2777}
2778
2779PRE(sys_mknod)
2780{
2781   FUSE_COMPATIBLE_MAY_BLOCK();
2782   PRINT("sys_mknod ( %#lx(%s), %#lx, %#lx )", ARG1, (HChar*)ARG1, ARG2, ARG3 );
2783   PRE_REG_READ3(long, "mknod",
2784                 const char *, pathname, int, mode, unsigned, dev);
2785   PRE_MEM_RASCIIZ( "mknod(pathname)", ARG1 );
2786}
2787
2788PRE(sys_flock)
2789{
2790   *flags |= SfMayBlock;
2791   PRINT("sys_flock ( %lu, %lu )", ARG1, ARG2 );
2792   PRE_REG_READ2(long, "flock", unsigned int, fd, unsigned int, operation);
2793}
2794
2795// Pre_read a char** argument.
2796void ML_(pre_argv_envp)(Addr a, ThreadId tid, const HChar *s1, const HChar *s2)
2797{
2798   while (True) {
2799      Addr a_deref;
2800      Addr* a_p = (Addr*)a;
2801      PRE_MEM_READ( s1, (Addr)a_p, sizeof(Addr) );
2802      a_deref = *a_p;
2803      if (0 == a_deref)
2804         break;
2805      PRE_MEM_RASCIIZ( s2, a_deref );
2806      a += sizeof(char*);
2807   }
2808}
2809
2810static Bool i_am_the_only_thread ( void )
2811{
2812   Int c = VG_(count_living_threads)();
2813   vg_assert(c >= 1); /* stay sane */
2814   return c == 1;
2815}
2816
2817/* Wait until all other threads disappear. */
2818void VG_(reap_threads)(ThreadId self)
2819{
2820   while (!i_am_the_only_thread()) {
2821      /* Let other thread(s) run */
2822      VG_(vg_yield)();
2823      VG_(poll_signals)(self);
2824   }
2825   vg_assert(i_am_the_only_thread());
2826}
2827
2828// XXX: prototype here seemingly doesn't match the prototype for i386-linux,
2829// but it seems to work nonetheless...
2830PRE(sys_execve)
2831{
2832   HChar*       path = NULL;       /* path to executable */
2833   HChar**      envp = NULL;
2834   HChar**      argv = NULL;
2835   HChar**      arg2copy;
2836   HChar*       launcher_basename = NULL;
2837   ThreadState* tst;
2838   Int          i, j, tot_args;
2839   SysRes       res;
2840   Bool         setuid_allowed, trace_this_child;
2841
2842   PRINT("sys_execve ( %#lx(%s), %#lx, %#lx )", ARG1, (HChar*)ARG1, ARG2, ARG3);
2843   PRE_REG_READ3(vki_off_t, "execve",
2844                 char *, filename, char **, argv, char **, envp);
2845   PRE_MEM_RASCIIZ( "execve(filename)", ARG1 );
2846   if (ARG2 != 0) {
2847      /* At least the terminating NULL must be addressable. */
2848      if (!ML_(safe_to_deref)((HChar **) ARG2, sizeof(HChar *))) {
2849         SET_STATUS_Failure(VKI_EFAULT);
2850         return;
2851      }
2852      ML_(pre_argv_envp)( ARG2, tid, "execve(argv)", "execve(argv[i])" );
2853   }
2854   if (ARG3 != 0) {
2855      /* At least the terminating NULL must be addressable. */
2856      if (!ML_(safe_to_deref)((HChar **) ARG3, sizeof(HChar *))) {
2857         SET_STATUS_Failure(VKI_EFAULT);
2858         return;
2859      }
2860      ML_(pre_argv_envp)( ARG3, tid, "execve(envp)", "execve(envp[i])" );
2861   }
2862
2863   vg_assert(VG_(is_valid_tid)(tid));
2864   tst = VG_(get_ThreadState)(tid);
2865
2866   /* Erk.  If the exec fails, then the following will have made a
2867      mess of things which makes it hard for us to continue.  The
2868      right thing to do is piece everything together again in
2869      POST(execve), but that's close to impossible.  Instead, we make
2870      an effort to check that the execve will work before actually
2871      doing it. */
2872
2873   /* Check that the name at least begins in client-accessible storage. */
2874   if (ARG1 == 0 /* obviously bogus */
2875       || !VG_(am_is_valid_for_client)( ARG1, 1, VKI_PROT_READ )) {
2876      SET_STATUS_Failure( VKI_EFAULT );
2877      return;
2878   }
2879
2880   // debug-only printing
2881   if (0) {
2882      VG_(printf)("ARG1 = %p(%s)\n", (void*)ARG1, (HChar*)ARG1);
2883      if (ARG2) {
2884         VG_(printf)("ARG2 = ");
2885         Int q;
2886         HChar** vec = (HChar**)ARG2;
2887         for (q = 0; vec[q]; q++)
2888            VG_(printf)("%p(%s) ", vec[q], vec[q]);
2889         VG_(printf)("\n");
2890      } else {
2891         VG_(printf)("ARG2 = null\n");
2892      }
2893   }
2894
2895   // Decide whether or not we want to follow along
2896   { // Make 'child_argv' be a pointer to the child's arg vector
2897     // (skipping the exe name)
2898     const HChar** child_argv = (const HChar**)ARG2;
2899     if (child_argv && child_argv[0] == NULL)
2900        child_argv = NULL;
2901     trace_this_child = VG_(should_we_trace_this_child)( (HChar*)ARG1, child_argv );
2902   }
2903
2904   // Do the important checks:  it is a file, is executable, permissions are
2905   // ok, etc.  We allow setuid executables to run only in the case when
2906   // we are not simulating them, that is, they to be run natively.
2907   setuid_allowed = trace_this_child  ? False  : True;
2908   res = VG_(pre_exec_check)((const HChar *)ARG1, NULL, setuid_allowed);
2909   if (sr_isError(res)) {
2910      SET_STATUS_Failure( sr_Err(res) );
2911      return;
2912   }
2913
2914   /* If we're tracing the child, and the launcher name looks bogus
2915      (possibly because launcher.c couldn't figure it out, see
2916      comments therein) then we have no option but to fail. */
2917   if (trace_this_child
2918       && (VG_(name_of_launcher) == NULL
2919           || VG_(name_of_launcher)[0] != '/')) {
2920      SET_STATUS_Failure( VKI_ECHILD ); /* "No child processes" */
2921      return;
2922   }
2923
2924   /* After this point, we can't recover if the execve fails. */
2925   VG_(debugLog)(1, "syswrap", "Exec of %s\n", (HChar*)ARG1);
2926
2927
2928   // Terminate gdbserver if it is active.
2929   if (VG_(clo_vgdb)  != Vg_VgdbNo) {
2930      // If the child will not be traced, we need to terminate gdbserver
2931      // to cleanup the gdbserver resources (e.g. the FIFO files).
2932      // If child will be traced, we also terminate gdbserver: the new
2933      // Valgrind will start a fresh gdbserver after exec.
2934      VG_(gdbserver) (0);
2935   }
2936
2937   /* Resistance is futile.  Nuke all other threads.  POSIX mandates
2938      this. (Really, nuke them all, since the new process will make
2939      its own new thread.) */
2940   VG_(nuke_all_threads_except)( tid, VgSrc_ExitThread );
2941   VG_(reap_threads)(tid);
2942
2943   // Set up the child's exe path.
2944   //
2945   if (trace_this_child) {
2946
2947      // We want to exec the launcher.  Get its pre-remembered path.
2948      path = VG_(name_of_launcher);
2949      // VG_(name_of_launcher) should have been acquired by m_main at
2950      // startup.
2951      vg_assert(path);
2952
2953      launcher_basename = VG_(strrchr)(path, '/');
2954      if (launcher_basename == NULL || launcher_basename[1] == 0) {
2955         launcher_basename = path;  // hmm, tres dubious
2956      } else {
2957         launcher_basename++;
2958      }
2959
2960   } else {
2961      path = (HChar*)ARG1;
2962   }
2963
2964   // Set up the child's environment.
2965   //
2966   // Remove the valgrind-specific stuff from the environment so the
2967   // child doesn't get vgpreload_core.so, vgpreload_<tool>.so, etc.
2968   // This is done unconditionally, since if we are tracing the child,
2969   // the child valgrind will set up the appropriate client environment.
2970   // Nb: we make a copy of the environment before trying to mangle it
2971   // as it might be in read-only memory (this was bug #101881).
2972   //
2973   // Then, if tracing the child, set VALGRIND_LIB for it.
2974   //
2975   if (ARG3 == 0) {
2976      envp = NULL;
2977   } else {
2978      envp = VG_(env_clone)( (HChar**)ARG3 );
2979      if (envp == NULL) goto hosed;
2980      VG_(env_remove_valgrind_env_stuff)( envp, True /*ro_strings*/, NULL );
2981   }
2982
2983   if (trace_this_child) {
2984      // Set VALGRIND_LIB in ARG3 (the environment)
2985      VG_(env_setenv)( &envp, VALGRIND_LIB, VG_(libdir));
2986   }
2987
2988   // Set up the child's args.  If not tracing it, they are
2989   // simply ARG2.  Otherwise, they are
2990   //
2991   // [launcher_basename] ++ VG_(args_for_valgrind) ++ [ARG1] ++ ARG2[1..]
2992   //
2993   // except that the first VG_(args_for_valgrind_noexecpass) args
2994   // are omitted.
2995   //
2996   if (!trace_this_child) {
2997      argv = (HChar**)ARG2;
2998   } else {
2999      vg_assert( VG_(args_for_valgrind) );
3000      vg_assert( VG_(args_for_valgrind_noexecpass) >= 0 );
3001      vg_assert( VG_(args_for_valgrind_noexecpass)
3002                   <= VG_(sizeXA)( VG_(args_for_valgrind) ) );
3003      /* how many args in total will there be? */
3004      // launcher basename
3005      tot_args = 1;
3006      // V's args
3007      tot_args += VG_(sizeXA)( VG_(args_for_valgrind) );
3008      tot_args -= VG_(args_for_valgrind_noexecpass);
3009      // name of client exe
3010      tot_args++;
3011      // args for client exe, skipping [0]
3012      arg2copy = (HChar**)ARG2;
3013      if (arg2copy && arg2copy[0]) {
3014         for (i = 1; arg2copy[i]; i++)
3015            tot_args++;
3016      }
3017      // allocate
3018      argv = VG_(malloc)( "di.syswrap.pre_sys_execve.1",
3019                          (tot_args+1) * sizeof(HChar*) );
3020      // copy
3021      j = 0;
3022      argv[j++] = launcher_basename;
3023      for (i = 0; i < VG_(sizeXA)( VG_(args_for_valgrind) ); i++) {
3024         if (i < VG_(args_for_valgrind_noexecpass))
3025            continue;
3026         argv[j++] = * (HChar**) VG_(indexXA)( VG_(args_for_valgrind), i );
3027      }
3028      argv[j++] = (HChar*)ARG1;
3029      if (arg2copy && arg2copy[0])
3030         for (i = 1; arg2copy[i]; i++)
3031            argv[j++] = arg2copy[i];
3032      argv[j++] = NULL;
3033      // check
3034      vg_assert(j == tot_args+1);
3035   }
3036
3037   /*
3038      Set the signal state up for exec.
3039
3040      We need to set the real signal state to make sure the exec'd
3041      process gets SIG_IGN properly.
3042
3043      Also set our real sigmask to match the client's sigmask so that
3044      the exec'd child will get the right mask.  First we need to
3045      clear out any pending signals so they they don't get delivered,
3046      which would confuse things.
3047
3048      XXX This is a bug - the signals should remain pending, and be
3049      delivered to the new process after exec.  There's also a
3050      race-condition, since if someone delivers us a signal between
3051      the sigprocmask and the execve, we'll still get the signal. Oh
3052      well.
3053   */
3054   {
3055      vki_sigset_t allsigs;
3056      vki_siginfo_t info;
3057
3058      /* What this loop does: it queries SCSS (the signal state that
3059         the client _thinks_ the kernel is in) by calling
3060         VG_(do_sys_sigaction), and modifies the real kernel signal
3061         state accordingly. */
3062      for (i = 1; i < VG_(max_signal); i++) {
3063         vki_sigaction_fromK_t sa_f;
3064         vki_sigaction_toK_t   sa_t;
3065         VG_(do_sys_sigaction)(i, NULL, &sa_f);
3066         VG_(convert_sigaction_fromK_to_toK)(&sa_f, &sa_t);
3067         if (sa_t.ksa_handler == VKI_SIG_IGN)
3068            VG_(sigaction)(i, &sa_t, NULL);
3069         else {
3070            sa_t.ksa_handler = VKI_SIG_DFL;
3071            VG_(sigaction)(i, &sa_t, NULL);
3072         }
3073      }
3074
3075      VG_(sigfillset)(&allsigs);
3076      while(VG_(sigtimedwait_zero)(&allsigs, &info) > 0)
3077         ;
3078
3079      VG_(sigprocmask)(VKI_SIG_SETMASK, &tst->sig_mask, NULL);
3080   }
3081
3082   if (0) {
3083      HChar **cpp;
3084      VG_(printf)("exec: %s\n", path);
3085      for (cpp = argv; cpp && *cpp; cpp++)
3086         VG_(printf)("argv: %s\n", *cpp);
3087      if (0)
3088         for (cpp = envp; cpp && *cpp; cpp++)
3089            VG_(printf)("env: %s\n", *cpp);
3090   }
3091
3092   SET_STATUS_from_SysRes(
3093      VG_(do_syscall3)(__NR_execve, (UWord)path, (UWord)argv, (UWord)envp)
3094   );
3095
3096   /* If we got here, then the execve failed.  We've already made way
3097      too much of a mess to continue, so we have to abort. */
3098  hosed:
3099   vg_assert(FAILURE);
3100   VG_(message)(Vg_UserMsg, "execve(%#lx(%s), %#lx, %#lx) failed, errno %lu\n",
3101                ARG1, (HChar*)ARG1, ARG2, ARG3, ERR);
3102   VG_(message)(Vg_UserMsg, "EXEC FAILED: I can't recover from "
3103                            "execve() failing, so I'm dying.\n");
3104   VG_(message)(Vg_UserMsg, "Add more stringent tests in PRE(sys_execve), "
3105                            "or work out how to recover.\n");
3106   VG_(exit)(101);
3107}
3108
3109PRE(sys_access)
3110{
3111   PRINT("sys_access ( %#lx(%s), %ld )", ARG1, (HChar*)ARG1, SARG2);
3112   PRE_REG_READ2(long, "access", const char *, pathname, int, mode);
3113   PRE_MEM_RASCIIZ( "access(pathname)", ARG1 );
3114}
3115
3116PRE(sys_alarm)
3117{
3118   PRINT("sys_alarm ( %lu )", ARG1);
3119   PRE_REG_READ1(unsigned long, "alarm", unsigned int, seconds);
3120}
3121
3122PRE(sys_brk)
3123{
3124   Addr brk_limit = VG_(brk_limit);
3125   Addr brk_new;
3126
3127   /* libc   says: int   brk(void *end_data_segment);
3128      kernel says: void* brk(void* end_data_segment);  (more or less)
3129
3130      libc returns 0 on success, and -1 (and sets errno) on failure.
3131      Nb: if you ask to shrink the dataseg end below what it
3132      currently is, that always succeeds, even if the dataseg end
3133      doesn't actually change (eg. brk(0)).  Unless it seg faults.
3134
3135      Kernel returns the new dataseg end.  If the brk() failed, this
3136      will be unchanged from the old one.  That's why calling (kernel)
3137      brk(0) gives the current dataseg end (libc brk() just returns
3138      zero in that case).
3139
3140      Both will seg fault if you shrink it back into a text segment.
3141   */
3142   PRINT("sys_brk ( %#lx )", ARG1);
3143   PRE_REG_READ1(unsigned long, "brk", unsigned long, end_data_segment);
3144
3145   brk_new = do_brk(ARG1, tid);
3146   SET_STATUS_Success( brk_new );
3147
3148   if (brk_new == ARG1) {
3149      /* brk() succeeded */
3150      if (brk_new < brk_limit) {
3151         /* successfully shrunk the data segment. */
3152         VG_TRACK( die_mem_brk, (Addr)ARG1,
3153		   brk_limit-ARG1 );
3154      } else
3155      if (brk_new > brk_limit) {
3156         /* successfully grew the data segment */
3157         VG_TRACK( new_mem_brk, brk_limit,
3158                   ARG1-brk_limit, tid );
3159      }
3160   } else {
3161      /* brk() failed */
3162      vg_assert(brk_limit == brk_new);
3163   }
3164}
3165
3166PRE(sys_chdir)
3167{
3168   FUSE_COMPATIBLE_MAY_BLOCK();
3169   PRINT("sys_chdir ( %#lx(%s) )", ARG1,(char*)ARG1);
3170   PRE_REG_READ1(long, "chdir", const char *, path);
3171   PRE_MEM_RASCIIZ( "chdir(path)", ARG1 );
3172}
3173
3174PRE(sys_chmod)
3175{
3176   FUSE_COMPATIBLE_MAY_BLOCK();
3177   PRINT("sys_chmod ( %#lx(%s), %lu )", ARG1, (HChar*)ARG1, ARG2);
3178   PRE_REG_READ2(long, "chmod", const char *, path, vki_mode_t, mode);
3179   PRE_MEM_RASCIIZ( "chmod(path)", ARG1 );
3180}
3181
3182PRE(sys_chown)
3183{
3184   FUSE_COMPATIBLE_MAY_BLOCK();
3185   PRINT("sys_chown ( %#lx(%s), 0x%lx, 0x%lx )", ARG1,(char*)ARG1,ARG2,ARG3);
3186   PRE_REG_READ3(long, "chown",
3187                 const char *, path, vki_uid_t, owner, vki_gid_t, group);
3188   PRE_MEM_RASCIIZ( "chown(path)", ARG1 );
3189}
3190
3191PRE(sys_lchown)
3192{
3193   FUSE_COMPATIBLE_MAY_BLOCK();
3194   PRINT("sys_lchown ( %#lx(%s), 0x%lx, 0x%lx )", ARG1,(char*)ARG1,ARG2,ARG3);
3195   PRE_REG_READ3(long, "lchown",
3196                 const char *, path, vki_uid_t, owner, vki_gid_t, group);
3197   PRE_MEM_RASCIIZ( "lchown(path)", ARG1 );
3198}
3199
3200PRE(sys_close)
3201{
3202   FUSE_COMPATIBLE_MAY_BLOCK();
3203   PRINT("sys_close ( %lu )", ARG1);
3204   PRE_REG_READ1(long, "close", unsigned int, fd);
3205
3206   /* Detect and negate attempts by the client to close Valgrind's log fd */
3207   if ( (!ML_(fd_allowed)(ARG1, "close", tid, False))
3208        /* If doing -d style logging (which is to fd=2), don't
3209           allow that to be closed either. */
3210        || (ARG1 == 2/*stderr*/ && VG_(debugLog_getLevel)() > 0) )
3211      SET_STATUS_Failure( VKI_EBADF );
3212}
3213
3214POST(sys_close)
3215{
3216   if (VG_(clo_track_fds)) ML_(record_fd_close)(ARG1);
3217}
3218
3219PRE(sys_dup)
3220{
3221   PRINT("sys_dup ( %lu )", ARG1);
3222   PRE_REG_READ1(long, "dup", unsigned int, oldfd);
3223}
3224
3225POST(sys_dup)
3226{
3227   vg_assert(SUCCESS);
3228   if (!ML_(fd_allowed)(RES, "dup", tid, True)) {
3229      VG_(close)(RES);
3230      SET_STATUS_Failure( VKI_EMFILE );
3231   } else {
3232      if (VG_(clo_track_fds))
3233         ML_(record_fd_open_named)(tid, RES);
3234   }
3235}
3236
3237PRE(sys_dup2)
3238{
3239   PRINT("sys_dup2 ( %lu, %lu )", ARG1, ARG2);
3240   PRE_REG_READ2(long, "dup2", unsigned int, oldfd, unsigned int, newfd);
3241   if (!ML_(fd_allowed)(ARG2, "dup2", tid, True))
3242      SET_STATUS_Failure( VKI_EBADF );
3243}
3244
3245POST(sys_dup2)
3246{
3247   vg_assert(SUCCESS);
3248   if (VG_(clo_track_fds))
3249      ML_(record_fd_open_named)(tid, RES);
3250}
3251
3252PRE(sys_fchdir)
3253{
3254   FUSE_COMPATIBLE_MAY_BLOCK();
3255   PRINT("sys_fchdir ( %lu )", ARG1);
3256   PRE_REG_READ1(long, "fchdir", unsigned int, fd);
3257}
3258
3259PRE(sys_fchown)
3260{
3261   FUSE_COMPATIBLE_MAY_BLOCK();
3262   PRINT("sys_fchown ( %lu, %lu, %lu )", ARG1, ARG2, ARG3);
3263   PRE_REG_READ3(long, "fchown",
3264                 unsigned int, fd, vki_uid_t, owner, vki_gid_t, group);
3265}
3266
3267PRE(sys_fchmod)
3268{
3269   FUSE_COMPATIBLE_MAY_BLOCK();
3270   PRINT("sys_fchmod ( %lu, %lu )", ARG1, ARG2);
3271   PRE_REG_READ2(long, "fchmod", unsigned int, fildes, vki_mode_t, mode);
3272}
3273
3274PRE(sys_newfstat)
3275{
3276   FUSE_COMPATIBLE_MAY_BLOCK();
3277   PRINT("sys_newfstat ( %lu, %#lx )", ARG1, ARG2);
3278   PRE_REG_READ2(long, "fstat", unsigned int, fd, struct stat *, buf);
3279   PRE_MEM_WRITE( "fstat(buf)", ARG2, sizeof(struct vki_stat) );
3280}
3281
3282POST(sys_newfstat)
3283{
3284   POST_MEM_WRITE( ARG2, sizeof(struct vki_stat) );
3285}
3286
3287#if !defined(VGO_solaris) && !defined(VGP_arm64_linux)
3288static vki_sigset_t fork_saved_mask;
3289
3290// In Linux, the sys_fork() function varies across architectures, but we
3291// ignore the various args it gets, and so it looks arch-neutral.  Hmm.
3292PRE(sys_fork)
3293{
3294   Bool is_child;
3295   Int child_pid;
3296   vki_sigset_t mask;
3297
3298   PRINT("sys_fork ( )");
3299   PRE_REG_READ0(long, "fork");
3300
3301   /* Block all signals during fork, so that we can fix things up in
3302      the child without being interrupted. */
3303   VG_(sigfillset)(&mask);
3304   VG_(sigprocmask)(VKI_SIG_SETMASK, &mask, &fork_saved_mask);
3305
3306   VG_(do_atfork_pre)(tid);
3307
3308   SET_STATUS_from_SysRes( VG_(do_syscall0)(__NR_fork) );
3309
3310   if (!SUCCESS) return;
3311
3312#if defined(VGO_linux)
3313   // RES is 0 for child, non-0 (the child's PID) for parent.
3314   is_child = ( RES == 0 ? True : False );
3315   child_pid = ( is_child ? -1 : RES );
3316#elif defined(VGO_darwin)
3317   // RES is the child's pid.  RESHI is 1 for child, 0 for parent.
3318   is_child = RESHI;
3319   child_pid = RES;
3320#else
3321#  error Unknown OS
3322#endif
3323
3324   if (is_child) {
3325      VG_(do_atfork_child)(tid);
3326
3327      /* restore signal mask */
3328      VG_(sigprocmask)(VKI_SIG_SETMASK, &fork_saved_mask, NULL);
3329   } else {
3330      VG_(do_atfork_parent)(tid);
3331
3332      PRINT("   fork: process %d created child %d\n", VG_(getpid)(), child_pid);
3333
3334      /* restore signal mask */
3335      VG_(sigprocmask)(VKI_SIG_SETMASK, &fork_saved_mask, NULL);
3336   }
3337}
3338#endif // !defined(VGO_solaris) && !defined(VGP_arm64_linux)
3339
3340PRE(sys_ftruncate)
3341{
3342   *flags |= SfMayBlock;
3343   PRINT("sys_ftruncate ( %lu, %lu )", ARG1, ARG2);
3344   PRE_REG_READ2(long, "ftruncate", unsigned int, fd, unsigned long, length);
3345}
3346
3347PRE(sys_truncate)
3348{
3349   *flags |= SfMayBlock;
3350   PRINT("sys_truncate ( %#lx(%s), %lu )", ARG1, (HChar*)ARG1, ARG2);
3351   PRE_REG_READ2(long, "truncate",
3352                 const char *, path, unsigned long, length);
3353   PRE_MEM_RASCIIZ( "truncate(path)", ARG1 );
3354}
3355
3356PRE(sys_ftruncate64)
3357{
3358   *flags |= SfMayBlock;
3359#if VG_WORDSIZE == 4
3360   PRINT("sys_ftruncate64 ( %lu, %llu )", ARG1, MERGE64(ARG2,ARG3));
3361   PRE_REG_READ3(long, "ftruncate64",
3362                 unsigned int, fd,
3363                 UWord, MERGE64_FIRST(length), UWord, MERGE64_SECOND(length));
3364#else
3365   PRINT("sys_ftruncate64 ( %lu, %lu )", ARG1, ARG2);
3366   PRE_REG_READ2(long, "ftruncate64",
3367                 unsigned int,fd, UWord,length);
3368#endif
3369}
3370
3371PRE(sys_truncate64)
3372{
3373   *flags |= SfMayBlock;
3374#if VG_WORDSIZE == 4
3375   PRINT("sys_truncate64 ( %#lx, %lld )", ARG1, (Long)MERGE64(ARG2, ARG3));
3376   PRE_REG_READ3(long, "truncate64",
3377                 const char *, path,
3378                 UWord, MERGE64_FIRST(length), UWord, MERGE64_SECOND(length));
3379#else
3380   PRINT("sys_truncate64 ( %#lx, %lld )", ARG1, (Long)ARG2);
3381   PRE_REG_READ2(long, "truncate64",
3382                 const char *,path, UWord,length);
3383#endif
3384   PRE_MEM_RASCIIZ( "truncate64(path)", ARG1 );
3385}
3386
3387PRE(sys_getdents)
3388{
3389   *flags |= SfMayBlock;
3390   PRINT("sys_getdents ( %lu, %#lx, %lu )", ARG1, ARG2, ARG3);
3391   PRE_REG_READ3(long, "getdents",
3392                 unsigned int, fd, struct vki_dirent *, dirp,
3393                 unsigned int, count);
3394   PRE_MEM_WRITE( "getdents(dirp)", ARG2, ARG3 );
3395}
3396
3397POST(sys_getdents)
3398{
3399   vg_assert(SUCCESS);
3400   if (RES > 0)
3401      POST_MEM_WRITE( ARG2, RES );
3402}
3403
3404PRE(sys_getdents64)
3405{
3406   *flags |= SfMayBlock;
3407   PRINT("sys_getdents64 ( %lu, %#lx, %lu )",ARG1, ARG2, ARG3);
3408   PRE_REG_READ3(long, "getdents64",
3409                 unsigned int, fd, struct vki_dirent64 *, dirp,
3410                 unsigned int, count);
3411   PRE_MEM_WRITE( "getdents64(dirp)", ARG2, ARG3 );
3412}
3413
3414POST(sys_getdents64)
3415{
3416   vg_assert(SUCCESS);
3417   if (RES > 0)
3418      POST_MEM_WRITE( ARG2, RES );
3419}
3420
3421PRE(sys_getgroups)
3422{
3423   PRINT("sys_getgroups ( %ld, %#lx )", SARG1, ARG2);
3424   PRE_REG_READ2(long, "getgroups", int, size, vki_gid_t *, list);
3425   if (ARG1 > 0)
3426      PRE_MEM_WRITE( "getgroups(list)", ARG2, ARG1 * sizeof(vki_gid_t) );
3427}
3428
3429POST(sys_getgroups)
3430{
3431   vg_assert(SUCCESS);
3432   if (ARG1 > 0 && RES > 0)
3433      POST_MEM_WRITE( ARG2, RES * sizeof(vki_gid_t) );
3434}
3435
3436PRE(sys_getcwd)
3437{
3438   // Comment from linux/fs/dcache.c:
3439   //   NOTE! The user-level library version returns a character pointer.
3440   //   The kernel system call just returns the length of the buffer filled
3441   //   (which includes the ending '\0' character), or a negative error
3442   //   value.
3443   // Is this Linux-specific?  If so it should be moved to syswrap-linux.c.
3444   PRINT("sys_getcwd ( %#lx, %llu )", ARG1,(ULong)ARG2);
3445   PRE_REG_READ2(long, "getcwd", char *, buf, unsigned long, size);
3446   PRE_MEM_WRITE( "getcwd(buf)", ARG1, ARG2 );
3447}
3448
3449POST(sys_getcwd)
3450{
3451   vg_assert(SUCCESS);
3452   if (RES != (Addr)NULL)
3453      POST_MEM_WRITE( ARG1, RES );
3454}
3455
3456PRE(sys_geteuid)
3457{
3458   PRINT("sys_geteuid ( )");
3459   PRE_REG_READ0(long, "geteuid");
3460}
3461
3462PRE(sys_getegid)
3463{
3464   PRINT("sys_getegid ( )");
3465   PRE_REG_READ0(long, "getegid");
3466}
3467
3468PRE(sys_getgid)
3469{
3470   PRINT("sys_getgid ( )");
3471   PRE_REG_READ0(long, "getgid");
3472}
3473
3474PRE(sys_getpid)
3475{
3476   PRINT("sys_getpid ()");
3477   PRE_REG_READ0(long, "getpid");
3478}
3479
3480PRE(sys_getpgid)
3481{
3482   PRINT("sys_getpgid ( %ld )", SARG1);
3483   PRE_REG_READ1(long, "getpgid", vki_pid_t, pid);
3484}
3485
3486PRE(sys_getpgrp)
3487{
3488   PRINT("sys_getpgrp ()");
3489   PRE_REG_READ0(long, "getpgrp");
3490}
3491
3492PRE(sys_getppid)
3493{
3494   PRINT("sys_getppid ()");
3495   PRE_REG_READ0(long, "getppid");
3496}
3497
3498static void common_post_getrlimit(ThreadId tid, UWord a1, UWord a2)
3499{
3500   POST_MEM_WRITE( a2, sizeof(struct vki_rlimit) );
3501
3502#ifdef _RLIMIT_POSIX_FLAG
3503   // Darwin will sometimes set _RLIMIT_POSIX_FLAG on getrlimit calls.
3504   // Unset it here to make the switch case below work correctly.
3505   a1 &= ~_RLIMIT_POSIX_FLAG;
3506#endif
3507
3508   switch (a1) {
3509   case VKI_RLIMIT_NOFILE:
3510      ((struct vki_rlimit *)a2)->rlim_cur = VG_(fd_soft_limit);
3511      ((struct vki_rlimit *)a2)->rlim_max = VG_(fd_hard_limit);
3512      break;
3513
3514   case VKI_RLIMIT_DATA:
3515      *((struct vki_rlimit *)a2) = VG_(client_rlimit_data);
3516      break;
3517
3518   case VKI_RLIMIT_STACK:
3519      *((struct vki_rlimit *)a2) = VG_(client_rlimit_stack);
3520      break;
3521   }
3522}
3523
3524PRE(sys_old_getrlimit)
3525{
3526   PRINT("sys_old_getrlimit ( %lu, %#lx )", ARG1, ARG2);
3527   PRE_REG_READ2(long, "old_getrlimit",
3528                 unsigned int, resource, struct rlimit *, rlim);
3529   PRE_MEM_WRITE( "old_getrlimit(rlim)", ARG2, sizeof(struct vki_rlimit) );
3530}
3531
3532POST(sys_old_getrlimit)
3533{
3534   common_post_getrlimit(tid, ARG1, ARG2);
3535}
3536
3537PRE(sys_getrlimit)
3538{
3539   PRINT("sys_getrlimit ( %lu, %#lx )", ARG1, ARG2);
3540   PRE_REG_READ2(long, "getrlimit",
3541                 unsigned int, resource, struct rlimit *, rlim);
3542   PRE_MEM_WRITE( "getrlimit(rlim)", ARG2, sizeof(struct vki_rlimit) );
3543}
3544
3545POST(sys_getrlimit)
3546{
3547   common_post_getrlimit(tid, ARG1, ARG2);
3548}
3549
3550PRE(sys_getrusage)
3551{
3552   PRINT("sys_getrusage ( %ld, %#lx )", SARG1, ARG2);
3553   PRE_REG_READ2(long, "getrusage", int, who, struct rusage *, usage);
3554   PRE_MEM_WRITE( "getrusage(usage)", ARG2, sizeof(struct vki_rusage) );
3555}
3556
3557POST(sys_getrusage)
3558{
3559   vg_assert(SUCCESS);
3560   if (RES == 0)
3561      POST_MEM_WRITE( ARG2, sizeof(struct vki_rusage) );
3562}
3563
3564PRE(sys_gettimeofday)
3565{
3566   PRINT("sys_gettimeofday ( %#lx, %#lx )", ARG1,ARG2);
3567   PRE_REG_READ2(long, "gettimeofday",
3568                 struct timeval *, tv, struct timezone *, tz);
3569   // GrP fixme does darwin write to *tz anymore?
3570   if (ARG1 != 0)
3571      PRE_timeval_WRITE( "gettimeofday(tv)", ARG1 );
3572   if (ARG2 != 0)
3573      PRE_MEM_WRITE( "gettimeofday(tz)", ARG2, sizeof(struct vki_timezone) );
3574}
3575
3576POST(sys_gettimeofday)
3577{
3578   vg_assert(SUCCESS);
3579   if (RES == 0) {
3580      if (ARG1 != 0)
3581         POST_timeval_WRITE( ARG1 );
3582      if (ARG2 != 0)
3583	 POST_MEM_WRITE( ARG2, sizeof(struct vki_timezone) );
3584   }
3585}
3586
3587PRE(sys_settimeofday)
3588{
3589   PRINT("sys_settimeofday ( %#lx, %#lx )", ARG1,ARG2);
3590   PRE_REG_READ2(long, "settimeofday",
3591                 struct timeval *, tv, struct timezone *, tz);
3592   if (ARG1 != 0)
3593      PRE_timeval_READ( "settimeofday(tv)", ARG1 );
3594   if (ARG2 != 0) {
3595      PRE_MEM_READ( "settimeofday(tz)", ARG2, sizeof(struct vki_timezone) );
3596      /* maybe should warn if tz->tz_dsttime is non-zero? */
3597   }
3598}
3599
3600PRE(sys_getuid)
3601{
3602   PRINT("sys_getuid ( )");
3603   PRE_REG_READ0(long, "getuid");
3604}
3605
3606void ML_(PRE_unknown_ioctl)(ThreadId tid, UWord request, UWord arg)
3607{
3608   /* We don't have any specific information on it, so
3609      try to do something reasonable based on direction and
3610      size bits.  The encoding scheme is described in
3611      /usr/include/asm/ioctl.h or /usr/include/sys/ioccom.h .
3612
3613      According to Simon Hausmann, _IOC_READ means the kernel
3614      writes a value to the ioctl value passed from the user
3615      space and the other way around with _IOC_WRITE. */
3616
3617#if defined(VGO_solaris)
3618   /* Majority of Solaris ioctl requests does not honour direction hints. */
3619   UInt dir  = _VKI_IOC_NONE;
3620#else
3621   UInt dir  = _VKI_IOC_DIR(request);
3622#endif
3623   UInt size = _VKI_IOC_SIZE(request);
3624
3625   if (SimHintiS(SimHint_lax_ioctls, VG_(clo_sim_hints))) {
3626      /*
3627       * Be very lax about ioctl handling; the only
3628       * assumption is that the size is correct. Doesn't
3629       * require the full buffer to be initialized when
3630       * writing.  Without this, using some device
3631       * drivers with a large number of strange ioctl
3632       * commands becomes very tiresome.
3633       */
3634   } else if (/* size == 0 || */ dir == _VKI_IOC_NONE) {
3635      static UWord unknown_ioctl[10];
3636      static Int moans = sizeof(unknown_ioctl) / sizeof(unknown_ioctl[0]);
3637
3638      if (moans > 0 && !VG_(clo_xml)) {
3639         /* Check if have not already moaned for this request. */
3640         UInt i;
3641         for (i = 0; i < sizeof(unknown_ioctl)/sizeof(unknown_ioctl[0]); i++) {
3642            if (unknown_ioctl[i] == request)
3643               break;
3644            if (unknown_ioctl[i] == 0) {
3645               unknown_ioctl[i] = request;
3646               moans--;
3647               VG_(umsg)("Warning: noted but unhandled ioctl 0x%lx"
3648                         " with no size/direction hints.\n", request);
3649               VG_(umsg)("   This could cause spurious value errors to appear.\n");
3650               VG_(umsg)("   See README_MISSING_SYSCALL_OR_IOCTL for "
3651                         "guidance on writing a proper wrapper.\n" );
3652               //VG_(get_and_pp_StackTrace)(tid, VG_(clo_backtrace_size));
3653               return;
3654            }
3655         }
3656      }
3657   } else {
3658      //VG_(message)(Vg_UserMsg, "UNKNOWN ioctl %#lx\n", request);
3659      //VG_(get_and_pp_StackTrace)(tid, VG_(clo_backtrace_size));
3660      if ((dir & _VKI_IOC_WRITE) && size > 0)
3661         PRE_MEM_READ( "ioctl(generic)", arg, size);
3662      if ((dir & _VKI_IOC_READ) && size > 0)
3663         PRE_MEM_WRITE( "ioctl(generic)", arg, size);
3664   }
3665}
3666
3667void ML_(POST_unknown_ioctl)(ThreadId tid, UInt res, UWord request, UWord arg)
3668{
3669   /* We don't have any specific information on it, so
3670      try to do something reasonable based on direction and
3671      size bits.  The encoding scheme is described in
3672      /usr/include/asm/ioctl.h or /usr/include/sys/ioccom.h .
3673
3674      According to Simon Hausmann, _IOC_READ means the kernel
3675      writes a value to the ioctl value passed from the user
3676      space and the other way around with _IOC_WRITE. */
3677
3678   UInt dir  = _VKI_IOC_DIR(request);
3679   UInt size = _VKI_IOC_SIZE(request);
3680   if (size > 0 && (dir & _VKI_IOC_READ)
3681       && res == 0
3682       && arg != (Addr)NULL) {
3683      POST_MEM_WRITE(arg, size);
3684   }
3685}
3686
3687/*
3688   If we're sending a SIGKILL to one of our own threads, then simulate
3689   it rather than really sending the signal, so that the target thread
3690   gets a chance to clean up.  Returns True if we did the killing (or
3691   no killing is necessary), and False if the caller should use the
3692   normal kill syscall.
3693
3694   "pid" is any pid argument which can be passed to kill; group kills
3695   (< -1, 0), and owner kills (-1) are ignored, on the grounds that
3696   they'll most likely hit all the threads and we won't need to worry
3697   about cleanup.  In truth, we can't fully emulate these multicast
3698   kills.
3699
3700   "tgid" is a thread group id.  If it is not -1, then the target
3701   thread must be in that thread group.
3702 */
3703Bool ML_(do_sigkill)(Int pid, Int tgid)
3704{
3705   ThreadState *tst;
3706   ThreadId tid;
3707
3708   if (pid <= 0)
3709      return False;
3710
3711   tid = VG_(lwpid_to_vgtid)(pid);
3712   if (tid == VG_INVALID_THREADID)
3713      return False;		/* none of our threads */
3714
3715   tst = VG_(get_ThreadState)(tid);
3716   if (tst == NULL || tst->status == VgTs_Empty)
3717      return False;		/* hm, shouldn't happen */
3718
3719   if (tgid != -1 && tst->os_state.threadgroup != tgid)
3720      return False;		/* not the right thread group */
3721
3722   /* Check to see that the target isn't already exiting. */
3723   if (!VG_(is_exiting)(tid)) {
3724      if (VG_(clo_trace_signals))
3725	 VG_(message)(Vg_DebugMsg,
3726                      "Thread %u being killed with SIGKILL\n",
3727                      tst->tid);
3728
3729      tst->exitreason = VgSrc_FatalSig;
3730      tst->os_state.fatalsig = VKI_SIGKILL;
3731
3732      if (!VG_(is_running_thread)(tid))
3733	 VG_(get_thread_out_of_syscall)(tid);
3734   }
3735
3736   return True;
3737}
3738
3739PRE(sys_kill)
3740{
3741   PRINT("sys_kill ( %ld, %ld )", SARG1, SARG2);
3742   PRE_REG_READ2(long, "kill", int, pid, int, signal);
3743   if (!ML_(client_signal_OK)(ARG2)) {
3744      SET_STATUS_Failure( VKI_EINVAL );
3745      return;
3746   }
3747
3748   /* If we're sending SIGKILL, check to see if the target is one of
3749      our threads and handle it specially. */
3750   if (ARG2 == VKI_SIGKILL && ML_(do_sigkill)(ARG1, -1))
3751      SET_STATUS_Success(0);
3752   else
3753      /* re syscall3: Darwin has a 3rd arg, which is a flag (boolean)
3754         affecting how posix-compliant the call is.  I guess it is
3755         harmless to pass the 3rd arg on other platforms; hence pass
3756         it on all. */
3757      SET_STATUS_from_SysRes( VG_(do_syscall3)(SYSNO, ARG1, ARG2, ARG3) );
3758
3759   if (VG_(clo_trace_signals))
3760      VG_(message)(Vg_DebugMsg, "kill: sent signal %ld to pid %ld\n",
3761		   SARG2, SARG1);
3762
3763   /* This kill might have given us a pending signal.  Ask for a check once
3764      the syscall is done. */
3765   *flags |= SfPollAfter;
3766}
3767
3768PRE(sys_link)
3769{
3770   *flags |= SfMayBlock;
3771   PRINT("sys_link ( %#lx(%s), %#lx(%s) )", ARG1,(char*)ARG1,ARG2,(char*)ARG2);
3772   PRE_REG_READ2(long, "link", const char *, oldpath, const char *, newpath);
3773   PRE_MEM_RASCIIZ( "link(oldpath)", ARG1);
3774   PRE_MEM_RASCIIZ( "link(newpath)", ARG2);
3775}
3776
3777PRE(sys_newlstat)
3778{
3779   PRINT("sys_newlstat ( %#lx(%s), %#lx )", ARG1,(char*)ARG1,ARG2);
3780   PRE_REG_READ2(long, "lstat", char *, file_name, struct stat *, buf);
3781   PRE_MEM_RASCIIZ( "lstat(file_name)", ARG1 );
3782   PRE_MEM_WRITE( "lstat(buf)", ARG2, sizeof(struct vki_stat) );
3783}
3784
3785POST(sys_newlstat)
3786{
3787   vg_assert(SUCCESS);
3788   POST_MEM_WRITE( ARG2, sizeof(struct vki_stat) );
3789}
3790
3791PRE(sys_mkdir)
3792{
3793   *flags |= SfMayBlock;
3794   PRINT("sys_mkdir ( %#lx(%s), %ld )", ARG1, (HChar*)ARG1, SARG2);
3795   PRE_REG_READ2(long, "mkdir", const char *, pathname, int, mode);
3796   PRE_MEM_RASCIIZ( "mkdir(pathname)", ARG1 );
3797}
3798
3799PRE(sys_mprotect)
3800{
3801   PRINT("sys_mprotect ( %#lx, %lu, %lu )", ARG1, ARG2, ARG3);
3802   PRE_REG_READ3(long, "mprotect",
3803                 unsigned long, addr, vki_size_t, len, unsigned long, prot);
3804
3805   if (!ML_(valid_client_addr)(ARG1, ARG2, tid, "mprotect")) {
3806      SET_STATUS_Failure( VKI_ENOMEM );
3807   }
3808#if defined(VKI_PROT_GROWSDOWN)
3809   else
3810   if (ARG3 & (VKI_PROT_GROWSDOWN|VKI_PROT_GROWSUP)) {
3811      /* Deal with mprotects on growable stack areas.
3812
3813         The critical files to understand all this are mm/mprotect.c
3814         in the kernel and sysdeps/unix/sysv/linux/dl-execstack.c in
3815         glibc.
3816
3817         The kernel provides PROT_GROWSDOWN and PROT_GROWSUP which
3818         round the start/end address of mprotect to the start/end of
3819         the underlying vma and glibc uses that as an easy way to
3820         change the protection of the stack by calling mprotect on the
3821         last page of the stack with PROT_GROWSDOWN set.
3822
3823         The sanity check provided by the kernel is that the vma must
3824         have the VM_GROWSDOWN/VM_GROWSUP flag set as appropriate.  */
3825      UInt grows = ARG3 & (VKI_PROT_GROWSDOWN|VKI_PROT_GROWSUP);
3826      NSegment const *aseg = VG_(am_find_nsegment)(ARG1);
3827      NSegment const *rseg;
3828
3829      vg_assert(aseg);
3830
3831      if (grows == VKI_PROT_GROWSDOWN) {
3832         rseg = VG_(am_next_nsegment)( aseg, False/*backwards*/ );
3833         if (rseg
3834             && rseg->kind == SkResvn
3835             && rseg->smode == SmUpper
3836             && rseg->end+1 == aseg->start) {
3837            Addr end = ARG1 + ARG2;
3838            ARG1 = aseg->start;
3839            ARG2 = end - aseg->start;
3840            ARG3 &= ~VKI_PROT_GROWSDOWN;
3841         } else {
3842            SET_STATUS_Failure( VKI_EINVAL );
3843         }
3844      } else if (grows == VKI_PROT_GROWSUP) {
3845         rseg = VG_(am_next_nsegment)( aseg, True/*forwards*/ );
3846         if (rseg
3847             && rseg->kind == SkResvn
3848             && rseg->smode == SmLower
3849             && aseg->end+1 == rseg->start) {
3850            ARG2 = aseg->end - ARG1 + 1;
3851            ARG3 &= ~VKI_PROT_GROWSUP;
3852         } else {
3853            SET_STATUS_Failure( VKI_EINVAL );
3854         }
3855      } else {
3856         /* both GROWSUP and GROWSDOWN */
3857         SET_STATUS_Failure( VKI_EINVAL );
3858      }
3859   }
3860#endif   // defined(VKI_PROT_GROWSDOWN)
3861}
3862
3863POST(sys_mprotect)
3864{
3865   Addr a    = ARG1;
3866   SizeT len = ARG2;
3867   Int  prot = ARG3;
3868
3869   ML_(notify_core_and_tool_of_mprotect)(a, len, prot);
3870}
3871
3872PRE(sys_munmap)
3873{
3874   if (0) VG_(printf)("  munmap( %#lx )\n", ARG1);
3875   PRINT("sys_munmap ( %#lx, %llu )", ARG1,(ULong)ARG2);
3876   PRE_REG_READ2(long, "munmap", unsigned long, start, vki_size_t, length);
3877
3878   if (!ML_(valid_client_addr)(ARG1, ARG2, tid, "munmap"))
3879      SET_STATUS_Failure( VKI_EINVAL );
3880}
3881
3882POST(sys_munmap)
3883{
3884   Addr  a   = ARG1;
3885   SizeT len = ARG2;
3886
3887   ML_(notify_core_and_tool_of_munmap)( a, len );
3888}
3889
3890PRE(sys_mincore)
3891{
3892   PRINT("sys_mincore ( %#lx, %llu, %#lx )", ARG1,(ULong)ARG2,ARG3);
3893   PRE_REG_READ3(long, "mincore",
3894                 unsigned long, start, vki_size_t, length,
3895                 unsigned char *, vec);
3896   PRE_MEM_WRITE( "mincore(vec)", ARG3, VG_PGROUNDUP(ARG2) / VKI_PAGE_SIZE );
3897}
3898POST(sys_mincore)
3899{
3900   POST_MEM_WRITE( ARG3, VG_PGROUNDUP(ARG2) / VKI_PAGE_SIZE );
3901}
3902
3903PRE(sys_nanosleep)
3904{
3905   *flags |= SfMayBlock|SfPostOnFail;
3906   PRINT("sys_nanosleep ( %#lx, %#lx )", ARG1,ARG2);
3907   PRE_REG_READ2(long, "nanosleep",
3908                 struct timespec *, req, struct timespec *, rem);
3909   PRE_MEM_READ( "nanosleep(req)", ARG1, sizeof(struct vki_timespec) );
3910   if (ARG2 != 0)
3911      PRE_MEM_WRITE( "nanosleep(rem)", ARG2, sizeof(struct vki_timespec) );
3912}
3913
3914POST(sys_nanosleep)
3915{
3916   vg_assert(SUCCESS || FAILURE);
3917   if (ARG2 != 0 && FAILURE && ERR == VKI_EINTR)
3918      POST_MEM_WRITE( ARG2, sizeof(struct vki_timespec) );
3919}
3920
3921#if defined(VGO_linux) || defined(VGO_solaris)
3922/* Handles the case where the open is of /proc/self/auxv or
3923   /proc/<pid>/auxv, and just gives out a copy of the fd for the
3924   fake file we cooked up at startup (in m_main).  Also, seeks the
3925   cloned fd back to the start.
3926   Returns True if auxv open was handled (status is set). */
3927Bool ML_(handle_auxv_open)(SyscallStatus *status, const HChar *filename,
3928                           int flags)
3929{
3930   HChar  name[30];   // large enough
3931
3932   if (!ML_(safe_to_deref)((const void *) filename, 1))
3933      return False;
3934
3935   /* Opening /proc/<pid>/auxv or /proc/self/auxv? */
3936   VG_(sprintf)(name, "/proc/%d/auxv", VG_(getpid)());
3937   if (!VG_STREQ(filename, name) && !VG_STREQ(filename, "/proc/self/auxv"))
3938      return False;
3939
3940   /* Allow to open the file only for reading. */
3941   if (flags & (VKI_O_WRONLY | VKI_O_RDWR)) {
3942      SET_STATUS_Failure(VKI_EACCES);
3943      return True;
3944   }
3945
3946#  if defined(VGO_solaris)
3947   VG_(sprintf)(name, "/proc/self/fd/%d", VG_(cl_auxv_fd));
3948   SysRes sres = VG_(open)(name, flags, 0);
3949   SET_STATUS_from_SysRes(sres);
3950#  else
3951   SysRes sres = VG_(dup)(VG_(cl_auxv_fd));
3952   SET_STATUS_from_SysRes(sres);
3953   if (!sr_isError(sres)) {
3954      OffT off = VG_(lseek)(sr_Res(sres), 0, VKI_SEEK_SET);
3955      if (off < 0)
3956         SET_STATUS_Failure(VKI_EMFILE);
3957   }
3958#  endif
3959
3960   return True;
3961}
3962#endif // defined(VGO_linux) || defined(VGO_solaris)
3963
3964PRE(sys_open)
3965{
3966   if (ARG2 & VKI_O_CREAT) {
3967      // 3-arg version
3968      PRINT("sys_open ( %#lx(%s), %ld, %ld )",ARG1, (HChar*)ARG1, SARG2, SARG3);
3969      PRE_REG_READ3(long, "open",
3970                    const char *, filename, int, flags, int, mode);
3971   } else {
3972      // 2-arg version
3973      PRINT("sys_open ( %#lx(%s), %ld )",ARG1, (HChar*)ARG1, SARG2);
3974      PRE_REG_READ2(long, "open",
3975                    const char *, filename, int, flags);
3976   }
3977   PRE_MEM_RASCIIZ( "open(filename)", ARG1 );
3978
3979#if defined(VGO_linux)
3980   /* Handle the case where the open is of /proc/self/cmdline or
3981      /proc/<pid>/cmdline, and just give it a copy of the fd for the
3982      fake file we cooked up at startup (in m_main).  Also, seek the
3983      cloned fd back to the start. */
3984   {
3985      HChar  name[30];   // large enough
3986      HChar* arg1s = (HChar*) ARG1;
3987      SysRes sres;
3988
3989      VG_(sprintf)(name, "/proc/%d/cmdline", VG_(getpid)());
3990      if (ML_(safe_to_deref)( arg1s, 1 )
3991          && (VG_STREQ(arg1s, name) || VG_STREQ(arg1s, "/proc/self/cmdline"))) {
3992         sres = VG_(dup)( VG_(cl_cmdline_fd) );
3993         SET_STATUS_from_SysRes( sres );
3994         if (!sr_isError(sres)) {
3995            OffT off = VG_(lseek)( sr_Res(sres), 0, VKI_SEEK_SET );
3996            if (off < 0)
3997               SET_STATUS_Failure( VKI_EMFILE );
3998         }
3999         return;
4000      }
4001   }
4002
4003   /* Handle also the case of /proc/self/auxv or /proc/<pid>/auxv. */
4004   if (ML_(handle_auxv_open)(status, (const HChar *)ARG1, ARG2))
4005      return;
4006#endif // defined(VGO_linux)
4007
4008   /* Otherwise handle normally */
4009   *flags |= SfMayBlock;
4010}
4011
4012POST(sys_open)
4013{
4014   vg_assert(SUCCESS);
4015   if (!ML_(fd_allowed)(RES, "open", tid, True)) {
4016      VG_(close)(RES);
4017      SET_STATUS_Failure( VKI_EMFILE );
4018   } else {
4019      if (VG_(clo_track_fds))
4020         ML_(record_fd_open_with_given_name)(tid, RES, (HChar*)ARG1);
4021   }
4022}
4023
4024PRE(sys_read)
4025{
4026   *flags |= SfMayBlock;
4027   PRINT("sys_read ( %lu, %#lx, %lu )", ARG1, ARG2, ARG3);
4028   PRE_REG_READ3(ssize_t, "read",
4029                 unsigned int, fd, char *, buf, vki_size_t, count);
4030
4031   if (!ML_(fd_allowed)(ARG1, "read", tid, False))
4032      SET_STATUS_Failure( VKI_EBADF );
4033   else
4034      PRE_MEM_WRITE( "read(buf)", ARG2, ARG3 );
4035}
4036
4037POST(sys_read)
4038{
4039   vg_assert(SUCCESS);
4040   POST_MEM_WRITE( ARG2, RES );
4041}
4042
4043PRE(sys_write)
4044{
4045   Bool ok;
4046   *flags |= SfMayBlock;
4047   PRINT("sys_write ( %lu, %#lx, %lu )", ARG1, ARG2, ARG3);
4048   PRE_REG_READ3(ssize_t, "write",
4049                 unsigned int, fd, const char *, buf, vki_size_t, count);
4050   /* check to see if it is allowed.  If not, try for an exemption from
4051      --sim-hints=enable-outer (used for self hosting). */
4052   ok = ML_(fd_allowed)(ARG1, "write", tid, False);
4053   if (!ok && ARG1 == 2/*stderr*/
4054           && SimHintiS(SimHint_enable_outer, VG_(clo_sim_hints)))
4055      ok = True;
4056#if defined(VGO_solaris)
4057   if (!ok && VG_(vfork_fildes_addr) != NULL
4058       && *VG_(vfork_fildes_addr) >= 0 && *VG_(vfork_fildes_addr) == ARG1)
4059      ok = True;
4060#endif
4061   if (!ok)
4062      SET_STATUS_Failure( VKI_EBADF );
4063   else
4064      PRE_MEM_READ( "write(buf)", ARG2, ARG3 );
4065}
4066
4067PRE(sys_creat)
4068{
4069   *flags |= SfMayBlock;
4070   PRINT("sys_creat ( %#lx(%s), %ld )", ARG1, (HChar*)ARG1, SARG2);
4071   PRE_REG_READ2(long, "creat", const char *, pathname, int, mode);
4072   PRE_MEM_RASCIIZ( "creat(pathname)", ARG1 );
4073}
4074
4075POST(sys_creat)
4076{
4077   vg_assert(SUCCESS);
4078   if (!ML_(fd_allowed)(RES, "creat", tid, True)) {
4079      VG_(close)(RES);
4080      SET_STATUS_Failure( VKI_EMFILE );
4081   } else {
4082      if (VG_(clo_track_fds))
4083         ML_(record_fd_open_with_given_name)(tid, RES, (HChar*)ARG1);
4084   }
4085}
4086
4087PRE(sys_poll)
4088{
4089   /* struct pollfd {
4090        int fd;           -- file descriptor
4091        short events;     -- requested events
4092        short revents;    -- returned events
4093      };
4094      int poll(struct pollfd *ufds, unsigned int nfds, int timeout)
4095   */
4096   UInt i;
4097   struct vki_pollfd* ufds = (struct vki_pollfd *)ARG1;
4098   *flags |= SfMayBlock;
4099   PRINT("sys_poll ( %#lx, %lu, %ld )\n", ARG1, ARG2, SARG3);
4100   PRE_REG_READ3(long, "poll",
4101                 struct vki_pollfd *, ufds, unsigned int, nfds, long, timeout);
4102
4103   for (i = 0; i < ARG2; i++) {
4104      PRE_MEM_READ( "poll(ufds.fd)",
4105                    (Addr)(&ufds[i].fd), sizeof(ufds[i].fd) );
4106      PRE_MEM_READ( "poll(ufds.events)",
4107                    (Addr)(&ufds[i].events), sizeof(ufds[i].events) );
4108      PRE_MEM_WRITE( "poll(ufds.revents)",
4109                     (Addr)(&ufds[i].revents), sizeof(ufds[i].revents) );
4110   }
4111}
4112
4113POST(sys_poll)
4114{
4115   if (RES >= 0) {
4116      UInt i;
4117      struct vki_pollfd* ufds = (struct vki_pollfd *)ARG1;
4118      for (i = 0; i < ARG2; i++)
4119	 POST_MEM_WRITE( (Addr)(&ufds[i].revents), sizeof(ufds[i].revents) );
4120   }
4121}
4122
4123PRE(sys_readlink)
4124{
4125   FUSE_COMPATIBLE_MAY_BLOCK();
4126   Word saved = SYSNO;
4127
4128   PRINT("sys_readlink ( %#lx(%s), %#lx, %llu )", ARG1,(char*)ARG1,ARG2,(ULong)ARG3);
4129   PRE_REG_READ3(long, "readlink",
4130                 const char *, path, char *, buf, int, bufsiz);
4131   PRE_MEM_RASCIIZ( "readlink(path)", ARG1 );
4132   PRE_MEM_WRITE( "readlink(buf)", ARG2,ARG3 );
4133
4134
4135   {
4136#if defined(VGO_linux) || defined(VGO_solaris)
4137#if defined(VGO_linux)
4138#define PID_EXEPATH  "/proc/%d/exe"
4139#define SELF_EXEPATH "/proc/self/exe"
4140#define SELF_EXEFD   "/proc/self/fd/%d"
4141#elif defined(VGO_solaris)
4142#define PID_EXEPATH  "/proc/%d/path/a.out"
4143#define SELF_EXEPATH "/proc/self/path/a.out"
4144#define SELF_EXEFD   "/proc/self/path/%d"
4145#endif
4146      /*
4147       * Handle the case where readlink is looking at /proc/self/exe or
4148       * /proc/<pid>/exe, or equivalent on Solaris.
4149       */
4150      HChar  name[30];   // large enough
4151      HChar* arg1s = (HChar*) ARG1;
4152      VG_(sprintf)(name, PID_EXEPATH, VG_(getpid)());
4153      if (ML_(safe_to_deref)(arg1s, 1)
4154          && (VG_STREQ(arg1s, name) || VG_STREQ(arg1s, SELF_EXEPATH))) {
4155         VG_(sprintf)(name, SELF_EXEFD, VG_(cl_exec_fd));
4156         SET_STATUS_from_SysRes( VG_(do_syscall3)(saved, (UWord)name,
4157                                                         ARG2, ARG3));
4158      } else
4159#endif
4160      {
4161         /* Normal case */
4162         SET_STATUS_from_SysRes( VG_(do_syscall3)(saved, ARG1, ARG2, ARG3));
4163      }
4164   }
4165
4166   if (SUCCESS && RES > 0)
4167      POST_MEM_WRITE( ARG2, RES );
4168}
4169
4170PRE(sys_readv)
4171{
4172   Int i;
4173   struct vki_iovec * vec;
4174   *flags |= SfMayBlock;
4175   PRINT("sys_readv ( %lu, %#lx, %lu )", ARG1, ARG2, ARG3);
4176   PRE_REG_READ3(ssize_t, "readv",
4177                 unsigned long, fd, const struct iovec *, vector,
4178                 unsigned long, count);
4179   if (!ML_(fd_allowed)(ARG1, "readv", tid, False)) {
4180      SET_STATUS_Failure( VKI_EBADF );
4181   } else {
4182      if ((Int)ARG3 >= 0)
4183         PRE_MEM_READ( "readv(vector)", ARG2, ARG3 * sizeof(struct vki_iovec) );
4184
4185      if (ARG2 != 0) {
4186         /* ToDo: don't do any of the following if the vector is invalid */
4187         vec = (struct vki_iovec *)ARG2;
4188         for (i = 0; i < (Int)ARG3; i++)
4189            PRE_MEM_WRITE( "readv(vector[...])",
4190                           (Addr)vec[i].iov_base, vec[i].iov_len );
4191      }
4192   }
4193}
4194
4195POST(sys_readv)
4196{
4197   vg_assert(SUCCESS);
4198   if (RES > 0) {
4199      Int i;
4200      struct vki_iovec * vec = (struct vki_iovec *)ARG2;
4201      Int remains = RES;
4202
4203      /* RES holds the number of bytes read. */
4204      for (i = 0; i < (Int)ARG3; i++) {
4205	 Int nReadThisBuf = vec[i].iov_len;
4206	 if (nReadThisBuf > remains) nReadThisBuf = remains;
4207	 POST_MEM_WRITE( (Addr)vec[i].iov_base, nReadThisBuf );
4208	 remains -= nReadThisBuf;
4209	 if (remains < 0) VG_(core_panic)("readv: remains < 0");
4210      }
4211   }
4212}
4213
4214PRE(sys_rename)
4215{
4216   FUSE_COMPATIBLE_MAY_BLOCK();
4217   PRINT("sys_rename ( %#lx(%s), %#lx(%s) )", ARG1,(char*)ARG1,ARG2,(char*)ARG2);
4218   PRE_REG_READ2(long, "rename", const char *, oldpath, const char *, newpath);
4219   PRE_MEM_RASCIIZ( "rename(oldpath)", ARG1 );
4220   PRE_MEM_RASCIIZ( "rename(newpath)", ARG2 );
4221}
4222
4223PRE(sys_rmdir)
4224{
4225   *flags |= SfMayBlock;
4226   PRINT("sys_rmdir ( %#lx(%s) )", ARG1,(char*)ARG1);
4227   PRE_REG_READ1(long, "rmdir", const char *, pathname);
4228   PRE_MEM_RASCIIZ( "rmdir(pathname)", ARG1 );
4229}
4230
4231PRE(sys_select)
4232{
4233   *flags |= SfMayBlock;
4234   PRINT("sys_select ( %ld, %#lx, %#lx, %#lx, %#lx )", SARG1, ARG2, ARG3,
4235         ARG4, ARG5);
4236   PRE_REG_READ5(long, "select",
4237                 int, n, vki_fd_set *, readfds, vki_fd_set *, writefds,
4238                 vki_fd_set *, exceptfds, struct vki_timeval *, timeout);
4239   // XXX: this possibly understates how much memory is read.
4240   if (ARG2 != 0)
4241      PRE_MEM_READ( "select(readfds)",
4242		     ARG2, ARG1/8 /* __FD_SETSIZE/8 */ );
4243   if (ARG3 != 0)
4244      PRE_MEM_READ( "select(writefds)",
4245		     ARG3, ARG1/8 /* __FD_SETSIZE/8 */ );
4246   if (ARG4 != 0)
4247      PRE_MEM_READ( "select(exceptfds)",
4248		     ARG4, ARG1/8 /* __FD_SETSIZE/8 */ );
4249   if (ARG5 != 0)
4250      PRE_timeval_READ( "select(timeout)", ARG5 );
4251}
4252
4253PRE(sys_setgid)
4254{
4255   PRINT("sys_setgid ( %lu )", ARG1);
4256   PRE_REG_READ1(long, "setgid", vki_gid_t, gid);
4257}
4258
4259PRE(sys_setsid)
4260{
4261   PRINT("sys_setsid ( )");
4262   PRE_REG_READ0(long, "setsid");
4263}
4264
4265PRE(sys_setgroups)
4266{
4267   PRINT("setgroups ( %llu, %#lx )", (ULong)ARG1, ARG2);
4268   PRE_REG_READ2(long, "setgroups", int, size, vki_gid_t *, list);
4269   if (ARG1 > 0)
4270      PRE_MEM_READ( "setgroups(list)", ARG2, ARG1 * sizeof(vki_gid_t) );
4271}
4272
4273PRE(sys_setpgid)
4274{
4275   PRINT("setpgid ( %ld, %ld )", SARG1, SARG2);
4276   PRE_REG_READ2(long, "setpgid", vki_pid_t, pid, vki_pid_t, pgid);
4277}
4278
4279PRE(sys_setregid)
4280{
4281   PRINT("sys_setregid ( %lu, %lu )", ARG1, ARG2);
4282   PRE_REG_READ2(long, "setregid", vki_gid_t, rgid, vki_gid_t, egid);
4283}
4284
4285PRE(sys_setreuid)
4286{
4287   PRINT("sys_setreuid ( 0x%lx, 0x%lx )", ARG1, ARG2);
4288   PRE_REG_READ2(long, "setreuid", vki_uid_t, ruid, vki_uid_t, euid);
4289}
4290
4291PRE(sys_setrlimit)
4292{
4293   UWord arg1 = ARG1;
4294   PRINT("sys_setrlimit ( %lu, %#lx )", ARG1, ARG2);
4295   PRE_REG_READ2(long, "setrlimit",
4296                 unsigned int, resource, struct rlimit *, rlim);
4297   PRE_MEM_READ( "setrlimit(rlim)", ARG2, sizeof(struct vki_rlimit) );
4298
4299#ifdef _RLIMIT_POSIX_FLAG
4300   // Darwin will sometimes set _RLIMIT_POSIX_FLAG on setrlimit calls.
4301   // Unset it here to make the if statements below work correctly.
4302   arg1 &= ~_RLIMIT_POSIX_FLAG;
4303#endif
4304
4305   if (!VG_(am_is_valid_for_client)(ARG2, sizeof(struct vki_rlimit),
4306                                    VKI_PROT_READ)) {
4307      SET_STATUS_Failure( VKI_EFAULT );
4308   }
4309   else if (((struct vki_rlimit *)ARG2)->rlim_cur
4310            > ((struct vki_rlimit *)ARG2)->rlim_max) {
4311      SET_STATUS_Failure( VKI_EINVAL );
4312   }
4313   else if (arg1 == VKI_RLIMIT_NOFILE) {
4314      if (((struct vki_rlimit *)ARG2)->rlim_cur > VG_(fd_hard_limit) ||
4315          ((struct vki_rlimit *)ARG2)->rlim_max != VG_(fd_hard_limit)) {
4316         SET_STATUS_Failure( VKI_EPERM );
4317      }
4318      else {
4319         VG_(fd_soft_limit) = ((struct vki_rlimit *)ARG2)->rlim_cur;
4320         SET_STATUS_Success( 0 );
4321      }
4322   }
4323   else if (arg1 == VKI_RLIMIT_DATA) {
4324      if (((struct vki_rlimit *)ARG2)->rlim_cur > VG_(client_rlimit_data).rlim_max ||
4325          ((struct vki_rlimit *)ARG2)->rlim_max > VG_(client_rlimit_data).rlim_max) {
4326         SET_STATUS_Failure( VKI_EPERM );
4327      }
4328      else {
4329         VG_(client_rlimit_data) = *(struct vki_rlimit *)ARG2;
4330         SET_STATUS_Success( 0 );
4331      }
4332   }
4333   else if (arg1 == VKI_RLIMIT_STACK && tid == 1) {
4334      if (((struct vki_rlimit *)ARG2)->rlim_cur > VG_(client_rlimit_stack).rlim_max ||
4335          ((struct vki_rlimit *)ARG2)->rlim_max > VG_(client_rlimit_stack).rlim_max) {
4336         SET_STATUS_Failure( VKI_EPERM );
4337      }
4338      else {
4339         /* Change the value of client_stack_szB to the rlim_cur value but
4340            only if it is smaller than the size of the allocated stack for the
4341            client.
4342            TODO: All platforms should set VG_(clstk_max_size) as part of their
4343                  setup_client_stack(). */
4344         if ((VG_(clstk_max_size) == 0)
4345             || (((struct vki_rlimit *) ARG2)->rlim_cur <= VG_(clstk_max_size)))
4346            VG_(threads)[tid].client_stack_szB = ((struct vki_rlimit *)ARG2)->rlim_cur;
4347
4348         VG_(client_rlimit_stack) = *(struct vki_rlimit *)ARG2;
4349         SET_STATUS_Success( 0 );
4350      }
4351   }
4352}
4353
4354PRE(sys_setuid)
4355{
4356   PRINT("sys_setuid ( %lu )", ARG1);
4357   PRE_REG_READ1(long, "setuid", vki_uid_t, uid);
4358}
4359
4360PRE(sys_newstat)
4361{
4362   FUSE_COMPATIBLE_MAY_BLOCK();
4363   PRINT("sys_newstat ( %#lx(%s), %#lx )", ARG1,(char*)ARG1,ARG2);
4364   PRE_REG_READ2(long, "stat", char *, file_name, struct stat *, buf);
4365   PRE_MEM_RASCIIZ( "stat(file_name)", ARG1 );
4366   PRE_MEM_WRITE( "stat(buf)", ARG2, sizeof(struct vki_stat) );
4367}
4368
4369POST(sys_newstat)
4370{
4371   POST_MEM_WRITE( ARG2, sizeof(struct vki_stat) );
4372}
4373
4374PRE(sys_statfs)
4375{
4376   FUSE_COMPATIBLE_MAY_BLOCK();
4377   PRINT("sys_statfs ( %#lx(%s), %#lx )",ARG1,(char*)ARG1,ARG2);
4378   PRE_REG_READ2(long, "statfs", const char *, path, struct statfs *, buf);
4379   PRE_MEM_RASCIIZ( "statfs(path)", ARG1 );
4380   PRE_MEM_WRITE( "statfs(buf)", ARG2, sizeof(struct vki_statfs) );
4381}
4382POST(sys_statfs)
4383{
4384   POST_MEM_WRITE( ARG2, sizeof(struct vki_statfs) );
4385}
4386
4387PRE(sys_statfs64)
4388{
4389   PRINT("sys_statfs64 ( %#lx(%s), %llu, %#lx )",ARG1,(char*)ARG1,(ULong)ARG2,ARG3);
4390   PRE_REG_READ3(long, "statfs64",
4391                 const char *, path, vki_size_t, size, struct statfs64 *, buf);
4392   PRE_MEM_RASCIIZ( "statfs64(path)", ARG1 );
4393   PRE_MEM_WRITE( "statfs64(buf)", ARG3, ARG2 );
4394}
4395POST(sys_statfs64)
4396{
4397   POST_MEM_WRITE( ARG3, ARG2 );
4398}
4399
4400PRE(sys_symlink)
4401{
4402   *flags |= SfMayBlock;
4403   PRINT("sys_symlink ( %#lx(%s), %#lx(%s) )",ARG1,(char*)ARG1,ARG2,(char*)ARG2);
4404   PRE_REG_READ2(long, "symlink", const char *, oldpath, const char *, newpath);
4405   PRE_MEM_RASCIIZ( "symlink(oldpath)", ARG1 );
4406   PRE_MEM_RASCIIZ( "symlink(newpath)", ARG2 );
4407}
4408
4409PRE(sys_time)
4410{
4411   /* time_t time(time_t *t); */
4412   PRINT("sys_time ( %#lx )",ARG1);
4413   PRE_REG_READ1(long, "time", int *, t);
4414   if (ARG1 != 0) {
4415      PRE_MEM_WRITE( "time(t)", ARG1, sizeof(vki_time_t) );
4416   }
4417}
4418
4419POST(sys_time)
4420{
4421   if (ARG1 != 0) {
4422      POST_MEM_WRITE( ARG1, sizeof(vki_time_t) );
4423   }
4424}
4425
4426PRE(sys_times)
4427{
4428   PRINT("sys_times ( %#lx )", ARG1);
4429   PRE_REG_READ1(long, "times", struct tms *, buf);
4430   if (ARG1 != 0) {
4431      PRE_MEM_WRITE( "times(buf)", ARG1, sizeof(struct vki_tms) );
4432   }
4433}
4434
4435POST(sys_times)
4436{
4437   if (ARG1 != 0) {
4438      POST_MEM_WRITE( ARG1, sizeof(struct vki_tms) );
4439   }
4440}
4441
4442PRE(sys_umask)
4443{
4444   PRINT("sys_umask ( %ld )", SARG1);
4445   PRE_REG_READ1(long, "umask", int, mask);
4446}
4447
4448PRE(sys_unlink)
4449{
4450   *flags |= SfMayBlock;
4451   PRINT("sys_unlink ( %#lx(%s) )", ARG1,(char*)ARG1);
4452   PRE_REG_READ1(long, "unlink", const char *, pathname);
4453   PRE_MEM_RASCIIZ( "unlink(pathname)", ARG1 );
4454}
4455
4456PRE(sys_newuname)
4457{
4458   PRINT("sys_newuname ( %#lx )", ARG1);
4459   PRE_REG_READ1(long, "uname", struct new_utsname *, buf);
4460   PRE_MEM_WRITE( "uname(buf)", ARG1, sizeof(struct vki_new_utsname) );
4461}
4462
4463POST(sys_newuname)
4464{
4465   if (ARG1 != 0) {
4466      POST_MEM_WRITE( ARG1, sizeof(struct vki_new_utsname) );
4467   }
4468}
4469
4470PRE(sys_waitpid)
4471{
4472   *flags |= SfMayBlock;
4473   PRINT("sys_waitpid ( %ld, %#lx, %ld )", SARG1, ARG2, SARG3);
4474   PRE_REG_READ3(long, "waitpid",
4475                 vki_pid_t, pid, unsigned int *, status, int, options);
4476
4477   if (ARG2 != (Addr)NULL)
4478      PRE_MEM_WRITE( "waitpid(status)", ARG2, sizeof(int) );
4479}
4480
4481POST(sys_waitpid)
4482{
4483   if (ARG2 != (Addr)NULL)
4484      POST_MEM_WRITE( ARG2, sizeof(int) );
4485}
4486
4487PRE(sys_wait4)
4488{
4489   *flags |= SfMayBlock;
4490   PRINT("sys_wait4 ( %ld, %#lx, %ld, %#lx )", SARG1, ARG2, SARG3, ARG4);
4491
4492   PRE_REG_READ4(long, "wait4",
4493                 vki_pid_t, pid, unsigned int *, status, int, options,
4494                 struct rusage *, rusage);
4495   if (ARG2 != (Addr)NULL)
4496      PRE_MEM_WRITE( "wait4(status)", ARG2, sizeof(int) );
4497   if (ARG4 != (Addr)NULL)
4498      PRE_MEM_WRITE( "wait4(rusage)", ARG4, sizeof(struct vki_rusage) );
4499}
4500
4501POST(sys_wait4)
4502{
4503   if (ARG2 != (Addr)NULL)
4504      POST_MEM_WRITE( ARG2, sizeof(int) );
4505   if (ARG4 != (Addr)NULL)
4506      POST_MEM_WRITE( ARG4, sizeof(struct vki_rusage) );
4507}
4508
4509PRE(sys_writev)
4510{
4511   Int i;
4512   struct vki_iovec * vec;
4513   *flags |= SfMayBlock;
4514   PRINT("sys_writev ( %lu, %#lx, %lu )", ARG1, ARG2, ARG3);
4515   PRE_REG_READ3(ssize_t, "writev",
4516                 unsigned long, fd, const struct iovec *, vector,
4517                 unsigned long, count);
4518   if (!ML_(fd_allowed)(ARG1, "writev", tid, False)) {
4519      SET_STATUS_Failure( VKI_EBADF );
4520   } else {
4521      if ((Int)ARG3 >= 0)
4522         PRE_MEM_READ( "writev(vector)",
4523                       ARG2, ARG3 * sizeof(struct vki_iovec) );
4524      if (ARG2 != 0) {
4525         /* ToDo: don't do any of the following if the vector is invalid */
4526         vec = (struct vki_iovec *)ARG2;
4527         for (i = 0; i < (Int)ARG3; i++)
4528            PRE_MEM_READ( "writev(vector[...])",
4529                           (Addr)vec[i].iov_base, vec[i].iov_len );
4530      }
4531   }
4532}
4533
4534PRE(sys_utimes)
4535{
4536   FUSE_COMPATIBLE_MAY_BLOCK();
4537   PRINT("sys_utimes ( %#lx(%s), %#lx )", ARG1,(char*)ARG1,ARG2);
4538   PRE_REG_READ2(long, "utimes", char *, filename, struct timeval *, tvp);
4539   PRE_MEM_RASCIIZ( "utimes(filename)", ARG1 );
4540   if (ARG2 != 0) {
4541      PRE_timeval_READ( "utimes(tvp[0])", ARG2 );
4542      PRE_timeval_READ( "utimes(tvp[1])", ARG2+sizeof(struct vki_timeval) );
4543   }
4544}
4545
4546PRE(sys_acct)
4547{
4548   PRINT("sys_acct ( %#lx(%s) )", ARG1,(char*)ARG1);
4549   PRE_REG_READ1(long, "acct", const char *, filename);
4550   PRE_MEM_RASCIIZ( "acct(filename)", ARG1 );
4551}
4552
4553PRE(sys_pause)
4554{
4555   *flags |= SfMayBlock;
4556   PRINT("sys_pause ( )");
4557   PRE_REG_READ0(long, "pause");
4558}
4559
4560PRE(sys_sigaltstack)
4561{
4562   PRINT("sigaltstack ( %#lx, %#lx )",ARG1,ARG2);
4563   PRE_REG_READ2(int, "sigaltstack",
4564                 const vki_stack_t *, ss, vki_stack_t *, oss);
4565   if (ARG1 != 0) {
4566      const vki_stack_t *ss = (vki_stack_t *)ARG1;
4567      PRE_MEM_READ( "sigaltstack(ss)", (Addr)&ss->ss_sp, sizeof(ss->ss_sp) );
4568      PRE_MEM_READ( "sigaltstack(ss)", (Addr)&ss->ss_flags, sizeof(ss->ss_flags) );
4569      PRE_MEM_READ( "sigaltstack(ss)", (Addr)&ss->ss_size, sizeof(ss->ss_size) );
4570   }
4571   if (ARG2 != 0) {
4572      PRE_MEM_WRITE( "sigaltstack(oss)", ARG2, sizeof(vki_stack_t) );
4573   }
4574
4575   /* Be safe. */
4576   if (ARG1 && !ML_(safe_to_deref((void*)ARG1, sizeof(vki_stack_t)))) {
4577      SET_STATUS_Failure(VKI_EFAULT);
4578      return;
4579   }
4580   if (ARG2 && !ML_(safe_to_deref((void*)ARG2, sizeof(vki_stack_t)))) {
4581      SET_STATUS_Failure(VKI_EFAULT);
4582      return;
4583   }
4584
4585   SET_STATUS_from_SysRes(
4586      VG_(do_sys_sigaltstack) (tid, (vki_stack_t*)ARG1,
4587                              (vki_stack_t*)ARG2)
4588   );
4589}
4590POST(sys_sigaltstack)
4591{
4592   vg_assert(SUCCESS);
4593   if (RES == 0 && ARG2 != 0)
4594      POST_MEM_WRITE( ARG2, sizeof(vki_stack_t));
4595}
4596
4597PRE(sys_sethostname)
4598{
4599   PRINT("sys_sethostname ( %#lx, %ld )", ARG1, SARG2);
4600   PRE_REG_READ2(long, "sethostname", char *, name, int, len);
4601   PRE_MEM_READ( "sethostname(name)", ARG1, ARG2 );
4602}
4603
4604#undef PRE
4605#undef POST
4606
4607#endif // defined(VGO_linux) || defined(VGO_darwin) || defined(VGO_solaris)
4608
4609/*--------------------------------------------------------------------*/
4610/*--- end                                                          ---*/
4611/*--------------------------------------------------------------------*/
4612