1 
2 /*--------------------------------------------------------------------*/
3 /*--- Handle system calls.                          syswrap-main.c ---*/
4 /*--------------------------------------------------------------------*/
5 
6 /*
7    This file is part of Valgrind, a dynamic binary instrumentation
8    framework.
9 
10    Copyright (C) 2000-2013 Julian Seward
11       jseward@acm.org
12 
13    This program is free software; you can redistribute it and/or
14    modify it under the terms of the GNU General Public License as
15    published by the Free Software Foundation; either version 2 of the
16    License, or (at your option) any later version.
17 
18    This program is distributed in the hope that it will be useful, but
19    WITHOUT ANY WARRANTY; without even the implied warranty of
20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
21    General Public License for more details.
22 
23    You should have received a copy of the GNU General Public License
24    along with this program; if not, write to the Free Software
25    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
26    02111-1307, USA.
27 
28    The GNU General Public License is contained in the file COPYING.
29 */
30 
31 #include "libvex_guest_offsets.h"
32 #include "libvex_trc_values.h"
33 #include "pub_core_basics.h"
34 #include "pub_core_aspacemgr.h"
35 #include "pub_core_vki.h"
36 #include "pub_core_vkiscnums.h"
37 #include "pub_core_threadstate.h"
38 #include "pub_core_libcbase.h"
39 #include "pub_core_libcassert.h"
40 #include "pub_core_libcprint.h"
41 #include "pub_core_libcproc.h"      // For VG_(getpid)()
42 #include "pub_core_libcsignal.h"
43 #include "pub_core_scheduler.h"     // For VG_({acquire,release}_BigLock),
44                                     //   and VG_(vg_yield)
45 #include "pub_core_stacktrace.h"    // For VG_(get_and_pp_StackTrace)()
46 #include "pub_core_tooliface.h"
47 #include "pub_core_options.h"
48 #include "pub_core_signals.h"       // For VG_SIGVGKILL, VG_(poll_signals)
49 #include "pub_core_syscall.h"
50 #include "pub_core_machine.h"
51 #include "pub_core_mallocfree.h"
52 #include "pub_core_syswrap.h"
53 
54 #include "priv_types_n_macros.h"
55 #include "priv_syswrap-main.h"
56 
57 #if defined(VGO_darwin)
58 #include "priv_syswrap-darwin.h"
59 #endif
60 
61 /* Useful info which needs to be recorded somewhere:
62    Use of registers in syscalls is:
63 
64           NUM   ARG1 ARG2 ARG3 ARG4 ARG5 ARG6 ARG7 ARG8 RESULT
65    LINUX:
66    x86    eax   ebx  ecx  edx  esi  edi  ebp  n/a  n/a  eax       (== NUM)
67    amd64  rax   rdi  rsi  rdx  r10  r8   r9   n/a  n/a  rax       (== NUM)
68    ppc32  r0    r3   r4   r5   r6   r7   r8   n/a  n/a  r3+CR0.SO (== ARG1)
69    ppc64  r0    r3   r4   r5   r6   r7   r8   n/a  n/a  r3+CR0.SO (== ARG1)
70    arm    r7    r0   r1   r2   r3   r4   r5   n/a  n/a  r0        (== ARG1)
71    mips32 v0    a0   a1   a2   a3 stack stack n/a  n/a  v0        (== NUM)
72    mips64 v0    a0   a1   a2   a3   a4   a5   a6   a7   v0        (== NUM)
73    arm64  x8    x0   x1   x2   x3   x4   x5   n/a  n/a  x0 ??     (== ARG1??)
74 
75    On s390x the svc instruction is used for system calls. The system call
76    number is encoded in the instruction (8 bit immediate field). Since Linux
77    2.6 it is also allowed to use svc 0 with the system call number in r1.
78    This was introduced for system calls >255, but works for all. It is
79    also possible to see the svc 0 together with an EXecute instruction, that
80    fills in the immediate field.
81    s390x r1/SVC r2   r3   r4   r5   r6   r7   n/a  n/a  r2        (== ARG1)
82 
83           NUM   ARG1 ARG2 ARG3 ARG4 ARG5 ARG6 ARG7 ARG8 RESULT
84    DARWIN:
85    x86    eax   +4   +8   +12  +16  +20  +24  +28  +32  edx:eax, eflags.c
86    amd64  rax   rdi  rsi  rdx  rcx  r8   r9   +8   +16  rdx:rax, rflags.c
87 
88    For x86-darwin, "+N" denotes "in memory at N(%esp)"; ditto
89    amd64-darwin.  Apparently 0(%esp) is some kind of return address
90    (perhaps for syscalls done with "sysenter"?)  I don't think it is
91    relevant for syscalls done with "int $0x80/1/2".
92 */
93 
94 /* This is the top level of the system-call handler module.  All
95    system calls are channelled through here, doing two things:
96 
97    * notify the tool of the events (mem/reg reads, writes) happening
98 
99    * perform the syscall, usually by passing it along to the kernel
100      unmodified.
101 
102    A magical piece of assembly code, do_syscall_for_client_WRK, in
103    syscall-$PLATFORM.S does the tricky bit of passing a syscall to the
104    kernel, whilst having the simulator retain control.
105 */
106 
107 /* The main function is VG_(client_syscall).  The simulation calls it
108    whenever a client thread wants to do a syscall.  The following is a
109    sketch of what it does.
110 
111    * Ensures the root thread's stack is suitably mapped.  Tedious and
112      arcane.  See big big comment in VG_(client_syscall).
113 
114    * First, it rounds up the syscall number and args (which is a
115      platform dependent activity) and puts them in a struct ("args")
116      and also a copy in "orig_args".
117 
118      The pre/post wrappers refer to these structs and so no longer
119      need magic macros to access any specific registers.  This struct
120      is stored in thread-specific storage.
121 
122 
123    * The pre-wrapper is called, passing it a pointer to struct
124      "args".
125 
126 
127    * The pre-wrapper examines the args and pokes the tool
128      appropriately.  It may modify the args; this is why "orig_args"
129      is also stored.
130 
131      The pre-wrapper may choose to 'do' the syscall itself, and
132      concludes one of three outcomes:
133 
134        Success(N)    -- syscall is already complete, with success;
135                         result is N
136 
137        Fail(N)       -- syscall is already complete, with failure;
138                         error code is N
139 
140        HandToKernel  -- (the usual case): this needs to be given to
141                         the kernel to be done, using the values in
142                         the possibly-modified "args" struct.
143 
144      In addition, the pre-wrapper may set some flags:
145 
146        MayBlock   -- only applicable when outcome==HandToKernel
147 
148        PostOnFail -- only applicable when outcome==HandToKernel or Fail
149 
150 
151    * If the pre-outcome is HandToKernel, the syscall is duly handed
152      off to the kernel (perhaps involving some thread switchery, but
153      that's not important).  This reduces the possible set of outcomes
154      to either Success(N) or Fail(N).
155 
156 
157    * The outcome (Success(N) or Fail(N)) is written back to the guest
158      register(s).  This is platform specific:
159 
160      x86:    Success(N) ==>  eax = N
161              Fail(N)    ==>  eax = -N
162 
163      ditto amd64
164 
165      ppc32:  Success(N) ==>  r3 = N, CR0.SO = 0
166              Fail(N) ==>     r3 = N, CR0.SO = 1
167 
168      Darwin:
169      x86:    Success(N) ==>  edx:eax = N, cc = 0
170              Fail(N)    ==>  edx:eax = N, cc = 1
171 
172      s390x:  Success(N) ==>  r2 = N
173              Fail(N)    ==>  r2 = -N
174 
175    * The post wrapper is called if:
176 
177      - it exists, and
178      - outcome==Success or (outcome==Fail and PostOnFail is set)
179 
180      The post wrapper is passed the adulterated syscall args (struct
181      "args"), and the syscall outcome (viz, Success(N) or Fail(N)).
182 
183    There are several other complications, primarily to do with
184    syscalls getting interrupted, explained in comments in the code.
185 */
186 
187 /* CAVEATS for writing wrappers.  It is important to follow these!
188 
189    The macros defined in priv_types_n_macros.h are designed to help
190    decouple the wrapper logic from the actual representation of
191    syscall args/results, since these wrappers are designed to work on
192    multiple platforms.
193 
194    Sometimes a PRE wrapper will complete the syscall itself, without
195    handing it to the kernel.  It will use one of SET_STATUS_Success,
196    SET_STATUS_Failure or SET_STATUS_from_SysRes to set the return
197    value.  It is critical to appreciate that use of the macro does not
198    immediately cause the underlying guest state to be updated -- that
199    is done by the driver logic in this file, when the wrapper returns.
200 
201    As a result, PRE wrappers of the following form will malfunction:
202 
203    PRE(fooble)
204    {
205       ... do stuff ...
206       SET_STATUS_Somehow(...)
207 
208       // do something that assumes guest state is up to date
209    }
210 
211    In particular, direct or indirect calls to VG_(poll_signals) after
212    setting STATUS can cause the guest state to be read (in order to
213    build signal frames).  Do not do this.  If you want a signal poll
214    after the syscall goes through, do "*flags |= SfPollAfter" and the
215    driver logic will do it for you.
216 
217    -----------
218 
219    Another critical requirement following introduction of new address
220    space manager (JRS, 20050923):
221 
222    In a situation where the mappedness of memory has changed, aspacem
223    should be notified BEFORE the tool.  Hence the following is
224    correct:
225 
226       Bool d = VG_(am_notify_munmap)(s->start, s->end+1 - s->start);
227       VG_TRACK( die_mem_munmap, s->start, s->end+1 - s->start );
228       if (d)
229          VG_(discard_translations)(s->start, s->end+1 - s->start);
230 
231    whilst this is wrong:
232 
233       VG_TRACK( die_mem_munmap, s->start, s->end+1 - s->start );
234       Bool d = VG_(am_notify_munmap)(s->start, s->end+1 - s->start);
235       if (d)
236          VG_(discard_translations)(s->start, s->end+1 - s->start);
237 
238    The reason is that the tool may itself ask aspacem for more shadow
239    memory as a result of the VG_TRACK call.  In such a situation it is
240    critical that aspacem's segment array is up to date -- hence the
241    need to notify aspacem first.
242 
243    -----------
244 
245    Also .. take care to call VG_(discard_translations) whenever
246    memory with execute permissions is unmapped.
247 */
248 
249 
250 /* ---------------------------------------------------------------------
251    Do potentially blocking syscall for the client, and mess with
252    signal masks at the same time.
253    ------------------------------------------------------------------ */
254 
255 /* Perform a syscall on behalf of a client thread, using a specific
256    signal mask.  On completion, the signal mask is set to restore_mask
257    (which presumably blocks almost everything).  If a signal happens
258    during the syscall, the handler should call
259    VG_(fixup_guest_state_after_syscall_interrupted) to adjust the
260    thread's context to do the right thing.
261 
262    The _WRK function is handwritten assembly, implemented per-platform
263    in coregrind/m_syswrap/syscall-$PLAT.S.  It has some very magic
264    properties.  See comments at the top of
265    VG_(fixup_guest_state_after_syscall_interrupted) below for details.
266 
267    This function (these functions) are required to return zero in case
268    of success (even if the syscall itself failed), and nonzero if the
269    sigprocmask-swizzling calls failed.  We don't actually care about
270    the failure values from sigprocmask, although most of the assembly
271    implementations do attempt to return that, using the convention
272    0 for success, or 0x8000 | error-code for failure.
273 */
274 #if defined(VGO_linux)
275 extern
276 UWord ML_(do_syscall_for_client_WRK)( Word syscallno,
277                                       void* guest_state,
278                                       const vki_sigset_t *syscall_mask,
279                                       const vki_sigset_t *restore_mask,
280                                       Word sigsetSzB );
281 #elif defined(VGO_darwin)
282 extern
283 UWord ML_(do_syscall_for_client_unix_WRK)( Word syscallno,
284                                            void* guest_state,
285                                            const vki_sigset_t *syscall_mask,
286                                            const vki_sigset_t *restore_mask,
287                                            Word sigsetSzB ); /* unused */
288 extern
289 UWord ML_(do_syscall_for_client_mach_WRK)( Word syscallno,
290                                            void* guest_state,
291                                            const vki_sigset_t *syscall_mask,
292                                            const vki_sigset_t *restore_mask,
293                                            Word sigsetSzB ); /* unused */
294 extern
295 UWord ML_(do_syscall_for_client_mdep_WRK)( Word syscallno,
296                                            void* guest_state,
297                                            const vki_sigset_t *syscall_mask,
298                                            const vki_sigset_t *restore_mask,
299                                            Word sigsetSzB ); /* unused */
300 #else
301 #  error "Unknown OS"
302 #endif
303 
304 
305 static
do_syscall_for_client(Int syscallno,ThreadState * tst,const vki_sigset_t * syscall_mask)306 void do_syscall_for_client ( Int syscallno,
307                              ThreadState* tst,
308                              const vki_sigset_t* syscall_mask )
309 {
310    vki_sigset_t saved;
311    UWord err;
312 #  if defined(VGO_linux)
313    err = ML_(do_syscall_for_client_WRK)(
314             syscallno, &tst->arch.vex,
315             syscall_mask, &saved, sizeof(vki_sigset_t)
316          );
317 #  elif defined(VGO_darwin)
318    switch (VG_DARWIN_SYSNO_CLASS(syscallno)) {
319       case VG_DARWIN_SYSCALL_CLASS_UNIX:
320          err = ML_(do_syscall_for_client_unix_WRK)(
321                   VG_DARWIN_SYSNO_FOR_KERNEL(syscallno), &tst->arch.vex,
322                   syscall_mask, &saved, 0/*unused:sigsetSzB*/
323                );
324          break;
325       case VG_DARWIN_SYSCALL_CLASS_MACH:
326          err = ML_(do_syscall_for_client_mach_WRK)(
327                   VG_DARWIN_SYSNO_FOR_KERNEL(syscallno), &tst->arch.vex,
328                   syscall_mask, &saved, 0/*unused:sigsetSzB*/
329                );
330          break;
331       case VG_DARWIN_SYSCALL_CLASS_MDEP:
332          err = ML_(do_syscall_for_client_mdep_WRK)(
333                   VG_DARWIN_SYSNO_FOR_KERNEL(syscallno), &tst->arch.vex,
334                   syscall_mask, &saved, 0/*unused:sigsetSzB*/
335                );
336          break;
337       default:
338          vg_assert(0);
339          /*NOTREACHED*/
340          break;
341    }
342 #  else
343 #    error "Unknown OS"
344 #  endif
345    vg_assert2(
346       err == 0,
347       "ML_(do_syscall_for_client_WRK): sigprocmask error %d",
348       (Int)(err & 0xFFF)
349    );
350 }
351 
352 
353 /* ---------------------------------------------------------------------
354    Impedance matchers and misc helpers
355    ------------------------------------------------------------------ */
356 
357 static
eq_SyscallArgs(SyscallArgs * a1,SyscallArgs * a2)358 Bool eq_SyscallArgs ( SyscallArgs* a1, SyscallArgs* a2 )
359 {
360    return a1->sysno == a2->sysno
361           && a1->arg1 == a2->arg1
362           && a1->arg2 == a2->arg2
363           && a1->arg3 == a2->arg3
364           && a1->arg4 == a2->arg4
365           && a1->arg5 == a2->arg5
366           && a1->arg6 == a2->arg6
367           && a1->arg7 == a2->arg7
368           && a1->arg8 == a2->arg8;
369 }
370 
371 static
eq_SyscallStatus(SyscallStatus * s1,SyscallStatus * s2)372 Bool eq_SyscallStatus ( SyscallStatus* s1, SyscallStatus* s2 )
373 {
374    /* was: return s1->what == s2->what && sr_EQ( s1->sres, s2->sres ); */
375    if (s1->what == s2->what && sr_EQ( s1->sres, s2->sres ))
376       return True;
377 #  if defined(VGO_darwin)
378    /* Darwin-specific debugging guff */
379    vg_assert(s1->what == s2->what);
380    VG_(printf)("eq_SyscallStatus:\n");
381    VG_(printf)("  {%lu %lu %u}\n", s1->sres._wLO, s1->sres._wHI, s1->sres._mode);
382    VG_(printf)("  {%lu %lu %u}\n", s2->sres._wLO, s2->sres._wHI, s2->sres._mode);
383    vg_assert(0);
384 #  endif
385    return False;
386 }
387 
388 /* Convert between SysRes and SyscallStatus, to the extent possible. */
389 
390 static
convert_SysRes_to_SyscallStatus(SysRes res)391 SyscallStatus convert_SysRes_to_SyscallStatus ( SysRes res )
392 {
393    SyscallStatus status;
394    status.what = SsComplete;
395    status.sres = res;
396    return status;
397 }
398 
399 
400 /* Impedance matchers.  These convert syscall arg or result data from
401    the platform-specific in-guest-state format to the canonical
402    formats, and back. */
403 
404 static
getSyscallArgsFromGuestState(SyscallArgs * canonical,VexGuestArchState * gst_vanilla,UInt trc)405 void getSyscallArgsFromGuestState ( /*OUT*/SyscallArgs*       canonical,
406                                     /*IN*/ VexGuestArchState* gst_vanilla,
407                                     /*IN*/ UInt trc )
408 {
409 #if defined(VGP_x86_linux)
410    VexGuestX86State* gst = (VexGuestX86State*)gst_vanilla;
411    canonical->sysno = gst->guest_EAX;
412    canonical->arg1  = gst->guest_EBX;
413    canonical->arg2  = gst->guest_ECX;
414    canonical->arg3  = gst->guest_EDX;
415    canonical->arg4  = gst->guest_ESI;
416    canonical->arg5  = gst->guest_EDI;
417    canonical->arg6  = gst->guest_EBP;
418    canonical->arg7  = 0;
419    canonical->arg8  = 0;
420 
421 #elif defined(VGP_amd64_linux)
422    VexGuestAMD64State* gst = (VexGuestAMD64State*)gst_vanilla;
423    canonical->sysno = gst->guest_RAX;
424    canonical->arg1  = gst->guest_RDI;
425    canonical->arg2  = gst->guest_RSI;
426    canonical->arg3  = gst->guest_RDX;
427    canonical->arg4  = gst->guest_R10;
428    canonical->arg5  = gst->guest_R8;
429    canonical->arg6  = gst->guest_R9;
430    canonical->arg7  = 0;
431    canonical->arg8  = 0;
432 
433 #elif defined(VGP_ppc32_linux)
434    VexGuestPPC32State* gst = (VexGuestPPC32State*)gst_vanilla;
435    canonical->sysno = gst->guest_GPR0;
436    canonical->arg1  = gst->guest_GPR3;
437    canonical->arg2  = gst->guest_GPR4;
438    canonical->arg3  = gst->guest_GPR5;
439    canonical->arg4  = gst->guest_GPR6;
440    canonical->arg5  = gst->guest_GPR7;
441    canonical->arg6  = gst->guest_GPR8;
442    canonical->arg7  = 0;
443    canonical->arg8  = 0;
444 
445 #elif defined(VGP_ppc64be_linux) || defined(VGP_ppc64le_linux)
446    VexGuestPPC64State* gst = (VexGuestPPC64State*)gst_vanilla;
447    canonical->sysno = gst->guest_GPR0;
448    canonical->arg1  = gst->guest_GPR3;
449    canonical->arg2  = gst->guest_GPR4;
450    canonical->arg3  = gst->guest_GPR5;
451    canonical->arg4  = gst->guest_GPR6;
452    canonical->arg5  = gst->guest_GPR7;
453    canonical->arg6  = gst->guest_GPR8;
454    canonical->arg7  = 0;
455    canonical->arg8  = 0;
456 
457 #elif defined(VGP_arm_linux)
458    VexGuestARMState* gst = (VexGuestARMState*)gst_vanilla;
459    canonical->sysno = gst->guest_R7;
460    canonical->arg1  = gst->guest_R0;
461    canonical->arg2  = gst->guest_R1;
462    canonical->arg3  = gst->guest_R2;
463    canonical->arg4  = gst->guest_R3;
464    canonical->arg5  = gst->guest_R4;
465    canonical->arg6  = gst->guest_R5;
466    canonical->arg7  = 0;
467    canonical->arg8  = 0;
468 
469 #elif defined(VGP_arm64_linux)
470    VexGuestARM64State* gst = (VexGuestARM64State*)gst_vanilla;
471    canonical->sysno = gst->guest_X8;
472    canonical->arg1  = gst->guest_X0;
473    canonical->arg2  = gst->guest_X1;
474    canonical->arg3  = gst->guest_X2;
475    canonical->arg4  = gst->guest_X3;
476    canonical->arg5  = gst->guest_X4;
477    canonical->arg6  = gst->guest_X5;
478    canonical->arg7  = 0;
479    canonical->arg8  = 0;
480 
481 #elif defined(VGP_mips32_linux)
482    VexGuestMIPS32State* gst = (VexGuestMIPS32State*)gst_vanilla;
483    canonical->sysno = gst->guest_r2;    // v0
484    if (canonical->sysno == __NR_exit) {
485       canonical->arg1 = gst->guest_r4;    // a0
486       canonical->arg2 = 0;
487       canonical->arg3 = 0;
488       canonical->arg4 = 0;
489       canonical->arg5 = 0;
490       canonical->arg6 = 0;
491       canonical->arg8 = 0;
492    } else if (canonical->sysno != __NR_syscall) {
493       canonical->arg1  = gst->guest_r4;    // a0
494       canonical->arg2  = gst->guest_r5;    // a1
495       canonical->arg3  = gst->guest_r6;    // a2
496       canonical->arg4  = gst->guest_r7;    // a3
497       canonical->arg5  = *((UInt*) (gst->guest_r29 + 16));    // 16(guest_SP/sp)
498       canonical->arg6  = *((UInt*) (gst->guest_r29 + 20));    // 20(sp)
499       canonical->arg8 = 0;
500    } else {
501       // Fixme hack handle syscall()
502       canonical->sysno = gst->guest_r4;    // a0
503       canonical->arg1  = gst->guest_r5;    // a1
504       canonical->arg2  = gst->guest_r6;    // a2
505       canonical->arg3  = gst->guest_r7;    // a3
506       canonical->arg4  = *((UInt*) (gst->guest_r29 + 16));    // 16(guest_SP/sp)
507       canonical->arg5  = *((UInt*) (gst->guest_r29 + 20));    // 20(guest_SP/sp)
508       canonical->arg6  = *((UInt*) (gst->guest_r29 + 24));    // 24(guest_SP/sp)
509       canonical->arg8 = __NR_syscall;
510    }
511 
512 #elif defined(VGP_mips64_linux)
513    VexGuestMIPS64State* gst = (VexGuestMIPS64State*)gst_vanilla;
514    canonical->sysno = gst->guest_r2;    // v0
515    canonical->arg1  = gst->guest_r4;    // a0
516    canonical->arg2  = gst->guest_r5;    // a1
517    canonical->arg3  = gst->guest_r6;    // a2
518    canonical->arg4  = gst->guest_r7;    // a3
519    canonical->arg5  = gst->guest_r8;    // a4
520    canonical->arg6  = gst->guest_r9;    // a5
521 
522 #elif defined(VGP_x86_darwin)
523    VexGuestX86State* gst = (VexGuestX86State*)gst_vanilla;
524    UWord *stack = (UWord *)gst->guest_ESP;
525    // GrP fixme hope syscalls aren't called with really shallow stacks...
526    canonical->sysno = gst->guest_EAX;
527    if (canonical->sysno != 0) {
528       // stack[0] is return address
529       canonical->arg1  = stack[1];
530       canonical->arg2  = stack[2];
531       canonical->arg3  = stack[3];
532       canonical->arg4  = stack[4];
533       canonical->arg5  = stack[5];
534       canonical->arg6  = stack[6];
535       canonical->arg7  = stack[7];
536       canonical->arg8  = stack[8];
537    } else {
538       // GrP fixme hack handle syscall()
539       // GrP fixme what about __syscall() ?
540       // stack[0] is return address
541       // DDD: the tool can't see that the params have been shifted!  Can
542       //      lead to incorrect checking, I think, because the PRRAn/PSARn
543       //      macros will mention the pre-shifted args.
544       canonical->sysno = stack[1];
545       vg_assert(canonical->sysno != 0);
546       canonical->arg1  = stack[2];
547       canonical->arg2  = stack[3];
548       canonical->arg3  = stack[4];
549       canonical->arg4  = stack[5];
550       canonical->arg5  = stack[6];
551       canonical->arg6  = stack[7];
552       canonical->arg7  = stack[8];
553       canonical->arg8  = stack[9];
554 
555       PRINT("SYSCALL[%d,?](0) syscall(%s, ...); please stand by...\n",
556             VG_(getpid)(), /*tid,*/
557             VG_SYSNUM_STRING(canonical->sysno));
558    }
559 
560    // Here we determine what kind of syscall it was by looking at the
561    // interrupt kind, and then encode the syscall number using the 64-bit
562    // encoding for Valgrind's internal use.
563    //
564    // DDD: Would it be better to stash the JMP kind into the Darwin
565    // thread state rather than passing in the trc?
566    switch (trc) {
567    case VEX_TRC_JMP_SYS_INT128:
568       // int $0x80 = Unix, 64-bit result
569       vg_assert(canonical->sysno >= 0);
570       canonical->sysno = VG_DARWIN_SYSCALL_CONSTRUCT_UNIX(canonical->sysno);
571       break;
572    case VEX_TRC_JMP_SYS_SYSENTER:
573       // syscall = Unix, 32-bit result
574       // OR        Mach, 32-bit result
575       if (canonical->sysno >= 0) {
576          // GrP fixme hack:  0xffff == I386_SYSCALL_NUMBER_MASK
577          canonical->sysno = VG_DARWIN_SYSCALL_CONSTRUCT_UNIX(canonical->sysno
578                                                              & 0xffff);
579       } else {
580          canonical->sysno = VG_DARWIN_SYSCALL_CONSTRUCT_MACH(-canonical->sysno);
581       }
582       break;
583    case VEX_TRC_JMP_SYS_INT129:
584       // int $0x81 = Mach, 32-bit result
585       vg_assert(canonical->sysno < 0);
586       canonical->sysno = VG_DARWIN_SYSCALL_CONSTRUCT_MACH(-canonical->sysno);
587       break;
588    case VEX_TRC_JMP_SYS_INT130:
589       // int $0x82 = mdep, 32-bit result
590       vg_assert(canonical->sysno >= 0);
591       canonical->sysno = VG_DARWIN_SYSCALL_CONSTRUCT_MDEP(canonical->sysno);
592       break;
593    default:
594       vg_assert(0);
595       break;
596    }
597 
598 #elif defined(VGP_amd64_darwin)
599    VexGuestAMD64State* gst = (VexGuestAMD64State*)gst_vanilla;
600    UWord *stack = (UWord *)gst->guest_RSP;
601 
602    vg_assert(trc == VEX_TRC_JMP_SYS_SYSCALL);
603 
604    // GrP fixme hope syscalls aren't called with really shallow stacks...
605    canonical->sysno = gst->guest_RAX;
606    if (canonical->sysno != __NR_syscall) {
607       // stack[0] is return address
608       canonical->arg1  = gst->guest_RDI;
609       canonical->arg2  = gst->guest_RSI;
610       canonical->arg3  = gst->guest_RDX;
611       canonical->arg4  = gst->guest_R10;  // not rcx with syscall insn
612       canonical->arg5  = gst->guest_R8;
613       canonical->arg6  = gst->guest_R9;
614       canonical->arg7  = stack[1];
615       canonical->arg8  = stack[2];
616    } else {
617       // GrP fixme hack handle syscall()
618       // GrP fixme what about __syscall() ?
619       // stack[0] is return address
620       // DDD: the tool can't see that the params have been shifted!  Can
621       //      lead to incorrect checking, I think, because the PRRAn/PSARn
622       //      macros will mention the pre-shifted args.
623       canonical->sysno = VG_DARWIN_SYSCALL_CONSTRUCT_UNIX(gst->guest_RDI);
624       vg_assert(canonical->sysno != __NR_syscall);
625       canonical->arg1  = gst->guest_RSI;
626       canonical->arg2  = gst->guest_RDX;
627       canonical->arg3  = gst->guest_R10;  // not rcx with syscall insn
628       canonical->arg4  = gst->guest_R8;
629       canonical->arg5  = gst->guest_R9;
630       canonical->arg6  = stack[1];
631       canonical->arg7  = stack[2];
632       canonical->arg8  = stack[3];
633 
634       PRINT("SYSCALL[%d,?](0) syscall(%s, ...); please stand by...\n",
635             VG_(getpid)(), /*tid,*/
636             VG_SYSNUM_STRING(canonical->sysno));
637    }
638 
639    // no canonical->sysno adjustment needed
640 
641 #elif defined(VGP_s390x_linux)
642    VexGuestS390XState* gst = (VexGuestS390XState*)gst_vanilla;
643    canonical->sysno = gst->guest_SYSNO;
644    canonical->arg1  = gst->guest_r2;
645    canonical->arg2  = gst->guest_r3;
646    canonical->arg3  = gst->guest_r4;
647    canonical->arg4  = gst->guest_r5;
648    canonical->arg5  = gst->guest_r6;
649    canonical->arg6  = gst->guest_r7;
650    canonical->arg7  = 0;
651    canonical->arg8  = 0;
652 
653 #elif defined(VGP_tilegx_linux)
654    VexGuestTILEGXState* gst = (VexGuestTILEGXState*)gst_vanilla;
655    canonical->sysno = gst->guest_r10;
656    canonical->arg1  = gst->guest_r0;
657    canonical->arg2  = gst->guest_r1;
658    canonical->arg3  = gst->guest_r2;
659    canonical->arg4  = gst->guest_r3;
660    canonical->arg5  = gst->guest_r4;
661    canonical->arg6  = gst->guest_r5;
662    canonical->arg7  = 0;
663    canonical->arg8  = 0;
664 
665 #else
666 #  error "getSyscallArgsFromGuestState: unknown arch"
667 #endif
668 }
669 
670 static
putSyscallArgsIntoGuestState(SyscallArgs * canonical,VexGuestArchState * gst_vanilla)671 void putSyscallArgsIntoGuestState ( /*IN*/ SyscallArgs*       canonical,
672                                     /*OUT*/VexGuestArchState* gst_vanilla )
673 {
674 #if defined(VGP_x86_linux)
675    VexGuestX86State* gst = (VexGuestX86State*)gst_vanilla;
676    gst->guest_EAX = canonical->sysno;
677    gst->guest_EBX = canonical->arg1;
678    gst->guest_ECX = canonical->arg2;
679    gst->guest_EDX = canonical->arg3;
680    gst->guest_ESI = canonical->arg4;
681    gst->guest_EDI = canonical->arg5;
682    gst->guest_EBP = canonical->arg6;
683 
684 #elif defined(VGP_amd64_linux)
685    VexGuestAMD64State* gst = (VexGuestAMD64State*)gst_vanilla;
686    gst->guest_RAX = canonical->sysno;
687    gst->guest_RDI = canonical->arg1;
688    gst->guest_RSI = canonical->arg2;
689    gst->guest_RDX = canonical->arg3;
690    gst->guest_R10 = canonical->arg4;
691    gst->guest_R8  = canonical->arg5;
692    gst->guest_R9  = canonical->arg6;
693 
694 #elif defined(VGP_ppc32_linux)
695    VexGuestPPC32State* gst = (VexGuestPPC32State*)gst_vanilla;
696    gst->guest_GPR0 = canonical->sysno;
697    gst->guest_GPR3 = canonical->arg1;
698    gst->guest_GPR4 = canonical->arg2;
699    gst->guest_GPR5 = canonical->arg3;
700    gst->guest_GPR6 = canonical->arg4;
701    gst->guest_GPR7 = canonical->arg5;
702    gst->guest_GPR8 = canonical->arg6;
703 
704 #elif defined(VGP_ppc64be_linux) || defined(VGP_ppc64le_linux)
705    VexGuestPPC64State* gst = (VexGuestPPC64State*)gst_vanilla;
706    gst->guest_GPR0 = canonical->sysno;
707    gst->guest_GPR3 = canonical->arg1;
708    gst->guest_GPR4 = canonical->arg2;
709    gst->guest_GPR5 = canonical->arg3;
710    gst->guest_GPR6 = canonical->arg4;
711    gst->guest_GPR7 = canonical->arg5;
712    gst->guest_GPR8 = canonical->arg6;
713 
714 #elif defined(VGP_arm_linux)
715    VexGuestARMState* gst = (VexGuestARMState*)gst_vanilla;
716    gst->guest_R7 = canonical->sysno;
717    gst->guest_R0 = canonical->arg1;
718    gst->guest_R1 = canonical->arg2;
719    gst->guest_R2 = canonical->arg3;
720    gst->guest_R3 = canonical->arg4;
721    gst->guest_R4 = canonical->arg5;
722    gst->guest_R5 = canonical->arg6;
723 
724 #elif defined(VGP_arm64_linux)
725    VexGuestARM64State* gst = (VexGuestARM64State*)gst_vanilla;
726    gst->guest_X8 = canonical->sysno;
727    gst->guest_X0 = canonical->arg1;
728    gst->guest_X1 = canonical->arg2;
729    gst->guest_X2 = canonical->arg3;
730    gst->guest_X3 = canonical->arg4;
731    gst->guest_X4 = canonical->arg5;
732    gst->guest_X5 = canonical->arg6;
733 
734 #elif defined(VGP_x86_darwin)
735    VexGuestX86State* gst = (VexGuestX86State*)gst_vanilla;
736    UWord *stack = (UWord *)gst->guest_ESP;
737 
738    gst->guest_EAX = VG_DARWIN_SYSNO_FOR_KERNEL(canonical->sysno);
739 
740    // GrP fixme? gst->guest_TEMP_EFLAG_C = 0;
741    // stack[0] is return address
742    stack[1] = canonical->arg1;
743    stack[2] = canonical->arg2;
744    stack[3] = canonical->arg3;
745    stack[4] = canonical->arg4;
746    stack[5] = canonical->arg5;
747    stack[6] = canonical->arg6;
748    stack[7] = canonical->arg7;
749    stack[8] = canonical->arg8;
750 
751 #elif defined(VGP_amd64_darwin)
752    VexGuestAMD64State* gst = (VexGuestAMD64State*)gst_vanilla;
753    UWord *stack = (UWord *)gst->guest_RSP;
754 
755    gst->guest_RAX = VG_DARWIN_SYSNO_FOR_KERNEL(canonical->sysno);
756    // GrP fixme? gst->guest_TEMP_EFLAG_C = 0;
757 
758    // stack[0] is return address
759    gst->guest_RDI = canonical->arg1;
760    gst->guest_RSI = canonical->arg2;
761    gst->guest_RDX = canonical->arg3;
762    gst->guest_RCX = canonical->arg4;
763    gst->guest_R8  = canonical->arg5;
764    gst->guest_R9  = canonical->arg6;
765    stack[1]       = canonical->arg7;
766    stack[2]       = canonical->arg8;
767 
768 #elif defined(VGP_s390x_linux)
769    VexGuestS390XState* gst = (VexGuestS390XState*)gst_vanilla;
770    gst->guest_SYSNO  = canonical->sysno;
771    gst->guest_r2     = canonical->arg1;
772    gst->guest_r3     = canonical->arg2;
773    gst->guest_r4     = canonical->arg3;
774    gst->guest_r5     = canonical->arg4;
775    gst->guest_r6     = canonical->arg5;
776    gst->guest_r7     = canonical->arg6;
777 
778 #elif defined(VGP_mips32_linux)
779    VexGuestMIPS32State* gst = (VexGuestMIPS32State*)gst_vanilla;
780    if (canonical->arg8 != __NR_syscall) {
781       gst->guest_r2 = canonical->sysno;
782       gst->guest_r4 = canonical->arg1;
783       gst->guest_r5 = canonical->arg2;
784       gst->guest_r6 = canonical->arg3;
785       gst->guest_r7 = canonical->arg4;
786       *((UInt*) (gst->guest_r29 + 16)) = canonical->arg5; // 16(guest_GPR29/sp)
787       *((UInt*) (gst->guest_r29 + 20)) = canonical->arg6; // 20(sp)
788    } else {
789       canonical->arg8 = 0;
790       gst->guest_r2 = __NR_syscall;
791       gst->guest_r4 = canonical->sysno;
792       gst->guest_r5 = canonical->arg1;
793       gst->guest_r6 = canonical->arg2;
794       gst->guest_r7 = canonical->arg3;
795       *((UInt*) (gst->guest_r29 + 16)) = canonical->arg4; // 16(guest_GPR29/sp)
796       *((UInt*) (gst->guest_r29 + 20)) = canonical->arg5; // 20(sp)
797       *((UInt*) (gst->guest_r29 + 24)) = canonical->arg6; // 24(sp)
798    }
799 
800 #elif defined(VGP_mips64_linux)
801    VexGuestMIPS64State* gst = (VexGuestMIPS64State*)gst_vanilla;
802    gst->guest_r2 = canonical->sysno;
803    gst->guest_r4 = canonical->arg1;
804    gst->guest_r5 = canonical->arg2;
805    gst->guest_r6 = canonical->arg3;
806    gst->guest_r7 = canonical->arg4;
807    gst->guest_r8 = canonical->arg5;
808    gst->guest_r9 = canonical->arg6;
809 
810 #elif defined(VGP_tilegx_linux)
811    VexGuestTILEGXState* gst = (VexGuestTILEGXState*)gst_vanilla;
812    gst->guest_r10 = canonical->sysno;
813    gst->guest_r0 = canonical->arg1;
814    gst->guest_r1 = canonical->arg2;
815    gst->guest_r2 = canonical->arg3;
816    gst->guest_r3 = canonical->arg4;
817    gst->guest_r4 = canonical->arg5;
818    gst->guest_r5 = canonical->arg6;
819 
820 #else
821 #  error "putSyscallArgsIntoGuestState: unknown arch"
822 #endif
823 }
824 
/* Read a just-completed syscall's result out of the guest register
   state and convert it, per the platform's return convention, into
   canonical SyscallStatus form.  Every branch marks the status
   SsComplete. */
static
void getSyscallStatusFromGuestState ( /*OUT*/SyscallStatus*     canonical,
                                      /*IN*/ VexGuestArchState* gst_vanilla )
{
#  if defined(VGP_x86_linux)
   VexGuestX86State* gst = (VexGuestX86State*)gst_vanilla;
   canonical->sres = VG_(mk_SysRes_x86_linux)( gst->guest_EAX );
   canonical->what = SsComplete;

#  elif defined(VGP_amd64_linux)
   VexGuestAMD64State* gst = (VexGuestAMD64State*)gst_vanilla;
   canonical->sres = VG_(mk_SysRes_amd64_linux)( gst->guest_RAX );
   canonical->what = SsComplete;

#  elif defined(VGP_ppc32_linux)
   VexGuestPPC32State* gst   = (VexGuestPPC32State*)gst_vanilla;
   UInt                cr    = LibVEX_GuestPPC32_get_CR( gst );
   /* CR0.SO (bit 28 of the assembled CR) signals failure; see the
      matching "set CR0.SO" logic in putSyscallStatusIntoGuestState. */
   UInt                cr0so = (cr >> 28) & 1;
   canonical->sres = VG_(mk_SysRes_ppc32_linux)( gst->guest_GPR3, cr0so );
   canonical->what = SsComplete;

#  elif defined(VGP_ppc64be_linux) || defined(VGP_ppc64le_linux)
   VexGuestPPC64State* gst   = (VexGuestPPC64State*)gst_vanilla;
   UInt                cr    = LibVEX_GuestPPC64_get_CR( gst );
   /* CR0.SO signals failure, as on ppc32. */
   UInt                cr0so = (cr >> 28) & 1;
   canonical->sres = VG_(mk_SysRes_ppc64_linux)( gst->guest_GPR3, cr0so );
   canonical->what = SsComplete;

#  elif defined(VGP_arm_linux)
   VexGuestARMState* gst = (VexGuestARMState*)gst_vanilla;
   canonical->sres = VG_(mk_SysRes_arm_linux)( gst->guest_R0 );
   canonical->what = SsComplete;

#  elif defined(VGP_arm64_linux)
   VexGuestARM64State* gst = (VexGuestARM64State*)gst_vanilla;
   canonical->sres = VG_(mk_SysRes_arm64_linux)( gst->guest_X0 );
   canonical->what = SsComplete;

#  elif defined(VGP_mips32_linux)
   VexGuestMIPS32State* gst = (VexGuestMIPS32State*)gst_vanilla;
   UInt                v0 = gst->guest_r2;    // v0
   UInt                v1 = gst->guest_r3;    // v1
   UInt                a3 = gst->guest_r7;    // a3
   canonical->sres = VG_(mk_SysRes_mips32_linux)( v0, v1, a3 );
   canonical->what = SsComplete;

#  elif defined(VGP_mips64_linux)
   VexGuestMIPS64State* gst = (VexGuestMIPS64State*)gst_vanilla;
   ULong                v0 = gst->guest_r2;    // v0
   ULong                v1 = gst->guest_r3;    // v1
   ULong                a3 = gst->guest_r7;    // a3
   canonical->sres = VG_(mk_SysRes_mips64_linux)(v0, v1, a3);
   canonical->what = SsComplete;

#  elif defined(VGP_x86_darwin)
   /* duplicates logic in m_signals.VG_UCONTEXT_SYSCALL_SYSRES */
   VexGuestX86State* gst = (VexGuestX86State*)gst_vanilla;
   UInt carry = 1 & LibVEX_GuestX86_get_eflags(gst);
   UInt err = 0;
   UInt wLO = 0;
   UInt wHI = 0;
   /* The result encoding depends on which syscall class was entered;
      guest_SC_CLASS records it. */
   switch (gst->guest_SC_CLASS) {
      case VG_DARWIN_SYSCALL_CLASS_UNIX:
         // int $0x80 = Unix, 64-bit result
         err = carry;
         wLO = gst->guest_EAX;
         wHI = gst->guest_EDX;
         break;
      case VG_DARWIN_SYSCALL_CLASS_MACH:
         // int $0x81 = Mach, 32-bit result
         wLO = gst->guest_EAX;
         break;
      case VG_DARWIN_SYSCALL_CLASS_MDEP:
         // int $0x82 = mdep, 32-bit result
         wLO = gst->guest_EAX;
         break;
      default:
         vg_assert(0);
         break;
   }
   canonical->sres = VG_(mk_SysRes_x86_darwin)(
                        gst->guest_SC_CLASS, err ? True : False,
                        wHI, wLO
                     );
   canonical->what = SsComplete;

#  elif defined(VGP_amd64_darwin)
   /* duplicates logic in m_signals.VG_UCONTEXT_SYSCALL_SYSRES */
   VexGuestAMD64State* gst = (VexGuestAMD64State*)gst_vanilla;
   ULong carry = 1 & LibVEX_GuestAMD64_get_rflags(gst);
   ULong err = 0;
   ULong wLO = 0;
   ULong wHI = 0;
   /* As for x86-darwin: decode according to the recorded syscall class. */
   switch (gst->guest_SC_CLASS) {
      case VG_DARWIN_SYSCALL_CLASS_UNIX:
         // syscall = Unix, 128-bit result
         err = carry;
         wLO = gst->guest_RAX;
         wHI = gst->guest_RDX;
         break;
      case VG_DARWIN_SYSCALL_CLASS_MACH:
         // syscall = Mach, 64-bit result
         wLO = gst->guest_RAX;
         break;
      case VG_DARWIN_SYSCALL_CLASS_MDEP:
         // syscall = mdep, 64-bit result
         wLO = gst->guest_RAX;
         break;
      default:
         vg_assert(0);
         break;
   }
   canonical->sres = VG_(mk_SysRes_amd64_darwin)(
                        gst->guest_SC_CLASS, err ? True : False,
                        wHI, wLO
                     );
   canonical->what = SsComplete;

#  elif defined(VGP_s390x_linux)
   VexGuestS390XState* gst   = (VexGuestS390XState*)gst_vanilla;
   canonical->sres = VG_(mk_SysRes_s390x_linux)( gst->guest_r2 );
   canonical->what = SsComplete;

#  elif defined(VGP_tilegx_linux)
   VexGuestTILEGXState* gst = (VexGuestTILEGXState*)gst_vanilla;
   canonical->sres = VG_(mk_SysRes_tilegx_linux)( gst->guest_r0 );
   canonical->what = SsComplete;

#  else
#    error "getSyscallStatusFromGuestState: unknown arch"
#  endif
}
957 
/* Write a canonical, completed SyscallStatus back into the guest
   register state, following each platform's convention for where the
   kernel leaves results and error indications.  Also informs the tool
   (via post_reg_write events) which guest registers were modified, so
   that definedness tracking stays accurate. */
static
void putSyscallStatusIntoGuestState ( /*IN*/ ThreadId tid,
                                      /*IN*/ SyscallStatus*     canonical,
                                      /*OUT*/VexGuestArchState* gst_vanilla )
{
#  if defined(VGP_x86_linux)
   VexGuestX86State* gst = (VexGuestX86State*)gst_vanilla;
   vg_assert(canonical->what == SsComplete);
   if (sr_isError(canonical->sres)) {
      /* This isn't exactly right, in that really a Failure with res
         not in the range 1 .. 4095 is unrepresentable in the
         Linux-x86 scheme.  Oh well. */
      gst->guest_EAX = - (Int)sr_Err(canonical->sres);
   } else {
      gst->guest_EAX = sr_Res(canonical->sres);
   }
   VG_TRACK( post_reg_write, Vg_CoreSysCall, tid,
             OFFSET_x86_EAX, sizeof(UWord) );

#  elif defined(VGP_amd64_linux)
   VexGuestAMD64State* gst = (VexGuestAMD64State*)gst_vanilla;
   vg_assert(canonical->what == SsComplete);
   if (sr_isError(canonical->sres)) {
      /* This isn't exactly right, in that really a Failure with res
         not in the range 1 .. 4095 is unrepresentable in the
         Linux-amd64 scheme.  Oh well. */
      gst->guest_RAX = - (Long)sr_Err(canonical->sres);
   } else {
      gst->guest_RAX = sr_Res(canonical->sres);
   }
   VG_TRACK( post_reg_write, Vg_CoreSysCall, tid,
             OFFSET_amd64_RAX, sizeof(UWord) );

#  elif defined(VGP_ppc32_linux)
   VexGuestPPC32State* gst = (VexGuestPPC32State*)gst_vanilla;
   UInt old_cr = LibVEX_GuestPPC32_get_CR(gst);
   vg_assert(canonical->what == SsComplete);
   if (sr_isError(canonical->sres)) {
      /* set CR0.SO */
      LibVEX_GuestPPC32_put_CR( old_cr | (1<<28), gst );
      gst->guest_GPR3 = sr_Err(canonical->sres);
   } else {
      /* clear CR0.SO */
      LibVEX_GuestPPC32_put_CR( old_cr & ~(1<<28), gst );
      gst->guest_GPR3 = sr_Res(canonical->sres);
   }
   VG_TRACK( post_reg_write, Vg_CoreSysCall, tid,
             OFFSET_ppc32_GPR3, sizeof(UWord) );
   VG_TRACK( post_reg_write, Vg_CoreSysCall, tid,
             OFFSET_ppc32_CR0_0, sizeof(UChar) );

#  elif defined(VGP_ppc64be_linux) || defined(VGP_ppc64le_linux)
   VexGuestPPC64State* gst = (VexGuestPPC64State*)gst_vanilla;
   UInt old_cr = LibVEX_GuestPPC64_get_CR(gst);
   vg_assert(canonical->what == SsComplete);
   if (sr_isError(canonical->sres)) {
      /* set CR0.SO */
      LibVEX_GuestPPC64_put_CR( old_cr | (1<<28), gst );
      gst->guest_GPR3 = sr_Err(canonical->sres);
   } else {
      /* clear CR0.SO */
      LibVEX_GuestPPC64_put_CR( old_cr & ~(1<<28), gst );
      gst->guest_GPR3 = sr_Res(canonical->sres);
   }
   VG_TRACK( post_reg_write, Vg_CoreSysCall, tid,
             OFFSET_ppc64_GPR3, sizeof(UWord) );
   VG_TRACK( post_reg_write, Vg_CoreSysCall, tid,
             OFFSET_ppc64_CR0_0, sizeof(UChar) );

#  elif defined(VGP_arm_linux)
   VexGuestARMState* gst = (VexGuestARMState*)gst_vanilla;
   vg_assert(canonical->what == SsComplete);
   if (sr_isError(canonical->sres)) {
      /* This isn't exactly right, in that really a Failure with res
         not in the range 1 .. 4095 is unrepresentable in the
         Linux-arm scheme.  Oh well. */
      gst->guest_R0 = - (Int)sr_Err(canonical->sres);
   } else {
      gst->guest_R0 = sr_Res(canonical->sres);
   }
   VG_TRACK( post_reg_write, Vg_CoreSysCall, tid,
             OFFSET_arm_R0, sizeof(UWord) );

#  elif defined(VGP_arm64_linux)
   VexGuestARM64State* gst = (VexGuestARM64State*)gst_vanilla;
   vg_assert(canonical->what == SsComplete);
   if (sr_isError(canonical->sres)) {
      /* This isn't exactly right, in that really a Failure with res
         not in the range 1 .. 4095 is unrepresentable in the
         Linux-arm64 scheme.  Oh well. */
      gst->guest_X0 = - (Long)sr_Err(canonical->sres);
   } else {
      gst->guest_X0 = sr_Res(canonical->sres);
   }
   VG_TRACK( post_reg_write, Vg_CoreSysCall, tid,
             OFFSET_arm64_X0, sizeof(UWord) );

#elif defined(VGP_x86_darwin)
   VexGuestX86State* gst = (VexGuestX86State*)gst_vanilla;
   SysRes sres = canonical->sres;
   vg_assert(canonical->what == SsComplete);
   /* Unfortunately here we have to break abstraction and look
      directly inside 'res', in order to decide what to do. */
   switch (sres._mode) {
      case SysRes_MACH: // int $0x81 = Mach, 32-bit result
      case SysRes_MDEP: // int $0x82 = mdep, 32-bit result
         gst->guest_EAX = sres._wLO;
         VG_TRACK( post_reg_write, Vg_CoreSysCall, tid,
                   OFFSET_x86_EAX, sizeof(UInt) );
         break;
      case SysRes_UNIX_OK:  // int $0x80 = Unix, 64-bit result
      case SysRes_UNIX_ERR: // int $0x80 = Unix, 64-bit error
         gst->guest_EAX = sres._wLO;
         VG_TRACK( post_reg_write, Vg_CoreSysCall, tid,
                   OFFSET_x86_EAX, sizeof(UInt) );
         gst->guest_EDX = sres._wHI;
         VG_TRACK( post_reg_write, Vg_CoreSysCall, tid,
                   OFFSET_x86_EDX, sizeof(UInt) );
         /* Unix-class results also report success/failure in the
            carry flag. */
         LibVEX_GuestX86_put_eflag_c( sres._mode==SysRes_UNIX_ERR ? 1 : 0,
                                      gst );
         // GrP fixme sets defined for entire eflags, not just bit c
         // DDD: this breaks exp-ptrcheck.
         VG_TRACK( post_reg_write, Vg_CoreSysCall, tid,
                   offsetof(VexGuestX86State, guest_CC_DEP1), sizeof(UInt) );
         break;
      default:
         vg_assert(0);
         break;
   }

#elif defined(VGP_amd64_darwin)
   VexGuestAMD64State* gst = (VexGuestAMD64State*)gst_vanilla;
   SysRes sres = canonical->sres;
   vg_assert(canonical->what == SsComplete);
   /* Unfortunately here we have to break abstraction and look
      directly inside 'res', in order to decide what to do. */
   switch (sres._mode) {
      case SysRes_MACH: // syscall = Mach, 64-bit result
      case SysRes_MDEP: // syscall = mdep, 64-bit result
         gst->guest_RAX = sres._wLO;
         VG_TRACK( post_reg_write, Vg_CoreSysCall, tid,
                   OFFSET_amd64_RAX, sizeof(ULong) );
         break;
      case SysRes_UNIX_OK:  // syscall = Unix, 128-bit result
      case SysRes_UNIX_ERR: // syscall = Unix, 128-bit error
         gst->guest_RAX = sres._wLO;
         VG_TRACK( post_reg_write, Vg_CoreSysCall, tid,
                   OFFSET_amd64_RAX, sizeof(ULong) );
         gst->guest_RDX = sres._wHI;
         VG_TRACK( post_reg_write, Vg_CoreSysCall, tid,
                   OFFSET_amd64_RDX, sizeof(ULong) );
         /* Unix-class results also report success/failure in the
            carry flag. */
         LibVEX_GuestAMD64_put_rflag_c( sres._mode==SysRes_UNIX_ERR ? 1 : 0,
                                        gst );
         // GrP fixme sets defined for entire rflags, not just bit c
         // DDD: this breaks exp-ptrcheck.
         VG_TRACK( post_reg_write, Vg_CoreSysCall, tid,
                   offsetof(VexGuestAMD64State, guest_CC_DEP1), sizeof(ULong) );
         break;
      default:
         vg_assert(0);
         break;
   }

#  elif defined(VGP_s390x_linux)
   VexGuestS390XState* gst = (VexGuestS390XState*)gst_vanilla;
   vg_assert(canonical->what == SsComplete);
   if (sr_isError(canonical->sres)) {
      gst->guest_r2 = - (Long)sr_Err(canonical->sres);
   } else {
      gst->guest_r2 = sr_Res(canonical->sres);
   }
   /* NOTE(review): unlike the other ports above, no post_reg_write
      event is emitted for r2 here -- confirm this is intentional. */

#  elif defined(VGP_mips32_linux)
   VexGuestMIPS32State* gst = (VexGuestMIPS32State*)gst_vanilla;
   vg_assert(canonical->what == SsComplete);
   if (sr_isError(canonical->sres)) {
      gst->guest_r2 = (Int)sr_Err(canonical->sres);
      gst->guest_r7 = (Int)sr_Err(canonical->sres);
   } else {
      gst->guest_r2 = sr_Res(canonical->sres);
      gst->guest_r3 = sr_ResEx(canonical->sres);
      gst->guest_r7 = (Int)sr_Err(canonical->sres);
   }
   VG_TRACK( post_reg_write, Vg_CoreSysCall, tid,
             OFFSET_mips32_r2, sizeof(UWord) );
   VG_TRACK( post_reg_write, Vg_CoreSysCall, tid,
             OFFSET_mips32_r3, sizeof(UWord) );
   VG_TRACK( post_reg_write, Vg_CoreSysCall, tid,
             OFFSET_mips32_r7, sizeof(UWord) );

#  elif defined(VGP_mips64_linux)
   VexGuestMIPS64State* gst = (VexGuestMIPS64State*)gst_vanilla;
   vg_assert(canonical->what == SsComplete);
   if (sr_isError(canonical->sres)) {
      gst->guest_r2 = (Int)sr_Err(canonical->sres);
      gst->guest_r7 = (Int)sr_Err(canonical->sres);
   } else {
      gst->guest_r2 = sr_Res(canonical->sres);
      gst->guest_r3 = sr_ResEx(canonical->sres);
      gst->guest_r7 = (Int)sr_Err(canonical->sres);
   }
   VG_TRACK( post_reg_write, Vg_CoreSysCall, tid,
             OFFSET_mips64_r2, sizeof(UWord) );
   VG_TRACK( post_reg_write, Vg_CoreSysCall, tid,
             OFFSET_mips64_r3, sizeof(UWord) );
   VG_TRACK( post_reg_write, Vg_CoreSysCall, tid,
             OFFSET_mips64_r7, sizeof(UWord) );

#  elif defined(VGP_tilegx_linux)
   VexGuestTILEGXState* gst = (VexGuestTILEGXState*)gst_vanilla;
   vg_assert(canonical->what == SsComplete);
   if (sr_isError(canonical->sres)) {
      gst->guest_r0 = - (Long)sr_Err(canonical->sres);
      // r1 hold errno
      gst->guest_r1 = (Long)sr_Err(canonical->sres);
   } else {
      gst->guest_r0 = sr_Res(canonical->sres);
      gst->guest_r1 = 0;
   }
   /* NOTE(review): as with s390x, no post_reg_write events are emitted
      for r0/r1 here -- confirm this is intentional. */

#  else
#    error "putSyscallStatusIntoGuestState: unknown arch"
#  endif
}
1182 
1183 
1184 /* Tell me the offsets in the guest state of the syscall params, so
1185    that the scalar argument checkers don't have to have this info
1186    hardwired. */
1187 
/* Fill in 'layout' with, for each of the syscall number and the eight
   possible arguments, either o_* (a byte offset into the guest
   register state) or s_* (a byte offset relative to the guest stack
   pointer -- see the matching writes in putSyscallArgsIntoGuestState).
   Unused slots are given the impossible value -1. */
static
void getSyscallArgLayout ( /*OUT*/SyscallArgLayout* layout )
{
   VG_(bzero_inline)(layout, sizeof(*layout));

#if defined(VGP_x86_linux)
   layout->o_sysno  = OFFSET_x86_EAX;
   layout->o_arg1   = OFFSET_x86_EBX;
   layout->o_arg2   = OFFSET_x86_ECX;
   layout->o_arg3   = OFFSET_x86_EDX;
   layout->o_arg4   = OFFSET_x86_ESI;
   layout->o_arg5   = OFFSET_x86_EDI;
   layout->o_arg6   = OFFSET_x86_EBP;
   layout->uu_arg7  = -1; /* impossible value */
   layout->uu_arg8  = -1; /* impossible value */

#elif defined(VGP_amd64_linux)
   layout->o_sysno  = OFFSET_amd64_RAX;
   layout->o_arg1   = OFFSET_amd64_RDI;
   layout->o_arg2   = OFFSET_amd64_RSI;
   layout->o_arg3   = OFFSET_amd64_RDX;
   layout->o_arg4   = OFFSET_amd64_R10;
   layout->o_arg5   = OFFSET_amd64_R8;
   layout->o_arg6   = OFFSET_amd64_R9;
   layout->uu_arg7  = -1; /* impossible value */
   layout->uu_arg8  = -1; /* impossible value */

#elif defined(VGP_ppc32_linux)
   layout->o_sysno  = OFFSET_ppc32_GPR0;
   layout->o_arg1   = OFFSET_ppc32_GPR3;
   layout->o_arg2   = OFFSET_ppc32_GPR4;
   layout->o_arg3   = OFFSET_ppc32_GPR5;
   layout->o_arg4   = OFFSET_ppc32_GPR6;
   layout->o_arg5   = OFFSET_ppc32_GPR7;
   layout->o_arg6   = OFFSET_ppc32_GPR8;
   layout->uu_arg7  = -1; /* impossible value */
   layout->uu_arg8  = -1; /* impossible value */

#elif defined(VGP_ppc64be_linux) || defined(VGP_ppc64le_linux)
   layout->o_sysno  = OFFSET_ppc64_GPR0;
   layout->o_arg1   = OFFSET_ppc64_GPR3;
   layout->o_arg2   = OFFSET_ppc64_GPR4;
   layout->o_arg3   = OFFSET_ppc64_GPR5;
   layout->o_arg4   = OFFSET_ppc64_GPR6;
   layout->o_arg5   = OFFSET_ppc64_GPR7;
   layout->o_arg6   = OFFSET_ppc64_GPR8;
   layout->uu_arg7  = -1; /* impossible value */
   layout->uu_arg8  = -1; /* impossible value */

#elif defined(VGP_arm_linux)
   layout->o_sysno  = OFFSET_arm_R7;
   layout->o_arg1   = OFFSET_arm_R0;
   layout->o_arg2   = OFFSET_arm_R1;
   layout->o_arg3   = OFFSET_arm_R2;
   layout->o_arg4   = OFFSET_arm_R3;
   layout->o_arg5   = OFFSET_arm_R4;
   layout->o_arg6   = OFFSET_arm_R5;
   layout->uu_arg7  = -1; /* impossible value */
   layout->uu_arg8  = -1; /* impossible value */

#elif defined(VGP_arm64_linux)
   layout->o_sysno  = OFFSET_arm64_X8;
   layout->o_arg1   = OFFSET_arm64_X0;
   layout->o_arg2   = OFFSET_arm64_X1;
   layout->o_arg3   = OFFSET_arm64_X2;
   layout->o_arg4   = OFFSET_arm64_X3;
   layout->o_arg5   = OFFSET_arm64_X4;
   layout->o_arg6   = OFFSET_arm64_X5;
   layout->uu_arg7  = -1; /* impossible value */
   layout->uu_arg8  = -1; /* impossible value */

#elif defined(VGP_mips32_linux)
   layout->o_sysno  = OFFSET_mips32_r2;
   layout->o_arg1   = OFFSET_mips32_r4;
   layout->o_arg2   = OFFSET_mips32_r5;
   layout->o_arg3   = OFFSET_mips32_r6;
   layout->o_arg4   = OFFSET_mips32_r7;
   /* args 5 and 6 live on the stack, at 16(sp) and 20(sp). */
   layout->s_arg5   = sizeof(UWord) * 4;
   layout->s_arg6   = sizeof(UWord) * 5;
   layout->uu_arg7  = -1; /* impossible value */
   layout->uu_arg8  = -1; /* impossible value */

#elif defined(VGP_mips64_linux)
   layout->o_sysno  = OFFSET_mips64_r2;
   layout->o_arg1   = OFFSET_mips64_r4;
   layout->o_arg2   = OFFSET_mips64_r5;
   layout->o_arg3   = OFFSET_mips64_r6;
   layout->o_arg4   = OFFSET_mips64_r7;
   layout->o_arg5   = OFFSET_mips64_r8;
   layout->o_arg6   = OFFSET_mips64_r9;
   layout->uu_arg7  = -1; /* impossible value */
   layout->uu_arg8  = -1; /* impossible value */

#elif defined(VGP_x86_darwin)
   layout->o_sysno  = OFFSET_x86_EAX;
   // syscall parameters are on stack in C convention
   layout->s_arg1   = sizeof(UWord) * 1;
   layout->s_arg2   = sizeof(UWord) * 2;
   layout->s_arg3   = sizeof(UWord) * 3;
   layout->s_arg4   = sizeof(UWord) * 4;
   layout->s_arg5   = sizeof(UWord) * 5;
   layout->s_arg6   = sizeof(UWord) * 6;
   layout->s_arg7   = sizeof(UWord) * 7;
   layout->s_arg8   = sizeof(UWord) * 8;

#elif defined(VGP_amd64_darwin)
   layout->o_sysno  = OFFSET_amd64_RAX;
   layout->o_arg1   = OFFSET_amd64_RDI;
   layout->o_arg2   = OFFSET_amd64_RSI;
   layout->o_arg3   = OFFSET_amd64_RDX;
   layout->o_arg4   = OFFSET_amd64_RCX;
   layout->o_arg5   = OFFSET_amd64_R8;
   layout->o_arg6   = OFFSET_amd64_R9;
   /* args 7 and 8 overflow onto the stack. */
   layout->s_arg7   = sizeof(UWord) * 1;
   layout->s_arg8   = sizeof(UWord) * 2;

#elif defined(VGP_s390x_linux)
   layout->o_sysno  = OFFSET_s390x_SYSNO;
   layout->o_arg1   = OFFSET_s390x_r2;
   layout->o_arg2   = OFFSET_s390x_r3;
   layout->o_arg3   = OFFSET_s390x_r4;
   layout->o_arg4   = OFFSET_s390x_r5;
   layout->o_arg5   = OFFSET_s390x_r6;
   layout->o_arg6   = OFFSET_s390x_r7;
   layout->uu_arg7  = -1; /* impossible value */
   layout->uu_arg8  = -1; /* impossible value */
#elif defined(VGP_tilegx_linux)
   layout->o_sysno  = OFFSET_tilegx_r(10);
   layout->o_arg1   = OFFSET_tilegx_r(0);
   layout->o_arg2   = OFFSET_tilegx_r(1);
   layout->o_arg3   = OFFSET_tilegx_r(2);
   layout->o_arg4   = OFFSET_tilegx_r(3);
   layout->o_arg5   = OFFSET_tilegx_r(4);
   layout->o_arg6   = OFFSET_tilegx_r(5);
   layout->uu_arg7  = -1; /* impossible value */
   layout->uu_arg8  = -1; /* impossible value */

#else
#  error "getSyscallLayout: unknown arch"
#endif
}
1329 
1330 
1331 /* ---------------------------------------------------------------------
1332    The main driver logic
1333    ------------------------------------------------------------------ */
1334 
1335 /* Finding the handlers for a given syscall, or faking up one
1336    when no handler is found. */
1337 
/* Pre-syscall handler used when no real handler exists for a syscall
   number: warn the user (with a backtrace at verbosity > 1), suggest
   writing a handler, and fail the call with ENOSYS. */
static
void bad_before ( ThreadId              tid,
                  SyscallArgLayout*     layout,
                  /*MOD*/SyscallArgs*   args,
                  /*OUT*/SyscallStatus* status,
                  /*OUT*/UWord*         flags )
{
   VG_(dmsg)("WARNING: unhandled %s syscall: %s\n",
      VG_PLATFORM, VG_SYSNUM_STRING(args->sysno));
   if (VG_(clo_verbosity) > 1) {
      VG_(get_and_pp_StackTrace)(tid, VG_(clo_backtrace_size));
   }
   VG_(dmsg)("You may be able to write your own handler.\n");
   VG_(dmsg)("Read the file README_MISSING_SYSCALL_OR_IOCTL.\n");
   VG_(dmsg)("Nevertheless we consider this a bug.  Please report\n");
   VG_(dmsg)("it at http://valgrind.org/support/bug_reports.html.\n");

   /* Make the syscall appear to have failed with ENOSYS. */
   SET_STATUS_Failure(VKI_ENOSYS);
}
1357 
/* Catch-all table entry for unknown syscalls: bad_before reports the
   problem and fails with ENOSYS; there is no post handler. */
static SyscallTableEntry bad_sys =
   { bad_before, NULL };
1360 
get_syscall_entry(Int syscallno)1361 static const SyscallTableEntry* get_syscall_entry ( Int syscallno )
1362 {
1363    const SyscallTableEntry* sys = NULL;
1364 
1365 #  if defined(VGO_linux)
1366    sys = ML_(get_linux_syscall_entry)( syscallno );
1367 
1368 #  elif defined(VGO_darwin)
1369    Int idx = VG_DARWIN_SYSNO_INDEX(syscallno);
1370 
1371    switch (VG_DARWIN_SYSNO_CLASS(syscallno)) {
1372    case VG_DARWIN_SYSCALL_CLASS_UNIX:
1373       if (idx >= 0 && idx < ML_(syscall_table_size) &&
1374           ML_(syscall_table)[idx].before != NULL)
1375          sys = &ML_(syscall_table)[idx];
1376          break;
1377    case VG_DARWIN_SYSCALL_CLASS_MACH:
1378       if (idx >= 0 && idx < ML_(mach_trap_table_size) &&
1379           ML_(mach_trap_table)[idx].before != NULL)
1380          sys = &ML_(mach_trap_table)[idx];
1381          break;
1382    case VG_DARWIN_SYSCALL_CLASS_MDEP:
1383       if (idx >= 0 && idx < ML_(mdep_trap_table_size) &&
1384           ML_(mdep_trap_table)[idx].before != NULL)
1385          sys = &ML_(mdep_trap_table)[idx];
1386          break;
1387    default:
1388       vg_assert(0);
1389       break;
1390    }
1391 
1392 #  else
1393 #    error Unknown OS
1394 #  endif
1395 
1396    return sys == NULL  ? &bad_sys  : sys;
1397 }
1398 
1399 
1400 /* Add and remove signals from mask so that we end up telling the
1401    kernel the state we actually want rather than what the client
1402    wants. */
static void sanitize_client_sigmask(vki_sigset_t *mask)
{
   /* Signals that must always remain deliverable: SIGKILL and SIGSTOP
      (never blockable), and VG_SIGVGKILL, which Valgrind itself relies
      on -- never block it. */
   static const Int never_blockable[] =
      { VKI_SIGKILL, VKI_SIGSTOP, VG_SIGVGKILL };
   UInt i;
   for (i = 0; i < sizeof(never_blockable)/sizeof(never_blockable[0]); i++)
      VG_(sigdelset)(mask, never_blockable[i]);
}
1409 
/* Per-thread record of a syscall in flight. */
typedef
   struct {
      SyscallArgs   orig_args;  /* args exactly as read from the guest state */
      SyscallArgs   args;       /* working copy; the pre-handler may modify it */
      SyscallStatus status;     /* SsIdle when no syscall is in progress */
      UWord         flags;      /* per-syscall flags set by the handlers */
   }
   SyscallInfo;

/* One record per thread; allocated lazily (VG_N_THREADS entries) by
   ensure_initialised. */
SyscallInfo *syscallInfo;
1420 
1421 /* The scheduler needs to be able to zero out these records after a
1422    fork, hence this is exported from m_syswrap. */
VG_(clear_syscallInfo)1423 void VG_(clear_syscallInfo) ( Int tid )
1424 {
1425    vg_assert(syscallInfo);
1426    vg_assert(tid >= 0 && tid < VG_N_THREADS);
1427    VG_(memset)( & syscallInfo[tid], 0, sizeof( syscallInfo[tid] ));
1428    syscallInfo[tid].status.what = SsIdle;
1429 }
1430 
VG_(is_in_syscall)1431 Bool VG_(is_in_syscall) ( Int tid )
1432 {
1433    vg_assert(tid >= 0 && tid < VG_N_THREADS);
1434    return (syscallInfo[tid].status.what != SsIdle);
1435 }
1436 
ensure_initialised(void)1437 static void ensure_initialised ( void )
1438 {
1439    Int i;
1440    static Bool init_done = False;
1441    if (init_done)
1442       return;
1443    init_done = True;
1444 
1445    syscallInfo = VG_(malloc)("scinfo", VG_N_THREADS * sizeof syscallInfo[0]);
1446 
1447    for (i = 0; i < VG_N_THREADS; i++) {
1448       VG_(clear_syscallInfo)( i );
1449    }
1450 }
1451 
1452 /* --- This is the main function of this file. --- */
1453 
VG_(client_syscall)1454 void VG_(client_syscall) ( ThreadId tid, UInt trc )
1455 {
1456    Word                     sysno;
1457    ThreadState*             tst;
1458    const SyscallTableEntry* ent;
1459    SyscallArgLayout         layout;
1460    SyscallInfo*             sci;
1461 
1462    ensure_initialised();
1463 
1464    vg_assert(VG_(is_valid_tid)(tid));
1465    vg_assert(tid >= 1 && tid < VG_N_THREADS);
1466    vg_assert(VG_(is_running_thread)(tid));
1467 
1468 #  if !defined(VGO_darwin)
1469    // Resync filtering is meaningless on non-Darwin targets.
1470    vg_assert(VG_(clo_resync_filter) == 0);
1471 #  endif
1472 
1473    tst = VG_(get_ThreadState)(tid);
1474 
1475    /* BEGIN ensure root thread's stack is suitably mapped */
1476    /* In some rare circumstances, we may do the syscall without the
1477       bottom page of the stack being mapped, because the stack pointer
1478       was moved down just a few instructions before the syscall
1479       instruction, and there have been no memory references since
1480       then, that would cause a call to VG_(extend_stack) to have
1481       happened.
1482 
1483       In native execution that's OK: the kernel automagically extends
1484       the stack's mapped area down to cover the stack pointer (or sp -
1485       redzone, really).  In simulated normal execution that's OK too,
1486       since any signals we get from accessing below the mapped area of
1487       the (guest's) stack lead us to VG_(extend_stack), where we
1488       simulate the kernel's stack extension logic.  But that leaves
1489       the problem of entering a syscall with the SP unmapped.  Because
1490       the kernel doesn't know that the segment immediately above SP is
1491       supposed to be a grow-down segment, it causes the syscall to
1492       fail, and thereby causes a divergence between native behaviour
1493       (syscall succeeds) and simulated behaviour (syscall fails).
1494 
1495       This is quite a rare failure mode.  It has only been seen
1496       affecting calls to sys_readlink on amd64-linux, and even then it
1497       requires a certain code sequence around the syscall to trigger
1498       it.  Here is one:
1499 
1500       extern int my_readlink ( const char* path );
1501       asm(
1502       ".text\n"
1503       ".globl my_readlink\n"
1504       "my_readlink:\n"
1505       "\tsubq    $0x1008,%rsp\n"
1506       "\tmovq    %rdi,%rdi\n"              // path is in rdi
1507       "\tmovq    %rsp,%rsi\n"              // &buf[0] -> rsi
1508       "\tmovl    $0x1000,%edx\n"           // sizeof(buf) in rdx
1509       "\tmovl    $"__NR_READLINK",%eax\n"  // syscall number
1510       "\tsyscall\n"
1511       "\taddq    $0x1008,%rsp\n"
1512       "\tret\n"
1513       ".previous\n"
1514       );
1515 
1516       For more details, see bug #156404
1517       (https://bugs.kde.org/show_bug.cgi?id=156404).
1518 
1519       The fix is actually very simple.  We simply need to call
1520       VG_(extend_stack) for this thread, handing it the lowest
1521       possible valid address for stack (sp - redzone), to ensure the
1522       pages all the way down to that address, are mapped.  Because
1523       this is a potentially expensive and frequent operation, we
1524       do the following:
1525 
1526       Only the main thread (tid=1) has a growdown stack.  So
1527       ignore all others.  It is conceivable, although highly unlikely,
1528       that the main thread exits, and later another thread is
1529       allocated tid=1, but that's harmless, I believe;
1530       VG_(extend_stack) will do nothing when applied to a non-root
1531       thread.
1532 
1533       All this guff is of course Linux-specific.  Hence the ifdef.
1534    */
1535 #  if defined(VGO_linux)
1536    if (tid == 1/*ROOT THREAD*/) {
1537       Addr     stackMin   = VG_(get_SP)(tid) - VG_STACK_REDZONE_SZB;
1538 
1539       /* The precise thing to do here would be to extend the stack only
1540          if the system call can be proven to access unmapped user stack
1541          memory. That is an enormous amount of work even if a proper
1542          spec of system calls was available.
1543 
1544          In the case where the system call does not access user memory
1545          the stack pointer here can have any value. A legitimate testcase
1546          that exercises this is none/tests/s390x/stmg.c:
1547          The stack pointer happens to be in the reservation segment near
1548          the end of the addressable memory and there is no SkAnonC segment
1549          above.
1550 
1551          So the approximation we're taking here is to extend the stack only
1552          if the client stack pointer does not look bogus. */
1553       if (VG_(am_addr_is_in_extensible_client_stack)(stackMin))
1554          VG_(extend_stack)( tid, stackMin );
1555    }
1556 #  endif
1557    /* END ensure root thread's stack is suitably mapped */
1558 
1559    /* First off, get the syscall args and number.  This is a
1560       platform-dependent action. */
1561 
1562    sci = & syscallInfo[tid];
1563    vg_assert(sci->status.what == SsIdle);
1564 
1565    getSyscallArgsFromGuestState( &sci->orig_args, &tst->arch.vex, trc );
1566 
1567    /* Copy .orig_args to .args.  The pre-handler may modify .args, but
1568       we want to keep the originals too, just in case. */
1569    sci->args = sci->orig_args;
1570 
1571    /* Save the syscall number in the thread state in case the syscall
1572       is interrupted by a signal. */
1573    sysno = sci->orig_args.sysno;
1574 
1575    /* It's sometimes useful, as a crude debugging hack, to get a
1576       stack trace at each (or selected) syscalls. */
1577    if (0 && sysno == __NR_ioctl) {
1578       VG_(umsg)("\nioctl:\n");
1579       VG_(get_and_pp_StackTrace)(tid, 10);
1580       VG_(umsg)("\n");
1581    }
1582 
1583 #  if defined(VGO_darwin)
1584    /* Record syscall class.  But why?  Because the syscall might be
1585       interrupted by a signal, and in the signal handler (which will
1586       be m_signals.async_signalhandler) we will need to build a SysRes
1587       reflecting the syscall return result.  In order to do that we
1588       need to know the syscall class.  Hence stash it in the guest
1589       state of this thread.  This madness is not needed on Linux
1590       because it only has a single syscall return convention and so
1591       there is no ambiguity involved in converting the post-signal
1592       machine state into a SysRes. */
1593    tst->arch.vex.guest_SC_CLASS = VG_DARWIN_SYSNO_CLASS(sysno);
1594 #  endif
1595 
1596    /* The default what-to-do-next thing is hand the syscall to the
1597       kernel, so we pre-set that here.  Set .sres to something
1598       harmless looking (is irrelevant because .what is not
1599       SsComplete.) */
1600    sci->status.what = SsHandToKernel;
1601    sci->status.sres = VG_(mk_SysRes_Error)(0);
1602    sci->flags       = 0;
1603 
1604    /* Fetch the syscall's handlers.  If no handlers exist for this
1605       syscall, we are given dummy handlers which force an immediate
1606       return with ENOSYS. */
1607    ent = get_syscall_entry(sysno);
1608 
1609    /* Fetch the layout information, which tells us where in the guest
1610       state the syscall args reside.  This is a platform-dependent
1611       action.  This info is needed so that the scalar syscall argument
1612       checks (PRE_REG_READ calls) know which bits of the guest state
1613       they need to inspect. */
1614    getSyscallArgLayout( &layout );
1615 
1616    /* Make sure the tmp signal mask matches the real signal mask;
1617       sigsuspend may change this. */
1618    vg_assert(VG_(iseqsigset)(&tst->sig_mask, &tst->tmp_sig_mask));
1619 
1620    /* Right, we're finally ready to Party.  Call the pre-handler and
1621       see what we get back.  At this point:
1622 
1623         sci->status.what  is Unset (we don't know yet).
1624         sci->orig_args    contains the original args.
1625         sci->args         is the same as sci->orig_args.
1626         sci->flags        is zero.
1627    */
1628 
1629    PRINT("SYSCALL[%d,%d](%s) ",
1630       VG_(getpid)(), tid, VG_SYSNUM_STRING(sysno));
1631 
1632    /* Do any pre-syscall actions */
1633    if (VG_(needs).syscall_wrapper) {
1634       UWord tmpv[8];
1635       tmpv[0] = sci->orig_args.arg1;
1636       tmpv[1] = sci->orig_args.arg2;
1637       tmpv[2] = sci->orig_args.arg3;
1638       tmpv[3] = sci->orig_args.arg4;
1639       tmpv[4] = sci->orig_args.arg5;
1640       tmpv[5] = sci->orig_args.arg6;
1641       tmpv[6] = sci->orig_args.arg7;
1642       tmpv[7] = sci->orig_args.arg8;
1643       VG_TDICT_CALL(tool_pre_syscall, tid, sysno,
1644                     &tmpv[0], sizeof(tmpv)/sizeof(tmpv[0]));
1645    }
1646 
1647    vg_assert(ent);
1648    vg_assert(ent->before);
1649    (ent->before)( tid,
1650                   &layout,
1651                   &sci->args, &sci->status, &sci->flags );
1652 
1653    /* The pre-handler may have modified:
1654          sci->args
1655          sci->status
1656          sci->flags
1657       All else remains unchanged.
1658       Although the args may be modified, pre handlers are not allowed
1659       to change the syscall number.
1660    */
1661    /* Now we proceed according to what the pre-handler decided. */
1662    vg_assert(sci->status.what == SsHandToKernel
1663              || sci->status.what == SsComplete);
1664    vg_assert(sci->args.sysno == sci->orig_args.sysno);
1665 
1666    if (sci->status.what == SsComplete && !sr_isError(sci->status.sres)) {
1667       /* The pre-handler completed the syscall itself, declaring
1668          success. */
1669       if (sci->flags & SfNoWriteResult) {
1670          PRINT(" --> [pre-success] NoWriteResult");
1671       } else {
1672          PRINT(" --> [pre-success] %s", VG_(sr_as_string)(sci->status.sres));
1673       }
1674       /* In this case the allowable flags are to ask for a signal-poll
1675          and/or a yield after the call.  Changing the args isn't
1676          allowed. */
1677       vg_assert(0 == (sci->flags
1678                       & ~(SfPollAfter | SfYieldAfter | SfNoWriteResult)));
1679       vg_assert(eq_SyscallArgs(&sci->args, &sci->orig_args));
1680    }
1681 
1682    else
1683    if (sci->status.what == SsComplete && sr_isError(sci->status.sres)) {
1684       /* The pre-handler decided to fail syscall itself. */
1685       PRINT(" --> [pre-fail] %s", VG_(sr_as_string)(sci->status.sres));
1686       /* In this case, the pre-handler is also allowed to ask for the
1687          post-handler to be run anyway.  Changing the args is not
1688          allowed. */
1689       vg_assert(0 == (sci->flags & ~(SfMayBlock | SfPostOnFail | SfPollAfter)));
1690       vg_assert(eq_SyscallArgs(&sci->args, &sci->orig_args));
1691    }
1692 
1693    else
1694    if (sci->status.what != SsHandToKernel) {
1695       /* huh?! */
1696       vg_assert(0);
1697    }
1698 
1699    else /* (sci->status.what == HandToKernel) */ {
1700       /* Ok, this is the usual case -- and the complicated one.  There
1701          are two subcases: sync and async.  async is the general case
1702          and is to be used when there is any possibility that the
1703          syscall might block [a fact that the pre-handler must tell us
1704          via the sci->flags field.]  Because the tidying-away /
1705          context-switch overhead of the async case could be large, if
1706          we are sure that the syscall will not block, we fast-track it
1707          by doing it directly in this thread, which is a lot
1708          simpler. */
1709 
1710       /* Check that the given flags are allowable: MayBlock, PollAfter
1711          and PostOnFail are ok. */
1712       vg_assert(0 == (sci->flags & ~(SfMayBlock | SfPostOnFail | SfPollAfter)));
1713 
1714       if (sci->flags & SfMayBlock) {
1715 
1716          /* Syscall may block, so run it asynchronously */
1717          vki_sigset_t mask;
1718 
1719          PRINT(" --> [async] ... \n");
1720 
1721          mask = tst->sig_mask;
1722          sanitize_client_sigmask(&mask);
1723 
1724          /* Gack.  More impedance matching.  Copy the possibly
1725             modified syscall args back into the guest state. */
1726          /* JRS 2009-Mar-16: if the syscall args are possibly modified,
1727             then this assertion is senseless:
1728               vg_assert(eq_SyscallArgs(&sci->args, &sci->orig_args));
1729             The case that exposed it was sys_posix_spawn on Darwin,
1730             which heavily modifies its arguments but then lets the call
1731             go through anyway, with SfToBlock set, hence we end up here. */
1732          putSyscallArgsIntoGuestState( &sci->args, &tst->arch.vex );
1733 
1734          /* Drop the bigLock */
1735          VG_(release_BigLock)(tid, VgTs_WaitSys, "VG_(client_syscall)[async]");
1736          /* Urr.  We're now in a race against other threads trying to
1737             acquire the bigLock.  I guess that doesn't matter provided
1738             that do_syscall_for_client only touches thread-local
1739             state. */
1740 
1741          /* Do the call, which operates directly on the guest state,
1742             not on our abstracted copies of the args/result. */
1743          do_syscall_for_client(sysno, tst, &mask);
1744 
1745          /* do_syscall_for_client may not return if the syscall was
1746             interrupted by a signal.  In that case, flow of control is
1747             first to m_signals.async_sighandler, which calls
1748             VG_(fixup_guest_state_after_syscall_interrupted), which
1749             fixes up the guest state, and possibly calls
1750             VG_(post_syscall).  Once that's done, control drops back
1751             to the scheduler.  */
1752 
1753          /* Darwin: do_syscall_for_client may not return if the
1754             syscall was workq_ops(WQOPS_THREAD_RETURN) and the kernel
1755             responded by starting the thread at wqthread_hijack(reuse=1)
1756             (to run another workqueue item). In that case, wqthread_hijack
1757             calls ML_(wqthread_continue), which is similar to
1758             VG_(fixup_guest_state_after_syscall_interrupted). */
1759 
1760          /* Reacquire the lock */
1761          VG_(acquire_BigLock)(tid, "VG_(client_syscall)[async]");
1762 
1763          /* Even more impedance matching.  Extract the syscall status
1764             from the guest state. */
1765          getSyscallStatusFromGuestState( &sci->status, &tst->arch.vex );
1766          vg_assert(sci->status.what == SsComplete);
1767 
1768          /* Be decorative, if required. */
1769          if (VG_(clo_trace_syscalls)) {
1770             PRINT("SYSCALL[%d,%d](%s) ... [async] --> %s",
1771                   VG_(getpid)(), tid, VG_SYSNUM_STRING(sysno),
1772                   VG_(sr_as_string)(sci->status.sres));
1773          }
1774 
1775       } else {
1776 
1777          /* run the syscall directly */
1778          /* The pre-handler may have modified the syscall args, but
1779             since we're passing values in ->args directly to the
1780             kernel, there's no point in flushing them back to the
1781             guest state.  Indeed doing so could be construed as
1782             incorrect. */
1783          SysRes sres
1784             = VG_(do_syscall)(sysno, sci->args.arg1, sci->args.arg2,
1785                                      sci->args.arg3, sci->args.arg4,
1786                                      sci->args.arg5, sci->args.arg6,
1787                                      sci->args.arg7, sci->args.arg8 );
1788          sci->status = convert_SysRes_to_SyscallStatus(sres);
1789 
1790          /* Be decorative, if required. */
1791          if (VG_(clo_trace_syscalls)) {
1792            PRINT("[sync] --> %s", VG_(sr_as_string)(sci->status.sres));
1793          }
1794       }
1795    }
1796 
1797    vg_assert(sci->status.what == SsComplete);
1798 
1799    vg_assert(VG_(is_running_thread)(tid));
1800 
1801    /* Dump the syscall result back in the guest state.  This is
1802       a platform-specific action. */
1803    if (!(sci->flags & SfNoWriteResult))
1804       putSyscallStatusIntoGuestState( tid, &sci->status, &tst->arch.vex );
1805 
1806    /* Situation now:
1807       - the guest state is now correctly modified following the syscall
1808       - modified args, original args and syscall status are still
1809         available in the syscallInfo[] entry for this syscall.
1810 
1811       Now go on to do the post-syscall actions (read on down ..)
1812    */
1813    PRINT(" ");
1814    VG_(post_syscall)(tid);
1815    PRINT("\n");
1816 }
1817 
1818 
1819 /* Perform post syscall actions.  The expected state on entry is
1820    precisely as at the end of VG_(client_syscall), that is:
1821 
1822    - guest state up to date following the syscall
1823    - modified args, original args and syscall status are still
1824      available in the syscallInfo[] entry for this syscall.
1825    - syscall status matches what's in the guest state.
1826 
1827    There are two ways to get here: the normal way -- being called by
1828    VG_(client_syscall), and the unusual way, from
1829    VG_(fixup_guest_state_after_syscall_interrupted).
1830    Darwin: there's a third way, ML_(wqthread_continue).
1831 */
VG_(post_syscall)1832 void VG_(post_syscall) (ThreadId tid)
1833 {
1834    SyscallInfo*             sci;
1835    const SyscallTableEntry* ent;
1836    SyscallStatus            test_status;
1837    ThreadState*             tst;
1838    Word sysno;
1839 
1840    /* Preliminaries */
1841    vg_assert(VG_(is_valid_tid)(tid));
1842    vg_assert(tid >= 1 && tid < VG_N_THREADS);
1843    vg_assert(VG_(is_running_thread)(tid));
1844 
1845    tst = VG_(get_ThreadState)(tid);
1846    sci = & syscallInfo[tid];
1847 
1848    /* m_signals.sigvgkill_handler might call here even when not in
1849       a syscall. */
1850    if (sci->status.what == SsIdle || sci->status.what == SsHandToKernel) {
1851       sci->status.what = SsIdle;
1852       return;
1853    }
1854 
1855    /* Validate current syscallInfo entry.  In particular we require
1856       that the current .status matches what's actually in the guest
1857       state.  At least in the normal case where we have actually
1858       previously written the result into the guest state. */
1859    vg_assert(sci->status.what == SsComplete);
1860 
1861    getSyscallStatusFromGuestState( &test_status, &tst->arch.vex );
1862    if (!(sci->flags & SfNoWriteResult))
1863       vg_assert(eq_SyscallStatus( &sci->status, &test_status ));
1864    /* Failure of the above assertion on Darwin can indicate a problem
1865       in the syscall wrappers that pre-fail or pre-succeed the
1866       syscall, by calling SET_STATUS_Success or SET_STATUS_Failure,
1867       when they really should call SET_STATUS_from_SysRes.  The former
1868       create a UNIX-class syscall result on Darwin, which may not be
1869       correct for the syscall; if that's the case then this assertion
1870       fires.  See PRE(thread_fast_set_cthread_self) for an example.  On
1871       non-Darwin platforms this assertion is should never fail, and this
1872       comment is completely irrelevant. */
1873    /* Ok, looks sane */
1874 
1875    /* Get the system call number.  Because the pre-handler isn't
1876       allowed to mess with it, it should be the same for both the
1877       original and potentially-modified args. */
1878    vg_assert(sci->args.sysno == sci->orig_args.sysno);
1879    sysno = sci->args.sysno;
1880    ent = get_syscall_entry(sysno);
1881 
1882    /* pre: status == Complete (asserted above) */
1883    /* Consider either success or failure.  Now run the post handler if:
1884       - it exists, and
1885       - Success or (Failure and PostOnFail is set)
1886    */
1887    if (ent->after
1888        && ((!sr_isError(sci->status.sres))
1889            || (sr_isError(sci->status.sres)
1890                && (sci->flags & SfPostOnFail) ))) {
1891 
1892       (ent->after)( tid, &sci->args, &sci->status );
1893    }
1894 
1895    /* Because the post handler might have changed the status (eg, the
1896       post-handler for sys_open can change the result from success to
1897       failure if the kernel supplied a fd that it doesn't like), once
1898       again dump the syscall result back in the guest state.*/
1899    if (!(sci->flags & SfNoWriteResult))
1900       putSyscallStatusIntoGuestState( tid, &sci->status, &tst->arch.vex );
1901 
1902    /* Do any post-syscall actions required by the tool. */
1903    if (VG_(needs).syscall_wrapper) {
1904       UWord tmpv[8];
1905       tmpv[0] = sci->orig_args.arg1;
1906       tmpv[1] = sci->orig_args.arg2;
1907       tmpv[2] = sci->orig_args.arg3;
1908       tmpv[3] = sci->orig_args.arg4;
1909       tmpv[4] = sci->orig_args.arg5;
1910       tmpv[5] = sci->orig_args.arg6;
1911       tmpv[6] = sci->orig_args.arg7;
1912       tmpv[7] = sci->orig_args.arg8;
1913       VG_TDICT_CALL(tool_post_syscall, tid,
1914                     sysno,
1915                     &tmpv[0], sizeof(tmpv)/sizeof(tmpv[0]),
1916                     sci->status.sres);
1917    }
1918 
1919    /* The syscall is done. */
1920    vg_assert(sci->status.what == SsComplete);
1921    sci->status.what = SsIdle;
1922 
1923    /* The pre/post wrappers may have concluded that pending signals
1924       might have been created, and will have set SfPollAfter to
1925       request a poll for them once the syscall is done. */
1926    if (sci->flags & SfPollAfter)
1927       VG_(poll_signals)(tid);
1928 
1929    /* Similarly, the wrappers might have asked for a yield
1930       afterwards. */
1931    if (sci->flags & SfYieldAfter)
1932       VG_(vg_yield)();
1933 }
1934 
1935 
1936 /* ---------------------------------------------------------------------
1937    Dealing with syscalls which get interrupted by a signal:
1938    VG_(fixup_guest_state_after_syscall_interrupted)
1939    ------------------------------------------------------------------ */
1940 
1941 /* Syscalls done on behalf of the client are finally handed off to the
1942    kernel in VG_(client_syscall) above, either by calling
1943    do_syscall_for_client (the async case), or by calling
1944    VG_(do_syscall6) (the sync case).
1945 
1946    If the syscall is not interrupted by a signal (it may block and
1947    later unblock, but that's irrelevant here) then those functions
1948    eventually return and so control is passed to VG_(post_syscall).
1949    NB: not sure if the sync case can actually get interrupted, as it
1950    operates with all signals masked.
1951 
1952    However, the syscall may get interrupted by an async-signal.  In
1953    that case do_syscall_for_client/VG_(do_syscall6) do not
1954    return.  Instead we wind up in m_signals.async_sighandler.  We need
1955    to fix up the guest state to make it look like the syscall was
1956    interrupted for guest.  So async_sighandler calls here, and this
1957    does the fixup.  Note that from here we wind up calling
1958    VG_(post_syscall) too.
1959 */
1960 
1961 
1962 /* These are addresses within ML_(do_syscall_for_client_WRK).  See
1963    syscall-$PLAT.S for details.
1964 */
#if defined(VGO_linux)
  /* Code markers within ML_(do_syscall_for_client_WRK), in ascending
     address order.  They delimit the phases of the syscall sequence:
       [setup, restart)      signals being unblocked; nothing happened yet
       [restart]             the syscall instruction itself
       [complete, committed) syscall done, result not yet in guest state
       [committed, finished) result saved; signals being re-blocked
     VG_(fixup_guest_state_after_syscall_interrupted) compares the
     interrupted IP against these to decide what to do. */
  extern const Addr ML_(blksys_setup);
  extern const Addr ML_(blksys_restart);
  extern const Addr ML_(blksys_complete);
  extern const Addr ML_(blksys_committed);
  extern const Addr ML_(blksys_finished);
#elif defined(VGO_darwin)
  /* Darwin requires extra ugliness: one marker set per syscall class
     (MACH, MDEP, UNIX). */
  extern const Addr ML_(blksys_setup_MACH);
  extern const Addr ML_(blksys_restart_MACH);
  extern const Addr ML_(blksys_complete_MACH);
  extern const Addr ML_(blksys_committed_MACH);
  extern const Addr ML_(blksys_finished_MACH);
  extern const Addr ML_(blksys_setup_MDEP);
  extern const Addr ML_(blksys_restart_MDEP);
  extern const Addr ML_(blksys_complete_MDEP);
  extern const Addr ML_(blksys_committed_MDEP);
  extern const Addr ML_(blksys_finished_MDEP);
  extern const Addr ML_(blksys_setup_UNIX);
  extern const Addr ML_(blksys_restart_UNIX);
  extern const Addr ML_(blksys_complete_UNIX);
  extern const Addr ML_(blksys_committed_UNIX);
  extern const Addr ML_(blksys_finished_UNIX);
#else
# error "Unknown OS"
#endif
1991 
1992 
1993 /* Back up guest state to restart a system call. */
1994 
/* Wind the guest program counter back over the syscall instruction so
   that, when the thread next runs, the syscall is re-executed.  Each
   platform branch then verifies that the bytes at the adjusted PC
   really are that platform's syscall encoding, and asserts if not --
   "restarting" over some other instruction would be a disaster. */
void ML_(fixup_guest_state_to_restart_syscall) ( ThreadArchState* arch )
{
#if defined(VGP_x86_linux)
   arch->vex.guest_EIP -= 2;             // sizeof(int $0x80)

   /* Make sure our caller is actually sane, and we're really backing
      back over a syscall.

      int $0x80 == CD 80
   */
   {
      UChar *p = (UChar *)arch->vex.guest_EIP;

      if (p[0] != 0xcd || p[1] != 0x80)
         VG_(message)(Vg_DebugMsg,
                      "?! restarting over syscall at %#x %02x %02x\n",
                      arch->vex.guest_EIP, p[0], p[1]);

      vg_assert(p[0] == 0xcd && p[1] == 0x80);
   }

#elif defined(VGP_amd64_linux)
   arch->vex.guest_RIP -= 2;             // sizeof(syscall)

   /* Make sure our caller is actually sane, and we're really backing
      back over a syscall.

      syscall == 0F 05
   */
   {
      UChar *p = (UChar *)arch->vex.guest_RIP;

      if (p[0] != 0x0F || p[1] != 0x05)
         VG_(message)(Vg_DebugMsg,
                      "?! restarting over syscall at %#llx %02x %02x\n",
                      arch->vex.guest_RIP, p[0], p[1]);

      vg_assert(p[0] == 0x0F && p[1] == 0x05);
   }

#elif defined(VGP_ppc32_linux) || defined(VGP_ppc64be_linux)
   arch->vex.guest_CIA -= 4;             // sizeof(ppc32 instr)

   /* Make sure our caller is actually sane, and we're really backing
      back over a syscall.

      sc == 44 00 00 02
   */
   {
      UChar *p = (UChar *)arch->vex.guest_CIA;

      if (p[0] != 0x44 || p[1] != 0x0 || p[2] != 0x0 || p[3] != 0x02)
         VG_(message)(Vg_DebugMsg,
                      "?! restarting over syscall at %#llx %02x %02x %02x %02x\n",
                      arch->vex.guest_CIA + 0ULL, p[0], p[1], p[2], p[3]);

      vg_assert(p[0] == 0x44 && p[1] == 0x0 && p[2] == 0x0 && p[3] == 0x2);
   }

#elif defined(VGP_ppc64le_linux)
   arch->vex.guest_CIA -= 4;             // sizeof(ppc instr)

   /* Make sure our caller is actually sane, and we're really backing
      back over a syscall.

      sc == 44 00 00 02
      (byte order reversed below, since this target is little-endian)
   */
   {
      UChar *p = (UChar *)arch->vex.guest_CIA;

      if (p[3] != 0x44 || p[2] != 0x0 || p[1] != 0x0 || p[0] != 0x02)
         VG_(message)(Vg_DebugMsg,
                      "?! restarting over syscall at %#llx %02x %02x %02x %02x\n",
                      arch->vex.guest_CIA + 0ULL, p[3], p[2], p[1], p[0]);

      vg_assert(p[3] == 0x44 && p[2] == 0x0 && p[1] == 0x0 && p[0] == 0x2);
   }

#elif defined(VGP_arm_linux)
   if (arch->vex.guest_R15T & 1) {
      // Thumb mode.  SVC is encoded as
      //   1101 1111 imm8
      // where imm8 is the SVC number, and we only accept 0.
      arch->vex.guest_R15T -= 2;   // sizeof(thumb 16 bit insn)
      // Subtract 1 to strip the Thumb bit before dereferencing.
      UChar* p     = (UChar*)(arch->vex.guest_R15T - 1);
      Bool   valid = p[0] == 0 && p[1] == 0xDF;
      if (!valid) {
         VG_(message)(Vg_DebugMsg,
                      "?! restarting over (Thumb) syscall that is not syscall "
                      "at %#llx %02x %02x\n",
                      arch->vex.guest_R15T - 1ULL, p[0], p[1]);
      }
      vg_assert(valid);
      // FIXME: NOTE, this really isn't right.  We need to back up
      // ITSTATE to what it was before the SVC instruction, but we
      // don't know what it was.  At least assert that it is now
      // zero, because if it is nonzero then it must also have
      // been nonzero for the SVC itself, which means it was
      // conditional.  Urk.
      vg_assert(arch->vex.guest_ITSTATE == 0);
   } else {
      // ARM mode.  SVC is encoded as
      //   cond 1111 imm24
      // where imm24 is the SVC number, and we only accept 0.
      arch->vex.guest_R15T -= 4;   // sizeof(arm instr)
      UChar* p     = (UChar*)arch->vex.guest_R15T;
      Bool   valid = p[0] == 0 && p[1] == 0 && p[2] == 0
                     && (p[3] & 0xF) == 0xF;
      if (!valid) {
         VG_(message)(Vg_DebugMsg,
                      "?! restarting over (ARM) syscall that is not syscall "
                      "at %#llx %02x %02x %02x %02x\n",
                      arch->vex.guest_R15T + 0ULL, p[0], p[1], p[2], p[3]);
      }
      vg_assert(valid);
   }

#elif defined(VGP_arm64_linux)
   arch->vex.guest_PC -= 4;             // sizeof(arm64 instr)

   /* Make sure our caller is actually sane, and we're really backing
      back over a syscall.

      svc #0 == d4 00 00 01
      (bytes below are in little-endian memory order)
   */
   {
      UChar *p = (UChar *)arch->vex.guest_PC;

      if (p[0] != 0x01 || p[1] != 0x00 || p[2] != 0x00 || p[3] != 0xD4)
         VG_(message)(
            Vg_DebugMsg,
            "?! restarting over syscall at %#llx %02x %02x %02x %02x\n",
            arch->vex.guest_PC + 0ULL, p[0], p[1], p[2], p[3]
          );

      vg_assert(p[0] == 0x01 && p[1] == 0x00 && p[2] == 0x00 && p[3] == 0xD4);
   }

#elif defined(VGP_x86_darwin)
   /* On Darwin we restore the IP saved at syscall entry rather than
      subtracting a fixed instruction length, since several entry
      sequences (int $0x80/81/82, sysenter) are possible. */
   arch->vex.guest_EIP = arch->vex.guest_IP_AT_SYSCALL;

   /* Make sure our caller is actually sane, and we're really backing
      back over a syscall.

      int $0x80 == CD 80
      int $0x81 == CD 81
      int $0x82 == CD 82
      sysenter  == 0F 34
   */
   {
       UChar *p = (UChar *)arch->vex.guest_EIP;
       Bool  ok = (p[0] == 0xCD && p[1] == 0x80)
                  || (p[0] == 0xCD && p[1] == 0x81)
                  || (p[0] == 0xCD && p[1] == 0x82)
                  || (p[0] == 0x0F && p[1] == 0x34);
       if (!ok)
           VG_(message)(Vg_DebugMsg,
                        "?! restarting over syscall at %#x %02x %02x\n",
                        arch->vex.guest_EIP, p[0], p[1]);
       vg_assert(ok);
   }

#elif defined(VGP_amd64_darwin)
   // DDD: #warning GrP fixme amd64 restart unimplemented
   vg_assert(0);

#elif defined(VGP_s390x_linux)
   arch->vex.guest_IA -= 2;             // sizeof(syscall)

   /* Make sure our caller is actually sane, and we're really backing
      back over a syscall.

      syscall == 0A <num>
      (only the opcode byte is checked; the second byte is the
      syscall number and may be anything)
   */
   {
      UChar *p = (UChar *)arch->vex.guest_IA;
      if (p[0] != 0x0A)
         VG_(message)(Vg_DebugMsg,
                      "?! restarting over syscall at %#llx %02x %02x\n",
                      arch->vex.guest_IA, p[0], p[1]);

      vg_assert(p[0] == 0x0A);
   }

#elif defined(VGP_mips32_linux) || defined(VGP_mips64_linux)

   arch->vex.guest_PC -= 4;             // sizeof(mips instr)

   /* Make sure our caller is actually sane, and we're really backing
      back over a syscall.

      little endian
      syscall == 0C 00 00 00
      big endian
      syscall == 00 00 00 0C
   */
   {
      UChar *p = (UChar *)(arch->vex.guest_PC);
#     if defined (VG_LITTLEENDIAN)
      if (p[0] != 0x0c || p[1] != 0x00 || p[2] != 0x00 || p[3] != 0x00)
         VG_(message)(Vg_DebugMsg,
                      "?! restarting over syscall at %#llx %02x %02x %02x %02x\n",
                      (ULong)arch->vex.guest_PC, p[0], p[1], p[2], p[3]);

      vg_assert(p[0] == 0x0c && p[1] == 0x00 && p[2] == 0x00 && p[3] == 0x00);
#     elif defined (VG_BIGENDIAN)
      if (p[0] != 0x00 || p[1] != 0x00 || p[2] != 0x00 || p[3] != 0x0c)
         VG_(message)(Vg_DebugMsg,
                      "?! restarting over syscall at %#llx %02x %02x %02x %02x\n",
                      (ULong)arch->vex.guest_PC, p[0], p[1], p[2], p[3]);

      vg_assert(p[0] == 0x00 && p[1] == 0x00 && p[2] == 0x00 && p[3] == 0x0c);
#     else
#        error "Unknown endianness"
#     endif
   }
#elif defined(VGP_tilegx_linux)
   arch->vex.guest_pc -= 8;             // sizeof({ swint1 })

   /* Make sure our caller is actually sane, and we're really backing
      back over a syscall. no other instruction in same bundle.
   */
   {
      unsigned long *p = (unsigned long *)arch->vex.guest_pc;

      if (p[0] != 0x286b180051485000ULL )  // "swint1", little endian only
         VG_(message)(Vg_DebugMsg,
                      "?! restarting over syscall at 0x%lx %lx\n",
                      arch->vex.guest_pc, p[0]);
      vg_assert(p[0] == 0x286b180051485000ULL);
   }

#else
#  error "ML_(fixup_guest_state_to_restart_syscall): unknown plat"
#endif
}
2230 
2231 
2232 /*
2233    Fix up the guest state when a syscall is interrupted by a signal
2234    and so has been forced to return 'sysret'.
2235 
2236    To do this, we determine the precise state of the syscall by
2237    looking at the (real) IP at the time the signal happened.  The
2238    syscall sequence looks like:
2239 
2240      1. unblock signals
2241      2. perform syscall
2242      3. save result to guest state (EAX, RAX, R3+CR0.SO, R0, V0)
2243      4. re-block signals
2244 
2245    If a signal
2246    happens at      Then     Why?
2247    [1-2)           restart  nothing has happened (restart syscall)
2248    [2]             restart  syscall hasn't started, or kernel wants to restart
2249    [2-3)           save     syscall complete, but results not saved
   [3-4)           --       syscall complete, results saved
2251 
2252    Sometimes we never want to restart an interrupted syscall (because
2253    sigaction says not to), so we only restart if "restart" is True.
2254 
2255    This will also call VG_(post_syscall) if the syscall has actually
2256    completed (either because it was interrupted, or because it
2257    actually finished).  It will not call VG_(post_syscall) if the
2258    syscall is set up for restart, which means that the pre-wrapper may
2259    get called multiple times.
2260 */
2261 
void
VG_(fixup_guest_state_after_syscall_interrupted)( ThreadId tid,
                                                  Addr     ip,
                                                  SysRes   sres,
                                                  Bool     restart)
{
   /* Note that we don't know the syscall number here, since (1) in
      general there's no reliable way to get hold of it short of
      stashing it in the guest state before the syscall, and (2) in
      any case we don't need to know it for the actions done by this
      routine.

      Furthermore, 'sres' is only used in the case where the syscall
      is complete, but the result has not been committed to the guest
      state yet.  In any other situation it will be meaningless and
      therefore ignored. */

   ThreadState*     tst;
   SyscallStatus    canonical;
   ThreadArchState* th_regs;
   SyscallInfo*     sci;

   /* Compute some Booleans indicating which range we're in. */
   Bool outside_range,
        in_setup_to_restart,      // [1,2) in the .S files
        at_restart,               // [2]   in the .S files
        in_complete_to_committed, // [3,4) in the .S files
        in_committed_to_finished; // [4,5) in the .S files

#  if defined(VGO_linux)
   /* Linux has a single do-syscall sequence per arch, delimited by
      the ML_(blksys_*) labels exported from the arch's .S file. */
   outside_range
      = ip < ML_(blksys_setup) || ip >= ML_(blksys_finished);
   in_setup_to_restart
      = ip >= ML_(blksys_setup) && ip < ML_(blksys_restart);
   at_restart
      = ip == ML_(blksys_restart);
   in_complete_to_committed
      = ip >= ML_(blksys_complete) && ip < ML_(blksys_committed);
   in_committed_to_finished
      = ip >= ML_(blksys_committed) && ip < ML_(blksys_finished);
#  elif defined(VGO_darwin)
   /* Darwin has three separate do-syscall sequences (MACH, MDEP and
      UNIX classes), so each predicate must consider all three sets
      of labels. */
   outside_range
      =  (ip < ML_(blksys_setup_MACH) || ip >= ML_(blksys_finished_MACH))
      && (ip < ML_(blksys_setup_MDEP) || ip >= ML_(blksys_finished_MDEP))
      && (ip < ML_(blksys_setup_UNIX) || ip >= ML_(blksys_finished_UNIX));
   in_setup_to_restart
      =  (ip >= ML_(blksys_setup_MACH) && ip < ML_(blksys_restart_MACH))
      || (ip >= ML_(blksys_setup_MDEP) && ip < ML_(blksys_restart_MDEP))
      || (ip >= ML_(blksys_setup_UNIX) && ip < ML_(blksys_restart_UNIX));
   at_restart
      =  (ip == ML_(blksys_restart_MACH))
      || (ip == ML_(blksys_restart_MDEP))
      || (ip == ML_(blksys_restart_UNIX));
   in_complete_to_committed
      =  (ip >= ML_(blksys_complete_MACH) && ip < ML_(blksys_committed_MACH))
      || (ip >= ML_(blksys_complete_MDEP) && ip < ML_(blksys_committed_MDEP))
      || (ip >= ML_(blksys_complete_UNIX) && ip < ML_(blksys_committed_UNIX));
   in_committed_to_finished
      =  (ip >= ML_(blksys_committed_MACH) && ip < ML_(blksys_finished_MACH))
      || (ip >= ML_(blksys_committed_MDEP) && ip < ML_(blksys_finished_MDEP))
      || (ip >= ML_(blksys_committed_UNIX) && ip < ML_(blksys_finished_UNIX));
   /* Wasn't that just So Much Fun?  Does your head hurt yet?  Mine does. */
#  else
#    error "Unknown OS"
#  endif

   if (VG_(clo_trace_signals))
      VG_(message)( Vg_DebugMsg,
                    "interrupted_syscall: tid=%d, ip=0x%llx, "
                    "restart=%s, sres.isErr=%s, sres.val=%lld\n",
                    (Int)tid,
                    (ULong)ip,
                    restart ? "True" : "False",
                    sr_isError(sres) ? "True" : "False",
                    (Long)(sr_isError(sres) ? sr_Err(sres) : sr_Res(sres)) );

   vg_assert(VG_(is_valid_tid)(tid));
   vg_assert(tid >= 1 && tid < VG_N_THREADS);
   vg_assert(VG_(is_running_thread)(tid));

   tst     = VG_(get_ThreadState)(tid);
   th_regs = &tst->arch;
   sci     = & syscallInfo[tid];

   /* Figure out what the state of the syscall was by examining the
      (real) IP at the time of the signal, and act accordingly. */
   if (outside_range) {
      if (VG_(clo_trace_signals))
         VG_(message)( Vg_DebugMsg,
                       "  not in syscall at all: hmm, very suspicious\n" );
      /* Looks like we weren't in a syscall at all.  Hmm. */
      vg_assert(sci->status.what != SsIdle);
      return;
   }

   /* We should not be here unless this thread had first started up
      the machinery for a syscall by calling VG_(client_syscall).
      Hence: */
   vg_assert(sci->status.what != SsIdle);

   /* now, do one of four fixup actions, depending on where the IP has
      got to. */

   if (in_setup_to_restart) {
      /* syscall hasn't even started; go around again */
      if (VG_(clo_trace_signals))
         VG_(message)( Vg_DebugMsg, "  not started: restarting\n");
      vg_assert(sci->status.what == SsHandToKernel);
      ML_(fixup_guest_state_to_restart_syscall)(th_regs);
   }

   else
   if (at_restart) {
      /* We're either about to run the syscall, or it was interrupted
         and the kernel restarted it.  Restart if asked, otherwise
         EINTR it. */
      if (restart) {
         if (VG_(clo_trace_signals))
            VG_(message)( Vg_DebugMsg, "  at syscall instr: restarting\n");
         ML_(fixup_guest_state_to_restart_syscall)(th_regs);
      } else {
         if (VG_(clo_trace_signals))
            VG_(message)( Vg_DebugMsg, "  at syscall instr: returning EINTR\n");
         canonical = convert_SysRes_to_SyscallStatus(
                        VG_(mk_SysRes_Error)( VKI_EINTR )
                     );
         /* SfNoWriteResult: a wrapper asked us not to commit the
            result to the guest registers, so honour that here too. */
         if (!(sci->flags & SfNoWriteResult))
            putSyscallStatusIntoGuestState( tid, &canonical, &th_regs->vex );
         sci->status = canonical;
         /* The syscall is regarded as complete (with EINTR), so run
            the post-wrapper. */
         VG_(post_syscall)(tid);
      }
   }

   else
   if (in_complete_to_committed) {
      /* Syscall complete, but result hasn't been written back yet.
         Write the SysRes we were supplied with back to the guest
         state. */
      if (VG_(clo_trace_signals))
         VG_(message)( Vg_DebugMsg,
                       "  completed, but uncommitted: committing\n");
      canonical = convert_SysRes_to_SyscallStatus( sres );
      if (!(sci->flags & SfNoWriteResult))
         putSyscallStatusIntoGuestState( tid, &canonical, &th_regs->vex );
      sci->status = canonical;
      VG_(post_syscall)(tid);
   }

   else
   if (in_committed_to_finished) {
      /* Result committed, but the signal mask has not been restored;
         we expect our caller (the signal handler) will have fixed
         this up. */
      if (VG_(clo_trace_signals))
         VG_(message)( Vg_DebugMsg,
                       "  completed and committed: nothing to do\n");
      /* The result is already in the guest state, so read it back
         from there to get the canonical status for the post-wrapper. */
      getSyscallStatusFromGuestState( &sci->status, &th_regs->vex );
      vg_assert(sci->status.what == SsComplete);
      VG_(post_syscall)(tid);
   }

   else
      VG_(core_panic)("?? strange syscall interrupt state?");

   /* In all cases, the syscall is now finished (even if we called
      ML_(fixup_guest_state_to_restart_syscall), since that just
      re-positions the guest's IP for another go at it).  So we need
      to record that fact. */
   sci->status.what = SsIdle;
}
2432 
2433 
2434 #if defined(VGO_darwin)
2435 // Clean up after workq_ops(WQOPS_THREAD_RETURN) jumped to wqthread_hijack.
2436 // This is similar to VG_(fixup_guest_state_after_syscall_interrupted).
2437 // This longjmps back to the scheduler.
ML_(wqthread_continue_NORETURN)2438 void ML_(wqthread_continue_NORETURN)(ThreadId tid)
2439 {
2440    ThreadState*     tst;
2441    SyscallInfo*     sci;
2442 
2443    VG_(acquire_BigLock)(tid, "wqthread_continue_NORETURN");
2444 
2445    PRINT("SYSCALL[%d,%d](%s) workq_ops() starting new workqueue item\n",
2446          VG_(getpid)(), tid, VG_SYSNUM_STRING(__NR_workq_ops));
2447 
2448    vg_assert(VG_(is_valid_tid)(tid));
2449    vg_assert(tid >= 1 && tid < VG_N_THREADS);
2450    vg_assert(VG_(is_running_thread)(tid));
2451 
2452    tst     = VG_(get_ThreadState)(tid);
2453    sci     = & syscallInfo[tid];
2454    vg_assert(sci->status.what != SsIdle);
2455    vg_assert(tst->os_state.wq_jmpbuf_valid);  // check this BEFORE post_syscall
2456 
2457    // Pretend the syscall completed normally, but don't touch the thread state.
2458    sci->status = convert_SysRes_to_SyscallStatus( VG_(mk_SysRes_Success)(0) );
2459    sci->flags |= SfNoWriteResult;
2460    VG_(post_syscall)(tid);
2461 
2462    ML_(sync_mappings)("in", "ML_(wqthread_continue_NORETURN)", 0);
2463 
2464    sci->status.what = SsIdle;
2465 
2466    vg_assert(tst->sched_jmpbuf_valid);
2467    VG_MINIMAL_LONGJMP(tst->sched_jmpbuf);
2468 
2469    /* NOTREACHED */
2470    vg_assert(0);
2471 }
2472 #endif
2473 
2474 
2475 /* ---------------------------------------------------------------------
2476    A place to store the where-to-call-when-really-done pointer
2477    ------------------------------------------------------------------ */
2478 
// When the final thread is done, where shall I call to shutdown the
// system cleanly?  Is set once at startup (in m_main) and never
// changes after that.  Is basically a pointer to the exit
// continuation.  This is all just a nasty hack to avoid calling
// directly from m_syswrap to m_main at exit, since that would cause
// m_main to become part of a module cycle, which is silly.
// Remains NULL until m_main installs it; callers here are expected
// to invoke it only after startup has completed.
void (* VG_(address_of_m_main_shutdown_actions_NORETURN) )
       (ThreadId,VgSchedReturnCode)
   = NULL;
2488 
2489 /*--------------------------------------------------------------------*/
2490 /*--- end                                                          ---*/
2491 /*--------------------------------------------------------------------*/
2492