/* **********************************************************
* Copyright (c) 2011-2022 Google, Inc. All rights reserved.
* Copyright (c) 2001-2010 VMware, Inc. All rights reserved.
* ********************************************************** */
/*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the name of VMware, Inc. nor the names of its contributors may be
* used to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL VMWARE, INC. OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
* DAMAGE.
*/
/* Copyright (c) 2003-2007 Determina Corp. */
/* Copyright (c) 2001-2003 Massachusetts Institute of Technology */
/* Copyright (c) 2001 Hewlett-Packard Company */
/*
* x86.asm - x86 specific assembly and trampoline code
*
* This file is used for both linux and windows.
* We used to use the gnu assembler on both platforms, but
* gas does not support 64-bit windows.
* Thus we now use masm on windows and gas with the new intel-syntax-specifying
* options so that our code here only needs a minimum of macros to
* work on both.
*
* Note that for gas on cygwin we used to need to prepend _ to global
* symbols: we don't need that for linux gas or masm so we don't do it anymore.
*/
/* We handle different registers and calling conventions with a CPP pass.
* It can be difficult to choose registers that work across all ABI's we're
* trying to support: we need to move each ARG into a register in case
* it is passed in memory, but we have to pick registers that don't already
* hold other arguments. Typically, use this order:
* REG_XAX, REG_XBX, REG_XDI, REG_XSI, REG_XDX, REG_XCX
* The suggested order for 3 parameters is:
* REG_XAX = ARG1, REG_XCX = ARG3, REG_XDX = ARG2
* The suggested order for 2 parameters is:
* REG_XAX = ARG2, REG_XDX = ARG1
* Note that REG_XBX is by convention used on linux for PIC base: if we want
* to try and avoid relocations (case 7852) we should avoid using it
* to avoid confusion (though we can always pick a different register,
* even varying by function).
* FIXME: should we use virtual registers instead?
* FIXME: should we have ARG1_IN_REG macro that is either nop or load from stack?
* For now not bothering, but if we add more routines we'll want more support.
* Naturally the ARG* macros are only valid at function entry.
*/
#include "../asm_defines.asm"
START_FILE
#ifdef UNIX
# include "os_asm_defines.asm"
# ifdef LINUX
# include "include/syscall.h"
# else
# include "include/syscall_mach.h"
# include <sys/syscall.h>
# endif
#endif
/* Load/store one pointer-sized dcontext_t field: reg holds the base pointer,
* offs is the field offset, and dest/src is the other operand of the mov.
*/
#define RESTORE_FROM_DCONTEXT_VIA_REG(reg,offs,dest) mov dest, PTRSZ [offs + reg]
#define SAVE_TO_DCONTEXT_VIA_REG(reg,offs,src) mov PTRSZ [offs + reg], src
/* For the few remaining dcontext_t offsets we need here: */
#if defined(WINDOWS) && !defined(X64)
# define UPCXT_BEFORE_INLINE_SLOTS 4 /* at_syscall + padding */
#else
# define UPCXT_BEFORE_INLINE_SLOTS 8 /* IF_UNIX(errno +) at_syscall + padding */
#endif
/* Count the slots for client clean call inlining. */
/* Add CLEANCALL_NUM_INLINE_SLOTS(5) * ARG_SZ for these slots. No padding. */
# define UPCXT_EXTRA (UPCXT_BEFORE_INLINE_SLOTS + 5 * ARG_SZ)
/* Offsets used with SEG_TLS to reach the thread's stack top and stack base
* fields.
* XXX: duplicated in os_exports.h
*/
#ifdef X64
# define TOP_STACK_TIB_OFFSET 8
# define BASE_STACK_TIB_OFFSET 16
#else
# define TOP_STACK_TIB_OFFSET 4
# define BASE_STACK_TIB_OFFSET 8
#endif
/* Upper bound is all we need.
* Note that the expansion is not parenthesized, so take care when using it
* inside a larger expression.
*/
#define DYNAMORIO_STACK_SIZE_UPPER_BOUND 128*1024
/* Should we generate all of our asm code instead of having it static?
* As it is we're duplicating insert_push_all_registers(), dr_insert_call(), etc.,
* but it's not that much code here in these macros, and this is simpler
* than emit_utils.c-style code.
*/
#include "x86_asm_defines.asm" /* PUSHGPR, POPGPR, etc. */
/* Pushes a priv_mcontext_t on the stack, with an xsp value equal to the
* xsp before the pushing. Clobbers xax!
* Does fill in xmm0-5, if necessary, for PR 264138.
* Assumes that DR has been initialized (get_simd_vals() checks proc feature bits).
* Caller should ensure 16-byte stack alignment prior to the push (PR 306421).
*
* Sequence:
*   1) xsp is first adjusted by PUSH_PRIV_MCXT_PRE_PC_SHIFT, so a "pc" operand
*      that is addressed relative to xsp is evaluated against the shifted xsp
*      (callers in this file compensate by subtracting the shift).
*   2) pc, the flags (PUSHF) and the general-purpose registers (PUSHGPR) are
*      pushed.
*   3) get_simd_vals() is called with a pointer to the struct now at the top
*      of the stack.
*   4) PRIV_MCXT_SIZE + xsp, i.e. the address just above the struct, is
*      written into the slot at PUSHGPR_XSP_OFFS as the saved xsp value.
*/
#define PUSH_PRIV_MCXT(pc) \
lea REG_XSP, [REG_XSP + PUSH_PRIV_MCXT_PRE_PC_SHIFT] @N@\
push pc @N@\
PUSHF @N@\
PUSHGPR @N@\
lea REG_XAX, [REG_XSP] @N@\
CALLC1(GLOBAL_REF(get_simd_vals), REG_XAX) @N@\
lea REG_XAX, [PRIV_MCXT_SIZE + REG_XSP] @N@\
mov [PUSHGPR_XSP_OFFS + REG_XSP], REG_XAX
/* Pops the GPRs and flags from a priv_mcontext off the stack. Does not
* restore xmm/ymm regs.
* This is the inverse of the pushes done by PUSH_PRIV_MCXT: after POPGPR and
* POPF, the final lea steps xsp past the pc slot (ARG_SZ) and undoes the
* PUSH_PRIV_MCXT_PRE_PC_SHIFT adjustment. lea is used so the just-restored
* flags are not modified.
*/
#define POP_PRIV_MCXT_GPRS() \
POPGPR @N@\
POPF @N@\
lea REG_XSP, [REG_XSP - PUSH_PRIV_MCXT_PRE_PC_SHIFT + ARG_SZ/*pc*/]
/****************************************************************************/
/****************************************************************************/
DECL_EXTERN(unexpected_return)
DECL_EXTERN(get_own_context_integer_control)
DECL_EXTERN(get_simd_vals)
DECL_EXTERN(auto_setup)
DECL_EXTERN(return_from_native)
DECL_EXTERN(native_module_callout)
DECL_EXTERN(d_r_dispatch)
#ifdef DR_APP_EXPORTS
DECL_EXTERN(dr_app_start_helper)
#endif
DECL_EXTERN(dynamo_process_exit)
DECL_EXTERN(dynamo_thread_exit)
DECL_EXTERN(dynamo_thread_stack_free_and_exit)
DECL_EXTERN(dynamorio_app_take_over_helper)
DECL_EXTERN(found_modified_code)
DECL_EXTERN(get_cleanup_and_terminate_global_do_syscall_entry)
#ifdef INTERNAL
DECL_EXTERN(d_r_internal_error)
#endif
DECL_EXTERN(internal_exception_info)
DECL_EXTERN(is_currently_on_dstack)
DECL_EXTERN(nt_continue_setup)
#if defined(UNIX)
DECL_EXTERN(main_signal_handler_C)
#endif
#ifdef MACOS
DECL_EXTERN(new_bsdthread_setup)
#endif
DECL_EXTERN(hashlookup_null_target)
#if defined(UNIX) && !defined(HAVE_SIGALTSTACK)
DECL_EXTERN(sig_should_swap_stack)
DECL_EXTERN(fixup_rtframe_pointers)
# define CLONE_AND_SWAP_STRUCT_SIZE 2*ARG_SZ
#endif
#ifdef UNIX
DECL_EXTERN(dr_setjmp_sigmask)
DECL_EXTERN(privload_early_inject)
DECL_EXTERN(relocate_dynamorio)
DECL_EXTERN(dynamorio_dl_fixup)
#endif
#ifdef WINDOWS
DECL_EXTERN(dynamorio_earliest_init_takeover_C)
DECL_EXTERN(os_terminate_wow64_stack)
#endif
/* non-functions: these make us non-PIC! (PR 212290) */
DECL_EXTERN(exiting_thread_count)
DECL_EXTERN(d_r_initstack)
DECL_EXTERN(initstack_mutex)
DECL_EXTERN(int_syscall_address)
DECL_EXTERN(syscalls)
DECL_EXTERN(sysenter_ret_address)
DECL_EXTERN(sysenter_tls_offset)
#ifdef WINDOWS
DECL_EXTERN(wow64_index)
# ifdef X64
DECL_EXTERN(syscall_argsz)
# endif
DECL_EXTERN(load_dynamo_failure)
#endif
#ifdef WINDOWS
/* dynamo_auto_start: used for non-early follow children.
* Assumptions: The saved priv_mcontext_t for the start of the app is on
* the stack, followed by a pointer to a region of memory to free
* (which can be NULL) and its size. This routine is reached by a jmp
* so be aware of that for address calculation. This routine does
* not return.
*
* On win32, note that in order to export this from the dynamo dll, which is
* required for non early follow children, we have to explicitly tell the
* linker to do so. This is done in the Makefile.
* Note that if it weren't for wanting local go-native code we would have
* auto_setup in x86_code.c be dynamo_auto_start.
*
* Control flow: passes the current top-of-stack address to auto_setup(); if
* that call ever returns, control is transferred to load_dynamo_failure.
*/
DECLARE_FUNC(dynamo_auto_start)
GLOBAL_LABEL(dynamo_auto_start:)
/* we pass a pointer to TOS as a parameter.
* a param in xsp won't work w/ win64 padding so put in xax */
mov REG_XAX, REG_XSP
CALLC1(GLOBAL_REF(auto_setup), REG_XAX)
/* if auto_setup returns, we need to go native */
jmp load_dynamo_failure
END_FUNC(dynamo_auto_start)
#endif
#ifdef UNIX
/* get_pic_xdi: returns in xdi the return address of the call that invoked it.
*
* We avoid performance problems with messing up the RSB by using
* a separate routine. The caller needs to use a plain call
* with _GLOBAL_OFFSET_TABLE_ on the exact return address instruction.
*
* Clobbers only xdi; the stack is left untouched.
*/
DECLARE_FUNC(get_pic_xdi)
GLOBAL_LABEL(get_pic_xdi:)
/* [xsp] holds our return address, which is the caller's PIC anchor */
mov REG_XDI, PTRSZ [REG_XSP]
ret
END_FUNC(get_pic_xdi)
#endif
/* void call_switch_stack(void *func_arg, // 1*ARG_SZ+XAX
* byte *stack, // 2*ARG_SZ+XAX
* void (*func)(void *arg), // 3*ARG_SZ+XAX
* void *mutex_to_free, // 4*ARG_SZ+XAX
* bool return_on_return) // 5*ARG_SZ+XAX
*
* Switches xsp to 'stack' and calls func(func_arg) there. If mutex_to_free
* is non-NULL, a 32-bit zero is stored to it just before the call.
* If func returns and return_on_return is zero, control jumps to
* unexpected_return; otherwise the original stack and callee-saved registers
* are restored and we return to our caller.
*
* Register roles:
*   xax     - base for uniform arg access (arg N at N*ARG_SZ + xax)
*   xbx     - copy of that base, preserved across the call to func
*   r12/xdi - original xsp (after our two pushes), preserved across the call
*   xdx/xcx - func / func_arg
*/
DECLARE_FUNC(call_switch_stack)
GLOBAL_LABEL(call_switch_stack:)
/* get all args with same offset(xax) regardless of platform */
#ifdef X64
# ifdef WINDOWS
mov REG_XAX, REG_XSP
/* stack alignment doesn't really matter (b/c we're swapping) but in case
* we add a call we keep this here
*/
lea REG_XSP, [-ARG_SZ + REG_XSP] /* maintain align-16: offset retaddr */
# else
/* no padding so we make our own space. odd #slots keeps align-16 w/ retaddr */
lea REG_XSP, [-5*ARG_SZ + REG_XSP]
/* xax points one beyond TOS to get same offset as having retaddr there */
lea REG_XAX, [-ARG_SZ + REG_XSP]
mov [5*ARG_SZ + REG_XAX], ARG5
# endif
/* spill the first four args into their slots (both x64 platforms) */
mov [1*ARG_SZ + REG_XAX], ARG1
mov [2*ARG_SZ + REG_XAX], ARG2
mov [3*ARG_SZ + REG_XAX], ARG3
mov [4*ARG_SZ + REG_XAX], ARG4
#else
/* Stack alignment doesn't matter b/c we're swapping. */
mov REG_XAX, REG_XSP
#endif
/* we need a callee-saved reg across our call so save it onto stack */
push REG_XBX
mov REG_XBX, REG_XAX
/* alignment doesn't matter: swapping stacks */
push IF_X64_ELSE(r12, REG_XDI) /* xdi is used for func param in X64 */
/* remember the original stack so we can switch back after the call */
mov IF_X64_ELSE(r12, REG_XDI), REG_XSP
/* set up for call */
mov REG_XDX, [3*ARG_SZ + REG_XAX] /* func */
mov REG_XCX, [1*ARG_SZ + REG_XAX] /* func_arg */
mov REG_XSP, [2*ARG_SZ + REG_XAX] /* stack */
cmp PTRSZ [4*ARG_SZ + REG_XAX], 0 /* mutex_to_free */
je call_dispatch_alt_stack_no_free
/* release the mutex: xax is free to clobber since xbx holds the arg base */
mov REG_XAX, [4*ARG_SZ + REG_XAX]
mov DWORD [REG_XAX], 0
call_dispatch_alt_stack_no_free:
CALLC1(REG_XDX, REG_XCX)
/* func returned: go back to the original stack and arg base */
mov REG_XSP, IF_X64_ELSE(r12, REG_XDI)
mov REG_XAX, REG_XBX
cmp BYTE [5*ARG_SZ + REG_XAX], 0 /* return_on_return */
je GLOBAL_REF(unexpected_return)
pop IF_X64_ELSE(r12, REG_XDI)
pop REG_XBX
/* undo the entry-time xsp adjustment so that xsp points at the retaddr */
#ifdef X64
# ifdef WINDOWS
mov REG_XSP, REG_XAX
# else
lea REG_XSP, [5*ARG_SZ + REG_XSP]
# endif
#else
mov REG_XSP, REG_XAX
#endif
ret
END_FUNC(call_switch_stack)
/*
* Calls the specified function 'func' after switching to the DR stack
* for the thread corresponding to 'drcontext'.
* Passes in 8 arguments. Uses the C calling convention, so 'func' will work
* just fine even if it takes fewer than 8 args.
* Swaps the stack back upon return and returns the value returned by 'func'.
*
* void * dr_call_on_clean_stack(void *drcontext, // 1*ARG_SZ+XAX
* void *(*func)(arg1...arg8), // 2*ARG_SZ+XAX
* void *arg1, // 3*ARG_SZ+XAX
* void *arg2, // 4*ARG_SZ+XAX
* void *arg3, // 5*ARG_SZ+XAX
* void *arg4, // 6*ARG_SZ+XAX
* void *arg5, // 7*ARG_SZ+XAX
* void *arg6, // 8*ARG_SZ+XAX
* void *arg7, // 9*ARG_SZ+XAX
* void *arg8) //10*ARG_SZ+XAX
*
* Register roles:
*   xax      - base for uniform arg access until the call; holds func's
*              return value afterward
*   xbx      - copy of the arg base, preserved across the call
*   xbp      - original xsp (after our pushes), preserved across the call
*   xsi      - (Windows only) the app's TEB stack-top value
*   SCRATCH1 - func; SCRATCH2 - drcontext, then a temp for copying args
*
* On non-Windows x64 only 5 slots are reserved below the return address, so
* slot 6 (4th arg to func) aliases the return address slot: the return
* address is saved first and written back before the final ret.
*/
DECLARE_EXPORTED_FUNC(dr_call_on_clean_stack)
GLOBAL_LABEL(dr_call_on_clean_stack:)
/* avoid colliding with ARG* in either scratch reg */
# ifdef X64
# define SCRATCH1 r10
# define SCRATCH2 r11
# else
# define SCRATCH1 edx
# define SCRATCH2 ecx
# endif
/* get all args with same offset(xax) regardless of platform */
# ifdef X64
# ifdef WINDOWS
mov REG_XAX, REG_XSP
/* stack alignment doesn't really matter (b/c we're swapping) but in case
* we add a call we keep this here
*/
lea REG_XSP, [-ARG_SZ + REG_XSP] /* maintain align-16: offset retaddr */
# else
/* no padding so we make our own space. odd #slots keeps align-16 w/ retaddr */
lea REG_XSP, [-5*ARG_SZ + REG_XSP]
/* xax points one beyond TOS to get same offset as having retaddr there */
lea REG_XAX, [-ARG_SZ + REG_XSP]
/* save the retaddr: 6*ARG_SZ + xax is the entry-time xsp */
mov SCRATCH1, [6*ARG_SZ + REG_XAX]
mov [5*ARG_SZ + REG_XAX], ARG5
mov [6*ARG_SZ + REG_XAX], ARG6
# endif
mov [1*ARG_SZ + REG_XAX], ARG1
mov [2*ARG_SZ + REG_XAX], ARG2
mov [3*ARG_SZ + REG_XAX], ARG3
mov [4*ARG_SZ + REG_XAX], ARG4
# else
/* Stack alignment doesn't matter b/c we're swapping. */
mov REG_XAX, REG_XSP
# endif
# if defined(X64) && !defined(WINDOWS)
push SCRATCH1 /* retaddr */
# endif
/* we need a callee-saved reg across our call so save it onto stack */
push REG_XBX
push REG_XBP /* alignment doesn't matter: swapping stacks */
# ifdef WINDOWS
/* DrMi#1676: we have to preserve the app's TEB stack fields.
* DrMi#1723: we no longer swap StackLimit == BASE_STACK_TIB_OFFSET.
* See SWAP_TEB_STACKLIMIT().
*/
push REG_XSI
mov REG_XSI, SEG_TLS:[TOP_STACK_TIB_OFFSET]
# endif
mov REG_XBX, REG_XAX
mov REG_XBP, REG_XSP
/* set up for call */
mov SCRATCH1, [2*ARG_SZ + REG_XAX] /* func */
mov SCRATCH2, [1*ARG_SZ + REG_XAX] /* drcontext */
/* switch to the dstack recorded in the dcontext */
RESTORE_FROM_DCONTEXT_VIA_REG(SCRATCH2, dstack_OFFSET, REG_XSP)
# ifdef WINDOWS
/* DrMem i#1676: update TEB stack top field for Win8.1. */
mov SEG_TLS:[TOP_STACK_TIB_OFFSET], REG_XSP
# endif
/* Lay out the 8 outgoing args for func, copying each via SCRATCH2.
* The ARGn_NORETADDR operands and the STACK_PAD macros are defined outside
* this file view.
*/
STACK_PAD_NOPUSH(8, 4, 0)
mov SCRATCH2, [10*ARG_SZ + REG_XAX]
mov ARG8_NORETADDR, SCRATCH2
mov SCRATCH2, [9*ARG_SZ + REG_XAX]
mov ARG7_NORETADDR, SCRATCH2
mov SCRATCH2, [8*ARG_SZ + REG_XAX]
mov ARG6_NORETADDR, SCRATCH2
mov SCRATCH2, [7*ARG_SZ + REG_XAX]
mov ARG5_NORETADDR, SCRATCH2
mov SCRATCH2, [6*ARG_SZ + REG_XAX]
mov ARG4_NORETADDR, SCRATCH2
mov SCRATCH2, [5*ARG_SZ + REG_XAX]
mov ARG3_NORETADDR, SCRATCH2
mov SCRATCH2, [4*ARG_SZ + REG_XAX]
mov ARG2_NORETADDR, SCRATCH2
mov SCRATCH2, [3*ARG_SZ + REG_XAX]
mov ARG1_NORETADDR, SCRATCH2
call SCRATCH1
/* preserve return value in xax */
STACK_UNPAD(8, 4, 0)
/* back to the original stack; use xcx (not xax) for the saved arg base */
mov REG_XSP, REG_XBP
mov REG_XCX, REG_XBX
# ifdef WINDOWS
/* DrMem i#1676: we have to preserve the app's TEB stack fields */
mov SEG_TLS:[TOP_STACK_TIB_OFFSET], REG_XSI
pop REG_XSI
# endif
pop REG_XBP
pop REG_XBX
# ifdef X64
# ifdef WINDOWS
mov REG_XSP, REG_XCX
# else
pop SCRATCH1 /* retaddr */
lea REG_XSP, [5*ARG_SZ + REG_XSP]
/* put the retaddr back in the slot that was overwritten at entry */
mov PTRSZ [REG_XSP], SCRATCH1 /* retaddr */
# endif
# else
mov REG_XSP, REG_XCX
# endif
ret
END_FUNC(dr_call_on_clean_stack)
/*
* void clone_and_swap_stack(byte *stack, byte *tos)
*
* Duplicates the live region of the current stack, i.e. the bytes from the
* xsp value at entry up to (but not including) tos, so that the copy ends at
* 'stack', and then continues execution on the copy. Our own return address
* is the first byte range copied, so the final ret pops it from the clone.
*
* Register roles:
*   xax - 'stack', then the new top of stack (stack - sz)
*   xcx - 'tos', then sz (also the rep count)
*   xdx - xsp at entry
*   xsi/xdi - copy source / destination (saved and restored around the copy)
*/
DECLARE_FUNC(clone_and_swap_stack)
GLOBAL_LABEL(clone_and_swap_stack:)
/* Read both args and the entry xsp before the pushes below move xsp. */
mov REG_XAX, ARG1
mov REG_XCX, ARG2
mov REG_XDX, REG_XSP
/* xsi and xdi are callee-saved under some of our calling conventions */
push REG_XSI
push REG_XDI
/* sz = tos - entry xsp */
sub REG_XCX, REG_XDX
/* new top of stack = stack - sz: compute it before rep consumes sz */
sub REG_XAX, REG_XCX
/* memcpy(stack - sz, entry xsp, sz) */
mov REG_XSI, REG_XDX /* source = entry xsp */
mov REG_XDI, REG_XAX /* dest = stack - sz */
/* cld from signal handler for app signal should be ok */
cld
rep movsb
/* The saved regs live on the old stack, below the copied range: restore
* them first and only then swap to the cloned stack.
*/
pop REG_XDI
pop REG_XSI
mov REG_XSP, REG_XAX
ret
END_FUNC(clone_and_swap_stack)
/*
* dr_app_start - Causes application to run under Dynamo control
*/
#ifdef DR_APP_EXPORTS
/* Captures the caller's register state in a priv_mcontext_t on the stack,
* using our own return address as the pc, and passes a pointer to it to
* dr_app_start_helper(). If the helper returns, the struct and the alignment
* padding are discarded and we return normally to the caller.
* Clobbers xax (via PUSH_PRIV_MCXT).
*/
DECLARE_EXPORTED_FUNC(dr_app_start)
GLOBAL_LABEL(dr_app_start:)
ADD_STACK_ALIGNMENT_NOSEH
/* grab exec state and pass as param in a priv_mcontext_t struct */
/* The pc operand subtracts PUSH_PRIV_MCXT_PRE_PC_SHIFT because the macro
* shifts xsp by that amount before pushing pc. The FRAME_ALIGNMENT - ARG_SZ
* term matches what is removed again before the ret below.
*/
PUSH_PRIV_MCXT(PTRSZ [FRAME_ALIGNMENT - ARG_SZ + REG_XSP -\
PUSH_PRIV_MCXT_PRE_PC_SHIFT]) /* return address as pc */
/* do the rest in C */
lea REG_XAX, [REG_XSP] /* stack grew down, so priv_mcontext_t at tos */
CALLC1(GLOBAL_REF(dr_app_start_helper), REG_XAX)
/* If we come back, then DR is not taking control so clean up stack and return. */
add REG_XSP, PRIV_MCXT_SIZE + FRAME_ALIGNMENT - ARG_SZ
ret
END_FUNC(dr_app_start)
/*
* dr_app_take_over - For the client interface, we'll export 'dr_app_take_over'
* for consistency with the dr_ naming convention of all exported functions.
* We'll keep 'dynamorio_app_take_over' for compatibility with the preinjector.
*
* Implemented as a tail jump, so dynamorio_app_take_over sees our caller's
* return address at the top of the stack and returns directly to our caller.
*/
DECLARE_EXPORTED_FUNC(dr_app_take_over)
GLOBAL_LABEL(dr_app_take_over: )
jmp GLOBAL_REF(dynamorio_app_take_over)
END_FUNC(dr_app_take_over)
/* dr_app_running_under_dynamorio - Indicates whether the current thread
* is running within the DynamoRIO code cache.
* Returns false (not under dynamorio) by default.
* The function is mangled by dynamorio to return true instead when
* it is brought into the code cache.
*
* The native body below simply returns 0 in eax.
*/
DECLARE_EXPORTED_FUNC(dr_app_running_under_dynamorio)
GLOBAL_LABEL(dr_app_running_under_dynamorio: )
mov eax, 0
ret
END_FUNC(dr_app_running_under_dynamorio)
#endif
/*
* dynamorio_app_take_over - Causes application to run under Dynamo
* control. Dynamo never releases control.
*
* Captures the caller's register state in a priv_mcontext_t on the stack,
* using our own return address as the pc, and passes a pointer to it to
* dynamorio_app_take_over_helper(). If the helper returns, the struct and
* the alignment padding are discarded and we return normally to the caller.
* Clobbers xax (via PUSH_PRIV_MCXT).
*/
DECLARE_EXPORTED_FUNC(dynamorio_app_take_over)
GLOBAL_LABEL(dynamorio_app_take_over:)
ADD_STACK_ALIGNMENT_NOSEH
/* grab exec state and pass as param in a priv_mcontext_t struct */
/* The pc operand subtracts PUSH_PRIV_MCXT_PRE_PC_SHIFT because the macro
* shifts xsp by that amount before pushing pc. The FRAME_ALIGNMENT - ARG_SZ
* term matches what is removed again before the ret below.
*/
PUSH_PRIV_MCXT(PTRSZ [FRAME_ALIGNMENT - ARG_SZ + REG_XSP -\
PUSH_PRIV_MCXT_PRE_PC_SHIFT]) /* return address as pc */
/* do the rest in C */
lea REG_XAX, [REG_XSP] /* stack grew down, so priv_mcontext_t at tos */
CALLC1(GLOBAL_REF(dynamorio_app_take_over_helper), REG_XAX)
/* If we come back, then DR is not taking control so clean up stack and return. */
add REG_XSP, PRIV_MCXT_SIZE + FRAME_ALIGNMENT - ARG_SZ
ret
END_FUNC(dynamorio_app_take_over)
/*
* cleanup_and_terminate(dcontext_t *dcontext, // 1*ARG_SZ+XBP
* ptr_uint_t sysnum, // 2*ARG_SZ+XBP = syscall #
* ptr_uint_t sys_arg1/param_base, // 3*ARG_SZ+XBP = arg1 for syscall
* ptr_uint_t sys_arg2, // 4*ARG_SZ+XBP = arg2 for syscall
* bool exitproc, // 5*ARG_SZ+XBP
* (These 2 args are only used for Mac thread exit:)
* ptr_uint_t sys_arg3, // 6*ARG_SZ+XBP = arg3 for syscall
* ptr_uint_t sys_arg4) // 7*ARG_SZ+XBP = arg4 for syscall
*
* See decl in arch_exports.h for description.
*
* Note that this routine does not return and thus clobbers callee-saved regs.
*
* Outline:
*   1) Spill the args so that arg N is at N*ARG_SZ + xbp on every platform.
*   2) Increment exiting_thread_count and set dcontext->is_exiting.
*   3) Remember dcontext->dstack if we are currently on it (else 0), and
*      fetch the syscall entry point to jump to at the end.
*   4) Call dynamo_process_exit() or dynamo_thread_exit() per exitproc.
*   5) Spin to acquire initstack_mutex, then swap to d_r_initstack.
*   6) Call dynamo_thread_stack_free_and_exit(dstack).
*   7) Load the syscall number and args into registers, release
*      initstack_mutex, decrement exiting_thread_count, and jump to the
*      syscall entry point.
*/
DECLARE_FUNC(cleanup_and_terminate)
GLOBAL_LABEL(cleanup_and_terminate:)
/* get all args with same offset(xbp) regardless of platform, to save
* across our calls.
*/
#ifdef X64
# ifdef WINDOWS
mov REG_XBP, REG_XSP
lea REG_XSP, [-ARG_SZ + REG_XSP] /* maintain align-16: offset retaddr */
# else
/* no padding so we make our own space. odd #slots keeps align-16 w/ retaddr */
lea REG_XSP, [-5*ARG_SZ + REG_XSP]
/* xbp points one beyond TOS to get same offset as having retaddr there */
lea REG_XBP, [-ARG_SZ + REG_XSP]
mov [5*ARG_SZ + REG_XBP], ARG5
/* 6*ARG_SZ + xbp is the entry-time xsp, i.e. the return address slot: it is
* fine to overwrite it since this routine never returns.
*/
mov [6*ARG_SZ + REG_XBP], ARG6
/* NOTE(review): ARG7 is read after xsp was lowered above. If ARG7 is
* addressed relative to xsp (its definition is outside this file view),
* confirm that it still refers to the caller's 7th argument here.
*/
mov REG_XAX, ARG7
mov [7*ARG_SZ + REG_XBP], REG_XAX
# endif
mov [1*ARG_SZ + REG_XBP], ARG1
mov [2*ARG_SZ + REG_XBP], ARG2
mov [3*ARG_SZ + REG_XBP], ARG3
mov [4*ARG_SZ + REG_XBP], ARG4
#else
/* 32-bit: the args are already on the stack above the retaddr */
mov REG_XBP, REG_XSP
# ifdef UNIX
lea REG_XSP, [-3*ARG_SZ + REG_XSP] /* maintain align-16: offset retaddr */
# endif
#endif
/* increment exiting_thread_count so that we don't get killed after
* thread_exit removes us from the all_threads list */
#if !defined(X64) && defined(LINUX)
/* PR 212290: avoid text relocations: get PIC base into callee-saved xdi.
* Can't use CALLC0 since it inserts a nop: we need the exact retaddr.
*/
call get_pic_xdi
lea REG_XDI, [_GLOBAL_OFFSET_TABLE_ + REG_XDI]
lea REG_XAX, VAR_VIA_GOT(REG_XDI, GLOBAL_REF(exiting_thread_count))
lock inc DWORD [REG_XAX]
#else
lock inc DWORD SYMREF(exiting_thread_count) /* rip-rel for x64 */
#endif
/* save dcontext->dstack for freeing later and set dcontext->is_exiting */
/* xbx is callee-saved and not an x64 param so we can use it */
mov REG_XBX, PTRSZ [1*ARG_SZ + REG_XBP] /* dcontext */
SAVE_TO_DCONTEXT_VIA_REG(REG_XBX,is_exiting_OFFSET,1)
CALLC1(GLOBAL_REF(is_currently_on_dstack), REG_XBX) /* xbx is callee-saved */
/* only the low byte of the return value is examined */
cmp al, 0
jnz cat_save_dstack
mov REG_XBX, 0 /* save 0 for dstack to avoid double-free */
jmp cat_done_saving_dstack
cat_save_dstack:
/* xbx: dcontext => dcontext->dstack */
RESTORE_FROM_DCONTEXT_VIA_REG(REG_XBX,dstack_OFFSET,REG_XBX)
cat_done_saving_dstack:
/* PR 306421: xbx is callee-saved for all platforms, so don't push yet,
* to maintain 16-byte stack alignment
*/
/* avoid sygate sysenter version as our stack may be static const at
* that point, caller will take care of sygate hack */
CALLC0(GLOBAL_REF(get_cleanup_and_terminate_global_do_syscall_entry))
#if defined(UNIX) && !defined(X64)
lea REG_XSP, [-2*ARG_SZ + REG_XSP] /* maintain align-16 w/ 2 pushes below */
#endif
/* stash dstack (xbx) and the syscall entry point (xax) across the exit calls */
push REG_XBX /* 16-byte aligned again */
push REG_XAX
/* upper bytes are 0xab so only look at lower bytes */
movzx esi, BYTE [5*ARG_SZ + REG_XBP] /* exitproc */
cmp esi, 0
jz cat_thread_only
CALLC0(GLOBAL_REF(dynamo_process_exit))
jmp cat_no_thread
cat_thread_only:
CALLC0(GLOBAL_REF(dynamo_thread_exit))
cat_no_thread:
/* now switch to d_r_initstack for cleanup of dstack
* could use d_r_initstack for whole thing but that's too long
* of a time to hold global initstack_mutex */
/* Spin lock: atomically swap 1 into the mutex; we own it once the old
* value that comes back in ecx is 0.
*/
mov ecx, 1
#if !defined(X64) && defined(LINUX)
/* PIC base is still in xdi */
lea REG_XAX, VAR_VIA_GOT(REG_XDI, GLOBAL_REF(initstack_mutex))
#endif
cat_spin:
#if !defined(X64) && defined(LINUX)
xchg DWORD [REG_XAX], ecx
#else
xchg DWORD SYMREF(initstack_mutex), ecx /* rip-relative on x64 */
#endif
jecxz cat_have_lock
/* try again -- too few free regs to call sleep() */
pause /* good thing gas now knows about pause */
jmp cat_spin
cat_have_lock:
/* need to grab everything off dstack first */
#ifdef WINDOWS
/* PR 601533: the wow64 syscall writes to the stack b/c it
* makes a call, so we have a race that can lead to a hang or
* worse. we do not expect the syscall to return, so we can
* use a global single-entry stack (the wow64 layer swaps to a
* different stack: presumably for alignment and other reasons).
*/
CALLC1(GLOBAL_REF(os_terminate_wow64_stack), -1/*INVALID_HANDLE_VALUE*/)
mov REG_XDI, REG_XAX /* esp to use */
#endif
mov REG_XSI, [2*ARG_SZ + REG_XBP] /* sysnum */
#ifdef MACOS64
/* For now we assume a BSD syscall */
or REG_XSI, SYSCALL_NUM_MARKER_BSD
#endif
pop REG_XAX /* syscall */
pop REG_XCX /* dstack */
#if defined(UNIX) && !defined(X64)
lea REG_XSP, [2*ARG_SZ + REG_XSP] /* undo align-16 lea from above */
#endif
mov REG_XBX, REG_XBP /* save for arg access after swapping stacks */
/* swap stacks */
#if !defined(X64) && defined(LINUX)
/* PIC base is still in xdi */
lea REG_XBP, VAR_VIA_GOT(REG_XDI, GLOBAL_REF(d_r_initstack))
mov REG_XSP, PTRSZ [REG_XBP]
#else
mov REG_XSP, PTRSZ SYMREF(d_r_initstack) /* rip-relative on x64 */
#endif
/* now save registers */
/* Everything needed for the final syscall is pushed onto d_r_initstack so
* that it survives the call below; it is popped back in reverse order.
*/
#if defined(MACOS) && !defined(X64)
cmp BYTE [5*ARG_SZ + REG_XBX], 0 /* exitproc */
jz cat_thread_only2
/* ensure aligned after 1st 2 arg pushes below, which are the syscall args */
lea REG_XSP, [-2*ARG_SZ + REG_XSP]
jmp cat_no_thread2
cat_thread_only2: /* for thread, the 4 pushes make it aligned */
push PTRSZ [7*ARG_SZ + REG_XBX] /* sys_arg4 */
push PTRSZ [6*ARG_SZ + REG_XBX] /* sys_arg3 */
cat_no_thread2:
#endif
#ifdef WINDOWS
push REG_XDI /* esp to use */
#endif
push PTRSZ [4*ARG_SZ + REG_XBX] /* sys_arg2 */
push PTRSZ [3*ARG_SZ + REG_XBX] /* sys_arg1 */
push REG_XAX /* syscall */
push REG_XSI /* sysnum => xsp 16-byte aligned for x64 and x86 */
#if defined(UNIX) && !defined(X64)
lea REG_XSP, [-2*ARG_SZ + REG_XSP] /* align to 16 for this call */
#endif
/* free dstack and call the EXIT_DR_HOOK */
CALLC1(GLOBAL_REF(dynamo_thread_stack_free_and_exit), REG_XCX) /* pass dstack */
#if defined(UNIX) && !defined(X64)
lea REG_XSP, [2*ARG_SZ + REG_XSP] /* undo align to 16 */
#endif
/* finally, execute the termination syscall */
pop REG_XAX /* sysnum */
#ifdef X64
/* We assume we're doing "syscall" on Windows & Linux, where r10 is dead */
pop r10 /* syscall, in reg dead at syscall */
# ifdef UNIX
pop REG_XDI /* sys_arg1 */
pop REG_XSI /* sys_arg2 */
# else
pop REG_XCX /* sys_arg1 */
pop REG_XDX /* sys_arg2 */
# endif
#else
pop REG_XSI /* syscall */
# ifdef MACOS
/* Leave the args on the stack for 32-bit Mac. We actually need another
* slot before the 1st arg (usually the retaddr for app syscall).
* This ends up with stack alignment of 0xc, which is what we want.
*/
push 0
# elif defined(LINUX)
pop REG_XBX /* sys_arg1 */
pop REG_XCX /* sys_arg2 */
# else
pop REG_XDX /* sys_arg1 == param_base */
pop REG_XCX /* sys_arg2 (unused) */
# endif
#endif
#ifdef WINDOWS
pop REG_XSP /* get the stack pointer we pushed earlier */
#endif
/* Give up initstack_mutex -- potential problem here with a thread getting
* an asynch event that then uses d_r_initstack, but syscall should only care
* about ebx and edx. */
#if !defined(X64) && defined(LINUX)
/* PIC base is still in xdi */
/* NOTE(review): unlike the other VAR_VIA_GOT uses in this routine, this one
* does not wrap the symbol in GLOBAL_REF() -- confirm that is intentional.
*/
lea REG_XBP, VAR_VIA_GOT(REG_XDI, initstack_mutex)
mov DWORD [REG_XBP], 0
#else
mov DWORD SYMREF(initstack_mutex), 0 /* rip-relative on x64 */
#endif
/* we are finished with all shared resources, decrement the
* exiting_thread_count (allows another thread to kill us) */
#if !defined(X64) && defined(LINUX)
/* PIC base is still in xdi */
lea REG_XBP, VAR_VIA_GOT(REG_XDI, GLOBAL_REF(exiting_thread_count))
lock dec DWORD [REG_XBP]
#else
lock dec DWORD SYMREF(exiting_thread_count) /* rip-rel on x64 */
#endif
/* tail-jump to the syscall entry point obtained in step 3 */
#ifdef X64
jmp r10 /* go do the syscall! */
#else
jmp REG_XSI /* go do the syscall! */
#endif
END_FUNC(cleanup_and_terminate)
/* global_do_syscall_int
* Caller is responsible for all set up. For windows this means putting the
* syscall num in eax and putting the args at edx. For linux this means putting
* the syscall num in eax, and the args in ebx, ecx, edx, esi, edi and ebp (in
* that order, as needed). global_do_syscall is only used with system calls
* that we don't expect to return, so for debug builds we go into an infinite
* loop if syscall returns.
*
* Issues the syscall via a software interrupt: int 0x2e on Windows and
* int 0x80 otherwise. This routine is reached by a jmp and has no ret.
* Note that in a non-DEBUG Windows build no instruction is emitted here
* after the int.
*/
DECLARE_FUNC(global_do_syscall_int)
GLOBAL_LABEL(global_do_syscall_int:)
#ifdef WINDOWS
int HEX(2e)
#else
/* XXX: if we need to make any Mach syscalls for MacOS here, we'll
* need a sysenter version, as the kernel throws SIGSYS when using int.
*/
int HEX(80)
#endif
#ifdef DEBUG
jmp GLOBAL_REF(debug_infinite_loop)
#endif
#ifdef UNIX
/* we do come here for SYS_kill which can fail: try again via exit_group */
/* (in a DEBUG build the unconditional jmp above is taken instead) */
jmp GLOBAL_REF(dynamorio_sys_exit_group)
#endif
END_FUNC(global_do_syscall_int)
/* For sygate hack need to indirect the system call through ntdll. */
#ifdef WINDOWS
/* global_do_syscall_sygate_int
* Same contract as global_do_syscall_int, but the syscall instruction that
* executes is the one whose address is stored in int_syscall_address: we
* jump indirectly through that variable rather than issuing the int here.
*/
DECLARE_FUNC(global_do_syscall_sygate_int)
GLOBAL_LABEL(global_do_syscall_sygate_int:)
/* would be nicer to call so we could return to debug_infinite_loop on
* failure, but on some paths (cleanup_and_terminate) we can no longer
* safely use the stack */
jmp PTRSZ SYMREF(int_syscall_address)
END_FUNC(global_do_syscall_sygate_int)
#endif
/* global_do_syscall_sysenter
* Caller is responsible for all set up, this means putting the syscall num
* in eax and putting the args at edx+8 (windows specific, we don't yet support
* linux sysenter). global_do_syscall is only used with system calls that we
* don't expect to return. As edx becomes esp, if the syscall does return it
* will go to the address in [edx] (again windows specific) (if any debugging
* code is desired should be pointed to there, do note that edx will become esp
* so be aware of stack limitations/protections).
*
* The sysenter instruction is emitted as raw opcode bytes (0f 34) rather
* than by mnemonic.
*/
DECLARE_FUNC(global_do_syscall_sysenter)
GLOBAL_LABEL(global_do_syscall_sysenter:)
RAW(0f) RAW(34) /* sysenter */
#ifdef DEBUG
/* We'll never ever reach here, sysenter won't/can't return to this
* address since it doesn't know it, but we'll put in a jmp to
* debug_infinite_loop just in case */
jmp GLOBAL_REF(debug_infinite_loop)
#endif
END_FUNC(global_do_syscall_sysenter)
/* Sygate case 5441 hack - the return address (edx) needs to point to
* ntdll to pass their verification. Global_do_syscall is really only
* used with system calls that aren't expected to return so we don't
* have to be too careful. Just shuffle the stack using the sysret addr.
* If there is already a return address we'll keep that (just move down
* a slot).
*/
#ifdef WINDOWS
/* global_do_syscall_sygate_sysenter
* Sysenter variant that makes the value at [edx] be sysenter_ret_address.
* On entry xdx points at the caller-prepared block (any existing return
* address at [xdx]). On reaching the sysenter:
*   [xdx]          = sysenter_ret_address
*   [xdx + ARG_SZ] = the value previously at [xdx]
* and xsp == xdx. Whatever was previously at [xdx + ARG_SZ] is overwritten.
*/
DECLARE_FUNC(global_do_syscall_sygate_sysenter)
GLOBAL_LABEL(global_do_syscall_sygate_sysenter:)
mov REG_XSP, REG_XDX
/* move existing ret down a slot (note target address is
* computed with already inc'ed esp [see intel docs]) */
pop PTRSZ [REG_XSP]
/* xsp is now xdx + ARG_SZ; this push brings it back to xdx */
push PTRSZ SYMREF(sysenter_ret_address)
#if defined(X64) && defined(WINDOWS)
syscall /* FIXME ml64 won't take "sysenter" so half-fixing now */
#else
sysenter
#endif
#ifdef DEBUG
/* We'll never ever reach here, sysenter won't/can't return to this
* address since it doesn't know it, but we'll put in a jmp to
* debug_infinite_loop just in case */
jmp GLOBAL_REF(debug_infinite_loop)
#endif
END_FUNC(global_do_syscall_sygate_sysenter)
#endif
/* Both Windows and Linux put rcx into r10 since rcx is used as the return addr */
#ifdef X64
/* global_do_syscall_syscall
* Caller is responsible for all set up: putting the syscall num in eax
* and the args in registers/memory. Only used with system calls
* that we don't expect to return, so for debug builds we go into an infinite
* loop if syscall returns.
*
* The value in xcx is copied to r10 before the syscall instruction executes.
* This routine is reached by a jmp and has no ret.
*/
DECLARE_FUNC(global_do_syscall_syscall)
GLOBAL_LABEL(global_do_syscall_syscall:)
mov r10, REG_XCX
syscall
# ifdef DEBUG
jmp GLOBAL_REF(debug_infinite_loop)
# endif
# ifdef UNIX
/* we do come here for SYS_kill which can fail: try again via exit_group */
/* (in a DEBUG build the unconditional jmp above is taken instead) */
jmp GLOBAL_REF(dynamorio_sys_exit_group)
# endif
END_FUNC(global_do_syscall_syscall)
#endif
#ifdef WINDOWS
/* global_do_syscall_wow64
* Xref case 3922
* Caller is responsible for all set up: putting the syscall num in eax,
* the wow64 index into ecx, and the args in edx. Only used with system calls
* that we don't expect to return, so for debug builds we go into an infinite
* loop if syscall returns.
*
* The syscall is performed by an indirect call through the pointer stored
* at fs:[0xc0]. Because this is a call, it writes a return address to the
* current stack.
*/
DECLARE_FUNC(global_do_syscall_wow64)
GLOBAL_LABEL(global_do_syscall_wow64:)
call PTRSZ SEGMEM(fs,HEX(0c0))
#ifdef DEBUG
jmp GLOBAL_REF(debug_infinite_loop)
#endif
END_FUNC(global_do_syscall_wow64)
/* global_do_syscall_wow64_index0
* Same as global_do_syscall_wow64, except that it zeroes ecx (the wow64
* index) itself, so the caller only sets up eax and edx.
*/
DECLARE_FUNC(global_do_syscall_wow64_index0)
GLOBAL_LABEL(global_do_syscall_wow64_index0:)
xor ecx, ecx
call PTRSZ SEGMEM(fs,HEX(0c0))
#ifdef DEBUG
jmp GLOBAL_REF(debug_infinite_loop)
#endif
END_FUNC(global_do_syscall_wow64_index0)
#endif /* WINDOWS */
#ifdef DEBUG
/* Just an infinite CPU eating loop used to mark certain failures.
* Consists of a single jmp to itself, so it touches no registers or memory.
* In this file it is the landing spot for syscalls that were not expected
* to return.
*/
DECLARE_FUNC(debug_infinite_loop)
GLOBAL_LABEL(debug_infinite_loop:)
jmp GLOBAL_REF(debug_infinite_loop)
END_FUNC(debug_infinite_loop)
#endif
#ifdef WINDOWS
/* We use our own syscall wrapper for key win32 system calls.
*
* We would use a dynamically generated routine created by decoding
* a real ntdll wrapper and tweaking it, but we need to use
* this for our own syscalls and have a bootstrapping problem -- so
* rather than hacking to get the power to decode w/o a heap, we hardcode
* the types we support here.
*
* We assume that all syscall wrappers are identical, and they have
* specific instruction sequences -- thus this routine needs to be updated
* with any syscall sequence change in a future version of ntdll.dll!
*
* We construct our own minimalist versions that use C calling convention
* and take as a first argument the system call number:
*
* ref case 5217, for Sygate compatibility the int needs to come from
* ntdll.dll, we use a call to NtYieldExecution+9 (int 2e; ret;)
*
* 1) mov immed, eax mov 4(esp), eax
* lea 4(esp), edx ==> lea 8(esp), edx
* int 2e int 2e
* ret 4*numargs ret
*
* 2) mov immed, eax mov 4(esp), eax
* mov 0x7ffe0300, edx mov esp, edx
* call {edx,(edx)} < juggle stack, see below >
* NOTE - to support the sygate case 5441 hack, the actual instructions
* - we use are different, but they end up doing the same thing
* callee: ==> sysenter
* mov esp, edx our_ret:
* sysenter ret
* ret
* ret 4*numargs
*
* => signature: dynamorio_syscall_{int2e,sysenter}(sysnum, arg1, arg2, ...)
*/
/* dynamorio_syscall_int2e(sysnum, arg1, arg2, ...)
* C-convention wrapper that issues a system call via int 0x2e.
* On entry [esp] is the return address, [esp+4] the syscall number and
* [esp+8] onward the syscall args. eax is loaded with the number and edx
* with the address of the first arg. Returns with a plain ret, so the
* caller cleans up the args.
*/
DECLARE_FUNC(dynamorio_syscall_int2e)
GLOBAL_LABEL(dynamorio_syscall_int2e:)
mov eax, [4 + esp]
lea edx, [8 + esp]
int HEX(2e)
ret
END_FUNC(dynamorio_syscall_int2e)
DECLARE_FUNC(dynamorio_syscall_sygate_int2e)
GLOBAL_LABEL(dynamorio_syscall_sygate_int2e:)
/* Same register setup as dynamorio_syscall_int2e, but instead of executing
 * int 2e here it calls through the pointer stored in int_syscall_address.
 * Stack on entry:
 *   esp + 0   return address
 *         4   sysnum
 *         8+  syscall args
 */
mov eax, [4 + esp] /* eax = sysnum */
lea edx, [8 + esp] /* edx = address of the first syscall arg */
call PTRSZ SYMREF(int_syscall_address)
ret
END_FUNC(dynamorio_syscall_sygate_int2e)
DECLARE_FUNC(dynamorio_syscall_sysenter)
GLOBAL_LABEL(dynamorio_syscall_sysenter:)
/* dynamorio_syscall_sysenter(sysnum, arg1, arg2, ...)
 * Stack on entry:
 *   esp + 0   return address
 *         4   syscall num
 *         8+  syscall args
 * Ref case 5461 edx serves as both the argument pointer (edx+8) and the
 * top of stack for the kernel sysexit. */
mov eax, [4 + esp] /* eax = syscall num */
mov REG_XDX, REG_XSP /* xdx = current top of stack (points at the return address) */
#if defined(X64) && defined(WINDOWS)
syscall /* FIXME ml64 won't take "sysenter" so half-fixing now */
#else
sysenter
#endif
/* Kernel sends control to hardcoded location, which does ret,
 * which will return directly back to the caller. Thus the following
 * ret will never execute. */
ret
END_FUNC(dynamorio_syscall_sysenter)
DECLARE_GLOBAL(dynamorio_mach_syscall_fixup)
DECLARE_FUNC(dynamorio_syscall_sygate_sysenter)
GLOBAL_LABEL(dynamorio_syscall_sygate_sysenter:)
/* dynamorio_syscall_sygate_sysenter(sysnum, arg1, arg2, ...)
 *
 * stack looks like:
 *   esp + 0   return address
 *         4   syscall num
 *         8+  syscall args
 * Ref case 5461 edx serves as both the argument pointer (edx+8) and the
 * top of stack for the kernel sysexit. While we could do nothing and
 * just have the sysenter return straight back to the caller, we use
 * sysenter_ret_address indirection to support the Sygate compatibility
 * fix for case 5441 where we steal a ret from ntdll.dll so need to mangle
 * our stack to look like
 *   esp + 0   sysenter_ret_address
 *         4   dynamorio_mach_syscall_fixup
 *         8+  syscall args
 *   sysenter_tls_slot   return address
 * before we do the edx <- esp
 *
 * NOTE - we can NOT just have
 *   esp + 0   sysenter_ret_address
 *         4   return address
 *         8   args
 * as even though this will go the right place, the stack will be one
 * off on the return (debug builds with frame ptr are ok, but not
 * release). We could roll our own custom calling convention for this
 * but would be a pain given how this function is called. So we use a
 * tls slot to store the return address around the system call since
 * there isn't room on the stack, thus is not re-entrant, but neither is
 * dr and we don't make alertable system calls. An alternate scheme
 * kept the return address off the top of the stack which works fine
 * (nothing alertable), but just seemed too risky.
 * FIXME - any perf impact from breaking hardware return predictor */
pop REG_XDX /* xdx = caller's return address */
mov eax, DWORD SYMREF(sysenter_tls_offset)
mov SEGMEM(fs,eax), edx /* stash the return address in the tls slot */
pop REG_XAX /* xax = syscall num; xsp now points at the first syscall arg */
#ifdef X64
/* Can't push a 64-bit immed */
mov REG_XCX, dynamorio_mach_syscall_fixup
push REG_XCX
#else
push dynamorio_mach_syscall_fixup
#endif
push PTRSZ SYMREF(sysenter_ret_address)
mov REG_XDX, REG_XSP /* xdx = top of the mangled stack described above */
#if defined(X64) && defined(WINDOWS)
syscall /* FIXME ml64 won't take "sysenter" so half-fixing now */
#else
sysenter
#endif
ADDRTAKEN_LABEL(dynamorio_mach_syscall_fixup:)
/* Rebuild the original two stack slots (syscall num slot, return address)
 * so the final ret goes back to the caller with the stack where it expects.
 */
/* push whatever (was the slot for the eax arg) */
push REG_XAX
/* ecx/edx should be dead here, just borrow one */
mov edx, DWORD SYMREF(sysenter_tls_offset)
push PTRSZ SEGMEM(fs,edx) /* return address saved in the tls slot */
ret
END_FUNC(dynamorio_syscall_sygate_sysenter)
# ifdef X64
/* With the 1st 4 args in registers, we don't want the sysnum to shift them
* all as it's not easy to un-shift. So, we put the 1st arg last, and
* the SYS enum value first. We use the syscall_argsz array to restore
* the 1st arg. Since the return value is never larger than 64 bits, we
* never have to worry about a hidden 1st arg that shifts the rest.
*/
DECLARE_FUNC(dynamorio_syscall_syscall)
GLOBAL_LABEL(dynamorio_syscall_syscall:)
/* On entry ARG1 holds the SYS enum value. It indexes the syscalls table
 * (giving the number loaded into eax) and the syscall_argsz table (giving
 * the argument count in r10). ARG1 is then overwritten with the real first
 * syscall argument, which the caller passed after the last argument.
 */
mov rax, QWORD SYMREF(syscalls)
/* the upper 32 bits are automatically zeroed */
mov eax, DWORD [rax + ARG1*4] /* sysnum in rax */
mov r10, syscall_argsz
/* the upper 32 bits are automatically zeroed */
mov r10d, DWORD [r10 + ARG1*4] /* # args in r10 */
/* Dispatch on the argument count to find where the real 1st arg lives. */
cmp r10, 0
je dynamorio_syscall_syscall_ready
cmp r10, 1
je dynamorio_syscall_syscall_1arg
cmp r10, 2
je dynamorio_syscall_syscall_2arg
cmp r10, 3
je dynamorio_syscall_syscall_3arg
/* else, >= 4 args, so pull from arg slot of (#args + 1) */
mov ARG1, QWORD [rsp + r10*8 + 8]
jmp dynamorio_syscall_syscall_ready
dynamorio_syscall_syscall_1arg:
mov ARG1, ARG2 /* 1 arg: the real 1st arg was passed in the 2nd position */
jmp dynamorio_syscall_syscall_ready
dynamorio_syscall_syscall_2arg:
mov ARG1, ARG3 /* 2 args: the real 1st arg was passed in the 3rd position */
jmp dynamorio_syscall_syscall_ready
dynamorio_syscall_syscall_3arg:
mov ARG1, ARG4 /* 3 args: the real 1st arg was passed in the 4th position */
/* fall-through */
dynamorio_syscall_syscall_ready:
mov r10, rcx /* put rcx in r10 just like Nt wrappers (syscall writes rcx) */
syscall
ret
END_FUNC(dynamorio_syscall_syscall)
# endif
/* For WOW64 (case 3922) the syscall wrappers call *teb->WOW32Reserved (==
* wow64cpu!X86SwitchTo64BitMode), which is a far jmp that switches to the
* 64-bit cs segment (0x33 selector). They pass in ecx an index into
* a function table of argument conversion routines.
*
* 3) mov sysnum, eax
* mov tableidx, ecx
* call *fs:0xc0
* callee:
* jmp 0x33:wow64cpu!CpupReturnFromSimulatedCode
* ret 4*numargs
*
* rather than taking in sysnum and tableidx, we take in sys_enum and
* look up the sysnum and tableidx to keep the same args as the other
* dynamorio_syscall_* routines
* => signature: dynamorio_syscall_wow64(sys_enum, arg1, arg2, ...)
*/
DECLARE_FUNC(dynamorio_syscall_wow64)
GLOBAL_LABEL(dynamorio_syscall_wow64:)
/* dynamorio_syscall_wow64(sys_enum, arg1, arg2, ...)
 * Stack on entry:
 *   esp + 0   return address
 *         4   sys_enum
 *         8+  syscall args
 */
mov eax, [4 + esp] /* eax = sys_enum */
mov edx, DWORD SYMREF(wow64_index)
mov ecx, [edx + eax*4] /* ecx = wow64_index[sys_enum] (the table index) */
mov edx, DWORD SYMREF(syscalls)
mov eax, [edx + eax*4] /* eax = syscalls[sys_enum] (the sysnum) */
lea edx, [8 + esp] /* edx = address of the first syscall arg */
call PTRSZ SEGMEM(fs,HEX(0c0))
ret
END_FUNC(dynamorio_syscall_wow64)
/* Win8 has no index and furthermore requires the stack to be set
* up (i.e., we can't just point edx where we want it).
* Thus, we must shift the retaddr one slot down on top of sys_enum.
* => signature: dynamorio_syscall_wow64_noedx(sys_enum, arg1, arg2, ...)
*/
DECLARE_FUNC(dynamorio_syscall_wow64_noedx)
GLOBAL_LABEL(dynamorio_syscall_wow64_noedx:)
/* dynamorio_syscall_wow64_noedx(sys_enum, arg1, arg2, ...)
 * Stack on entry:
 *   esp + 0   return address
 *         4   sys_enum
 *         8+  syscall args
 */
mov eax, [4 + esp] /* eax = sys_enum */
mov ecx, DWORD SYMREF(syscalls)
mov eax, [ecx + eax*4] /* eax = syscalls[sys_enum] (the sysnum) */
/* Copy the return address over the sys_enum slot and drop the old top
 * slot, so the return address sits directly below the syscall args.
 */
mov ecx, [esp]
mov [esp + 4], ecx
lea esp, [esp + 4]
call PTRSZ SEGMEM(fs,HEX(0c0))
/* we have to restore the stack shift of course (i#1036) */
mov ecx, [esp]
mov [esp - 4], ecx
lea esp, [esp - 4]
ret
END_FUNC(dynamorio_syscall_wow64_noedx)
#endif /* WINDOWS */
#ifdef UNIX
/* i#555: to prevent clients from using the app's vsyscall, we force all
* clients to use int 0x80 for system calls.
*/
DECLARE_FUNC(client_int_syscall)
GLOBAL_LABEL(client_int_syscall:)
/* Executes int 0x80 with whatever register state the caller set up, then
 * returns; no registers are touched here.
 */
int HEX(80)
ret
END_FUNC(client_int_syscall)
#endif /* UNIX */
#ifdef UNIX
#ifdef LINUX /* XXX i#1285: implement MacOS private loader + injector */
#if !defined(STANDALONE_UNIT_TEST) && !defined(STATIC_LIBRARY)
/* i#47: Early injection _start routine. The kernel sets all registers to zero
* except the SP and PC. The stack has argc, argv[], envp[], and the auxiliary
* vector laid out on it.
* If we reload ourselves (i#1227) we'll set xdi and xsi to the base and size
* of the old library that needs to be unmapped.
*/
DECLARE_FUNC(_start)
GLOBAL_LABEL(_start:)
/* Flow: unless xdi is non-zero (the reload case), call relocate_dynamorio
 * with the initial xsp; then call privload_early_inject(xsp, xdi, xsi).
 * Neither path is expected to come back here.
 */
/* i#38: Attaching while in middle of blocking syscall requires padded null bytes
 * with number_of_nop_instr = sizeof(syscall_instr) / sizeof(nop_instr).
 * For detailed explanation see issue page.
 */
nop
nop
/* i#1676, i#1708: relocate dynamorio if it is not loaded to preferred address.
 * We call this here to ensure it's safe to access globals once in C code
 * (xref i#1865).
 */
cmp REG_XDI, 0 /* if reloaded, skip for speed + preserve xdi and xsi */
jne reloaded_xfer
mov REG_XAX, REG_XSP /* The CALLC3 may change xsp so grab it first. */
CALLC3(GLOBAL_REF(relocate_dynamorio), 0, 0, REG_XAX)
mov REG_XDI, 0 /* xdi should be callee-saved but is not always: i#2641 */
reloaded_xfer:
xor REG_XBP, REG_XBP /* Terminate stack traces at NULL. */
# ifdef X64
/* Reverse order to avoid clobbering */
mov ARG3, REG_XSI
mov ARG2, REG_XDI
mov ARG1, REG_XSP
# else
mov REG_XAX, REG_XSP /* capture the initial xsp before adjusting it */
/* We maintain 16-byte alignment not just for MacOS but also for
 * the new Linux ABI. Xref DrMi#1899 and i#847.
 */
lea REG_XSP, [-ARG_SZ + REG_XSP]
/* Push the three arguments by hand, last argument first. */
push REG_XSI
push REG_XDI
push REG_XAX
# endif
/* Arguments are already in place, hence the zero-argument call macro. */
CALLC0(GLOBAL_REF(privload_early_inject))
jmp GLOBAL_REF(unexpected_return)
END_FUNC(_start)
#endif /* !STANDALONE_UNIT_TEST && !STATIC_LIBRARY */
/* i#1227: on a conflict with the app we reload ourselves.
* xfer_to_new_libdr(entry, init_sp, cur_dr_map, cur_dr_size)
* =>
* Invokes entry after setting sp to init_sp and placing the current (old)
* libdr bounds in registers for the new libdr to unmap.
*/
DECLARE_FUNC(xfer_to_new_libdr)
GLOBAL_LABEL(xfer_to_new_libdr:)
/* xfer_to_new_libdr(entry, init_sp, cur_dr_map, cur_dr_size)
 * Does not return: it switches xsp to init_sp and jumps to entry with
 * xdi = cur_dr_map and xsi = cur_dr_size. xbx is overwritten and not
 * restored.
 */
/* Get the args */
mov REG_XAX, ARG1 /* xax = entry */
mov REG_XBX, ARG2 /* xbx = init_sp */
/* _start looks in xdi and xsi for these */
mov REG_XDI, ARG3
mov REG_XSI, ARG4
/* Restore sp */
mov REG_XSP, REG_XBX
jmp REG_XAX
END_FUNC(xfer_to_new_libdr)
#endif /* LINUX */
/* while with pre-2.6.9 kernels we were able to rely on the kernel's
* default sigreturn code sequence and be more platform independent,
* case 6700 necessitates having our own code, which for now, like
* dynamorio_syscall, hardcodes int 0x80
*/
DECLARE_FUNC(dynamorio_sigreturn)
GLOBAL_LABEL(dynamorio_sigreturn:)
/* Loads a hard-coded, per-platform syscall number into eax and issues the
 * system call (syscall on 64-bit, int 0x80 on 32-bit). If the kernel
 * hands control back, we jump to unexpected_return.
 */
#ifdef X64
# ifdef MACOS
mov eax, HEX(20000b8) /* 0xb8 with 0x2000000 or-ed in */
# else
mov eax, HEX(f) /* 15: rt_sigreturn in the Linux x86-64 syscall table */
# endif
mov r10, rcx /* syscall overwrites rcx, so the 4th argument travels in r10 */
syscall
#else
# ifdef MACOS
/* we assume we don't need to align the stack (tricky to do so) */
/* XXX: should we target _sigtramp instead? Some callers aren't
 * on a signal frame though.
 */
mov eax, HEX(b8)
# else
mov eax, HEX(ad) /* 173: rt_sigreturn in the Linux i386 syscall table */
# endif
/* PR 254280: we assume int$80 is ok even for LOL64 */
int HEX(80)
#endif
/* should not return. if we somehow do, infinite loop is intentional.
 * FIXME: do better in release build! FIXME - why not an int3? */
jmp GLOBAL_REF(unexpected_return)
END_FUNC(dynamorio_sigreturn)
/* we need to exit without using any stack, to support
* THREAD_SYNCH_TERMINATED_AND_CLEANED.
* XXX: on MacOS this does use the stack.
* FIXME i#1403: on MacOS we fail to free the app's stack: we need to pass it to
* bsdthread_terminate.
*/
DECLARE_FUNC(dynamorio_sys_exit)
GLOBAL_LABEL(dynamorio_sys_exit:)
/* Terminates the current thread with a hard-coded exit code of 0.
 * Linux: issues SYS_exit directly and touches no stack.
 * MacOS: first obtains the thread's mach port via MACH_thread_self_trap,
 * then passes it to SYS_bsdthread_terminate.
 * Any path that comes back from the kernel ends at unexpected_return.
 */
#ifdef MACOS
/* We need the mach port in order to invoke bsdthread_terminate */
mov eax, MACH_thread_self_trap
# ifdef X64
or eax, SYSCALL_NUM_MARKER_MACH
# else
neg eax
/* XXX: what about stack alignment? hard to control since we jumped here */
# endif
/* see dynamorio_mach_syscall about why we do this call;pop and sysenter */
call dynamorio_sys_exit_next
dynamorio_sys_exit_next:
pop REG_XDX /* xdx = address of this pop instruction */
/* Advance xdx past the instructions up to and including the sysenter.
 * NOTE(review): the byte counts below are hard-coded instruction lengths;
 * confirm they also hold for the X64 encodings.
 */
lea REG_XDX, [1/*pop*/ + 3/*lea*/ + 2/*sysenter*/ + 2/*mov*/ + REG_XDX]
mov REG_XCX, REG_XSP
sysenter
/* NOTE(review): jae is taken when the carry flag is clear -- confirm that
 * is the intended failure indication for this trap.
 */
jae dynamorio_sys_exit_failed
# ifdef X64
mov ARG4, 0 /* stack to free: NULL */
mov ARG3, 0 /* stack free size: 0 */
mov ARG2, REG_XAX /* kernel port, which we just acquired */
mov ARG1, 0 /* join semaphore: SEMAPHORE_NULL */
mov eax, SYS_bsdthread_terminate
or eax, SYSCALL_NUM_MARKER_BSD
mov r10, rcx
syscall
# else
lea REG_XSP, [-ARG_SZ + REG_XSP] /* maintain align-16: offset retaddr */
push 0 /* stack to free: NULL */
push 0 /* stack free size: 0 */
push REG_XAX /* kernel port, which we just acquired */
push 0 /* join semaphore: SEMAPHORE_NULL */
push 0 /* retaddr slot */
mov eax, SYS_bsdthread_terminate
int HEX(80)
# endif
#else /* LINUX: */
# ifdef X64
mov edi, 0 /* exit code: hardcoded */
mov eax, SYS_exit
mov r10, rcx
syscall
# else
mov ebx, 0 /* exit code: hardcoded */
mov eax, SYS_exit
/* PR 254280: we assume int$80 is ok even for LOL64 */
int HEX(80)
# endif
#endif
/* should not return. if we somehow do, infinite loop is intentional.
 * FIXME: do better in release build! FIXME - why not an int3? */
dynamorio_sys_exit_failed:
jmp GLOBAL_REF(unexpected_return)
END_FUNC(dynamorio_sys_exit)
#ifdef UNIX
/* We need to signal a futex or semaphore without using our dstack, to support
* THREAD_SYNCH_TERMINATED_AND_CLEANED and detach.
* Takes KSYNCH_TYPE* in xax and the post-syscall jump target in xcx.
*/
DECLARE_FUNC(dynamorio_condvar_wake_and_jmp)
GLOBAL_LABEL(dynamorio_condvar_wake_and_jmp:)
/* Entered by a jump rather than a call:
 *   xax = KSYNCH_TYPE* to signal
 *   xcx = address to jump to once the wake syscall has been issued
 * The syscall's return value is ignored. The register holding the saved
 * jump target (r12 on 64-bit, see below for 32-bit) is overwritten and not
 * restored, as are the syscall argument registers.
 */
# ifdef LINUX
/* We call futex_wakeall */
# ifdef X64
mov r12, rcx /* save across syscall */
mov ARG6, 0
mov ARG5, 0
mov ARG4, 0
mov ARG3, 0x7fffffff /* arg3 = INT_MAX */
mov ARG2, 1 /* arg2 = FUTEX_WAKE */
mov ARG1, rax /* &futex, passed in rax */
mov rax, 202 /* SYS_futex */
mov r10, rcx /* 4th syscall arg goes in r10 (rcx was zeroed via ARG4 above) */
syscall
jmp r12
# else
/* We use the stack, which should be the app stack: see the MacOS args below. */
push ecx /* save across syscall */
mov ebp, 0 /* arg6 */
mov edi, 0 /* arg5 */
mov esi, 0 /* arg4 */
mov edx, 0x7fffffff /* arg3 = INT_MAX */
mov ecx, 1 /* arg2 = FUTEX_WAKE */
mov ebx, eax /* arg1 = &futex, passed in eax */
mov eax, 240 /* SYS_futex */
/* PR 254280: we assume int$80 is ok even for LOL64 */
int HEX(80)
pop ecx
jmp ecx
# endif
# elif defined(MACOS)
/* We call semaphore_signal_all. We have to put syscall args on
 * the stack for 32-bit, and we use the stack for call;pop for
 * sysenter -- so we use the app stack, which we assume the caller has
 * put us on. We're only called when terminating a thread or detaching
 * so transparency should be ok so long as the app's stack is valid.
 */
# ifdef X64
/* ARG1 is xdi on 64-bit, and it is loaded with the semaphore below, so
 * the jump target cannot be kept in xdi: use r12 as the Linux path does.
 */
mov r12, rcx /* save across syscall */
# else
mov REG_XDI, REG_XCX /* save across syscall */
# endif
mov REG_XAX, PTRSZ [REG_XAX] /* load mach_synch_t->sem */
# ifdef X64
mov ARG1, REG_XAX
mov eax, MACH_semaphore_signal_all_trap
or eax, SYSCALL_NUM_MARKER_MACH
# else
push REG_XAX
mov eax, MACH_semaphore_signal_all_trap
neg eax
/* args are on stack, w/ an extra slot (retaddr of syscall wrapper) */
push 0 /* extra slot */
/* XXX: what about stack alignment? hard to control since we jumped here */
# endif
/* see dynamorio_mach_syscall about why we do this call;pop and sysenter */
call dynamorio_semaphore_next
dynamorio_semaphore_next:
pop REG_XDX
lea REG_XDX, [1/*pop*/ + 3/*lea*/ + 2/*sysenter*/ + 2/*mov*/ + REG_XDX]
mov REG_XCX, REG_XSP
sysenter
/* we ignore return val */
# ifdef X64
jmp r12
# else
lea esp, [2*ARG_SZ + esp] /* must not change flags */
jmp REG_XDI
# endif
# endif /* MACOS */
END_FUNC(dynamorio_condvar_wake_and_jmp)
#endif /* UNIX */
/* exit entire group without using any stack, in case something like
* SYS_kill via cleanup_and_terminate fails.
* XXX: on 32-bit MacOS this does use the stack.
*/
DECLARE_FUNC(dynamorio_sys_exit_group)
GLOBAL_LABEL(dynamorio_sys_exit_group:)
/* Issues the process-exit system call with a hard-coded exit code of 0:
 * SYS_exit_group on Linux, SYS_exit on MacOS. If the kernel hands control
 * back, we jump to unexpected_return.
 */
#ifdef X64
mov edi, 0 /* exit code: hardcoded */
# ifdef MACOS
mov eax, SYS_exit
# else
mov eax, SYS_exit_group
# endif
mov r10, rcx
syscall
#else
# ifdef MACOS
lea REG_XSP, [-ARG_SZ + REG_XSP] /* maintain align-16: offset retaddr */
push 0 /* exit code: hardcoded */
push 0 /* retaddr slot */
mov eax, SYS_exit
# else
mov ebx, 0 /* exit code: hardcoded */
mov eax, SYS_exit_group
# endif
/* PR 254280: we assume int$80 is ok even for LOL64 */
int HEX(80)
#endif
/* should not return. if we somehow do, infinite loop is intentional.
 * FIXME: do better in release build! why not an int3? */
jmp GLOBAL_REF(unexpected_return)
END_FUNC(dynamorio_sys_exit_group)
#if defined(LINUX) && !defined(X64)
/* since our handler is rt, we have no source for the kernel's/libc's
* default non-rt sigreturn, so we set up our own.
*/
DECLARE_FUNC(dynamorio_nonrt_sigreturn)
GLOBAL_LABEL(dynamorio_nonrt_sigreturn:)
/* Discards the top stack slot, then issues syscall 0x77 via int 0x80.
 * If the kernel hands control back, we jump to unexpected_return.
 */
pop eax /* I don't understand why */
mov eax, HEX(77) /* 119: sigreturn in the Linux i386 syscall table */
/* PR 254280: we assume int$80 is ok even for LOL64 */
int HEX(80)
/* should not return. if we somehow do, infinite loop is intentional.
 * FIXME: do better in release build! FIXME - why not an int3? */
jmp GLOBAL_REF(unexpected_return)
END_FUNC(dynamorio_nonrt_sigreturn)
#endif
#ifdef HAVE_SIGALTSTACK
/* We used to get the SP by taking the address of our args, but that doesn't
* work on x64 nor with other compilers. Today we use asm to pass in the
* initial SP. For x64, we add a 4th register param and tail call to
* main_signal_handler_C. Adding a param and doing a tail call on ia32 is
* hard, so we make a real call and pass only xsp. The C routine uses it to
* read the original params.
* See also PR 305020.
*/
DECLARE_FUNC(main_signal_handler)
GLOBAL_LABEL(main_signal_handler:)
/* Thin asm front end for main_signal_handler_C whose job is to hand the C
 * routine the value xsp had on entry:
 *   - Linux x64: xsp goes in ARG4 and we tail-jump to the C routine.
 *   - MacOS x64: xsp goes in ARG6; afterwards we call dynamorio_sigreturn.
 *   - 32-bit:    xsp is the only argument of a real call.
 */
#ifdef X64
# ifdef LINUX
mov ARG4, REG_XSP /* pass as extra arg */
jmp GLOBAL_REF(main_signal_handler_C)
/* main_signal_handler_C will do the ret */
# else /* MACOS */
mov rax, REG_XSP /* save for extra arg */
/* Keep the values needed for sigreturn on the stack across the C call. */
push ARG2 /* infostyle */
push ARG5 /* ucxt */
push ARG6 /* token */
/* rsp is now aligned again */
mov ARG6, rax /* pass as extra arg */
CALLC0(GLOBAL_REF(main_signal_handler_C))
/* Set up args to SYS_sigreturn */
pop ARG3 /* token */
pop ARG1 /* ucxt */
pop ARG2 /* infostyle */
CALLC0(GLOBAL_REF(dynamorio_sigreturn))
jmp GLOBAL_REF(unexpected_return)
# endif
#else
/* We need to pass in xsp. The easiest way is to create an
 * intermediate frame.
 */
mov REG_XAX, REG_XSP
CALLC1_FRESH(GLOBAL_REF(main_signal_handler_C), REG_XAX)
# ifdef MACOS
mov eax, ARG5 /* ucxt */
/* Set up args to SYS_sigreturn, skipping the retaddr slot */
mov edx, ARG2 /* style */
CALLC2_FRESH(GLOBAL_REF(dynamorio_sigreturn), eax, edx)
jmp GLOBAL_REF(unexpected_return)
# else
ret
# endif
#endif
END_FUNC(main_signal_handler)
#else /* !HAVE_SIGALTSTACK */
/* PR 283149: if we're on the app stack now and we need to deliver
* immediately, we can't copy over our own sig frame w/ the app's, and we
* can't push the app's below ours and have continuation work. One choice
* is to copy the frame to pending and assume we'll deliver right away.
* Instead we always swap to dstack, which also makes us a little more
* transparent wrt running out of app stack or triggering app stack guard
* pages. We do it in asm since it's ugly to swap stacks in the middle
* of a C routine: have to fix up locals + frame ptr, or jmp to start of
* func and clobber callee-saved regs (which messes up vmkernel sigreturn).
*/
DECLARE_FUNC(main_signal_handler)
GLOBAL_LABEL(main_signal_handler:)
/* Variant used without sigaltstack. Outline:
 *   1. Save the three handler arguments on the stack.
 *   2. Ask sig_should_swap_stack whether to switch stacks; it fills in a
 *      clone_and_swap_args struct allocated on the stack.
 *   3. If so, call clone_and_swap_stack, shift arg2/arg3 by the distance
 *      between the stacks, and call fixup_rtframe_pointers.
 *   4. Hand off to main_signal_handler_C, passing xsp as an extra argument.
 */
mov REG_XAX, ARG1
mov REG_XCX, ARG2
mov REG_XDX, ARG3
/* save args */
push REG_XAX
push REG_XCX
push REG_XDX
/* make space for answers: struct clone_and_swap_args */
sub REG_XSP, CLONE_AND_SWAP_STRUCT_SIZE
mov REG_XAX, REG_XSP
/* call a C routine rather than writing everything in asm */
CALLC2(GLOBAL_REF(sig_should_swap_stack), REG_XAX, REG_XDX)
cmp al, 0
/* pop does not modify flags, so the je below still tests the cmp above */
pop REG_XAX /* clone_and_swap_args.stack */
pop REG_XCX /* clone_and_swap_args.tos */
je no_swap
/* calculate the offset between stacks */
mov REG_XDX, REG_XAX
sub REG_XDX, REG_XCX /* shift = stack - tos */
# ifdef VMX86_SERVER
/* update the two parameters to sigreturn for new stack
 * we can eliminate this once we have PR 405694
 */
# ifdef X64
add r12, REG_XDX /* r12 += shift */
# else
add REG_XSI, REG_XDX /* xsi += shift */
# endif
add REG_XBP, REG_XDX /* xbp += shift */
# endif
push REG_XDX /* keep shift across the call */
CALLC2(GLOBAL_REF(clone_and_swap_stack), REG_XAX, REG_XCX)
/* get shift back and update arg2 and arg3 */
pop REG_XDX
pop REG_XCX /* arg3 */
pop REG_XAX /* arg2 */
add REG_XAX, REG_XDX /* arg2 += shift */
add REG_XCX, REG_XDX /* arg3 += shift */
# ifndef X64
/* update the official arg2 and arg3 on the stack */
mov [3*ARG_SZ + REG_XSP], REG_XAX /* skip arg1+retaddr+arg1 */
mov [4*ARG_SZ + REG_XSP], REG_XCX
# endif
/* re-save the shifted arg2 and arg3 for the pops at no_swap */
push REG_XAX
push REG_XCX
/* need to get arg1, old frame, new frame */
mov REG_XAX, [4*ARG_SZ + REG_XSP] /* skip 3 args + retaddr */
neg REG_XDX
add REG_XDX, REG_XSP /* xsp-shift = old frame */
add REG_XDX, 3*ARG_SZ /* old frame */
mov REG_XCX, REG_XSP
add REG_XCX, 3*ARG_SZ /* new frame */
/* have to be careful about order of reg params */
CALLC5(GLOBAL_REF(fixup_rtframe_pointers), 0, REG_XAX, REG_XDX, REG_XCX, 0)
no_swap:
# ifdef X64
/* reload the three saved handler arguments */
pop ARG3
pop ARG2
pop ARG1
mov rcx, rsp /* pass as 4th arg */
jmp GLOBAL_REF(main_signal_handler_C)
/* can't return, no retaddr */
# else
add REG_XSP, 3*ARG_SZ /* drop the three saved arguments */
/* We need to pass in xsp. The easiest way is to create an
 * intermediate frame.
 */
mov REG_XAX, REG_XSP
CALLC1(GLOBAL_REF(main_signal_handler_C), REG_XAX)
ret
# endif
END_FUNC(main_signal_handler)
#endif /* !HAVE_SIGALTSTACK */
#ifdef LINUX
/* SYS_clone swaps the stack so we need asm support to call it.
* signature:
* thread_id_t dynamorio_clone(uint flags, byte *newsp, void *ptid, void *tls,
* void *ctid, void (*func)(void))
*/
DECLARE_FUNC(dynamorio_clone)
GLOBAL_LABEL(dynamorio_clone:)
/* thread_id_t dynamorio_clone(uint flags, byte *newsp, void *ptid, void *tls,
 *                             void *ctid, void (*func)(void))
 * Stores func one slot below newsp, then issues SYS_clone. When the
 * syscall returns 0 (the new thread), func is popped off the new stack and
 * called. Otherwise the syscall's return value is returned to the caller.
 */
/* save func for use post-syscall on the newsp.
 * when using clone_record_t we have 4 slots we can clobber.
 */
# ifdef X64
sub ARG2, ARG_SZ
mov [ARG2], ARG6 /* func is now on TOS of newsp */
/* all args are already in syscall registers */
mov r10, rcx /* except the 4th, which the syscall convention wants in r10 */
mov REG_XAX, SYS_clone
syscall
# else
mov REG_XAX, ARG6
mov REG_XCX, ARG2
sub REG_XCX, ARG_SZ
mov [REG_XCX], REG_XAX /* func is now on TOS of newsp */
mov REG_XDX, ARG3
/* preserve callee-saved regs */
push REG_XBX
push REG_XSI
push REG_XDI
/* now can't use ARG* since xsp modified by pushes */
mov REG_XBX, DWORD [4*ARG_SZ + REG_XSP] /* ARG1 + 3 pushes */
mov REG_XSI, DWORD [7*ARG_SZ + REG_XSP] /* ARG4 + 3 pushes */
mov REG_XDI, DWORD [8*ARG_SZ + REG_XSP] /* ARG5 + 3 pushes */
mov REG_XAX, SYS_clone
/* PR 254280: we assume int$80 is ok even for LOL64 */
int HEX(80)
# endif
cmp REG_XAX, 0
jne dynamorio_clone_parent
/* xax == 0: running on newsp, whose top slot holds func */
pop REG_XCX
call REG_XCX
/* shouldn't return */
jmp GLOBAL_REF(unexpected_return)
dynamorio_clone_parent:
# ifndef X64
/* restore callee-saved regs */
pop REG_XDI
pop REG_XSI
pop REG_XBX
# endif
/* return val is in eax still */
ret
END_FUNC(dynamorio_clone)
#endif /* LINUX */
#endif /* UNIX */
#ifdef MACOS
/* Thread interception at the user function. We need to get the
* stack pointer and to preserve callee-saved registers, as we will return
* back past the user function to the pthread layer (i#1403 covers
* intercepting earlier). We also clear fs, as the kernel seems to set it to
* point at a flat whole-address-space value, messing up our checks for
* it being initialized.
*/
DECLARE_FUNC(new_bsdthread_intercept)
GLOBAL_LABEL(new_bsdthread_intercept:)
/* Zeroes fs, pushes a priv_mcontext_t holding the current register state,
 * and passes its address to new_bsdthread_setup, which should not return.
 */
/* We assume we can go ahead and clobber caller-saved regs. */
mov eax, 0
mov fs, eax
/* NOTE(review): presumably loaded so the saved context's xax carries the
 * first argument -- confirm against new_bsdthread_setup.
 */
mov REG_XAX, ARG1
PUSH_PRIV_MCXT(0 /* for priv_mcontext_t.pc */)
lea REG_XAX, [REG_XSP] /* stack grew down, so priv_mcontext_t at tos */
CALLC1_FRESH(GLOBAL_REF(new_bsdthread_setup), REG_XAX)
/* should not return */
jmp GLOBAL_REF(unexpected_return)
END_FUNC(new_bsdthread_intercept)
#endif
#ifdef WINDOWS
/*
* nt_continue_dynamo_start -- invoked to give dynamo control over
* exception handler continuation (after a call to NtContinue).
* identical to internal_dynamo_start except it calls nt_continue_setup
* to get the real next pc, and has an esp adjustment at the start.
*/
DECLARE_FUNC(nt_continue_dynamo_start)
GLOBAL_LABEL(nt_continue_dynamo_start:)
/* Pushes a priv_mcontext_t holding the current register state onto the
 * current stack and passes its address to nt_continue_setup, which should
 * not return.
 */
/* assume valid esp
 * FIXME: this routine should really not assume esp */
/* grab exec state and pass as param in a priv_mcontext_t struct */
PUSH_PRIV_MCXT(0 /* for priv_mcontext_t.pc */)
lea REG_XAX, [REG_XSP] /* stack grew down, so priv_mcontext_t at tos */
/* Call nt_continue_setup passing the priv_mcontext_t. It will
 * obtain and initialize this thread's dcontext pointer and
 * begin execution with the passed-in state.
 */
CALLC1(GLOBAL_REF(nt_continue_setup), REG_XAX)
/* should not return */
jmp GLOBAL_REF(unexpected_return)
END_FUNC(nt_continue_dynamo_start)
#endif /* WINDOWS */
/* back_from_native_retstubs -- We use a different version of back_from_native for
* each nested module transition. This has to have MAX_NATIVE_RETSTACK
* elements, which we check in native_exec_init(). The size of each entry has
* to match BACK_FROM_NATIVE_RETSTUB_SIZE in arch_exports.h. Currently we
* assume that the assembler uses push imm8 and jmp rel8, but to get that
* to happen for nasm 0.98.40 we're forced to use raw bytes for the pushes. As in
* back_from_native, this code is executed natively by the app, so we assume the
* app stack is valid and can be clobbered.
*/
DECLARE_FUNC(back_from_native_retstubs)
GLOBAL_LABEL(back_from_native_retstubs:)
/* Table of ten identical-shape stubs. Stub i pushes the constant i
 * (raw bytes 6a ii, i.e. push imm8) and then short-jumps to
 * back_from_native. back_from_native_retstubs_end marks the end of the
 * table.
 */
#ifndef ASSEMBLE_WITH_GAS
/* MASM does short jumps for public symbols. */
# define Lback_from_native GLOBAL_REF(back_from_native)
#endif
RAW(6a) RAW(0) /* push 0 */
jmp short Lback_from_native
RAW(6a) RAW(1) /* push 1 */
jmp short Lback_from_native
RAW(6a) RAW(2) /* push 2 */
jmp short Lback_from_native
RAW(6a) RAW(3) /* push 3 */
jmp short Lback_from_native
RAW(6a) RAW(4) /* push 4 */
jmp short Lback_from_native
RAW(6a) RAW(5) /* push 5 */
jmp short Lback_from_native
RAW(6a) RAW(6) /* push 6 */
jmp short Lback_from_native
RAW(6a) RAW(7) /* push 7 */
jmp short Lback_from_native
RAW(6a) RAW(8) /* push 8 */
jmp short Lback_from_native
RAW(6a) RAW(9) /* push 9 */
jmp short Lback_from_native
DECLARE_GLOBAL(back_from_native_retstubs_end)
#ifndef ASSEMBLE_WITH_GAS
# undef Lback_from_native
#endif
ADDRTAKEN_LABEL(back_from_native_retstubs_end:)
END_FUNC(back_from_native_retstubs)
/*
* back_from_native -- for taking control back after letting a module
* execute natively
* assumptions: app stack is valid
*/
DECLARE_FUNC(back_from_native)
GLOBAL_LABEL(back_from_native:)
/* Pushes a priv_mcontext_t holding the current register state onto the
 * current (app) stack and passes its address to return_from_native, which
 * should not return.
 */
#ifdef ASSEMBLE_WITH_GAS
/* We use Lback_from_native to force short jumps with gas. */
Lback_from_native:
#endif
/* assume valid esp
 * FIXME: more robust if don't use app's esp -- should use d_r_initstack
 */
/* grab exec state and pass as param in a priv_mcontext_t struct */
PUSH_PRIV_MCXT(0 /* for priv_mcontext_t.pc */)
lea REG_XAX, [REG_XSP] /* stack grew down, so priv_mcontext_t at tos */
/* Call return_from_native passing the priv_mcontext_t. It will obtain
 * this thread's dcontext pointer and begin execution with the passed-in
 * state.
 */
#if defined(X64) || defined(MACOS)
/* xax already holds the priv_mcontext_t address, so xsp can be rounded down */
and REG_XSP, -FRAME_ALIGNMENT /* x64 or Mac alignment */
#endif
CALLC1(GLOBAL_REF(return_from_native), REG_XAX)
/* should not return */
jmp GLOBAL_REF(unexpected_return)
END_FUNC(back_from_native)
#ifdef UNIX
/* Like back_from_native, except we're calling from a native module into a
* module that should execute from the code cache. We transfer here from PLT
* stubs generated by create_plt_stub() in core/unix/native_elf.c. See also
* initialize_plt_stub_template(). On entry, next_pc is on the stack for ia32
* and in %r11 for x64. We use %r11 because it is scratch in the sysv amd64
* calling convention.
*/
DECLARE_FUNC(native_plt_call)
GLOBAL_LABEL(native_plt_call:)
/* Pushes a priv_mcontext_t and calls
 * native_module_callout(&mcontext, next_pc). If that call returns, the
 * general-purpose registers are restored from the pushed context and
 * control goes to next_pc natively.
 */
PUSH_PRIV_MCXT(0 /* pc */)
lea REG_XAX, [REG_XSP] /* lea priv_mcontext_t */
# ifdef X64
mov REG_XCX, r11 /* next_pc in r11 */
# else
mov REG_XCX, [REG_XSP + PRIV_MCXT_SIZE] /* next_pc on stack */
add DWORD [REG_XAX + MCONTEXT_XSP_OFFS], ARG_SZ /* adjust app xsp for arg */
# endif
CALLC2_FRESH(GLOBAL_REF(native_module_callout), REG_XAX, REG_XCX)
/* If we returned, continue to execute natively on the app stack. */
POP_PRIV_MCXT_GPRS()
# ifdef X64
jmp r11 /* next_pc still in r11 */
# else
ret /* next_pc was on stack */
# endif
END_FUNC(native_plt_call)
#endif /* UNIX */
/* Our version of setjmp & long jmp. We don't want to modify app state like
* SEH or do unwinding which is done by standard versions.
*/
/* Front-end for client use where we don't want to expose our struct layouts,
* yet we must call dr_setjmp directly w/o a call frame in between for
* a proper restore point.
*
* int dr_try_start(try_except_context_t *cxt) ;
*/
DECLARE_EXPORTED_FUNC(dr_try_start)
GLOBAL_LABEL(dr_try_start:)
/* int dr_try_start(try_except_context_t *cxt)
 * Advances the first argument by TRY_CXT_SETJMP_OFFS and tail-jumps to
 * dr_setjmp, so dr_setjmp sees this routine's caller as its own caller.
 */
add ARG1, TRY_CXT_SETJMP_OFFS
jmp GLOBAL_REF(dr_setjmp)
END_FUNC(dr_try_start)
/* int cdecl dr_setjmp(dr_jmp_buf *buf);
*/
DECLARE_FUNC(dr_setjmp)
GLOBAL_LABEL(dr_setjmp:)
/* int dr_setjmp(dr_jmp_buf *buf)
 * Records the current register state in buf and returns 0.
 * Slots written (each ARG_SZ bytes, by index):
 *   0 xbx, 1 xcx, 2 xdi, 3 xsi, 4 xbp, 5 xsp, 6 return address,
 *   and on x64 also 7..14 = r8..r15.
 * On UNIX dr_setjmp_sigmask(buf) is called first.
 */
#ifdef UNIX
/* PR 206278: for try/except we need to save the signal mask */
mov REG_XDX, ARG1
push REG_XDX /* preserve */
# ifndef X64
lea REG_XSP, [-2*ARG_SZ + REG_XSP] /* maintain align-16: ra + push */
# endif
CALLC1(GLOBAL_REF(dr_setjmp_sigmask), REG_XDX)
# ifndef X64
lea REG_XSP, [2*ARG_SZ + REG_XSP] /* maintain align-16: ra + push */
# endif
pop REG_XDX /* preserve */
#else
mov REG_XDX, ARG1
#endif
/* xdx = buf from here on */
mov [ 0 + REG_XDX], REG_XBX
mov [ ARG_SZ + REG_XDX], REG_XCX
mov [2*ARG_SZ + REG_XDX], REG_XDI
mov [3*ARG_SZ + REG_XDX], REG_XSI
mov [4*ARG_SZ + REG_XDX], REG_XBP
mov [5*ARG_SZ + REG_XDX], REG_XSP
mov REG_XAX, [REG_XSP] /* our own return address, at the top of the stack */
mov [6*ARG_SZ + REG_XDX], REG_XAX
#ifdef X64
mov [ 7*ARG_SZ + REG_XDX], r8
mov [ 8*ARG_SZ + REG_XDX], r9
mov [ 9*ARG_SZ + REG_XDX], r10
mov [10*ARG_SZ + REG_XDX], r11
mov [11*ARG_SZ + REG_XDX], r12
mov [12*ARG_SZ + REG_XDX], r13
mov [13*ARG_SZ + REG_XDX], r14
mov [14*ARG_SZ + REG_XDX], r15
#endif
xor eax, eax /* direct return value is 0 */
ret
END_FUNC(dr_setjmp)
/* int cdecl dr_longjmp(dr_jmp_buf *buf, int retval);
*/
DECLARE_FUNC(dr_longjmp)
GLOBAL_LABEL(dr_longjmp:)
/* int dr_longjmp(dr_jmp_buf *buf, int retval)
 * Reloads the registers that dr_setjmp stored in buf, switches to the
 * saved stack, rewrites the saved return address at its top, and returns
 * there with retval in xax.
 */
/* ARG2 is read before xdx is written: ARG2 may itself live in xdx. */
mov REG_XAX, ARG2
mov REG_XDX, ARG1
mov REG_XBX, [ 0 + REG_XDX]
mov REG_XDI, [2*ARG_SZ + REG_XDX]
mov REG_XSI, [3*ARG_SZ + REG_XDX]
mov REG_XBP, [4*ARG_SZ + REG_XDX]
mov REG_XSP, [5*ARG_SZ + REG_XDX] /* now we've switched to the old stack */
mov REG_XCX, [6*ARG_SZ + REG_XDX]
mov [REG_XSP], REG_XCX /* restore the return address on to the stack */
mov REG_XCX, [ ARG_SZ + REG_XDX] /* xcx was scratch above; now load its saved value */
#ifdef X64
mov r8, [ 7*ARG_SZ + REG_XDX]
mov r9, [ 8*ARG_SZ + REG_XDX]
mov r10, [ 9*ARG_SZ + REG_XDX]
mov r11, [10*ARG_SZ + REG_XDX]
mov r12, [11*ARG_SZ + REG_XDX]
mov r13, [12*ARG_SZ + REG_XDX]
mov r14, [13*ARG_SZ + REG_XDX]
mov r15, [14*ARG_SZ + REG_XDX]
#endif
ret
END_FUNC(dr_longjmp)
/*#############################################################################
*#############################################################################
* Utility routines moved here due to the lack of inline asm support
* in VC8.
*/
/* uint atomic_swap(uint *addr, uint value)
* return current contents of addr and replace contents with value.
* on win32 could use InterlockedExchange intrinsic instead.
*/
DECLARE_FUNC(atomic_swap)
GLOBAL_LABEL(atomic_swap:)
/* uint atomic_swap(uint *addr, uint value)
 * Exchanges the 32-bit value at addr with value and returns the previous
 * contents in eax. xchg with a memory operand is atomic without an
 * explicit lock prefix.
 */
mov REG_XAX, ARG2 /* xax = value */
mov REG_XCX, ARG1 /* nop on win64 (ditto for linux64 if used rdi) */
xchg [REG_XCX], eax
ret
END_FUNC(atomic_swap)
/* bool cpuid_supported(void)
* Checks for existence of the cpuid instr by attempting to modify bit 21 of eflags
*/
DECLARE_FUNC(cpuid_supported)
GLOBAL_LABEL(cpuid_supported:)
/* bool cpuid_supported(void)
 * Flips bit 21 of eflags, reads eflags back, and returns 1 in eax if the
 * value changed, else 0. The original eflags are restored before returning.
 */
PUSHF
pop REG_XAX
mov ecx, eax /* original eflags in ecx */
xor eax, HEX(200000) /* try to modify bit 21 of eflags */
push REG_XAX
POPF
PUSHF
pop REG_XAX /* eflags as the cpu actually accepted them */
cmp ecx, eax
/* mov (unlike xor) leaves the flags from the cmp intact for the setne */
mov eax, 0 /* zero out top bytes */
setne al
push REG_XCX /* now restore original eflags */
POPF
ret
END_FUNC(cpuid_supported)
/* void our_cpuid(int res[4], int eax, int ecx)
* Executes cpuid instr, which is hard for x64 inline asm b/c clobbers rbx and can't
* push in middle of func.
*/
DECLARE_FUNC(our_cpuid)
GLOBAL_LABEL(our_cpuid:)
/* void our_cpuid(int res[4], int eax, int ecx)
 * Runs cpuid with the given eax and ecx inputs and stores the results as
 * res[0] = eax, res[1] = ebx, res[2] = ecx, res[3] = edx.
 * xbx and xdi are saved and restored around the instruction.
 */
mov REG_XAX, ARG1 /* xax = res (moved to xdi below) */
/* We're clobbering REG_XCX before REG_XDX, because ARG3 is REG_XDX in
 * UNIX 64-bit mode.
 */
mov REG_XCX, ARG3
mov REG_XDX, ARG2
push REG_XBX /* callee-saved */
push REG_XDI /* callee-saved */
/* not making a call so don't bother w/ 16-byte stack alignment */
mov REG_XDI, REG_XAX /* xdi = res; cpuid overwrites eax */
mov REG_XAX, REG_XDX /* eax = requested input value */
cpuid
mov [ 0 + REG_XDI], eax
mov [ 4 + REG_XDI], ebx
mov [ 8 + REG_XDI], ecx
mov [12 + REG_XDI], edx
pop REG_XDI /* callee-saved */
pop REG_XBX /* callee-saved */
ret
END_FUNC(our_cpuid)
/* We could use inline asm on Linux but it's cleaner to share the same code: */
/* void dr_stmxcsr(uint *val)
 * Writes the current MXCSR value to the location val points at.
 */
DECLARE_FUNC(dr_stmxcsr)
GLOBAL_LABEL(dr_stmxcsr:)
mov REG_XAX, ARG1 /* xax = val */
stmxcsr [REG_XAX]
ret
END_FUNC(dr_stmxcsr)
/* void dr_xgetbv(uint *high, uint *low)
 * Executes xgetbv with ecx = 0 and stores edx to *high and eax to *low.
 */
DECLARE_FUNC(dr_xgetbv)
GLOBAL_LABEL(dr_xgetbv:)
mov REG_XAX, ARG1
mov REG_XDX, ARG2
/* xgetbv overwrites eax and edx, so park both pointers on the stack */
push REG_XAX /* high */
push REG_XDX /* low */
mov ecx, 0
/* VS2005 assembler doesn't know xgetbv */
RAW(0f) RAW(01) RAW(d0) /* xgetbv */
pop REG_XCX /* xcx = low pointer (pushed last) */
mov DWORD [REG_XCX], eax /* low */
pop REG_XCX /* xcx = high pointer */
mov DWORD [REG_XCX], edx /* high */
ret
END_FUNC(dr_xgetbv)
/* void dr_fxsave(byte *buf_aligned)
 * Saves state into buf_aligned with fxsave (the REX.W-prefixed 64-bit form
 * on x64), then executes fnclex and finit.
 */
DECLARE_FUNC(dr_fxsave)
GLOBAL_LABEL(dr_fxsave:)
mov REG_XAX, ARG1 /* xax = buf_aligned */
#ifdef X64
/* VS2005 doesn't know "fxsave64" (and it's "fxsaveq" for gcc 4.4) */
RAW(48) RAW(0f) RAW(ae) RAW(00) /* fxsave64 [REG_XAX] */
#else
fxsave [REG_XAX]
#endif
fnclex
finit
ret
END_FUNC(dr_fxsave)
/* void dr_fnsave(byte *buf_aligned)
 * Saves state into buf_aligned with fnsave, followed by an fwait.
 */
DECLARE_FUNC(dr_fnsave)
GLOBAL_LABEL(dr_fnsave:)
mov REG_XAX, ARG1 /* xax = buf_aligned */
/* FIXME: do we need an fwait prior to the fnsave? */
fnsave [REG_XAX]
fwait
ret
END_FUNC(dr_fnsave)
/* void dr_fxrstor(byte *buf_aligned)
 * Restores state from buf_aligned with fxrstor (the REX.W-prefixed 64-bit
 * form on x64).
 */
DECLARE_FUNC(dr_fxrstor)
GLOBAL_LABEL(dr_fxrstor:)
mov REG_XAX, ARG1 /* xax = buf_aligned */
#ifdef X64
/* VS2005 doesn't know "fxrstor64" */
RAW(48) RAW(0f) RAW(ae) RAW(08) /* fxrstor64 [REG_XAX] */
#else
fxrstor [REG_XAX]
#endif
ret
END_FUNC(dr_fxrstor)
/* void dr_frstor(byte *buf_aligned)
 * Executes frstor on the image at buf_aligned.
 */
#define FUNCNAME dr_frstor
        DECLARE_FUNC(FUNCNAME)
GLOBAL_LABEL(FUNCNAME:)
        mov      REG_XAX, ARG1          /* xax = buf_aligned */
        frstor   [REG_XAX]
        ret
        END_FUNC(FUNCNAME)
#undef FUNCNAME
#ifdef X64
/* void dr_fxsave32(byte *buf_aligned)
 * X64-only variant of dr_fxsave that uses the plain (non-REX.W) fxsave
 * encoding, then executes fnclex followed by finit.
 */
#define FUNCNAME dr_fxsave32
        DECLARE_FUNC(FUNCNAME)
GLOBAL_LABEL(FUNCNAME:)
        mov      REG_XAX, ARG1          /* xax = buf_aligned */
        fxsave   [REG_XAX]
        fnclex
        finit
        ret
        END_FUNC(FUNCNAME)
#undef FUNCNAME
/* void dr_fxrstor32(byte *buf_aligned)
 * X64-only variant of dr_fxrstor that uses the plain (non-REX.W) fxrstor
 * encoding.
 */
#define FUNCNAME dr_fxrstor32
        DECLARE_FUNC(FUNCNAME)
GLOBAL_LABEL(FUNCNAME:)
        mov      REG_XAX, ARG1          /* xax = buf_aligned */
        fxrstor  [REG_XAX]
        ret
        END_FUNC(FUNCNAME)
#undef FUNCNAME
#endif
#ifdef WINDOWS /* on linux we use inline asm versions */
/*
 * void call_modcode_alt_stack(dcontext_t *dcontext,
 *                             EXCEPTION_RECORD *pExcptRec,
 *                             CONTEXT *cxt, app_pc target, uint flags,
 *                             bool using_initstack, fragment_t *f)
 * custom routine used to transfer control from check_for_modified_code()
 * to found_modified_code() win32/callback.c.
 *
 * Steps performed:
 *   1. Copy every parameter except using_initstack into a register.
 *   2. If using_initstack is non-zero, store 0 to initstack_mutex.
 *   3. Load xsp from the dstack field of dcontext.
 *   4. Call found_modified_code(dcontext, pExcptRec, cxt, target, flags, f).
 * found_modified_code is not expected to return; if it does, control jumps to
 * unexpected_return.  Callee-saved registers are overwritten without being
 * saved.
 */
#define dcontext        ARG1
#define pExcptRec       ARG2
#define cxt             ARG3
#define target          ARG4
#define flags           ARG5
#define using_initstack ARG6
#define fragment        ARG7
        DECLARE_FUNC(call_modcode_alt_stack)
GLOBAL_LABEL(call_modcode_alt_stack:)
        mov      REG_XAX, dcontext /* be careful not to clobber other in-reg params */
        mov      REG_XBX, pExcptRec
        mov      REG_XDI, cxt
        mov      REG_XSI, target
        mov      REG_XDX, flags
        mov      REG_XCX, fragment
        /* bool is byte-sized but rest should be zeroed as separate param */
        cmp      using_initstack, 0
        je       call_modcode_alt_stack_no_free
        mov      DWORD SYMREF(initstack_mutex), 0 /* rip-relative on x64 */
call_modcode_alt_stack_no_free:
        /* All stack-resident parameters have been read, so xsp can move now. */
        RESTORE_FROM_DCONTEXT_VIA_REG(REG_XAX,dstack_OFFSET,REG_XSP)
        CALLC6(GLOBAL_REF(found_modified_code), REG_XAX, REG_XBX, REG_XDI, REG_XSI, REG_XDX, REG_XCX)
        /* should never return */
        jmp      GLOBAL_REF(unexpected_return)
        ret
        END_FUNC(call_modcode_alt_stack)
#undef dcontext
#undef pExcptRec
#undef cxt
#undef target
#undef flags
#undef using_initstack
/* Previously left defined, leaking the parameter alias into later code. */
#undef fragment
/* void call_intr_excpt_alt_stack(dcontext_t *dcontext, EXCEPTION_RECORD *pExcptRec,
 *                                CONTEXT *cxt, byte *stack, bool is_client)
 *
 * Routine to switch to a separate exception stack before calling
 * internal_exception_info().  This switch is useful if the dstack
 * is exhausted and we want to ensure we have enough space for
 * error reporting.
 *
 * The incoming xsp is pushed onto the new stack and popped back into xsp
 * after internal_exception_info() returns, so this routine returns to its
 * caller on the original stack.  xbx, xdi, xsi and xbp are overwritten
 * without being saved.
 */
#define dcontext        ARG1
#define pExcptRec       ARG2
#define cxt             ARG3
#define stack           ARG4
#define is_client       ARG5
        DECLARE_FUNC(call_intr_excpt_alt_stack)
GLOBAL_LABEL(call_intr_excpt_alt_stack:)
        /* Read every parameter before xsp changes. */
        mov      REG_XAX, dcontext
        mov      REG_XBX, pExcptRec
        mov      REG_XDI, cxt
        mov      REG_XBP, is_client
        mov      REG_XSI, REG_XSP      /* remember the incoming stack pointer */
        mov      REG_XSP, stack
        /* X64: retaddr + this push => 16-byte alignment prior to call */
        push     REG_XSI       /* save xsp */
        CALLC5(GLOBAL_REF(internal_exception_info), \
               REG_XAX /* dcontext */,  \
               REG_XBX /* pExcptRec */, \
               REG_XDI /* cxt */,       \
               1       /* dstack overflow == true */, \
               REG_XBP /* is_client */)
        pop      REG_XSP               /* back onto the original stack */
        ret
        END_FUNC(call_intr_excpt_alt_stack)
#undef dcontext
#undef pExcptRec
#undef cxt
#undef stack
/* Previously left defined, leaking the parameter alias into later code. */
#undef is_client
/* CONTEXT.Seg* is WORD for x64 but DWORD for x86 */
/* REG_XAX_SEGWIDTH names the part of xax that the get_segments_* routines
 * store for each segment value: 16 bits on X64, 32 bits otherwise.
 */
#ifdef X64
# define REG_XAX_SEGWIDTH ax
#else
# define REG_XAX_SEGWIDTH eax
#endif
/* Need a second volatile register for any calling convention. In all
* conventions, XCX is volatile, but it's ARG4 on Lin64 and ARG1 on Win64.
* Using XCX on Win64 is fine, but on Lin64 it clobbers ARG4 so we use XDI as
* the free reg instead.
*/
/* FREE_REG is the scratch register that holds each output pointer in the
 * get_segments_* routines.
 */
#if defined(UNIX) && defined(X64)
# define FREE_REG rdi
#else
# define FREE_REG REG_XCX
#endif
/* void get_segments_defg(cxt_seg_t *ds, cxt_seg_t *es, cxt_seg_t *fs, cxt_seg_t *gs)
 * Writes the current ds, es, fs and gs selector values, zero-extended to the
 * REG_XAX_SEGWIDTH store width, through the four pointers.
 */
        DECLARE_FUNC(get_segments_defg)
GLOBAL_LABEL(get_segments_defg:)
        /* Clear eax once; each "mov ax, seg" below replaces only the low 16
         * bits, so the wider store on x86 writes zero in its upper half.
         */
        xor      eax, eax
        /* ds -> *ARG1 */
        mov      ax, ds
        mov      FREE_REG, ARG1
        mov      [FREE_REG], REG_XAX_SEGWIDTH
        /* es -> *ARG2 */
        mov      ax, es
        mov      FREE_REG, ARG2
        mov      [FREE_REG], REG_XAX_SEGWIDTH
        /* fs -> *ARG3 */
        mov      ax, fs
        mov      FREE_REG, ARG3
        mov      [FREE_REG], REG_XAX_SEGWIDTH
        /* gs -> *ARG4 */
        mov      ax, gs
        mov      FREE_REG, ARG4
        mov      [FREE_REG], REG_XAX_SEGWIDTH
        ret
        END_FUNC(get_segments_defg)
/* void get_segments_cs_ss(cxt_seg_t *cs, cxt_seg_t *ss)
 * Writes the current cs and ss selector values, zero-extended to the
 * REG_XAX_SEGWIDTH store width, through the two pointers.
 */
        DECLARE_FUNC(get_segments_cs_ss)
GLOBAL_LABEL(get_segments_cs_ss:)
        /* Clear eax once; only its low 16 bits are rewritten per segment. */
        xor      eax, eax
        /* cs -> *ARG1 */
        mov      ax, cs
        mov      FREE_REG, ARG1
        mov      [FREE_REG], REG_XAX_SEGWIDTH
        /* ss -> *ARG2 */
        mov      ax, ss
        mov      FREE_REG, ARG2
        mov      [FREE_REG], REG_XAX_SEGWIDTH
        ret
        END_FUNC(get_segments_cs_ss)
/* The helper macros are private to the get_segments_* routines. */
#undef FREE_REG
#undef REG_XAX_SEGWIDTH
/* void get_own_context_helper(CONTEXT *cxt)
* does not fix up xsp to match the call site
* does not preserve callee-saved registers
*
* Builds a priv_mcontext_t on the stack with PUSH_PRIV_MCXT, using this
* routine's return address as the pc, and passes it together with cxt and the
* current cs and ss values to get_own_context_integer_control().
* NOTE(review): xbx, xsi and xdi are pushed and popped below, so the "does not
* preserve" remark presumably describes the captured register values rather
* than the caller's registers -- confirm.
*/
DECLARE_FUNC(get_own_context_helper)
GLOBAL_LABEL(get_own_context_helper:)
/* push callee-saved registers that we use only */
push REG_XBX
push REG_XSI
push REG_XDI
#ifdef X64
/* w/ retaddr, we're now at 16-byte alignment */
/* save argument register (PUSH_PRIV_MCXT calls out to c code) */
mov REG_XDI, ARG1
#endif
/* grab exec state and pass as param in a priv_mcontext_t struct */
/* use retaddr for pc */
/* The 3 pushes above put the return address at 3 * ARG_SZ above xsp. */
PUSH_PRIV_MCXT([(3 * ARG_SZ) + REG_XSP - PUSH_PRIV_MCXT_PRE_PC_SHIFT])
/* we don't have enough registers to avoid parameter regs so we carefully
* use the suggested register order
*/
lea REG_XSI, [REG_XSP] /* stack grew down, so priv_mcontext_t at tos */
#ifdef X64
/* cxt was stashed in xdi before PUSH_PRIV_MCXT */
mov REG_XAX, REG_XDI
#else
/* 4 * arg_sz = 3 callee saved registers pushed to stack plus return addr */
mov REG_XAX, [PRIV_MCXT_SIZE + (4 * ARG_SZ) + REG_XSP]
#endif
/* xdi = ss and xbx = cs, each zero-extended via the preceding xor */
xor edi, edi
mov di, ss
xor ebx, ebx
mov bx, cs
/* args: cxt, cs, ss, pointer to the priv_mcontext_t */
CALLC4(GLOBAL_REF(get_own_context_integer_control), REG_XAX, REG_XBX, REG_XDI, REG_XSI)
/* discard the priv_mcontext_t, then restore in reverse push order */
add REG_XSP, PRIV_MCXT_SIZE
pop REG_XDI
pop REG_XSI
pop REG_XBX
ret
END_FUNC(get_own_context_helper)
#endif /* WINDOWS */
/* void get_xmm_caller_saved(byte *xmm_caller_saved_buf)
 * Copies xmm0 through xmm5 into successive MCXT_SIMD_SLOT_SIZE-sized slots of
 * xmm_caller_saved_buf; the buffer need not be 16-byte aligned.
 * On UNIX xmm6 and xmm7 are stored as well, and on 64-bit UNIX also
 * xmm8 through xmm15 (PR 302107).
 * The caller must ensure that the underlying processor supports SSE!
 * FIXME PR 266305: the AMD optimization guide recommends movlps+movhps over
 * movups for unaligned stores (movups is best for loads); movups is kept for
 * simplicity on the assumption that this is not perf-critical.
 */
        DECLARE_FUNC(get_xmm_caller_saved)
GLOBAL_LABEL(get_xmm_caller_saved:)
        mov      REG_XAX, ARG1          /* xax = xmm_caller_saved_buf */
        movups   [REG_XAX + 0*MCXT_SIMD_SLOT_SIZE], xmm0
        movups   [REG_XAX + 1*MCXT_SIMD_SLOT_SIZE], xmm1
        movups   [REG_XAX + 2*MCXT_SIMD_SLOT_SIZE], xmm2
        movups   [REG_XAX + 3*MCXT_SIMD_SLOT_SIZE], xmm3
        movups   [REG_XAX + 4*MCXT_SIMD_SLOT_SIZE], xmm4
        movups   [REG_XAX + 5*MCXT_SIMD_SLOT_SIZE], xmm5
#ifdef UNIX
        movups   [REG_XAX + 6*MCXT_SIMD_SLOT_SIZE], xmm6
        movups   [REG_XAX + 7*MCXT_SIMD_SLOT_SIZE], xmm7
# ifdef X64
        movups   [REG_XAX + 8*MCXT_SIMD_SLOT_SIZE], xmm8
        movups   [REG_XAX + 9*MCXT_SIMD_SLOT_SIZE], xmm9
        movups   [REG_XAX + 10*MCXT_SIMD_SLOT_SIZE], xmm10
        movups   [REG_XAX + 11*MCXT_SIMD_SLOT_SIZE], xmm11
        movups   [REG_XAX + 12*MCXT_SIMD_SLOT_SIZE], xmm12
        movups   [REG_XAX + 13*MCXT_SIMD_SLOT_SIZE], xmm13
        movups   [REG_XAX + 14*MCXT_SIMD_SLOT_SIZE], xmm14
        movups   [REG_XAX + 15*MCXT_SIMD_SLOT_SIZE], xmm15
# endif
#endif
        ret
        END_FUNC(get_xmm_caller_saved)
/* void get_ymm_caller_saved(byte *ymm_caller_saved_buf)
* stores the values of ymm0 through ymm5 consecutively into ymm_caller_saved_buf.
* ymm_caller_saved_buf need not be 32-byte aligned.
* for linux, also saves ymm6-15 (PR 302107).
* The caller must ensure that the underlying processor supports AVX!
*
* Per the encodings listed in the body, register N is written at byte offset
* N * 0x40 from the start of the buffer.
*/
DECLARE_FUNC(get_ymm_caller_saved)
GLOBAL_LABEL(get_ymm_caller_saved:)
mov REG_XAX, ARG1
/* i#441: Some compilers need one of the architectural flags set (e.g. -mavx or
* -march=skylake-avx512), which would cause DynamoRIO to be less (or un-)
* portable or cause frequency scaling (i#3169). We just put in the raw bytes
* for these instrs:
* Note the 64/32 bit have the same encoding for either rax or eax.
* c5 fe 7f 00 vmovdqu %ymm0,0x00(%xax)
* c5 fe 7f 48 40 vmovdqu %ymm1,0x40(%xax)
* c5 fe 7f 90 80 00 00 00 vmovdqu %ymm2,0x80(%xax)
* c5 fe 7f 98 c0 00 00 00 vmovdqu %ymm3,0xc0(%xax)
* c5 fe 7f a0 00 01 00 00 vmovdqu %ymm4,0x100(%xax)
* c5 fe 7f a8 40 01 00 00 vmovdqu %ymm5,0x140(%xax)
*/
/* Each RAW line below is one instruction from the listing above, in order. */
RAW(c5) RAW(fe) RAW(7f) RAW(00)
RAW(c5) RAW(fe) RAW(7f) RAW(48) RAW(40)
RAW(c5) RAW(fe) RAW(7f) RAW(90) RAW(80) RAW(00) RAW(00) RAW(00)
RAW(c5) RAW(fe) RAW(7f) RAW(98) RAW(c0) RAW(00) RAW(00) RAW(00)
RAW(c5) RAW(fe) RAW(7f) RAW(a0) RAW(00) RAW(01) RAW(00) RAW(00)
RAW(c5) RAW(fe) RAW(7f) RAW(a8) RAW(40) RAW(01) RAW(00) RAW(00)
#ifdef UNIX
/*
* c5 fe 7f b0 80 01 00 00 vmovdqu %ymm6,0x180(%xax)
* c5 fe 7f b8 c0 01 00 00 vmovdqu %ymm7,0x1c0(%xax)
*/
RAW(c5) RAW(fe) RAW(7f) RAW(b0) RAW(80) RAW(01) RAW(00) RAW(00)
RAW(c5) RAW(fe) RAW(7f) RAW(b8) RAW(c0) RAW(01) RAW(00) RAW(00)
# ifdef X64
/*
* c5 7e 7f 80 00 02 00 00 vmovdqu %ymm8,0x200(%xax)
* c5 7e 7f 88 40 02 00 00 vmovdqu %ymm9,0x240(%xax)
* c5 7e 7f 90 80 02 00 00 vmovdqu %ymm10,0x280(%xax)
* c5 7e 7f 98 c0 02 00 00 vmovdqu %ymm11,0x2c0(%xax)
* c5 7e 7f a0 00 03 00 00 vmovdqu %ymm12,0x300(%xax)
* c5 7e 7f a8 40 03 00 00 vmovdqu %ymm13,0x340(%xax)
* c5 7e 7f b0 80 03 00 00 vmovdqu %ymm14,0x380(%xax)
* c5 7e 7f b8 c0 03 00 00 vmovdqu %ymm15,0x3c0(%xax)
*/
/* ymm8-15 use second byte 7e rather than fe, as in the listing above. */
RAW(c5) RAW(7e) RAW(7f) RAW(80) RAW(00) RAW(02) RAW(00) RAW(00)
RAW(c5) RAW(7e) RAW(7f) RAW(88) RAW(40) RAW(02) RAW(00) RAW(00)
RAW(c5) RAW(7e) RAW(7f) RAW(90) RAW(80) RAW(02) RAW(00) RAW(00)
RAW(c5) RAW(7e) RAW(7f) RAW(98) RAW(c0) RAW(02) RAW(00) RAW(00)
RAW(c5) RAW(7e) RAW(7f) RAW(a0) RAW(00) RAW(03) RAW(00) RAW(00)
RAW(c5) RAW(7e) RAW(7f) RAW(a8) RAW(40) RAW(03) RAW(00) RAW(00)
RAW(c5) RAW(7e) RAW(7f) RAW(b0) RAW(80) RAW(03) RAW(00) RAW(00)
RAW(c5) RAW(7e) RAW(7f) RAW(b8) RAW(c0) RAW(03) RAW(00) RAW(00)
# endif
#endif
ret
END_FUNC(get_ymm_caller_saved)
/* void get_zmm_caller_saved(byte *zmm_caller_saved_buf)
* stores the values of zmm0 through zmm31 consecutively into zmm_caller_saved_buf.
* zmm_caller_saved_buf need not be 64-byte aligned.
* The caller must ensure that the underlying processor supports AVX-512!
*
* Only zmm0 through zmm7 are stored unless X64 is defined; zmm8 through zmm31
* are emitted under #ifdef X64 below.
*/
DECLARE_FUNC(get_zmm_caller_saved)
GLOBAL_LABEL(get_zmm_caller_saved:)
mov REG_XAX, ARG1
/* i#441: Some compilers need one of the architectural flags set (e.g. -mavx or
* -march=skylake-avx512), which would cause DynamoRIO to be less (or un-)
* portable or cause frequency scaling (i#3169). We just put in the raw bytes
* for these instrs:
* Note the 64/32 bit have the same encoding for either rax or eax.
* Note the encodings are using the EVEX scaled compressed displacement form.
* 62 f1 fe 48 7f 00 vmovdqu64 %zmm0,0x00(%xax)
* 62 f1 fe 48 7f 48 01 vmovdqu64 %zmm1,0x40(%xax)
* 62 f1 fe 48 7f 50 02 vmovdqu64 %zmm2,0x80(%xax)
* 62 f1 fe 48 7f 58 03 vmovdqu64 %zmm3,0xc0(%xax)
* 62 f1 fe 48 7f 60 04 vmovdqu64 %zmm4,0x100(%xax)
* 62 f1 fe 48 7f 68 05 vmovdqu64 %zmm5,0x140(%xax)
* 62 f1 fe 48 7f 70 06 vmovdqu64 %zmm6,0x180(%xax)
* 62 f1 fe 48 7f 78 07 vmovdqu64 %zmm7,0x1c0(%xax)
*/
/* The final byte of each encoding is the register's slot index; the listing
* shows it as a byte offset, i.e. multiplied by 0x40 (01 -> 0x40).
*/
RAW(62) RAW(f1) RAW(fe) RAW(48) RAW(7f) RAW(00)
RAW(62) RAW(f1) RAW(fe) RAW(48) RAW(7f) RAW(48) RAW(01)
RAW(62) RAW(f1) RAW(fe) RAW(48) RAW(7f) RAW(50) RAW(02)
RAW(62) RAW(f1) RAW(fe) RAW(48) RAW(7f) RAW(58) RAW(03)
RAW(62) RAW(f1) RAW(fe) RAW(48) RAW(7f) RAW(60) RAW(04)
RAW(62) RAW(f1) RAW(fe) RAW(48) RAW(7f) RAW(68) RAW(05)
RAW(62) RAW(f1) RAW(fe) RAW(48) RAW(7f) RAW(70) RAW(06)
RAW(62) RAW(f1) RAW(fe) RAW(48) RAW(7f) RAW(78) RAW(07)
#ifdef X64
/* 62 71 fe 48 7f 40 08 vmovdqu64 %zmm8,0x200(%xax)
* 62 71 fe 48 7f 48 09 vmovdqu64 %zmm9,0x240(%xax)
* 62 71 fe 48 7f 50 0a vmovdqu64 %zmm10,0x280(%xax)
* 62 71 fe 48 7f 58 0b vmovdqu64 %zmm11,0x2c0(%xax)
* 62 71 fe 48 7f 60 0c vmovdqu64 %zmm12,0x300(%xax)
* 62 71 fe 48 7f 68 0d vmovdqu64 %zmm13,0x340(%xax)
* 62 71 fe 48 7f 70 0e vmovdqu64 %zmm14,0x380(%xax)
* 62 71 fe 48 7f 78 0f vmovdqu64 %zmm15,0x3c0(%xax)
* 62 e1 fe 48 7f 40 10 vmovdqu64 %zmm16,0x400(%xax)
* 62 e1 fe 48 7f 48 11 vmovdqu64 %zmm17,0x440(%xax)
* 62 e1 fe 48 7f 50 12 vmovdqu64 %zmm18,0x480(%xax)
* 62 e1 fe 48 7f 58 13 vmovdqu64 %zmm19,0x4c0(%xax)
* 62 e1 fe 48 7f 60 14 vmovdqu64 %zmm20,0x500(%xax)
* 62 e1 fe 48 7f 68 15 vmovdqu64 %zmm21,0x540(%xax)
* 62 e1 fe 48 7f 70 16 vmovdqu64 %zmm22,0x580(%xax)
* 62 e1 fe 48 7f 78 17 vmovdqu64 %zmm23,0x5c0(%xax)
* 62 61 fe 48 7f 40 18 vmovdqu64 %zmm24,0x600(%xax)
* 62 61 fe 48 7f 48 19 vmovdqu64 %zmm25,0x640(%xax)
* 62 61 fe 48 7f 50 1a vmovdqu64 %zmm26,0x680(%xax)
* 62 61 fe 48 7f 58 1b vmovdqu64 %zmm27,0x6c0(%xax)
* 62 61 fe 48 7f 60 1c vmovdqu64 %zmm28,0x700(%xax)
* 62 61 fe 48 7f 68 1d vmovdqu64 %zmm29,0x740(%xax)
* 62 61 fe 48 7f 70 1e vmovdqu64 %zmm30,0x780(%xax)
* 62 61 fe 48 7f 78 1f vmovdqu64 %zmm31,0x7c0(%xax)
*/
/* The second byte selects the register bank: 71 for zmm8-15, e1 for
* zmm16-23 and 61 for zmm24-31, as in the listing above.
*/
RAW(62) RAW(71) RAW(fe) RAW(48) RAW(7f) RAW(40) RAW(08)
RAW(62) RAW(71) RAW(fe) RAW(48) RAW(7f) RAW(48) RAW(09)
RAW(62) RAW(71) RAW(fe) RAW(48) RAW(7f) RAW(50) RAW(0a)
RAW(62) RAW(71) RAW(fe) RAW(48) RAW(7f) RAW(58) RAW(0b)
RAW(62) RAW(71) RAW(fe) RAW(48) RAW(7f) RAW(60) RAW(0c)
RAW(62) RAW(71) RAW(fe) RAW(48) RAW(7f) RAW(68) RAW(0d)
RAW(62) RAW(71) RAW(fe) RAW(48) RAW(7f) RAW(70) RAW(0e)
RAW(62) RAW(71) RAW(fe) RAW(48) RAW(7f) RAW(78) RAW(0f)
RAW(62) RAW(e1) RAW(fe) RAW(48) RAW(7f) RAW(40) RAW(10)
RAW(62) RAW(e1) RAW(fe) RAW(48) RAW(7f) RAW(48) RAW(11)
RAW(62) RAW(e1) RAW(fe) RAW(48) RAW(7f) RAW(50) RAW(12)
RAW(62) RAW(e1) RAW(fe) RAW(48) RAW(7f) RAW(58) RAW(13)
RAW(62) RAW(e1) RAW(fe) RAW(48) RAW(7f) RAW(60) RAW(14)
RAW(62) RAW(e1) RAW(fe) RAW(48) RAW(7f) RAW(68) RAW(15)
RAW(62) RAW(e1) RAW(fe) RAW(48) RAW(7f) RAW(70) RAW(16)
RAW(62) RAW(e1) RAW(fe) RAW(48) RAW(7f) RAW(78) RAW(17)
RAW(62) RAW(61) RAW(fe) RAW(48) RAW(7f) RAW(40) RAW(18)
RAW(62) RAW(61) RAW(fe) RAW(48) RAW(7f) RAW(48) RAW(19)
RAW(62) RAW(61) RAW(fe) RAW(48) RAW(7f) RAW(50) RAW(1a)
RAW(62) RAW(61) RAW(fe) RAW(48) RAW(7f) RAW(58) RAW(1b)
RAW(62) RAW(61) RAW(fe) RAW(48) RAW(7f) RAW(60) RAW(1c)
RAW(62) RAW(61) RAW(fe) RAW(48) RAW(7f) RAW(68) RAW(1d)
RAW(62) RAW(61) RAW(fe) RAW(48) RAW(7f) RAW(70) RAW(1e)
RAW(62) RAW(61) RAW(fe) RAW(48) RAW(7f) RAW(78) RAW(1f)
#endif
ret
END_FUNC(get_zmm_caller_saved)
/* void get_opmask_caller_saved(byte *opmask_caller_saved_buf)
* stores the values of k0 through k7 consecutively in 8 byte slots each into
* opmask_caller_saved_buf. opmask_caller_saved_buf need not be 8-byte aligned.
* The caller must ensure that the underlying processor supports AVX-512.
*
* Per the listing in the body, each store is a kmovw, so only the low 16 bits
* of each mask register are written into its 8-byte slot.
*/
DECLARE_FUNC(get_opmask_caller_saved)
GLOBAL_LABEL(get_opmask_caller_saved:)
mov REG_XAX, ARG1
/*
* c5 f8 91 00 kmovw %k0,(%rax)
* c5 f8 91 48 08 kmovw %k1,0x8(%rax)
* c5 f8 91 50 10 kmovw %k2,0x10(%rax)
* c5 f8 91 58 18 kmovw %k3,0x18(%rax)
* c5 f8 91 60 20 kmovw %k4,0x20(%rax)
* c5 f8 91 68 28 kmovw %k5,0x28(%rax)
* c5 f8 91 70 30 kmovw %k6,0x30(%rax)
* c5 f8 91 78 38 kmovw %k7,0x38(%rax)
*/
/* Each RAW line below is one instruction from the listing above, in order. */
RAW(c5) RAW(f8) RAW(91) RAW(00)
RAW(c5) RAW(f8) RAW(91) RAW(48) RAW(08)
RAW(c5) RAW(f8) RAW(91) RAW(50) RAW(10)
RAW(c5) RAW(f8) RAW(91) RAW(58) RAW(18)
RAW(c5) RAW(f8) RAW(91) RAW(60) RAW(20)
RAW(c5) RAW(f8) RAW(91) RAW(68) RAW(28)
RAW(c5) RAW(f8) RAW(91) RAW(70) RAW(30)
RAW(c5) RAW(f8) RAW(91) RAW(78) RAW(38)
ret
END_FUNC(get_opmask_caller_saved)
/* void hashlookup_null_handler(void)
 * PR 305731: landing pad for an app transfer to NULL.  Control is forwarded
 * through hashlookup_null_target so that it ends up in an ibl miss routine.
 */
        DECLARE_FUNC(hashlookup_null_handler)
GLOBAL_LABEL(hashlookup_null_handler:)
#if defined(X64) || !defined(LINUX)
        jmp      PTRSZ SYMREF(hashlookup_null_target) /* rip-relative on x64 */
#else
        /* 32-bit Linux: no register is free to make this PIC, so the jump
         * below is a placeholder that gets patched.  Generating the code
         * would be nicer than patching .text, but null_fragment needs a
         * static address to reference (though with a shared ibl
         * target_delete the final address could be set before
         * null_fragment is used anywhere).
         */
        jmp      .+130 /* force long jump for patching: i#1895 */
#endif
        END_FUNC(hashlookup_null_handler)
/* Declare these labels global so we can take their addresses in C. pre, mid,
* and post are defined by REP_STRING_OP().
*/
DECLARE_GLOBAL(safe_read_asm_pre)
DECLARE_GLOBAL(safe_read_asm_mid)
DECLARE_GLOBAL(safe_read_asm_post)
DECLARE_GLOBAL(safe_read_asm_recover)
/* i#350: We implement safe_read in assembly and save the PCs that can fault.
* If these PCs fault, we return from the signal handler to the epilog, which
* can recover. We return the source pointer from XSI, and the caller uses this
* to determine how many bytes were copied and whether it matches size.
*
* XXX: Do we care about differentiating whether the read or write faulted?
* Currently this is just "safe_memcpy", and we recover regardless of whether
* the read or write faulted.
*
* void *
* safe_read_asm(void *dst, const void *src, size_t n);
*
* Returns the value of xsi after the copy, i.e. the current source pointer.
*/
DECLARE_FUNC(safe_read_asm)
GLOBAL_LABEL(safe_read_asm:)
ARGS_TO_XDI_XSI_XDX() /* dst=xdi, src=xsi, n=xdx */
/* Copy xdx bytes, align on src. */
REP_STRING_OP(safe_read_asm, REG_XSI, movs)
/* Reached both by falling through after the copy and, per the comment above,
* on return from the signal handler after a fault.
*/
ADDRTAKEN_LABEL(safe_read_asm_recover:)
mov REG_XAX, REG_XSI /* Return cur_src */
RESTORE_XDI_XSI()
ret
END_FUNC(safe_read_asm)
#ifdef UNIX
/* Export each safe_read_tls_* routine together with its *_recover label. */
DECLARE_GLOBAL(safe_read_tls_magic)
DECLARE_GLOBAL(safe_read_tls_magic_recover)
DECLARE_GLOBAL(safe_read_tls_self)
DECLARE_GLOBAL(safe_read_tls_self_recover)
DECLARE_GLOBAL(safe_read_tls_app_self)
DECLARE_GLOBAL(safe_read_tls_app_self_recover)
/* Loads the DWORD at TLS_MAGIC_OFFSET_ASM through the SEG_TLS segment into
* eax and returns it.  The load is the instruction that may fault; the
* recover label marks the instruction immediately after it.
*/
DECLARE_FUNC(safe_read_tls_magic)
GLOBAL_LABEL(safe_read_tls_magic:)
/* gas won't accept "SEG_TLS:" in the memref so we have to fool it by
* using it as a prefix:
*/
SEG_TLS
mov eax, DWORD [TLS_MAGIC_OFFSET_ASM]
ADDRTAKEN_LABEL(safe_read_tls_magic_recover:)
/* our signal handler sets xax to 0 for us on a fault */
ret
END_FUNC(safe_read_tls_magic)
/* Loads the pointer-sized value at TLS_SELF_OFFSET_ASM through the SEG_TLS
* segment into xax and returns it.  The recover label marks the instruction
* immediately after the load that may fault.
*/
DECLARE_FUNC(safe_read_tls_self)
GLOBAL_LABEL(safe_read_tls_self:)
/* see comment in safe_read_tls_magic */
SEG_TLS
mov REG_XAX, PTRSZ [TLS_SELF_OFFSET_ASM]
ADDRTAKEN_LABEL(safe_read_tls_self_recover:)
/* our signal handler sets xax to 0 for us on a fault */
ret
END_FUNC(safe_read_tls_self)
/* Loads the pointer-sized value at TLS_APP_SELF_OFFSET_ASM through the
* LIB_SEG_TLS segment (not SEG_TLS) into xax and returns it.  The recover
* label marks the instruction immediately after the load that may fault.
*/
DECLARE_FUNC(safe_read_tls_app_self)
GLOBAL_LABEL(safe_read_tls_app_self:)
/* see comment in safe_read_tls_magic */
LIB_SEG_TLS
mov REG_XAX, PTRSZ [TLS_APP_SELF_OFFSET_ASM]
ADDRTAKEN_LABEL(safe_read_tls_app_self_recover:)
/* our signal handler sets xax to 0 for us on a fault */
ret
END_FUNC(safe_read_tls_app_self)
#endif
#ifdef UNIX
/* Replacement for _dl_runtime_resolve() used for catching module transitions
* out of native modules.
*
* On entry the stack holds two words, the link map and the .dynamic index,
* below the return address.  Both are passed to dynamorio_dl_fixup(), the two
* words are removed, and control transfers to the address that
* dynamorio_dl_fixup() returned, with the argument registers saved below
* restored.
*/
DECLARE_FUNC(_dynamorio_runtime_resolve)
GLOBAL_LABEL(_dynamorio_runtime_resolve:)
# ifdef X64
/* Preserve all 6 argument registers and rax (num fp reg args). */
push rax
push rdi
push rsi
push rdx
push rcx
push r8
push r9
/* Should be 16-byte aligned now: retaddr, 2 args, 7 regs. */
/* The 7 pushes above put the two incoming words at slots 7 and 8. */
mov rdi, [rsp + 7 * ARG_SZ] /* link map */
mov rsi, [rsp + 8 * ARG_SZ] /* .dynamic index */
CALLC0(GLOBAL_REF(dynamorio_dl_fixup))
/* r11 is not one of the registers restored below, so the target survives */
mov r11, rax /* preserve */
pop r9
pop r8
pop rcx
pop rdx
pop rsi
pop rdi
pop rax
add rsp, 16 /* clear args */
jmp r11 /* Jump to resolved PC, or into DR. */
# else /* !X64 */
/* Only xax and xcx are saved here; they carry the two values into the call. */
push REG_XAX
push REG_XCX
mov REG_XAX, [REG_XSP + 2 * ARG_SZ] /* link map */
mov REG_XCX, [REG_XSP + 3 * ARG_SZ] /* .dynamic index */
# ifdef MACOS
lea REG_XSP, [-1*ARG_SZ + REG_XSP] /* maintain align-16: ra + push x2 */
# endif
CALLC2(GLOBAL_REF(dynamorio_dl_fixup), REG_XAX, REG_XCX)
# ifdef MACOS
lea REG_XSP, [1*ARG_SZ + REG_XSP] /* maintain align-16: ra + push x2 */
# endif
/* The resolved address replaces the link-map word so that the ret below
* pops it as its return target.
*/
mov [REG_XSP + 2 * ARG_SZ], REG_XAX /* overwrite arg1 */
pop REG_XCX
pop REG_XAX
ret 4 /* ret to target, pop arg2 */
# endif /* !X64 */
END_FUNC(_dynamorio_runtime_resolve)
#endif /* UNIX */
/***************************************************************************/
#if defined(WINDOWS) && !defined(X64)
/* Routines to switch to 64-bit mode from 32-bit WOW64, make a 64-bit
 * call, and then return to 32-bit mode.
 */
/* Some now live in x86_shared.asm */
/*
 * DR_API ptr_int_t
 * dr_invoke_x64_routine(dr_auxlib64_routine_ptr_t func64, uint num_params, ...)
 *
 * Called from 32-bit code with all arguments on the stack.  With eax holding
 * the entry esp, the layout is:
 *   [eax +  0]        return address
 *   [eax +  4]        func64 (occupies two 4-byte slots)
 *   [eax + 12]        num_params
 *   [eax + 12 + 4*k]  k-th variadic argument, k = 1..num_params
 * Each variadic argument is a 32-bit value that is sign-extended to 64 bits.
 * The first four go in rcx, rdx, r8 and r9; when num_params > 4, all
 * arguments are also copied to 8-byte stack slots starting at the
 * call-time rsp.
 * The 64-bit result is returned to the 32-bit caller in edx:eax.
 *
 * The instructions between the two far jumps are assembled as 32-bit code but
 * executed in 64-bit mode, which is why several of them carry hand-written
 * RAW() prefix bytes.
 */
# undef FUNCNAME
# define FUNCNAME dr_invoke_x64_routine
        DECLARE_EXPORTED_FUNC(FUNCNAME)
GLOBAL_LABEL(FUNCNAME:)
        /* This is 32-bit so we just need the stack ptr to locate all the args */
        mov      eax, esp
        /* save callee-saved registers */
        push     ebx
        /* far jmp to next instr w/ 64-bit switch: jmp 0033:<inv64_transfer_to_64> */
        RAW(ea)
        DD offset inv64_transfer_to_64
        DB CS64_SELECTOR
        RAW(00)
inv64_transfer_to_64:
        /* Below here is executed in 64-bit mode, but with guarantees that
         * no address is above 4GB, as this is a WOW64 process.
         */
        /* Save WOW64 state.
         * FIXME: if the x64 code makes any callbacks, not only do we need
         * a wrapper to go back to x86 mode but we need to restore these
         * values in case the x86 callback invokes any syscalls!
         * Really messy and fragile.
         */
        RAW(41) push     esp /* push r12 */
        RAW(41) push     ebp /* push r13 */
        RAW(41) push     esi /* push r14 */
        RAW(41) push     edi /* push r15 */
        /* align the stack pointer */
        mov      ebx, esp        /* save esp in callee-preserved reg */
        sub      esp, 32         /* call conv */
        /* Align now so that the call below also runs on a 16-byte-aligned
         * stack when there are no stack-passed args (#args <= 4): that path
         * skips the alignment done after the arg slots are reserved.  esp is
         * restored from ebx afterward, so rounding down here is safe, and the
         * 32 bytes reserved above remain available at [esp, esp+32).
         */
        and      esp, HEX(fffffff0) /* align to 16-byte boundary */
        mov      ecx, dword ptr [12 + eax] /* #args (func64 takes two slots) */
        sub      ecx, 4
        jle      inv64_arg_copy_done
        shl      ecx, 3          /* (#args-4)*8 */
        sub      esp, ecx        /* slots for args */
        and      esp, HEX(fffffff0) /* align to 16-byte boundary */
        /* copy the args to their stack slots (simpler to copy the 1st 4 too) */
        mov      ecx, dword ptr [12 + eax] /* #args */
        cmp      ecx, 0
        je       inv64_arg_copy_done
inv64_arg_copy_loop:
        mov      edx, dword ptr [12 + 4*ecx + eax] /* ecx = 1-based arg ordinal */
        /* FIXME: sign-extension is not always what the user wants.
         * But the only general way to solve it would be to take in type codes
         * for each arg!
         */
        RAW(48) RAW(63) RAW(d2) /* movsxd rdx, edx (sign-extend) */
        RAW(48)                 /* qword ptr */
        mov      dword ptr [-8 + 8*ecx + esp], edx
        sub      ecx, 1         /* we can't use "dec" as it will be encoded wrong! */
        jnz      inv64_arg_copy_loop
inv64_arg_copy_done:
        /* put the 1st 4 args into their reg slots */
        mov      ecx, dword ptr [12 + eax] /* #args */
        cmp      ecx, 4
        jl       inv64_arg_lt4
        mov      edx, dword ptr [12 + 4*4 + eax] /* 1-based arg ordinal */
        RAW(4c) RAW(63) RAW(ca) /* movsxd r9, edx */
inv64_arg_lt4:
        cmp      ecx, 3
        jl       inv64_arg_lt3
        mov      edx, dword ptr [12 + 4*3 + eax] /* 1-based arg ordinal */
        RAW(4c) RAW(63) RAW(c2) /* movsxd r8, edx */
inv64_arg_lt3:
        cmp      ecx, 2
        jl       inv64_arg_lt2
        mov      edx, dword ptr [12 + 4*2 + eax] /* 1-based arg ordinal */
        RAW(48) RAW(63) RAW(d2) /* movsxd rdx, edx (sign-extend) */
inv64_arg_lt2:
        /* ecx still holds #args here and is overwritten last, by arg 1 */
        cmp      ecx, 1
        jl       inv64_arg_lt1
        mov      ecx, dword ptr [12 + 4*1 + eax] /* 1-based arg ordinal */
        RAW(48) RAW(63) RAW(c9) /* movsxd rcx, ecx (sign-extend) */
inv64_arg_lt1:
        /* make the call */
        RAW(48)                 /* qword ptr */
        mov      eax, dword ptr [4 + eax] /* func64 */
        RAW(48) call     eax
        /* get top 32 bits of return value into edx for 64-bit x86 return value */
        RAW(48) mov      edx, eax
        RAW(48) shr      edx, 32
        mov      esp, ebx        /* restore esp */
        /* restore WOW64 state */
        RAW(41) pop      edi /* pop r15 */
        RAW(41) pop      esi /* pop r14 */
        RAW(41) pop      ebp /* pop r13 */
        RAW(41) pop      esp /* pop r12 */
        /* far jmp to next instr w/ 32-bit switch: jmp 0023:<inv64_return_to_32> */
        push     offset inv64_return_to_32  /* 8-byte push */
        mov      dword ptr [esp + 4], CS32_SELECTOR /* top 4 bytes of prev push */
        jmp      fword ptr [esp]
inv64_return_to_32:
        add      esp, 8          /* clean up far jmp target */
        pop      ebx             /* restore callee-saved reg */
        ret                      /* return value in edx:eax */
        END_FUNC(FUNCNAME)
#endif /* defined(WINDOWS) && !defined(X64) */
/***************************************************************************/
#ifdef WINDOWS
/* void dynamorio_earliest_init_takeover(void)
*
* Called from hook code for earliest injection.
* Since we want to resume at the hooked app code as though nothing
* happened w/o going first to hooking code to restore regs, caller
* passed us args pointed at by xax. We then preserve regs and call
* C code. C code takes over when it returns to us. We restore
* regs and return to app code.
* Executes on app stack but we assume app stack is fine at this point.
*
* We've pushed a retaddr on the stack, but we expect all our takeover
* points to be at function entry where the app's retaddr was just pushed
* and thus stack alignment was at +ptrsz and is +2*ptrsz on entry here.
*
* Stack layout once PUSH_PRIV_MCXT has run, from xsp upward:
*   priv_mcontext_t (PRIV_MCXT_SIZE bytes), one alignment slot, the saved
*   xax, then the return address.
*/
DECLARE_EXPORTED_FUNC(dynamorio_earliest_init_takeover)
GLOBAL_LABEL(dynamorio_earliest_init_takeover:)
push REG_XAX /* Save xax (PUSH_PRIV_MCXT clobbers it). */
lea REG_XSP, [REG_XSP - ARG_SZ] /* Align stack whether 32 or 64-bit. */
PUSH_PRIV_MCXT(PTRSZ [REG_XSP + 2*ARG_SZ -\
PUSH_PRIV_MCXT_PRE_PC_SHIFT]) /* Return address as pc. */
# ifdef EARLIEST_INIT_DEBUGBREAK
/* giant loop so can attach debugger, then change ebx to 1
* to step through rest of code */
mov ebx, HEX(7fffffff)
dynamorio_earliest_init_repeat_outer:
mov esi, HEX(7fffffff)
dynamorio_earliest_init_repeatme:
dec esi
cmp esi, 0
jg dynamorio_earliest_init_repeatme
dec ebx
cmp ebx, 0
jg dynamorio_earliest_init_repeat_outer
# endif
lea REG_XDX, [REG_XSP] /* Pointer to priv_mcontext_t. */
/* Fix up app's xsp from the retaddr + push + align we did. */
mov REG_XAX, PTRSZ [REG_XSP + MCONTEXT_XSP_OFFS]
lea REG_XAX, [REG_XAX + 3*ARG_SZ]
mov PTRSZ [REG_XSP + MCONTEXT_XSP_OFFS], REG_XAX
/* Load passed-in xax which points to the arg struct. */
/* It sits one slot (the alignment slot) above the priv_mcontext_t. */
mov REG_XAX, PTRSZ [REG_XSP + PRIV_MCXT_SIZE + ARG_SZ]
/* Load earliest_args_t.app_xax, written by our gencode. */
mov REG_XCX, PTRSZ [REG_XAX]
/* Store into xax slot on stack. */
mov PTRSZ [REG_XSP + MCONTEXT_XAX_OFFS], REG_XCX
/* args: pointer to the arg struct, pointer to the priv_mcontext_t */
CALLC2(GLOBAL_REF(dynamorio_earliest_init_takeover_C), REG_XAX, REG_XDX)
/* We will either be under DR control or running natively at this point. */
/* Restore. */
POP_PRIV_MCXT_GPRS()
lea REG_XSP, [REG_XSP + 2*ARG_SZ] /* Undo align + push. */
ret
END_FUNC(dynamorio_earliest_init_takeover)
#endif /* WINDOWS */
END_FILE