/* **********************************************************
 * Copyright (c) 2010-2022 Google, Inc. All rights reserved.
* Copyright (c) 2000-2010 VMware, Inc. All rights reserved.
* **********************************************************/
/*
 * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the name of VMware, Inc. nor the names of its contributors may be
* used to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL VMWARE, INC. OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
* DAMAGE.
*/
/*
 * The Pentium processors maintain cache consistency in hardware, so we don't
 * worry about getting stale cache entries.
 */
#include "../globals.h"
#include "../fragment.h"
#include "../fcache.h"
#include "arch.h"
#include "instr.h"
#include "instr_create_shared.h"
#include "instrlist.h"
#include "instrument.h"
#include "decode_private.h"
/* Shorthand aliases for the instrlist meta-instruction insertion routines. */
#define PRE instrlist_meta_preinsert
#define APP instrlist_meta_append
/*
direct branch exit_stub:
   5x8  mov   %xax, xax_offs(&dcontext) or tls
   <we used to support PROFILE_LINKCOUNT with a counter inc here but no more>
   5x10 mov   &linkstub, %xax
   5    jmp   target addr

indirect branch exit_stub (only used if -indirect_stubs):
   6x9  mov   %xbx, xbx_offs(&dcontext) or tls
   5x11 mov   &linkstub, %xbx
   5    jmp   indirect_branch_lookup

indirect branches use xbx so that the flags can be saved into xax using
the lahf instruction!
xref PR 249775 on lahf support on x64.

also see emit_inline_ibl_stub() below
*/
/* Macro to make it easier to get the address of a field that may be in
 * a different memory space with self-protection: when SELFPROT_DCONTEXT is set
 * in dynamo_options.protect_mask, offs is applied to
 * dcontext->upcontext.separate_upcontext; otherwise it is applied to the
 * dcontext pointer itself.  Evaluates to a ptr_uint_t.
 */
#define UNPROT_OFFS(dcontext, offs)                                            \
    (TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask)                      \
         ? (((ptr_uint_t)((dcontext)->upcontext.separate_upcontext)) + (offs)) \
         : (((ptr_uint_t)(dcontext)) + (offs)))
* function is safe with respect to a thread executing the code containing
* this target, presuming that the code in both the before and after states
* is valid, and that [pc, pc+4) does not cross a cache line.
* For x64 this routine only works for 32-bit reachability. If further
* reach is needed the caller must use indirection. Xref PR 215395.
*/
byte *
insert_relative_target(byte *pc, cache_pc target, bool hot_patch)
{
*/
int value = (int)(ptr_int_t)(target - pc - 4);
IF_X64(ASSERT(CHECK_TRUNCATE_TYPE_int(target - pc - 4)));
ATOMIC_4BYTE_WRITE(vmcode_get_writable_addr(pc), value, hot_patch);
pc += 4;
return pc;
}
/* Emits a jmp at pc: the JMP_OPCODE byte followed by the 4-byte pc-relative
 * displacement to target.  Returns the pc just past the emitted 5 bytes.
 * Unlike insert_relative_target(), the displacement is stored with a plain
 * (non-atomic) 4-byte write.
 */
byte *
insert_relative_jump(byte *pc, cache_pc target, bool hot_patch)
{
    byte *disp_pc;
    int disp;
    ASSERT(pc != NULL);
    /* Opcode byte first. */
    *(vmcode_get_writable_addr(pc)) = JMP_OPCODE;
    disp_pc = pc + 1;
    CHECK_JMP_TARGET_ALIGNMENT(disp_pc, 4, hot_patch);
    /* The displacement is relative to the end of the 4-byte field. */
    disp = (int)(ptr_int_t)(target - disp_pc - 4);
    IF_X64(ASSERT(CHECK_TRUNCATE_TYPE_int(target - disp_pc - 4)));
    *(int *)(vmcode_get_writable_addr(disp_pc)) = disp;
    return disp_pc + 4;
}
/* Returns whether the exit cti for linkstub l of fragment f can reach target_pc.
 * This implementation ignores its arguments and always returns true.
 */
bool
exit_cti_reaches_target(dcontext_t *dcontext, fragment_t *f, linkstub_t *l,
                        cache_pc target_pc)
{
    return true;
}
/* Stub patching entry point.  This implementation must never be called: the body
 * consists solely of ASSERT_NOT_REACHED().
 */
void
patch_stub(fragment_t *f, cache_pc stub_pc, cache_pc target_pc, cache_pc target_prefix_pc,
           bool hot_patch)
{
    ASSERT_NOT_REACHED();
}
/* Returns whether the stub at stub_pc has been patched.  This implementation
 * ignores its arguments and always returns false.
 */
bool
stub_is_patched(dcontext_t *dcontext, fragment_t *f, cache_pc stub_pc)
{
    return false;
}
/* Stub un-patching entry point.  This implementation is a no-op. */
void
unpatch_stub(dcontext_t *dcontext, fragment_t *f, cache_pc stub_pc, bool hot_patch)
{
}
* The write that actually patches the branch is done atomically so this
* function is safe with respect to a thread executing this branch presuming
* that both the before and after targets are valid and that [pc, pc+4) does
* not cross a cache line.
*/
void
patch_branch(dr_isa_mode_t isa_mode, cache_pc branch_pc, cache_pc target_pc,
bool hot_patch)
{
cache_pc byte_ptr = exit_cti_disp_pc(branch_pc);
insert_relative_target(byte_ptr, target_pc, hot_patch);
}
* properly aligned returns 0, else returns the number of bytes it would
* need to be forward shifted to be properly aligned */
uint
patchable_exit_cti_align_offs(dcontext_t *dcontext, instr_t *inst, cache_pc pc)
{
* also should check for addr16 flag (we shouldn't have any prefixes) */
ASSERT((instr_is_cti(inst) && !instr_is_cti_short(inst) &&
!TESTANY(~(PREFIX_JCC_TAKEN | PREFIX_JCC_NOT_TAKEN | PREFIX_PRED_MASK),
instr_get_prefixes(inst))) ||
instr_is_cti_short_rewrite(inst, NULL));
IF_X64(ASSERT(CHECK_TRUNCATE_TYPE_uint(
ALIGN_SHIFT_SIZE(pc + instr_length(dcontext, inst) - CTI_PATCH_SIZE,
CTI_PATCH_SIZE, PAD_JMPS_ALIGNMENT))));
return (uint)ALIGN_SHIFT_SIZE(pc + instr_length(dcontext, inst) - CTI_PATCH_SIZE,
CTI_PATCH_SIZE, PAD_JMPS_ALIGNMENT);
}
/* Emits raw bytes at pc for a single mov that either spills reg to memory
 * (spill == true) or restores reg from memory (spill == false), and returns the
 * executable pc just past the emitted instruction.
 *
 * The memory location is the TLS slot os_tls_offset(tls_offs) when shared is true
 * (always the case in X64 builds, where shared is forced to true), and otherwise
 * the absolute address UNPROT_OFFS(dcontext, dc_offs).
 * For the 32-bit TLS form, an addr16 prefix with a 2-byte displacement is used
 * when require_addr16 is set or use_addr_prefix_on_short_disp() says so.
 * REG_XAX uses the dedicated short-form opcodes, which take no modrm byte.
 */
static cache_pc
insert_spill_or_restore(dcontext_t *dcontext, cache_pc pc, uint flags, bool spill,
                        bool shared, reg_id_t reg, ushort tls_offs, uint dc_offs,
                        bool require_addr16)
{
    DEBUG_DECLARE(cache_pc start_pc;)
    pc = vmcode_get_writable_addr(pc);
    IF_DEBUG(start_pc = pc);
    byte opcode = ((reg == REG_XAX) ? (spill ? MOV_XAX2MEM_OPCODE : MOV_MEM2XAX_OPCODE)
                                    : (spill ? MOV_REG2MEM_OPCODE : MOV_MEM2REG_OPCODE));
#ifdef X64
    /* For X64 builds TLS is always used, even for 32-bit code. */
    shared = true;
    if (!FRAG_IS_32(flags)) {
        /* 64-bit code: seg-prefixed mov with a REX.W prefix and a 32-bit offset. */
        if (reg == REG_XAX) {
            /* The xax short form is used with an address-size prefix so that
             * its offset is the 4 bytes written below.
             */
            *pc = ADDR_PREFIX_OPCODE;
            pc++;
        }
        *pc = TLS_SEG_OPCODE;
        pc++;
        *pc = REX_PREFIX_BASE_OPCODE | REX_PREFIX_W_OPFLAG;
        pc++;
        *pc = opcode;
        pc++;
        if (reg != REG_XAX) {
            /* Use a sib byte to get a 32-bit absolute displacement. */
            *pc = MODRM_BYTE(0 /*mod*/, reg_get_bits(reg), 4 /*rm*/);
            pc++;
            *pc = SIB_DISP32;
            pc++;
        }
        *((uint *)pc) = (uint)os_tls_offset(tls_offs);
        pc += 4;
    } else
#endif
        if (shared) {
        /* Trading speed for space via the addr16 prefix: note that
         * when going through this to the IBL routine speed asks for
         * not adding the prefix.
         */
        bool addr16 = (require_addr16 || use_addr_prefix_on_short_disp());
        if (addr16) {
            *pc = ADDR_PREFIX_OPCODE;
            pc++;
        }
        *pc = TLS_SEG_OPCODE;
        pc++;
        *pc = opcode;
        pc++;
        if (reg != REG_XAX) {
            /* With addr16 the modrm byte is 0x1e for ebx, 0x0e for ecx, 0x06 for eax;
             * w/o addr16 those are 0x1d, 0x0d, 0x05
             */
            *pc = MODRM_BYTE(0 /*mod*/, reg_get_bits(reg), addr16 ? 6 : 5 /*rm*/);
            pc++;
        }
        if (addr16) {
            *((ushort *)pc) = os_tls_offset(tls_offs);
            pc += 2;
        } else {
            *((uint *)pc) = os_tls_offset(tls_offs);
            pc += 4;
        }
    } else {
        /* Private (non-TLS) form: mov to/from an absolute 32-bit address. */
        *pc = opcode;
        pc++;
        if (reg != REG_XAX) {
            *pc = MODRM_BYTE(0 /*mod*/, reg_get_bits(reg), 5 /*rm*/);
            pc++;
        }
        IF_X64(ASSERT_NOT_IMPLEMENTED(false));
        *((uint *)pc) = (uint)(ptr_uint_t)UNPROT_OFFS(dcontext, dc_offs);
        pc += 4;
    }
    /* For the TLS forms, check the emitted length against the size macros. */
    ASSERT(IF_X64_ELSE(false, !shared) ||
           (pc - start_pc) ==
               (reg == REG_XAX ? SIZE_MOV_XAX_TO_TLS(flags, require_addr16)
                               : SIZE_MOV_XBX_TO_TLS(flags, require_addr16)));
    ASSERT(IF_X64_ELSE(false, !shared) || !spill || reg == REG_XAX ||
           instr_raw_is_tls_spill(start_pc, reg, tls_offs));
    return vmcode_get_executable_addr(pc);
}
/* Emits at pc a non-inlined indirect exit stub for exit l of fragment f:
 *   spill xbx (to TLS or the dcontext); mov <linkstub or tag> -> xbx; jmp exit_target
 * and returns the pc just past the emitted code.
 * On Windows with the shared_syscalls_fastpath option, an exit targeting the
 * shared syscall routine gets just the jmp.
 */
static byte *
insert_jmp_to_ibl(byte *pc, fragment_t *f, linkstub_t *l, cache_pc exit_target,
                  dcontext_t *dcontext)
{
#ifdef WINDOWS
    bool spill_xbx_to_fs = FRAG_DB_SHARED(f->flags) ||
        (is_shared_syscall_routine(dcontext, exit_target) &&
         DYNAMO_OPTION(shared_fragment_shared_syscalls));
#else
    bool spill_xbx_to_fs = FRAG_DB_SHARED(f->flags);
#endif
    ASSERT(linkstub_owned_by_fragment(dcontext, f, l));

    /* we use XBX to hold the linkstub pointer for IBL routines,
       note that direct stubs use XAX for linkstub pointer */
#ifdef WINDOWS
    if (INTERNAL_OPTION(shared_syscalls_fastpath) &&
        is_shared_syscall_routine(dcontext, exit_target)) {
        /* jmp <exit_target> */
        pc = insert_relative_jump(pc, exit_target, NOT_HOT_PATCHABLE);
        return pc;
    } else
#endif
        pc = insert_spill_or_restore(dcontext, pc, f->flags, true /*spill*/,
                                     spill_xbx_to_fs, REG_XBX, INDIRECT_STUB_SPILL_SLOT,
                                     XBX_OFFSET, true);

    /* mov <pointer-sized immediate> -> xbx, written via the writable view. */
    pc = vmcode_get_writable_addr(pc);
#ifdef X64
    if (!FRAG_IS_32(f->flags)) {
        *pc = REX_PREFIX_BASE_OPCODE | REX_PREFIX_W_OPFLAG;
        pc++;
    }
#endif
    *pc = MOV_IMM2XBX_OPCODE;
    pc++;
#ifdef WINDOWS
    if (DYNAMO_OPTION(shared_syscalls) &&
        is_shared_syscall_routine(dcontext, exit_target)) {
        /* Use the shared syscalls linkstub (bb or trace flavor) instead of the one for
         * this exit since it's never referenced.
         */
        LOG(THREAD, LOG_LINKS, 4, "\tF%d using %s shared syscalls link stub\n", f->id,
            TEST(FRAG_IS_TRACE, f->flags) ? "trace" : "bb");
        l = (linkstub_t *)(TEST(FRAG_IS_TRACE, f->flags)
                               ? get_shared_syscalls_trace_linkstub()
                               : get_shared_syscalls_bb_linkstub());
    }
#endif
    if (TEST(FRAG_COARSE_GRAIN, f->flags)) {
        /* Coarse-grain fragments pass the fragment tag rather than a linkstub.
         * FIXME: once these stubs are separated we should switch to a
         * store-to-mem instead of in a spilled xbx, to use same
         * slots as coarse direct stubs
         */
        *((ptr_uint_t *)pc) = (ptr_uint_t)f->tag;
        pc += sizeof(f->tag);
        /* FIXME: a separated stub would be
         * a 15-byte stub . For that we should simply store the
         * source cti directly into a tls slot. For now though we inline
         * the stubs and spill xbx.
         */
    } else {
        *((ptr_uint_t *)pc) = (ptr_uint_t)l;
        pc += sizeof(l);
    }
    pc = vmcode_get_executable_addr(pc);

    /* jmp <exit_target> */
    pc = insert_relative_jump(pc, exit_target, NOT_HOT_PATCHABLE);
    return pc;
}
* cross cache line boundaries. If emitting sets the offset field of all
* instructions, else sets the translation for the added nops (for
* recreating). If emitting and -pad_jmps_shift_{bb,trace} returns the number
* of bytes to shift the start_pc by (this avoids putting a nop before the
* first exit cti) else returns 0.
*/
uint
nop_pad_ilist(dcontext_t *dcontext, fragment_t *f, instrlist_t *ilist, bool emitting)
{
instr_t *inst;
uint offset = 0;
int first_patch_offset = -1;
uint start_shift = 0;
cache_pc starting_pc = f->start_pc + fragment_prefix_size(f->flags);
ASSERT(emitting || f->prefix_size == fragment_prefix_size(f->flags));
ASSERT(PAD_FRAGMENT_JMPS(f->flags));
for (inst = instrlist_first(ilist); inst != NULL; inst = instr_get_next(inst)) {
ASSERT_NOT_IMPLEMENTED(!TEST(INSTR_HOT_PATCHABLE, inst->flags));
if (instr_is_exit_cti(inst)) {
if (is_exit_cti_patchable(dcontext, inst, f->flags)) {
* the current instr.
*/
uint nop_length =
patchable_exit_cti_align_offs(dcontext, inst, starting_pc + offset);
LOG(THREAD, LOG_INTERP, 4, "%s: F%d @" PFX " cti shift needed: %d\n",
__FUNCTION__, f->id, starting_pc + offset, nop_length);
if (first_patch_offset < 0)
first_patch_offset = offset;
if (nop_length > 0) {
* we are within 1 cache line of the first patchable offset
* (this covers the case of a conditional branch which
* mangles to two patchable exits and is still safe since
* they are less then a cache line apart) */
if (PAD_JMPS_SHIFT_START(f->flags) &&
offset + instr_length(dcontext, inst) - first_patch_offset <
PAD_JMPS_ALIGNMENT) {
ASSERT(start_shift == 0);
start_shift = nop_length;
* instructions should be fine since we are still
* within the same cache line as the first patchable
* offset */
starting_pc += nop_length;
} else {
instr_t *nop_inst = INSTR_CREATE_nopNbyte(dcontext, nop_length);
#ifdef X64
if (FRAG_IS_32(f->flags)) {
instr_set_x86_mode(nop_inst, true );
instr_shrink_to_32_bits(nop_inst);
}
#endif
LOG(THREAD, LOG_INTERP, 4,
"Marking exit branch as having nop padding\n");
instr_branch_set_padded(inst, true);
instrlist_preinsert(ilist, inst, nop_inst);
ASSERT((int)nop_length == instr_length(dcontext, nop_inst));
if (emitting) {
nop_inst->offset = offset;
STATS_PAD_JMPS_ADD(f->flags, num_nops, 1);
STATS_PAD_JMPS_ADD(f->flags, nop_bytes, nop_length);
}
instr_set_translation(nop_inst, instr_get_translation(inst));
instr_set_our_mangling(nop_inst, true);
offset += nop_length;
}
ASSERT(patchable_exit_cti_align_offs(dcontext, inst,
starting_pc + offset) == 0);
} else {
DOSTATS({
if (emitting)
STATS_PAD_JMPS_ADD(f->flags, num_no_pad_exits, 1);
});
}
}
}
if (emitting)
inst->offset = offset;
offset += instr_length(dcontext, inst);
}
return start_shift;
}
/* Emits at pc a spill of xax (to the TLS slot tls_offs when shared, else to
 * XAX_OFFSET in the dcontext) and returns the pc past the emitted instruction.
 * Thin wrapper around insert_spill_or_restore().
 */
static cache_pc
insert_save_xax(dcontext_t *dcontext, cache_pc pc, uint flags, bool shared,
                ushort tls_offs, bool require_addr16)
{
    const bool is_spill = true;
    cache_pc next_pc = insert_spill_or_restore(dcontext, pc, flags, is_spill, shared,
                                               REG_XAX, tls_offs, XAX_OFFSET,
                                               require_addr16);
    return next_pc;
}
/* Emits at pc a restore of xax (from the TLS slot tls_offs when shared, else from
 * XAX_OFFSET in the dcontext) and returns the pc past the emitted instruction.
 * Thin wrapper around insert_spill_or_restore().
 */
static cache_pc
insert_restore_xax(dcontext_t *dcontext, cache_pc pc, uint flags, bool shared,
                   ushort tls_offs, bool require_addr16)
{
    const bool is_spill = false;
    cache_pc next_pc = insert_spill_or_restore(dcontext, pc, flags, is_spill, shared,
                                               REG_XAX, tls_offs, XAX_OFFSET,
                                               require_addr16);
    return next_pc;
}
* lookup routine is encoded earlier into a template,
* (in the routine emit_inline_ibl_stub(), below)
* which we copy here and fix up the linkstub ptr for.
* when the hashtable changes, the mask and table are
* updated in update_indirect_exit_stub(), below.
*/
static byte *
insert_inlined_ibl(dcontext_t *dcontext, fragment_t *f, linkstub_t *l, byte *pc,
cache_pc unlinked_exit_target, uint flags)
{
ibl_code_t *ibl_code =
get_ibl_routine_code(dcontext, extract_branchtype(l->flags), f->flags);
byte *start_pc = pc;
cache_pc linked_exit_target = get_linked_entry(dcontext, unlinked_exit_target);
IF_X64(ASSERT_NOT_IMPLEMENTED(false));
ASSERT(linkstub_owned_by_fragment(dcontext, f, l));
ASSERT(ibl_code->ibl_head_is_inlined);
ASSERT(EXIT_HAS_STUB(l->flags, f->flags));
memcpy(start_pc, ibl_code->inline_ibl_stub_template, ibl_code->inline_stub_length);
patch_branch(FRAG_ISA_MODE(f->flags), EXIT_CTI_PC(f, l),
start_pc + ibl_code->inline_unlink_offs, NOT_HOT_PATCHABLE);
if (DYNAMO_OPTION(indirect_stubs)) {
if (DYNAMO_OPTION(atomic_inlined_linking)) {
insert_relative_target(start_pc + ibl_code->inline_linkedjmp_offs,
linked_exit_target, NOT_HOT_PATCHABLE);
insert_relative_target(start_pc + ibl_code->inline_unlinkedjmp_offs,
unlinked_exit_target, NOT_HOT_PATCHABLE);
} else {
insert_relative_target(start_pc + ibl_code->inline_linkedjmp_offs,
unlinked_exit_target, NOT_HOT_PATCHABLE);
}
pc = start_pc + ibl_code->inline_linkstub_first_offs;
IF_X64(ASSERT_NOT_IMPLEMENTED(false));
*((uint *)vmcode_get_writable_addr(pc)) = (uint)(ptr_uint_t)l;
if (DYNAMO_OPTION(atomic_inlined_linking)) {
pc = start_pc + ibl_code->inline_linkstub_second_offs;
IF_X64(ASSERT_NOT_IMPLEMENTED(false));
*((uint *)vmcode_get_writable_addr(pc)) = (uint)(ptr_uint_t)l;
}
} else {
insert_relative_target(start_pc + ibl_code->inline_linkedjmp_offs,
linked_exit_target, NOT_HOT_PATCHABLE);
insert_relative_target(
start_pc + ibl_code->inline_unlink_offs +
1 ,
unlinked_exit_target, NOT_HOT_PATCHABLE);
}
return start_pc + ibl_code->inline_stub_length;
}
* emitted code in bytes. This routine assumes that the caller will
* take care of any cache synchronization necessary (though none is
* necessary on the Pentium).
* The stub is unlinked initially, except coarse grain indirect exits,
* which are always linked.
*/
int
insert_exit_stub_other_flags(dcontext_t *dcontext, fragment_t *f, linkstub_t *l,
cache_pc stub_pc, ushort l_flags)
{
byte *pc = (byte *)stub_pc;
cache_pc exit_target;
bool indirect = false;
bool can_inline = true;
ASSERT(linkstub_owned_by_fragment(dcontext, f, l));
if (LINKSTUB_DIRECT(l_flags)) {
if (TEST(FRAG_COARSE_GRAIN, f->flags)) {
exit_target = fcache_return_coarse_prefix(stub_pc, NULL);
ASSERT(exit_target != NULL);
} else
exit_target = get_direct_exit_target(dcontext, f->flags);
} else {
ASSERT(LINKSTUB_INDIRECT(l_flags));
ASSERT(EXIT_HAS_STUB(l_flags, f->flags));
if (TEST(FRAG_COARSE_GRAIN, f->flags)) {
exit_target =
get_coarse_ibl_prefix(dcontext, stub_pc, extract_branchtype(l_flags));
ASSERT(exit_target != NULL);
} else {
exit_target = get_unlinked_entry(dcontext, EXIT_TARGET_TAG(dcontext, f, l));
}
indirect = true;
#ifdef WINDOWS
can_inline = (exit_target != unlinked_shared_syscall_routine(dcontext));
#endif
if (can_inline) {
ibl_code_t *ibl_code =
get_ibl_routine_code(dcontext, extract_branchtype(l_flags), f->flags);
if (!ibl_code->ibl_head_is_inlined)
can_inline = false;
}
}
if (indirect && can_inline) {
pc = insert_inlined_ibl(dcontext, f, l, pc, exit_target, f->flags);
IF_X64(ASSERT(CHECK_TRUNCATE_TYPE_int(pc - stub_pc)));
return (int)(pc - stub_pc);
}
if (indirect) {
pc = insert_jmp_to_ibl(pc, f, l, exit_target, dcontext);
} else if (TEST(FRAG_COARSE_GRAIN, f->flags)) {
* so we store target info to memory instead of a register.
* The exact bytes used here are assumed by entrance_stub_target_tag().
*/
#ifdef X64
if (!FRAG_IS_32(f->flags)) {
app_pc tgt = EXIT_TARGET_TAG(dcontext, f, l);
* assume that the addr prefix is present for 32-bit but not 64-bit.
*/
pc = vmcode_get_writable_addr(pc);
*pc = TLS_SEG_OPCODE;
pc++;
*pc = MOV_IMM2MEM_OPCODE;
pc++;
*pc = MODRM_BYTE(0 , 0 , 4 );
pc++;
*pc = SIB_DISP32;
pc++;
*((uint *)pc) = (uint)os_tls_offset(DIRECT_STUB_SPILL_SLOT);
pc += 4;
*((uint *)pc) = (uint)(ptr_uint_t)tgt;
pc += 4;
*pc = TLS_SEG_OPCODE;
pc++;
*pc = MOV_IMM2MEM_OPCODE;
pc++;
*pc = MODRM_BYTE(0 , 0 , 4 );
pc++;
*pc = SIB_DISP32;
pc++;
*((uint *)pc) = 4 + (uint)os_tls_offset(DIRECT_STUB_SPILL_SLOT);
pc += 4;
*((uint *)pc) = (uint)(((ptr_uint_t)tgt) >> 32);
pc += 4;
pc = vmcode_get_executable_addr(pc);
} else {
#endif
* Both entrance_stub_target_tag() and coarse_indirect_stub_jmp_target()
* assume that the addr prefix is present for 32-bit but not 64-bit.
*/
pc = vmcode_get_writable_addr(pc);
*pc = ADDR_PREFIX_OPCODE;
pc++;
*pc = TLS_SEG_OPCODE;
pc++;
*pc = MOV_IMM2MEM_OPCODE;
pc++;
*pc = MODRM16_DISP16;
pc++;
*((ushort *)pc) = os_tls_offset(DIRECT_STUB_SPILL_SLOT);
pc += 2;
*((uint *)pc) = (uint)(ptr_uint_t)EXIT_TARGET_TAG(dcontext, f, l);
pc += 4;
pc = vmcode_get_executable_addr(pc);
#ifdef X64
}
#endif
pc = insert_relative_jump(pc, exit_target, NOT_HOT_PATCHABLE);
} else {
note that indirect stubs use XBX for linkstub pointer */
pc = insert_save_xax(dcontext, pc, f->flags, FRAG_DB_SHARED(f->flags),
DIRECT_STUB_SPILL_SLOT, true);
#ifdef X64
if (FRAG_IS_32(f->flags)) {
* WOW64 mixed-mode but long-term for 64-bit flexibility (i#774) we
* may need to store the other half of the pointer somewhere
*/
uint l_uint;
ASSERT_TRUNCATE(l_uint, uint, (ptr_uint_t)l);
l_uint = (uint)(ptr_uint_t)l;
*(vmcode_get_writable_addr(pc)) = MOV_IMM2XAX_OPCODE;
pc++;
*((uint *)vmcode_get_writable_addr(pc)) = l_uint;
pc += sizeof(l_uint);
} else {
*(vmcode_get_writable_addr(pc)) =
REX_PREFIX_BASE_OPCODE | REX_PREFIX_W_OPFLAG;
pc++;
#endif
*(vmcode_get_writable_addr(pc)) = MOV_IMM2XAX_OPCODE;
pc++;
*((ptr_uint_t *)vmcode_get_writable_addr(pc)) = (ptr_uint_t)l;
pc += sizeof(l);
#ifdef X64
}
#endif
pc = insert_relative_jump(pc, exit_target, NOT_HOT_PATCHABLE);
}
IF_X64(ASSERT(CHECK_TRUNCATE_TYPE_int(pc - stub_pc)));
return (int)(pc - stub_pc);
}
/* Returns the address of the 4-byte displacement field of the exit cti that starts
 * at branch_pc.  The field is taken to be the last 4 bytes of the instruction,
 * whose length is determined from its leading raw bytes (optional jcc branch-hint
 * prefix, optional addr prefix, then the opcode).
 */
cache_pc
exit_cti_disp_pc(cache_pc branch_pc)
{
    cache_pc byte_ptr = branch_pc;
    byte opcode = *byte_ptr;
    uint length = 0;

    if (opcode == RAW_PREFIX_jcc_taken || opcode == RAW_PREFIX_jcc_not_taken) {
        length++;
        byte_ptr++;
        opcode = *byte_ptr;
        /* Branch hints are only expected on jcc instrs: if present on
         * other ctis we strip them out during mangling (i#435)
         */
        ASSERT(opcode == RAW_OPCODE_jcc_byte1);
    }
    if (opcode == ADDR_PREFIX_OPCODE) {
        length++;
        byte_ptr++;
        opcode = *byte_ptr;
    }

    if (opcode >= RAW_OPCODE_loop_start && opcode <= RAW_OPCODE_loop_end) {
        /* A rewritten short cti (opcode in the RAW_OPCODE_loop_* range):
         * target pc is in last 4 bytes of "9-byte instruction"
         */
        length += CTI_SHORT_REWRITE_LENGTH;
    } else if (opcode == RAW_OPCODE_jcc_byte1) {
        /* 2-byte opcode, 6-byte instruction, except for branch hint */
        ASSERT(*(byte_ptr + 1) >= RAW_OPCODE_jcc_byte2_start &&
               *(byte_ptr + 1) <= RAW_OPCODE_jcc_byte2_end);
        length += CBR_LONG_LENGTH;
    } else {
        /* Anything else must be a long jmp (or a call with hot patching). */
#ifdef HOT_PATCHING_INTERFACE
        ASSERT(opcode == RAW_OPCODE_jmp || opcode == RAW_OPCODE_call);
#else
        ASSERT(opcode == RAW_OPCODE_jmp);
#endif
        length += JMP_LONG_LENGTH;
    }
    /* The displacement is the final 4 bytes of the instruction. */
    return branch_pc + length - 4;
}
* to a thread executing in the cache unless using the atomic_inlined_linking
* option (unlike unlinking)
*/
void
link_indirect_exit_arch(dcontext_t *dcontext, fragment_t *f, linkstub_t *l,
bool hot_patch, app_pc target_tag)
{
uint stub_size;
app_pc exit_target, cur_target, pc;
* on the cti targets, we must calculate them at a consistent
* state (we do have multi-stage modifications for inlined stubs)
*/
byte *stub_pc = (byte *)EXIT_STUB_PC(dcontext, f, l);
if (DYNAMO_OPTION(indirect_stubs)) {
stub_size = exit_stub_size(dcontext, target_tag, f->flags);
pc = stub_pc + stub_size - 5;
} else {
* except for -unsafe_ignore_eflags_trace stay-on-trace cmp,jne
*/
pc = EXIT_CTI_PC(f, l);
if (*pc == JNE_OPCODE_1) {
ASSERT(TEST(FRAG_IS_TRACE, f->flags));
#ifndef X64
ASSERT(INTERNAL_OPTION(unsafe_ignore_eflags_trace));
#endif
cur_target = (app_pc)PC_RELATIVE_TARGET(pc + 2);
exit_target = get_linked_entry(dcontext, cur_target);
pc += 2;
pc = insert_relative_target(pc, exit_target, hot_patch);
return;
} else {
ASSERT(*pc == JMP_OPCODE);
}
}
cur_target = (app_pc)PC_RELATIVE_TARGET(pc + 1);
exit_target = get_linked_entry(dcontext, cur_target);
pc++;
pc = insert_relative_target(pc, exit_target, hot_patch);
}
/* Returns the stub pc for the indirect exit l of fragment f, computed from the
 * current target of its exit cti, or NULL if the exit has no stub.
 */
cache_pc
indirect_linkstub_stub_pc(dcontext_t *dcontext, fragment_t *f, linkstub_t *l)
{
    cache_pc cti = EXIT_CTI_PC(f, l);
    cache_pc stub;
    if (!EXIT_HAS_STUB(l->flags, f->flags))
        return NULL;
    if (*cti == JNE_OPCODE_1) {
        /* Stay-on-trace jne in a trace: 2-byte opcode. */
        ASSERT(TEST(FRAG_IS_TRACE, f->flags));
#ifndef X64
        ASSERT(INTERNAL_OPTION(unsafe_ignore_eflags_trace));
#endif
        stub = (cache_pc)PC_RELATIVE_TARGET(cti + 2 /*opcode bytes*/);
    } else {
        if (*cti == JMP_OPCODE) {
            stub = (cache_pc)PC_RELATIVE_TARGET(cti + 1 /*opcode byte*/);
        } else {
            /* Otherwise the cti location is itself a coarse indirect stub. */
            ASSERT(TEST(FRAG_COARSE_GRAIN, f->flags));
            ASSERT(coarse_is_indirect_stub(cti));
            stub = cti;
        }
    }
    ASSERT(stub >= cti && (stub - cti) <= MAX_FRAGMENT_SIZE);
    if (!TEST(LINK_LINKED, l->flags)) {
        /* When not linked, back up by the unlink entry offset to get the stub start. */
        stub -= linkstub_unlink_entry_offset(dcontext, f, l);
        /* FIXME: for -no_indirect_stubs we could point the exit cti directly
         * at unlink ibl routine if we could find the stub target for
         * linking here...should consider storing stub pc for ind exits
         * for that case to save 5 bytes in the inlined stub
         */
    }
    return stub;
}
* decoding to find their length
*/
cache_pc
cbr_fallthrough_exit_cti(cache_pc prev_cti_pc)
{
if (*prev_cti_pc == RAW_PREFIX_jcc_taken || *prev_cti_pc == RAW_PREFIX_jcc_not_taken)
prev_cti_pc++;
return (prev_cti_pc + CBR_LONG_LENGTH);
}
* cache (barring ifdef NATIVE_RETURN, which is now removed), for
* inlined indirect exits the
* unlinked path of the ibl routine detects the race condition between the
* two patching writes and handles it appropriately unless using the
* atomic_inlined_linking option in which case there is only one patching
* write (since tail is duplicated) */
void
unlink_indirect_exit(dcontext_t *dcontext, fragment_t *f, linkstub_t *l)
{
uint stub_size;
cache_pc exit_target, cur_target;
app_pc target_tag = EXIT_TARGET_TAG(dcontext, f, l);
byte *pc;
ibl_code_t *ibl_code = NULL;
* on the cti targets, we must calculate them at a consistent
* state (we do have multi-stage modifications for inlined stubs)
*/
byte *stub_pc = (byte *)EXIT_STUB_PC(dcontext, f, l);
ASSERT(!TEST(FRAG_COARSE_GRAIN, f->flags));
ASSERT(linkstub_owned_by_fragment(dcontext, f, l));
ASSERT(LINKSTUB_INDIRECT(l->flags));
if (!TEST(LINK_LINKED, l->flags))
return;
#ifdef WINDOWS
if (!is_shared_syscall_routine(dcontext, target_tag)) {
#endif
ibl_code = get_ibl_routine_code(dcontext, extract_branchtype(l->flags), f->flags);
#ifdef WINDOWS
}
#endif
if ((!DYNAMO_OPTION(atomic_inlined_linking) && DYNAMO_OPTION(indirect_stubs)) ||
#ifdef WINDOWS
target_tag ==
shared_syscall_routine_ex(
dcontext _IF_X64(FRAGMENT_GENCODE_MODE(f->flags))) ||
#endif
* target the unlinked ibl entry but we don't yet -- see FIXME in
* emit_inline_ibl_stub()
*/
!ibl_code->ibl_head_is_inlined) {
if (DYNAMO_OPTION(indirect_stubs)) {
stub_size = exit_stub_size(dcontext, target_tag, f->flags);
pc = stub_pc + stub_size - 5;
} else {
pc = EXIT_CTI_PC(f, l);
if (*pc == JNE_OPCODE_1) {
ASSERT(TEST(FRAG_IS_TRACE, f->flags));
#ifndef X64
ASSERT(INTERNAL_OPTION(unsafe_ignore_eflags_trace));
#endif
pc++;
} else
ASSERT(*pc == JMP_OPCODE);
}
cur_target = (cache_pc)PC_RELATIVE_TARGET(pc + 1);
exit_target = get_unlinked_entry(dcontext, cur_target);
pc++;
pc = insert_relative_target(pc, exit_target, HOT_PATCHABLE);
}
* the ending jmp (above) first so that the unlinked path can detect the
* race condition case */
#ifdef WINDOWS
* yet inconsistent
*/
if (target_tag !=
shared_syscall_routine_ex(dcontext _IF_X64(FRAGMENT_GENCODE_MODE(f->flags)))) {
#endif
if (ibl_code->ibl_head_is_inlined) {
cache_pc target = stub_pc;
target += ibl_code->inline_unlink_offs;
patch_branch(FRAG_ISA_MODE(f->flags), EXIT_CTI_PC(f, l), target,
HOT_PATCHABLE);
}
#ifdef WINDOWS
}
#endif
}
/*******************************************************************************
 * COARSE-GRAIN FRAGMENT SUPPORT
 */
/* Returns the pc of the jmp that ends the coarse direct (entrance) stub starting
 * at stub: the last JMP_LONG_LENGTH bytes of the stub.  In X64 builds a stub whose
 * first byte is 0x65 (the x86 GS segment-override prefix) is sized as a 64-bit
 * stub; every other stub is sized as a 32-bit stub.
 */
cache_pc
entrance_stub_jmp(cache_pc stub)
{
    cache_pc stub_end = stub + STUB_COARSE_DIRECT_SIZE32;
#ifdef X64
    if (*stub == 0x65)
        stub_end = stub + STUB_COARSE_DIRECT_SIZE64;
#endif
    return (stub_end - JMP_LONG_LENGTH);
}
* or a coarse indirect stub. FIXME: if we separate coarse indirect
* stubs from bodies we'll need to put them somewhere else, or fix up
* decode_fragment() to be able to distinguish them in some other way
* like first instruction tls slot.
*/
bool
coarse_is_entrance_stub(cache_pc stub)
{
bool res = false;
coarse_info_t *info = get_stub_coarse_info(stub);
if (info != NULL) {
res = ALIGNED(stub, coarse_stub_alignment(info)) &&
*entrance_stub_jmp(stub) == JMP_OPCODE;
DOCHECK(1, {
if (res) {
cache_pc tgt = entrance_stub_jmp_target(stub);
ASSERT(!in_fcache(stub));
ASSERT(tgt == trace_head_return_coarse_prefix(stub, info) ||
tgt == fcache_return_coarse_prefix(stub, info) ||
in_fcache(tgt));
}
});
}
return res;
}
/*###########################################################################
 *
 * fragment_t Prefixes
 *
 * Two types: indirect branch target, which restores eflags and xcx, and
 * normal prefix, which just restores xcx
 */
/* Evaluates to true in X64 builds and to SHARED_IB_TARGETS() otherwise.  Used below
 * to select the TLS form when sizing and emitting the xax restore of a prefix.
 */
#define IBL_EFLAGS_IN_TLS() (IF_X64_ELSE(true, SHARED_IB_TARGETS()))
/* Indirect branch target prefix:
 * We have 3 different prefixes: one if we don't need to restore eflags, one
 * if we need to restore just using sahf, and one if we also need to restore
 * the overflow flag OF.
 *
 * FIXME: currently we cache-align the prefix, not the normal
 * entry point...if prefix gets much longer, might want to add
 * nops to get normal entry cache-aligned?
 */
#ifdef X86
/* RESTORE_XAX_PREFIX(flags): size in bytes of the xax restore in a prefix.
 * It is SIZE64_MOV_R8_TO_XAX for x86-to-x64 fragments when (X64 builds only) the
 * x86_to_x64_ibl_opt option is on; otherwise the TLS mov size if
 * IBL_EFLAGS_IN_TLS(), else SIZE32_MOV_XAX_TO_ABS.
 */
# define RESTORE_XAX_PREFIX(flags) \
((FRAG_IS_X86_TO_X64(flags) && \
IF_X64_ELSE(DYNAMO_OPTION(x86_to_x64_ibl_opt), false)) \
? SIZE64_MOV_R8_TO_XAX \
: (IBL_EFLAGS_IN_TLS() ? SIZE_MOV_XAX_TO_TLS(flags, false) \
: SIZE32_MOV_XAX_TO_ABS))
/* PREFIX_BASE(flags): the xax restore size plus FRAGMENT_BASE_PREFIX_SIZE(flags). */
# define PREFIX_BASE(flags) \
(RESTORE_XAX_PREFIX(flags) + FRAGMENT_BASE_PREFIX_SIZE(flags))
#else
/* Non-X86 builds: both macros assert not-implemented and evaluate to 0. */
# define RESTORE_XAX_PREFIX(flags) (ASSERT_NOT_IMPLEMENTED(false), 0)
# define PREFIX_BASE(flags) (ASSERT_NOT_IMPLEMENTED(false), 0)
#endif
/* Returns the size in bytes of the indirect-branch-target prefix for a fragment
 * with the given flags.  The result is PREFIX_BASE(flags), minus the xax restore
 * when it is not emitted, plus the sizes of the flag-restoring instructions that
 * are needed (see insert_fragment_prefix()).
 */
int
fragment_ibt_prefix_size(uint flags)
{
    bool use_eflags_restore = TEST(FRAG_IS_TRACE, flags)
        ? !DYNAMO_OPTION(trace_single_restore_prefix)
        : !DYNAMO_OPTION(bb_single_restore_prefix);
    /* The common case is that the xax restore is present, so
     * PREFIX_BASE(flags) is defined accordingly, and we subtract from it to
     * get the correct value when the option is on.
     */
    if (INTERNAL_OPTION(unsafe_ignore_eflags_prefix)) {
        if (INTERNAL_OPTION(unsafe_ignore_eflags_ibl)) {
            /* Neither eflags nor xax is restored. */
            ASSERT(PREFIX_BASE(flags) - RESTORE_XAX_PREFIX(flags) >= 0);
            return PREFIX_BASE(flags) - RESTORE_XAX_PREFIX(flags);
        } else {
            /* Still need to restore xax, just don't restore eflags. */
            return PREFIX_BASE(flags);
        }
    }
    if (!use_eflags_restore)
        return PREFIX_BASE(flags) - RESTORE_XAX_PREFIX(flags);
    if (TEST(FRAG_WRITES_EFLAGS_6, flags)) /* no flag restoration needed */
        return PREFIX_BASE(flags);
    else if (TEST(FRAG_WRITES_EFLAGS_OF, flags)) /* no OF restoration needed */
        return (PREFIX_BASE(flags) + PREFIX_SIZE_FIVE_EFLAGS);
    else {
        /* Must restore all 6 flags, unless told to ignore OF. */
        if (INTERNAL_OPTION(unsafe_ignore_overflow)) {
            return (PREFIX_BASE(flags) + PREFIX_SIZE_FIVE_EFLAGS);
        } else {
            return (PREFIX_BASE(flags) + PREFIX_SIZE_RESTORE_OF +
                    PREFIX_SIZE_FIVE_EFLAGS);
        }
    }
}
/* Emits at pc a restore of xcx and returns the pc past the emitted instruction.
 * Thin wrapper around insert_spill_or_restore().
 */
static cache_pc
insert_restore_xcx(dcontext_t *dcontext, cache_pc pc, uint flags, bool require_addr16)
{
    /* XCX_IN_TLS(flags) selects between the tls slot and the dcontext slot:
     * this works b/c the shared ibl copies the app xcx to both places!
     * private_ib_in_tls option makes all prefixes use tls
     */
    return insert_spill_or_restore(dcontext, pc, flags, false /*restore*/,
                                   XCX_IN_TLS(flags), REG_XCX, MANGLE_XCX_SPILL_SLOT,
                                   XCX_OFFSET, require_addr16);
}
/* Emits at pc the prefix instruction that restores reg (REG_XAX or REG_XCX only)
 * for fragment f and returns the pc past the emitted instruction.
 */
static cache_pc
insert_restore_register(dcontext_t *dcontext, fragment_t *f, cache_pc pc, reg_id_t reg)
{
    ASSERT(reg == REG_XAX || reg == REG_XCX);
#ifdef X64
    if (FRAG_IS_X86_TO_X64(f->flags) && DYNAMO_OPTION(x86_to_x64_ibl_opt)) {
        /* In x86_to_x64 mode with this option, restore via a reg-to-reg mov:
         * to restore rax: 49 8b c0 mov %r8 -> %rax
         * to restore rcx: 49 8b c9 mov %r9 -> %rcx
         */
        pc = vmcode_get_writable_addr(pc);
        *pc = REX_PREFIX_BASE_OPCODE | REX_PREFIX_W_OPFLAG | REX_PREFIX_B_OPFLAG;
        pc++;
        *pc = MOV_MEM2REG_OPCODE;
        pc++;
        *pc = MODRM_BYTE(3 /*mod*/, reg_get_bits(reg),
                         reg_get_bits((reg == REG_XAX) ? REG_R8 : REG_R9));
        pc++;
        pc = vmcode_get_executable_addr(pc);
    } else {
#endif
        /* Otherwise restore from the spill slot. */
        pc = (reg == REG_XAX)
            ? insert_restore_xax(dcontext, pc, f->flags, IBL_EFLAGS_IN_TLS(),
                                 PREFIX_XAX_SPILL_SLOT, false)
            : insert_restore_xcx(dcontext, pc, f->flags, false);
#ifdef X64
    }
#endif
    return pc;
}
/* Emits the prefix for fragment f at f->start_pc and records its length in
 * f->prefix_size (which must be 0 on entry).
 * For fragments using an ibt prefix this is, as needed: an OF restore (add $0x7f,
 * %al), sahf, an xax restore, and then always an xcx restore.  Other fragments get
 * only an xcx restore, and only when the bb_prefixes option is set.
 * The resulting size must equal fragment_prefix_size(f->flags).
 */
void
insert_fragment_prefix(dcontext_t *dcontext, fragment_t *f)
{
    byte *pc = (byte *)f->start_pc;
    bool insert_eflags_xax_restore = TEST(FRAG_IS_TRACE, f->flags)
        ? !DYNAMO_OPTION(trace_single_restore_prefix)
        : !DYNAMO_OPTION(bb_single_restore_prefix);
    ASSERT(f->prefix_size == 0);

    if (use_ibt_prefix(f->flags)) {
        if ((!INTERNAL_OPTION(unsafe_ignore_eflags_prefix) ||
             !INTERNAL_OPTION(unsafe_ignore_eflags_ibl)) &&
            insert_eflags_xax_restore) {
            if (!INTERNAL_OPTION(unsafe_ignore_eflags_prefix) &&
                !TEST(FRAG_WRITES_EFLAGS_6, f->flags)) {
                if (!TEST(FRAG_WRITES_EFLAGS_OF, f->flags) &&
                    !INTERNAL_OPTION(unsafe_ignore_overflow)) {
                    DEBUG_DECLARE(byte *restore_of_prefix_pc = pc;)
                    /* Must restore OF:
                     * we did a seto on %al, so we restore OF by adding 0x7f to
                     * %al (7f not ff b/c add only sets OF for signed operands,
                     * sets CF for uint)
                     */
                    STATS_INC(num_oflag_prefix_restore);
                    pc = vmcode_get_writable_addr(pc);
                    /* add $0x7f, %al */
                    *pc = ADD_AL_OPCODE;
                    pc++;
                    *pc = 0x7f;
                    pc++;
                    pc = vmcode_get_executable_addr(pc);
                    ASSERT(pc - restore_of_prefix_pc == PREFIX_SIZE_RESTORE_OF);
                }

                /* Restore the other 5 flags w/ sahf. */
                *vmcode_get_writable_addr(pc) = SAHF_OPCODE;
                pc++;
                ASSERT(PREFIX_SIZE_FIVE_EFLAGS == 1);
            }
            /* Restore xax. */
            pc = insert_restore_register(dcontext, f, pc, REG_XAX);
        }

        /* Restore xcx. */
        pc = insert_restore_register(dcontext, f, pc, REG_XCX);

        /* Set the normal entry point to be next pc. */
        ASSERT_TRUNCATE(f->prefix_size, byte, ((cache_pc)pc) - f->start_pc);
        f->prefix_size = (byte)(((cache_pc)pc) - f->start_pc);
    } else {
        if (dynamo_options.bb_prefixes) {
            pc = insert_restore_register(dcontext, f, pc, REG_XCX);

            /* Set the normal entry point to be next pc. */
            ASSERT_TRUNCATE(f->prefix_size, byte, ((cache_pc)pc) - f->start_pc);
            f->prefix_size = (byte)(((cache_pc)pc) - f->start_pc);
        } /* else, no prefix */
    }
    /* Make sure emitted size matches size we requested. */
    ASSERT(f->prefix_size == fragment_prefix_size(f->flags));
}
/* OPND_ARG1: operand used below to load the first argument of the generated
 * routine: the RCX register on 64-bit Windows, the RDI register on other 64-bit
 * builds, and the 32-bit memory operand at 4(%esp) on 32-bit builds.
 */
#ifdef X64
# ifdef WINDOWS
# define OPND_ARG1 opnd_create_reg(REG_RCX)
# else
# define OPND_ARG1 opnd_create_reg(REG_RDI)
# endif
#else
# define OPND_ARG1 OPND_CREATE_MEM32(REG_ESP, 4)
#endif
/* Appends to ilist the start of the fcache-enter sequence.
 * When !absolute, REG_DCXT is loaded from OPND_ARG1 (and, if SELFPROT_DCONTEXT is
 * in protect_mask, REG_DCXT_PROT is loaded from PROT_OFFS).
 * On UNIX the code then compares the 1-byte dcontext field at SIGPENDING_OFFSET
 * with 0: if it is greater than 0, the generated code returns immediately (first
 * restoring REG_DCXT from xax when !absolute); otherwise it falls through to the
 * no_signals label, after which the caller's instructions continue.
 */
void
append_fcache_enter_prologue(dcontext_t *dcontext, instrlist_t *ilist, bool absolute)
{
#ifdef UNIX
    /* Branch target for the no-pending-signal path. */
    instr_t *no_signals = INSTR_CREATE_label(dcontext);
#endif
    if (!absolute) {
#ifdef UNIX
        /* Save the incoming REG_DCXT value in xax so it can be put back if we
         * return early below.
         */
        APP(ilist,
            XINST_CREATE_move(dcontext, opnd_create_reg(REG_XAX),
                              opnd_create_reg(REG_DCXT)));
#endif
        /* Grab the dcontext pointer from the first argument. */
        APP(ilist, XINST_CREATE_load(dcontext, opnd_create_reg(REG_DCXT), OPND_ARG1));
        if (TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask))
            APP(ilist, RESTORE_FROM_DC(dcontext, REG_DCXT_PROT, PROT_OFFS));
    }
#ifdef UNIX
    /* cmp <sigpending field>, 0; jle no_signals */
    APP(ilist,
        XINST_CREATE_cmp(dcontext,
                         OPND_DC_FIELD(absolute, dcontext, OPSZ_1, SIGPENDING_OFFSET),
                         OPND_CREATE_INT8(0)));
    APP(ilist, INSTR_CREATE_jcc(dcontext, OP_jle, opnd_create_instr(no_signals)));
    if (!absolute) {
        /* Put back the REG_DCXT value saved above before returning. */
        APP(ilist,
            XINST_CREATE_move(dcontext, opnd_create_reg(REG_DCXT),
                              opnd_create_reg(REG_XAX)));
    }
    APP(ilist, XINST_CREATE_return(dcontext));
    APP(ilist, no_signals);
#endif
}
* if (EXIT_DR_HOOK != NULL && !dcontext->ignore_enterexit)
* if (!absolute)
* push %xdi
* push %xsi
* else
* # support for skipping the hook
* RESTORE_FROM_UPCONTEXT ignore_enterexit_OFFSET,%edi
* cmpl %edi,0
* jnz post_hook
* endif
* call EXIT_DR_HOOK # for x64 windows, reserve 32 bytes stack space for call
* if (!absolute)
* pop %xsi
* pop %xdi
* endif
* endif
* post_hook:
*/
void
append_call_exit_dr_hook(dcontext_t *dcontext, instrlist_t *ilist, bool absolute,
bool shared)
{
instr_t *post_hook = INSTR_CREATE_label(dcontext);
if (EXIT_DR_HOOK != NULL) {
if (!absolute) {
* for x64, they're supposed to be callee-saved on windows,
* but not linux (though we could move to r12-r15 on linux
* instead of pushing them).
*/
APP(ilist, INSTR_CREATE_push(dcontext, opnd_create_reg(REG_XDI)));
APP(ilist, INSTR_CREATE_push(dcontext, opnd_create_reg(REG_XSI)));
}
#ifdef WINDOWS
else {
* dcontext->ignore_enterexit. This is a perf hit to check: could
* instead have a space hit via a separate routine. This is only
* needed right now for NtSuspendThread handling (see case 4942).
*/
APP(ilist, RESTORE_FROM_DC(dcontext, REG_EDI, IGNORE_ENTEREXIT_OFFSET));
APP(ilist,
INSTR_CREATE_test(dcontext, opnd_create_reg(REG_EDI),
opnd_create_reg(REG_EDI)));
APP(ilist, INSTR_CREATE_jcc(dcontext, OP_jnz, opnd_create_instr(post_hook)));
}
#endif
* since x64 windows requires 32 bytes of stack space even w/ no args,
* and we don't want anyone clobbering our pushed registers!
*/
dr_insert_call((void *)dcontext, ilist, NULL , (void *)EXIT_DR_HOOK, 0);
if (!absolute) {
APP(ilist, INSTR_CREATE_pop(dcontext, opnd_create_reg(REG_XSI)));
APP(ilist, INSTR_CREATE_pop(dcontext, opnd_create_reg(REG_XDI)));
}
}
APP(ilist, post_hook );
}
* # restore the original register state
* RESTORE_FROM_UPCONTEXT xflags_OFFSET,%xax
* push %xax
* popf # restore eflags temporarily using dstack
*/
void
append_restore_xflags(dcontext_t *dcontext, instrlist_t *ilist, bool absolute)
{
APP(ilist, RESTORE_FROM_DC(dcontext, SCRATCH_REG0, XFLAGS_OFFSET));
APP(ilist, INSTR_CREATE_push(dcontext, opnd_create_reg(SCRATCH_REG0)));
APP(ilist, INSTR_CREATE_RAW_popf(dcontext));
}
* if preserve_xmm_caller_saved
* RESTORE_FROM_UPCONTEXT xmm_OFFSET+0*16,%xmm0
* RESTORE_FROM_UPCONTEXT xmm_OFFSET+1*16,%xmm1
* RESTORE_FROM_UPCONTEXT xmm_OFFSET+2*16,%xmm2
* RESTORE_FROM_UPCONTEXT xmm_OFFSET+3*16,%xmm3
* RESTORE_FROM_UPCONTEXT xmm_OFFSET+4*16,%xmm4
* RESTORE_FROM_UPCONTEXT xmm_OFFSET+5*16,%xmm5
* RESTORE_FROM_UPCONTEXT xmm_OFFSET+6*16,%xmm6 # 32-bit Linux
* RESTORE_FROM_UPCONTEXT xmm_OFFSET+7*16,%xmm7 # 32-bit Linux
* endif
*/
void
append_restore_simd_reg(dcontext_t *dcontext, instrlist_t *ilist, bool absolute)
{
ASSERT(preserve_xmm_caller_saved() || !ZMM_ENABLED());
if (!preserve_xmm_caller_saved())
return;
* Rather than try and optimize we save/restore on every cxt
* sw. The xmm field is aligned, so we can use movdqa/movaps,
* though movdqu is stated to be as fast as movdqa when aligned:
* but if so, why have two versions? Is it only loads and not stores
* for which that is true? => PR 266305.
* It's not clear that movdqa is any faster (and its opcode is longer):
* movdqa and movaps are listed as the same latency and throughput in
* the AMD optimization guide. Yet examples of fast memcpy online seem
* to use movdqa when sse2 is available.
* Note that mov[au]p[sd] and movdq[au] are functionally equivalent.
*/
* cost of vmovdqu and whether worth arranging 32-byte alignment
*/
int i;
uint opcode = move_mm_reg_opcode(true , true );
ASSERT(proc_has_feature(FEATURE_SSE));
instr_t *post_restore = NULL;
instr_t *pre_avx512_restore = NULL;
if (ZMM_ENABLED()) {
post_restore = INSTR_CREATE_label(dcontext);
pre_avx512_restore = INSTR_CREATE_label(dcontext);
APP(ilist,
INSTR_CREATE_cmp(
dcontext,
OPND_CREATE_ABSMEM(
vmcode_get_executable_addr((byte *)d_r_avx512_code_in_use), OPSZ_1),
OPND_CREATE_INT8(0)));
APP(ilist,
INSTR_CREATE_jcc(dcontext, OP_jnz, opnd_create_instr(pre_avx512_restore)));
}
for (i = 0; i < proc_num_simd_sse_avx_saved(); i++) {
APP(ilist,
instr_create_1dst_1src(dcontext, opcode,
opnd_create_reg(REG_SAVED_XMM0 + (reg_id_t)i),
OPND_DC_FIELD(absolute, dcontext, OPSZ_SAVED_XMM,
SIMD_OFFSET + i * MCXT_SIMD_SLOT_SIZE)));
}
if (ZMM_ENABLED()) {
APP(ilist, INSTR_CREATE_jmp(dcontext, opnd_create_instr(post_restore)));
APP(ilist, pre_avx512_restore );
uint opcode_avx512 = move_mm_avx512_reg_opcode(true );
for (i = 0; i < proc_num_simd_registers(); i++) {
APP(ilist,
instr_create_1dst_2src(
dcontext, opcode_avx512,
opnd_create_reg(DR_REG_START_ZMM + (reg_id_t)i),
opnd_create_reg(DR_REG_K0),
OPND_DC_FIELD(absolute, dcontext, OPSZ_SAVED_ZMM,
SIMD_OFFSET + i * MCXT_SIMD_SLOT_SIZE)));
}
for (i = 0; i < proc_num_opmask_registers(); i++) {
APP(ilist,
instr_create_1dst_1src(
dcontext, proc_has_feature(FEATURE_AVX512BW) ? OP_kmovq : OP_kmovw,
opnd_create_reg(DR_REG_START_OPMASK + (reg_id_t)i),
OPND_DC_FIELD(absolute, dcontext, OPSZ_SAVED_OPMASK,
OPMASK_OFFSET + i * OPMASK_AVX512BW_REG_SIZE)));
}
APP(ilist, post_restore );
}
}
* ifdef X64
* RESTORE_FROM_UPCONTEXT r8_OFFSET,%r8
* RESTORE_FROM_UPCONTEXT r9_OFFSET,%r9
* RESTORE_FROM_UPCONTEXT r10_OFFSET,%r10
* RESTORE_FROM_UPCONTEXT r11_OFFSET,%r11
* RESTORE_FROM_UPCONTEXT r12_OFFSET,%r12
* RESTORE_FROM_UPCONTEXT r13_OFFSET,%r13
* RESTORE_FROM_UPCONTEXT r14_OFFSET,%r14
* RESTORE_FROM_UPCONTEXT r15_OFFSET,%r15
* endif
* RESTORE_FROM_UPCONTEXT xax_OFFSET,%xax
* RESTORE_FROM_UPCONTEXT xbx_OFFSET,%xbx
* RESTORE_FROM_UPCONTEXT xcx_OFFSET,%xcx
* RESTORE_FROM_UPCONTEXT xdx_OFFSET,%xdx
* if (absolute || !TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask))
* RESTORE_FROM_UPCONTEXT xdx_OFFSET,%xdx
* if (absolute || !TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask))
* RESTORE_FROM_UPCONTEXT xsi_OFFSET,%xsi
* endif
* if (absolute || TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask))
* RESTORE_FROM_UPCONTEXT xdi_OFFSET,%xdi
* endif
* RESTORE_FROM_UPCONTEXT xbp_OFFSET,%xbp
* RESTORE_FROM_UPCONTEXT xsp_OFFSET,%xsp
* if (!absolute)
* if (TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask))
* RESTORE_FROM_UPCONTEXT xsi_OFFSET,%xsi
* else
* RESTORE_FROM_UPCONTEXT xdi_OFFSET,%xdi
* endif
* endif
*/
void
append_restore_gpr(dcontext_t *dcontext, instrlist_t *ilist, bool absolute)
{
#ifdef X64
APP(ilist, RESTORE_FROM_DC(dcontext, REG_R8, R8_OFFSET));
APP(ilist, RESTORE_FROM_DC(dcontext, REG_R9, R9_OFFSET));
APP(ilist, RESTORE_FROM_DC(dcontext, REG_R10, R10_OFFSET));
APP(ilist, RESTORE_FROM_DC(dcontext, REG_R11, R11_OFFSET));
APP(ilist, RESTORE_FROM_DC(dcontext, REG_R12, R12_OFFSET));
APP(ilist, RESTORE_FROM_DC(dcontext, REG_R13, R13_OFFSET));
APP(ilist, RESTORE_FROM_DC(dcontext, REG_R14, R14_OFFSET));
APP(ilist, RESTORE_FROM_DC(dcontext, REG_R15, R15_OFFSET));
#endif
APP(ilist, RESTORE_FROM_DC(dcontext, REG_XAX, XAX_OFFSET));
APP(ilist, RESTORE_FROM_DC(dcontext, REG_XBX, XBX_OFFSET));
APP(ilist, RESTORE_FROM_DC(dcontext, REG_XCX, XCX_OFFSET));
APP(ilist, RESTORE_FROM_DC(dcontext, REG_XDX, XDX_OFFSET));
if (absolute || !TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask))
APP(ilist, RESTORE_FROM_DC(dcontext, REG_XSI, XSI_OFFSET));
if (absolute || TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask))
APP(ilist, RESTORE_FROM_DC(dcontext, REG_XDI, XDI_OFFSET));
APP(ilist, RESTORE_FROM_DC(dcontext, REG_XBP, XBP_OFFSET));
APP(ilist, RESTORE_FROM_DC(dcontext, REG_XSP, XSP_OFFSET));
if (!absolute) {
if (TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask))
APP(ilist, RESTORE_FROM_DC(dcontext, REG_XSI, XSI_OFFSET));
else
APP(ilist, RESTORE_FROM_DC(dcontext, REG_XDI, XDI_OFFSET));
}
}
*
* if (!absolute)
* # get xax and xdi into their real slots, via xbx
* SAVE_TO_UPCONTEXT %xbx,xbx_OFFSET
* mov fs:xax_OFFSET,%xbx
* SAVE_TO_UPCONTEXT %xbx,xax_OFFSET
* mov fs:xdx_OFFSET,%xbx
* SAVE_TO_UPCONTEXT %xbx,xdi_OFFSET
* endif
*
* # save the current register state to dcontext's mcontext
* # xax already in context
*
* if (absolute)
* SAVE_TO_UPCONTEXT %xbx,xbx_OFFSET
* endif
* SAVE_TO_UPCONTEXT %xcx,xcx_OFFSET
* SAVE_TO_UPCONTEXT %xdx,xdx_OFFSET
* if (absolute || !TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask))
* SAVE_TO_UPCONTEXT %xsi,xsi_OFFSET
* endif
*
* # on X86
* if (absolute)
* SAVE_TO_UPCONTEXT %xdi,xdi_OFFSET
* endif
* SAVE_TO_UPCONTEXT %xbp,xbp_OFFSET
* SAVE_TO_UPCONTEXT %xsp,xsp_OFFSET
* ifdef X64
* SAVE_TO_UPCONTEXT %r8,r8_OFFSET
* SAVE_TO_UPCONTEXT %r9,r9_OFFSET
* SAVE_TO_UPCONTEXT %r10,r10_OFFSET
* SAVE_TO_UPCONTEXT %r11,r11_OFFSET
* SAVE_TO_UPCONTEXT %r12,r12_OFFSET
* SAVE_TO_UPCONTEXT %r13,r13_OFFSET
* SAVE_TO_UPCONTEXT %r14,r14_OFFSET
* SAVE_TO_UPCONTEXT %r15,r15_OFFSET
* endif
*/
void
append_save_gpr(dcontext_t *dcontext, instrlist_t *ilist, bool ibl_end, bool absolute,
generated_code_t *code, linkstub_t *linkstub, bool coarse_info)
{
if (!absolute) {
APP(ilist, SAVE_TO_DC(dcontext, REG_XBX, XBX_OFFSET));
APP(ilist, RESTORE_FROM_TLS(dcontext, REG_XBX, DIRECT_STUB_SPILL_SLOT));
if (linkstub != NULL) {
* is now in %xbx
*/
APP(ilist, SAVE_TO_DC(dcontext, REG_XAX, XAX_OFFSET));
APP(ilist, SAVE_TO_DC(dcontext, REG_XBX, NEXT_TAG_OFFSET));
APP(ilist,
INSTR_CREATE_mov_imm(dcontext, opnd_create_reg(REG_XAX),
OPND_CREATE_INTPTR((ptr_int_t)linkstub)));
if (coarse_info) {
APP(ilist, SAVE_TO_DC(dcontext, REG_XCX, COARSE_DIR_EXIT_OFFSET));
#ifdef X64
* cleaner: maybe a restore_indirect_branch_spill() or sthg,
* and IBL_REG to indirect xcx.
*/
if (GENCODE_IS_X86_TO_X64(code->gencode_mode) &&
DYNAMO_OPTION(x86_to_x64_ibl_opt))
APP(ilist, RESTORE_FROM_REG(dcontext, REG_XCX, REG_R9));
else {
#endif
APP(ilist,
RESTORE_FROM_TLS(dcontext, REG_XCX, MANGLE_XCX_SPILL_SLOT));
#ifdef X64
}
#endif
}
} else {
APP(ilist, SAVE_TO_DC(dcontext, REG_XBX, XAX_OFFSET));
}
APP(ilist, RESTORE_FROM_TLS(dcontext, REG_XBX, DCONTEXT_BASE_SPILL_SLOT));
APP(ilist, SAVE_TO_DC(dcontext, REG_XBX, XDI_OFFSET));
}
* xax already in context
*/
if (!ibl_end) {
if (absolute)
APP(ilist, SAVE_TO_DC(dcontext, REG_XBX, XBX_OFFSET));
APP(ilist, SAVE_TO_DC(dcontext, REG_XCX, XCX_OFFSET));
}
APP(ilist, SAVE_TO_DC(dcontext, REG_XDX, XDX_OFFSET));
if (absolute || !TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask))
APP(ilist, SAVE_TO_DC(dcontext, REG_XSI, XSI_OFFSET));
if (absolute)
APP(ilist, SAVE_TO_DC(dcontext, REG_XDI, XDI_OFFSET));
APP(ilist, SAVE_TO_DC(dcontext, REG_XBP, XBP_OFFSET));
APP(ilist, SAVE_TO_DC(dcontext, REG_XSP, XSP_OFFSET));
#ifdef X64
APP(ilist, SAVE_TO_DC(dcontext, REG_R8, R8_OFFSET));
APP(ilist, SAVE_TO_DC(dcontext, REG_R9, R9_OFFSET));
APP(ilist, SAVE_TO_DC(dcontext, REG_R10, R10_OFFSET));
APP(ilist, SAVE_TO_DC(dcontext, REG_R11, R11_OFFSET));
APP(ilist, SAVE_TO_DC(dcontext, REG_R12, R12_OFFSET));
APP(ilist, SAVE_TO_DC(dcontext, REG_R13, R13_OFFSET));
APP(ilist, SAVE_TO_DC(dcontext, REG_R14, R14_OFFSET));
APP(ilist, SAVE_TO_DC(dcontext, REG_R15, R15_OFFSET));
#endif
}
* if preserve_xmm_caller_saved
* SAVE_TO_UPCONTEXT %xmm0,xmm_OFFSET+0*16
* SAVE_TO_UPCONTEXT %xmm1,xmm_OFFSET+1*16
* SAVE_TO_UPCONTEXT %xmm2,xmm_OFFSET+2*16
* SAVE_TO_UPCONTEXT %xmm3,xmm_OFFSET+3*16
* SAVE_TO_UPCONTEXT %xmm4,xmm_OFFSET+4*16
* SAVE_TO_UPCONTEXT %xmm5,xmm_OFFSET+5*16
* SAVE_TO_UPCONTEXT %xmm6,xmm_OFFSET+6*16 # 32-bit Linux
* SAVE_TO_UPCONTEXT %xmm7,xmm_OFFSET+7*16 # 32-bit Linux
* endif
*/
void
append_save_simd_reg(dcontext_t *dcontext, instrlist_t *ilist, bool absolute)
{
ASSERT(preserve_xmm_caller_saved() || !ZMM_ENABLED());
if (!preserve_xmm_caller_saved())
return;
* Rather than try and optimize we save/restore on every cxt
* sw. The xmm field is aligned, so we can use movdqa/movaps,
* though movdqu is stated to be as fast as movdqa when aligned:
* but if so, why have two versions? Is it only loads and not stores
* for which that is true? => PR 266305.
* It's not clear that movdqa is any faster (and its opcode is longer):
* movdqa and movaps are listed as the same latency and throughput in
* the AMD optimization guide. Yet examples of fast memcpy online seem
* to use movdqa when sse2 is available.
* Note that mov[au]p[sd] and movdq[au] are functionally equivalent.
*/
* cost of vmovdqu and whether worth arranging 32-byte alignment
*/
int i;
uint opcode = move_mm_reg_opcode(true , true );
ASSERT(proc_has_feature(FEATURE_SSE));
instr_t *post_save = NULL;
instr_t *pre_avx512_save = NULL;
if (ZMM_ENABLED()) {
post_save = INSTR_CREATE_label(dcontext);
pre_avx512_save = INSTR_CREATE_label(dcontext);
APP(ilist,
INSTR_CREATE_cmp(
dcontext,
OPND_CREATE_ABSMEM(
vmcode_get_executable_addr((byte *)d_r_avx512_code_in_use), OPSZ_1),
OPND_CREATE_INT8(0)));
APP(ilist,
INSTR_CREATE_jcc(dcontext, OP_jnz, opnd_create_instr(pre_avx512_save)));
}
for (i = 0; i < proc_num_simd_sse_avx_saved(); i++) {
APP(ilist,
instr_create_1dst_1src(dcontext, opcode,
OPND_DC_FIELD(absolute, dcontext, OPSZ_SAVED_XMM,
SIMD_OFFSET + i * MCXT_SIMD_SLOT_SIZE),
opnd_create_reg(REG_SAVED_XMM0 + (reg_id_t)i)));
}
if (ZMM_ENABLED()) {
APP(ilist, INSTR_CREATE_jmp(dcontext, opnd_create_instr(post_save)));
APP(ilist, pre_avx512_save );
uint opcode_avx512 = move_mm_avx512_reg_opcode(true );
for (i = 0; i < proc_num_simd_registers(); i++) {
APP(ilist,
instr_create_1dst_2src(
dcontext, opcode_avx512,
OPND_DC_FIELD(absolute, dcontext, OPSZ_SAVED_ZMM,
SIMD_OFFSET + i * MCXT_SIMD_SLOT_SIZE),
opnd_create_reg(DR_REG_K0),
opnd_create_reg(DR_REG_START_ZMM + (reg_id_t)i)));
}
for (i = 0; i < proc_num_opmask_registers(); i++) {
APP(ilist,
instr_create_1dst_1src(
dcontext, proc_has_feature(FEATURE_AVX512BW) ? OP_kmovq : OP_kmovw,
OPND_DC_FIELD(absolute, dcontext, OPSZ_SAVED_OPMASK,
OPMASK_OFFSET + i * OPMASK_AVX512BW_REG_SIZE),
opnd_create_reg(DR_REG_START_OPMASK + (reg_id_t)i)));
}
APP(ilist, post_save );
}
}
* # now save eflags -- too hard to do without a stack on X86!
* pushf # push eflags on stack
* pop %xbx # grab eflags value
* SAVE_TO_UPCONTEXT %xbx,xflags_OFFSET # save eflags value
*
* # clear eflags now to avoid app's eflags messing up our ENTER_DR_HOOK
* # FIXME: this won't work at CPL0 if we ever run there!
* push 0
* popf
*/
void
append_save_clear_xflags(dcontext_t *dcontext, instrlist_t *ilist, bool absolute)
{
reg_id_t reg = IF_X86_ELSE(REG_XBX, DR_REG_R1);
#ifdef X86
APP(ilist, INSTR_CREATE_RAW_pushf(dcontext));
APP(ilist, INSTR_CREATE_pop(dcontext, opnd_create_reg(reg)));
#elif defined(ARM)
ASSERT_NOT_IMPLEMENTED(false);
#endif
APP(ilist, SAVE_TO_DC(dcontext, reg, XFLAGS_OFFSET));
* messing up our ENTER_DR_HOOK
*/
#ifdef X86
APP(ilist, INSTR_CREATE_push_imm(dcontext, OPND_CREATE_INT8(0)));
APP(ilist, INSTR_CREATE_RAW_popf(dcontext));
#elif defined(ARM)
#endif
}
* # X86 only
* if (ENTER_DR_HOOK != NULL && !dcontext->ignore_enterexit)
* # don't bother to save any registers around call except for xax
* # and xcx, which holds next_tag
* push %xcx
* if (!absolute)
* push %xdi
* push %xsi
* endif
* push %xax
* if (absolute)
* # support for skipping the hook (note: 32-bits even on x64)
* RESTORE_FROM_UPCONTEXT ignore_enterexit_OFFSET,%edi
* cmp %edi,0
* jnz post_hook
* endif
* # for x64 windows, reserve 32 bytes stack space for call prior to call
* call ENTER_DR_HOOK
* post_hook:
* pop %xax
* if (!absolute)
* pop %xsi
* pop %xdi
* endif
* pop %xcx
* endif
*/
bool
append_call_enter_dr_hook(dcontext_t *dcontext, instrlist_t *ilist, bool ibl_end,
bool absolute)
{
bool instr_target = false;
if (ENTER_DR_HOOK != NULL) {
* we could move to a callee-saved register instead of pushing.
*/
instr_t *post_hook = INSTR_CREATE_label(dcontext);
if (ibl_end) {
APP(ilist, INSTR_CREATE_push(dcontext, opnd_create_reg(REG_XCX)));
}
if (!absolute) {
APP(ilist, INSTR_CREATE_push(dcontext, opnd_create_reg(REG_XDI)));
APP(ilist, INSTR_CREATE_push(dcontext, opnd_create_reg(REG_XSI)));
}
APP(ilist, INSTR_CREATE_push(dcontext, opnd_create_reg(REG_XAX)));
#ifdef WINDOWS
if (absolute) {
* dcontext->ignore_enterexit. This is a perf hit to check: could
* instead have a space hit via a separate routine. This is only
* needed right now for NtSuspendThread handling (see case 4942).
*/
APP(ilist, RESTORE_FROM_DC(dcontext, REG_EDI, IGNORE_ENTEREXIT_OFFSET));
APP(ilist,
INSTR_CREATE_test(dcontext, opnd_create_reg(REG_EDI),
opnd_create_reg(REG_EDI)));
APP(ilist, INSTR_CREATE_jcc(dcontext, OP_jnz, opnd_create_instr(post_hook)));
instr_target = true;
}
#endif
* since x64 windows requires 32 bytes of stack space even w/ no args,
* and we don't want anyone clobbering our pushed registers!
*/
dr_insert_call((void *)dcontext, ilist, NULL , (void *)ENTER_DR_HOOK,
0);
APP(ilist, post_hook );
APP(ilist, INSTR_CREATE_pop(dcontext, opnd_create_reg(REG_XAX)));
if (!absolute) {
APP(ilist, INSTR_CREATE_pop(dcontext, opnd_create_reg(REG_XSI)));
APP(ilist, INSTR_CREATE_pop(dcontext, opnd_create_reg(REG_XDI)));
}
if (ibl_end) {
APP(ilist, INSTR_CREATE_pop(dcontext, opnd_create_reg(REG_XCX)));
APP(ilist, SAVE_TO_DC(dcontext, REG_XCX, NEXT_TAG_OFFSET));
}
}
return instr_target;
}
* uses the xax slot, either in TLS
* memory if tls is true; else using mcontext accessed using absolute address if
* absolute is true, else off xdi.
* MUST NOT clobber xax between this call and the restore call!
*/
void
insert_save_eflags(dcontext_t *dcontext, instrlist_t *ilist, instr_t *where, uint flags,
bool tls, bool absolute _IF_X64(bool x86_to_x64_ibl_opt))
{
IF_X64(ASSERT_NOT_IMPLEMENTED(!absolute));
if (TEST(FRAG_WRITES_EFLAGS_6, flags))
return;
* either absolute address or indirected via xdi as specified by absolute param
*/
if (IF_X64_ELSE(x86_to_x64_ibl_opt, false)) {
PRE(ilist, where, SAVE_TO_REG(dcontext, REG_XAX, REG_R8));
} else if (tls) {
* to restore from. FIXME: This can be much more streamlined
* if TLS_SLOT_SCRATCH1 was the XAX spill slot for everyone
*/
* based on shared/privateness of the fragment, we also need
* to know what would the target do if shared.
*/
PRE(ilist, where, SAVE_TO_TLS(dcontext, REG_XAX, PREFIX_XAX_SPILL_SLOT));
} else {
PRE(ilist, where, SAVE_TO_DC(dcontext, REG_XAX, XAX_OFFSET));
}
PRE(ilist, where, INSTR_CREATE_lahf(dcontext));
if (!TEST(FRAG_WRITES_EFLAGS_OF, flags) &&
!INTERNAL_OPTION(unsafe_ignore_overflow)) {
PRE(ilist, where, INSTR_CREATE_setcc(dcontext, OP_seto, opnd_create_reg(REG_AL)));
}
}
* TLS memory if tls is true; else using mcontext accessed using absolute
* address if absolute is true, else off xdi.
* also restores xax
*/
void
insert_restore_eflags(dcontext_t *dcontext, instrlist_t *ilist, instr_t *where,
uint flags, bool tls,
bool absolute _IF_X64(bool x86_to_x64_ibl_opt))
{
IF_X64(ASSERT_NOT_IMPLEMENTED(!absolute));
if (TEST(FRAG_WRITES_EFLAGS_6, flags))
return;
if (!TEST(FRAG_WRITES_EFLAGS_OF, flags) &&
!INTERNAL_OPTION(unsafe_ignore_overflow)) {
* if OF was on when we did seto
*/
PRE(ilist, where,
INSTR_CREATE_add(dcontext, opnd_create_reg(REG_AL), OPND_CREATE_INT8(0x7f)));
}
PRE(ilist, where, INSTR_CREATE_sahf(dcontext));
if (IF_X64_ELSE(x86_to_x64_ibl_opt, false)) {
PRE(ilist, where, RESTORE_FROM_REG(dcontext, REG_XAX, REG_R8));
} else if (tls) {
PRE(ilist, where, RESTORE_FROM_TLS(dcontext, REG_XAX, PREFIX_XAX_SPILL_SLOT));
} else {
PRE(ilist, where, RESTORE_FROM_DC(dcontext, REG_XAX, XAX_OFFSET));
}
}
/* Offset of "field" of the ibl_table_t selected by the branch type of
 * ibl_code and by target_trace_table: the value of GET_IBL_TARGET_TABLE()
 * plus the field's offset within ibl_table_t.
 */
#define GET_IB_FTABLE(ibl_code, target_trace_table, field)                  \
    (GET_IBL_TARGET_TABLE((ibl_code)->branch_type, (target_trace_table)) + \
     offsetof(ibl_table_t, field))
/* Offsets of the tag and start-pc members within a fragment_entry_t. */
#define HASHLOOKUP_TAG_OFFS (offsetof(fragment_entry_t, tag_fragment))
#define HASHLOOKUP_START_PC_OFFS (offsetof(fragment_entry_t, start_pc_fragment))
/* Appends the head of the indirect branch lookup (IBL): the flags save, the
 * hash of the target tag, the load of the table entry, and the tag compare,
 * followed by the hit path emitted by append_ibl_found().
 *
 * If inline_ibl_head, this emits the inlined copy of the lookup head
 * (emit_inline_ibl_stub() passes true).
 *   Only assumption is that xcx = effective address of indirect branch
 * Else, this emits the top of the shared lookup routine, which assumes:
 *   1) xbx = &linkstub
 *   2) xcx = effective address of indirect branch
 * When miss_8bit is set, assumes that a jne_short is sufficient to reach
 * miss_tgt; otherwise a regular jne is used.
 * Returns pointers to three instructions, for use in calculating offsets
 * and in pointing jmps inside the ibl head (via fragment_found,
 * compare_tag_inst and post_eflags_save).
 * It's fine to pass NULL if you're not interested in them.
 */
void
append_ibl_head(dcontext_t *dcontext, instrlist_t *ilist, ibl_code_t *ibl_code,
                patch_list_t *patch, instr_t **fragment_found, instr_t **compare_tag_inst,
                instr_t **post_eflags_save, opnd_t miss_tgt, bool miss_8bit,
                bool target_trace_table, bool inline_ibl_head)
{
    instr_t *mask, *table = NULL, *compare_tag = NULL, *after_linkcount;
    opnd_t mask_opnd;
    /* NOTE: the SAVE_TO_DC/RESTORE_FROM_DC uses below have no explicit
     * "absolute" argument; presumably the macros pick up this local -- confirm.
     */
    bool absolute = !ibl_code->thread_shared_routine;
    bool table_in_tls = SHARED_IB_TARGETS() &&
        (target_trace_table || SHARED_BB_ONLY_IB_TARGETS()) &&
        DYNAMO_OPTION(ibl_table_in_tls);
    uint hash_to_address_factor;
    /* TLS is used only for spilling state; the table is reached via xdi. */
    bool only_spill_state_in_tls = !absolute && !table_in_tls;
    IF_X64(bool x86_to_x64_ibl_opt =
               ibl_code->x86_to_x64_mode && DYNAMO_OPTION(x86_to_x64_ibl_opt);)
    /* On x64 only the non-absolute form is implemented. */
    IF_X64(ASSERT_NOT_IMPLEMENTED(!absolute));
#ifndef X64
    /* For x64 the dcontext fetch is done after the post-eflags-save entry
     * point (see the X64 block below); for x86, it's
     * needed before for thread-private eflags save.
     */
    if (only_spill_state_in_tls) {
        insert_shared_get_dcontext(dcontext, ilist, NULL, true);
    }
#endif
    if (!INTERNAL_OPTION(unsafe_ignore_eflags_ibl)) {
        /* There are techniques for generating an IBL that does not touch
         * eflags -- see
         * case 7169.  We're not using any of those techniques, so we save the
         * flags.
         */
        insert_save_eflags(dcontext, ilist, NULL, 0, IBL_EFLAGS_IN_TLS(),
                           absolute _IF_X64(x86_to_x64_ibl_opt));
    }
    /* Optional entry point located right after the flags save. */
    if (post_eflags_save != NULL) {
        *post_eflags_save = INSTR_CREATE_label(dcontext);
        APP(ilist, *post_eflags_save);
    }
#ifdef X64
    /* See the comment on the !X64 block above. */
    if (only_spill_state_in_tls) {
        insert_shared_get_dcontext(dcontext, ilist, NULL, true);
    }
#endif
    /* Spill SCRATCH_REG1 (xbx) so that it can hold the tag. */
    if (IF_X64_ELSE(x86_to_x64_ibl_opt, false)) {
        after_linkcount = SAVE_TO_REG(dcontext, SCRATCH_REG1, REG_R10);
    } else if (inline_ibl_head || !DYNAMO_OPTION(indirect_stubs)) {
        if (absolute)
            after_linkcount = SAVE_TO_DC(dcontext, SCRATCH_REG1, SCRATCH_REG1_OFFS);
        else
            after_linkcount = SAVE_TO_TLS(dcontext, SCRATCH_REG1, TLS_REG1_SLOT);
    } else {
        /* Create a scratch register: re-use xbx, which holds the linkstub ptr;
         * don't need to restore it on hit!  save to **xdi** slot so as
         * to not overwrite linkstub ptr */
        if (absolute)
            after_linkcount = SAVE_TO_DC(dcontext, SCRATCH_REG1, SCRATCH_REG5_OFFS);
        else if (table_in_tls)
            after_linkcount = SAVE_TO_TLS(dcontext, SCRATCH_REG1, TLS_REG3_SLOT);
        else
            after_linkcount = SAVE_TO_DC(dcontext, SCRATCH_REG1, SCRATCH_REG5_OFFS);
    }
    APP(ilist, after_linkcount);
    if (ibl_code->thread_shared_routine && !DYNAMO_OPTION(private_ib_in_tls)) {
        /* Copy the spilled xcx value (in its TLS slot, or in r9 for
         * x86_to_x64_ibl_opt) into the mcontext slot, so that we
         * can work with both tls and mcontext prefixes
         * do not need this if using all-tls (private_ib_in_tls option)
         */
#ifdef X64
        if (x86_to_x64_ibl_opt)
            APP(ilist, RESTORE_FROM_REG(dcontext, SCRATCH_REG1, REG_R9));
        else
#endif
            APP(ilist, RESTORE_FROM_TLS(dcontext, SCRATCH_REG1, MANGLE_XCX_SPILL_SLOT));
        APP(ilist, SAVE_TO_DC(dcontext, SCRATCH_REG1, SCRATCH_REG2_OFFS));
    }
    /* Make a copy of the tag for hashing:
     * keep original in xbx, hash will be in xcx
     *>>>    mov     %xcx,%xbx                                       */
    APP(ilist,
        XINST_CREATE_load(dcontext, opnd_create_reg(SCRATCH_REG1),
                          opnd_create_reg(SCRATCH_REG2)));
    if (only_spill_state_in_tls) {
        /* Load the FRAGMENT_FIELD_OFFSET dcontext field into xdi.
         * NOTE(review): xdi presumably no longer addresses the dcontext after
         * this point -- confirm before adding dcontext accesses below.
         * Example encoding (32-bit offset):
             8b bf 94 00 00 00    mov    0x94(%xdi) -> %xdi
        */
        APP(ilist,
            XINST_CREATE_load(
                dcontext, opnd_create_reg(SCRATCH_REG5),
                OPND_DC_FIELD(absolute, dcontext, OPSZ_PTR, FRAGMENT_FIELD_OFFSET)));
    }
    /* Hash function: tag & mask.  Pick where the mask comes from. */
    if (!absolute && table_in_tls) {
        /* The mask is in TLS. */
        mask_opnd = OPND_TLS_FIELD(TLS_MASK_SLOT(ibl_code->branch_type));
    } else if (!absolute) {
        ASSERT(only_spill_state_in_tls);
        /* The mask is a field of the structure now pointed at by xdi. */
        IF_X64(ASSERT(CHECK_TRUNCATE_TYPE_int(
            GET_IB_FTABLE(ibl_code, target_trace_table, hash_mask))));
        mask_opnd = opnd_create_base_disp(
            SCRATCH_REG5, REG_NULL, 0,
            (int)GET_IB_FTABLE(ibl_code, target_trace_table, hash_mask), OPSZ_PTR);
    } else {
        /* The real mask is patched in later (see add_patch_entry below); use
         * 0x3fff as a placeholder immediate.
         * if we did need to support an immediate for x64, we could
         * just use the lower 32 bits and let them be sign-extended.
         *>>>    andl    $0x3fff,%xcx                                    */
        mask_opnd = opnd_create_immed_int(0x3fff, OPSZ_4);
    }
    mask = INSTR_CREATE_and(dcontext, opnd_create_reg(SCRATCH_REG2), mask_opnd);
    APP(ilist, mask);
    if (absolute) {
        add_patch_entry(
            patch, mask, PATCH_PER_THREAD,
            (ptr_uint_t)GET_IB_FTABLE(ibl_code, target_trace_table, hash_mask));
    }
    /* For the absolute case the table address is
     * not created yet, use 0 */
    if (only_spill_state_in_tls) {
        /* Load the table pointer field into xdi:
         *   mov table_field(%xdi) -> %xdi
         */
        instr_t *table_in_xdi;
        IF_X64(ASSERT(
            CHECK_TRUNCATE_TYPE_int(GET_IB_FTABLE(ibl_code, target_trace_table, table))));
        table_in_xdi = XINST_CREATE_load(
            dcontext, opnd_create_reg(SCRATCH_REG5),
            opnd_create_base_disp(SCRATCH_REG5, REG_NULL, 0,
                                  (int)GET_IB_FTABLE(ibl_code, target_trace_table, table),
                                  OPSZ_PTR));
        APP(ilist, table_in_xdi);
    }
    if (absolute) {
        /* Perform the shift and the base add in one lea; the base (0 for now)
         * is patched per thread.
         */
        ASSERT(sizeof(fragment_entry_t) == 8);
        if (HASHTABLE_IBL_OFFSET(ibl_code->branch_type) <= IBL_HASH_FUNC_OFFSET_MAX) {
            IF_X64(ASSERT(CHECK_TRUNCATE_TYPE_uint(
                sizeof(fragment_entry_t) /
                (size_t)(1 << HASHTABLE_IBL_OFFSET(ibl_code->branch_type)))));
            hash_to_address_factor = (uint)sizeof(fragment_entry_t) /
                (1 << HASHTABLE_IBL_OFFSET(ibl_code->branch_type));
        } else {
            ASSERT_NOT_IMPLEMENTED(false);
            hash_to_address_factor = 1;
        }
        /* XXX: nothing here verifies that the table being looked up
         * is using the correct hash_mask_offset
         */
        /* XXX: the non-absolute path below uses adds instead of an lea; if
         * the IBL
         * head is not inlined we may want to try that advice
         */
        /* XXX (presumably for a scale of 1):
         * since we don't need an index register we can switch to a non-SIB encoding
         * so that instead of 7 bytes we have 6 byte encoding going through the fast
         * decoder
         *    8d 0c 0d 5039721c lea     xcx,[1c723950+xcx] ; currently
         *    8d 89 __ 5039721c lea     xcx,[xcx+0x1c723950] ; shorter
         */
        table =
            INSTR_CREATE_lea(dcontext, opnd_create_reg(SCRATCH_REG2),
                             opnd_create_base_disp(REG_NULL, SCRATCH_REG2,
                                                   hash_to_address_factor, 0, OPSZ_lea));
        add_patch_entry(patch, table, PATCH_PER_THREAD,
                        GET_IB_FTABLE(ibl_code, target_trace_table, table));
        APP(ilist, table);
    } else {
        /* The table base is added separately below, so we skip the lea and
         * use faster and smaller add sequences for our shift
         */
        uint i;
        opnd_t table_opnd;
        ASSERT_NOT_IMPLEMENTED(HASHTABLE_IBL_OFFSET(ibl_code->branch_type) <=
                               IBL_HASH_FUNC_OFFSET_MAX);
        /* Each add doubles xcx, i.e., shifts it left by one. */
        for (i = IBL_HASH_FUNC_OFFSET_MAX;
             i > HASHTABLE_IBL_OFFSET(ibl_code->branch_type); i--) {
            APP(ilist,
                INSTR_CREATE_add(dcontext, opnd_create_reg(SCRATCH_REG2),
                                 opnd_create_reg(SCRATCH_REG2)));
        }
        /* We separately add the lookuptable base since we'd need an extra register
         * to do it in combination with the shift:
         *   add fs:lookuptable,%xcx -> %xcx
         * or if the table addr is in %xdi:
         *   add %xdi,%xcx -> %xcx
         */
        table_opnd = table_in_tls
            ? OPND_TLS_FIELD(TLS_TABLE_SLOT(ibl_code->branch_type))
            : opnd_create_reg(SCRATCH_REG5);
        APP(ilist, INSTR_CREATE_add(dcontext, opnd_create_reg(SCRATCH_REG2), table_opnd));
    }
    /* Compare the entry's tag against the target tag:
     *>>>    cmp     HASHLOOKUP_TAG_OFFS(%xcx),%xbx                  */
    compare_tag =
        INSTR_CREATE_cmp(dcontext, OPND_CREATE_MEMPTR(SCRATCH_REG2, HASHLOOKUP_TAG_OFFS),
                         opnd_create_reg(SCRATCH_REG1));
    APP(ilist, compare_tag);
    /* On a mismatch go to miss_tgt. */
    if (miss_8bit)
        APP(ilist, INSTR_CREATE_jcc(dcontext, OP_jne_short, miss_tgt));
    else
        APP(ilist, INSTR_CREATE_jcc(dcontext, OP_jne, miss_tgt));
#ifdef X64
    if (ibl_code->x86_mode) {
        /* Also require that the 4 bytes at HASHLOOKUP_TAG_OFFS + 4, the top
         * bits are 0 before we declare it a match (xref PR 283895).
         */
        APP(ilist,
            INSTR_CREATE_cmp(dcontext,
                             OPND_CREATE_MEM32(SCRATCH_REG2, HASHLOOKUP_TAG_OFFS + 4),
                             OPND_CREATE_INT32(0)));
        if (miss_8bit)
            APP(ilist, INSTR_CREATE_jcc(dcontext, OP_jne_short, miss_tgt));
        else
            APP(ilist, INSTR_CREATE_jcc(dcontext, OP_jne, miss_tgt));
    }
#endif
#define HEAD_START_PC_OFFS HASHLOOKUP_START_PC_OFFS
    /* Hit path. */
    append_ibl_found(dcontext, ilist, ibl_code, patch, HEAD_START_PC_OFFS, false,
                     only_spill_state_in_tls,
                     target_trace_table ? DYNAMO_OPTION(trace_single_restore_prefix)
                                        : DYNAMO_OPTION(bb_single_restore_prefix),
                     fragment_found);
#undef HEAD_START_PC_OFFS
    if (compare_tag_inst != NULL)
        *compare_tag_inst = compare_tag;
}
/* Emits the template of the inlined IBL stub at pc, records its patch
 * offsets and length in ibl_code, and returns the pc following the emitted
 * code.
 *
   hit path (shared_syscall remains as before):
     if (!INTERNAL_OPTION(unsafe_ignore_eflags_ibl)) {
  |     5   movl  %eax,eax_OFFSET
  |     1   lahf
  |     3   seto  %al
     }
        6   movl  %ebx, ebx_offs(&dcontext)
        2   movl  %ecx,%ebx              # tag in ecx, hash will be in ebx
        6   andl  $0x3fff,%ecx           # hash the tag
        7   movl  ftable(,%ecx,4),%ecx   # ecx = ftable[hash]
        # empty slot is not 0, instead is a constant frag w/ tag 0
        2   cmpl  FRAGMENT_TAG_OFFS(%ecx),%ebx
        2   jne   miss    # if !DYNAMO_OPTION(indirect_stubs), jne ibl
        6   movl  ebx_offs(&dcontext),%ebx
        3   jmp   *FRAGMENT_START_PC_OFFS(%ecx)
   unlinked entry point into stub:
     if (!DYNAMO_OPTION(indirect_stubs)) {
        5   jmp   unlinked_ib_lookup # we can eliminate this if we store stub pc
     } else {
       if (DYNAMO_OPTION(atomic_inlined_linking)) {
        # duplicate miss path so linking can be atomic
        10  movl  &linkstub, edi_offs(&dcontext)
        5   jmp   unlinked_ib_lookup
       } else {
        # set flag in ecx (bottom byte = 0x1) so that unlinked path can
        # detect race condition during unlinking
        6   movl  %ecx, ebx_offs(&dcontext)
        2   movb  $0x1, %ecx
       }
   miss:
        10  movl  &linkstub, edi_offs(&dcontext)
        5   jmp   indirect_branch_lookup/(if !atomic_inlined_linking)unlinked_ib_lookup
     }
*/
byte *
emit_inline_ibl_stub(dcontext_t *dcontext, byte *pc, ibl_code_t *ibl_code,
                     bool target_trace_table)
{
    /* NOTE(review): only the start of this note was lost; it warns against
     * reading ibl_code fields here
     * without making sure they're initialized first
     */
    instrlist_t ilist;
    instr_t *miss = NULL, *unlink, *after_unlink = NULL;
    patch_list_t *patch = &ibl_code->ibl_stub_patch;
    byte *unlinked_ibl_pc = ibl_code->unlinked_ibl_entry;
    byte *linked_ibl_pc = ibl_code->indirect_branch_lookup_routine;
    /* NOTE: the SAVE_TO_DC uses below have no explicit "absolute" argument;
     * presumably the macro picks up this local -- confirm.
     */
    bool absolute = !ibl_code->thread_shared_routine;
    /* The inlined stub is not implemented for x64 (see the first assert).
     * Keep in mind PR 257963: trace inline cmp needs separate entry. */
    IF_X64(ASSERT_NOT_IMPLEMENTED(false));
    IF_X64(ASSERT_NOT_IMPLEMENTED(!absolute));
    ibl_code->inline_ibl_stub_template = pc;
    ibl_code->ibl_head_is_inlined = true;
    instrlist_init(&ilist);
    init_patch_list(patch, absolute ? PATCH_TYPE_ABSOLUTE : PATCH_TYPE_INDIRECT_XDI);
    /* Layout for the indirect_stubs case:
        <head>
     unlinked entry point into stub:
       if (DYNAMO_OPTION(atomic_inlined_linking)) {
        # duplicate miss path so linking can be atomic
        10  movl  &linkstub, edi_offs(&dcontext)
        5   jmp   unlinked_ib_lookup
       } else {
        # set flag in ecx (bottom byte = 0x1) so that unlinked path can
        # detect race condition during unlinking
        6   movl  %ecx, ebx_offs(&dcontext)
        2   movb  $0x1, %ecx
       }
     miss:
        10  movl  &linkstub, edi_offs(&dcontext)
        5   jmp   indirect_branch_lookup/(if !atomic_inlined_linking)unlinked_ib_lookup
    */
    if (DYNAMO_OPTION(indirect_stubs)) {
        /* The miss instr stores a placeholder 0; per the layout above it
         * holds &linkstub in the final stub.  It is created before the head
         * so that the head can target it.
         */
        if (absolute) {
            miss = XINST_CREATE_store(
                dcontext, opnd_create_dcontext_field(dcontext, SCRATCH_REG5_OFFS),
                OPND_CREATE_INT32(0));
        } else {
            miss = XINST_CREATE_store(dcontext, OPND_TLS_FIELD(TLS_REG3_SLOT),
                                      OPND_CREATE_INT32(0));
        }
        append_ibl_head(dcontext, &ilist, ibl_code, patch, NULL, NULL, NULL,
                        opnd_create_instr(miss), true /* miss_8bit */,
                        target_trace_table, true /* inline_ibl_head */);
        if (DYNAMO_OPTION(atomic_inlined_linking)) {
            /* Unlinked entry: a duplicate of the miss path that jumps to the
             * unlinked IBL entry.
             */
            if (absolute) {
                unlink = SAVE_TO_DC(dcontext, SCRATCH_REG1, SCRATCH_REG1_OFFS);
                after_unlink = XINST_CREATE_store(
                    dcontext, opnd_create_dcontext_field(dcontext, SCRATCH_REG5_OFFS),
                    OPND_CREATE_INT32(0));
            } else {
                unlink = SAVE_TO_TLS(dcontext, SCRATCH_REG1, TLS_REG1_SLOT);
                after_unlink = XINST_CREATE_store(dcontext, OPND_TLS_FIELD(TLS_REG3_SLOT),
                                                  OPND_CREATE_INT32(0));
            }
            APP(&ilist, unlink);
            APP(&ilist, after_unlink);
            APP(&ilist, INSTR_CREATE_jmp(dcontext, opnd_create_pc(unlinked_ibl_pc)));
        } else {
            /* Unlinked entry: save xcx and set its bottom byte to 1 as the
             * flag described in the layout above, then fall into miss.
             */
            if (absolute)
                unlink = SAVE_TO_DC(dcontext, SCRATCH_REG2, SCRATCH_REG1_OFFS);
            else
                unlink = SAVE_TO_TLS(dcontext, SCRATCH_REG2, TLS_REG1_SLOT);
            APP(&ilist, unlink);
            APP(&ilist,
                XINST_CREATE_load_int(dcontext, opnd_create_reg(REG_CL),
                                      OPND_CREATE_INT8(1)));
        }
        APP(&ilist, miss);
        APP(&ilist,
            INSTR_CREATE_jmp(dcontext,
                             opnd_create_pc(DYNAMO_OPTION(atomic_inlined_linking)
                                                ? linked_ibl_pc
                                                : unlinked_ibl_pc)));
        /* NOTE(review): the offsets passed to add_patch_marker() below
         * (0, -4, 1, 2) appear to select the start of the instr, its trailing
         * 4 bytes, and the bytes following a 1- or 2-byte jump opcode,
         * respectively -- confirm against add_patch_marker().
         */
        add_patch_marker(patch, unlink, PATCH_UINT_SIZED,
                         0,
                         (ptr_uint_t *)&ibl_code->inline_unlink_offs);
        if (DYNAMO_OPTION(atomic_inlined_linking)) {
            add_patch_marker(patch, after_unlink, PATCH_UINT_SIZED,
                             -4,
                             (ptr_uint_t *)&ibl_code->inline_linkstub_second_offs);
            /* instr_get_prev(miss) is the jmp to unlinked_ibl_pc appended above. */
            add_patch_marker(patch, instr_get_prev(miss),
                             PATCH_UINT_SIZED, 1,
                             (ptr_uint_t *)&ibl_code->inline_unlinkedjmp_offs);
        }
        add_patch_marker(patch, miss, PATCH_UINT_SIZED,
                         -4,
                         (ptr_uint_t *)&ibl_code->inline_linkstub_first_offs);
        /* The last instr in the list is the final jmp appended above. */
        add_patch_marker(patch, instrlist_last(&ilist),
                         PATCH_UINT_SIZED, 1,
                         (ptr_uint_t *)&ibl_code->inline_linkedjmp_offs);
    } else {
        instr_t *cmp;
        append_ibl_head(dcontext, &ilist, ibl_code, patch, NULL, &cmp, NULL,
                        opnd_create_pc(linked_ibl_pc), false /* miss_8bit */,
                        target_trace_table, true /* inline_ibl_head */);
        /* XXX: ideally we would
         * go to the inlined stub when linked and straight to the unlinked ibl
         * entry when unlinked but we haven't put in the support in the link
         * routines (they all assume they can find the unlinked from the current
         * target in a certain manner).
         */
        unlink = INSTR_CREATE_jmp(dcontext, opnd_create_pc(unlinked_ibl_pc));
        APP(&ilist, unlink);
        /* XXX: with hashtable stats (see the assert below) a patch entry is
         * inserted inside app_ibl_head (in append_ibl_found) at a later instr
         * than the miss instr.  To fix, we must either put the miss patch point
         * in the middle of the array and shift it over to keep it sorted, or
         * enable patch-encode to handle out-of-order entries (we could mark
         * this with a flag).
         */
#ifdef HASHTABLE_STATISTICS
        ASSERT_NOT_IMPLEMENTED(!absolute || !INTERNAL_OPTION(hashtable_ibl_stats));
#endif
        /* instr_get_next(cmp) is the jne to linked_ibl_pc that
         * append_ibl_head() appends right after the tag compare.
         */
        add_patch_marker(patch, instr_get_next(cmp), PATCH_UINT_SIZED,
                         2,
                         (ptr_uint_t *)&ibl_code->inline_linkedjmp_offs);
        /* XXX: we would add a second patch marker on the unlink jmp,
         * but encode_with_patch_list asserts, wanting 1 patch per instr, in order
         */
        add_patch_marker(patch, unlink, PATCH_UINT_SIZED,
                         0,
                         (ptr_uint_t *)&ibl_code->inline_unlink_offs);
    }
    ibl_code->inline_stub_length = encode_with_patch_list(dcontext, patch, &ilist, pc);
    instrlist_clear(dcontext, &ilist);
    return pc + ibl_code->inline_stub_length;
}
/* Selects the jmp form used where a short jump suffices in release builds:
 * for now always using jmp rel32 with statistics
 *
 * Use with caution where jmp_short would really work in release - no
 * ASSERTs to help you.
 */
#ifdef HASHTABLE_STATISTICS
#    define INSTR_CREATE_jmp_smart INSTR_CREATE_jmp
#else
#    define INSTR_CREATE_jmp_smart INSTR_CREATE_jmp_short
#endif
# indirect_branch_lookup
#.If the lookup succeeds, control jumps to the fcache target; otherwise
# it sets up for and jumps to fcache_return.
# when we unlink an indirect branch we go through the cleanup part of
# this lookup routine that takes us straight to fcache_return.
# We assume dynamo is NOT in trace creation mode (which would require
# going back to dynamo here). We assume that when a fragment is
# unlinked its indirect branch exit stubs are redirected to the
# unlinked_* labels below. Note that even if you did come in here in
# trace creation mode, and we didn't go back to dynamo here, the
# current trace would have ended now (b/c next fragment is a trace),
# so we'd end up possibly adding erroneous fragments to the end of
# the trace but the indirect branch check would ensure they were never
# executed.
# N.B.: a number of optimizations of the miss path are possible by making
# it separate from the unlink path
*/
byte *
emit_indirect_branch_lookup(dcontext_t *dcontext, generated_code_t *code, byte *pc,
byte *fcache_return_pc, bool target_trace_table,
bool inline_ibl_head, ibl_code_t *ibl_code )
{
instrlist_t ilist;
instr_t *fragment_not_found, *unlinked = INSTR_CREATE_label(dcontext);
instr_t *target_delete_entry;
patch_list_t *patch = &ibl_code->ibl_patch;
bool absolute = !ibl_code->thread_shared_routine;
bool table_in_tls = SHARED_IB_TARGETS() &&
(target_trace_table || SHARED_BB_ONLY_IB_TARGETS()) &&
DYNAMO_OPTION(ibl_table_in_tls);
bool only_spill_state_in_tls = !absolute && !table_in_tls;
#ifdef HASHTABLE_STATISTICS
bool save_xdi = !absolute && table_in_tls;
#endif
instr_t *fragment_found;
instr_t *compare_tag = NULL;
instr_t *sentinel_check;
const linkstub_t *linkstub = NULL;
IF_X64(bool x86_to_x64_ibl_opt =
ibl_code->x86_to_x64_mode && DYNAMO_OPTION(x86_to_x64_ibl_opt);)
instr_t *next_fragment_nochasing =
INSTR_CREATE_cmp(dcontext, OPND_CREATE_MEMPTR(SCRATCH_REG2, HASHLOOKUP_TAG_OFFS),
OPND_CREATE_INT8(0));
IF_X64(ASSERT_NOT_IMPLEMENTED(!absolute));
if (ibl_code->source_fragment_type == IBL_COARSE_SHARED ||
!DYNAMO_OPTION(indirect_stubs)) {
linkstub = get_ibl_sourceless_linkstub(ibltype_to_linktype(ibl_code->branch_type),
IBL_FRAG_FLAGS(ibl_code));
}
* app state, except for those restored in a prefix. We need to massage
* the state so that it looks like the fragment_not_found -- IBL miss -
* path, so we need to restore %xbx. See more on the target_delete_entry
* below, where the instr is added to the ilist.
*/
#ifdef X64
if (x86_to_x64_ibl_opt) {
target_delete_entry = SAVE_TO_REG(dcontext, SCRATCH_REG1, REG_R10);
} else {
#endif
target_delete_entry = absolute
? instr_create_save_to_dcontext(dcontext, SCRATCH_REG1, SCRATCH_REG1_OFFS)
: SAVE_TO_TLS(dcontext, SCRATCH_REG1, INDIRECT_STUB_SPILL_SLOT);
#ifdef X64
}
#endif
fragment_not_found = XINST_CREATE_load(dcontext, opnd_create_reg(SCRATCH_REG2),
opnd_create_reg(SCRATCH_REG1));
instrlist_init(&ilist);
init_patch_list(patch, absolute ? PATCH_TYPE_ABSOLUTE : PATCH_TYPE_INDIRECT_XDI);
LOG(THREAD, LOG_EMIT, 3,
"emit_indirect_branch_lookup: pc=" PFX " fcache_return_pc=" PFX "\n"
"target_trace_table=%d inline_ibl_head=%d absolute=%d\n",
pc, fcache_return_pc, target_trace_table, inline_ibl_head, absolute);
if (inline_ibl_head) {
* 1) xbx = effective address of indirect branch
* 2) xcx = &fragment
* 3) xdx_slot = &linkstub
*/
} else {
* 1) xbx = &linkstub if DYNAMO_OPTION(indirect_stubs),
* or src tag if DYNAMO_OPTION(coarse_units)
* 2) xcx = effective address of indirect branch
*/
#ifdef X64
instr_t *trace_cmp_entry;
#endif
append_ibl_head(dcontext, &ilist, ibl_code, patch, &fragment_found, &compare_tag,
IF_X64_ELSE(&trace_cmp_entry, NULL),
opnd_create_instr(next_fragment_nochasing),
true , target_trace_table,
inline_ibl_head);
#ifdef X64
if (IS_IBL_TRACE(ibl_code->source_fragment_type) &&
!GENCODE_IS_X86(code->gencode_mode)) {
add_patch_marker(patch, trace_cmp_entry, PATCH_ASSEMBLE_ABSOLUTE,
0 ,
(ptr_uint_t *)&ibl_code->trace_cmp_entry);
}
#endif
}
APP(&ilist, next_fragment_nochasing);
if (INTERNAL_OPTION(ibl_sentinel_check)) {
* &lookuptable[ftable->capacity-1] (sentinel) while it would
* work great for thread private IBL routines where we can
* hardcode the address.
* >>>a) cmp %xcx, HASHLOOKUP_SENTINEL_ADDR
;; &lookuptable[ftable->capacity-1] (sentinel)
* For shared routines currently we'd need to walk a few
* pointers - we could just put that one TLS to avoid pointer
* chasing. Yet if we are to have even one extra memory load
* anyways, it is easier to just store a special start_pc to
* compare instead
* >>>b) cmp 4x8(%xcx), HASHLOOKUP_SENTINEL_PC
* Where the expectation is that null_fragment=(0,0) while
* sentinel_fragment=(0,1) For simplicity we just use b) even
* in private IBL routines.
*/
ASSERT((int)(ptr_int_t)HASHLOOKUP_SENTINEL_START_PC <= INT8_MAX &&
(int)(ptr_int_t)HASHLOOKUP_SENTINEL_START_PC >= INT8_MIN);
sentinel_check = INSTR_CREATE_cmp(
dcontext, OPND_CREATE_MEMPTR(SCRATCH_REG2, HASHLOOKUP_START_PC_OFFS),
OPND_CREATE_INT8((int)(ptr_int_t)HASHLOOKUP_SENTINEL_START_PC));
} else {
* just exit back to d_r_dispatch
*/
sentinel_check = fragment_not_found;
}
APP(&ilist,
INSTR_CREATE_jcc(dcontext, OP_je_short, opnd_create_instr(sentinel_check)));
* add xcx, 8x16 # no wrap around check, instead rely on a nulltag sentinel entry
* alternative method of rehashing xbx+4x8 or without checks is also not efficient
*/
#ifdef HASHTABLE_STATISTICS
if (INTERNAL_OPTION(hashtable_ibl_stats)) {
if (save_xdi)
APP(&ilist, SAVE_TO_TLS(dcontext, SCRATCH_REG5, HTABLE_STATS_SPILL_SLOT));
append_increment_counter(dcontext, &ilist, ibl_code, patch, REG_NULL,
HASHLOOKUP_STAT_OFFS(collision),
REG_NULL);
if (save_xdi) {
APP(&ilist,
RESTORE_FROM_TLS(dcontext, SCRATCH_REG5, HTABLE_STATS_SPILL_SLOT));
}
}
#endif
APP(&ilist,
INSTR_CREATE_lea(dcontext, opnd_create_reg(SCRATCH_REG2),
opnd_create_base_disp(SCRATCH_REG2, REG_NULL, 0,
sizeof(fragment_entry_t), OPSZ_lea)));
if (inline_ibl_head) {
compare_tag = INSTR_CREATE_cmp(
dcontext, OPND_CREATE_MEMPTR(SCRATCH_REG2, HASHLOOKUP_TAG_OFFS),
opnd_create_reg(SCRATCH_REG1));
APP(&ilist, compare_tag);
* (DS == PREFIX_DATA)
*/
APP(&ilist,
INSTR_CREATE_jcc(dcontext, OP_jne_short,
opnd_create_instr(next_fragment_nochasing)));
append_ibl_found(dcontext, &ilist, ibl_code, patch, HASHLOOKUP_START_PC_OFFS,
true, only_spill_state_in_tls,
target_trace_table ? DYNAMO_OPTION(trace_single_restore_prefix)
: DYNAMO_OPTION(bb_single_restore_prefix),
NULL);
} else {
* since release builds can use a short jump
*/
APP(&ilist, INSTR_CREATE_jmp_smart(dcontext, opnd_create_instr(compare_tag)));
}
if (INTERNAL_OPTION(ibl_sentinel_check)) {
APP(&ilist, sentinel_check);
APP(&ilist,
INSTR_CREATE_jcc(dcontext, OP_jne_short,
opnd_create_instr(fragment_not_found)));
#ifdef HASHTABLE_STATISTICS
if (INTERNAL_OPTION(hashtable_ibl_stats)) {
if (save_xdi)
APP(&ilist, SAVE_TO_TLS(dcontext, SCRATCH_REG5, HTABLE_STATS_SPILL_SLOT));
append_increment_counter(dcontext, &ilist, ibl_code, patch, REG_NULL,
HASHLOOKUP_STAT_OFFS(overwrap),
REG_NULL);
if (save_xdi) {
APP(&ilist,
RESTORE_FROM_TLS(dcontext, SCRATCH_REG5, HTABLE_STATS_SPILL_SLOT));
}
}
#endif
* mov %xdi -> %xcx ; xdi == &lookuptable[0]
*/
if (absolute) {
enum { BOGUS_HASH_TABLE = 0xabcdabcd };
instr_t *table = INSTR_CREATE_mov_imm(dcontext, opnd_create_reg(SCRATCH_REG2),
OPND_CREATE_INT32(BOGUS_HASH_TABLE));
add_patch_entry(patch, table, PATCH_PER_THREAD,
GET_IB_FTABLE(ibl_code, target_trace_table, table));
APP(&ilist, table);
} else if (table_in_tls) {
APP(&ilist,
RESTORE_FROM_TLS(dcontext, SCRATCH_REG2,
TLS_TABLE_SLOT(ibl_code->branch_type)));
} else {
#ifdef HASHTABLE_STATISTICS
if (INTERNAL_OPTION(hashtable_ibl_stats)) {
* it and then reload per_thread_t* and then the table*. */
insert_shared_get_dcontext(dcontext, &ilist, NULL, false);
APP(&ilist,
XINST_CREATE_load(dcontext, opnd_create_reg(SCRATCH_REG5),
OPND_DC_FIELD(absolute, dcontext, OPSZ_PTR,
FRAGMENT_FIELD_OFFSET)));
* we assume that this isn't a performance-sensitive run and
* opt for code simplicity by rematerializing XDI.
*/
IF_X64(ASSERT(CHECK_TRUNCATE_TYPE_int(
GET_IB_FTABLE(ibl_code, target_trace_table, table))));
APP(&ilist,
XINST_CREATE_load(
dcontext, opnd_create_reg(SCRATCH_REG5),
opnd_create_base_disp(
SCRATCH_REG5, REG_NULL, 0,
(int)GET_IB_FTABLE(ibl_code, target_trace_table, table),
OPSZ_PTR)));
}
#endif
APP(&ilist,
XINST_CREATE_load(dcontext, opnd_create_reg(SCRATCH_REG2),
opnd_create_reg(SCRATCH_REG5)));
}
APP(&ilist, INSTR_CREATE_jmp(dcontext, opnd_create_instr(compare_tag)));
}
#ifdef X64
if (IS_IBL_TRACE(ibl_code->source_fragment_type) &&
!GENCODE_IS_X86(code->gencode_mode)) {
if (INTERNAL_OPTION(unsafe_ignore_eflags_trace)) {
add_patch_marker(patch, unlinked, PATCH_ASSEMBLE_ABSOLUTE,
0 ,
(ptr_uint_t *)&ibl_code->trace_cmp_unlinked);
} else if (inline_ibl_head) {
* we insert our own
*/
instr_t *trace_cmp_unlinked = INSTR_CREATE_label(dcontext);
APP(&ilist, trace_cmp_unlinked);
add_patch_marker(patch, trace_cmp_unlinked, PATCH_ASSEMBLE_ABSOLUTE,
0 ,
(ptr_uint_t *)&ibl_code->trace_cmp_unlinked);
insert_restore_eflags(dcontext, &ilist, NULL, 0, true , false ,
x86_to_x64_ibl_opt);
APP(&ilist, INSTR_CREATE_jmp(dcontext, opnd_create_instr(unlinked)));
}
}
#endif
* target_delete_entry
*/
* miss path restore assumes -- so we put it there now. If coming from
* a private ibl or a no-prefix-target ibl, this is simply a redundant
* store. Xref case 4649.
*/
APP(&ilist, target_delete_entry);
if (linkstub == NULL) {
* we use a special linkstub_t in the last_exit "slot" (xdi / tls xdx) for any
* source (xref case 4635).
* Rare enough that should be ok, and everyone, including trace building,
* can handle it. Although w/ an unknown last_exit the trace builder has to
* assume the final exit was taken, that's only bad when ending in a cbr,
* and when that's the case won't end up here (have to have -inline_bb_ibl
* to get here, since we only add bbs to traces).
*/
#ifdef X64
APP(&ilist,
INSTR_CREATE_mov_imm(
dcontext, opnd_create_reg(SCRATCH_REG1),
OPND_CREATE_INTPTR((ptr_int_t)get_ibl_deleted_linkstub())));
#endif
if (absolute) {
APP(&ilist,
instr_create_save_immed32_to_dcontext(
dcontext, (int)(ptr_int_t)get_ibl_deleted_linkstub(),
SCRATCH_REG5_OFFS));
} else if (table_in_tls) {
APP(&ilist,
XINST_CREATE_store(
dcontext, OPND_TLS_FIELD(TLS_REG3_SLOT),
IF_X64_ELSE(
opnd_create_reg(SCRATCH_REG1),
OPND_CREATE_INTPTR((ptr_int_t)get_ibl_deleted_linkstub()))));
} else {
insert_shared_get_dcontext(dcontext, &ilist, NULL, true);
APP(&ilist,
XINST_CREATE_store(
dcontext,
OPND_DC_FIELD(absolute, dcontext, OPSZ_PTR, SCRATCH_REG5_OFFS),
IF_X64_ELSE(
opnd_create_reg(SCRATCH_REG1),
OPND_CREATE_INTPTR((ptr_int_t)get_ibl_deleted_linkstub()))));
insert_shared_restore_dcontext_reg(dcontext, &ilist, NULL);
}
}
* FIXME: for -no_indirect_stubs, is this source of add_ibl curiosities on IIS?
* but one at least was a post-syscall!
*/
if (!ibl_use_target_prefix(ibl_code)) {
* in the hit path. we also restored eflags already.
*/
if (absolute) {
APP(&ilist, RESTORE_FROM_DC(dcontext, SCRATCH_REG1, SCRATCH_REG0_OFFS));
} else {
APP(&ilist, RESTORE_FROM_TLS(dcontext, SCRATCH_REG1, DIRECT_STUB_SPILL_SLOT));
}
if (!INTERNAL_OPTION(unsafe_ignore_eflags_ibl)) {
insert_save_eflags(dcontext, &ilist, NULL, 0, IBL_EFLAGS_IN_TLS(),
absolute _IF_X64(x86_to_x64_ibl_opt));
}
} else {
APP(&ilist,
XINST_CREATE_load(dcontext, opnd_create_reg(SCRATCH_REG1),
OPND_CREATE_MEMPTR(SCRATCH_REG2, FRAGMENT_TAG_OFFS)));
}
* further later, so if we don't add now the patch ordering becomes
* confused.
*/
add_patch_marker(patch, target_delete_entry, PATCH_ASSEMBLE_ABSOLUTE,
0 ,
(ptr_uint_t *)&ibl_code->target_delete_entry);
* fragment_not_found
*/
* the unlinked inlined indirect branch race condition case
* also comes here (if !atomic_inlined_linking) */
*>>> mov %xbx, %xcx */
APP(&ilist, fragment_not_found);
* condition cases (if !atomic_inlined_linking), but that should almost
* never happen so don't worry about it screwing up the count */
#ifdef HASHTABLE_STATISTICS
if (INTERNAL_OPTION(hashtable_ibl_stats)) {
if (save_xdi)
APP(&ilist, SAVE_TO_TLS(dcontext, SCRATCH_REG5, HTABLE_STATS_SPILL_SLOT));
append_increment_counter(dcontext, &ilist, ibl_code, patch, REG_NULL,
HASHLOOKUP_STAT_OFFS(miss), SCRATCH_REG1);
if (save_xdi) {
APP(&ilist,
RESTORE_FROM_TLS(dcontext, SCRATCH_REG5, HTABLE_STATS_SPILL_SLOT));
}
}
#endif
if (only_spill_state_in_tls) {
insert_shared_get_dcontext(dcontext, &ilist, NULL, false );
}
* inlining we reverse them so that trace_cmp entry can come in at the restore */
if (inline_ibl_head) {
if (!INTERNAL_OPTION(unsafe_ignore_eflags_ibl)) {
insert_restore_eflags(dcontext, &ilist, NULL, 0, IBL_EFLAGS_IN_TLS(),
absolute _IF_X64(x86_to_x64_ibl_opt));
}
APP(&ilist, unlinked);
}
if (DYNAMO_OPTION(indirect_stubs)) {
if (absolute) {
APP(&ilist, RESTORE_FROM_DC(dcontext, SCRATCH_REG1, SCRATCH_REG5_OFFS));
} else if (table_in_tls) {
APP(&ilist, RESTORE_FROM_TLS(dcontext, SCRATCH_REG1, TLS_REG3_SLOT));
} else {
ASSERT(only_spill_state_in_tls);
APP(&ilist, RESTORE_FROM_DC(dcontext, SCRATCH_REG1, SCRATCH_REG5_OFFS));
}
} else {
if (IF_X64_ELSE(x86_to_x64_ibl_opt, false))
APP(&ilist, RESTORE_FROM_REG(dcontext, SCRATCH_REG1, REG_R10));
else if (absolute)
APP(&ilist, RESTORE_FROM_DC(dcontext, SCRATCH_REG1, SCRATCH_REG1_OFFS));
else {
APP(&ilist,
RESTORE_FROM_TLS(dcontext, SCRATCH_REG1, INDIRECT_STUB_SPILL_SLOT));
}
}
if (only_spill_state_in_tls) {
insert_shared_restore_dcontext_reg(dcontext, &ilist, NULL);
}
if (!inline_ibl_head) {
#ifdef X64
if (IS_IBL_TRACE(ibl_code->source_fragment_type) &&
!GENCODE_IS_X86(code->gencode_mode) &&
!INTERNAL_OPTION(unsafe_ignore_eflags_trace)) {
instr_t *trace_cmp_unlinked = INSTR_CREATE_label(dcontext);
APP(&ilist, trace_cmp_unlinked);
add_patch_marker(patch, trace_cmp_unlinked, PATCH_ASSEMBLE_ABSOLUTE,
0 ,
(ptr_uint_t *)&ibl_code->trace_cmp_unlinked);
}
#endif
if (!INTERNAL_OPTION(unsafe_ignore_eflags_ibl)) {
insert_restore_eflags(dcontext, &ilist, NULL, 0, IBL_EFLAGS_IN_TLS(),
absolute _IF_X64(x86_to_x64_ibl_opt));
}
APP(&ilist, unlinked);
}
if (!absolute) {
insert_shared_get_dcontext(dcontext, &ilist, NULL, true );
}
* was saved before for saving and restoring eflags. FIXME: in
* some incarnations of this routine it is redundant, yet this is
* the slow path anyways
*/
APP(&ilist, SAVE_TO_DC(dcontext, SCRATCH_REG0, SCRATCH_REG0_OFFS));
direct exit stubs to use XBX */
if (ibl_code->source_fragment_type == IBL_COARSE_SHARED) {
* fake linkstubs. Here we put the src from xbx into its special slot.
*/
ASSERT(DYNAMO_OPTION(coarse_units));
APP(&ilist, SAVE_TO_DC(dcontext, SCRATCH_REG1, COARSE_IB_SRC_OFFSET));
ASSERT(linkstub != NULL);
}
if (TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask)) {
* have safe stack yet! so we duplicate fcache_return code here,
* but we keep xcx w/ next tag around until we can store it as next_tag.
*/
if (linkstub == NULL) {
APP(&ilist,
XINST_CREATE_load(dcontext, opnd_create_reg(SCRATCH_REG0),
opnd_create_reg(SCRATCH_REG1)));
} else {
APP(&ilist,
INSTR_CREATE_mov_imm(dcontext, opnd_create_reg(SCRATCH_REG0),
OPND_CREATE_INTPTR((ptr_int_t)linkstub)));
}
append_fcache_return_common(dcontext, code, &ilist, true , absolute,
false , NULL, false );
} else {
APP(&ilist, SAVE_TO_DC(dcontext, SCRATCH_REG2, NEXT_TAG_OFFSET));
if (linkstub == NULL) {
APP(&ilist,
XINST_CREATE_load(dcontext, opnd_create_reg(SCRATCH_REG0),
opnd_create_reg(SCRATCH_REG1)));
} else {
APP(&ilist,
INSTR_CREATE_mov_imm(dcontext, opnd_create_reg(SCRATCH_REG0),
OPND_CREATE_INTPTR((ptr_int_t)linkstub)));
}
if (absolute) {
if (DYNAMO_OPTION(indirect_stubs))
APP(&ilist, RESTORE_FROM_DC(dcontext, SCRATCH_REG1, SCRATCH_REG1_OFFS));
} else {
if (DYNAMO_OPTION(indirect_stubs)) {
APP(&ilist,
RESTORE_FROM_TLS(dcontext, SCRATCH_REG1, INDIRECT_STUB_SPILL_SLOT));
}
* using XCX
*/
APP(&ilist, RESTORE_FROM_DC(dcontext, SCRATCH_REG2, SCRATCH_REG0_OFFS));
APP(&ilist, SAVE_TO_TLS(dcontext, SCRATCH_REG2, DIRECT_STUB_SPILL_SLOT));
}
* mcontext for private IBL routines (unless private_ib_in_tls is set).
* For x86_to_x64, we restore XCX from R9.
*/
if (IF_X64_ELSE(x86_to_x64_ibl_opt, false)) {
APP(&ilist, RESTORE_FROM_REG(dcontext, SCRATCH_REG2, REG_R9));
} else if (ibl_code->thread_shared_routine || DYNAMO_OPTION(private_ib_in_tls)) {
APP(&ilist, RESTORE_FROM_TLS(dcontext, SCRATCH_REG2, MANGLE_XCX_SPILL_SLOT));
} else {
APP(&ilist, RESTORE_FROM_DC(dcontext, SCRATCH_REG2, SCRATCH_REG2_OFFS));
}
if (!absolute) {
* save it again */
insert_shared_restore_dcontext_reg(dcontext, &ilist, NULL);
}
* other app registers restored */
APP(&ilist, INSTR_CREATE_jmp(dcontext, opnd_create_pc(fcache_return_pc)));
}
if (inline_ibl_head && !DYNAMO_OPTION(atomic_inlined_linking)) {
* >>> race_condition_inc:
* >>> #note that eflags are already saved in this path
* >>> <inc_stat>
* >>> jmp fragment_not_found
* #endif
* >>> #detect unlink path flag to check for unlinking race condition
* >>> #must be eflags safe, they are prob. not saved yet
* >>> unlinked:
* >>> movzx %cl, %xcx
* >>> # xcx now holds 1 in the unlink case, and the zero extended
* >>> # lower byte of a pointer into the hashtable in the race
* >>> # condition case (since our pointers into the hashtable are
* >>> # aligned this can't be 1), the loop will jmp if xcx != 1
* #ifdef HASHTABLE_STATISTICS
* >>> loop race_condition_inc #race condition handling path
* #else
* >>> loop fragment_not_found #race condition handling path
* #endif
* >>> #normal unlink path
* >>> RESTORE_FROM_UPCONTEXT xbx_offset, %xcx
* >>> SAVE_TO_UPCONTEXT %xbx, xbx_offset
* >>> jmp old_unlinked
*/
instr_t *old_unlinked_target = instr_get_next(unlinked);
instr_t *race_target = fragment_not_found;
#ifdef HASHTABLE_STATISTICS
if (INTERNAL_OPTION(hashtable_ibl_stats)) {
race_target = instrlist_last(&ilist);
if (save_xdi)
APP(&ilist, SAVE_TO_TLS(dcontext, SCRATCH_REG5, HTABLE_STATS_SPILL_SLOT));
append_increment_counter(dcontext, &ilist, ibl_code, patch, REG_NULL,
HASHLOOKUP_STAT_OFFS(race_condition), SCRATCH_REG2);
if (save_xdi) {
APP(&ilist,
RESTORE_FROM_TLS(dcontext, SCRATCH_REG5, HTABLE_STATS_SPILL_SLOT));
}
APP(&ilist,
INSTR_CREATE_jmp_short(dcontext, opnd_create_instr(fragment_not_found)));
}
race_target = instr_get_next(race_target);
#endif
instrlist_remove(&ilist, unlinked);
APP(&ilist, unlinked);
APP(&ilist,
INSTR_CREATE_movzx(dcontext, opnd_create_reg(SCRATCH_REG2),
opnd_create_reg(REG_CL)));
add_patch_marker(patch, unlinked, PATCH_ASSEMBLE_ABSOLUTE,
0 ,
(ptr_uint_t *)&ibl_code->unlinked_ibl_entry);
APP(&ilist, INSTR_CREATE_loop(dcontext, opnd_create_instr(race_target)));
#ifdef HASHTABLE_STATISTICS
if (INTERNAL_OPTION(hashtable_ibl_stats)) {
if (save_xdi)
APP(&ilist, SAVE_TO_TLS(dcontext, SCRATCH_REG5, HTABLE_STATS_SPILL_SLOT));
append_increment_counter(dcontext, &ilist, ibl_code, patch, REG_NULL,
HASHLOOKUP_STAT_OFFS(unlinked_count), SCRATCH_REG2);
if (save_xdi) {
APP(&ilist,
RESTORE_FROM_TLS(dcontext, SCRATCH_REG5, HTABLE_STATS_SPILL_SLOT));
} else if (only_spill_state_in_tls)
insert_shared_restore_dcontext_reg(dcontext, &ilist, NULL);
}
#endif
if (absolute) {
APP(&ilist, RESTORE_FROM_DC(dcontext, SCRATCH_REG2, SCRATCH_REG1_OFFS));
APP(&ilist, SAVE_TO_DC(dcontext, SCRATCH_REG1, SCRATCH_REG1_OFFS));
} else {
APP(&ilist, RESTORE_FROM_TLS(dcontext, SCRATCH_REG2, SCRATCH_REG1_OFFS));
APP(&ilist, SAVE_TO_TLS(dcontext, SCRATCH_REG1, SCRATCH_REG1_OFFS));
}
APP(&ilist,
INSTR_CREATE_jmp_short(dcontext, opnd_create_instr(old_unlinked_target)));
} else {
#ifdef HASHTABLE_STATISTICS
if (INTERNAL_OPTION(hashtable_ibl_stats)) {
instr_t *old_unlinked = instr_get_next(unlinked);
instrlist_remove(&ilist, unlinked);
APP(&ilist, unlinked);
add_patch_marker(patch, unlinked, PATCH_ASSEMBLE_ABSOLUTE,
0 ,
(ptr_uint_t *)&ibl_code->unlinked_ibl_entry);
* and not from top of ibl, so we must save xdi. Is this true
* for all cases of only_spill_state_in_tls with !save_xdi?
* Maybe should be saved in append_increment_counter's call to
* insert_shared_get_dcontext() instead.
*/
if (IF_X64_ELSE(true, save_xdi)) {
APP(&ilist, SAVE_TO_TLS(dcontext, SCRATCH_REG5, HTABLE_STATS_SPILL_SLOT));
}
* xax slot, but that should be dead here */
insert_save_eflags(dcontext, &ilist, NULL, 0, !absolute,
absolute _IF_X64(x86_to_x64_ibl_opt));
append_increment_counter(dcontext, &ilist, ibl_code, patch, REG_NULL,
HASHLOOKUP_STAT_OFFS(unlinked_count), SCRATCH_REG1);
insert_restore_eflags(dcontext, &ilist, NULL, 0, !absolute,
absolute _IF_X64(x86_to_x64_ibl_opt));
if (IF_X64_ELSE(true, save_xdi)) {
APP(&ilist,
RESTORE_FROM_TLS(dcontext, SCRATCH_REG5, HTABLE_STATS_SPILL_SLOT));
} else if (only_spill_state_in_tls) {
* at the entry point into the IBL routine but we do need to
* restore app state now.
*/
insert_shared_restore_dcontext_reg(dcontext, &ilist, NULL);
}
APP(&ilist, INSTR_CREATE_jmp(dcontext, opnd_create_instr(old_unlinked)));
} else
#endif
{
add_patch_marker(patch, unlinked, PATCH_ASSEMBLE_ABSOLUTE,
0 ,
(ptr_uint_t *)&ibl_code->unlinked_ibl_entry);
}
}
#ifdef X64
if (GENCODE_IS_X86(code->gencode_mode)) {
* using an x86 table, for both full correctness and performance: for now
* we have no way to detect a source in one mode jumping to a target built in
* another mode w/o a mode switch, but that would be an app error anyway).
* Rather than complicating the REG_X* defines used above we have a post-pass
* that shrinks all the registers and all the INTPTR immeds.
* The other two changes we need are performed up above:
* 1) cmp top bits to 0 for match
* 2) no trace_cmp entry points
* Note that we're punting on PR 283152: we go ahead and clobber the top bits
* of all our scratch registers.
*/
instrlist_convert_to_x86(&ilist);
}
#endif
ibl_code->ibl_routine_length = encode_with_patch_list(dcontext, patch, &ilist, pc);
instrlist_clear(dcontext, &ilist);
return pc + ibl_code->ibl_routine_length;
}
/* Re-targets the special IBL transfer slot `index` in the applicable generated code
 * so that its patchable relative target goes to the IBL routine selected by
 * entry_type and ibl_type.  Does nothing if no generated code is found.
 */
void
relink_special_ibl_xfer(dcontext_t *dcontext, int index,
                        ibl_entry_point_type_t entry_type, ibl_branch_type_t ibl_type)
{
    generated_code_t *code;
    byte *new_target;
    byte *operand_pc;

    /* Pick the generated code that holds the transfer routine. */
    if (dcontext != GLOBAL_DCONTEXT) {
#ifdef X64
        code = SHARED_GENCODE_MATCH_THREAD(dcontext);
#else
        ASSERT(special_ibl_xfer_is_thread_private());
        code = THREAD_GENCODE(dcontext);
#endif
    } else {
        ASSERT(!special_ibl_xfer_is_thread_private());
        code = SHARED_GENCODE_MATCH_THREAD(get_thread_private_dcontext());
    }
    if (code == NULL)
        return;

    new_target = special_ibl_xfer_tgt(dcontext, code, entry_type, ibl_type);
    ASSERT(code->special_ibl_xfer[index] != NULL);

    /* Skip one byte past the recorded unlink offset so that we point at the
     * relative displacement (presumably just past a 1-byte jmp opcode -- confirm
     * against the emitter of special_ibl_unlink_offs).
     */
    operand_pc = code->special_ibl_xfer[index] + code->special_ibl_unlink_offs[index];
    operand_pc += 1;

    /* The generated code is made writable only for the duration of the patch. */
    protect_generated_code(code, WRITABLE);
    insert_relative_target(operand_pc, new_target, code->thread_shared);
    protect_generated_code(code, READONLY);
}
/* Fills the size bytes at addr with nop encodings: a single multi-byte nop for
 * sizes 1 through 9, and a run of 1-byte nops (0x90) for any other size.
 * isa_mode is not consulted.  Always returns true.
 */
bool
fill_with_nops(dr_isa_mode_t isa_mode, byte *addr, size_t size)
{
    /* Xref AMD's Software Optimization Guide, document
     * #47414, section 5.8 "Code Padding with Operand-Size Override and Multibyte
     * NOP".
     * For compatibility with Intel case 10 and 11 are left out.
     * Xref Intel, see Vol. 2B 4-167 "Table 4-12. Recommended Multi-Byte Sequence of NOP
     * Instruction".
     */
    enum { LONGEST_MULTIBYTE_NOP = 9 };
    /* Indexed by length in bytes; entry 0 is an unused placeholder. */
    static const char *const nop_by_length[LONGEST_MULTIBYTE_NOP + 1] = {
        "",
        "\x90",
        "\x66\x90",
        "\x0f\x1f\x00",
        "\x0f\x1f\x40\x00",
        "\x0f\x1f\x44\x00\x00",
        "\x66\x0f\x1f\x44\x00\x00",
        "\x0f\x1f\x80\x00\x00\x00\x00",
        "\x0f\x1f\x84\x00\x00\x00\x00\x00",
        "\x66\x0f\x1f\x84\x00\x00\x00\x00\x00",
    };

    if (size >= 1 && size <= LONGEST_MULTIBYTE_NOP) {
        memcpy(addr, nop_by_length[size], size);
    } else {
        /* Size 0 and sizes beyond the table fall back to 1-byte nops. */
        memset(addr, 0x90, size);
    }
    return true;
}
* the jmp in jmp_target as if was located at app_loc. */
bool
is_jmp_rel32(byte *code_buf, app_pc app_loc, app_pc *jmp_target )
{
if (*code_buf == JMP_OPCODE) {
if (jmp_target != NULL) {
*jmp_target = app_loc + JMP_LONG_LENGTH + *(int *)(code_buf + 1);
}
return true;
}
return false;
}
* the jmp in jmp_target as if was located at app_loc. */
bool
is_jmp_rel8(byte *code_buf, app_pc app_loc, app_pc *jmp_target )
{
if (*code_buf == JMP_SHORT_OPCODE) {
if (jmp_target != NULL) {
*jmp_target = app_loc + JMP_SHORT_LENGTH + *(char *)(code_buf + 1);
}
return true;
}
return false;
}