/* **********************************************************
* Copyright (c) 2014-2021 Google, Inc. All rights reserved.
* Copyright (c) 2016 ARM Limited. All rights reserved.
* **********************************************************/
/*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the name of ARM Limited nor the names of its contributors may be
* used to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL ARM LIMITED OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
* DAMAGE.
*/
#include "../globals.h"
#include "arch.h"
#include "instr.h"
#include "instr_create_shared.h"
#include "instrlist.h"
#include "instrument.h"
#define APP instrlist_meta_append
#define PRE instrlist_meta_preinsert
#define OPREG opnd_create_reg
#define BR_X1_INST (0xd61f0000 | 1 << 5)
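/* Exit stub layouts. The form of a direct exit stub depends on whether the
* exit is linked and, when it is, on how far away the target
* fragment is: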
*
* Unlinked:
* exit_cti stub
* ...
* stub:
* stp x0, x1, [x28]
* movz x0, #&linkstub[0, 16), lsl #0x00
* movk x0, #&linkstub[16, 32), lsl #0x10
* movk x0, #&linkstub[32, 48), lsl #0x20
* movk x0, #&linkstub[48, 64), lsl #0x30
* ldr x1, [#8/#12]
* br x1
* <fcache-return>
*
* Linked, exit_cti_reaches_target (near fragment):
* exit_cti target_fragment
* ...
* stub:
* stp x0, x1, [x28]
* movz x0, #&linkstub[0, 16), lsl #0x00
* movk x0, #&linkstub[16, 32), lsl #0x10
* movk x0, #&linkstub[32, 48), lsl #0x20
* movk x0, #&linkstub[48, 64), lsl #0x30
* ldr x1, [#8/#12]
* br x1
* <fcache-return>
*
* Linked, unconditional branch reaches target (intermediate fragment):
* exit_cti stub
* ...
* stub:
* b target_fragment
* movz x0, #&linkstub[0, 16), lsl #0x00
* movk x0, #&linkstub[16, 32), lsl #0x10
* movk x0, #&linkstub[32, 48), lsl #0x20
* movk x0, #&linkstub[48, 64), lsl #0x30
* ldr x1, [#8/#12]
* br x1
* <fcache-return>
*
* Linked, !unconditional branch reaches target (far fragment):
* exit_cti stub
* ...
* stub:
* stp x0, x1, [x28]
* movz x0, #&linkstub[0, 16), lsl #0x00
* movk x0, #&linkstub[16, 32), lsl #0x10
* movk x0, #&linkstub[32, 48), lsl #0x20
* movk x0, #&linkstub[48, 64), lsl #0x30
* ldr x1, [#8/#12]
* br x1
* <target_fragment_prefix>
*
* To ensure atomicity of <target> patching, the data slot must be 8-byte
* aligned. We do this by reserving 12 bytes for the data slot and using the
* appropriate offset in ldr for the 8-byte aligned 8 byte region within it.
*
* For complete design details, see the following wiki
* https://dynamorio.org/page_aarch64_far.html
*/
/* Placeholder: no implementation is provided in this file. Hits
 * ASSERT_NOT_IMPLEMENTED(false) and returns NULL; none of the parameters are used.
 */
byte *
insert_relative_target(byte *pc, cache_pc target, bool hot_patch)
{
    ASSERT_NOT_IMPLEMENTED(false);
    return NULL;
}
/* Placeholder: no implementation is provided in this file. Hits
 * ASSERT_NOT_IMPLEMENTED(false) and returns NULL; none of the parameters are used.
 */
byte *
insert_relative_jump(byte *pc, cache_pc target, bool hot_patch)
{
    ASSERT_NOT_IMPLEMENTED(false);
    return NULL;
}
/* Placeholder: no implementation is provided in this file. Hits
 * ASSERT_NOT_IMPLEMENTED(false) and returns 0; none of the parameters are used.
 */
uint
nop_pad_ilist(dcontext_t *dcontext, fragment_t *f, instrlist_t *ilist, bool emitting)
{
    ASSERT_NOT_IMPLEMENTED(false);
    return 0;
}
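/* Emits, starting at pc, the instructions that load the immediate val into the
* register dst, and returns the address just past the last instruction written.
* pc must be a writable (vmcode) pc.
*/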
uint *
insert_mov_imm(uint *pc, reg_id_t dst, ptr_int_t val)
{
    /* Register number as it appears in bits 4:0 of the encodings below. */
    uint rt = dst - DR_REG_X0;
    uint hw;
    ASSERT(rt < 31);
    /* movz x(rt), #val[0, 16): always emitted, it also zeroes the upper bits. */
    *pc++ = 0xd2800000 | rt | (val & 0xffff) << 5;
    /* movk x(rt), #val[16*hw, 16*hw + 16), lsl #(16*hw): emitted only for
     * non-zero 16-bit chunks. The hw field lives in bits 22:21, so hw = 1, 2, 3
     * yields the opcodes 0xf2a00000, 0xf2c00000 and 0xf2e00000.
     */
    for (hw = 1; hw < 4; hw++) {
        ptr_int_t chunk = (val >> (16 * hw)) & 0xffff;
        if (chunk != 0)
            *pc++ = (0xf2800000 | hw << 21) | rt | chunk << 5;
    }
    return pc;
}
/* Returns the writable address of the target_pc data slot of the given stub: the
* 8-byte aligned region in the 12-byte slot reserved in the stub.
*/
static ptr_uint_t *
get_target_pc_slot(fragment_t *f, cache_pc stub_pc)
{
    /* The reserved data area occupies the last DIRECT_EXIT_STUB_DATA_SZ bytes of
     * the direct exit stub.
     */
    cache_pc data_area = stub_pc + DIRECT_EXIT_STUB_SIZE(f->flags) - DIRECT_EXIT_STUB_DATA_SZ;
    cache_pc writable_data_area = vmcode_get_writable_addr(data_area);
    /* Round up to the next 8-byte boundary inside the reserved area. */
    return (ptr_uint_t *)ALIGN_FORWARD(writable_data_area, 8);
}
/* Emits the exit stub for linkstub l at stub_pc and returns the size of the
* emitted code in bytes. This routine assumes that the caller will
* take care of any cache synchronization necessary.
* The stub is unlinked initially, except coarse grain indirect exits,
* which are always linked.
*/
int
insert_exit_stub_other_flags(dcontext_t *dcontext, fragment_t *f, linkstub_t *l,
                             cache_pc stub_pc, ushort l_flags)
{
    /* All writes go through the writable alias of stub_pc. */
    uint *write_stub_pc = (uint *)vmcode_get_writable_addr(stub_pc);
    uint *pc = write_stub_pc;
    uint num_nops_needed = 0;
    /* Coarse-grain fragments are not handled by this routine. */
    ASSERT_NOT_IMPLEMENTED(!TEST(FRAG_COARSE_GRAIN, f->flags));
    if (LINKSTUB_DIRECT(l_flags)) {
        /* stp x0, x1, [x(stolen), #(offs)] */
        *pc++ = (0xa9000000 | 0 | 1 << 10 | (dr_reg_stolen - DR_REG_X0) << 5 |
                 TLS_REG0_SLOT >> 3 << 15);
        /* mov x0, #&linkstub (one movz plus up to three movk) */
        pc = insert_mov_imm(pc, DR_REG_X0, (ptr_int_t)l);
        /* (pc - write_stub_pc - 1) is the number of mov instructions just emitted;
         * the remainder up to 4 is padded with NOPs further down.
         */
        num_nops_needed = 4 - (pc - write_stub_pc - 1);
        ptr_uint_t *target_pc_slot = get_target_pc_slot(f, stub_pc);
        ASSERT(pc < (uint *)target_pc_slot);
        /* Distance from the ldr to the data slot, counted in 4-byte words. */
        uint target_pc_slot_offs = (uint *)target_pc_slot - pc;
        /* ldr x1, [pc, #(target_pc_slot_offs * 4)] (literal load) */
        *pc++ = (0x58000000 | (DR_REG_X1 - DR_REG_X0) | target_pc_slot_offs << 5);
        /* br x1 */
        *pc++ = BR_X1_INST;
        /* Fill up with NOPs, depending on how many instructions we needed to move
         * the immediate into a register. Ideally we would skip adding NOPs, but
         * lots of places expect the stub size to be fixed.
         */
        for (uint j = 0; j < num_nops_needed; j++)
            *pc++ = RAW_NOP_INST;
        /* What follows is the data slot, which holds the address of either
         * the fcache-return routine or the linked fragment. We reserve 12 bytes
         * and use the 8-byte aligned region of 8 bytes within it.
         */
        ASSERT(pc == (uint *)target_pc_slot || pc + 1 == (uint *)target_pc_slot);
        ASSERT(sizeof(app_pc) == 8);
        pc += DIRECT_EXIT_STUB_DATA_SZ / sizeof(uint);
        /* The slot starts out holding the fcache-return routine address. The
         * assert checks that this address is the
         * same, no matter which thread creates/unpatches the stub.
         */
        ASSERT(fcache_return_routine(dcontext) == fcache_return_routine(GLOBAL_DCONTEXT));
        *target_pc_slot = (ptr_uint_t)fcache_return_routine(dcontext);
        ASSERT((ptr_int_t)((byte *)pc - (byte *)write_stub_pc) ==
               DIRECT_EXIT_STUB_SIZE(l_flags));
    } else {
        /* Indirect exit: the stub initially targets the unlinked entry. */
        cache_pc exit_target =
            get_unlinked_entry(dcontext, EXIT_TARGET_TAG(dcontext, f, l));
        /* stp x0, x1, [x(stolen), #(offs)] */
        *pc++ = (0xa9000000 | 0 | 1 << 10 | (dr_reg_stolen - DR_REG_X0) << 5 |
                 TLS_REG0_SLOT >> 3 << 15);
        /* mov x0, #&linkstub (one movz plus up to three movk) */
        pc = insert_mov_imm(pc, DR_REG_X0, (ptr_int_t)l);
        num_nops_needed = 4 - (pc - write_stub_pc - 1);
        /* ldr x1, [x(stolen), #(offs)] */
        *pc++ = (0xf9400000 | 1 | (dr_reg_stolen - DR_REG_X0) << 5 |
                 get_ibl_entry_tls_offs(dcontext, exit_target) >> 3 << 10);
        /* br x1 */
        *pc++ = BR_X1_INST;
        /* Fill up with NOPs, depending on how many instructions we needed to move
         * the immediate into a register. Ideally we would skip adding NOPs, but
         * lots of places expect the stub size to be fixed.
         */
        for (uint j = 0; j < num_nops_needed; j++)
            *pc++ = RAW_NOP_INST;
    }
    return (int)((byte *)pc - (byte *)write_stub_pc);
}
/* Returns whether the exit CTI of linkstub l in fragment f can encode a branch
 * directly to target_pc. The instruction kind is recognised from the raw
 * encoding at the CTI's pc; an unrecognised encoding asserts and yields false.
 */
bool
exit_cti_reaches_target(dcontext_t *dcontext, fragment_t *f, linkstub_t *l,
                        cache_pc target_pc)
{
    cache_pc cti_pc = EXIT_CTI_PC(f, l);
    /* Displacement in unsigned, modulo arithmetic: adding half of the window and
     * comparing against the full window tests a signed range with one compare.
     */
    ptr_uint_t disp = (ptr_uint_t)target_pc - (ptr_uint_t)cti_pc;
    uint insn = *(uint *)cti_pc;
    bool is_b = (insn & 0xfc000000) == 0x14000000;           /* B */
    bool is_bcond = (insn & 0xff000010) == 0x54000000;       /* B.cond */
    bool is_cbz_or_cbnz = (insn & 0x7e000000) == 0x34000000; /* CBZ, CBNZ */
    bool is_tbz_or_tbnz = (insn & 0x7e000000) == 0x36000000; /* TBZ, TBNZ */
    ASSERT(ALIGNED(cti_pc, 4) && ALIGNED(target_pc, 4));
    if (is_b)
        return (disp + 0x8000000 < 0x10000000);
    /* NOTE(review): the two windows below are narrower than the ranges that
     * patch_branch() asserts for the same instruction classes (0x100000 and
     * 0x8000) -- confirm whether the conservative bounds are intentional.
     */
    if (is_bcond || is_cbz_or_cbnz)
        return (disp + 0x40000 < 0x80000);
    if (is_tbz_or_tbnz)
        return (disp + 0x2000 < 0x4000);
    ASSERT(false);
    return false;
}
/* Links the direct exit stub at stub_pc to a target fragment. If target_pc is
 * within reach of an unconditional branch, the stub's first instruction is
 * overwritten with "b target_pc"; otherwise the stub's data slot is set to
 * target_prefix_pc so that the stub's existing ldr/br pair jumps there.
 * hot_patch requests an instruction-cache sync for the code modification.
 */
void
patch_stub(fragment_t *f, cache_pc stub_pc, cache_pc target_pc, cache_pc target_prefix_pc,
           bool hot_patch)
{
    /* Compute the offset as unsigned, modulo arithmetic. */
    ptr_uint_t off = (ptr_uint_t)target_pc - (ptr_uint_t)stub_pc;
    if (off + 0x8000000 < 0x10000000) {
        /* target_pc is within reach of a direct unconditional branch
         * (OP_b, 26-bit signed immediate offset).
         * i#1911: Patching arbitrary instructions to an unconditional branch
         * is theoretically not sound. Architectural specifications do not
         * guarantee safe behaviour or any bound on when the change will be
         * visible to other processor elements.
         */
        *(uint *)vmcode_get_writable_addr(stub_pc) =
            (0x14000000 | (0x03ffffff & off >> 2));
        if (hot_patch)
            machine_cache_sync(stub_pc, stub_pc + 4, true);
        return;
    }
    /* target_pc is a far fragment, so the stub's indirect branch is used. The
     * branch needs to be to the fragment prefix, as we need to restore the clobbered
     * regs.
     */
    /* Only data is modified here, so false is passed as the macro's last argument. */
    ATOMIC_8BYTE_ALIGNED_WRITE(get_target_pc_slot(f, stub_pc),
                               (ptr_uint_t)target_prefix_pc,
                               false);
    return;
}
/* Returns whether the first instruction of the stub at stub_pc is currently an
 * unconditional branch (B), i.e. the form patch_stub() writes for targets that
 * a direct branch can reach.
 */
static bool
stub_is_patched_for_intermediate_fragment_link(dcontext_t *dcontext, cache_pc stub_pc)
{
    uint first_insn;
    ATOMIC_4BYTE_ALIGNED_READ(stub_pc, &first_insn);
    /* The top six bits select the B opcode. */
    uint opcode_bits = first_insn & 0xfc000000;
    return opcode_bits == 0x14000000;
}
/* Returns whether the data slot of the stub at stub_pc holds something other
 * than the fcache-return routine address, i.e. whether patch_stub() redirected
 * the stub's indirect branch.
 */
static bool
stub_is_patched_for_far_fragment_link(dcontext_t *dcontext, fragment_t *f,
                                      cache_pc stub_pc)
{
    ptr_uint_t slot_value;
    ptr_uint_t *slot = get_target_pc_slot(f, stub_pc);
    ATOMIC_8BYTE_ALIGNED_READ(slot, &slot_value);
    ptr_uint_t unlinked_value = (ptr_uint_t)fcache_return_routine(dcontext);
    return slot_value != unlinked_value;
}
/* Returns whether the stub at stub_pc is linked by either mechanism: the
 * direct-branch overwrite or the redirected data slot.
 */
bool
stub_is_patched(dcontext_t *dcontext, fragment_t *f, cache_pc stub_pc)
{
    /* Check the cheaper instruction test first; the slot is only read if needed. */
    if (stub_is_patched_for_intermediate_fragment_link(dcontext, stub_pc))
        return true;
    return stub_is_patched_for_far_fragment_link(dcontext, f, stub_pc);
}
/* Undoes patch_stub() for the stub at stub_pc: restores either the stub's first
 * instruction or its data slot to the unlinked state. Does nothing when the
 * stub is not patched. hot_patch requests an instruction-cache sync for the
 * code modification.
 */
void
unpatch_stub(dcontext_t *dcontext, fragment_t *f, cache_pc stub_pc, bool hot_patch)
{
    /* The two branches are mutually exclusive: either the patching strategy for
     * intermediate fragments or the one for far fragments.
     */
    if (stub_is_patched_for_intermediate_fragment_link(dcontext, stub_pc)) {
        /* Restore the stp x0, x1, [x(stolen), #(offs)].
         * i#1911: Patching unconditional branch to some arbitrary instruction
         * is theoretically not sound. Architectural specifications do not
         * guarantee safe behaviour or any bound on when the change will be
         * visible to other processor elements.
         */
        *(uint *)vmcode_get_writable_addr(stub_pc) =
            (0xa9000000 | 0 | 1 << 10 | (dr_reg_stolen - DR_REG_X0) << 5 |
             TLS_REG0_SLOT >> 3 << 15);
        if (hot_patch)
            machine_cache_sync(stub_pc, stub_pc + AARCH64_INSTR_SIZE, true);
    } else if (stub_is_patched_for_far_fragment_link(dcontext, f, stub_pc)) {
        /* The assert checks that the fcache-return routine address is the
         * same, no matter which thread creates/unpatches the stub.
         */
        ASSERT(fcache_return_routine(dcontext) == fcache_return_routine(GLOBAL_DCONTEXT));
        /* Only data is modified here, so false is passed as the macro's last argument. */
        ATOMIC_8BYTE_ALIGNED_WRITE(get_target_pc_slot(f, stub_pc),
                                   (ptr_uint_t)fcache_return_routine(dcontext),
                                   false);
    }
}
/* Rewrites the immediate of the branch instruction at branch_pc so that it
 * targets target_pc, keeping the instruction's other fields. Handles B,
 * B.cond, CBZ/CBNZ and TBZ/TBNZ; any other encoding asserts and is left
 * untouched. When hot_patch is set, the modified word is cache-synced.
 */
void
patch_branch(dr_isa_mode_t isa_mode, cache_pc branch_pc, cache_pc target_pc,
             bool hot_patch)
{
    /* Displacement in unsigned, modulo arithmetic. */
    ptr_uint_t disp = (ptr_uint_t)target_pc - (ptr_uint_t)branch_pc;
    /* Branch immediates are expressed in units of 4-byte instructions. */
    ptr_uint_t word_disp = disp >> 2;
    uint *writable_pc = (uint *)vmcode_get_writable_addr(branch_pc);
    uint old_enc = *writable_pc;
    bool is_b = (old_enc & 0xfc000000) == 0x14000000;
    bool is_bcond = (old_enc & 0xff000010) == 0x54000000;
    bool is_cbz_or_cbnz = (old_enc & 0x7e000000) == 0x34000000;
    bool is_tbz_or_tbnz = (old_enc & 0x7e000000) == 0x36000000;
    ASSERT(ALIGNED(branch_pc, 4) && ALIGNED(target_pc, 4));
    if (is_b) {
        /* B: 26-bit immediate in bits 25:0. */
        ASSERT(disp + 0x8000000 < 0x10000000);
        *writable_pc = (0x14000000 | (0x03ffffff & word_disp));
    } else if (is_bcond || is_cbz_or_cbnz) {
        /* B.cond, CBZ, CBNZ: 19-bit immediate in bits 23:5. */
        ASSERT(disp + 0x100000 < 0x200000);
        *writable_pc = (old_enc & 0xff00001f) | (0x00ffffe0 & (word_disp << 5));
    } else if (is_tbz_or_tbnz) {
        /* TBZ, TBNZ: 14-bit immediate in bits 18:5. */
        ASSERT(disp + 0x8000 < 0x10000);
        *writable_pc = (old_enc & 0xfff8001f) | (0x0007ffe0 & (word_disp << 5));
    } else {
        ASSERT(false);
    }
    if (hot_patch)
        machine_cache_sync(branch_pc, branch_pc + 4, true);
    return;
}
/* Always returns 0; none of the parameters are consulted. */
uint
patchable_exit_cti_align_offs(dcontext_t *dcontext, instr_t *inst, cache_pc pc)
{
    return 0;
}
/* Placeholder: no implementation is provided in this file. Hits
 * ASSERT_NOT_IMPLEMENTED(false) and returns NULL.
 */
cache_pc
exit_cti_disp_pc(cache_pc branch_pc)
{
    ASSERT_NOT_IMPLEMENTED(false);
    return NULL;
}
/* Given a pointer to the last instruction word of a stub, walks backwards over
 * the NOP padding and returns the address of the "br x1" that precedes it.
 */
static uint *
get_stub_branch(uint *pc)
{
    uint *cursor;
    /* Step backwards for as long as the current word is a NOP. */
    for (cursor = pc; *cursor == RAW_NOP_INST; cursor--) {
    }
    /* The first non-NOP word must be the branch. */
    ASSERT(*cursor == BR_X1_INST);
    return cursor;
}
/* Links the indirect exit stub of linkstub l by rewriting the load that
 * precedes the stub's "br x1" so that it fetches the TLS entry of the linked
 * IBL target. When hot_patch is set, the rewritten word is cache-synced.
 */
void
link_indirect_exit_arch(dcontext_t *dcontext, fragment_t *f, linkstub_t *l,
                        bool hot_patch, app_pc target_tag)
{
    byte *stub_start = (byte *)EXIT_STUB_PC(dcontext, f, l);
    ibl_type_t ibl_type = { 0 };
    DEBUG_DECLARE(bool is_ibl =)
    get_ibl_routine_type_ex(dcontext, target_tag, &ibl_type);
    ASSERT(is_ibl);
    /* Use target_tag as-is when it already denotes a linked entry; otherwise
     * translate it to the corresponding linked entry.
     */
    cache_pc exit_target = IS_IBL_LINKED(ibl_type.link_state)
        ? target_tag
        : get_linked_entry(dcontext, target_tag);
    /* Start from the last instruction word of the stub ... */
    uint *last_word = (uint *)(stub_start +
                               exit_stub_size(dcontext, target_tag, f->flags) -
                               AARCH64_INSTR_SIZE);
    /* ... and locate the load, which sits right before the br x1. */
    uint *load_pc = get_stub_branch(last_word) - 1;
    /* ldr x1, [x(stolen), #(offs)] */
    *(uint *)vmcode_get_writable_addr((byte *)load_pc) =
        (0xf9400000 | 1 | (dr_reg_stolen - DR_REG_X0) << 5 |
         get_ibl_entry_tls_offs(dcontext, exit_target) >> 3 << 10);
    if (hot_patch)
        machine_cache_sync(load_pc, load_pc + 1, true);
}
/* Returns the stub pc for linkstub l of fragment f by decoding the target of
 * its exit CTI, or NULL when the exit has no stub. The CTI must be either an
 * unconditional jump or a compare-with-zero branch.
 */
cache_pc
indirect_linkstub_stub_pc(dcontext_t *dcontext, fragment_t *f, linkstub_t *l)
{
    cache_pc exit_cti_pc = EXIT_CTI_PC(f, l);
    cache_pc stub = NULL;
    if (!EXIT_HAS_STUB(l->flags, f->flags))
        return NULL;
    if (decode_raw_is_jmp(dcontext, exit_cti_pc)) {
        stub = decode_raw_jmp_target(dcontext, exit_cti_pc);
    } else if (decode_raw_is_cond_branch_zero(dcontext, exit_cti_pc)) {
        stub = decode_raw_cond_branch_zero_target(dcontext, exit_cti_pc);
    } else {
        /* No other exit CTI form is expected here. */
        ASSERT_NOT_REACHED();
    }
    return stub;
}
/* Placeholder: no implementation is provided in this file. Hits
 * ASSERT_NOT_IMPLEMENTED(false) and returns NULL.
 */
cache_pc
cbr_fallthrough_exit_cti(cache_pc prev_cti_pc)
{
    ASSERT_NOT_IMPLEMENTED(false);
    return NULL;
}
/* Unlinks the indirect exit stub of linkstub l by rewriting the load that
 * precedes the stub's "br x1" so that it fetches the TLS entry of the unlinked
 * IBL entry point. Returns without changes if l is not marked LINK_LINKED.
 * The rewritten word is always cache-synced.
 */
void
unlink_indirect_exit(dcontext_t *dcontext, fragment_t *f, linkstub_t *l)
{
    byte *stub_start = (byte *)EXIT_STUB_PC(dcontext, f, l);
    ASSERT(linkstub_owned_by_fragment(dcontext, f, l));
    ASSERT(LINKSTUB_INDIRECT(l->flags));
    if (!TEST(LINK_LINKED, l->flags))
        return;
    ibl_code_t *ibl_code =
        get_ibl_routine_code(dcontext, extract_branchtype(l->flags), f->flags);
    cache_pc exit_target = ibl_code->unlinked_ibl_entry;
    /* Start from the last instruction word of the stub ... */
    uint *last_word =
        (uint *)(stub_start +
                 exit_stub_size(dcontext, ibl_code->indirect_branch_lookup_routine,
                                f->flags) -
                 AARCH64_INSTR_SIZE);
    /* ... and locate the load, which sits right before the br x1. */
    uint *load_pc = get_stub_branch(last_word) - 1;
    /* ldr x1, [x(stolen), #(offs)] */
    *(uint *)vmcode_get_writable_addr((byte *)load_pc) =
        (0xf9400000 | 1 | (dr_reg_stolen - DR_REG_X0) << 5 |
         get_ibl_entry_tls_offs(dcontext, exit_target) >> 3 << 10);
    machine_cache_sync(load_pc, load_pc + 1, true);
}
/*******************************************************************************
* COARSE-GRAIN FRAGMENT SUPPORT
*/
/* Placeholder: no implementation is provided in this file. Hits
 * ASSERT_NOT_IMPLEMENTED(false) and returns NULL.
 */
cache_pc
entrance_stub_jmp(cache_pc stub)
{
    ASSERT_NOT_IMPLEMENTED(false);
    return NULL;
}
/* Always returns false; the stub argument is not examined. */
bool
coarse_is_entrance_stub(cache_pc stub)
{
    return false;
}
/*###########################################################################
*
* fragment_t Prefixes
*/
/* Returns FRAGMENT_BASE_PREFIX_SIZE(flags); nothing is added on top of the base
 * prefix size here.
 */
int
fragment_ibt_prefix_size(uint flags)
{
    return FRAGMENT_BASE_PREFIX_SIZE(flags);
}
/* Writes the fragment prefix at f->start_pc (through its writable alias) and
 * records the prefix length in f->prefix_size. The prefix is one instruction
 * that reloads x0 and x1 from the TLS slots starting at TLS_REG0_SLOT, which
 * is where the exit stubs in this file spill them.
 */
void
insert_fragment_prefix(dcontext_t *dcontext, fragment_t *f)
{
    byte *write_start = vmcode_get_writable_addr(f->start_pc);
    byte *pc = write_start;
    ASSERT(f->prefix_size == 0);
    /* ldp x0, x1, [x(stolen), #(offs)]
     * Field layout: Rt in bits 4:0, Rn in bits 9:5, Rt2 in bits 14:10 and the
     * scaled (offset / 8) imm7 in bits 21:15. The offset therefore has to be
     * shifted by 15, exactly as in the matching stp written by the exit stubs;
     * shifting by 10 would corrupt the Rt2 field instead.
     */
    *(uint *)pc = (0xa9400000 | (DR_REG_X0 - DR_REG_X0) | (DR_REG_X1 - DR_REG_X0) << 10 |
                   (dr_reg_stolen - DR_REG_X0) << 5 | TLS_REG0_SLOT >> 3 << 15);
    pc += AARCH64_INSTR_SIZE;
    f->prefix_size = (byte)(((cache_pc)pc) - write_start);
    ASSERT(f->prefix_size == fragment_prefix_size(f->flags));
}
/* Appends nothing to ilist: the only action is the not-implemented assert that
 * EXIT_DR_HOOK is NULL.
 */
void
append_call_exit_dr_hook(dcontext_t *dcontext, instrlist_t *ilist, bool absolute,
                         bool shared)
{
    ASSERT_NOT_IMPLEMENTED(EXIT_DR_HOOK == NULL);
}
/* Appends code that reloads three consecutive 32-bit values stored at
 * XFLAGS_OFFSET, +4 and +8 into w0, w1 and w2, and then writes x0, x1 and x2
 * to NZCV, FPCR and FPSR respectively. Clobbers x0-x2.
 */
void
append_restore_xflags(dcontext_t *dcontext, instrlist_t *ilist, bool absolute)
{
    /* Load the three saved values (via RESTORE_FROM_DC) into w0, w1, w2. */
    APP(ilist, RESTORE_FROM_DC(dcontext, DR_REG_W0, XFLAGS_OFFSET));
    APP(ilist, RESTORE_FROM_DC(dcontext, DR_REG_W1, XFLAGS_OFFSET + 4));
    APP(ilist, RESTORE_FROM_DC(dcontext, DR_REG_W2, XFLAGS_OFFSET + 8));
    /* msr nzcv, x0 */
    APP(ilist,
        INSTR_CREATE_msr(dcontext, opnd_create_reg(DR_REG_NZCV),
                         opnd_create_reg(DR_REG_X0)));
    /* msr fpcr, x1 */
    APP(ilist,
        INSTR_CREATE_msr(dcontext, opnd_create_reg(DR_REG_FPCR),
                         opnd_create_reg(DR_REG_X1)));
    /* msr fpsr, x2 */
    APP(ilist,
        INSTR_CREATE_msr(dcontext, opnd_create_reg(DR_REG_FPSR),
                         opnd_create_reg(DR_REG_X2)));
}
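/* Appends code that reloads the SIMD (and, when SVE is present, the SVE vector,
* predicate and FFR) registers from the mcontext.
* dcontext is in REG_DCXT; x1 and x2 are used as scratch.
*/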
void
append_restore_simd_reg(dcontext_t *dcontext, instrlist_t *ilist, bool absolute)
{
    int i;
    /* add x1, x(dcxt), #(offset simd) */
    APP(ilist,
        XINST_CREATE_add_2src(dcontext, opnd_create_reg(DR_REG_X1),
                              opnd_create_reg(REG_DCXT),
                              OPND_CREATE_INTPTR(offsetof(priv_mcontext_t, simd))));
    for (i = 0; i < 32; i += 2) {
        /* ldp q(i), q(i + 1), [x1, #(i * 16)] */
        APP(ilist,
            INSTR_CREATE_ldp(
                dcontext, opnd_create_reg(DR_REG_Q0 + i),
                opnd_create_reg(DR_REG_Q0 + i + 1),
                opnd_create_base_disp(DR_REG_X1, DR_REG_NULL, 0, i * 16, OPSZ_32)));
    }
    if (proc_has_feature(FEATURE_SVE)) {
        for (i = 0; i < 32; i++) {
            /* ldr z(i), [x1, #(i mul vl)]
             * From the SVE manual:
             * "Load a vector register from a memory address generated by a
             * 64-bit scalar base, plus an immediate offset in the range -256
             * to 255 which is multiplied by the current vector register size
             * in bytes."
             */
            APP(ilist,
                INSTR_CREATE_ldr(
                    dcontext, opnd_create_reg(DR_REG_Z0 + i),
                    opnd_create_base_disp(
                        DR_REG_X1, DR_REG_NULL, 0, i * proc_get_vector_length_bytes(),
                        opnd_size_from_bytes(proc_get_vector_length_bytes()))));
        }
        /* add x1, x(dcxt), #(offset svep) */
        APP(ilist,
            XINST_CREATE_add_2src(dcontext, opnd_create_reg(DR_REG_X1),
                                  opnd_create_reg(REG_DCXT),
                                  OPND_CREATE_INTPTR(offsetof(priv_mcontext_t, svep))));
        /* P15 is skipped by this loop because it is used as the temporary
         * register for FFR load below, then restored from svep afterwards.
         */
        for (i = 0; i < 15; i++) {
            /* ldr p(i), [x1, #(i mul vl)] */
            APP(ilist,
                INSTR_CREATE_ldr(
                    dcontext, opnd_create_reg(DR_REG_P0 + i),
                    opnd_create_base_disp(
                        DR_REG_X1, DR_REG_NULL, 0,
                        i * (proc_get_vector_length_bytes() / 8),
                        opnd_size_from_bytes(proc_get_vector_length_bytes() / 8))));
        }
        /* The first-fault register (FFR) is restored by way of
         * a temporary predicate register to load:
         * add x2, x(dcxt), #(offset ffr)
         * ldr p15, [x2, #(ffr)]
         * wrffr p15.b
         * ldr p15, [x1, #(15 mul vl)]
         */
        APP(ilist,
            XINST_CREATE_add_2src(dcontext, opnd_create_reg(DR_REG_X2),
                                  opnd_create_reg(REG_DCXT),
                                  OPND_CREATE_INTPTR(offsetof(priv_mcontext_t, ffr))));
        APP(ilist,
            INSTR_CREATE_ldr(
                dcontext, opnd_create_reg(DR_REG_P15),
                opnd_create_base_disp(
                    DR_REG_X2, DR_REG_NULL, 0, 0,
                    opnd_size_from_bytes(proc_get_vector_length_bytes() / 8))));
        APP(ilist,
            INSTR_CREATE_wrffr_sve(dcontext,
                                   opnd_create_reg_element_vector(DR_REG_P15, OPSZ_1)));
        APP(ilist,
            INSTR_CREATE_ldr(
                dcontext, opnd_create_reg(DR_REG_P15),
                opnd_create_base_disp(
                    DR_REG_X1, DR_REG_NULL, 0, 15 * (proc_get_vector_length_bytes() / 8),
                    opnd_size_from_bytes(proc_get_vector_length_bytes() / 8))));
    }
}
/* Appends instructions that restore the general-purpose registers, to be executed
* right before jump to fcache target.
* - dcontext is in REG_DCXT
* - DR's tls base is in dr_reg_stolen
* - all other registers can be used as scratch, and we are using X0.
*/
void
append_restore_gpr(dcontext_t *dcontext, instrlist_t *ilist, bool absolute)
{
    int i;
    ASSERT_NOT_IMPLEMENTED(!TEST(SELFPROT_DCONTEXT, dynamo_options.protect_mask));
    ASSERT(dr_reg_stolen != SCRATCH_REG0);
    /* Move the mcontext value of the stolen register into its TLS slot, using
     * SCRATCH_REG0 as the temporary.
     */
    APP(ilist, RESTORE_FROM_DC(dcontext, SCRATCH_REG0, REG_OFFSET(dr_reg_stolen)));
    APP(ilist, SAVE_TO_TLS(dcontext, SCRATCH_REG0, TLS_REG_STOLEN_SLOT));
    /* Save DR's tls base into mcontext so we can blindly include it in the
     * loop of OP_ldp instructions below.
     * This means that the mcontext stolen reg slot holds DR's base instead of
     * the app's value while we're in the cache, which can be confusing: but we have
     * to get the official value from TLS on signal and other transitions anyway,
     * and DR's base makes it easier to spot bugs than a prior app value.
     */
    APP(ilist, SAVE_TO_DC(dcontext, dr_reg_stolen, REG_OFFSET(dr_reg_stolen)));
    /* Pick a temporary for the sp value that is not REG_DCXT: x1 if REG_DCXT is
     * x0, and x0 otherwise.
     */
    i = (REG_DCXT == DR_REG_X0);
    /* ldp x30, x(i), [x(dcxt), #x30_offset] */
    APP(ilist,
        INSTR_CREATE_ldp(dcontext, opnd_create_reg(DR_REG_X30),
                         opnd_create_reg(DR_REG_X0 + i),
                         opnd_create_base_disp(REG_DCXT, DR_REG_NULL, 0,
                                               REG_OFFSET(DR_REG_X30), OPSZ_16)));
    /* mov sp, x(i) */
    APP(ilist,
        XINST_CREATE_move(dcontext, opnd_create_reg(DR_REG_SP),
                          opnd_create_reg(DR_REG_X0 + i)));
    for (i = 0; i < 30; i += 2) {
        /* Skip the pair that contains REG_DCXT: it is still needed as the base. */
        if ((REG_DCXT - DR_REG_X0) >> 1 != i >> 1) {
            /* ldp x(i), x(i+1), [x(dcxt), #xi_offset] */
            APP(ilist,
                INSTR_CREATE_ldp(dcontext, opnd_create_reg(DR_REG_X0 + i),
                                 opnd_create_reg(DR_REG_X0 + i + 1),
                                 opnd_create_base_disp(REG_DCXT, DR_REG_NULL, 0,
                                                       REG_OFFSET(DR_REG_X0 + i),
                                                       OPSZ_16)));
        }
    }
    /* Finally load the pair that contains REG_DCXT itself. */
    i = (REG_DCXT - DR_REG_X0) & ~1;
    /* ldp x(i), x(i+1), [x(dcxt), #xi_offset] */
    APP(ilist,
        INSTR_CREATE_ldp(dcontext, opnd_create_reg(DR_REG_X0 + i),
                         opnd_create_reg(DR_REG_X0 + i + 1),
                         opnd_create_base_disp(REG_DCXT, DR_REG_NULL, 0,
                                               REG_OFFSET(DR_REG_X0 + i), OPSZ_16)));
}
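/* Appends instructions that save the general-purpose registers on fcache
* return, to be called after
* append_fcache_return_prologue.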
* Assuming the execution comes from an exit stub via br DR_REG_X1,
* dcontext base is held in REG_DCXT, and exit stub in X0.
* App's x0 and x1 is stored in TLS_REG0_SLOT and TLS_REG1_SLOT
* - store all registers into dcontext's mcontext
* - restore REG_DCXT app value from TLS slot to mcontext
* - restore dr_reg_stolen app value from TLS slot to mcontext
*/
void
append_save_gpr(dcontext_t *dcontext, instrlist_t *ilist, bool ibl_end, bool absolute,
                generated_code_t *code, linkstub_t *linkstub, bool coarse_info)
{
    int i;
    /* X0 and X1 will always have been saved in TLS slots before executing
     * the code generated here. See, for example:
     * emit_do_syscall_common, emit_indirect_branch_lookup, handle_sigreturn,
     * insert_exit_stub_other_flags, execute_handler_from_{cache,dispatch},
     * transfer_from_sig_handler_to_fcache_return
     */
    for (i = 2; i < 30; i += 2) {
        /* stp x(i), x(i+1), [x(dcxt), #xi_offset] */
        APP(ilist,
            INSTR_CREATE_stp(dcontext,
                             opnd_create_base_disp(REG_DCXT, DR_REG_NULL, 0,
                                                   REG_OFFSET(DR_REG_X0 + i), OPSZ_16),
                             opnd_create_reg(DR_REG_X0 + i),
                             opnd_create_reg(DR_REG_X0 + i + 1)));
    }
    /* mov x1, sp */
    APP(ilist,
        XINST_CREATE_move(dcontext, opnd_create_reg(DR_REG_X1),
                          opnd_create_reg(DR_REG_SP)));
    /* stp x30, x1, [x(dcxt), #x30_offset] */
    APP(ilist,
        INSTR_CREATE_stp(dcontext,
                         opnd_create_base_disp(REG_DCXT, DR_REG_NULL, 0,
                                               REG_OFFSET(DR_REG_X30), OPSZ_16),
                         opnd_create_reg(DR_REG_X30), opnd_create_reg(DR_REG_X1)));
    /* ldp x1, x2, [x(stolen)]
     * stp x1, x2, [x(dcxt)]
     */
    APP(ilist,
        INSTR_CREATE_ldp(
            dcontext, opnd_create_reg(DR_REG_X1), opnd_create_reg(DR_REG_X2),
            opnd_create_base_disp(dr_reg_stolen, DR_REG_NULL, 0, 0, OPSZ_16)));
    APP(ilist,
        INSTR_CREATE_stp(dcontext,
                         opnd_create_base_disp(REG_DCXT, DR_REG_NULL, 0, 0, OPSZ_16),
                         opnd_create_reg(DR_REG_X1), opnd_create_reg(DR_REG_X2)));
    if (linkstub != NULL) {
        /* A non-NULL linkstub is not supported here. */
        ASSERT_NOT_IMPLEMENTED(false);
    }
    /* REG_DCXT's app value is stored in DCONTEXT_BASE_SPILL_SLOT by
     * append_prepare_fcache_return, so copy it to mcontext.
     */
    APP(ilist, RESTORE_FROM_TLS(dcontext, SCRATCH_REG1, DCONTEXT_BASE_SPILL_SLOT));
    APP(ilist, SAVE_TO_DC(dcontext, SCRATCH_REG1, REG_DCXT_OFFS));
    /* dr_reg_stolen's app value is kept in the TLS_REG_STOLEN_SLOT spill slot,
     * and we restore its value back to mcontext on fcache return.
     */
    APP(ilist, RESTORE_FROM_TLS(dcontext, SCRATCH_REG1, TLS_REG_STOLEN_SLOT));
    APP(ilist, SAVE_TO_DC(dcontext, SCRATCH_REG1, REG_OFFSET(dr_reg_stolen)));
}
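/* Appends code that stores the SIMD (and, when SVE is present, the SVE vector,
* predicate and FFR) registers into the mcontext.
* dcontext base is held in REG_DCXT; x1 and x2 are used as scratch.
* GPR's are already saved.
*/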
void
append_save_simd_reg(dcontext_t *dcontext, instrlist_t *ilist, bool absolute)
{
    int i;
    /* add x1, x(dcxt), #(offset simd) */
    APP(ilist,
        XINST_CREATE_add_2src(dcontext, opnd_create_reg(DR_REG_X1),
                              opnd_create_reg(REG_DCXT),
                              OPND_CREATE_INTPTR(offsetof(priv_mcontext_t, simd))));
    for (i = 0; i < 32; i += 2) {
        /* stp q(i), q(i + 1), [x1, #(i * 16)]
         * From the AArch64 manual:
         * "The signed immediate byte offset is a multiple of 16 in the range
         * -1024 to 1008, defaulting to 0 and encoded in the imm7 field as
         * <imm>/16."
         */
        APP(ilist,
            INSTR_CREATE_stp(
                dcontext,
                opnd_create_base_disp(DR_REG_X1, DR_REG_NULL, 0, i * 16, OPSZ_32),
                opnd_create_reg(DR_REG_Q0 + i), opnd_create_reg(DR_REG_Q0 + i + 1)));
    }
    if (proc_has_feature(FEATURE_SVE)) {
        for (i = 0; i < 32; i++) {
            /* str z(i), [x1, #(i mul vl)]
             * From the SVE manual:
             * "Store a vector register to a memory address generated by a
             * 64-bit scalar base, plus an immediate offset in the range -256
             * to 255 which is multiplied by the current vector register size
             * in bytes."
             */
            APP(ilist,
                INSTR_CREATE_str(
                    dcontext,
                    opnd_create_base_disp(
                        DR_REG_X1, DR_REG_NULL, 0, i * proc_get_vector_length_bytes(),
                        opnd_size_from_bytes(proc_get_vector_length_bytes())),
                    opnd_create_reg(DR_REG_Z0 + i)));
        }
        /* add x1, x(dcxt), #(offset svep) */
        APP(ilist,
            XINST_CREATE_add_2src(dcontext, opnd_create_reg(DR_REG_X1),
                                  opnd_create_reg(REG_DCXT),
                                  OPND_CREATE_INTPTR(offsetof(priv_mcontext_t, svep))));
        for (i = 0; i < 16; i++) {
            /* str p(i), [x1, #(i mul vl)] */
            APP(ilist,
                INSTR_CREATE_str(
                    dcontext,
                    opnd_create_base_disp(
                        DR_REG_X1, DR_REG_NULL, 0,
                        i * (proc_get_vector_length_bytes() / 8),
                        opnd_size_from_bytes(proc_get_vector_length_bytes() / 8)),
                    opnd_create_reg(DR_REG_P0 + i)));
        }
        /* The first-fault register (FFR) is saved by way of
         * a temporary predicate register to store:
         * rdffr p15.b
         * add x2, x(dcxt), #(offset ffr)
         * str p15, [x2, #(ffr)]
         * ldr p15, [x1, #(15 mul vl)]
         */
        APP(ilist,
            INSTR_CREATE_rdffr_sve(dcontext,
                                   opnd_create_reg_element_vector(DR_REG_P15, OPSZ_1)));
        APP(ilist,
            XINST_CREATE_add_2src(dcontext, opnd_create_reg(DR_REG_X2),
                                  opnd_create_reg(REG_DCXT),
                                  OPND_CREATE_INTPTR(offsetof(priv_mcontext_t, ffr))));
        APP(ilist,
            INSTR_CREATE_str(
                dcontext,
                opnd_create_base_disp(
                    DR_REG_X2, DR_REG_NULL, 0, 0,
                    opnd_size_from_bytes(proc_get_vector_length_bytes() / 8)),
                opnd_create_reg(DR_REG_P15)));
        /* Reload p15 from the value stored by the loop above. */
        APP(ilist,
            INSTR_CREATE_ldr(
                dcontext, opnd_create_reg(DR_REG_P15),
                opnd_create_base_disp(
                    DR_REG_X1, DR_REG_NULL, 0, 15 * (proc_get_vector_length_bytes() / 8),
                    opnd_size_from_bytes(proc_get_vector_length_bytes() / 8))));
    }
}
/* Appends code that reads NZCV, FPCR and FPSR into x1, x2 and x3 and stores
 * their low 32 bits at XFLAGS_OFFSET, +4 and +8 (via SAVE_TO_DC). Clobbers x1-x3.
 * NOTE(review): despite the name, no instruction that clears the flags is
 * appended here -- confirm that this is intended.
 */
void
append_save_clear_xflags(dcontext_t *dcontext, instrlist_t *ilist, bool absolute)
{
    /* mrs x1, nzcv */
    APP(ilist,
        INSTR_CREATE_mrs(dcontext, opnd_create_reg(DR_REG_X1),
                         opnd_create_reg(DR_REG_NZCV)));
    /* mrs x2, fpcr */
    APP(ilist,
        INSTR_CREATE_mrs(dcontext, opnd_create_reg(DR_REG_X2),
                         opnd_create_reg(DR_REG_FPCR)));
    /* mrs x3, fpsr */
    APP(ilist,
        INSTR_CREATE_mrs(dcontext, opnd_create_reg(DR_REG_X3),
                         opnd_create_reg(DR_REG_FPSR)));
    /* Store w1, w2, w3 into three consecutive 32-bit slots. */
    APP(ilist, SAVE_TO_DC(dcontext, DR_REG_W1, XFLAGS_OFFSET));
    APP(ilist, SAVE_TO_DC(dcontext, DR_REG_W2, XFLAGS_OFFSET + 4));
    APP(ilist, SAVE_TO_DC(dcontext, DR_REG_W3, XFLAGS_OFFSET + 8));
}
/* Appends nothing to ilist: the only action is the not-implemented assert that
 * EXIT_DR_HOOK is NULL. Always returns false.
 */
bool
append_call_enter_dr_hook(dcontext_t *dcontext, instrlist_t *ilist, bool ibl_end,
                          bool absolute)
{
    ASSERT_NOT_IMPLEMENTED(EXIT_DR_HOOK == NULL);
    return false;
}
/* Placeholder: no implementation is provided in this file. Hits
 * ASSERT_NOT_IMPLEMENTED(false) and inserts nothing.
 */
void
insert_save_eflags(dcontext_t *dcontext, instrlist_t *ilist, instr_t *where, uint flags,
                   bool tls, bool absolute _IF_X86_64(bool x86_to_x64_ibl_opt))
{
    ASSERT_NOT_IMPLEMENTED(false);
}
/* Placeholder: no implementation is provided in this file. Hits
 * ASSERT_NOT_IMPLEMENTED(false) and inserts nothing.
 */
void
insert_restore_eflags(dcontext_t *dcontext, instrlist_t *ilist, instr_t *where,
                      uint flags, bool tls,
                      bool absolute _IF_X86_64(bool x86_to_x64_ibl_opt))
{
    ASSERT_NOT_IMPLEMENTED(false);
}
/* Placeholder: no implementation is provided in this file. Hits
 * ASSERT_NOT_IMPLEMENTED(false), emits nothing and returns pc unchanged.
 */
byte *
emit_inline_ibl_stub(dcontext_t *dcontext, byte *pc, ibl_code_t *ibl_code,
                     bool target_trace_table)
{
    ASSERT_NOT_IMPLEMENTED(false);
    return pc;
}
/* Returns whether instr is "br x0", which is the form of the hit-path jump
 * emitted by emit_indirect_branch_lookup(); the two must be kept in sync.
 */
bool
instr_is_ibl_hit_jump(instr_t *instr)
{
    /* The target operand is only inspected once the opcode is known to be OP_br. */
    if (instr_get_opcode(instr) != OP_br)
        return false;
    return opnd_get_reg(instr_get_target(instr)) == DR_REG_X0;
}
/* Builds the indirect branch lookup (IBL) routine for ibl_code->branch_type and
 * encodes it at pc. Records the routine's length in
 * ibl_code->ibl_routine_length and, through patch markers, the
 * target_delete_entry and unlinked_ibl_entry addresses in ibl_code. Returns
 * the address just past the emitted routine.
 */
byte *
emit_indirect_branch_lookup(dcontext_t *dc, generated_code_t *code, byte *pc,
                            byte *fcache_return_pc, bool target_trace_table,
                            bool inline_ibl_head, ibl_code_t *ibl_code /* IN/OUT */)
{
    /* NOTE(review): not referenced by name below; presumably consumed implicitly
     * by the SAVE_TO_DC macro -- confirm before removing.
     */
    bool absolute = false;
    instrlist_t ilist;
    instrlist_init(&ilist);
    patch_list_t *patch = &ibl_code->ibl_patch;
    init_patch_list(patch, PATCH_TYPE_INDIRECT_TLS);
    instr_t *load_tag = INSTR_CREATE_label(dc);
    instr_t *compare_tag = INSTR_CREATE_label(dc);
    instr_t *try_next = INSTR_CREATE_label(dc);
    instr_t *miss = INSTR_CREATE_label(dc);
    instr_t *not_hit = INSTR_CREATE_label(dc);
    instr_t *target_delete_entry = INSTR_CREATE_label(dc);
    instr_t *unlinked = INSTR_CREATE_label(dc);
    /* On entry we expect:
     *     x0: link_stub entry
     *     x1: scratch reg, arrived from br x1
     *     x2: indirect branch target
     *     TLS_REG0_SLOT: app's x0
     *     TLS_REG1_SLOT: app's x1
     *     TLS_REG2_SLOT: app's x2
     *     TLS_REG3_SLOT: scratch space
     * There are following entries with the same context:
     *     indirect_branch_lookup
     *     unlink_stub_entry
     * target_delete_entry:
     *     x0: scratch
     *     x1: table entry pointer from ibl lookup hit path
     *     x2: app's x2
     *     TLS_REG0_SLOT: app's x0
     *     TLS_REG1_SLOT: app's x1
     *     TLS_REG2_SLOT: app's x2
     * On miss exit we output:
     *     x0: the dcontext->last_exit
     *     x1: br x1
     *     x2: app's x2
     *     TLS_REG0_SLOT: app's x0 (recovered by fcache_return)
     *     TLS_REG1_SLOT: app's x1 (recovered by fcache_return)
     * On hit exit we output:
     *     x0: fragment_start_pc (points to the fragment prefix)
     *     x1: scratch reg
     *     x2: app's x2
     *     TLS_REG0_SLOT: app's x0 (recovered by fragment_prefix)
     *     TLS_REG1_SLOT: app's x1 (recovered by fragment_prefix)
     */
    /* Spill x0 to the scratch slot. */
    APP(&ilist, instr_create_save_to_tls(dc, DR_REG_R0, TLS_REG3_SLOT));
    /* Load-acquire the hash mask. A load-acquire is used so that updates are seen
     * properly; the corresponding store-release is in update_lookuptable_tls().
     */
    /* add x1, x(stolen), #(mask slot offs); ldar x1, [x1] */
    APP(&ilist,
        INSTR_CREATE_add(dc, opnd_create_reg(DR_REG_X1), opnd_create_reg(dr_reg_stolen),
                         OPND_CREATE_INT32(TLS_MASK_SLOT(ibl_code->branch_type))));
    APP(&ilist,
        INSTR_CREATE_ldar(dc, opnd_create_reg(DR_REG_X1),
                          OPND_CREATE_MEMPTR(DR_REG_X1, 0)));
    /* ldr x0, [x(stolen), #(table slot offs)] */
    APP(&ilist,
        INSTR_CREATE_ldr(dc, opnd_create_reg(DR_REG_X0),
                         opnd_create_base_disp(dr_reg_stolen, DR_REG_NULL, 0,
                                               TLS_TABLE_SLOT(ibl_code->branch_type),
                                               OPSZ_8)));
    /* and x1, x1, x2 */
    APP(&ilist,
        INSTR_CREATE_and(dc, opnd_create_reg(DR_REG_X1), opnd_create_reg(DR_REG_X1),
                         opnd_create_reg(DR_REG_X2)));
    /* Compute the table entry address: add x1, x0, x1, lsl #(shift) */
    APP(&ilist,
        INSTR_CREATE_add_shift(
            dc, opnd_create_reg(DR_REG_X1), opnd_create_reg(DR_REG_X0),
            opnd_create_reg(DR_REG_X1), OPND_CREATE_INT8(DR_SHIFT_LSL),
            OPND_CREATE_INT8(4 - HASHTABLE_IBL_OFFSET(ibl_code->branch_type))));
    /* x1 now points at a fragment_entry_t in the table. */
    APP(&ilist, load_tag);
    /* ldr x0, [x1, #tag_fragment_offset] */
    APP(&ilist,
        INSTR_CREATE_ldr(
            dc, opnd_create_reg(DR_REG_X0),
            OPND_CREATE_MEMPTR(DR_REG_X1, offsetof(fragment_entry_t, tag_fragment))));
    APP(&ilist, compare_tag);
    /* cbz x0, not_hit */
    APP(&ilist,
        INSTR_CREATE_cbz(dc, opnd_create_instr(not_hit), opnd_create_reg(DR_REG_X0)));
    /* sub x0, x0, x2 */
    APP(&ilist,
        XINST_CREATE_sub(dc, opnd_create_reg(DR_REG_X0), opnd_create_reg(DR_REG_X2)));
    /* cbnz x0, try_next */
    APP(&ilist,
        INSTR_CREATE_cbnz(dc, opnd_create_instr(try_next), opnd_create_reg(DR_REG_X0)));
    /* Hit path. App's original values of x0 and x1 are in their TLS slots, and
     * will be restored by the fragment prefix.
     */
    /* Recover app's x2. */
    APP(&ilist, instr_create_restore_from_tls(dc, DR_REG_R2, TLS_REG2_SLOT));
    /* ldr x0, [x1, #start_pc_fragment_offset] */
    APP(&ilist,
        INSTR_CREATE_ldr(dc, opnd_create_reg(DR_REG_X0),
                         OPND_CREATE_MEMPTR(
                             DR_REG_X1, offsetof(fragment_entry_t, start_pc_fragment))));
    /* br x0
     * (keep in sync with instr_is_ibl_hit_jump())
     */
    APP(&ilist, INSTR_CREATE_br(dc, opnd_create_reg(DR_REG_X0)));
    APP(&ilist, try_next);
    /* Advance to the next entry. No wraparound check is needed
     * because of the sentinel at the end.
     * ldr x0, [x1, #tag_fragment_offset]! */
    APP(&ilist,
        instr_create_2dst_3src(
            dc, OP_ldr, opnd_create_reg(DR_REG_X0), opnd_create_reg(DR_REG_X1),
            OPND_CREATE_MEMPTR(DR_REG_X1, sizeof(fragment_entry_t)),
            opnd_create_reg(DR_REG_X1), OPND_CREATE_INTPTR(sizeof(fragment_entry_t))));
    /* b compare_tag */
    APP(&ilist, INSTR_CREATE_b(dc, opnd_create_instr(compare_tag)));
    APP(&ilist, not_hit);
    if (INTERNAL_OPTION(ibl_sentinel_check)) {
        /* ldr x0, [x1, #start_pc_fragment_offset] */
        APP(&ilist,
            XINST_CREATE_load(
                dc, opnd_create_reg(DR_REG_X0),
                OPND_CREATE_MEMPTR(DR_REG_X1,
                                   offsetof(fragment_entry_t, start_pc_fragment))));
        /* No compare against an arbitrary constant is emitted here.
         * Instead we rely on the sentinel start PC being 1.
         */
        ASSERT(HASHLOOKUP_SENTINEL_START_PC == (cache_pc)PTR_UINT_1);
        /* sub x0, x0, #1 */
        APP(&ilist,
            XINST_CREATE_sub(dc, opnd_create_reg(DR_REG_X0), OPND_CREATE_INT8(1)));
        /* cbnz x0, miss */
        APP(&ilist,
            INSTR_CREATE_cbnz(dc, opnd_create_instr(miss), opnd_create_reg(DR_REG_R0)));
        /* Sentinel reached: reload the table base and go compare its first tag.
         * ldr x1, [x(stolen), #(table slot offs)]
         */
        APP(&ilist,
            XINST_CREATE_load(dc, opnd_create_reg(DR_REG_X1),
                              OPND_CREATE_MEMPTR(dr_reg_stolen,
                                                 TLS_TABLE_SLOT(ibl_code->branch_type))));
        /* b load_tag */
        APP(&ilist, INSTR_CREATE_b(dc, opnd_create_instr(load_tag)));
    }
    /* Target-delete entry point. */
    APP(&ilist, target_delete_entry);
    add_patch_marker(patch, target_delete_entry, PATCH_ASSEMBLE_ABSOLUTE,
                     0,
                     (ptr_uint_t *)&ibl_code->target_delete_entry);
    /* Load the tag from the table entry in x1 into x2. */
    APP(&ilist,
        INSTR_CREATE_ldr(
            dc, opnd_create_reg(DR_REG_R2),
            OPND_CREATE_MEMPTR(DR_REG_R1, offsetof(fragment_entry_t, tag_fragment))));
    /* Put the deleted-IBL linkstub address in x0; the miss path's restore of x0 is
     * skipped by the branch
     * below.
     */
    instrlist_insert_mov_immed_ptrsz(dc, (ptr_uint_t)get_ibl_deleted_linkstub(),
                                     opnd_create_reg(DR_REG_R0), &ilist, NULL, NULL,
                                     NULL);
    /* b unlinked */
    APP(&ilist, INSTR_CREATE_b(dc, opnd_create_instr(unlinked)));
    APP(&ilist, miss);
    /* Recover the value of x0 spilled at entry. */
    APP(&ilist, instr_create_restore_from_tls(dc, DR_REG_R0, TLS_REG3_SLOT));
    /* Unlinked entry point. */
    APP(&ilist, unlinked);
    add_patch_marker(patch, unlinked, PATCH_ASSEMBLE_ABSOLUTE,
                     0,
                     (ptr_uint_t *)&ibl_code->unlinked_ibl_entry);
    /* Store the indirect branch target (x2) at NEXT_TAG_OFFSET in the dcontext. */
    insert_shared_get_dcontext(dc, &ilist, NULL, true);
    APP(&ilist, SAVE_TO_DC(dc, DR_REG_R2, NEXT_TAG_OFFSET));
    APP(&ilist, instr_create_restore_from_tls(dc, DR_REG_R5, DCONTEXT_BASE_SPILL_SLOT));
    APP(&ilist, instr_create_restore_from_tls(dc, DR_REG_R2, TLS_REG2_SLOT));
    /* ldr x1, [TLS_FCACHE_RETURN_SLOT] */
    APP(&ilist,
        INSTR_CREATE_ldr(dc, opnd_create_reg(DR_REG_X1),
                         OPND_TLS_FIELD(TLS_FCACHE_RETURN_SLOT)));
    /* br x1 */
    APP(&ilist, INSTR_CREATE_br(dc, opnd_create_reg(DR_REG_X1)));
    ibl_code->ibl_routine_length = encode_with_patch_list(dc, patch, &ilist, pc);
    instrlist_clear(dc, &ilist);
    return pc + ibl_code->ibl_routine_length;
}
/* Re-points the special IBL transfer routine number index at the IBL entry
 * selected by entry_type and ibl_type, by rewriting the routine's target-load
 * instruction. Returns without changes when no generated code is available.
 */
void
relink_special_ibl_xfer(dcontext_t *dcontext, int index,
                        ibl_entry_point_type_t entry_type, ibl_branch_type_t ibl_type)
{
    generated_code_t *code;
    byte *ibl_tgt;
    uint *pc;
    /* GLOBAL_DCONTEXT is only valid when the special transfer code is shared,
     * and a thread dcontext only when it is thread-private.
     */
    if (dcontext == GLOBAL_DCONTEXT) {
        ASSERT(!special_ibl_xfer_is_thread_private());
        code = SHARED_GENCODE_MATCH_THREAD(get_thread_private_dcontext());
    } else {
        ASSERT(special_ibl_xfer_is_thread_private());
        code = THREAD_GENCODE(dcontext);
    }
    if (code == NULL)
        return;
    ibl_tgt = special_ibl_xfer_tgt(dcontext, code, entry_type, ibl_type);
    ASSERT(code->special_ibl_xfer[index] != NULL);
    pc = (uint *)(code->special_ibl_xfer[index] + code->special_ibl_unlink_offs[index]);
    uint *write_pc = (uint *)vmcode_get_writable_addr((byte *)pc);
    protect_generated_code(code, WRITABLE);
    /* ldr x1, [x(stolen), #(offs)]
     * Relinking does not require the branch instruction to change, just the
     * target load, e.g.
     *     ldr +0x78(%x28)[8byte] -> %x1
     *     br  %x1
     * See INSTR_CREATE_ldr() followed by XINST_CREATE_jump_reg() calls in
     * emit_special_ibl_xfer(), where special_ibl_unlink_offs has been adjusted
     * to point to the ldr.
     * TODO i#1911: When modified like this, the ldr instruction is not
     * guaranteed to be updated for all cores without synchronization. A
     * possible fix is to use TLS to store the target so only data needs to
     * change rather than code.
     */
    *write_pc = (uint)(0xf9400000 | 1 | (dr_reg_stolen - DR_REG_X0) << 5 |
                       get_ibl_entry_tls_offs(dcontext, ibl_tgt) >> 3 << 10);
    machine_cache_sync(pc, pc + 1, true);
    protect_generated_code(code, READONLY);
}
/* Fills the size bytes starting at addr with NOP instructions. Both addr and
 * addr + size must be 4-byte aligned; otherwise nothing is written and false
 * is returned (after an assert). Returns true on success. isa_mode is not used.
 */
bool
fill_with_nops(dr_isa_mode_t isa_mode, byte *addr, size_t size)
{
    byte *end = addr + size;
    uint *slot;
    if (!ALIGNED(addr, 4) || !ALIGNED(end, 4)) {
        ASSERT_NOT_REACHED();
        return false;
    }
    /* One 4-byte NOP per instruction slot. */
    for (slot = (uint *)addr; (byte *)slot < end; slot++)
        *slot = RAW_NOP_INST;
    return true;
}