/* *******************************************************************************
 * Copyright (c) 2019-2023 Google, Inc. All rights reserved.
 * *******************************************************************************/
/*
 * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * Neither the name of Google, Inc. nor the names of its contributors may be
* used to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL VMWARE, INC. OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
* DAMAGE.
*/
/*
 * Restartable sequence ("rseq") support (i#2350).
 * This is a kernel feature which provides cpu-atomic regions: if a thread
 * is pre-empted within an rseq region, an abort handler is invoked.
 * The feature is difficult to handle under binary instrumentation.
 * We rely on the app following certain conventions, including containing a
 * section holding a table of all rseq sequences.
 */
#include "../globals.h"
#include "../module_shared.h"
#include "module_private.h"
#include "os_private.h"
#include "rseq_linux.h"
#include "../fragment.h"
#include "decode.h"
#include "instr_create_shared.h"
#include "instrument.h"
#include <stddef.h>
#include "include/syscall.h"
#include <errno.h>
/* Absolute value of x.  The argument is expanded more than once, so it must
 * not have side effects.
 */
#define ABS(x) ((x) < 0 ? -(x) : (x))
/* We do not depend on externally supplied definitions of the rseq structures:
 * a past change to such definitions is what broke our build. To avoid future
 * issues we use our own definitions.
 * Binary breakage is unlikely without long periods of deprecation so this is
 * not adding undue risk.
 * If these structures were used in other files we would put a header in
 * core/unix/include alongside similar headers.
 */
/* Descriptor of a single restartable sequence.  In this file it is both read
 * from the app's rseq sections (rseq_process_entry()) and filled in for
 * code-cache copies (rseq_record_rseq_cs()).
 */
struct rseq_cs {
uint version;
uint flags;
/* Address of the start of the sequence. */
uint64 start_ip;
/* Length of the sequence: its end is start_ip plus this value. */
uint64 post_commit_offset;
/* Address of the abort handler. */
uint64 abort_ip;
} __attribute__((aligned(4 * sizeof(uint64))));
/* The per-thread structure passed to SYS_rseq on registration and
 * unregistration (see rseq_is_registered_for_current_thread()).
 */
struct rseq {
uint cpu_id_start;
uint cpu_id;
/* Holds the address of a struct rseq_cs, or 0: see rseq_clear_tls_ptr(). */
uint64 rseq_cs;
uint flags;
} __attribute__((aligned(4 * sizeof(uint64))));
/* Flags value passed to SYS_rseq to unregister a struct rseq. */
#define RSEQ_FLAG_UNREGISTER 1
/* Holds one entry per app rseq region, with an rseq_region_t as its data. */
vm_area_vector_t *d_r_rseq_areas;
/* Held while rseq_locate_rseq_regions() performs its one-time setup. */
DECLARE_CXTSWPROT_VAR(static mutex_t rseq_trigger_lock,
INIT_LOCK_FREE(rseq_trigger_lock));
/* Set to true by rseq_locate_rseq_regions(); false until then. */
static volatile bool rseq_enabled;
/* Expected location of the struct rseq that glibc registers for the app.
 * As of glibc 2.35, it is present at the following offset from the app
 * lib seg base. We check these offsets first and then fall back to a
 * wider search. The linux.rseq test helps detect changes in these
 * offsets in future glibc versions.
 */
#ifdef X86
#    ifdef X64
#        define GLIBC_RSEQ_OFFSET 2464
#    else
#        define GLIBC_RSEQ_OFFSET 1312
#    endif
#else
/* This value is used for every non-x86 build.
 * XXX: To improve struct rseq offset detection on AArch32, find the offset
 * on an AArch32 machine running glibc 2.35+ and add here.
 */
/* Parenthesized so that the negative value is safe in any expression. */
#    define GLIBC_RSEQ_OFFSET (-32)
#endif
/* Offset of the app's struct rseq from the app's library TLS segment base.
 * Zero means that the offset has not been determined yet.
 */
static int rseq_tls_offset;
/* The signature read from the 4 bytes preceding an abort handler.  Zero means
 * that none has been recorded yet.  We require all of the app's signatures
 * to be the same.
 */
static int rseq_signature;
typedef struct _rseq_region_t {
app_pc start;
app_pc end;
app_pc handler;
app_pc final_instr_pc;
* for our 2nd invocation, if they're written in the rseq region. We only support
* GPR inputs. We document that we do not support any other inputs (no flags, no
* SIMD registers).
*/
bool reg_written[DR_NUM_GPR_REGS];
} rseq_region_t;
* make multiple copies of the app code (e.g., drbbdup).
*/
typedef struct _rseq_cs_record_t {
struct rseq_cs rcs;
void *alloc_ptr;
struct _rseq_cs_record_t *next;
} rseq_cs_record_t;
* pointer field to every fragment_t, and the complexity of another subclass like
* trace_t, we store them externally in a hashtable. The FRAG_HAS_RSEQ_ENDPOINT flag
* avoids the hashtable lookup on every fragment.
*/
static generic_table_t *rseq_cs_table;
#define INIT_RSEQ_CS_TABLE_SIZE 5
static void
rseq_area_free(void *data)
{
HEAP_TYPE_FREE(GLOBAL_DCONTEXT, data, rseq_region_t, ACCT_VMAREAS, PROTECTED);
}
static void *
rseq_area_dup(void *data)
{
rseq_region_t *src = (rseq_region_t *)data;
rseq_region_t *dst =
HEAP_TYPE_ALLOC(GLOBAL_DCONTEXT, rseq_region_t, ACCT_VMAREAS, PROTECTED);
ASSERT(src != NULL);
*dst = *src;
return dst;
}
/* Returns the number of bytes to allocate for one rseq_cs_record_t: the record
 * itself plus enough slack to align it to __alignof(struct rseq_cs).
 */
static inline size_t
rseq_cs_alloc_size(void)
{
    const size_t record_bytes = sizeof(rseq_cs_record_t);
    const size_t align_slack = __alignof(struct rseq_cs);
    return record_bytes + align_slack;
}
/* Free callback for rseq_cs_table: releases every record in the list that
 * starts at data.  The list must have at least one record.
 */
static void
rseq_cs_free(dcontext_t *dcontext, void *data)
{
    rseq_cs_record_t *cur = (rseq_cs_record_t *)data;
    for (;;) {
        /* Read both fields prior to freeing the memory that holds them. */
        rseq_cs_record_t *following = cur->next;
        void *block = cur->alloc_ptr;
        global_heap_free(block, rseq_cs_alloc_size() HEAPACCT(ACCT_OTHER));
        if (following == NULL)
            break;
        cur = following;
    }
}
void
d_r_rseq_init(void)
{
VMVECTOR_ALLOC_VECTOR(d_r_rseq_areas, GLOBAL_DCONTEXT,
VECTOR_SHARED | VECTOR_NEVER_MERGE, rseq_areas);
vmvector_set_callbacks(d_r_rseq_areas, rseq_area_free, rseq_area_dup, NULL, NULL);
rseq_cs_table = generic_hash_create(GLOBAL_DCONTEXT, INIT_RSEQ_CS_TABLE_SIZE, 80,
HASHTABLE_SHARED | HASHTABLE_PERSISTENT,
rseq_cs_free _IF_DEBUG("rseq_cs table"));
if (rseq_is_registered_for_current_thread())
rseq_locate_rseq_regions(false);
}
/* Tears down what d_r_rseq_init() created, and deletes rseq_trigger_lock. */
void
d_r_rseq_exit(void)
{
    /* Destroy the per-fragment rseq_cs records first. */
    generic_hash_destroy(GLOBAL_DCONTEXT, rseq_cs_table);

    /* Next, the vector of app rseq regions. */
    vmvector_delete_vector(GLOBAL_DCONTEXT, d_r_rseq_areas);

    /* Finally, the lock used during region location. */
    DELETE_LOCK(rseq_trigger_lock);
}
/* Called for a thread being attached to.  If the thread's next pc
 * (dcontext->next_tag) lies inside a known rseq region, the thread is redirected
 * to that region's abort handler.
 */
void
rseq_thread_attach(dcontext_t *dcontext)
{
    rseq_region_t *info;
    if (!vmvector_lookup_data(d_r_rseq_areas, dcontext->next_tag, NULL, NULL,
                              (void **)&info))
        return;
    /* The thread is resuming inside an rseq region.  We could try to set things up
     * so the restore on rseq exit won't read incorrect values, but it's simpler and
     * less error-prone to send it to the abort handler, like we do on detach or other
     * translation points.
     */
    dcontext->next_tag = info->handler;
}
/* Looks up the rseq region containing pc.
 * Returns false if pc is not inside any known region.  Otherwise returns true
 * and fills in the requested outputs: start and end are written by the vector
 * lookup, and each of handler, reg_written, and reg_written_size is written
 * only if non-NULL.  reg_written points directly at the region's own array
 * (it is not a copy) and reg_written_size is its element count.
 */
bool
rseq_get_region_info(app_pc pc, app_pc *start OUT, app_pc *end OUT, app_pc *handler OUT,
                     bool **reg_written OUT, int *reg_written_size OUT)
{
    rseq_region_t *region;
    bool found = vmvector_lookup_data(d_r_rseq_areas, pc, start, end, (void **)&region);
    if (found) {
        if (handler != NULL) {
            *handler = region->handler;
        }
        if (reg_written != NULL) {
            *reg_written = region->reg_written;
        }
        if (reg_written_size != NULL) {
            *reg_written_size =
                sizeof(region->reg_written) / sizeof(region->reg_written[0]);
        }
        return true;
    }
    return false;
}
/* Records final_instr_pc for the rseq region that contains start.
 * Returns false if start is not inside any known region or if final_instr_pc
 * is not within [start, region end); returns true once the value is stored.
 */
bool
rseq_set_final_instr_pc(app_pc start, app_pc final_instr_pc)
{
    rseq_region_t *region;
    bool found =
        vmvector_lookup_data(d_r_rseq_areas, start, NULL, NULL, (void **)&region);
    if (!found)
        return false;
    if (final_instr_pc >= start && final_instr_pc < region->end) {
        region->final_instr_pc = final_instr_pc;
        return true;
    }
    return false;
}
int
rseq_get_tls_ptr_offset(void)
{
ASSERT(rseq_tls_offset != 0);
return rseq_tls_offset + offsetof(struct rseq, rseq_cs);
}
/* Zeroes the rseq_cs field of the current thread's app struct rseq, but only
 * if the field currently holds an address for which is_dynamo_address()
 * returns true.
 */
static void
rseq_clear_tls_ptr(dcontext_t *dcontext)
{
    ASSERT(rseq_tls_offset != 0);
    byte *base = get_app_segment_base(LIB_SEG_TLS);
    struct rseq *app_rseq = (struct rseq *)(base + rseq_tls_offset);
    /* We read and write the app's struct directly: we do not bother with safe_read
     * or safe_write here either. We already cannot handle rseq adversarial cases.
     */
    if (is_dynamo_address((byte *)(ptr_uint_t)app_rseq->rseq_cs))
        app_rseq->rseq_cs = 0;
}
/* Returns the recorded rseq signature.  May only be called once rseq has been
 * enabled.
 */
int
rseq_get_signature(void)
{
    ASSERT(rseq_enabled);
    const int signature = rseq_signature;
    return signature;
}
/* Allocates global heap storage for one rseq_cs_record_t.
 * Returns the raw allocation, and stores in *rseq_cs_aligned the first address
 * within it that is aligned to __alignof(struct rseq_cs).
 */
byte *
rseq_get_rseq_cs_alloc(byte **rseq_cs_aligned OUT)
{
    const size_t bytes = rseq_cs_alloc_size();
    byte *raw = global_heap_alloc(bytes HEAPACCT(ACCT_OTHER));
    byte *aligned = (byte *)ALIGN_FORWARD(raw, __alignof(struct rseq_cs));
    *rseq_cs_aligned = aligned;
    return raw;
}
/* Fills in the rseq_cs record residing in the allocation rseq_cs_alloc so that
 * it describes the code cache range [start, end) with abort handler abort, and
 * attaches it to fragment f in rseq_cs_table.  If f already has records, the
 * new one is appended to the end of its list.
 */
void
rseq_record_rseq_cs(byte *rseq_cs_alloc, fragment_t *f, cache_pc start, cache_pc end,
                    cache_pc abort)
{
    /* The record lives at the aligned address inside the allocation. */
    rseq_cs_record_t *record =
        (rseq_cs_record_t *)ALIGN_FORWARD(rseq_cs_alloc, __alignof(struct rseq_cs));
    struct rseq_cs *target = &record->rcs;

    record->next = NULL;
    record->alloc_ptr = rseq_cs_alloc;

    target->abort_ip = (ptr_uint_t)abort;
    target->post_commit_offset = (ptr_uint_t)(end - start);
    target->start_ip = (ptr_uint_t)start;
    target->flags = 0;
    target->version = 0;

    TABLE_RWLOCK(rseq_cs_table, write, lock);
    rseq_cs_record_t *head =
        generic_hash_lookup(GLOBAL_DCONTEXT, rseq_cs_table, (ptr_uint_t)f);
    if (head == NULL) {
        generic_hash_add(GLOBAL_DCONTEXT, rseq_cs_table, (ptr_uint_t)f, record);
    } else {
        rseq_cs_record_t *tail;
        for (tail = head; tail->next != NULL; tail = tail->next)
            ; /* Walk to the final record of the list. */
        tail->next = record;
    }
    TABLE_RWLOCK(rseq_cs_table, write, unlock);
}
/* Called when fragment f is being removed.  Does nothing unless rseq has been
 * enabled.  Otherwise the current thread's rseq_cs pointer is cleared and the
 * records for f are removed from rseq_cs_table.
 */
void
rseq_remove_fragment(dcontext_t *dcontext, fragment_t *f)
{
    if (rseq_enabled) {
        /* Clear the pointer before the record it may point at goes away. */
        rseq_clear_tls_ptr(dcontext);
        TABLE_RWLOCK(rseq_cs_table, write, lock);
        generic_hash_remove(GLOBAL_DCONTEXT, rseq_cs_table, (ptr_uint_t)f);
        TABLE_RWLOCK(rseq_cs_table, write, unlock);
    }
}
/* Clears the current thread's rseq_cs pointer (see rseq_clear_tls_ptr()).
 * Does nothing unless rseq has been enabled.
 */
void
rseq_shared_fragment_flushtime_update(dcontext_t *dcontext)
{
    if (!rseq_enabled)
        return;
    /* The pointer must not be left referring to an rseq_cs that gets freed.
     * We clear the pointer on completion of the native rseq execution, but it's
     * not easy to clear it on midpoint exits. We instead clear prior to
     * rseq_cs being freed: for thread-private in rseq_remove_fragment() and for
     * thread-shared each thread should come here prior to deletion.
     */
    rseq_clear_tls_ptr(dcontext);
}
bool
rseq_is_registered_for_current_thread(void)
{
* For 64-bit we can pass a kernel address and look for EFAULT
* vs EINVAL, but there is no kernel address for 32-bit.
* So we try to perform a legitimate registration.
*/
struct rseq test_rseq = {};
int res = dynamorio_syscall(SYS_rseq, 4, &test_rseq, sizeof(test_rseq), 0, 0);
if (res == -EINVAL)
return true;
if (res == -ENOSYS)
return false;
* if &test_rseq == the app's struct but the signature is different, but that
* seems so unlikely that we just assume -EPERM implies seccomp.
*/
if (res == -EPERM)
return false;
ASSERT(res == 0);
if (dynamorio_syscall(SYS_rseq, 4, &test_rseq, sizeof(test_rseq),
RSEQ_FLAG_UNREGISTER, 0) != 0) {
ASSERT_NOT_REACHED();
}
return false;
}
/* Decodes every instruction in [info->start, info->end) and fills in
 * info->reg_written with which GPR registers are written inside the region.
 * Exits with a fatal error if the region has an undecodable instruction, a
 * system call (debug builds exempt a fixed list of test apps), or a call.
 */
static void
rseq_analyze_instructions(rseq_region_t *info)
{
    /* We analyze the instructions inside the region to find the state that we
     * need to preserve for our restart. We do not want to blindly spill and restore
     * 16+ registers for every sequence (too much overhead).
     */
    instr_t instr;
    instr_init(GLOBAL_DCONTEXT, &instr);
    app_pc pc = info->start;
    int i;
    memset(info->reg_written, 0, sizeof(info->reg_written));
    while (pc < info->end) {
        instr_reset(GLOBAL_DCONTEXT, &instr);
        app_pc next_pc = decode(GLOBAL_DCONTEXT, pc, &instr);
        if (next_pc == NULL) {
            REPORT_FATAL_ERROR_AND_EXIT(RSEQ_BEHAVIOR_UNSUPPORTED, 3,
                                        get_application_name(), get_application_pid(),
                                        "Rseq sequence contains invalid instructions");
            ASSERT_NOT_REACHED();
        }
        if (instr_is_syscall(&instr)
            /* In debug builds, the listed test apps are allowed to contain syscalls. */
            IF_DEBUG(
                &&!check_filter("api.rseq;linux.rseq;linux.rseq_table;linux.rseq_noarray",
                                get_short_name(get_application_name())))) {
            REPORT_FATAL_ERROR_AND_EXIT(RSEQ_BEHAVIOR_UNSUPPORTED, 3,
                                        get_application_name(), get_application_pid(),
                                        "Rseq sequence contains a system call");
            ASSERT_NOT_REACHED();
        }
        if (instr_is_call(&instr)) {
            REPORT_FATAL_ERROR_AND_EXIT(RSEQ_BEHAVIOR_UNSUPPORTED, 3,
                                        get_application_name(), get_application_pid(),
                                        "Rseq sequence contains a call");
            ASSERT_NOT_REACHED();
        }
        /* We potentially need to preserve any register written anywhere inside
         * the sequence. We can't limit ourselves to registers clearly live on
         * input, since code *after* the sequence could read them. We do disallow
         * callouts to helper functions to simplify our lives.
         *
         * We only preserve GPR's, for simplicity, and because they are far more likely
         * as inputs than flags or SIMD registers. We'd like to verify that only GPR's
         * are used, but A) we can't easily check values read *after* the sequence (the
         * handler could set up state read afterward and sometimes clobbered inside), B)
         * we do want to support SIMD and flags writes in the sequence, and C) even
         * checking for values read in the sequence would want new interfaces like
         * DR_REG_START_SIMD or register iterators for reasonable code.
         */
        for (i = 0; i < DR_NUM_GPR_REGS; i++) {
            /* Already known to be written: nothing more to learn. */
            if (info->reg_written[i])
                continue;
            reg_id_t reg = DR_REG_START_GPR + (reg_id_t)i;
            if (instr_writes_to_reg(&instr, reg, DR_QUERY_DEFAULT)) {
                LOG(GLOBAL, LOG_LOADER, 3,
                    "Rseq region @" PFX " writes register %s at " PFX "\n", info->start,
                    reg_names[reg], pc);
                info->reg_written[i] = true;
            }
        }
        pc = next_pc;
    }
    instr_free(GLOBAL_DCONTEXT, &instr);
}
/* Records one app rseq region described by entry.
 * load_offs is added to the addresses stored in entry to form the run-time
 * addresses.  The region's signature (the 4 bytes preceding the abort handler)
 * must match all previously seen signatures; the region's instructions are
 * analyzed; and the region is added to d_r_rseq_areas.  Unsupported situations
 * result in a fatal error.
 */
static void
rseq_process_entry(struct rseq_cs *entry, ssize_t load_offs)
{
    LOG(GLOBAL, LOG_LOADER, 2,
        "Found rseq region: ver=%u; flags=%u; start=" PFX "; end=" PFX "; abort=" PFX
        "\n",
        entry->version, entry->flags, entry->start_ip + load_offs,
        entry->start_ip + entry->post_commit_offset + load_offs,
        entry->abort_ip + load_offs);
    rseq_region_t *info =
        HEAP_TYPE_ALLOC(GLOBAL_DCONTEXT, rseq_region_t, ACCT_VMAREAS, PROTECTED);
    info->start = (app_pc)(ptr_uint_t)entry->start_ip + load_offs;
    info->end = info->start + entry->post_commit_offset;
    info->handler = (app_pc)(ptr_uint_t)entry->abort_ip + load_offs;
    /* Filled in later through rseq_set_final_instr_pc(). */
    info->final_instr_pc = NULL;
    int signature;
    if (!d_r_safe_read(info->handler - sizeof(signature), sizeof(signature),
                       &signature)) {
        REPORT_FATAL_ERROR_AND_EXIT(RSEQ_BEHAVIOR_UNSUPPORTED, 3, get_application_name(),
                                    get_application_pid(),
                                    "Rseq signature is unreadable");
        ASSERT_NOT_REACHED();
    }
    if (signature != rseq_signature) {
        if (rseq_signature == 0) {
            /* This is the first signature we have seen: record it. */
            SELF_UNPROTECT_DATASEC(DATASEC_RARELY_PROT);
            ATOMIC_4BYTE_WRITE(&rseq_signature, signature, false);
            SELF_PROTECT_DATASEC(DATASEC_RARELY_PROT);
            LOG(GLOBAL, LOG_LOADER, 2, "Rseq signature is 0x%08x\n", rseq_signature);
        } else {
            REPORT_FATAL_ERROR_AND_EXIT(RSEQ_BEHAVIOR_UNSUPPORTED, 3,
                                        get_application_name(), get_application_pid(),
                                        "Rseq signatures are not all identical");
            ASSERT_NOT_REACHED();
        }
    }
    rseq_analyze_instructions(info);
    vmvector_add(d_r_rseq_areas, info->start, info->end, (void *)info);
    RSTATS_INC(num_rseq_regions);
    /* Check whether a fragment already exists for the start pc.  We only look up
     * the start pc itself: we do not check
     * interior pc's.
     */
    if (fragment_lookup(GLOBAL_DCONTEXT, info->start) != NULL) {
        /* We do not support the region having been executed as regular code (we
         * can't easily tell the difference; plus we avoid a flush for lazy rseq
         * activation).
         */
        REPORT_FATAL_ERROR_AND_EXIT(
            RSEQ_BEHAVIOR_UNSUPPORTED, 3, get_application_name(), get_application_pid(),
            "Rseq sequences must not be used for non-rseq purposes");
        ASSERT_NOT_REACHED();
    }
}
/* Walks the section headers of module ma looking for rseq data and passes each
 * rseq_cs found to rseq_process_entry().
 * The "__rseq_cs_ptr_array" section (an array of pointers to rseq_cs
 * structures) is preferred; only if it is absent is the "__rseq_cs" (or older
 * "__rseq_table") section, which holds the structures themselves, consulted.
 * sec_hdr_start points at the module's section header array and strtab at its
 * section name string table; load_offs is added to section addresses.
 * Data lying outside of [ma->start, ma->end] results in a fatal error.
 */
static void
rseq_process_elf_sections(module_area_t *ma, bool at_map,
                          ELF_SECTION_HEADER_TYPE *sec_hdr_start, const char *strtab,
                          ssize_t load_offs)
{
    bool found_array = false;
    uint i;
    ELF_HEADER_TYPE *elf_hdr = (ELF_HEADER_TYPE *)ma->start;
    ELF_SECTION_HEADER_TYPE *sec_hdr = sec_hdr_start;
    /* The addresses stored inside the sections are
     * relocated and only need the offset if relocations have not yet been applied.
     */
    ssize_t entry_offs = 0;
    if (at_map || (DYNAMO_OPTION(early_inject) && !dr_api_entry && !dynamo_started))
        entry_offs = load_offs;
    for (i = 0; i < elf_hdr->e_shnum; i++) {
#define RSEQ_PTR_ARRAY_SEC_NAME "__rseq_cs_ptr_array"
        if (strcmp(strtab + sec_hdr->sh_name, RSEQ_PTR_ARRAY_SEC_NAME) == 0) {
            found_array = true;
            byte **ptrs = (byte **)(sec_hdr->sh_addr + load_offs);
            int j;
            for (j = 0; j < sec_hdr->sh_size / sizeof(ptrs); ++j) {
                /* We require that the array is loaded.  If it is not, unlike
                 * failing to find section headers, make this a fatal error: better
                 * to notify the user than try to run the rseq w/o proper handling.
                 */
                if (ptrs < (byte **)ma->start || ptrs > (byte **)ma->end) {
                    REPORT_FATAL_ERROR_AND_EXIT(
                        RSEQ_BEHAVIOR_UNSUPPORTED, 3, get_application_name(),
                        get_application_pid(),
                        RSEQ_PTR_ARRAY_SEC_NAME " is not in a loaded segment");
                    ASSERT_NOT_REACHED();
                }
                /* We read the pointer directly, assuming the module is fully mapped
                 * (a partial map shouldn't make it to module list processing).
                 * We do perform a sanity check to handle unusual non-relocated
                 * cases (it's possible this array is not in a loaded segment?).
                 */
                byte *entry = *ptrs + entry_offs;
                if (entry < ma->start || entry > ma->end) {
                    REPORT_FATAL_ERROR_AND_EXIT(
                        RSEQ_BEHAVIOR_UNSUPPORTED, 3, get_application_name(),
                        get_application_pid(),
                        RSEQ_PTR_ARRAY_SEC_NAME "'s entries are not in a loaded segment");
                    ASSERT_NOT_REACHED();
                }
                rseq_process_entry((struct rseq_cs *)entry, entry_offs);
                ++ptrs;
            }
            /* Only the first matching section is processed. */
            break;
        }
        ++sec_hdr;
    }
    if (!found_array) {
        /* Fall back to the section holding the rseq_cs structures themselves. */
        sec_hdr = sec_hdr_start;
        for (i = 0; i < elf_hdr->e_shnum; i++) {
#define RSEQ_SEC_NAME "__rseq_cs"
#define RSEQ_OLD_SEC_NAME "__rseq_table"
            if (strcmp(strtab + sec_hdr->sh_name, RSEQ_SEC_NAME) == 0 ||
                strcmp(strtab + sec_hdr->sh_name, RSEQ_OLD_SEC_NAME) == 0) {
                /* The first structure may not start at the very beginning of the
                 * section, so we align forward to skip
                 * over it. We're reading the loaded data, not the file, so it will
                 * always be aligned.
                 */
#define RSEQ_CS_ALIGNMENT (4 * sizeof(__u64))
                struct rseq_cs *array = (struct rseq_cs *)ALIGN_FORWARD(
                    sec_hdr->sh_addr + load_offs, RSEQ_CS_ALIGNMENT);
                int j;
                for (j = 0; j < sec_hdr->sh_size / sizeof(*array); ++j) {
                    if (array < (struct rseq_cs *)ma->start ||
                        array > (struct rseq_cs *)ma->end) {
                        REPORT_FATAL_ERROR_AND_EXIT(
                            RSEQ_BEHAVIOR_UNSUPPORTED, 3, get_application_name(),
                            get_application_pid(),
                            RSEQ_SEC_NAME " is not in a loaded segment");
                        ASSERT_NOT_REACHED();
                    }
                    rseq_process_entry(array, entry_offs);
                    ++array;
                }
                /* Only the first matching section is processed. */
                break;
            }
            ++sec_hdr;
        }
    }
}
/* Looks for rseq data in module ma and records every rseq region found.
 * If at_map is set and the section headers lie inside the mapped module they
 * are read from memory; otherwise the section headers and the section name
 * string table are mapped in from the module's file.
 * Returns true if the section headers were located and processed, and false
 * if they could not be obtained (which is also logged in debug builds).
 */
static bool
rseq_process_module(module_area_t *ma, bool at_map, bool saw_glibc_rseq_reg)
{
    bool res = false;
    ASSERT(is_elf_so_header(ma->start, ma->end - ma->start));
    ELF_HEADER_TYPE *elf_hdr = (ELF_HEADER_TYPE *)ma->start;
    ASSERT(elf_hdr->e_shentsize == sizeof(ELF_SECTION_HEADER_TYPE));
    int fd = INVALID_FILE;
    byte *sec_map = NULL, *str_map = NULL;
    /* The sizes are non-zero only for regions that we map from the file below. */
    size_t sec_size = 0, str_size = 0;
    ELF_SECTION_HEADER_TYPE *sec_hdr = NULL;
    char *strtab = NULL;
    ssize_t load_offs = ma->start - ma->os_data.base_address;
    if (at_map && elf_hdr->e_shoff + ma->start < ma->end) {
        /* The section headers are inside the mapped module: read them in place.
         * sec_size stays 0 so that nothing is unmapped during cleanup.
         */
        sec_map = elf_hdr->e_shoff + ma->start;
        sec_hdr = (ELF_SECTION_HEADER_TYPE *)sec_map;
        strtab = (char *)(ma->start + sec_hdr[elf_hdr->e_shstrndx].sh_offset);
        if (strtab > (char *)ma->end)
            goto rseq_process_module_cleanup;
    } else {
        /* The section headers are not available in memory, so we go to the file.
         * NOTE(review): the original text indicates that this is the common
         * case: they are typically at the end of the file. For this reason, we delay
         * calling this function until we see the app use rseq.
         */
        if (ma->full_path == NULL)
            goto rseq_process_module_cleanup;
        fd = os_open(ma->full_path, OS_OPEN_READ);
        if (fd == INVALID_FILE)
            goto rseq_process_module_cleanup;
        /* Map the page-aligned file range that covers all of the section headers. */
        off_t offs = ALIGN_BACKWARD(elf_hdr->e_shoff, PAGE_SIZE);
        sec_size =
            ALIGN_FORWARD(elf_hdr->e_shoff + elf_hdr->e_shnum * elf_hdr->e_shentsize,
                          PAGE_SIZE) -
            offs;
        sec_map =
            os_map_file(fd, &sec_size, offs, NULL, MEMPROT_READ, MAP_FILE_COPY_ON_WRITE);
        if (sec_map == NULL)
            goto rseq_process_module_cleanup;
        sec_hdr = (ELF_SECTION_HEADER_TYPE *)(sec_map + elf_hdr->e_shoff - offs);
        /* Likewise for the section name string table. */
        offs = ALIGN_BACKWARD(sec_hdr[elf_hdr->e_shstrndx].sh_offset, PAGE_SIZE);
        str_size = ALIGN_FORWARD(sec_hdr[elf_hdr->e_shstrndx].sh_offset +
                                     sec_hdr[elf_hdr->e_shstrndx].sh_size,
                                 PAGE_SIZE) -
            offs;
        str_map =
            os_map_file(fd, &str_size, offs, NULL, MEMPROT_READ, MAP_FILE_COPY_ON_WRITE);
        if (str_map == NULL)
            goto rseq_process_module_cleanup;
        strtab = (char *)(str_map + sec_hdr[elf_hdr->e_shstrndx].sh_offset - offs);
    }
    /* When saw_glibc_rseq_reg is set we process the sections the same way as at
     * map time.  NOTE(review): presumably this is because the loader has not
     * relocated the executable yet.
     */
    rseq_process_elf_sections(ma, at_map || saw_glibc_rseq_reg, sec_hdr, strtab,
                              load_offs);
    res = true;
rseq_process_module_cleanup:
    /* Only unmap what we actually mapped from the file: a failed os_map_file()
     * leaves the pointer NULL even though the size was already computed.
     */
    if (str_map != NULL && str_size != 0)
        os_unmap_file(str_map, str_size);
    if (sec_map != NULL && sec_size != 0)
        os_unmap_file(sec_map, sec_size);
    if (fd != INVALID_FILE)
        os_close(fd);
    DODEBUG({
        if (!res) {
            const char *name = GET_MODULE_NAME(&ma->names);
            if (name == NULL)
                name = "(null)";
            LOG(GLOBAL, LOG_INTERP | LOG_VMAREAS, 2,
                "%s: error looking for rseq table in %s\n", __FUNCTION__, name);
            /* No warning is raised for the vdso. */
            if (strstr(name, "linux-vdso.so") == NULL) {
                SYSLOG_INTERNAL_WARNING_ONCE(
                    "Failed to identify whether a module has an rseq table");
            }
        }
    });
    return res;
}
static bool
try_struct_rseq(void *try_addr)
{
static const int RSEQ_RARE_SIGNATURE = 42;
int res = dynamorio_syscall(SYS_rseq, 4, try_addr, sizeof(struct rseq),
RSEQ_FLAG_UNREGISTER, RSEQ_RARE_SIGNATURE);
LOG(GLOBAL, LOG_LOADER, 3, "Tried rseq @ " PFX " => %d\n", try_addr, res);
if (res == -EINVAL)
return false;
* actually used 42 for its signature we'll have to re-register it.
*/
if (res == 0) {
int res = dynamorio_syscall(SYS_rseq, 4, try_addr, sizeof(struct rseq), 0,
RSEQ_RARE_SIGNATURE);
ASSERT(res == 0);
res = -EPERM;
}
if (res == -EPERM) {
return true;
}
return false;
}
* we must search for its TLS location.
*/
static int
rseq_locate_tls_offset(void)
{
* has a consistent %fs:-offs address. Unfortunately, using a local copy of the
* rseq code for our non-instrumented execution requires us to locate the app's
* struct using heuristics, because the system call was poorly designed and will not
* let us replace the app's. Alternatives of no local copy have worse problems.
*/
* are <64 possible slots.
*/
int offset = 0;
byte *addr = get_app_segment_base(LIB_SEG_TLS);
if (addr > 0) {
byte *try_glibc_addr = addr + GLIBC_RSEQ_OFFSET;
if (try_struct_rseq(try_glibc_addr)) {
LOG(GLOBAL, LOG_LOADER, 2,
"Found glibc struct rseq @ " PFX " for thread => %s:%s0x%x\n",
try_glibc_addr, get_register_name(LIB_SEG_TLS),
(GLIBC_RSEQ_OFFSET < 0 ? "-" : ""), ABS(GLIBC_RSEQ_OFFSET));
return GLIBC_RSEQ_OFFSET;
}
}
* or the offset of glibc's struct rseq has changed. We do a wider search now.
*/
byte *seg_bottom;
size_t seg_size;
if (addr > 0 && get_memory_info(addr, &seg_bottom, &seg_size, NULL)) {
LOG(GLOBAL, LOG_LOADER, 3, "rseq within static TLS " PFX " - " PFX "\n",
seg_bottom, addr);
int alignment = __alignof(struct rseq_cs);
int i;
* is present in the struct pthread, which is at a positive offset from the
* app library segment base on x86, and negative on aarchxx. However, in the
* absence of rseq support from glibc, the app manually registers its own
* struct rseq which is present in static TLS, which is at a negative offset
* from the app library segment base on x86, and positive on aarchxx.
*/
ASSERT(seg_bottom <= addr && addr < seg_bottom + seg_size);
for (i = (seg_bottom - addr) / alignment;
addr + i * alignment < seg_bottom + seg_size; ++i) {
byte *try_addr = addr + i * alignment;
ASSERT(seg_bottom <= try_addr &&
try_addr < seg_bottom + seg_size);
* registered one. Our caller is not supposed to call here until the app
* has registered the current thread (either manually or using glibc).
*/
if (try_struct_rseq(try_addr)) {
LOG(GLOBAL, LOG_LOADER, 2,
"Found struct rseq @ " PFX " for thread => %s:%s0x%x\n", try_addr,
get_register_name(LIB_SEG_TLS), (i < 0 ? "-" : ""),
ABS(i) * alignment);
offset = i * alignment;
break;
}
}
}
return offset;
}
/* Called for an app rseq system call whose first parameter (the address of
 * the app's struct rseq) is in dcontext->sys_param0.
 * Records the offset of the struct from the app's library TLS segment base if
 * none is recorded yet, requires that every registration uses that same
 * offset (a fatal error is raised otherwise), and triggers locating the rseq
 * regions.
 */
void
rseq_process_syscall(dcontext_t *dcontext)
{
    byte *seg_base = get_app_segment_base(LIB_SEG_TLS);
    byte *app_addr = (byte *)dcontext->sys_param0;
    bool constant_offset = false;
    bool first_rseq_registration = false;
    if (rseq_tls_offset == 0) {
        SELF_UNPROTECT_DATASEC(DATASEC_RARELY_PROT);
        int offset = app_addr - seg_base;
        /* An exchange is used so that we can tell whether another thread stored an
         * offset in between our check above and our write.
         */
        int prior = atomic_exchange_int(&rseq_tls_offset, offset);
        SELF_PROTECT_DATASEC(DATASEC_RARELY_PROT);
        if (prior == 0)
            first_rseq_registration = true;
        constant_offset = (prior == 0 || prior == offset);
        LOG(GLOBAL, LOG_LOADER, 2,
            "Observed struct rseq at syscall @ " PFX " for thread => %s:%s0x%x\n",
            app_addr, get_register_name(LIB_SEG_TLS), (rseq_tls_offset < 0 ? "-" : ""),
            ABS(rseq_tls_offset));
    } else
        constant_offset = (seg_base + rseq_tls_offset == app_addr);
    if (!constant_offset) {
        REPORT_FATAL_ERROR_AND_EXIT(RSEQ_BEHAVIOR_UNSUPPORTED, 3, get_application_name(),
                                    get_application_pid(),
                                    "struct rseq is not always at the same offset");
        ASSERT_NOT_REACHED();
    }
    /* We use the sign of the offset to identify a registration made by glibc: its
     * struct rseq is
     * at a positive offset from thread pointer on x86 and negative offset on AArch64,
     * both unlike the static TLS used by manual app registration.
     */
    rseq_locate_rseq_regions(first_rseq_registration &&
                             IF_X86_ELSE(rseq_tls_offset > 0, rseq_tls_offset < 0));
}
*
* To avoid extra overhead going to disk to read section headers, we delay looking
* for rseq data until the app invokes an rseq syscall (or on attach we see a thread
* that has rseq set up). We document that we do not handle the app using rseq
* regions for non-rseq purposes, so we do not need to flush the cache here.
* Since we also identify the rseq_cs address here, this should be called *after*
* the app has registered the current thread for rseq.
*/
void
rseq_locate_rseq_regions(bool saw_glibc_rseq_reg)
{
if (rseq_enabled)
return;
* thus requiring synchronization.
*/
d_r_mutex_lock(&rseq_trigger_lock);
if (rseq_enabled) {
d_r_mutex_unlock(&rseq_trigger_lock);
return;
}
int offset = 0;
if (rseq_tls_offset == 0) {
offset = rseq_locate_tls_offset();
if (offset == 0) {
REPORT_FATAL_ERROR_AND_EXIT(
RSEQ_BEHAVIOR_UNSUPPORTED, 3, get_application_name(),
get_application_pid(),
"struct rseq is not in static thread-local storage");
ASSERT_NOT_REACHED();
}
}
SELF_UNPROTECT_DATASEC(DATASEC_RARELY_PROT);
bool new_value = true;
ATOMIC_1BYTE_WRITE(&rseq_enabled, new_value, false);
if (rseq_tls_offset == 0)
ATOMIC_4BYTE_WRITE(&rseq_tls_offset, offset, false);
SELF_PROTECT_DATASEC(DATASEC_RARELY_PROT);
module_iterator_t *iter = module_iterator_start();
while (module_iterator_hasnext(iter)) {
module_area_t *ma = module_iterator_next(iter);
rseq_process_module(ma, false , saw_glibc_rseq_reg);
}
module_iterator_stop(iter);
d_r_mutex_unlock(&rseq_trigger_lock);
}
/* Called for module ma.  The module is searched for rseq regions only if rseq
 * has already been enabled; otherwise nothing is done here (enabling later
 * walks all modules: see rseq_locate_rseq_regions()).
 */
void
rseq_module_init(module_area_t *ma, bool at_map)
{
    if (!rseq_enabled)
        return;
    rseq_process_module(ma, at_map, false);
}
/* Handles an abort that occurred during the native execution of an rseq
 * region.  The abort is reported through instrument_kernel_xfer() with type
 * DR_XFER_RSEQ_ABORT; if that call returns true, dcontext->next_tag is updated
 * from the (possibly modified) pc in the mcontext.
 */
void
rseq_process_native_abort(dcontext_t *dcontext)
{
    LOG(THREAD, LOG_INTERP | LOG_VMAREAS, 2, "Abort triggered in rseq native code\n");
    /* We report the abort as a kernel transfer.  A source context is provided only
     * if the last fragment's tag lies in a known rseq region, which seems
     * reasonable.
     */
    rseq_region_t *info = NULL;
    priv_mcontext_t *source_mc = NULL;
    if (dcontext->last_fragment != NULL &&
        vmvector_lookup_data(d_r_rseq_areas, dcontext->last_fragment->tag, NULL, NULL,
                             (void **)&info)) {
        /* By the time we get here, clients may have already observed
         * the whole sequence when any abort anywhere in the native execution occurs.
         * We leave it up to the client to roll back at least the final instr.
         * Since we don't know the interrupted PC (the kernel doesn't tell us), we
         * do what the kernel does and present the abort handler as the PC.
         * We similarly use the target context for the rest of the context.
         */
        source_mc = HEAP_TYPE_ALLOC(dcontext, priv_mcontext_t, ACCT_CLIENT, PROTECTED);
        *source_mc = *get_mcontext(dcontext);
        source_mc->pc = info->handler;
    }
    /* The target context's pc is where execution continues. */
    get_mcontext(dcontext)->pc = dcontext->next_tag;
    if (instrument_kernel_xfer(dcontext, DR_XFER_RSEQ_ABORT, osc_empty, NULL, source_mc,
                               dcontext->next_tag, get_mcontext(dcontext)->xsp, osc_empty,
                               get_mcontext(dcontext), 0)) {
        dcontext->next_tag = canonicalize_pc_target(dcontext, get_mcontext(dcontext)->pc);
    }
    if (source_mc != NULL)
        HEAP_TYPE_FREE(dcontext, source_mc, priv_mcontext_t, ACCT_CLIENT, PROTECTED);
    /* NOTE(review): presumably this avoids a second abort report for a signal that
     * caused the abort. (It might be better to instead suppress this abort-exit
     * event and present the signal as causing the abort but that is more complex
     * to implement so we pretend the signal came in after the abort.)
     * XXX: We saw a double abort and assume it is from some signal+abort
     * combination but we failed to reproduce it in our linux.rseq tests cases
     * so we do not have proof that this is solving anything here.
     */
    translate_clear_last_direct_translation(dcontext);
}
/* Inserts a label, as a meta instruction, in front of the first instruction of
 * ilist to mark the entry to the rseq region starting at tag.  The label's note
 * is DR_NOTE_RSEQ_ENTRY and its data area holds the region's end in slot 0 and
 * its abort handler in slot 1.
 * tag must be the start of a known rseq region; if it is not, this asserts
 * and inserts nothing.
 */
void
rseq_insert_start_label(dcontext_t *dcontext, app_pc tag, instrlist_t *ilist)
{
    app_pc start, end, handler;
    bool in_region = rseq_get_region_info(tag, &start, &end, &handler, NULL, NULL);
    if (!in_region || tag != start) {
        ASSERT_NOT_REACHED();
        return;
    }

    instr_t *entry_label = INSTR_CREATE_label(dcontext);
    instr_set_note(entry_label, (void *)DR_NOTE_RSEQ_ENTRY);

    dr_instr_label_data_t *payload = instr_get_label_data_area(entry_label);
    payload->data[0] = (ptr_uint_t)end;
    payload->data[1] = (ptr_uint_t)handler;

    instr_t *first = instrlist_first(ilist);
    instrlist_meta_preinsert(ilist, first, entry_label);
}