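/* DynamoRIO core/unix/signal.c -- code preview from AtomGit.
 * DynamoRIO: a runtime code manipulation system supporting multi-platform
 * program analysis and optimization.
 *
 * Snapshot of commit f646a632, created 2023-08-14 (from the commit history):
 * "i#5365: Add AArch64 SVE support to the core (part 1) (#5835)".
 */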
/* **********************************************************
 * Copyright (c) 2011-2023 Google, Inc.  All rights reserved.
 * Copyright (c) 2000-2010 VMware, Inc.  All rights reserved.
 * **********************************************************/

/*
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * * Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 *
 * * Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * * Neither the name of VMware, Inc. nor the names of its contributors may be
 *   used to endorse or promote products derived from this software without
 *   specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL VMWARE, INC. OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
 * DAMAGE.
 */

/* Copyright (c) 2003-2007 Determina Corp. */
/* Copyright (c) 2001-2003 Massachusetts Institute of Technology */
/* Copyright (c) 2000-2001 Hewlett-Packard Company */

/*
 * signal.c - dynamorio signal handler
 */

#include <errno.h>
#undef errno

#include "signal_private.h" /* pulls in globals.h for us, in right order */

/* We want to build on older toolchains so we have our own copy of signal
 * data structures
 */
#include "include/siginfo.h"
#ifdef LINUX
#    include "include/sigcontext.h"
#    include "include/signalfd.h"
#    include "include/clone3.h"
#    include "../globals.h" /* after our sigcontext.h, to preclude bits/sigcontext.h */
#elif defined(MACOS)
#    include "../globals.h" /* this defines _XOPEN_SOURCE for Mac */
#    include <signal.h>     /* after globals.h, for _XOPEN_SOURCE from os_exports.h */
#    include "os_public.h"
#endif

#ifdef LINUX
#    include <linux/sched.h>
#endif

#include <sys/time.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <ucontext.h>
#include "os_private.h"
#include "../fragment.h"
#include "../fcache.h"
#include "../perfctr.h"
#include "arch.h"
#include "../monitor.h"  /* for trace_abort */
#include "../link.h"     /* for linking interrupted fragment_t */
#include "instr.h"       /* to find target of SIGSEGV */
#include "decode.h"      /* to find target of SIGSEGV */
#include "decode_fast.h" /* to handle self-mod code */
#include "../synch.h"
#include "../nudge.h"
#include "disassemble.h"
#include "ksynch.h"
#include "tls.h" /* tls_reinstate_selector */
#include "../translate.h"

#ifdef LINUX
#    include "include/syscall.h"
#else
#    include <sys/syscall.h>
#    ifdef MACOS
#        include "include/syscall_mach.h"
#    endif
#endif

#include "instrument.h"

#ifdef VMX86_SERVER
#    include <errno.h>
#endif

/* Define the Linux names, which the code is already using */
#ifndef SA_NOMASK
#    define SA_NOMASK SA_NODEFER
#endif
#ifndef SA_ONESHOT
#    define SA_ONESHOT SA_RESETHAND
#endif
#ifndef SS_AUTODISARM
#    define SS_AUTODISARM (1U << 31)
#endif
#ifndef SS_FLAG_BITS
#    define SS_FLAG_BITS SS_AUTODISARM
#endif
#ifdef X86
/* Kernel-only flags. */
#    define SA_IA32_ABI 0x02000000U
#    define SA_X32_ABI 0x01000000U
#endif

/**** data structures ***************************************************/

/* The signal numbers are slightly different between operating systems.
 * To support differing default actions, we have separate arrays, rather
 * than indirecting to a single all-signals array.
 */
extern int default_action[];

/* We know that many signals are always asynchronous.
 * Others, however, may be synchronous or may not -- e.g., another process
 * could send us a SIGSEGV, and there is no way we can tell whether it
 * was generated by a real memory fault or not.  Thus we have to assume
 * that we must not delay any SIGSEGV deliveries.
 */
extern bool can_always_delay[];

/* Returns true iff sig is one of the alarm signals: SIGALRM, SIGVTALRM,
 * or SIGPROF.
 */
static inline bool
sig_is_alarm_signal(int sig)
{
    switch (sig) {
    case SIGALRM:
    case SIGVTALRM:
    case SIGPROF: return true;
    default: return false;
    }
}

/* We do not use SIGSTKSZ b/c for things like code modification
 * we end up calling many core routines and so want more space.
 * Also, SIGSTKSZ is now defined as sysconf(_SC_SIGSTKSZ) and we
 * cannot invoke libc.
 */
#define SIGSTACK_SIZE (DYNAMO_OPTION(signal_stack_size))

/* this flag not defined in our headers */
#define SA_RESTORER 0x04000000

/* Evaluates to whether signal sig is "RT" from the app's point of view.
 * If there is no app sigaction (action[sig] is NULL), it's RT, since that's
 * our handler.  Otherwise, in the second arm of IF_X64_ELSE on Linux, the
 * answer is whether the app's recorded action has SA_SIGINFO set.  The first
 * arm on Linux, and MacOS, are unconditionally true.
 * NOTE(review): presumably IF_X64_ELSE(a, b) selects a on 64-bit builds and b
 * otherwise -- confirm against its definition.
 */
#ifdef LINUX
#    define IS_RT_FOR_APP(info, sig)                        \
        IF_X64_ELSE(true,                                   \
                    ((info)->sighand->action[(sig)] == NULL \
                         ? true                             \
                         : (TEST(SA_SIGINFO, (info)->sighand->action[(sig)]->flags))))
#elif defined(MACOS)
#    define IS_RT_FOR_APP(info, sig) (true)
#endif

/* Evaluates to true when the app has an alternate signal stack recorded in
 * info->app_sigstack: ss_sp is non-NULL and ss_flags is not exactly SS_DISABLE.
 *
 * Background on the kernel's behavior:
 * kernel sets size and sp to 0 for SS_DISABLE
 * when asked, will hand back SS_ONSTACK only if current xsp is inside the
 * alt stack; otherwise, if an alt stack is registered, it will give flags of 0
 * We do not support the "legacy stack switching" that uses the restorer field
 * as seen in kernel sources.
 */
#define APP_HAS_SIGSTACK(info) \
    ((info)->app_sigstack.ss_sp != NULL && (info)->app_sigstack.ss_flags != SS_DISABLE)

/* Evaluates to true when the app has an alternate signal stack
 * (APP_HAS_SIGSTACK) and its recorded action for sig requests it via
 * SA_ONSTACK.
 *
 * Under normal circumstances the app's action[] entry is lazily initialized when the
 * app registers a signal handler, but during detach there are points where we
 * are still intercepting signals after action[sig] has been set to
 * zeros. To be extra defensive, we do a NULL check.
 */
#define USE_APP_SIGSTACK(info, sig)                                    \
    (APP_HAS_SIGSTACK(info) && (info)->sighand->action[sig] != NULL && \
     TEST(SA_ONSTACK, (info)->sighand->action[sig]->flags))

/* Evaluates to true when the blocked state of sig is emulated rather than left
 * to the kernel: either the intercept_all_signals option is on, or sig is
 * marked in info->sighand->we_intercept[].
 *
 * If we only intercept a few signals, we leave whether un-intercepted signals
 * are blocked unchanged and stored in the kernel.  If we intercept all (not
 * quite yet: PR 297033, hence the need for this macro) we emulate the mask for
 * all.
 */
#define EMULATE_SIGMASK(info, sig) \
    (DYNAMO_OPTION(intercept_all_signals) || (info)->sighand->we_intercept[(sig)])

/* i#27: custom data to pass to the child of a clone */
/* PR i#149/403015: clone record now passed via a new dstack */
/* The record is filled in by create_clone_record() in the parent thread.  It is
 * aligned to ABI_STACK_ALIGNMENT because it is normally carved out of the high
 * end of the new thread's dstack.
 */
typedef struct _clone_record_t {
    byte *dstack; /* dstack for new thread - allocated by parent thread */
#ifdef MACOS
    /* XXX i#1403: once we have lower-level, earlier thread interception we can
     * likely switch to something closer to what we do on Linux.
     * This is used for bsdthread_create, where app_thread_xsp is NULL;
     * for vfork, app_thread_xsp is non-NULL and this is unused.
     */
    void *thread_arg;
#endif
    reg_t app_thread_xsp; /* app xsp preserved for new thread to use */
    /* Where the new thread continues: create_clone_record() sets this to the
     * parent's dcontext->asynch_target (or to the thread function on the MacOS
     * path where app_thread_xsp is NULL).
     */
    app_pc continuation_pc;
    /* The parent's dcontext->owning_thread at record creation. */
    thread_id_t caller_id;
    /* The parent's dcontext->sys_num at record creation. */
    int clone_sysnum;
#ifdef LINUX
    /* Flags in the args to SYS_clone3 are 64-bit. */
    uint64 clone_flags;
    /* Pointer to the app's copy of clone_args.
     * We restore this to the corresponding reg
     * post-syscall in the child thread.
     */
    clone3_syscall_args_t *app_clone_args;
#else
    uint clone_flags;
#endif
    /* By-value copy of the parent's thread_sig_info_t, with app_sigstack
     * cleared and marked SS_DISABLE by create_clone_record().
     */
    thread_sig_info_t info;
    /* Pointer to the parent's live thread_sig_info_t (its signal_field). */
    thread_sig_info_t *parent_info;
    /* The parent's dcontext->pcprofile_field at record creation. */
    void *pcprofile_info;
#ifdef AARCHXX
    /* To ensure we have the right value as of the point of the clone, we
     * store it here (we'll have races if we try to get it during new thread
     * init).
     */
    reg_t app_stolen_value;
#    ifndef AARCH64
    /* The parent's ISA mode (dr_get_isa_mode()) at record creation. */
    dr_isa_mode_t isa_mode;
#    endif
    /* To ensure we have the right app lib tls base in child thread,
     * we store it here if necessary (clone w/o CLONE_SETTLS or vfork).
     */
    void *app_lib_tls_base;
#endif
    /* we leave some padding at base of stack for dynamorio_clone
     * to store values
     */
    reg_t for_dynamorio_clone[4];
} __attribute__((__aligned__(ABI_STACK_ALIGNMENT))) clone_record_t;

/* i#350: set up signal handler for safe_read/faults during init */
static thread_sig_info_t init_info;
static kernel_sigset_t init_sigmask;

#ifdef DEBUG
static bool removed_sig_handler;
#endif

os_cxt_ptr_t osc_empty;

/* For delivering signals that arrive in now-native threads while we're still
 * detaching other threads.  The thread that receives the signal has no dcontext
 * anymore, so we need a global record of its handler (DR's handler is still registered
 * with the kernel).
 * XXX i#1921: To properly handle multiple separate signal handler thread groups
 * within this single DR process domain, we would need a list of these, along with
 * lists of threads within each.  Since that is very rare, and requires either
 * hardcoded maximums or complexities with exit-time heap, we currently do not
 * support that and die if asked to deliver a native signal in such circumstances.
 */
DECLARE_CXTSWPROT_VAR(static read_write_lock_t detached_sigact_lock,
                      INIT_READWRITE_LOCK(detached_sigact_lock));
static kernel_sigaction_t detached_sigact[SIGARRAY_SIZE];
static bool multiple_handlers_present; /* Accessed using atomics, not the lock. */

/**** function prototypes ***********************************************/

/* in x86.asm */
void
main_signal_handler(int sig, kernel_siginfo_t *siginfo, kernel_ucontext_t *ucxt);

static void
set_handler_and_record_app(dcontext_t *dcontext, thread_sig_info_t *info, int sig,
                           kernel_sigaction_t *act);

static void
intercept_signal(dcontext_t *dcontext, thread_sig_info_t *info, int sig);

static void
signal_info_init_sigaction(dcontext_t *dcontext, thread_sig_info_t *info);

static void
signal_info_exit_sigaction(dcontext_t *dcontext, thread_sig_info_t *info,
                           bool other_thread);

static bool
execute_handler_from_cache(dcontext_t *dcontext, int sig, sigframe_rt_t *our_frame,
                           sigcontext_t *sc_orig, fragment_t *f, byte *access_address);

static bool
execute_handler_from_dispatch(dcontext_t *dcontext, int sig);

static void
execute_native_handler(dcontext_t *dcontext, int sig, sigframe_rt_t *our_frame);

/* Execute default action from code cache and may terminate the process.
 * If returns, the return value decides if caller should restore
 * the untranslated context.
 */
static bool
execute_default_from_cache(dcontext_t *dcontext, int sig, sigframe_rt_t *frame,
                           sigcontext_t *sc_orig, bool forged);

static void
execute_default_from_dispatch(dcontext_t *dcontext, int sig, sigframe_rt_t *frame);

static bool
reroute_to_unmasked_thread(dcontext_t *dcontext, sigframe_rt_t *frame, int sig);

static bool
handle_alarm(dcontext_t *dcontext, int sig, kernel_ucontext_t *ucxt);

static bool
handle_suspend_signal(dcontext_t *dcontext, kernel_siginfo_t *siginfo,
                      kernel_ucontext_t *ucxt, sigframe_rt_t *frame);

static bool
handle_nudge_signal(dcontext_t *dcontext, kernel_siginfo_t *siginfo,
                    kernel_ucontext_t *ucxt);

static void
init_itimer(dcontext_t *dcontext, bool first);

static bool
set_actual_itimer(dcontext_t *dcontext, int which, thread_sig_info_t *info, bool enable);

static bool
alarm_signal_has_DR_only_itimer(dcontext_t *dcontext, int signal);

#ifdef DEBUG
static void
dump_sigset(dcontext_t *dcontext, kernel_sigset_t *set);
/* Debug helper: logs, at level 3, the threads_unmasked[] counter for every
 * signal number from 1 through MAX_SIGNUM, prefixed by the caller-supplied
 * label "where".  Does nothing for a NULL or global dcontext, or when the
 * thread has no sighand yet.
 */
static void
dump_unmasked(dcontext_t *dcontext, const char *where)
{
    thread_sig_info_t *info;
    int sig;
    if (dcontext == NULL || dcontext == GLOBAL_DCONTEXT)
        return;
    info = (thread_sig_info_t *)dcontext->signal_field;
    if (info->sighand == NULL)
        return;
    LOG(THREAD, LOG_ASYNCH, 3, "%s: threads_unmasked: ", where);
    for (sig = 1; sig <= MAX_SIGNUM; sig++) {
        LOG(THREAD, LOG_ASYNCH, 3, "[%d]=%d ", sig, info->sighand->threads_unmasked[sig]);
        /* Break the line after every 16 entries and after the final entry. */
        if (sig == MAX_SIGNUM || sig % 16 == 0)
            LOG(THREAD, LOG_ASYNCH, 3, "\n");
    }
}
#endif

static bool
is_sys_kill(dcontext_t *dcontext, byte *pc, byte *xsp, kernel_siginfo_t *info);

/* Issues the raw sigaction system call (SYS_sigaction on MacOS,
 * SYS_rt_sigaction otherwise) through dynamorio_syscall and returns its result.
 *
 * Side effect: on Linux builds without VMX86_SERVER, a non-NULL act that lacks
 * SA_RESTORER is modified in place: the flag is added and the restorer is
 * pointed at dynamorio_sigreturn.
 */
int
sigaction_syscall(int sig, kernel_sigaction_t *act, kernel_sigaction_t *oact)
{
#if !defined(VMX86_SERVER) && defined(LINUX)
    /* PR 305020: must have SA_RESTORER for x64 */
    /* i#2812: must have SA_RESTORER to handle vsyscall32 being disabled */
    if (act != NULL) {
        if (!TEST(SA_RESTORER, act->flags)) {
            act->restorer = (void (*)(void))dynamorio_sigreturn;
            act->flags |= SA_RESTORER;
        }
    }
#endif
    int res = dynamorio_syscall(IF_MACOS_ELSE(SYS_sigaction, SYS_rt_sigaction), 4, sig,
                                act, oact, sizeof(kernel_sigset_t));
    return res;
}

/* Returns false for SIGKILL and SIGSTOP and true for every other signal
 * number.
 */
static inline bool
signal_is_interceptable(int sig)
{
    switch (sig) {
    case SIGKILL:
    case SIGSTOP: return false;
    default: return true;
    }
}

/* Checks for a signal that is for certain a synchronous fault: sig must be one
 * of SIGSEGV, SIGBUS, SIGILL, SIGFPE, or SIGTRAP, and is_sys_kill() must report
 * false for it.  is_sys_kill() is consulted only for those five signals.
 */
static bool
signal_is_fault(dcontext_t *dcontext, int sig, byte *pc, byte *xsp,
                kernel_siginfo_t *info)
{
    bool is_fault_signum = (sig == SIGSEGV || sig == SIGBUS || sig == SIGILL ||
                            sig == SIGFPE || sig == SIGTRAP);
    if (!is_fault_signum)
        return false;
    return !is_sys_kill(dcontext, pc, xsp, info);
}

/* Classifies the signal described by info as process-wide (returns true) or
 * directed at a specific thread (returns false).
 *
 * On MacOS: alarm signals are process-wide; otherwise, if the thread is at a
 * syscall (is_at_do_syscall()), SYS_kill yields true and SYS___pthread_kill
 * yields false; everything else yields true.
 *
 * Elsewhere the decision is keyed on info->si_code, and for SI_USER/SI_QUEUE
 * also on the syscall number in dcontext->sys_num when the thread is at a
 * syscall.  Unknown si_code values are treated as process-wide.
 */
static bool
signal_is_process_wide(dcontext_t *dcontext, kernel_siginfo_t *info, byte *pc, byte *xsp)
{
#ifdef MACOS
    if (sig_is_alarm_signal(info->si_signo))
        return true;
    if (is_at_do_syscall(dcontext, pc, xsp)) {
        if (dcontext->sys_num == SYS_kill)
            return true;
        if (dcontext->sys_num == SYS___pthread_kill)
            return false;
    }
    return true;
#else
    switch (info->si_code) {
    case SI_KERNEL:
        /* Legacy interval timer sets SI_KERNEL instead of SI_TIMER. */
        if (sig_is_alarm_signal(info->si_signo))
            return true;
        /* XXX: For other signals, SI_KERNEL will always be the active thread, right? */
        return false;
    case SI_USER:
        if (is_at_do_syscall(dcontext, pc, xsp)) {
            if (dcontext->sys_num == SYS_kill || dcontext->sys_num == SYS_rt_sigqueueinfo)
                return true;
            /* tkill and tgkill should set SI_TKILL, but just in case: */
            if (dcontext->sys_num == SYS_tkill || dcontext->sys_num == SYS_tgkill ||
                dcontext->sys_num == SYS_rt_tgsigqueueinfo) {
                /* Passing -1 as tid does not turn into process-wide:
                 * these can only send to a specific thread.
                 */
                return false;
            }
        }
        /* fall through */
        /* NOTE(review): there is no return here, so control continues into
         * the SI_QUEUE case.  SYS_rt_tgsigqueueinfo was already handled above,
         * so this presumably ends in "return true" -- confirm the fall-through
         * is intentional.
         */
    case SI_QUEUE:
        if (is_at_do_syscall(dcontext, pc, xsp) &&
            dcontext->sys_num == SYS_rt_tgsigqueueinfo)
            return false;
        return true;
    case SI_TIMER: return true;
    case SI_MESGQ: return true;
    case SI_ASYNCIO: return true;
    case SI_SIGIO: return true;
    case SI_TKILL: return false;
#    ifdef SI_DETHREAD
    case SI_DETHREAD: return true;
#    endif
    default: return true;
    }
#endif
}

/* Issues SYS_sigaltstack through dynamorio_syscall with the given new and old
 * stack descriptors and returns the syscall's result.
 */
static inline int
sigaltstack_syscall(const stack_t *newstack, stack_t *oldstack)
{
    int res = dynamorio_syscall(SYS_sigaltstack, 2, newstack, oldstack);
    return res;
}

/* Issues SYS_getitimer through dynamorio_syscall for timer "which", writing
 * into val, and returns the syscall's result.
 */
static inline int
getitimer_syscall(int which, struct itimerval *val)
{
    int res = dynamorio_syscall(SYS_getitimer, 2, which, val);
    return res;
}

/* Issues SYS_setitimer through dynamorio_syscall for timer "which" with the
 * new value val and the old-value out-parameter old, and returns the syscall's
 * result.
 */
static inline int
setitimer_syscall(int which, struct itimerval *val, struct itimerval *old)
{
    int res = dynamorio_syscall(SYS_setitimer, 3, which, val, old);
    return res;
}

/* Issues the raw sigprocmask system call (SYS_sigprocmask on MacOS,
 * SYS_rt_sigprocmask otherwise) through dynamorio_syscall, forwarding all four
 * arguments unchanged, and returns the syscall's result.
 */
static inline int
sigprocmask_syscall(int how, kernel_sigset_t *set, kernel_sigset_t *oset,
                    size_t sigsetsize)
{
    int res = dynamorio_syscall(IF_MACOS_ELSE(SYS_sigprocmask, SYS_rt_sigprocmask), 4,
                                how, set, oset, sigsetsize);
    return res;
}

/* Sets the calling thread's signal mask (SIG_SETMASK) to block every signal
 * except SIGSEGV, SIGBUS, and the num_signals signal numbers passed as
 * trailing int arguments.  The previous mask is passed back through oset.
 */
void
block_all_noncrash_signals_except(kernel_sigset_t *oset, int num_signals,
                                  ... /* list of signals */)
{
    kernel_sigset_t blocked;
    va_list exempt;
    int remaining = num_signals;

    kernel_sigfillset(&blocked);
    /* We never block SIGSEGV or SIGBUS: we need them for various safe reads and to
     * properly report crashes.
     */
    kernel_sigdelset(&blocked, SIGSEGV);
    kernel_sigdelset(&blocked, SIGBUS);
    /* Also leave each caller-listed signal unblocked. */
    va_start(exempt, num_signals);
    while (remaining-- > 0) {
        kernel_sigdelset(&blocked, va_arg(exempt, int));
    }
    va_end(exempt);
    sigprocmask_syscall(SIG_SETMASK, &blocked, oset, sizeof(blocked));
}

/* Replaces the calling thread's signal mask with the empty set (SIG_SETMASK),
 * passing the previous mask back through oset.
 */
static void
unblock_all_signals(kernel_sigset_t *oset)
{
    kernel_sigset_t none_blocked;
    kernel_sigemptyset(&none_blocked);
    sigprocmask_syscall(SIG_SETMASK, &none_blocked, oset, sizeof(none_blocked));
}

/* exported for stackdump.c */
/* Installs SIG_DFL as the action for sig and then unblocks sig in the calling
 * thread's mask.  Returns true iff the sigaction syscall returned 0; the
 * result of the unblock is not checked.
 */
bool
set_default_signal_action(int sig)
{
    kernel_sigaction_t dfl_act;
    kernel_sigset_t to_unblock;
    int res;

    memset(&dfl_act, 0, sizeof(dfl_act));
    dfl_act.handler = (handler_t)SIG_DFL;
    /* arm the signal */
    res = sigaction_syscall(sig, &dfl_act, NULL);
    DODEBUG({ removed_sig_handler = true; });

    /* If we're in our handler now, we have to unblock */
    kernel_sigemptyset(&to_unblock);
    kernel_sigaddset(&to_unblock, sig);
    sigprocmask_syscall(SIG_UNBLOCK, &to_unblock, NULL, sizeof(to_unblock));

    if (res != 0)
        return false;
    return true;
}

/* Installs SIG_IGN as the action for sig.  Returns true iff the sigaction
 * syscall returned 0.
 */
static bool
set_ignore_signal_action(int sig)
{
    kernel_sigaction_t ign_act;
    int res;

    memset(&ign_act, 0, sizeof(ign_act));
    ign_act.handler = (handler_t)SIG_IGN;
    /* arm the signal */
    res = sigaction_syscall(sig, &ign_act, NULL);
    if (res != 0)
        return false;
    return true;
}

/* We assume that signal handlers will be shared most of the time
 * (pthreads shares them)
 * Rather than start out with the handler table in local memory and then
 * having to transfer to global, we just always use global
 */
/* Frees a block of "size" bytes at p via global_heap_free(), accounted under
 * ACCT_OTHER.  The dcontext parameter is not used by this body.
 */
static void
handler_free(dcontext_t *dcontext, void *p, size_t size)
{
    global_heap_free(p, size HEAPACCT(ACCT_OTHER));
}

/* Allocates "size" bytes via global_heap_alloc(), accounted under ACCT_OTHER,
 * and returns the result.  The dcontext parameter is not used by this body.
 */
static void *
handler_alloc(dcontext_t *dcontext, size_t size)
{
    return global_heap_alloc(size HEAPACCT(ACCT_OTHER));
}

/* Reports the FAILED_TO_HANDLE_SIGNAL fatal error, passing the application
 * name and pid, the signal number and current thread id rendered as strings,
 * and the caller-supplied sub_message.
 * Does not return.
 */
static void
report_unhandleable_signal_and_exit(int sig, const char *sub_message)
{
    /* TODO i#1921: Add more info such as the PC and disasm of surrounding instrs. */
    char tid_str[16];
    char signum_str[8];

    snprintf(tid_str, BUFFER_SIZE_ELEMENTS(tid_str), TIDFMT, get_sys_thread_id());
    NULL_TERMINATE_BUFFER(tid_str);
    snprintf(signum_str, BUFFER_SIZE_ELEMENTS(signum_str), "%d", sig);
    NULL_TERMINATE_BUFFER(signum_str);

    REPORT_FATAL_ERROR_AND_EXIT(FAILED_TO_HANDLE_SIGNAL, 5, get_application_name(),
                                get_application_pid(), signum_str, tid_str, sub_message);
}

/**** top-level routines ***********************************************/

/* Returns whether itimers are shared among the threads of the process.
 *
 * The answer is derived once from the kernel version parsed out of
 * /proc/version and cached in function-local statics; later calls return the
 * cached value.  If the file cannot be opened, read, or parsed, itimers are
 * assumed to be thread-private (false).
 */
static bool
os_itimers_thread_shared(void)
{
    static bool itimers_shared;
    static bool cached = false;
    if (!cached) {
        file_t f = os_open("/proc/version", OS_OPEN_READ);
        if (f != INVALID_FILE) {
            char buf[128];
            int major, minor, rel;
            /* Reserve one byte for the terminator and terminate at the number
             * of bytes actually read, so that sscanf never walks into
             * uninitialized stack contents after a short or failed read.
             */
            ssize_t len = os_read(f, buf, BUFFER_SIZE_ELEMENTS(buf) - 1);
            if (len > 0) {
                buf[len] = '\0';
                if (sscanf(buf, "%*s %*s %d.%d.%d", &major, &minor, &rel) == 3) {
                    /* Linux NPTL in kernel 2.6.12+ has POSIX-style itimers shared
                     * among threads.
                     */
                    LOG(GLOBAL, LOG_ASYNCH, 1, "kernel version = %d.%d.%d\n", major,
                        minor, rel);
                    /* Compare the (minor, rel) pair lexicographically: a 2.x
                     * kernel with minor > 6 qualifies regardless of rel.
                     */
                    itimers_shared =
                        ((major == 2 && (minor > 6 || (minor == 6 && rel >= 12))) ||
                         (major >= 3 /* linux-3.0 or above */));
                    cached = true;
                }
            }
            os_close(f);
        }
        if (!cached) {
            /* assume not shared */
            itimers_shared = false;
            cached = true;
        }
        LOG(GLOBAL, LOG_ASYNCH, 1, "itimers are %s\n",
            itimers_shared ? "thread-shared" : "thread-private");
    }
    return itimers_shared;
}

/* Tears down the init-time signal state recorded in init_info and
 * init_sigmask: releases init_info's sigaction state via
 * signal_info_exit_sigaction() and then restores the signal mask saved in
 * init_sigmask.  Requires init_info.sighand to be non-NULL.
 * The dcontext argument is used only for logging.
 */
static void
unset_initial_crash_handlers(dcontext_t *dcontext)
{
    ASSERT(init_info.sighand != NULL);
    DODEBUG({
        /* Satisfy ASSERT in signal_info_exit_sigaction(). */
        for (int i = 1; i <= MAX_SIGNUM; i++)
            init_info.sighand->threads_unmasked[i] = 0;
    });
    signal_info_exit_sigaction(GLOBAL_DCONTEXT, &init_info, false /*!other_thread*/);
    /* Undo the unblock-all */
    sigprocmask_syscall(SIG_SETMASK, &init_sigmask, NULL, sizeof(init_sigmask));
    /* app_sigblocked is filled in when signal_swap_mask() is called. */
    DOLOG(2, LOG_ASYNCH, {
        LOG(THREAD, LOG_ASYNCH, 2, "initial app signal mask:\n");
        dump_sigset(dcontext, &init_sigmask);
    });
}

/* Process-wide signal initialization.
 *
 * Primes the cached os_itimers_thread_shared() answer, initializes signalfd
 * support on Linux, and runs signal_arch_init().  Unless running as a
 * standalone library, it then sets up init_info, intercepts SIGSEGV and SIGBUS
 * through it, and unblocks those two signals, saving the prior signal mask in
 * init_sigmask.
 */
void
d_r_signal_init(void)
{
    kernel_sigset_t set;
    IF_LINUX(IF_X86_64(ASSERT(ALIGNED(offsetof(sigpending_t, xstate), AVX_ALIGNMENT))));
    IF_MACOS(ASSERT(sizeof(kernel_sigset_t) == sizeof(__darwin_sigset_t)));
    /* Called for its side effect of computing and caching the answer. */
    os_itimers_thread_shared();

    IF_LINUX(signalfd_init());
    signal_arch_init();

    /* Do not usurp the app's signal handling when in standalone mode.
     * XXX i#1409: Refactoring and better separating general utilities from
     * managed mode operations would make things cleaner.
     */
    if (standalone_library)
        return;

    /* Set up a handler for safe_read (or other fault detection) during
     * DR init before thread is initialized.  We must do this *after* signal_arch_init()
     * and other key init in case a native signal arrives right after we install
     * our handler (i#1921).
     *
     * XXX: could set up a clone_record_t and pass to the initial
     * signal_thread_inherit() but that would require further code changes.
     * Could also call signal_thread_inherit to init this, but we don't want
     * to intercept timer signals, etc. before we're ready to handle them,
     * so we do a partial init.
     */
    signal_info_init_sigaction(GLOBAL_DCONTEXT, &init_info);
    intercept_signal(GLOBAL_DCONTEXT, &init_info, SIGSEGV);
    intercept_signal(GLOBAL_DCONTEXT, &init_info, SIGBUS);
    /* Unblock just the two intercepted signals; the previous mask is saved in
     * init_sigmask.
     */
    kernel_sigemptyset(&set);
    kernel_sigaddset(&set, SIGSEGV);
    kernel_sigaddset(&set, SIGBUS);
    sigprocmask_syscall(SIG_UNBLOCK, &set, &init_sigmask, sizeof(set));
}

/* Process-wide signal cleanup, the counterpart of d_r_signal_init().
 *
 * Deletes detached_sigact_lock, shuts down signalfd support on Linux, and, if
 * the init-time handlers recorded in init_info are still in place
 * (init_info.sighand is non-NULL), removes them and restores the saved signal
 * mask.  In debug builds it also logs the total number of signals delivered
 * when logging is enabled for LOG_ASYNCH or LOG_STATS.
 */
void
d_r_signal_exit(void)
{
    DELETE_READWRITE_LOCK(detached_sigact_lock);
    IF_LINUX(signalfd_exit());
    if (init_info.sighand != NULL) {
        /* We never took over the app (e.g., standalone mode).  Restore its state. */
        unset_initial_crash_handlers(GLOBAL_DCONTEXT);
    }
#ifdef DEBUG
    if (d_r_stats->loglevel > 0 && (d_r_stats->logmask & (LOG_ASYNCH | LOG_STATS)) != 0) {
        LOG(GLOBAL, LOG_ASYNCH | LOG_STATS, 1, "Total signals delivered: %d\n",
            GLOBAL_STAT(num_signals));
    }
#endif
}

#ifdef HAVE_SIGALTSTACK
/* Installs info->sigstack as this thread's alternate signal stack and records
 * the previously registered one in info->app_sigstack.
 * arg is the thread_sig_info_t pointer, passed as void * so that this routine
 * can be the target of call_switch_stack().
 *
 * Separated out to run from the dstack (i#2016: see below).
 * NOTE(review): the visible caller, signal_thread_init(), actually switches to
 * the top of the new alt stack rather than the dstack -- confirm the wording.
 */
static void
set_our_alt_stack(void *arg)
{
    thread_sig_info_t *info = (thread_sig_info_t *)arg;
    /* DEBUG_DECLARE presumably keeps "int rc =" only in debug builds, so the
     * syscall on the next line is always made but its result is captured only
     * for the ASSERT.
     */
    DEBUG_DECLARE(int rc =)
    sigaltstack_syscall(&info->sigstack, &info->app_sigstack);
    ASSERT(rc == 0);
}
#endif

/* First stage of per-thread signal initialization.
 *
 * Allocates and zeroes the thread's thread_sig_info_t and stores it in
 * dcontext->signal_field, creates the special heap used for pending-signal
 * units, sets up and registers DR's alternate signal stack (when
 * HAVE_SIGALTSTACK is defined), and initializes the sigblocked and child
 * locks.  The rest of per-thread init is done later by
 * signal_thread_inherit(); see the comment at the end of this routine.
 * The os_data parameter is not used by this body.
 */
void
signal_thread_init(dcontext_t *dcontext, void *os_data)
{
    thread_sig_info_t *info =
        HEAP_TYPE_ALLOC(dcontext, thread_sig_info_t, ACCT_OTHER, PROTECTED);
    size_t pend_unit_size = sizeof(sigpending_t) +
        /* include alignment for xsave on xstate */
        signal_frame_extra_size(true)
        /* sigpending_t has xstate inside it already */
        IF_LINUX(IF_X86(-sizeof(kernel_xstate_t)));
    IF_X86(ASSERT(YMM_ENABLED() || !ZMM_ENABLED()));
    /* pend_unit_size may not be aligned, even for AVX. We request alignment from the
     * allocator for all pending units (xref i#3749, i#3380).
     */

    /* all fields want to be initialized to 0 */
    memset(info, 0, sizeof(thread_sig_info_t));
    dcontext->signal_field = (void *)info;

    /* our special heap to avoid reentrancy problems
     * composed entirely of sigpending_t units
     * Note that it's fine to have the special heap do page-at-a-time
     * committing, which does not use locks (unless triggers reset!),
     * but if we need a new unit that will grab a lock: we try to
     * avoid that by limiting the # of pending alarm signals (PR 596768).
     */
    info->sigheap = special_heap_init_aligned(
        pend_unit_size, IF_X86_ELSE(AVX_ALIGNMENT, 0),
        false /* cannot have any locking */, false /* -x */, true /* persistent */,
        pend_unit_size * DYNAMO_OPTION(max_pending_signals));

#ifdef HAVE_SIGALTSTACK
    /* set up alternate stack
     * i#552 we may terminate the process without freeing the stack, so we
     * stack_alloc it to exempt from the memory leak check.
     */
    /* The SIGSTACK_SIZE subtraction below implies that stack_alloc() hands back
     * the high end of the region, while ss_sp must be its lowest address.
     */
    info->sigstack.ss_sp = (char *)stack_alloc(SIGSTACK_SIZE, NULL) - SIGSTACK_SIZE;
    info->sigstack.ss_size = SIGSTACK_SIZE;
    /* kernel will set xsp to sp+size to grow down from there, we don't have to */
    info->sigstack.ss_flags = 0;

    /* i#2016: for late takeover, this app thread may already be on its own alt
     * stack.  Not setting SA_ONSTACK for SUSPEND_SIGNAL is not sufficient to avoid
     * this, as our SUSPEND_SIGNAL can interrupt the app inside its own signal
     * handler.  Thus, we simply swap to another stack temporarily to avoid the
     * kernel complaining.  The dstack is set up but it has the clone record and
     * initial mcxt, so we use the new alt stack.
     */
    call_switch_stack((void *)info, (byte *)info->sigstack.ss_sp + info->sigstack.ss_size,
                      set_our_alt_stack, NULL, true /*return*/);
    LOG(THREAD, LOG_ASYNCH, 1, "signal stack is " PFX " - " PFX "\n",
        info->sigstack.ss_sp, info->sigstack.ss_sp + info->sigstack.ss_size);
    /* app_sigstack dealt with below, based on parentage */
#endif

    ASSIGN_INIT_LOCK_FREE(info->sigblocked_lock, sigmask_lock);
    /* No sigblocked_lock since this is init time. */
    kernel_sigemptyset(&info->app_sigblocked);

    ASSIGN_INIT_LOCK_FREE(info->child_lock, child_lock);

    /* signal_thread_inherit() finishes per-thread init and is invoked
     * by os_thread_init_finalize(): we need it after synch_thread_init() and
     * other post-os_thread_init() setup b/c we can't yet record pending signals,
     * but we need it before we give up thread_initexit_lock so we can handle
     * our own suspend signals (i#2779).
     */
}

/* Returns the fully_initialized flag of the thread_sig_info_t stored in
 * dcontext->signal_field.  The field is dereferenced without a NULL check.
 */
bool
is_thread_signal_info_initialized(dcontext_t *dcontext)
{
    return ((thread_sig_info_t *)dcontext->signal_field)->fully_initialized;
}

/* i#27: create custom data to pass to the child of a clone
 * since we can't rely on being able to find the caller, or that
 * its syscall data is still valid, once in the child.
 *
 * i#149/ PR 403015: The clone record is passed to the new thread via the dstack
 * created for it.  Unlike before, where the child thread would create its own
 * dstack, now the parent thread creates the dstack.  Also, switches app stack
 * to dstack.
 *
 * XXX i#1403: for Mac we want to eventually do lower-level earlier interception
 * of threads, but for now we're later and higher-level, intercepting the user
 * thread function on the new thread's stack.  We ignore app_thread_xsp.
 *
 * On Linux, dr_clone_args and app_clone_args are set only for SYS_clone3; also
 * app_thread_xsp is NULL for SYS_clone3.
 *
 * Returns the new clone_record_t as a void *.  Side effects on the arguments:
 * for SYS_clone3, dr_clone_args->stack and ->stack_size are rewritten to
 * describe the new dstack below the record; otherwise *app_thread_xsp is
 * rewritten to point just below the record (on MacOS only when app_thread_xsp
 * is non-NULL).
 */
void *
#ifdef MACOS
create_clone_record(dcontext_t *dcontext, reg_t *app_thread_xsp, app_pc thread_func,
                    void *thread_arg)
#elif defined(LINUX)
create_clone_record(dcontext_t *dcontext, reg_t *app_thread_xsp,
                    /* Do not store dr_clone_args; it is freed in the parent thread
                     * in the post-syscall handling of clone3.
                     */
                    clone3_syscall_args_t *dr_clone_args,
                    clone3_syscall_args_t *app_clone_args)
#else
create_clone_record(dcontext_t *dcontext, reg_t *app_thread_xsp)
#endif
{
    clone_record_t *record;
    byte *dstack = stack_alloc(DYNAMORIO_STACK_SIZE, NULL);
    LOG(THREAD, LOG_ASYNCH, 1, "create_clone_record: dstack for new thread is " PFX "\n",
        dstack);

#ifdef MACOS
    if (app_thread_xsp == NULL) {
        /* With no app stack pointer to redirect, the record lives on the global
         * heap instead of on the dstack.
         */
        record = HEAP_TYPE_ALLOC(GLOBAL_DCONTEXT, clone_record_t, ACCT_THREAD_MGT,
                                 true /*prot*/);
        record->app_thread_xsp = 0;
        record->continuation_pc = thread_func;
        record->thread_arg = thread_arg;
        record->clone_flags = CLONE_THREAD | CLONE_VM | CLONE_SIGHAND | SIGCHLD;
    } else {
#endif
        /* Note, the stack grows to low memory addr, so dstack points to the high
         * end of the allocated stack region.  So, we must subtract to get space for
         * the clone record.
         */
        record = (clone_record_t *)(dstack - sizeof(clone_record_t));
        ASSERT(ALIGNED(record, get_ABI_stack_alignment()));
#ifdef LINUX
        /* Exactly one of the two calling conventions must be in use: clone3
         * with both arg structs and no xsp pointer, or a non-clone3 syscall with
         * an xsp pointer and neither arg struct.
         */
        ASSERT((dcontext->sys_num != SYS_clone3 && app_thread_xsp != NULL &&
                dr_clone_args == NULL && app_clone_args == NULL) ||
               (dcontext->sys_num == SYS_clone3 && dr_clone_args != NULL &&
                app_clone_args != NULL && app_thread_xsp == NULL));
        if (dr_clone_args != NULL) {
            /* SYS_clone3 has the lowest address of the stack in
             * cl_args->stack. But we expect the highest (non-inclusive)
             * in the clone record's app_thread_xsp.
             */
            record->app_thread_xsp = dr_clone_args->stack + dr_clone_args->stack_size;
            record->clone_flags = dr_clone_args->flags;
            record->app_clone_args = app_clone_args;
        } else {
#endif
            record->app_thread_xsp = *app_thread_xsp;
            record->clone_flags = dcontext->sys_param0;
            IF_LINUX(record->app_clone_args = NULL);
#ifdef LINUX
        }
#endif
        /* asynch_target is set in d_r_dispatch() prior to calling pre_system_call(). */
        record->continuation_pc = dcontext->asynch_target;
#ifdef MACOS
    }
#endif
    LOG(THREAD, LOG_ASYNCH, 1, "allocated clone record: " PFX "\n", record);

    record->dstack = dstack;
    record->caller_id = dcontext->owning_thread;
    record->clone_sysnum = dcontext->sys_num;
    /* Snapshot the parent's signal info by value; parent_info below keeps a
     * pointer to the live original.
     */
    record->info = *((thread_sig_info_t *)dcontext->signal_field);
    /* Sigstack is not inherited so clear it now to avoid having to figure out
     * where it got its value in signal_thread_inherit (i#3116).
     */
    memset(&record->info.app_sigstack, 0, sizeof(record->info.app_sigstack));
    record->info.app_sigstack.ss_flags = SS_DISABLE;
    record->parent_info = (thread_sig_info_t *)dcontext->signal_field;
    record->pcprofile_info = dcontext->pcprofile_field;
#ifdef AARCHXX
    record->app_stolen_value = get_stolen_reg_val(get_mcontext(dcontext));
#    ifndef AARCH64
    record->isa_mode = dr_get_isa_mode(dcontext);
#    endif
    /* If the child thread shares the same TLS with parent by not setting
     * CLONE_SETTLS or vfork, we put the TLS base here and clear the
     * thread register in new_thread_setup, so that DR can distinguish
     * this case from normal pthread thread creation.
     */
    record->app_lib_tls_base = (!TEST(CLONE_SETTLS, record->clone_flags))
        ? os_get_app_tls_base(dcontext, TLS_REG_LIB)
        : NULL;
#endif
    LOG(THREAD, LOG_ASYNCH, 1, "create_clone_record: thread " TIDFMT ", pc " PFX "\n",
        record->caller_id, record->continuation_pc);

#ifdef LINUX
    if (dr_clone_args != NULL) {
        ASSERT(ALIGNED(XSTATE_ALIGNMENT, REGPARM_END_ALIGN));
        /* SYS_clone3 expects the lowest address of the stack in dr_clone_args->stack. */
        dr_clone_args->stack = (ptr_uint_t)(dstack - DYNAMORIO_STACK_SIZE);
        /* The usable size ends at the XSTATE-aligned address below the record. */
        dr_clone_args->stack_size =
            (uint64)ALIGN_BACKWARD(record, XSTATE_ALIGNMENT) - dr_clone_args->stack;
    } else {
#endif
        if (IF_MACOS_ELSE(app_thread_xsp != NULL, true)) {
            /* Set the thread stack to point to the dstack, below the clone record.
             * Note: it's glibc who sets up the arg to the thread start function;
             * the kernel just does a fork + stack swap, so we can get away w/ our
             * own stack swap if we restore before the glibc asm code takes over.
             * We restore this parameter to the app value in
             * restore_clone_param_from_clone_record().
             */
            /* i#754: Set stack to be XSTATE aligned for saving YMM registers */
            ASSERT(ALIGNED(XSTATE_ALIGNMENT, REGPARM_END_ALIGN));
            *app_thread_xsp = ALIGN_BACKWARD(record, XSTATE_ALIGNMENT);
        }
#ifdef LINUX
    }
#endif
    return (void *)record;
}

/* Overwrites the app-context fields of an existing clone record with the
 * supplied values.  This is to support dr_create_client_thread().
 *
 * record must be a non-NULL clone_record_t pointer; all other fields of the
 * record are left untouched.
 */
void
set_clone_record_fields(void *record, reg_t app_thread_xsp, app_pc continuation_pc,
                        uint clone_sysnum, uint clone_flags)
{
    clone_record_t *rec = (clone_record_t *)record;
    ASSERT(rec != NULL);
    /* Which syscall created the thread, and with what flags. */
    rec->clone_sysnum = clone_sysnum;
    rec->clone_flags = clone_flags;
    /* Where the new thread resumes, and on which app stack. */
    rec->continuation_pc = continuation_pc;
    rec->app_thread_xsp = app_thread_xsp;
}

/* i#149/PR 403015: The clone record is handed to the new thread by storing it
 * at the bottom (i.e., the highest addresses) of the dstack, and the dstack is
 * then set as the app stack.  The new thread therefore locates the record
 * from the base of the dstack, which this routine derives from xsp.
 *
 * CAUTION: keep stack usage in this routine small.  It is invoked on the
 *          dstack from new_thread_setup and assumes that no more than one
 *          page (X86) or two pages (AArch64) of dstack have been consumed
 *          since the clone system call was done.
 */
void *
get_clone_record(reg_t xsp)
{
    /* xsp should be in a dstack, i.e., dynamorio heap.  */
    ASSERT(is_dynamo_address((app_pc)xsp));

    /* The sum of
     *   - the size of the clone record,
     *   - the stack used by new_thread_start (only for setting up
     *     priv_mcontext_t), and
     *   - the stack used by new_thread_setup before calling get_clone_record()
     * is less than a page for X86 and 2 pages for AArch64, which the assert
     * below verifies.  Exceeding that limit would not happen at random at
     * runtime but predictably during development, where the assert catches it.
     *
     * The current usage is about 800 bytes (X86) or 1920 bytes (AArch64) for
     * clone_record + sizeof(priv_mcontext_t) + few words in new_thread_setup
     * before get_clone_record() is called.
     */
    byte *dstack_base = (byte *)ALIGN_FORWARD(xsp, PAGE_SIZE);
#ifdef AARCH64
    /* AArch64 is allowed one extra page of usage. */
    dstack_base += PAGE_SIZE;
#endif
    /* The record occupies the last sizeof(clone_record_t) bytes below the base. */
    clone_record_t *record = (clone_record_t *)dstack_base - 1;

    /* dstack_base and the dstack in the clone record should be the same. */
    ASSERT(dstack_base == record->dstack);
#ifdef MACOS
    ASSERT(record->app_thread_xsp != 0); /* else it's not in dstack */
#endif
    return (void *)record;
}

/* i#149/PR 403015: App xsp is passed to the new thread via the clone record. */
reg_t
get_clone_record_app_xsp(void *record)
{
    ASSERT(record != NULL);
    return ((clone_record_t *)record)->app_thread_xsp;
}

#ifdef MACOS
/* Returns the thread_arg field stored in the clone record (MacOS only).
 * record must be non-NULL.
 */
void *
get_clone_record_thread_arg(void *record)
{
    ASSERT(record != NULL);
    const clone_record_t *crec = (const clone_record_t *)record;
    return crec->thread_arg;
}
#endif

byte *
get_clone_record_dstack(void *record)
{
    ASSERT(record != NULL);
    return ((clone_record_t *)record)->dstack;
}

#ifdef AARCHXX
reg_t
get_clone_record_stolen_value(void *record)
{
    ASSERT(record != NULL);
    return ((clone_record_t *)record)->app_stolen_value;
}

#    ifndef AARCH64
uint /* dr_isa_mode_t but we have a header ordering problem */
get_clone_record_isa_mode(void *record)
{
    ASSERT(record != NULL);
    return ((clone_record_t *)record)->isa_mode;
}
#    endif

void
set_thread_register_from_clone_record(void *record)
{
    /* If record->app_lib_tls_base is not NULL, it means the parent
     * thread did not setup TLS for the child, and we need clear the
     * thread register.
     */
    if (((clone_record_t *)record)->app_lib_tls_base != NULL)
        write_thread_register(NULL);
}

/* Installs the TLS base recorded in the clone record as dcontext's app lib TLS
 * base, if one was recorded.
 */
void
set_app_lib_tls_base_from_clone_record(dcontext_t *dcontext, void *record)
{
    clone_record_t *crec = (clone_record_t *)record;
    if (crec->app_lib_tls_base == NULL)
        return;
    /* child and parent share the same TLS */
    os_set_app_tls_base(dcontext, TLS_REG_LIB, crec->app_lib_tls_base);
}
#endif

/* Puts the app's original stack-related clone parameter back into the syscall
 * parameter slot after the syscall.  This only acts on Linux, only for
 * CLONE_VM clones, and only for SYS_clone (all architectures) and SYS_clone3
 * (X86 only); in every other case it does nothing.  record must be non-NULL.
 */
void
restore_clone_param_from_clone_record(dcontext_t *dcontext, void *record)
{
#ifdef LINUX
    ASSERT(record != NULL);
    clone_record_t *crec = (clone_record_t *)record;
    if (!TEST(CLONE_VM, crec->clone_flags))
        return;
    /* Restore the original stack parameter to the syscall, which we clobbered
     * in create_clone_record().  Some apps examine it post-syscall (i#3171).
     */
    if (crec->clone_sysnum == SYS_clone3) {
#    ifdef X86
        set_syscall_param(dcontext, SYSCALL_PARAM_CLONE3_CLONE_ARGS,
                          (reg_t)crec->app_clone_args);
#    else
        /* On AArchXX r0 is used to pass the first arg to the syscall as well as
         * to hold its return value. As the clone_args pointer isn't available
         * post-syscall natively anyway, there's no need to restore here.
         */
#    endif
        return;
    }
    if (crec->clone_sysnum == SYS_clone) {
        set_syscall_param(dcontext, SYSCALL_PARAM_CLONE_STACK,
                          get_mcontext(dcontext)->xsp);
    }
#endif
}

/* Allocates and initializes info's sighand structure: the storage is zeroed,
 * the restorer_valid cache is filled with -1, the sighand lock is initialized,
 * and threads_unmasked is set to 1 for every signal for which
 * EMULATE_SIGMASK() holds.
 *
 * NOTE(review): the allocation is SIGARRAY_SIZE * sizeof(*info->sighand)
 * bytes, and signal_info_exit_sigaction() frees the same size; whether more
 * than one sighand_info_t worth of storage is actually needed should be
 * confirmed.
 */
static void
signal_info_init_sigaction(dcontext_t *dcontext, thread_sig_info_t *info)
{
    const size_t sighand_bytes = SIGARRAY_SIZE * sizeof(*info->sighand);

    info->sighand = (sighand_info_t *)handler_alloc(dcontext, sighand_bytes);
    memset(info->sighand, 0, sighand_bytes);
    memset(&info->restorer_valid, -1, SIGARRAY_SIZE * sizeof(info->restorer_valid[0]));
    ASSIGN_INIT_LOCK_FREE(info->sighand->lock, sighand_lock);
    for (int sig = 1; sig <= MAX_SIGNUM; sig++) {
        if (EMULATE_SIGMASK(info, sig))
            info->sighand->threads_unmasked[sig] = 1;
    }
}

/* Tears down info's sighand structure: for each signal, optionally
 * re-installs a kernel-level action (SIG_IGN, the recorded app action, or
 * SIG_DFL), frees the recorded app action, and finally deletes the lock and
 * frees the structure itself.
 *
 * When other_thread is true no handler is restored via sigaction_syscall();
 * only the alarm-on-detach SIG_IGN case below still applies.
 */
static void
signal_info_exit_sigaction(dcontext_t *dcontext, thread_sig_info_t *info,
                           bool other_thread)
{
    kernel_sigaction_t dfl_act;
    memset(&dfl_act, 0, sizeof(dfl_act));
    dfl_act.handler = (handler_t)SIG_DFL;
    kernel_sigemptyset(&dfl_act.mask); /* does mask matter for SIG_DFL? */
    DOLOG(4, LOG_ASYNCH, dump_unmasked(dcontext, __FUNCTION__););
    for (int i = 1; i <= MAX_SIGNUM; i++) {
        /* The app's recorded action for this signal, if any. */
        kernel_sigaction_t *app_act = info->sighand->action[i];
        /* No synch here b/c we're exiting. */
        ASSERT(info->sighand->threads_unmasked[i] == 0);
        if (sig_is_alarm_signal(i) && doing_detach && !standalone_library &&
            alarm_signal_has_DR_only_itimer(dcontext, i)) {
            /* We ignore alarms *during* detach in signal_remove_alarm_handlers(),
             * but to avoid crashing on an alarm arriving post-detach we set to
             * SIG_IGN if we have an itimer and the app does not (a slight
             * transparency violation to gain robustness: i#2270).
             */
            set_ignore_signal_action(i);
        } else if (!other_thread) {
            if (app_act != NULL) {
                /* Restore to old handler, but not if exiting whole process:
                 * else may get itimer during cleanup, so we set to SIG_IGN.  We
                 * do this during detach in signal_remove_alarm_handlers() (and
                 * post-detach above).
                 */
                if (dynamo_exited && !doing_detach)
                    app_act->handler = (handler_t)SIG_IGN;
                LOG(THREAD, LOG_ASYNCH, 2, "\trestoring " PFX " as handler for %d\n",
                    app_act->handler, i);
                sigaction_syscall(i, app_act, NULL);
            } else if (info->sighand->we_intercept[i]) {
                /* restore to default */
                LOG(THREAD, LOG_ASYNCH, 2, "\trestoring SIG_DFL as handler for %d\n", i);
                sigaction_syscall(i, &dfl_act, NULL);
            }
        }
        if (app_act != NULL)
            handler_free(dcontext, app_act, sizeof(kernel_sigaction_t));
    }
    DELETE_LOCK(info->sighand->lock);
    handler_free(dcontext, info->sighand, SIGARRAY_SIZE * sizeof(*info->sighand));
    info->sighand = NULL;
}

/* Called to finalize per-thread initialization.
 * Inherited and shared fields are set up here.
 * The clone_record contains the continuation pc, which is stored in dcontext->next_tag.
 *
 * clone_record is either a clone_record_t pointer describing the creating
 * syscall and carrying a copy of the parent's signal info, or NULL.  With a
 * record, signal handlers are shared with the parent (CLONE_SIGHAND) or copied
 * from it, and itimers are shared or initialized privately.  With NULL, the
 * signal state is initialized in isolation, which is expected to happen only
 * for the first thread.
 *
 * On return info->fully_initialized is set.
 */
void
signal_thread_inherit(dcontext_t *dcontext, void *clone_record)
{
    clone_record_t *record = (clone_record_t *)clone_record;
    thread_sig_info_t *info = (thread_sig_info_t *)dcontext->signal_field;
    int i;
    if (record != NULL) {
        LOG(THREAD, LOG_ASYNCH, 1, "continuation pc is " PFX "\n",
            record->continuation_pc);
        dcontext->next_tag = record->continuation_pc;
        LOG(THREAD, LOG_ASYNCH, 1,
            "parent tid is " TIDFMT ", parent sysnum is %d(%s), clone flags=" PIFX "\n",
            record->caller_id, record->clone_sysnum,
#ifdef SYS_vfork
            (record->clone_sysnum == SYS_vfork)
                ? "vfork"
                :
#endif
                (IF_LINUX(record->clone_sysnum == SYS_clone        ? "clone"
                              : record->clone_sysnum == SYS_clone3 ? "clone3"
                                                                   :)
                     IF_MACOS(record->clone_sysnum == SYS_bsdthread_create
                                  ? "bsdthread_create"
                                  :) "unexpected"),
            record->clone_flags);
#ifdef SYS_vfork
        if (record->clone_sysnum == SYS_vfork) {
            /* The above clone_flags argument is bogus.
               SYS_vfork doesn't have a free register to keep the hardcoded value
               see /usr/src/linux/arch/i386/kernel/process.c */
            /* CHECK: is this the only place real clone flags are needed? */
            record->clone_flags = CLONE_VFORK | CLONE_VM | SIGCHLD;
        }
#endif

        /* Handlers are either inherited or shared. */
        if (TEST(CLONE_SIGHAND, record->clone_flags)) {
            /* Need to share table of handlers! */
            LOG(THREAD, LOG_ASYNCH, 2, "sharing signal handlers with parent\n");
            info->sighand = record->info.sighand;
            info->sighand->is_shared = true;
            d_r_mutex_lock(&info->sighand->lock);
            info->sighand->refcount++;
            /* Count this thread as unmasked for each emulated signal that it
             * does not currently block.
             */
            for (i = 1; i <= MAX_SIGNUM; i++) {
                if (!EMULATE_SIGMASK(info, i))
                    continue;
                if (!kernel_sigismember(&info->app_sigblocked, i))
                    ATOMIC_INC(int, info->sighand->threads_unmasked[i]);
            }
#ifdef DEBUG
            for (i = 1; i <= MAX_SIGNUM; i++) {
                if (info->sighand->action[i] != NULL) {
                    LOG(THREAD, LOG_ASYNCH, 2, "\thandler for signal %d is " PFX "\n", i,
                        info->sighand->action[i]->handler);
                }
            }
#endif
            d_r_mutex_unlock(&info->sighand->lock);
        } else {
            /* Copy handlers. */
            LOG(THREAD, LOG_ASYNCH, 2, "inheriting signal handlers from parent\n");
            if (!atomic_read_bool(&multiple_handlers_present))
                ATOMIC_1BYTE_WRITE(&multiple_handlers_present, true, false);
            signal_info_init_sigaction(dcontext, info);
            for (i = 1; i <= MAX_SIGNUM; i++) {
                if (record->info.sighand->action[i] != NULL) {
                    info->sighand->action[i] = (kernel_sigaction_t *)handler_alloc(
                        dcontext, sizeof(kernel_sigaction_t));
                    memcpy(info->sighand->action[i], record->info.sighand->action[i],
                           sizeof(kernel_sigaction_t));
                    LOG(THREAD, LOG_ASYNCH, 2, "\thandler for signal %d is " PFX "\n", i,
                        info->sighand->action[i]->handler);
                }
                /* No sighand synch needed since only this thread owns these. */
                if (EMULATE_SIGMASK(info, i)) {
                    if (!kernel_sigismember(&info->app_sigblocked, i))
                        info->sighand->threads_unmasked[i] = 1;
                    else
                        info->sighand->threads_unmasked[i] = 0;
                }
            }
            memcpy(info->sighand->we_intercept, record->info.sighand->we_intercept,
                   SIGARRAY_SIZE * sizeof(bool));
            /* The copy is complete: drop the unstarted-child count both in the
             * record's snapshot of the parent info and in the parent's live
             * info.  signal_thread_exit() spins while num_unstarted_children
             * is positive before it destroys the parent's state.
             */
            d_r_mutex_lock(&record->info.child_lock);
            record->info.num_unstarted_children--;
            d_r_mutex_unlock(&record->info.child_lock);
            /* this should be safe since parent should wait for us */
            d_r_mutex_lock(&record->parent_info->child_lock);
            record->parent_info->num_unstarted_children--;
            d_r_mutex_unlock(&record->parent_info->child_lock);
        }

        /* itimers are either private or shared */
        if (TEST(CLONE_THREAD, record->clone_flags) && os_itimers_thread_shared()) {
            ASSERT(record->info.shared_itimer);
            LOG(THREAD, LOG_ASYNCH, 2, "sharing itimers with parent\n");
            info->shared_itimer = true;
            info->shared_itimer_refcount = record->info.shared_itimer_refcount;
            info->shared_itimer_underDR = record->info.shared_itimer_underDR;
            info->itimer = record->info.itimer;
            atomic_add_exchange_int((volatile int *)info->shared_itimer_refcount, 1);
            /* shared_itimer_underDR will be incremented in start_itimer() */
        } else {
            info->shared_itimer = false;
            init_itimer(dcontext, false /*!first thread*/);
        }

        /* rest of state is never shared.
         * app_sigstack should already be in place, when we set up our sigstack
         * we asked for old sigstack.
         * FIXME: are current pending or blocked inherited?
         */
#ifdef MACOS
        /* On MacOS the record is heap-allocated when app_thread_xsp is non-zero,
         * and this is where it is released.
         */
        if (record->app_thread_xsp != 0) {
            HEAP_TYPE_FREE(GLOBAL_DCONTEXT, record, clone_record_t, ACCT_THREAD_MGT,
                           true /*prot*/);
        }
#endif
    } else {
        /* Initialize in isolation */

        if (APP_HAS_SIGSTACK(info)) {
            /* parent was NOT under our control, so the real sigstack we see is
             * a real sigstack that was present before we took control
             */
            LOG(THREAD, LOG_ASYNCH, 1, "app already has signal stack " PFX " - " PFX "\n",
                info->app_sigstack.ss_sp,
                info->app_sigstack.ss_sp + info->app_sigstack.ss_size);
        }

        signal_info_init_sigaction(dcontext, info);

        info->shared_itimer = false; /* we'll set to true if a child is created */
        init_itimer(dcontext, true /*first*/);

        /* We split init vs start for the signal handlers and mask.  We do not
         * install ours until we start running the app, to avoid races like
         * i#2335.  We'll set them up when os_process_under_dynamorio_*() invokes
         * signal_reinstate_handlers().  All we do now is mark which signals we
         * want to intercept.
         */
        if (DYNAMO_OPTION(intercept_all_signals)) {
            /* PR 304708: to support client signal handlers without
             * the complexity of per-thread and per-signal callbacks
             * we always intercept all signals.  We also check here
             * for handlers the app registered before our init.
             */
            for (i = 1; i <= MAX_SIGNUM; i++) {
                /* cannot intercept KILL or STOP */
                if (signal_is_interceptable(i) &&
                    /* FIXME PR 297033: we don't support intercepting DEFAULT_STOP /
                     * DEFAULT_CONTINUE signals.  Once add support, update
                     * dr_register_signal_event() comments.
                     */
                    default_action[i] != DEFAULT_STOP &&
                    default_action[i] != DEFAULT_CONTINUE)
                    info->sighand->we_intercept[i] = true;
            }
        } else {
            /* We intercept the following signals ourselves.  If this is the first
             * thread, currently doing DR init, the app handlers for these are in
             * init_info.  We do not copy those handlers into info at this time as we're
             * not equipped yet to deliver signals.  We wait for DR to start running the
             * app (init can be separated from start for dr_app_* interfaces), when
             * signal_reinstate_handlers() will record the app handlers.  Signals
             * received in the meantime will go through execute_native_handler().
             */
            info->sighand->we_intercept[SIGSEGV] = true;
            /* PR 313665: look for DR crashes on unaligned memory or mmap bounds */
            info->sighand->we_intercept[SIGBUS] = true;
            /* PR 212090: the signal we use to suspend threads */
            info->sighand->we_intercept[SUSPEND_SIGNAL] = true;
#ifdef PAPI
            /* use SIGPROF for updating gui so it can be distinguished from SIGVTALRM */
            info->sighand->we_intercept[SIGPROF] = true;
#endif
            /* vtalarm only used with pc profiling.  it interferes w/ PAPI
             * so arm this signal only if necessary
             */
            if (INTERNAL_OPTION(profile_pcs)) {
                info->sighand->we_intercept[SIGVTALRM] = true;
            }
            info->sighand->we_intercept[SIGALRM] = true;
#ifdef SIDELINE
            info->sighand->we_intercept[SIGCHLD] = true;
#endif
            /* i#61/PR 211530: the signal we use for nudges */
            info->sighand->we_intercept[NUDGESIG_SIGNUM] = true;
        }

        /* should be 1st thread */
        if (d_r_get_num_threads() > 1)
            ASSERT_NOT_REACHED();
    }

    /* only when SIGVTALRM handler is in place should we start itimer (PR 537743) */
    if (INTERNAL_OPTION(profile_pcs)) {
        /* even if the parent thread exits, we can use a pointer to its
         * pcprofile_info b/c when shared it's process-shared and is not freed
         * until the entire process exits
         */
        pcprofile_thread_init(dcontext, info->shared_itimer,
                              (record == NULL) ? NULL : record->pcprofile_info);
    }

    info->pre_syscall_app_sigprocmask_valid = false;

    /* Assumed to be async safe. */
    info->fully_initialized = true;
    DOLOG(4, LOG_ASYNCH, dump_unmasked(dcontext, __FUNCTION__););
}

/* Used when taking over existing app threads: we assume they're using
 * pthreads and expect to share signal handlers, memory, thread group id, etc.
 * Builds a stand-in clone record from takeover_dc's signal info, invokes
 * dynamo_thread_init() with it as the os_data, and returns the calling
 * thread's new private dcontext.
 */
dcontext_t *
init_thread_with_shared_siginfo(priv_mcontext_t *mc, dcontext_t *takeover_dc)
{
    thread_sig_info_t *parent_siginfo = (thread_sig_info_t *)takeover_dc->signal_field;
    clone_record_t crec;
    memset(&crec, 0, sizeof(crec));
    /* Create a fake clone record with the given siginfo.  All threads in the
     * same thread group must share signal handlers since Linux 2.5.35, but we
     * have to guess at the other flags.
     * FIXME i#764: If we take over non-pthreads threads, we'll need some way to
     * tell if they're sharing signal handlers or not.
     */
    crec.info = *parent_siginfo;
    crec.parent_info = parent_siginfo;
    crec.pcprofile_info = takeover_dc->pcprofile_field;
    crec.caller_id = takeover_dc->owning_thread;
    crec.clone_flags = PTHREAD_CLONE_FLAGS;
#ifdef LINUX
    crec.clone_sysnum = SYS_clone;
#else
    ASSERT_NOT_IMPLEMENTED(false); /* FIXME i#58: NYI on Mac */
#endif
    IF_DEBUG(int r =)
    dynamo_thread_init(NULL, mc, &crec, false);
    ASSERT(r == SUCCESS);
    return get_thread_private_dcontext();
}

/* Unlinks and frees the entry at the head of info's pending list for sig and
 * decrements info->num_pending.  The list for sig must be non-empty.
 */
static void
free_pending_signal(thread_sig_info_t *info, int sig)
{
    sigpending_t *head = info->sigpending[sig];
    sigpending_t *rest = head->next;
    info->sigpending[sig] = rest;
    special_heap_free(info->sigheap, head);
    info->num_pending--;
}

/* Resets the per-thread signal state in the child of a fork so that nothing
 * is shared with the parent process: the sighand structure is marked
 * unshared, shared itimer state is discarded and re-initialized, the
 * unstarted-children count is cleared, and all pending signals are dropped.
 *
 * This is split from os_fork_init() so the new logfiles are available
 * (xref i#189/PR 452168).  It had to be after dynamo_other_thread_exit()
 * called in dynamorio_fork_init() after os_fork_init() else we clean
 * up data structs used in signal_thread_exit().
 */
void
signal_fork_init(dcontext_t *dcontext)
{
    thread_sig_info_t *info = (thread_sig_info_t *)dcontext->signal_field;
    int i;
    /* Child of fork is a single thread in a new process so should
     * start over w/ no sharing (xref i#190/PR 452178)
     */
    if (info->sighand->is_shared) {
        info->sighand->is_shared = false;
        info->sighand->refcount = 0;
        for (i = 1; i <= MAX_SIGNUM; i++) {
            if (!EMULATE_SIGMASK(info, i))
                continue;
            /* No sighand synch needed since only this thread owns these. */
            if (!kernel_sigismember(&info->app_sigblocked, i))
                info->sighand->threads_unmasked[i] = 1;
            else
                info->sighand->threads_unmasked[i] = 0;
        }
    }
    if (info->shared_itimer) {
        /* itimers are not inherited across fork */
        info->shared_itimer = false;
        for (i = 0; i < NUM_ITIMERS; i++)
            DELETE_RECURSIVE_LOCK((*info->itimer)[i].lock);
        if (os_itimers_thread_shared())
            global_heap_free(info->itimer, sizeof(*info->itimer) HEAPACCT(ACCT_OTHER));
        else
            heap_free(dcontext, info->itimer, sizeof(*info->itimer) HEAPACCT(ACCT_OTHER));
        info->itimer = NULL; /* reset by init_itimer */
        ASSERT(info->shared_itimer_refcount != NULL);
        global_heap_free(info->shared_itimer_refcount, sizeof(int) HEAPACCT(ACCT_OTHER));
        info->shared_itimer_refcount = NULL;
        ASSERT(info->shared_itimer_underDR != NULL);
        global_heap_free(info->shared_itimer_underDR, sizeof(int) HEAPACCT(ACCT_OTHER));
        info->shared_itimer_underDR = NULL;
        init_itimer(dcontext, true /*first*/);
    }
    info->num_unstarted_children = 0;
    /* "A child created via fork(2) initially has an empty pending signal set" */
    dcontext->signals_pending = 0;
    for (i = 1; i <= MAX_SIGNUM; i++) {
        while (info->sigpending[i] != NULL) {
            free_pending_signal(info, i);
        }
    }
    /* free_pending_signal() decrements the count per entry; force it to zero
     * once every queue has been drained.
     */
    info->num_pending = 0;
    if (INTERNAL_OPTION(profile_pcs)) {
        pcprofile_fork_init(dcontext);
    }

    info->pre_syscall_app_sigprocmask_valid = false;

    /* Assumed to be async safe. */
    info->fully_initialized = true;
}

#ifdef DEBUG
/* Debug-only check: queries the current SIGSEGV action and returns true iff
 * the query succeeded and the installed handler is main_signal_handler.
 */
static bool
sigsegv_handler_is_ours(void)
{
    kernel_sigaction_t installed;
    if (sigaction_syscall(SIGSEGV, NULL, &installed) != 0)
        return false;
    return installed.handler == (handler_t)main_signal_handler;
}
#endif /* DEBUG */

#if defined(X86) && defined(LINUX)
/* Returns this thread's xstate buffer, allocating it on first use, after
 * zeroing it and filling in the sw_reserved header fields.  The buffer is
 * aligned to XSTATE_ALIGNMENT within a larger allocation that is remembered
 * in info->xstate_alloc.
 */
static byte *
get_and_initialize_xstate_buffer(dcontext_t *dcontext)
{
    /* See thread_sig_info_t.xstate_buf comments for why this is in TLS. */
    thread_sig_info_t *info = (thread_sig_info_t *)dcontext->signal_field;
    if (info->xstate_buf == NULL) {
        info->xstate_alloc =
            heap_alloc(dcontext, signal_frame_extra_size(true) HEAPACCT(ACCT_OTHER));
        info->xstate_buf = (byte *)ALIGN_FORWARD(info->xstate_alloc, XSTATE_ALIGNMENT);
        ASSERT(info->xstate_alloc + signal_frame_extra_size(true) >=
               info->xstate_buf + signal_frame_extra_size(false));
    }
    kernel_fpstate_t *fp = (kernel_fpstate_t *)info->xstate_buf;
    /* If we pass uninitialized values for kernel_xsave_hdr_t.reserved1 through
     * sigreturn, we'll get a SIGSEGV.  Best to zero it all out.
     */
    memset(fp, 0, signal_frame_extra_size(false));
    fp->sw_reserved.extended_size = signal_frame_extra_size(false);
    if (!YMM_ENABLED()) {
        /* No extended state: the header describes just a kernel_fpstate_t. */
        fp->sw_reserved.magic1 = 0;
        fp->sw_reserved.xstate_size = sizeof(kernel_fpstate_t);
        fp->sw_reserved.xstate_bv = 0;
        return info->xstate_buf;
    }
    /* ZMM_ENABLED() always implies YMM_ENABLED() too. */
    fp->sw_reserved.magic1 = FP_XSTATE_MAGIC1;
    fp->sw_reserved.xstate_size = signal_frame_extra_size(false) -
        FP_XSTATE_MAGIC2_SIZE IF_X86_32(-FSAVE_FPSTATE_PREFIX_SIZE);
    uint hi, lo;
    dr_xgetbv(&hi, &lo);
    fp->sw_reserved.xstate_bv = (((uint64)hi) << 32) | lo;
    /* The second magic value occupies the final FP_XSTATE_MAGIC2_SIZE bytes of
     * the extended area.
     */
    byte *magic2_at =
        (byte *)fp + fp->sw_reserved.extended_size - FP_XSTATE_MAGIC2_SIZE;
    *(int *)magic2_at = FP_XSTATE_MAGIC2;
    return info->xstate_buf;
}
#endif

/* Tears down the per-thread signal state held in dcontext->signal_field.
 *
 * In order, this: waits for unstarted children to finish copying our state,
 * frees the xstate buffer (X86 Linux), drops our reference on the sighand
 * structure and our contribution to threads_unmasked, cleans up the sighand
 * structure when it is unshared or its refcount reached 0, releases itimer
 * state when it is unshared or its refcount reached 0, frees all pending
 * signals, and finally releases the signal heap, locks, and our sigstack.
 *
 * When other_thread is true the steps that change the calling thread's kernel
 * signal state are skipped: handlers are not restored in
 * signal_info_exit_sigaction(), the app mask is not swapped back in, and the
 * sigaltstack is not replaced.
 */
void
signal_thread_exit(dcontext_t *dcontext, bool other_thread)
{
    thread_sig_info_t *info = (thread_sig_info_t *)dcontext->signal_field;
    int i;

    /* i#1012: DR's signal handler should always be installed before this point.
     */
    ASSERT(sigsegv_handler_is_ours() || removed_sig_handler);

    while (info->num_unstarted_children > 0) {
        /* must wait for children to start and copy our state
         * before we destroy it!
         */
        os_thread_yield();
    }

    /* stop_itimer() was already called by os_thread_not_under_dynamo() called
     * from dynamo_thread_exit_common().  We need to leave the app itimers in place
     * in case we're detaching.
     */

#if defined(X86) && defined(LINUX)
    if (info->xstate_alloc != NULL) {
        heap_free(dcontext, info->xstate_alloc,
                  signal_frame_extra_size(true) HEAPACCT(ACCT_OTHER));
    }
#endif

    /* FIXME: w/ shared handlers, if parent (the owner here) dies,
     * can children keep living w/ a copy of the handlers?
     */
    if (info->sighand->is_shared) {
        d_r_mutex_lock(&info->sighand->lock);
        info->sighand->refcount--;
        d_r_mutex_unlock(&info->sighand->lock);
    }
    /* Remove this thread's contribution to the per-signal unmasked counts. */
    for (i = 1; i <= MAX_SIGNUM; i++) {
        if (!EMULATE_SIGMASK(info, i))
            continue;
        if (!kernel_sigismember(&info->app_sigblocked, i))
            ATOMIC_DEC(int, info->sighand->threads_unmasked[i]);
    }
    DOLOG(4, LOG_ASYNCH, dump_unmasked(dcontext, __FUNCTION__););
    /* NOTE(review): refcount is re-read here without holding sighand->lock;
     * confirm that two threads exiting concurrently cannot both observe 0.
     */
    if (!info->sighand->is_shared || info->sighand->refcount == 0) {
        LOG(THREAD, LOG_ASYNCH, 2, "signal handler cleanup:\n");
        signal_info_exit_sigaction(dcontext, info, other_thread);
    }

    if (info->shared_itimer) {
        atomic_add_exchange_int((volatile int *)info->shared_itimer_refcount, -1);
    }
    if (!info->shared_itimer || *info->shared_itimer_refcount == 0) {
        if (INTERNAL_OPTION(profile_pcs)) {
            /* no cleanup needed for non-final thread in group */
            pcprofile_thread_exit(dcontext);
        }
        for (i = 0; i < NUM_ITIMERS; i++)
            DELETE_RECURSIVE_LOCK((*info->itimer)[i].lock);
        if (os_itimers_thread_shared())
            global_heap_free(info->itimer, sizeof(*info->itimer) HEAPACCT(ACCT_OTHER));
        else
            heap_free(dcontext, info->itimer, sizeof(*info->itimer) HEAPACCT(ACCT_OTHER));
        if (info->shared_itimer_refcount != NULL) {
            global_heap_free(info->shared_itimer_refcount,
                             sizeof(int) HEAPACCT(ACCT_OTHER));
            ASSERT(info->shared_itimer_underDR != NULL);
            global_heap_free(info->shared_itimer_underDR,
                             sizeof(int) HEAPACCT(ACCT_OTHER));
        }
    }
    for (i = 1; i <= MAX_SIGNUM; i++) {
        /* pending queue is per-thread and not shared */
        while (info->sigpending[i] != NULL) {
            sigpending_t *temp = info->sigpending[i];
            info->sigpending[i] = temp->next;
            special_heap_free(info->sigheap, temp);
        }
        info->num_pending = 0;
    }
    /* If no detach flag is set, we assume that this thread is on its way to exit.
     * In order to prevent receiving signals while a thread is on its way to exit
     * without a valid dcontext, signals at this stage are blocked. The exceptions
     * are the suspend signal and any signal that a terminating SYS_kill may need.
     * (i#2921). In this case, we do not want to restore the signal mask. For detach,
     * we do need to restore the app's mask.
     */
    if (!other_thread && doing_detach)
        signal_swap_mask(dcontext, true /*to_app*/);
#ifdef HAVE_SIGALTSTACK
    /* Remove our sigstack and restore the app sigstack if it had one.  */
    if (!other_thread) {
        LOG(THREAD, LOG_ASYNCH, 2, "removing our signal stack " PFX " - " PFX "\n",
            info->sigstack.ss_sp, info->sigstack.ss_sp + info->sigstack.ss_size);
        if (APP_HAS_SIGSTACK(info)) {
            LOG(THREAD, LOG_ASYNCH, 2, "restoring app signal stack " PFX " - " PFX "\n",
                info->app_sigstack.ss_sp,
                info->app_sigstack.ss_sp + info->app_sigstack.ss_size);
        } else {
            ASSERT(TEST(SS_DISABLE, info->app_sigstack.ss_flags));
        }
        if (info->sigstack.ss_sp != NULL) {
            /* i#552: to raise client exit event, we may call dynamo_process_exit
             * on sigstack in signal handler.
             * In that case we set sigstack (ss_sp) NULL to avoid stack swap.
             */
#    ifdef MACOS
            if (info->app_sigstack.ss_sp == NULL) {
                /* Kernel fails w/ ENOMEM (even for SS_DISABLE) if ss_size is too small */
                info->sigstack.ss_flags = SS_DISABLE;
                i = sigaltstack_syscall(&info->sigstack, NULL);
                /* i#1814: kernel gives EINVAL if last handler didn't call sigreturn! */
                ASSERT(i == 0 || i == -EINVAL);
            } else {
                i = sigaltstack_syscall(&info->app_sigstack, NULL);
                /* i#1814: kernel gives EINVAL if last handler didn't call sigreturn! */
                ASSERT(i == 0 || i == -EINVAL);
            }
#    else
            i = sigaltstack_syscall(&info->app_sigstack, NULL);
            ASSERT(i == 0);
#    endif
        }
    }
#endif
    IF_LINUX(signalfd_thread_exit(dcontext, info));
    special_heap_exit(info->sigheap);
    DELETE_LOCK(info->sigblocked_lock);
    DELETE_LOCK(info->child_lock);
#ifdef HAVE_SIGALTSTACK
    if (info->sigstack.ss_sp != NULL) {
        /* i#552: to raise client exit event, we may call dynamo_process_exit
         * on sigstack in signal handler.
         * In that case we set sigstack (ss_sp) NULL to avoid stack free.
         */
        /* stack_free() is passed the high end of the stack (ss_sp + ss_size). */
        stack_free(info->sigstack.ss_sp + info->sigstack.ss_size, info->sigstack.ss_size);
    }
#endif
#ifdef DEBUG
    /* for non-debug we do fast exit path and don't free local heap */
    HEAP_TYPE_FREE(dcontext, info, thread_sig_info_t, ACCT_OTHER, PROTECTED);
#endif
#ifdef PAPI
    /* NOTE(review): the cast below names a declarator ("func") inside the type,
     * which does not look like valid C; presumably this PAPI-only path is no
     * longer built -- confirm before relying on it.
     */
    /* use SIGPROF for updating gui so it can be distinguished from SIGVTALRM */
    set_itimer_callback(
        dcontext, ITIMER_PROF, 500,
        (void (*func)(dcontext_t *, priv_mcontext_t *))perfctr_update_gui());
#endif
}

/* Fills in act so that it installs handler for signal sig with the flags and
 * blocked-signal mask that DR uses for its own handlers.  Only act is written;
 * no syscall is made here.
 */
void
set_handler_sigact(kernel_sigaction_t *act, int sig, handler_t handler)
{
    act->handler = handler;
#ifdef MACOS
    /* This is the real target */
    act->tramp = (tramp_t)handler;
#endif

    /* SA_SIGINFO: send 3 args to handler.
     * SA_RESTART: we want the kernel to help us auto-restart syscalls, esp. when
     * our signals interrupt native code such as during attach or in client or
     * DR code (i#2659).
     */
    act->flags = SA_SIGINFO | SA_RESTART;
#ifdef HAVE_SIGALTSTACK
    act->flags |= SA_ONSTACK; /* use our sigstack */
#endif
#if !defined(VMX86_SERVER) && defined(LINUX)
    /* PR 305020: must have SA_RESTORER for x64 */
    /* i#2812: must have SA_RESTORER to handle vsyscall32 being disabled */
    act->flags |= SA_RESTORER;
    act->restorer = (void (*)(void))dynamorio_sigreturn;
#endif
    /* The two signals left unblocked below must also be allowed to nest inside
     * their own handler.
     */
    if (sig == SIGSEGV || sig == SUSPEND_SIGNAL)
        act->flags |= SA_NODEFER;

    /* We block most signals within our handler */
    kernel_sigfillset(&act->mask);
    /* i#184/PR 450670: we let our suspend signal interrupt our own handler
     * We never send more than one before resuming, so no danger to stack usage
     * from our own: but app could pile them up.
     */
    kernel_sigdelset(&act->mask, SUSPEND_SIGNAL);
    /* i#193/PR 287309: we need to NOT suppress further SIGSEGV, for decode faults,
     * for try/except, and for !HAVE_MEMINFO probes.
     * Just like SUSPEND_SIGNAL, if app sends repeated SEGV, could run out of
     * alt stack: seems too corner-case to be worth increasing stack size.
     */
    kernel_sigdelset(&act->mask, SIGSEGV);

    /* Sigset is a 1 or 2 elt array of longs on X64/X86.  Treat as 2 elt of
     * uint32. */
    IF_DEBUG(uint32 *mask_sig = (uint32 *)&act->mask.sig[0]);
    LOG(THREAD_GET, LOG_ASYNCH, 3, "mask for our handler is " PFX " " PFX "\n",
        mask_sig[0], mask_sig[1]);
}

/* Populates "act" for signal "sig" with main_signal_handler as the handler. */
static void
set_our_handler_sigact(kernel_sigaction_t *act, int sig)
{
    handler_t ours = (handler_t)main_signal_handler;
    set_handler_sigact(act, sig, ours);
}

/* Installs "act" as the kernel action for "sig" and records whatever action was
 * installed beforehand.
 *
 * The prior action is queried first.  If its handler is neither SIG_DFL nor
 * main_signal_handler, a copy is stored in info->sighand->action[sig]
 * (replacing any existing entry) and in detached_sigact[sig].  Otherwise any
 * existing info->sighand->action[sig] entry is freed and cleared.  "act" is
 * then installed with a second sigaction_syscall().
 *
 * Returns early, without installing "act", if the initial query fails.
 */
static void
set_handler_and_record_app(dcontext_t *dcontext, thread_sig_info_t *info, int sig,
                           kernel_sigaction_t *act)
{
    int rc;
    kernel_sigaction_t oldact;
    ASSERT(sig <= MAX_SIGNUM);

    /* Get the app's handler first, to handle a race where a signal arrives in
     * between.  That trumps a race where the app changes its handler which is much
     * less likely.
     */
    rc = sigaction_syscall(sig, NULL, &oldact);
    ASSERT(rc ==
           0
           /* Workaround for PR 223720, which was fixed in ESX4.0 but
            * is present in ESX3.5 and earlier: vmkernel treats
            * 63 and 64 as invalid signal numbers.
            */
           IF_VMX86(|| (sig >= 63 && rc == -EINVAL)));
    if (rc != 0) /* be defensive: app will probably still work */
        return;

    if (oldact.handler != (handler_t)SIG_DFL &&
        oldact.handler != (handler_t)main_signal_handler) {
        /* save the app's action for sig */
        if (info->sighand->is_shared) {
            /* sighand structure is shared */
            d_r_mutex_lock(&info->sighand->lock);
        }
        if (info->sighand->action[sig] != NULL) {
            /* go ahead and toss the old one, it's up to the app to store
             * and then restore later if it wants to
             */
            handler_free(dcontext, info->sighand->action[sig],
                         sizeof(kernel_sigaction_t));
        }
        info->sighand->action[sig] =
            (kernel_sigaction_t *)handler_alloc(dcontext, sizeof(kernel_sigaction_t));
        memcpy(info->sighand->action[sig], &oldact, sizeof(kernel_sigaction_t));
        /* clear cache */
        info->restorer_valid[sig] = -1;
        if (info->sighand->is_shared)
            d_r_mutex_unlock(&info->sighand->lock);
#ifdef DEBUG
        if (oldact.handler == (handler_t)SIG_IGN) {
            LOG(THREAD, LOG_ASYNCH, 2,
                "app already installed SIG_IGN as sigaction for signal %d\n", sig);
        } else {
            LOG(THREAD, LOG_ASYNCH, 2,
                "app already installed " PFX " as sigaction flags=0x%x for signal %d\n",
                oldact.handler, oldact.flags, sig);
        }
#endif
        /* Record the app handler for delivering native signals during initialization.
         * XXX: Should we rename this s/detached_/native_/ or something?
         */
        /* NOTE(review): info->sighand->action[sig] is read here after
         * sighand->lock was released above -- confirm that no other thread can
         * replace it in between when the table is shared.
         */
        d_r_write_lock(&detached_sigact_lock);
        memcpy(&detached_sigact[sig], info->sighand->action[sig],
               sizeof(detached_sigact[sig]));
        d_r_write_unlock(&detached_sigact_lock);
    } else {
        /* The prior handler is SIG_DFL or our own handler: there is no app
         * action to keep, so drop any stale stored entry.
         */
        LOG(THREAD, LOG_ASYNCH, 2,
            "prior handler is " PFX " vs main " PFX " with flags=0x%x for signal %d\n",
            oldact.handler, main_signal_handler, oldact.flags, sig);
        if (info->sighand->action[sig] != NULL) {
            if (info->sighand->is_shared)
                d_r_mutex_lock(&info->sighand->lock);
            handler_free(dcontext, info->sighand->action[sig],
                         sizeof(kernel_sigaction_t));
            info->sighand->action[sig] = NULL;
            if (info->sighand->is_shared)
                d_r_mutex_unlock(&info->sighand->lock);
        }
    }

    /* Arm the signal. */
    rc = sigaction_syscall(sig, act, NULL);
    DODEBUG({
        /* i#4627: QEMU before 5.0 has a signal mapping bug where 33 and 64
         * fail.  Better to continue and hope 33 doesn't mess up pthreads.
         */
        if (rc == -EINVAL && !IS_STRING_OPTION_EMPTY(xarch_root) &&
            (sig == 33 || sig == 64)) {
            SYSLOG_INTERNAL_WARNING("Failed to register signal handler for %d: assuming "
                                    "under QEMU<5.0 and continuing",
                                    sig);
        } else {
            ASSERT(rc ==
                   0
                   /* Workaround for PR 223720, which was fixed in ESX4.0 but
                    * is present in ESX3.5 and earlier: vmkernel treats
                    * 63 and 64 as invalid signal numbers.
                    */
                   IF_VMX86(|| (sig >= 63 && rc == -EINVAL)));
        }
    });
    if (rc != 0) /* be defensive: app will probably still work */
        return;
    LOG(THREAD, LOG_ASYNCH, 3, "\twe intercept signal %d\n", sig);
}

/* Installs main_signal_handler as the handler for signal "sig" for the
 * current thread, recording any prior app action along the way.
 * We deal with kernel data structures in our interception of system calls,
 * so we use them here as well rather than translating to/from libc data
 * structures.
 */
static void
intercept_signal(dcontext_t *dcontext, thread_sig_info_t *info, int sig)
{
    kernel_sigaction_t our_act;
    ASSERT(sig <= MAX_SIGNUM);
    set_our_handler_sigact(&our_act, sig);
    set_handler_and_record_app(dcontext, info, sig, &our_act);
}

/* Installs SIG_IGN (with every other field zeroed) as the kernel action for
 * "sig", recording any prior app action via set_handler_and_record_app().
 */
static void
intercept_signal_ignore_initially(dcontext_t *dcontext, thread_sig_info_t *info, int sig)
{
    kernel_sigaction_t ignore_act;
    ASSERT(sig <= MAX_SIGNUM);
    memset(&ignore_act, 0, sizeof(ignore_act));
    ignore_act.handler = (handler_t)SIG_IGN;
    set_handler_and_record_app(dcontext, info, sig, &ignore_act);
}

/* Installs main_signal_handler as the kernel action for "sig".  The previously
 * installed action is neither queried nor recorded here.
 */
static void
intercept_signal_no_longer_ignore(dcontext_t *dcontext, thread_sig_info_t *info, int sig)
{
    kernel_sigaction_t our_act;
    ASSERT(sig <= MAX_SIGNUM);
    set_our_handler_sigact(&our_act, sig);
    DEBUG_DECLARE(int rc =) sigaction_syscall(sig, &our_act, NULL);
    ASSERT(rc == 0);
}

/* i#1921: For proper single-threaded native execution with re-takeover we need
 * to propagate signals.  For now we only support going completely native in
 * this thread but without a full detach, so we abandon our signal handlers w/o
 * freeing memory up front.
 * We also use this for the start/stop interface where we are going fully native
 * for all threads.
 *
 * For each signal: if an app action is stored, it is handed back to the
 * kernel; otherwise, if we intercept the signal, SIG_DFL is installed.
 */
void
signal_remove_handlers(dcontext_t *dcontext)
{
    thread_sig_info_t *info = (thread_sig_info_t *)dcontext->signal_field;
    kernel_sigaction_t dfl_act;
    int sig;

    /* Template used for every signal that has no stored app action. */
    memset(&dfl_act, 0, sizeof(dfl_act));
    dfl_act.handler = (handler_t)SIG_DFL;
    kernel_sigemptyset(&dfl_act.mask);

    for (sig = 1; sig <= MAX_SIGNUM; sig++) {
        kernel_sigaction_t *app_act = info->sighand->action[sig];
        if (app_act != NULL) {
            LOG(THREAD, LOG_ASYNCH, 2, "\trestoring " PFX " as handler for %d\n",
                app_act->handler, sig);
            sigaction_syscall(sig, app_act, NULL);
            continue;
        }
        if (info->sighand->we_intercept[sig]) {
            /* restore to default */
            LOG(THREAD, LOG_ASYNCH, 2, "\trestoring SIG_DFL as handler for %d\n", sig);
            sigaction_syscall(sig, &dfl_act, NULL);
        }
    }
    DODEBUG({ removed_sig_handler = true; });
}

/* Calls set_ignore_signal_action() on every alarm signal that we intercept. */
void
signal_remove_alarm_handlers(dcontext_t *dcontext)
{
    thread_sig_info_t *info = (thread_sig_info_t *)dcontext->signal_field;
    for (int sig = 1; sig <= MAX_SIGNUM; sig++) {
        if (info->sighand->we_intercept[sig] && sig_is_alarm_signal(sig))
            set_ignore_signal_action(sig);
    }
}

/* For attaching mid-run, we assume regular POSIX with handlers global to just one
 * thread group in the process.
 * We also use this routine for the initial setup of our handlers, which we
 * split from signal_thread_inherit() to support start/stop.
 *
 * If "ignore_alarm" is true, alarm signals are initially set to SIG_IGN instead
 * of being given our handler.
 */
void
signal_reinstate_handlers(dcontext_t *dcontext, bool ignore_alarm)
{
    thread_sig_info_t *info = (thread_sig_info_t *)dcontext->signal_field;
    for (int sig = 1; sig <= MAX_SIGNUM; sig++) {
        if (!info->sighand->we_intercept[sig]) {
            /* We do have to intercept everything the app does.
             * If the app removes its handler, we'll never remove ours, which we
             * can live with.
             */
            kernel_sigaction_t cur_act;
            int rc;
            if (!signal_is_interceptable(sig))
                continue;
            rc = sigaction_syscall(sig, NULL, &cur_act);
            ASSERT(rc == 0);
            /* Only take over when the app has something other than the default
             * (or our own handler) installed.
             */
            if (rc != 0 || cur_act.handler == (handler_t)SIG_DFL ||
                cur_act.handler == (handler_t)main_signal_handler)
                continue;
        }
        if (sig_is_alarm_signal(sig) && ignore_alarm) {
            LOG(THREAD, LOG_ASYNCH, 2, "\tignoring %d initially\n", sig);
            intercept_signal_ignore_initially(dcontext, info, sig);
        } else {
            LOG(THREAD, LOG_ASYNCH, 2, "\trestoring DR handler for %d\n", sig);
            intercept_signal(dcontext, info, sig);
        }
    }
    DODEBUG({ removed_sig_handler = false; });
}

/* Re-installs our handler for every alarm signal that we intercept. */
void
signal_reinstate_alarm_handlers(dcontext_t *dcontext)
{
    thread_sig_info_t *info = (thread_sig_info_t *)dcontext->signal_field;
    for (int sig = 1; sig <= MAX_SIGNUM; sig++) {
        if (info->sighand->we_intercept[sig] && sig_is_alarm_signal(sig)) {
            LOG(THREAD, LOG_ASYNCH, 2, "\trestoring DR handler for %d\n", sig);
            intercept_signal_no_longer_ignore(dcontext, info, sig);
        }
    }
}

/**** system call handlers ***********************************************/

/* FIXME: invalid pointer passed to kernel will currently show up
 * probably as a segfault in our handlers below...need to make them
 * look like kernel, and pass error code back to os.c
 */

/* Updates the current thread's signal bookkeeping for a clone request carrying
 * "flags": marks the handler table as shared (CLONE_SIGHAND) or counts an
 * unstarted child that will copy it, and sets up shared itimer state for
 * CLONE_THREAD when os_itimers_thread_shared() says itimers are shared.
 * Does nothing beyond a warning when CLONE_VM is not set.
 */
void
handle_clone(dcontext_t *dcontext, uint64 flags)
{
    thread_sig_info_t *info = (thread_sig_info_t *)dcontext->signal_field;

    if (!TEST(CLONE_VM, flags)) {
        /* separate process not sharing memory */
        if (TEST(CLONE_SIGHAND, flags)) {
            /* FIXME: how deal with this?
             * "man clone" says: "Since Linux 2.6.0-test6, flags must also
             * include CLONE_VM if CLONE_SIGHAND is specified"
             */
            LOG(THREAD, LOG_ASYNCH, 1, "WARNING: !CLONE_VM but CLONE_SIGHAND!\n");
            ASSERT_NOT_IMPLEMENTED(false);
        }
        return;
    }

    pre_second_thread();

    if (!TEST(CLONE_SIGHAND, flags)) {
        /* child will inherit copy of current table -> cannot modify it
         * until child is scheduled!  FIXME: any other way?
         */
        d_r_mutex_lock(&info->child_lock);
        info->num_unstarted_children++;
        d_r_mutex_unlock(&info->child_lock);
    } else {
        /* need to share table of handlers! */
        LOG(THREAD, LOG_ASYNCH, 2, "handle_clone: CLONE_SIGHAND set!\n");
        if (!info->sighand->is_shared) {
            /* This is the start of a chain of sharing.
             * No synch needed here since child not created yet.
             */
            info->sighand->is_shared = true;
            info->sighand->refcount = 1;
        } /* else, some ancestor is already owner */
    }

    if (!TEST(CLONE_THREAD, flags) || !os_itimers_thread_shared())
        return;
    if (!info->shared_itimer) {
        /* this is the start of a chain of sharing
         * no synch needed here, child not created yet
         */
        info->shared_itimer = true;
        info->shared_itimer_refcount =
            (int *)global_heap_alloc(sizeof(int) HEAPACCT(ACCT_OTHER));
        *info->shared_itimer_refcount = 1;
        info->shared_itimer_underDR =
            (int *)global_heap_alloc(sizeof(int) HEAPACCT(ACCT_OTHER));
        *info->shared_itimer_underDR = 1;
    } /* else, some ancestor already created */
}

/* Pre-syscall handling of an app sigaction request for signal "sig".
 * "act" is the app's new action (may be NULL), "oact" is where the app wants
 * the prior action stored (may be NULL), and "sigsetsize" must equal
 * sizeof(kernel_sigset_t) or the request fails with EINVAL.
 *
 * Returns false if should NOT issue syscall.
 * In such a case, the result is in "result".
 * If *result is non-zero, the syscall should fail.
 * We could instead issue the syscall and expect it to fail, which would have a more
 * accurate error code, but that risks missing a failure (e.g., RT on Android
 * which in some cases returns success on bogus params).
 * It seems better to err on the side of the wrong error code or failing when
 * we shouldn't, than to think it failed when it didn't, which is more complex
 * to deal with.
 */
bool
handle_sigaction(dcontext_t *dcontext, int sig, const kernel_sigaction_t *act,
                 prev_sigaction_t *oact, size_t sigsetsize, OUT uint *result)
{
    thread_sig_info_t *info = (thread_sig_info_t *)dcontext->signal_field;
    kernel_sigaction_t *save;
    /* Safe-read copy of *act; only valid when act != NULL. */
    kernel_sigaction_t local_act;
    if (sigsetsize != sizeof(kernel_sigset_t)) {
        *result = EINVAL;
        return false;
    }
    if (act != NULL) {
        /* Linux checks readability before checking the signal number. */
        if (!d_r_safe_read(act, sizeof(local_act), &local_act)) {
            *result = EFAULT;
            return false;
        }
    }
    /* i#1135: app may pass invalid signum to find MAX_SIGNUM */
    if (sig <= 0 || sig > MAX_SIGNUM || (act != NULL && !signal_is_interceptable(sig))) {
        *result = EINVAL;
        return false;
    }
    if (act != NULL) {
        /* app is installing a new action */
        while (info->num_unstarted_children > 0) {
            /* must wait for children to start and copy our state
             * before we modify it!
             */
            os_thread_yield();
        }
        /* Remember the app's pointer so the post-syscall handler can restore
         * the syscall parameter if we replace it below.
         */
        info->sigaction_param = act;
    }
    if (info->sighand->is_shared) {
        /* sighand structure is shared */
        d_r_mutex_lock(&info->sighand->lock);
    }
    if (oact != NULL) {
        /* Keep a copy of the prior one for post-syscall to hand to the app. */
        info->use_kernel_prior_sigaction = false;
        if (info->sighand->action[sig] == NULL) {
            if (info->sighand->we_intercept[sig]) {
                /* need to pretend there is no handler */
                memset(&info->prior_app_sigaction, 0, sizeof(info->prior_app_sigaction));
                info->prior_app_sigaction.handler = (handler_t)SIG_DFL;
            } else {
                info->use_kernel_prior_sigaction = true;
            }
        } else {
            memcpy(&info->prior_app_sigaction, info->sighand->action[sig],
                   sizeof(info->prior_app_sigaction));
        }
    }
    if (act != NULL) {
        if (local_act.handler == (handler_t)SIG_IGN ||
            local_act.handler == (handler_t)SIG_DFL) {
            LOG(THREAD, LOG_ASYNCH, 2, "app installed %s as sigaction for signal %d\n",
                (local_act.handler == (handler_t)SIG_IGN) ? "SIG_IGN" : "SIG_DFL", sig);
            if (!info->sighand->we_intercept[sig]) {
                /* let the SIG_IGN/SIG_DFL go through, we want to remove our
                 * handler.  we delete the stored action in post_
                 */
                if (info->sighand->is_shared)
                    d_r_mutex_unlock(&info->sighand->lock);
                return true;
            }
        } else {
            LOG(THREAD, LOG_ASYNCH, 2,
                "app installed " PFX " as sigaction for signal %d\n", local_act.handler,
                sig);
            DOLOG(2, LOG_ASYNCH, {
                LOG(THREAD, LOG_ASYNCH, 2, "signal mask for handler:\n");
                dump_sigset(dcontext, (kernel_sigset_t *)&local_act.mask);
            });
        }

        /* save app's entire sigaction struct */
        save = (kernel_sigaction_t *)handler_alloc(dcontext, sizeof(kernel_sigaction_t));
        memcpy(save, &local_act, sizeof(kernel_sigaction_t));
        /* Remove the unblockable sigs */
        kernel_sigdelset(&save->mask, SIGKILL);
        kernel_sigdelset(&save->mask, SIGSTOP);
#ifdef X86
        /* Remove flags not allowed to be passed to the kernel (this also zeroes
         * the top 32 bits, like the kernel does: i#3681).
         */
        save->flags &= ~(SA_IA32_ABI | SA_X32_ABI);
#endif
        if (info->sighand->action[sig] != NULL) {
            /* go ahead and toss the old one, it's up to the app to store
             * and then restore later if it wants to
             */
            handler_free(dcontext, info->sighand->action[sig],
                         sizeof(kernel_sigaction_t));
        }
        info->sighand->action[sig] = save;
        LOG(THREAD, LOG_ASYNCH, 3, "\tflags = " PFX ", %s = " PFX "\n", local_act.flags,
            IF_MACOS_ELSE("tramp", "restorer"),
            IF_MACOS_ELSE(local_act.tramp, local_act.restorer));
        /* clear cache */
        info->restorer_valid[sig] = -1;
    }
    if (info->sighand->is_shared)
        d_r_mutex_unlock(&info->sighand->lock);
    if (info->sighand->we_intercept[sig]) {
        /* cancel the syscall */
        /* Our handler is already installed for this signal, so the request is
         * fully emulated: run the post-syscall logic now as if it had succeeded.
         */
        *result = handle_post_sigaction(dcontext, true, sig, act, oact, sigsetsize);
        return false;
    }
    if (act != NULL) {
        /* Now hand kernel our main handler instead of app's. */
        set_our_handler_sigact(&info->our_sigaction, sig);
        set_syscall_param(dcontext, 1, (reg_t)&info->our_sigaction);

        /* FIXME PR 297033: we don't support intercepting DEFAULT_STOP /
         * DEFAULT_CONTINUE signals b/c we can't generate the default
         * action: if the app registers a handler, though, we should work
         * properly if we never see SIG_DFL.
         */
    }
    return true;
}

/* Post-syscall handling of an app sigaction request.
 * os.c thinks it's passing us struct_sigaction, really it's kernel_sigaction_t,
 * which has fields in different order.
 * If "success" is false, this only restores the app's syscall parameter (when
 * "act" is non-NULL) and returns 0 so that the return value is left unchanged.
 * Otherwise it hands the saved prior action to the app through "oact" (unless
 * the kernel already filled it in) and drops our stored app action when the
 * app installed SIG_IGN/SIG_DFL for a signal we do not intercept.
 * Returns the desired app return value (caller will negate if nec).
 */
uint
handle_post_sigaction(dcontext_t *dcontext, bool success, int sig,
                      const kernel_sigaction_t *act, prev_sigaction_t *oact,
                      size_t sigsetsize)
{
    thread_sig_info_t *info = (thread_sig_info_t *)dcontext->signal_field;
    if (act != NULL) {
        /* Restore app register value, in case we changed it. */
        set_syscall_param(dcontext, 1, (reg_t)info->sigaction_param);
    }
    if (!success)
        return 0; /* don't change return value */
    ASSERT(sig <= MAX_SIGNUM && sig > 0);
    if (oact != NULL) {
        if (info->use_kernel_prior_sigaction) {
            /* Real syscall succeeded with oact so it must be readable, barring races. */
            ASSERT(oact->handler == (handler_t)SIG_IGN ||
                   oact->handler == (handler_t)SIG_DFL);
        } else {
            /* We may have skipped the syscall so we have to check writability */
#ifdef MACOS
            /* On MacOS prev_sigaction_t is a different type (i#2105) */
            bool fault = true;
            TRY_EXCEPT(dcontext,
                       {
                           oact->handler = info->prior_app_sigaction.handler;
                           oact->mask = info->prior_app_sigaction.mask;
                           oact->flags = info->prior_app_sigaction.flags;
                           fault = false;
                       },
                       {
                           /* EXCEPT */
                           /* nothing: fault is already true */
                       });
            if (fault)
                return EFAULT;
#else
            if (!safe_write_ex(oact, sizeof(*oact), &info->prior_app_sigaction, NULL)) {
                /* We actually don't have to undo installing any passed action
                 * b/c the Linux kernel does that *before* checking oact perms.
                 */
                return EFAULT;
            }
#endif
        }
    }
    /* If installing IGN or DFL, delete ours.
     * XXX: This is racy.  We can't hold the lock across the syscall, though.
     * What we should do is just drop support for -no_intercept_all_signals,
     * which is off by default anyway and never turned off.
     */
    if (act != NULL &&
        /* De-ref here should work barring races: already racy and non-default so not
         * bothering with safe_read.
         */
        ((act->handler == (handler_t)SIG_IGN || act->handler == (handler_t)SIG_DFL) &&
         !info->sighand->we_intercept[sig]) &&
        info->sighand->action[sig] != NULL) {
        if (info->sighand->is_shared)
            d_r_mutex_lock(&info->sighand->lock);
        /* remove old stored app action */
        handler_free(dcontext, info->sighand->action[sig], sizeof(kernel_sigaction_t));
        info->sighand->action[sig] = NULL;
        if (info->sighand->is_shared)
            d_r_mutex_unlock(&info->sighand->lock);
    }
    return 0;
}

#ifdef LINUX
/* Copies the old-style sigaction "os" into the kernel-layout "ks".  The old
 * single-word mask lands in ks->mask.sig[0]; the rest of ks->mask is cleared.
 * Returns false if the copy faulted, in which case "ks" may be partially
 * written.
 */
static bool
convert_old_sigaction_to_kernel(dcontext_t *dcontext, kernel_sigaction_t *ks,
                                const old_sigaction_t *os)
{
    bool converted = false;
    TRY_EXCEPT(dcontext,
               {
                   ks->handler = os->handler;
                   ks->flags = os->flags;
                   ks->restorer = os->restorer;
                   kernel_sigemptyset(&ks->mask);
                   ks->mask.sig[0] = os->mask;
                   converted = true;
               },
               {
                   /* EXCEPT */
                   /* nothing: converted is already false */
               });
    return converted;
}

/* Copies the kernel-layout sigaction "ks" into the old-style "os", keeping only
 * ks->mask.sig[0] of the mask.
 * Returns false if the copy faulted, in which case "os" may be partially
 * written.
 */
static bool
convert_kernel_sigaction_to_old(dcontext_t *dcontext, old_sigaction_t *os,
                                const kernel_sigaction_t *ks)
{
    bool converted = false;
    TRY_EXCEPT(dcontext,
               {
                   os->handler = ks->handler;
                   os->flags = ks->flags;
                   os->restorer = ks->restorer;
                   os->mask = ks->mask.sig[0];
                   converted = true;
               },
               {
                   /* EXCEPT */
                   /* nothing: converted is already false */
               });
    return converted;
}

/* Pre-syscall handling of the old-style sigaction syscall: converts the app's
 * old_sigaction_t arguments to kernel_sigaction_t and defers to
 * handle_sigaction().
 * Returns false (and "result") if should NOT issue syscall.
 * If *result is non-zero, the syscall should fail with that error.
 */
bool
handle_old_sigaction(dcontext_t *dcontext, int sig, const old_sigaction_t *act,
                     old_sigaction_t *oact, OUT uint *result)
{
    kernel_sigaction_t kact;
    kernel_sigaction_t okact;
    bool res;
    if (act != NULL) {
        if (!convert_old_sigaction_to_kernel(dcontext, &kact, act)) {
            *result = EFAULT;
            return false;
        }
    }
    res = handle_sigaction(dcontext, sig, act == NULL ? NULL : &kact,
                           oact == NULL ? NULL : &okact, sizeof(kernel_sigset_t), result);
    /* handle_sigaction() returns false both when it rejected the request
     * (*result is a non-zero error) and when it emulated it successfully
     * (*result is 0).  Only in the latter case do we run the old-style post
     * handler, which converts the prior action into the app's old_sigaction_t.
     * Running it after a rejection would overwrite the error code and would
     * operate on a signal number that failed validation.
     */
    if (!res && *result == 0)
        *result = handle_post_old_sigaction(dcontext, true, sig, act, oact);
    return res;
}

/* Post-syscall handling of the old-style sigaction syscall: converts the app's
 * old_sigaction_t arguments to kernel_sigaction_t, defers to
 * handle_post_sigaction(), and converts the resulting prior action back into
 * the app's "oact".
 * Returns the desired app return value (caller will negate if nec).
 */
uint
handle_post_old_sigaction(dcontext_t *dcontext, bool success, int sig,
                          const old_sigaction_t *act, old_sigaction_t *oact)
{
    kernel_sigaction_t kact = {};
    kernel_sigaction_t okact = {};
    ptr_uint_t res;
    if (act != NULL && success) {
        if (!convert_old_sigaction_to_kernel(dcontext, &kact, act)) {
            ASSERT(!success);
            return EFAULT;
        }
    }
    if (oact != NULL && success) {
        if (!convert_old_sigaction_to_kernel(dcontext, &okact, oact)) {
            ASSERT(!success);
            return EFAULT;
        }
    }
    res = handle_post_sigaction(dcontext, success, sig, act == NULL ? NULL : &kact,
                                oact == NULL ? NULL : &okact, sizeof(kernel_sigset_t));
    /* Only hand a prior action back to the app when the request succeeded.  On
     * failure okact still holds its zero initializer, and copying that out would
     * clobber the app's buffer even though the syscall failed.
     */
    if (success && res == 0 && oact != NULL) {
        if (!convert_kernel_sigaction_to_old(dcontext, oact, &okact)) {
            return EFAULT;
        }
    }
    return res;
}
#endif /* LINUX */

/* Emulates the app's sigaltstack request: reports the recorded app stack
 * through "old_stack" (if non-NULL) and records "stack" (if non-NULL) as the
 * new app stack after validating it.  "cur_xsp" is the app's current stack
 * pointer, used to refuse a change while the app is on its current altstack.
 * Returns false and sets *result if should NOT issue syscall, which is always
 * the case here.
 * If *result is non-zero, the syscall should fail.
 */
bool
handle_sigaltstack(dcontext_t *dcontext, const stack_t *stack, stack_t *old_stack,
                   reg_t cur_xsp, OUT uint *result)
{
    thread_sig_info_t *info = (thread_sig_info_t *)dcontext->signal_field;
    stack_t new_stack;
    uint mode;

    if (old_stack != NULL &&
        !safe_write_ex(old_stack, sizeof(*old_stack), &info->app_sigstack, NULL)) {
        *result = EFAULT;
        return false;
    }
    if (stack == NULL) {
        /* Query only: nothing to install. */
        *result = 0;
        return false; /* always cancel syscall */
    }

    /* Fail in the same way the kernel does. */
    if (!d_r_safe_read(stack, sizeof(new_stack), &new_stack)) {
        *result = EFAULT;
        return false;
    }
    if (APP_HAS_SIGSTACK(info)) {
        /* The app is not allowed to set a new altstack while on the current one. */
        reg_t stk_base = (reg_t)info->app_sigstack.ss_sp;
        reg_t stk_end = stk_base + info->app_sigstack.ss_size;
        if (cur_xsp >= stk_base && cur_xsp < stk_end) {
            *result = EPERM;
            return false;
        }
    }
    mode = new_stack.ss_flags & ~SS_FLAG_BITS;
    if (mode != SS_DISABLE && mode != SS_ONSTACK && mode != 0) {
        *result = EINVAL;
        return false;
    }
    if (mode == SS_DISABLE) {
        /* Zero the other params and don't even check them. */
        new_stack.ss_sp = NULL;
        new_stack.ss_size = 0;
    } else if (new_stack.ss_size < os_minsigstksz()) {
        *result = ENOMEM;
        return false;
    }
    info->app_sigstack = new_stack;
    LOG(THREAD, LOG_ASYNCH, 2, "Setting app signal stack to " PFX "-" PFX " %d=%s\n",
        new_stack.ss_sp, new_stack.ss_sp + new_stack.ss_size - 1, new_stack.ss_flags,
        (APP_HAS_SIGSTACK(info)) ? "enabled" : "disabled");

    *result = 0;
    return false; /* always cancel syscall */
}

/* Blocked signals:
 * In general, we don't need to keep track of blocked signals.
 * We only need to do so for those signals we intercept ourselves.
 * Thus, info->app_sigblocked ONLY contains entries for signals
 * we intercept ourselves.
 * PR 304708: we now intercept all signals.
 */

/* Empties info->app_sigblocked.  Before doing so, increments
 * threads_unmasked[] for every emulated signal that was in the blocked set.
 * Caller must hold info->sigblocked_lock.
 */
static void
clear_blocked(dcontext_t *dcontext, thread_sig_info_t *info)
{
    ASSERT(OWN_MUTEX(&info->sigblocked_lock));
    for (int sig = 1; sig <= MAX_SIGNUM; sig++) {
        if (EMULATE_SIGMASK(info, sig) &&
            kernel_sigismember(&info->app_sigblocked, sig)) {
            ATOMIC_INC(int, info->sighand->threads_unmasked[sig]);
        }
    }
    kernel_sigemptyset(&info->app_sigblocked);
}

/* Adds the signals in "set" to the current thread's app_sigblocked, decrementing
 * threads_unmasked[] for each emulated signal that becomes newly blocked.
 * If "absolute" is true the existing blocked set is discarded first; otherwise
 * "set" is OR-ed into it.  Takes info->sigblocked_lock for the duration.
 */
static void
set_blocked(dcontext_t *dcontext, kernel_sigset_t *set, bool absolute)
{
    thread_sig_info_t *info = (thread_sig_info_t *)dcontext->signal_field;
    d_r_mutex_lock(&info->sigblocked_lock);
    /* For an absolute set, discard current blocked signals and re-set from the
     * new mask; else, OR in the new set.
     */
    if (absolute)
        clear_blocked(dcontext, info);
    for (int sig = 1; sig <= MAX_SIGNUM; sig++) {
        if (EMULATE_SIGMASK(info, sig) && kernel_sigismember(set, sig) &&
            !kernel_sigismember(&info->app_sigblocked, sig)) {
            ATOMIC_DEC(int, info->sighand->threads_unmasked[sig]);
            kernel_sigaddset(&info->app_sigblocked, sig);
        }
    }
#ifdef DEBUG
    if (d_r_stats->loglevel >= 3 && (d_r_stats->logmask & LOG_ASYNCH) != 0) {
        LOG(THREAD, LOG_ASYNCH, 3, "blocked signals are now:\n");
        dump_sigset(dcontext, &info->app_sigblocked);
    }
#endif
    DOLOG(4, LOG_ASYNCH, dump_unmasked(dcontext, __FUNCTION__););
    d_r_mutex_unlock(&info->sigblocked_lock);
}

/* Sets the tracked app signal mask of dcontext's thread from sigset as an
 * absolute (non-additive) update.
 */
void
signal_set_mask(dcontext_t *dcontext, kernel_sigset_t *sigset)
{
    const bool absolute = true; /* overwrite rather than OR into the current set */
    set_blocked(dcontext, sigset, absolute);
}

/* Switches the signal mask of dcontext's thread.
 * to_app == true: installs the tracked app mask via sigprocmask, unless the
 *   init-time handlers are still in place (init_info.sighand != NULL), in
 *   which case those are removed instead.
 * to_app == false: re-reads the mask through unblock_all_signals() into
 *   info->app_sigblocked and rebalances the threads_unmasked counters.
 */
void
signal_swap_mask(dcontext_t *dcontext, bool to_app)
{
    thread_sig_info_t *info = (thread_sig_info_t *)dcontext->signal_field;
    if (!to_app) {
        int sig;
        /* Normally we get here just once at takeover, but if we have threads
         * going native we don't consider them unmasked as we can't rely on their
         * mask not changing: so we undo their counts when they come back here and
         * redo with the latest blocked signals (unblock_all_signals() sets
         * app_sigblocked to a new absolute set).
         */
        for (sig = 1; sig <= MAX_SIGNUM; sig++) {
            if (EMULATE_SIGMASK(info, sig) &&
                kernel_sigismember(&info->app_sigblocked, sig)) {
                ATOMIC_INC(int, info->sighand->threads_unmasked[sig]);
            }
        }
        d_r_mutex_lock(&info->sigblocked_lock);
        unblock_all_signals(&info->app_sigblocked);
        d_r_mutex_unlock(&info->sigblocked_lock);
        for (sig = 1; sig <= MAX_SIGNUM; sig++) {
            if (EMULATE_SIGMASK(info, sig) &&
                kernel_sigismember(&info->app_sigblocked, sig)) {
                ATOMIC_DEC(int, info->sighand->threads_unmasked[sig]);
            }
        }
        DOLOG(2, LOG_ASYNCH, {
            LOG(THREAD, LOG_ASYNCH, 2, "thread %d's initial app signal mask:\n",
                d_r_get_thread_id());
            dump_sigset(dcontext, &info->app_sigblocked);
        });
    } else if (init_info.sighand != NULL) {
        /* This is the first execution of the app.
         * We need to remove our own init-time handler and mask.
         */
        unset_initial_crash_handlers(dcontext);
    } else {
        d_r_mutex_lock(&info->sigblocked_lock);
        sigprocmask_syscall(SIG_SETMASK, &info->app_sigblocked, NULL,
                            sizeof(info->app_sigblocked));
        d_r_mutex_unlock(&info->sigblocked_lock);
    }
    DOLOG(4, LOG_ASYNCH, dump_unmasked(dcontext, __FUNCTION__););
}

/* Walks info->sigpending looking for a queued signal that the app does not
 * currently block; on the first such signal sets dcontext->signals_pending to 1.
 * Does nothing if dcontext->signals_pending is already non-zero.
 * Call this after changing the set of signals blocked by the application.
 */
void
check_signals_pending(dcontext_t *dcontext, thread_sig_info_t *info)
{
    if (dcontext->signals_pending != 0)
        return;

    for (int sig = 1; sig <= MAX_SIGNUM; sig++) {
        if (info->sigpending[sig] == NULL)
            continue;
        if (kernel_sigismember(&info->app_sigblocked, sig))
            continue;
        if (dcontext->signals_pending)
            continue;
        /* We only update the application's set of blocked signals from
         * syscall handlers, so we know we'll go back to d_r_dispatch and see
         * this flag right away.
         */
        LOG(THREAD, LOG_ASYNCH, 3, "\tsetting signals_pending flag\n");
        dcontext->signals_pending = 1;
        return;
    }
}

/* Pre-syscall handling of an app sigprocmask request.
 *
 * dcontext:   context of the requesting thread; its signal_field holds the
 *             thread_sig_info_t that is updated here.
 * how:        SIG_BLOCK, SIG_UNBLOCK or SIG_SETMASK (only validated when
 *             app_set is non-NULL).
 * app_set:    app pointer to the new mask, or NULL for a pure query.
 * oset:       app pointer that receives the old mask, or NULL.  Deliberately
 *             not validated here (see below).
 * sigsetsize: must equal sizeof(kernel_sigset_t), otherwise EINVAL.
 * error_code: if non-NULL, receives EINVAL/EFAULT for a rejected request, or
 *             a non-zero status from handle_post_sigprocmask() on the
 *             emulated path.
 *
 * Returns whether to execute the syscall.
 */
bool
handle_sigprocmask(dcontext_t *dcontext, int how, kernel_sigset_t *app_set,
                   kernel_sigset_t *oset, size_t sigsetsize, uint *error_code)
{
    thread_sig_info_t *info = (thread_sig_info_t *)dcontext->signal_field;
    int i;
    kernel_sigset_t safe_set;

    /* We assume any change to the mask ends a handler: e.g., sigprocmask from
     * siglongjmp.  It seems unusual to call sigprocmask in the middle; this
     * in_app_handler is best-effort in any case.
     */
    info->in_app_handler = false;

    /* Some code uses this syscall to check whether the given address is
     * readable. E.g.
     * github.com/abseil/abseil-cpp/blob/master/absl/debugging/internal/
     * address_is_readable.cc#L85
     */
    if (sigsetsize != sizeof(kernel_sigset_t)) {
        if (error_code != NULL)
            *error_code = EINVAL;
        return false;
    }
    /* Copy the app's set into safe_set; all membership tests below use the copy. */
    if (app_set != NULL && !d_r_safe_read(app_set, sizeof(safe_set), &safe_set)) {
        if (error_code != NULL)
            *error_code = EFAULT;
        return false;
    }
    if (app_set != NULL && !(how >= SIG_BLOCK && how <= SIG_SETMASK)) {
        if (error_code != NULL)
            *error_code = EINVAL;
        return false;
    }
    /* We intentionally do not check validity of oset here. This is because,
     * for an unwriteable oset, the native sigprocmask syscall shows a weird
     * non-atomic behavior: it returns EFAULT, but also updates the app signal
     * mask. To replicate the same behavior, we allow the following code to
     * update the app signal masks, which happens either by making the actual
     * syscall (for -no_intercept_all_signals) or by updating app_sigblocked
     * (for -intercept_all_signals). oset is checked in handle_post_sigprocmask
     * and any failures are returned to the app.
     */
    /* If we're intercepting all, we emulate the whole thing */
    bool execute_syscall = !DYNAMO_OPTION(intercept_all_signals);
    LOG(THREAD, LOG_ASYNCH, 2, "handle_sigprocmask how=%d\n", how);
    d_r_mutex_lock(&info->sigblocked_lock);
    /* Snapshot the mask as it was before this request; handle_post_sigprocmask
     * reports this snapshot to the app through oset.
     */
    if (oset != NULL)
        info->pre_syscall_app_sigblocked = info->app_sigblocked;
    if (app_set != NULL) {
        /* app_set was already read into safe_set above. */
        if (execute_syscall) {
            /* The syscall will execute, so remove from the set passed
             * to it.   We restore post-syscall.
             * XXX i#1187: we could crash here touching app memory -- could
             * use TRY, but the app could pass read-only memory and it
             * would work natively!  Better to swap in our own
             * allocated data struct. This would prevent a second read (by
             * the kernel) too, which might fail if it does not _remain_
             * readable anymore.  There's a transparency issue w/
             * races too if another thread looks at this memory.  This
             * won't happen by default b/c -intercept_all_signals is
             * on by default so we don't try to solve all these
             * issues.
             */
            info->pre_syscall_app_sigprocmask = safe_set;
        }
        if (how == SIG_BLOCK) {
            /* The set of blocked signals is the union of the current
             * set and the set argument.
             */
            for (i = 1; i <= MAX_SIGNUM; i++) {
                if (EMULATE_SIGMASK(info, i) && kernel_sigismember(&safe_set, i)) {
                    /* Only a newly blocked signal changes the unmasked count. */
                    if (!kernel_sigismember(&info->app_sigblocked, i))
                        ATOMIC_DEC(int, info->sighand->threads_unmasked[i]);
                    kernel_sigaddset(&info->app_sigblocked, i);
                    if (execute_syscall)
                        kernel_sigdelset(app_set, i);
                }
            }
        } else if (how == SIG_UNBLOCK) {
            /* The signals in set are removed from the current set of
             *  blocked signals.
             */
            for (i = 1; i <= MAX_SIGNUM; i++) {
                if (EMULATE_SIGMASK(info, i) && kernel_sigismember(&safe_set, i)) {
                    /* Only a signal that was blocked changes the unmasked count. */
                    if (kernel_sigismember(&info->app_sigblocked, i))
                        ATOMIC_INC(int, info->sighand->threads_unmasked[i]);
                    kernel_sigdelset(&info->app_sigblocked, i);
                    if (execute_syscall)
                        kernel_sigdelset(app_set, i);
                }
            }
        } else if (how == SIG_SETMASK) {
            /* The set of blocked signals is set to the argument set. */
            clear_blocked(dcontext, info);
            for (i = 1; i <= MAX_SIGNUM; i++) {
                if (!EMULATE_SIGMASK(info, i))
                    continue;
                if (kernel_sigismember(&safe_set, i)) {
                    ATOMIC_DEC(int, info->sighand->threads_unmasked[i]);
                    kernel_sigaddset(&info->app_sigblocked, i);
                    if (execute_syscall)
                        kernel_sigdelset(app_set, i);
                }
            }
        }
#ifdef DEBUG
        if (d_r_stats->loglevel >= 3 && (d_r_stats->logmask & LOG_ASYNCH) != 0) {
            LOG(THREAD, LOG_ASYNCH, 3, "blocked signals are now:\n");
            dump_sigset(dcontext, &info->app_sigblocked);
        }
#endif
        /* make sure we deliver pending signals that are now unblocked
         * FIXME: consider signal #S, which we intercept ourselves.
         * If S arrives, then app blocks it prior to our delivering it,
         * we then won't deliver it until app unblocks it...is this a
         * problem?  Could have arrived a little later and then we would
         * do same thing, but this way kernel may send one more than would
         * get w/o dynamo?  This goes away if we deliver signals
         * prior to letting app do a syscall.
         */
        check_signals_pending(dcontext, info);
    }
    d_r_mutex_unlock(&info->sigblocked_lock);
    DOLOG(4, LOG_ASYNCH, dump_unmasked(dcontext, __FUNCTION__););
    if (!execute_syscall) {
        /* Emulated path: no syscall will follow, so run the post-syscall step
         * (writing oset) here and surface its status through error_code.
         */
        int status = handle_post_sigprocmask(dcontext, how, app_set, oset, sigsetsize);
        if (status != 0 && error_code != NULL) {
            *error_code = status;
        }
        return false; /* skip syscall */
    } else
        return true;
}

/* Post-syscall (or emulated) half of sigprocmask handling: restores the app's
 * input set if we modified it pre-syscall, and makes oset reflect the signals
 * that the app believes are blocked but that we track ourselves.
 *
 * dcontext:   context of the requesting thread.
 * how:        the sigprocmask "how" argument (unused here).
 * app_set:    app pointer to the requested mask, or NULL for a pure query.
 * oset:       app pointer that receives the old mask, or NULL.
 * sigsetsize: the sigsetsize syscall argument (unused here).
 *
 * Returns 0 on success.  Returns EFAULT (0 on Mac) if oset cannot be written.
 */
int
handle_post_sigprocmask(dcontext_t *dcontext, int how, kernel_sigset_t *app_set,
                        kernel_sigset_t *oset, size_t sigsetsize)
{
    thread_sig_info_t *info = (thread_sig_info_t *)dcontext->signal_field;
    int i;
    /* Only restore when there was an input set: for a NULL app_set nothing was
     * modified pre-syscall and pre_syscall_app_sigprocmask was not refreshed,
     * so writing it out would push stale data at address 0.
     */
    if (!DYNAMO_OPTION(intercept_all_signals) && app_set != NULL) {
        /* Restore app memory.
         * XXX i#1187: See comment in handle_sigprocmask about read-only
         * app_set. Ideally, we would replace app_set with our own copy
         * pre-syscall so that we only need to restore a pointer to the original
         * app copy post-syscall.
         * In case of a write failure below, we do not return an EFAULT as it would
         * be incorrect to expect app_set to be writeable. We try continuing and
         * return a clobbered app_set to the app. This is low priority as
         * -intercept_all_signals is true by default.
         */
        safe_write_ex(app_set, sizeof(*app_set), &info->pre_syscall_app_sigprocmask,
                      NULL);
    }
    if (oset != NULL) {
        /* Note that we check validity of oset post-syscall only, after the app's
         * blocked signal mask has been updated. If there's a fault writing oset,
         * the signal mask is not reverted. This is same as the native syscall
         * behavior, which is also non-atomic: for a bad oset, it returns EFAULT
         * but still updates the blocked signal mask.
         *
         * If we are unable to fix oset because it is not writeable, we return
         * EFAULT. However, on Mac systems, a bad oset does not cause EFAULT
         * natively, so we fail the write silently.
         */
        if (DYNAMO_OPTION(intercept_all_signals)) {
            /* Fully emulated: the old mask is exactly our pre-call snapshot. */
            if (!safe_write_ex(oset, sizeof(*oset), &info->pre_syscall_app_sigblocked,
                               NULL))
                return IF_MACOS_ELSE(0, EFAULT);
        } else {
            bool write_fault = true;
            TRY_EXCEPT(
                dcontext,
                {
                    /* the syscall wrote to oset already, so just add any additional */
                    for (i = 1; i <= MAX_SIGNUM; i++) {
                        if (EMULATE_SIGMASK(info, i) &&
                            /* use the pre-syscall value: do not take into account changes
                             * from this syscall itself! (PR 523394)
                             */
                            kernel_sigismember(&info->pre_syscall_app_sigblocked, i)) {
                            kernel_sigaddset(oset, i);
                        }
                    }
                    write_fault = false;
                },
                {
                    /* EXCEPT */
                    /* nothing: write_fault is already true */
                });
            if (write_fault)
                return IF_MACOS_ELSE(0, EFAULT);
        }
    }
    return 0;
}

/* Pre-syscall handling of sigsuspend: saves the current tracked app mask in
 * info->app_sigblocked_save, marks the thread as being in sigsuspend, and
 * installs "set" as the temporary tracked mask.  Every emulated signal present
 * in "set" is moved into info->app_sigblocked and removed from "set" itself.
 */
void
handle_sigsuspend(dcontext_t *dcontext, kernel_sigset_t *set, size_t sigsetsize)
{
    thread_sig_info_t *info = (thread_sig_info_t *)dcontext->signal_field;
    ASSERT(set != NULL);
    LOG(THREAD, LOG_ASYNCH, 2, "handle_sigsuspend\n");
    info->in_sigsuspend = true;
    info->app_sigblocked_save = info->app_sigblocked;
    d_r_mutex_lock(&info->sigblocked_lock);
    clear_blocked(dcontext, info);
    for (int sig = 1; sig <= MAX_SIGNUM; sig++) {
        if (!EMULATE_SIGMASK(info, sig))
            continue;
        if (!kernel_sigismember(set, sig))
            continue;
        ATOMIC_DEC(int, info->sighand->threads_unmasked[sig]);
        kernel_sigaddset(&info->app_sigblocked, sig);
        kernel_sigdelset(set, sig);
    }
#ifdef DEBUG
    if (d_r_stats->loglevel >= 3 && (d_r_stats->logmask & LOG_ASYNCH) != 0) {
        LOG(THREAD, LOG_ASYNCH, 3, "in sigsuspend, blocked signals are now:\n");
        dump_sigset(dcontext, &info->app_sigblocked);
    }
#endif
    d_r_mutex_unlock(&info->sigblocked_lock);
    DOLOG(4, LOG_ASYNCH, dump_unmasked(dcontext, __FUNCTION__););
}

/* Ends a sigsuspend that handle_sigsuspend() started: puts back the mask saved
 * in info->app_sigblocked_save, rebalances threads_unmasked for it, clears
 * info->in_sigsuspend, and stores the restored mask into ucxt->uc_sigmask.
 */
static void
terminate_sigsuspend(dcontext_t *dcontext, thread_sig_info_t *info,
                     kernel_ucontext_t *ucxt)
{
    int sig;
    ASSERT(info->in_sigsuspend);
    /* Sigsuspend ends when a signal is received, so restore the
     * old blocked set.
     */
    d_r_mutex_lock(&info->sigblocked_lock);
    clear_blocked(dcontext, info);
    info->app_sigblocked = info->app_sigblocked_save;
    for (sig = 1; sig <= MAX_SIGNUM; sig++) {
        if (!EMULATE_SIGMASK(info, sig))
            continue;
        if (!kernel_sigismember(&info->app_sigblocked, sig))
            continue;
        ATOMIC_DEC(int, info->sighand->threads_unmasked[sig]);
    }
    info->in_sigsuspend = false;
    /* Update the set to restore to post-signal-delivery. */
#ifdef MACOS
    ucxt->uc_sigmask = *(__darwin_sigset_t *)&info->app_sigblocked;
#else
    ucxt->uc_sigmask = info->app_sigblocked;
#endif
    DOLOG(3, LOG_ASYNCH, {
        LOG(THREAD, LOG_ASYNCH, 3, "after sigsuspend, blocked signals are now:\n");
        dump_sigset(dcontext, &info->app_sigblocked);
    });
    d_r_mutex_unlock(&info->sigblocked_lock);
    DOLOG(4, LOG_ASYNCH, dump_unmasked(dcontext, __FUNCTION__););
}

/**** utility routines ***********************************************/
#ifdef DEBUG
/* Logs one line for every signal number in [1, MAX_SIGNUM] that is a member
 * of "set".
 */
static void
dump_sigset(dcontext_t *dcontext, kernel_sigset_t *set)
{
    for (int signum = 1; signum <= MAX_SIGNUM; signum++) {
        if (!kernel_sigismember(set, signum))
            continue;
        LOG(THREAD, LOG_ASYNCH, 1, "\t%d = blocked\n", signum);
    }
}
#endif /* DEBUG */

/* PR 205795: to avoid lock problems w/ in_fcache (it grabs a lock, we
 * could have interrupted someone holding that), we first check
 * whereami --- if whereami is DR_WHERE_FCACHE we still check the pc
 * to distinguish generated routines, but at least we're certain
 * it's not in DR where it could own a lock.
 * We can't use is_on_dstack() here b/c we need to handle clean call
 * arg crashes -- which is too bad since checking client dll and DR dll is
 * not sufficient due to calls to ntdll, libc, or pc being in gencode.
 */
static bool
safe_is_in_fcache(dcontext_t *dcontext, app_pc pc, app_pc xsp)
{
    /* Anything other than DR_WHERE_FCACHE is rejected without looking at pc. */
    if (dcontext->whereami != DR_WHERE_FCACHE)
        return false;
    /* Rule out client library code, DR's own code, and the init stack. */
    if (is_in_client_lib(pc))
        return false;
    if (is_in_dynamo_dll(pc))
        return false;
    if (is_on_initstack(xsp))
        return false;
    /* Reasonably certain not in DR code, so no locks should be held */
    return in_fcache(pc);
}

/* Same pre-checks as safe_is_in_fcache(), but the final query is
 * in_coarse_stubs(pc).
 */
static bool
safe_is_in_coarse_stubs(dcontext_t *dcontext, app_pc pc, app_pc xsp)
{
    if (dcontext->whereami != DR_WHERE_FCACHE)
        return false;
    if (is_in_client_lib(pc))
        return false;
    if (is_in_dynamo_dll(pc))
        return false;
    if (is_on_initstack(xsp))
        return false;
    /* Reasonably certain not in DR code, so no locks should be held */
    return in_coarse_stubs(pc);
}

/* Returns whether sp lies within the thread's info->sigstack, bounds
 * inclusive at both ends.  Always false without HAVE_SIGALTSTACK.
 */
static bool
is_on_alt_stack(dcontext_t *dcontext, byte *sp)
{
#ifdef HAVE_SIGALTSTACK
    thread_sig_info_t *info = (thread_sig_info_t *)dcontext->signal_field;
    byte *stack_base = (byte *)info->sigstack.ss_sp;
    byte *stack_top = stack_base + info->sigstack.ss_size;
    if (sp < stack_base)
        return false;
    /* deliberate equality check since stacks often init to top */
    return sp <= stack_top;
#else
    return false;
#endif
}

/* Fills in sc_full from ucxt: sc_full->sc is set to the sigcontext_t obtained
 * via SIGCXT_FROM_UCXT(ucxt), and sc_full->fp_simd_state to an
 * architecture-specific location inside ucxt (NULL on x86).
 * The caller must initialize ucxt, including its fpstate pointer for x86 Linux.
 */
static void
sig_full_initialize(sig_full_cxt_t *sc_full, kernel_ucontext_t *ucxt)
{
    sc_full->sc = SIGCXT_FROM_UCXT(ucxt);
#ifdef X86
    sc_full->fp_simd_state = NULL; /* we have a ptr inside sigcontext_t */
#elif defined(ARM)
    sc_full->fp_simd_state = &ucxt->coproc.uc_vfp;
#elif defined(AARCH64)
    /* NOTE(review): presumably IF_MACOS64_ELSE picks uc_mcontext64->__ns on
     * 64-bit macOS and uc_mcontext.__reserved elsewhere -- confirm against the
     * macro definition.
     */
    sc_full->fp_simd_state =
        &ucxt->IF_MACOS64_ELSE(uc_mcontext64->__ns, uc_mcontext.__reserved);
#else
    /* Any other architecture leaves fp_simd_state unset here. */
    ASSERT_NOT_IMPLEMENTED(false);
#endif
}

/* Copies register state out of the signal context sc_full->sc into mc.
 * flags selects which groups are copied:
 *   DR_MC_INTEGER:    the general-purpose registers listed per architecture;
 *   DR_MC_CONTROL:    stack pointer, pc and the flags/status register
 *                     (plus fcsr on RISCV64);
 *   DR_MC_MULTIMEDIA: delegated to sigcontext_to_mcontext_simd().
 * Both mc and sc_full->sc must be non-NULL (asserted).
 */
void
sigcontext_to_mcontext(priv_mcontext_t *mc, sig_full_cxt_t *sc_full,
                       dr_mcontext_flags_t flags)
{
    sigcontext_t *sc = sc_full->sc;
    ASSERT(mc != NULL && sc != NULL);
#ifdef X86
    if (TEST(DR_MC_INTEGER, flags)) {
        mc->xax = sc->SC_XAX;
        mc->xbx = sc->SC_XBX;
        mc->xcx = sc->SC_XCX;
        mc->xdx = sc->SC_XDX;
        mc->xsi = sc->SC_XSI;
        mc->xdi = sc->SC_XDI;
        mc->xbp = sc->SC_XBP;
#    ifdef X64
        mc->r8 = sc->SC_FIELD(r8);
        mc->r9 = sc->SC_FIELD(r9);
        mc->r10 = sc->SC_FIELD(r10);
        mc->r11 = sc->SC_FIELD(r11);
        mc->r12 = sc->SC_FIELD(r12);
        mc->r13 = sc->SC_FIELD(r13);
        mc->r14 = sc->SC_FIELD(r14);
        mc->r15 = sc->SC_FIELD(r15);
#    endif /* X64 */
    }
    if (TEST(DR_MC_CONTROL, flags)) {
        mc->xsp = sc->SC_XSP;
        mc->xflags = sc->SC_XFLAGS;
        mc->pc = (app_pc)sc->SC_XIP;
    }
#elif defined(RISCV64)
    if (TEST(DR_MC_INTEGER, flags)) {
        mc->ra = sc->SC_FIELD(sc_regs.ra);
        mc->gp = sc->SC_FIELD(sc_regs.gp);
        mc->tp = sc->SC_FIELD(sc_regs.tp);
        mc->t0 = sc->SC_FIELD(sc_regs.t0);
        mc->t1 = sc->SC_FIELD(sc_regs.t1);
        mc->t2 = sc->SC_FIELD(sc_regs.t2);
        mc->s0 = sc->SC_FIELD(sc_regs.s0);
        mc->s1 = sc->SC_FIELD(sc_regs.s1);
        mc->a0 = sc->SC_FIELD(sc_regs.a0);
        mc->a1 = sc->SC_FIELD(sc_regs.a1);
        mc->a2 = sc->SC_FIELD(sc_regs.a2);
        mc->a3 = sc->SC_FIELD(sc_regs.a3);
        mc->a4 = sc->SC_FIELD(sc_regs.a4);
        mc->a5 = sc->SC_FIELD(sc_regs.a5);
        mc->a6 = sc->SC_FIELD(sc_regs.a6);
        mc->a7 = sc->SC_FIELD(sc_regs.a7);
        mc->s2 = sc->SC_FIELD(sc_regs.s2);
        mc->s3 = sc->SC_FIELD(sc_regs.s3);
        mc->s4 = sc->SC_FIELD(sc_regs.s4);
        mc->s5 = sc->SC_FIELD(sc_regs.s5);
        mc->s6 = sc->SC_FIELD(sc_regs.s6);
        mc->s7 = sc->SC_FIELD(sc_regs.s7);
        mc->s8 = sc->SC_FIELD(sc_regs.s8);
        mc->s9 = sc->SC_FIELD(sc_regs.s9);
        mc->s10 = sc->SC_FIELD(sc_regs.s10);
        mc->s11 = sc->SC_FIELD(sc_regs.s11);
        mc->t3 = sc->SC_FIELD(sc_regs.t3);
        mc->t4 = sc->SC_FIELD(sc_regs.t4);
        mc->t5 = sc->SC_FIELD(sc_regs.t5);
        mc->t6 = sc->SC_FIELD(sc_regs.t6);
    }
    if (TEST(DR_MC_CONTROL, flags)) {
        mc->pc = (void *)sc->SC_FIELD(sc_regs.pc);
        mc->sp = sc->SC_FIELD(sc_regs.sp);
        mc->fcsr = sc->SC_FIELD(sc_fpregs.f.fcsr);
    }
#elif defined(AARCH64)
    if (TEST(DR_MC_INTEGER, flags)) {
        /* Bulk copy of 31 register-sized slots starting at r0.
         * NOTE(review): assumes the 31 slots are laid out contiguously in both
         * structures -- confirm against priv_mcontext_t and sigcontext_t.
         */
#    ifdef MACOS
        memcpy(&mc->r0, &sc->SC_FIELD(x[0]), sizeof(mc->r0) * 31);
#    else
        memcpy(&mc->r0, &sc->SC_FIELD(regs[0]), sizeof(mc->r0) * 31);
#    endif
    }
    if (TEST(DR_MC_CONTROL, flags)) {
        /* XXX i#2710: the link register should be under DR_MC_CONTROL */
        mc->sp = sc->SC_FIELD(sp);
        mc->pc = (void *)sc->SC_FIELD(pc);
        mc->nzcv = sc->SC_XFLAGS;
    }
#elif defined(ARM)
    if (TEST(DR_MC_INTEGER, flags)) {
        mc->r0 = sc->SC_FIELD(arm_r0);
        mc->r1 = sc->SC_FIELD(arm_r1);
        mc->r2 = sc->SC_FIELD(arm_r2);
        mc->r3 = sc->SC_FIELD(arm_r3);
        mc->r4 = sc->SC_FIELD(arm_r4);
        mc->r5 = sc->SC_FIELD(arm_r5);
        mc->r6 = sc->SC_FIELD(arm_r6);
        mc->r7 = sc->SC_FIELD(arm_r7);
        mc->r8 = sc->SC_FIELD(arm_r8);
        mc->r9 = sc->SC_FIELD(arm_r9);
        mc->r10 = sc->SC_FIELD(arm_r10);
        mc->r11 = sc->SC_FIELD(arm_fp);
        mc->r12 = sc->SC_FIELD(arm_ip);
        /* XXX i#2710: the link register should be under DR_MC_CONTROL */
        mc->r14 = sc->SC_FIELD(arm_lr);
    }
    if (TEST(DR_MC_CONTROL, flags)) {
        mc->r13 = sc->SC_FIELD(arm_sp);
        mc->r15 = sc->SC_FIELD(arm_pc);
        mc->cpsr = sc->SC_FIELD(arm_cpsr);
    }
#    ifdef X64
#        error NYI on AArch64
#    endif /* X64 */
#endif     /* X86/RISCV64/AARCH64/ARM */
    /* SIMD/FP register state is converted by a separate routine. */
    if (TEST(DR_MC_MULTIMEDIA, flags))
        sigcontext_to_mcontext_simd(mc, sc_full);
}

/* Copies register state from mc into the signal context sc_full->sc; the
 * inverse of sigcontext_to_mcontext().  flags selects which groups are written:
 *   DR_MC_INTEGER:    the general-purpose registers listed per architecture;
 *   DR_MC_CONTROL:    stack pointer, pc and the flags/status register
 *                     (plus fcsr on RISCV64);
 *   DR_MC_MULTIMEDIA: delegated to mcontext_to_sigcontext_simd().
 * Both mc and sc_full->sc must be non-NULL (asserted).
 *
 * Note that unlike mcontext_to_context(), this routine does not fill in
 * any state that is not present in the mcontext: in particular, it assumes
 * the sigcontext already contains the native fpstate.  If the caller
 * is generating a synthetic sigcontext, the caller should call
 * save_fpstate() before calling this routine.
 */
/* XXX: on ARM, sigreturn needs the T bit set in the sigcontext_t cpsr field in
 * order to return to Thumb mode.  But, our mcontext doesn't have the T bit (b/c
 * usermode can't read it).  Thus callers must either modify an mcontext
 * obtained from sigcontext_to_mcontext() or must call set_pc_mode_in_cpsr() in
 * order to create a proper sigcontext for sigreturn.  All callers here do so.
 * The only external non-Windows caller of thread_set_mcontext() is
 * translate_from_synchall_to_dispatch() who first does a thread_get_mcontext();
 * thread_get_mcontext() sets the dcontext isa_mode from cpsr, and thread_set_mcontext()
 * sets it.
 */
void
mcontext_to_sigcontext(sig_full_cxt_t *sc_full, priv_mcontext_t *mc,
                       dr_mcontext_flags_t flags)
{
    sigcontext_t *sc = sc_full->sc;
    ASSERT(mc != NULL && sc != NULL);
#ifdef X86
    if (TEST(DR_MC_INTEGER, flags)) {
        sc->SC_XAX = mc->xax;
        sc->SC_XBX = mc->xbx;
        sc->SC_XCX = mc->xcx;
        sc->SC_XDX = mc->xdx;
        sc->SC_XSI = mc->xsi;
        sc->SC_XDI = mc->xdi;
        sc->SC_XBP = mc->xbp;
#    ifdef X64
        sc->SC_FIELD(r8) = mc->r8;
        sc->SC_FIELD(r9) = mc->r9;
        sc->SC_FIELD(r10) = mc->r10;
        sc->SC_FIELD(r11) = mc->r11;
        sc->SC_FIELD(r12) = mc->r12;
        sc->SC_FIELD(r13) = mc->r13;
        sc->SC_FIELD(r14) = mc->r14;
        sc->SC_FIELD(r15) = mc->r15;
#    endif /* X64 */
    }
    if (TEST(DR_MC_CONTROL, flags)) {
        sc->SC_XSP = mc->xsp;
        sc->SC_XFLAGS = mc->xflags;
        sc->SC_XIP = (ptr_uint_t)mc->pc;
    }
#elif defined(RISCV64)
    if (TEST(DR_MC_INTEGER, flags)) {
        sc->SC_FIELD(sc_regs.ra) = mc->ra;
        sc->SC_FIELD(sc_regs.gp) = mc->gp;
        sc->SC_FIELD(sc_regs.tp) = mc->tp;
        sc->SC_FIELD(sc_regs.t0) = mc->t0;
        sc->SC_FIELD(sc_regs.t1) = mc->t1;
        sc->SC_FIELD(sc_regs.t2) = mc->t2;
        sc->SC_FIELD(sc_regs.s0) = mc->s0;
        sc->SC_FIELD(sc_regs.s1) = mc->s1;
        sc->SC_FIELD(sc_regs.a0) = mc->a0;
        sc->SC_FIELD(sc_regs.a1) = mc->a1;
        sc->SC_FIELD(sc_regs.a2) = mc->a2;
        sc->SC_FIELD(sc_regs.a3) = mc->a3;
        sc->SC_FIELD(sc_regs.a4) = mc->a4;
        sc->SC_FIELD(sc_regs.a5) = mc->a5;
        sc->SC_FIELD(sc_regs.a6) = mc->a6;
        sc->SC_FIELD(sc_regs.a7) = mc->a7;
        sc->SC_FIELD(sc_regs.s2) = mc->s2;
        sc->SC_FIELD(sc_regs.s3) = mc->s3;
        sc->SC_FIELD(sc_regs.s4) = mc->s4;
        sc->SC_FIELD(sc_regs.s5) = mc->s5;
        sc->SC_FIELD(sc_regs.s6) = mc->s6;
        sc->SC_FIELD(sc_regs.s7) = mc->s7;
        sc->SC_FIELD(sc_regs.s8) = mc->s8;
        sc->SC_FIELD(sc_regs.s9) = mc->s9;
        sc->SC_FIELD(sc_regs.s10) = mc->s10;
        sc->SC_FIELD(sc_regs.s11) = mc->s11;
        sc->SC_FIELD(sc_regs.t3) = mc->t3;
        sc->SC_FIELD(sc_regs.t4) = mc->t4;
        sc->SC_FIELD(sc_regs.t5) = mc->t5;
        sc->SC_FIELD(sc_regs.t6) = mc->t6;
    }
    if (TEST(DR_MC_CONTROL, flags)) {
        sc->SC_FIELD(sc_regs.pc) = (ptr_uint_t)mc->pc;
        sc->SC_FIELD(sc_regs.sp) = mc->sp;
        sc->SC_FIELD(sc_fpregs.f.fcsr) = mc->fcsr;
    }
#elif defined(AARCH64)
    if (TEST(DR_MC_INTEGER, flags)) {
        /* Bulk copy of 31 register-sized slots starting at r0.
         * NOTE(review): assumes the 31 slots are laid out contiguously in both
         * structures -- confirm against priv_mcontext_t and sigcontext_t.
         */
#    ifdef MACOS
        memcpy(&sc->SC_FIELD(x[0]), &mc->r0, sizeof(mc->r0) * 31);
#    else
        memcpy(&sc->SC_FIELD(regs[0]), &mc->r0, sizeof(mc->r0) * 31);
#    endif
    }
    if (TEST(DR_MC_CONTROL, flags)) {
        /* XXX i#2710: the link register should be under DR_MC_CONTROL */
        sc->SC_FIELD(sp) = mc->sp;
        sc->SC_FIELD(pc) = (ptr_uint_t)mc->pc;
        sc->SC_XFLAGS = mc->nzcv;
    }
#elif defined(ARM)
    if (TEST(DR_MC_INTEGER, flags)) {
        sc->SC_FIELD(arm_r0) = mc->r0;
        sc->SC_FIELD(arm_r1) = mc->r1;
        sc->SC_FIELD(arm_r2) = mc->r2;
        sc->SC_FIELD(arm_r3) = mc->r3;
        sc->SC_FIELD(arm_r4) = mc->r4;
        sc->SC_FIELD(arm_r5) = mc->r5;
        sc->SC_FIELD(arm_r6) = mc->r6;
        sc->SC_FIELD(arm_r7) = mc->r7;
        sc->SC_FIELD(arm_r8) = mc->r8;
        sc->SC_FIELD(arm_r9) = mc->r9;
        sc->SC_FIELD(arm_r10) = mc->r10;
        sc->SC_FIELD(arm_fp) = mc->r11;
        sc->SC_FIELD(arm_ip) = mc->r12;
        /* XXX i#2710: the link register should be under DR_MC_CONTROL */
        sc->SC_FIELD(arm_lr) = mc->r14;
    }
    if (TEST(DR_MC_CONTROL, flags)) {
        sc->SC_FIELD(arm_sp) = mc->r13;
        sc->SC_FIELD(arm_pc) = mc->r15;
        sc->SC_FIELD(arm_cpsr) = mc->cpsr;
    }
#    ifdef X64
#        error NYI on AArch64
#    endif /* X64 */
#endif     /* X86/RISCV64/AARCH64/ARM */
    /* SIMD/FP register state is converted by a separate routine. */
    if (TEST(DR_MC_MULTIMEDIA, flags))
        mcontext_to_sigcontext_simd(sc_full, mc);
}

/* Converts the full register state (DR_MC_ALL) held in uc into mc. */
static void
ucontext_to_mcontext(priv_mcontext_t *mc, kernel_ucontext_t *uc)
{
    sig_full_cxt_t full_cxt;
    sig_full_initialize(&full_cxt, uc);
    sigcontext_to_mcontext(mc, &full_cxt, DR_MC_ALL);
}

/* Writes the full register state (DR_MC_ALL) held in mc into uc. */
static void
mcontext_to_ucontext(kernel_ucontext_t *uc, priv_mcontext_t *mc)
{
    sig_full_cxt_t full_cxt;
    sig_full_initialize(&full_cxt, uc);
    mcontext_to_sigcontext(&full_cxt, mc, DR_MC_ALL);
}

#ifdef AARCHXX
/* Stores val into the sigcontext slot that lies (dr_reg_stolen - DR_REG_R0)
 * register slots past SC_R0.
 */
static void
set_sigcxt_stolen_reg(sigcontext_t *sc, reg_t val)
{
    (&sc->SC_R0)[dr_reg_stolen - DR_REG_R0] = val;
}

/* Reads the sigcontext slot that lies (dr_reg_stolen - DR_REG_R0) register
 * slots past SC_R0.
 */
static reg_t
get_sigcxt_stolen_reg(sigcontext_t *sc)
{
    return (&sc->SC_R0)[dr_reg_stolen - DR_REG_R0];
}

#    ifndef AARCH64
/* Maps the EFLAGS_T bit of sc's flags field to an ISA mode: Thumb when set,
 * A32 otherwise.
 */
static dr_isa_mode_t
get_pc_mode_from_cpsr(sigcontext_t *sc)
{
    if (TEST(EFLAGS_T, sc->SC_XFLAGS))
        return DR_ISA_ARM_THUMB;
    return DR_ISA_ARM_A32;
}

/* Returns the ISA mode recorded in the flags field of sc_full's sigcontext. */
dr_isa_mode_t
get_sigcontext_isa_mode(sig_full_cxt_t *sc_full)
{
    sigcontext_t *sc = sc_full->sc;
    return get_pc_mode_from_cpsr(sc);
}

/* Sets the EFLAGS_T bit in sc's flags field for DR_ISA_ARM_THUMB and clears
 * it for any other mode.
 */
static void
set_pc_mode_in_cpsr(sigcontext_t *sc, dr_isa_mode_t isa_mode)
{
    if (isa_mode != DR_ISA_ARM_THUMB) {
        sc->SC_XFLAGS &= ~EFLAGS_T;
        return;
    }
    sc->SC_XFLAGS |= EFLAGS_T;
}

/* Records isa_mode in the flags field of sc_full's sigcontext. */
void
set_sigcontext_isa_mode(sig_full_cxt_t *sc_full, dr_isa_mode_t isa_mode)
{
    sigcontext_t *sc = sc_full->sc;
    set_pc_mode_in_cpsr(sc, isa_mode);
}
#    endif
#endif

/* Translates the machine state in uc in place via translate_mcontext(),
 * holding thread_initexit_lock for the duration.
 * Returns whether the translation succeeded.  On failure with avoid_failure
 * set, at least the pc in uc is rewritten: to the recreated app pc when the
 * current pc is in the code cache, else to 0.  Pass f if known.
 */
static bool
translate_sigcontext(dcontext_t *dcontext, kernel_ucontext_t *uc, bool avoid_failure,
                     fragment_t *f)
{
    priv_mcontext_t mc;
    sigcontext_t *sc = SIGCXT_FROM_UCXT(uc);
    bool translated = false;

    ucontext_to_mcontext(&mc, uc);
    /* FIXME: if cannot find exact match, we're in trouble!
     * probably ok to delay, since that indicates not a synchronous
     * signal.
     */
    /* FIXME : in_fcache() (called by recreate_app_state) grabs fcache
     * fcache_unit_areas.lock, we could deadlock! Also on initexit_lock
     * == PR 205795/1317
     */
    /* Safe recreation requires either being couldbelinking or holding the
     * initexit lock (so nobody flushes the current fragment); taking the
     * initexit lock is the easier of the two.
     */
    d_r_mutex_lock(&thread_initexit_lock);
    /* PR 214962: we assume we're going to relocate to this stored context,
     * so we restore memory now
     */
    if (translate_mcontext(dcontext->thread_record, &mc, true /*restore memory*/, f)) {
        mcontext_to_ucontext(uc, &mc);
        translated = true;
    } else if (avoid_failure) {
        ASSERT_NOT_REACHED(); /* Raise a visible debug error: sthg is wrong. */
        /* FIXME : what to do? reg state might be wrong at least get pc */
        if (!safe_is_in_fcache(dcontext, (cache_pc)sc->SC_XIP, (app_pc)sc->SC_XSP)) {
            /* FIXME : can't even get pc right, what do we do here? */
            sc->SC_XIP = 0;
        } else {
            sc->SC_XIP = (ptr_uint_t)recreate_app_pc(dcontext, mc.pc, f);
            ASSERT(sc->SC_XIP != (ptr_uint_t)NULL);
        }
    }
    d_r_mutex_unlock(&thread_initexit_lock);

    /* FIXME i#2095: restore the app's segment register value(s). */

    LOG(THREAD, LOG_ASYNCH, 3,
        "\ttranslate_sigcontext: just set frame's eip to " PFX "\n", sc->SC_XIP);
    return translated;
}

/* Transfers the current thread to the machine state described by cxt, which
 * is an os-specific context (treated here as a sigcontext_t *).
 *
 * On x86 with -no_use_sigreturn_setcontext the switch is done with
 * dr_longjmp().  Otherwise a signal frame is built on the local stack, the
 * stack pointer is aimed at it, and control jumps to dynamorio_sigreturn.
 * Does not return.
 */
void
thread_set_self_context(void *cxt)
{
#ifdef X86
    if (!INTERNAL_OPTION(use_sigreturn_setcontext)) {
        sigcontext_t *sc = (sigcontext_t *)cxt;
        dr_jmp_buf_t buf;
        buf.xbx = sc->SC_XBX;
        buf.xcx = sc->SC_XCX;
        buf.xdi = sc->SC_XDI;
        buf.xsi = sc->SC_XSI;
        buf.xbp = sc->SC_XBP;
        /* XXX: this is not fully transparent: it assumes the target stack
         * is valid and that we can clobber the slot beyond TOS.
         * Using this instead of sigreturn is meant mainly as a diagnostic
         * to help debug future issues with sigreturn (xref i#2080).
         */
        buf.xsp = sc->SC_XSP - XSP_SZ; /* extra slot for retaddr */
        buf.xip = sc->SC_XIP;
#    ifdef X64
        buf.r8 = sc->SC_R8;
        buf.r9 = sc->SC_R9;
        buf.r10 = sc->SC_R10;
        buf.r11 = sc->SC_R11;
        buf.r12 = sc->SC_R12;
        buf.r13 = sc->SC_R13;
        buf.r14 = sc->SC_R14;
        buf.r15 = sc->SC_R15;
#    endif
        dr_longjmp(&buf, sc->SC_XAX);
        return;
    }
#endif

    /* Unlike Windows we can't say "only set this subset of the
     * full machine state", so we need to get the rest of the state,
     */
    sigframe_rt_t frame;
#if defined(LINUX) || defined(DEBUG)
    sigcontext_t *sc = (sigcontext_t *)cxt;
#endif
    app_pc xsp_for_sigreturn;
#ifdef VMX86_SERVER
    ASSERT_NOT_IMPLEMENTED(false); /* PR 405694: can't use regular sigreturn! */
#endif
    memset(&frame, 0, sizeof(frame));
#if defined(X86)
    dcontext_t *dcontext = get_thread_private_dcontext();
#endif
#ifdef LINUX
#    ifdef X86
    byte *xstate = get_and_initialize_xstate_buffer(dcontext);
    frame.uc.uc_mcontext.fpstate = &((kernel_xstate_t *)xstate)->fpstate;
#    endif /* X86 */
    /* NOTE(review): this whole-struct copy also overwrites the fpstate pointer
     * assigned just above with whatever sc carries -- confirm that is intended.
     */
    frame.uc.uc_mcontext = *sc;
#endif
    IF_ARM(ASSERT_NOT_TESTED());
#if defined(X86)
    save_fpstate(dcontext, &frame);
#endif
    /* The kernel calls do_sigaltstack on sys_rt_sigreturn primarily to ensure
     * the frame is ok, but the side effect is we can mess up our own altstack
     * settings if we're not careful.  Having invalid ss_size looks good for
     * kernel 2.6.23.9 at least so we leave frame.uc.uc_stack as all zeros.
     */
    /* make sure sigreturn's mask setting doesn't change anything */
    sigprocmask_syscall(SIG_SETMASK, NULL, (kernel_sigset_t *)&frame.uc.uc_sigmask,
                        sizeof(frame.uc.uc_sigmask));
    LOG(THREAD_GET, LOG_ASYNCH, 2, "thread_set_self_context: pc=" PFX "\n", sc->SC_XIP);
    LOG(THREAD_GET, LOG_ASYNCH, 3, "full sigcontext\n");
    /* DOLOG takes (level, mask, statement), unlike LOG's (file, mask, level). */
    DOLOG(3, LOG_ASYNCH, {
        dcontext_t *dc = get_thread_private_dcontext();
        dump_sigcontext(dc, get_sigcontext_from_rt_frame(&frame));
    });
    /* set up xsp to point at &frame */
    /* For x86 only, we need to skip pretcode while setting xsp. */
    xsp_for_sigreturn = ((app_pc)&frame)IF_X86(+sizeof(char *));
#ifdef DR_HOST_NOT_TARGET
    ASSERT_NOT_REACHED();
#elif defined(X86)
    asm("mov  %0, %%" ASM_XSP : : "m"(xsp_for_sigreturn));
#    ifdef MACOS
    ASSERT_NOT_IMPLEMENTED(false && "need to pass 2 params to SYS_sigreturn");
    asm("jmp _dynamorio_sigreturn");
#    else
    /* i#2632: recent clang for 32-bit annoyingly won't do the right thing for
     * "jmp dynamorio_sigreturn" and leaves relocs so we ensure it's PIC:
     */
    void (*asm_jmp_tgt)() = dynamorio_sigreturn;
    asm("mov  %0, %%" ASM_XCX : : "m"(asm_jmp_tgt));
    asm("jmp  *%" ASM_XCX);
#    endif /* MACOS/LINUX */
#elif defined(AARCH64)
    asm("mov  " ASM_XSP ", %0" : : "r"(xsp_for_sigreturn));
#    ifdef MACOS
    asm("b    _dynamorio_sigreturn");
#    else
    asm("b    dynamorio_sigreturn");
#    endif
#elif defined(ARM)
    asm("ldr  " ASM_XSP ", %0" : : "m"(xsp_for_sigreturn));
    asm("b    dynamorio_sigreturn");
#endif /* X86/AARCH64/ARM */
    ASSERT_NOT_REACHED();
}

/* Reads the executing thread's current x86 segment registers and stores each
 * one into the matching field of sc.  When X86 is not defined the body is
 * empty.  cs, fs and gs are always stored; ss, ds and es are stored only
 * when X64 is not defined.
 */
static void
thread_set_segment_registers(sigcontext_t *sc)
{
#ifdef X86
    /* Fill in the segment registers.  Each statement first moves the
     * selector into %ax and then stores %ax to the memory operand, which is
     * why every statement declares "eax" as clobbered.
     */
    __asm__ __volatile__("mov %%cs, %%ax; mov %%ax, %0"
                         : "=m"(sc->SC_FIELD(cs))
                         :
                         : "eax");
#    ifndef X64
    /* These three are only filled in for non-X64 builds. */
    __asm__ __volatile__("mov %%ss, %%ax; mov %%ax, %0"
                         : "=m"(sc->SC_FIELD(ss))
                         :
                         : "eax");
    __asm__ __volatile__("mov %%ds, %%ax; mov %%ax, %0"
                         : "=m"(sc->SC_FIELD(ds))
                         :
                         : "eax");
    __asm__ __volatile__("mov %%es, %%ax; mov %%ax, %0"
                         : "=m"(sc->SC_FIELD(es))
                         :
                         : "eax");
#    endif
    __asm__ __volatile__("mov %%fs, %%ax; mov %%ax, %0"
                         : "=m"(sc->SC_FIELD(fs))
                         :
                         : "eax");
    __asm__ __volatile__("mov %%gs, %%ax; mov %%ax, %0"
                         : "=m"(sc->SC_FIELD(gs))
                         :
                         : "eax");
#endif
}

/* Makes the calling thread resume with the register state described by the
 * priv_mcontext_t mc.  The mcontext is converted into a local sigcontext,
 * which is then handed to thread_set_self_context(); this routine is not
 * expected to return.
 */
void
thread_set_self_mcontext(priv_mcontext_t *mc)
{
    kernel_ucontext_t scratch_uc;
    sig_full_cxt_t full_cxt;
    sig_full_initialize(&full_cxt, &scratch_uc);
#if defined(LINUX) && defined(X86)
    /* mcontext_to_sigcontext wants a NULL fpstate here. */
    full_cxt.sc->fpstate = NULL;
#endif
    mcontext_to_sigcontext(&full_cxt, mc, DR_MC_ALL);

    sigcontext_t *sc = full_cxt.sc;
    thread_set_segment_registers(sc);
    /* The mode used by sigreturn comes from cpsr. */
    IF_ARM(set_pc_mode_in_cpsr(sc, dr_get_isa_mode(get_thread_private_dcontext())));
    /* For x86 the real fp/simd state gets filled in by thread_set_self_context. */
    thread_set_self_context((void *)sc);
    ASSERT_NOT_REACHED();
}

#ifdef LINUX
/* Returns whether the app's registered action for signal sig should be
 * treated as having a signal restorer.
 *
 * The answer is true when SA_RESTORER is set in the recorded action flags.
 * Otherwise, on x86 and ARM, a non-NULL restorer pointer is still accepted if
 * the bytes it points at match the known sigreturn (or rt_sigreturn, for rt
 * frames) code sequence; that probe result is cached in
 * info->restorer_valid[sig] (-1 means "not yet probed").
 * Always returns false for VMX86_SERVER, and returns false on AArch64 and
 * RISCV64 whenever SA_RESTORER is not set.
 */
static bool
sig_has_restorer(thread_sig_info_t *info, int sig)
{
#    ifdef VMX86_SERVER
    /* vmkernel ignores SA_RESTORER (PR 405694) */
    return false;
#    endif
    if (info->sighand->action[sig] == NULL)
        return false;
    if (TEST(SA_RESTORER, info->sighand->action[sig]->flags))
        return true;
#    ifdef AARCH64
    /* In AArch64 either the app or the kernel defines a restorer, not glibc, contrary
     * to x86/ARM where glibc defines a restorer if the app did not define one. Thus,
     * reading info->sighand->action[sig]->restorer when SA_RESTORER is not specified by
     * the app was never an issue for x86/ARM, but for AArch64 if the SA_RESTORER is
     * not specified DR will read garbage leading to a seg fault later when
     * safe_reading the restorer. To avoid this issue return false early for AArch64 if
     * SA_RESTORER is not specified.
     */
    return false;
#    elif defined(RISCV64)
    /* FIXME i#3544: Not implemented */
    ASSERT_NOT_IMPLEMENTED(false);
    return false;
#    endif
    if (info->sighand->action[sig]->restorer == NULL)
        return false;
    /* we cache the result due to the safe_read cost */
    if (info->restorer_valid[sig] == -1) {
        /* With older kernels, don't seem to need flag: if sa_restorer !=
         * NULL kernel will use it.  But with newer kernels that's not
         * true, and sometimes libc does pass non-NULL.
         */
#    ifdef X86
        /* Signal restorer code for Ubuntu 7.04:
         *   0xffffe420 <__kernel_sigreturn+0>:      pop    %eax
         *   0xffffe421 <__kernel_sigreturn+1>:      mov    $0x77,%eax
         *   0xffffe426 <__kernel_sigreturn+6>:      int    $0x80
         *
         *   0xffffe440 <__kernel_rt_sigreturn+0>:   mov    $0xad,%eax
         *   0xffffe445 <__kernel_rt_sigreturn+5>:   int    $0x80
         */
        static const byte SIGRET_NONRT[8] = { 0x58, 0xb8, 0x77, 0x00,
                                              0x00, 0x00, 0xcd, 0x80 };
        /* The rt sequence is only 7 bytes long.  The array is deliberately left
         * unsized so that sizeof() is 7 and the memcmp below compares just the
         * instruction bytes: padding it to 8 would additionally require whatever
         * byte happens to follow "int $0x80" to be zero.
         */
        static const byte SIGRET_RT[] = { 0xb8, 0xad, 0x00, 0x00, 0x00, 0xcd, 0x80 };
#    elif defined(ARM)
        static const byte SIGRET_NONRT[8] = { 0x77, 0x70, 0xa0, 0xe3,
                                              0x00, 0x00, 0x00, 0xef };
        static const byte SIGRET_RT[8] = {
            0xad, 0x70, 0xa0, 0xe3, 0x00, 0x00, 0x00, 0xef
        };
#    elif defined(AARCH64)
        static const byte SIGRET_NONRT[8] = { 0 }; /* unused */
        static const byte SIGRET_RT[8] =
            /* FIXME i#1569: untested */
            /* mov w8, #139 ; svc #0 */
            { 0x68, 0x11, 0x80, 0x52, 0x01, 0x00, 0x00, 0xd4 };
#    elif defined(RISCV64)
        static const byte SIGRET_NONRT[8] = { 0 }; /* unused */
        static const byte SIGRET_RT[8] = { 0 };    /* unused */
        ;
#    endif
        /* Read enough bytes for the longer pattern, then compare only as many
         * bytes as the pattern that applies to this frame type contains.
         */
        byte buf[MAX(sizeof(SIGRET_NONRT), sizeof(SIGRET_RT))] = { 0 };
        if (d_r_safe_read(info->sighand->action[sig]->restorer, sizeof(buf), buf) &&
            ((IS_RT_FOR_APP(info, sig) &&
              memcmp(buf, SIGRET_RT, sizeof(SIGRET_RT)) == 0) ||
             (!IS_RT_FOR_APP(info, sig) &&
              memcmp(buf, SIGRET_NONRT, sizeof(SIGRET_NONRT)) == 0))) {
            LOG(THREAD_GET, LOG_ASYNCH, 2,
                "sig_has_restorer %d: " PFX " looks like restorer, using w/o flag\n", sig,
                info->sighand->action[sig]->restorer);
            info->restorer_valid[sig] = 1;
        } else
            info->restorer_valid[sig] = 0;
    }
    return (info->restorer_valid[sig] == 1);
}
#endif

/* Gives the size in bytes of the signal frame that is delivered to the app
 * for signal sig: sizeof(sigframe_rt_t) when the app gets an rt frame, and
 * (Linux only) sizeof(sigframe_plain_t) otherwise.
 * For x64 this does NOT include kernel_fpstate_t.
 */
static uint
get_app_frame_size(thread_sig_info_t *info, int sig)
{
#ifdef LINUX
    return IS_RT_FOR_APP(info, sig) ? sizeof(sigframe_rt_t) : sizeof(sigframe_plain_t);
#else
    if (IS_RT_FOR_APP(info, sig))
        return sizeof(sigframe_rt_t);
#endif
}

/* Locates the kernel_ucontext_t that belongs to the rt signal frame.
 * For 32-bit MacOS the frame's puc pointer is followed; everywhere else the
 * ucontext embedded in the frame itself is used.
 */
static kernel_ucontext_t *
get_ucontext_from_rt_frame(sigframe_rt_t *frame)
{
    kernel_ucontext_t *ucxt;
#if defined(MACOS) && !defined(X64)
    /* Because of padding, touching uc directly on a kernel-built frame is unsafe. */
    ucxt = frame->puc;
#else
    ucxt = &frame->uc;
#endif
    return ucxt;
}

/* Gives the sigcontext_t that goes with the rt signal frame, found via the
 * frame's ucontext.
 */
sigcontext_t *
get_sigcontext_from_rt_frame(sigframe_rt_t *frame)
{
    kernel_ucontext_t *ucxt = get_ucontext_from_rt_frame(frame);
    return SIGCXT_FROM_UCXT(ucxt);
}

/* Gives the sigcontext_t inside an app-format signal frame.  The frame is
 * interpreted as an rt frame when the app receives rt frames for sig, and
 * (Linux only) as a plain frame otherwise.  The result stays NULL when no
 * case applies.
 */
static sigcontext_t *
get_sigcontext_from_app_frame(thread_sig_info_t *info, int sig, void *frame)
{
    /* The explicit NULL start value keeps Mac clang satisfied. */
    sigcontext_t *app_sc = NULL;
    if (IS_RT_FOR_APP(info, sig)) {
        app_sc = get_sigcontext_from_rt_frame((sigframe_rt_t *)frame);
        return app_sc;
    }
#ifdef LINUX
#    ifdef X86
    sigframe_plain_t *plain = (sigframe_plain_t *)frame;
    app_sc = (sigcontext_t *)&plain->sc;
#    elif defined(ARM)
    sigframe_plain_t *plain = (sigframe_plain_t *)frame;
    app_sc = SIGCXT_FROM_UCXT(&plain->uc);
#    else
    ASSERT_NOT_REACHED();
#    endif
#endif
    return app_sc;
}

/* Gives the sigcontext_t of the rt frame stored in the pending slot for sig.
 * The pending slot must be non-NULL.
 */
static sigcontext_t *
get_sigcontext_from_pending(thread_sig_info_t *info, int sig)
{
    ASSERT(info->sigpending[sig] != NULL);
    sigframe_rt_t *queued_frame = &info->sigpending[sig]->rt_frame;
    return get_sigcontext_from_rt_frame(queued_frame);
}

/* Returns the address on the appropriate signal stack where we should copy
 * the frame.
 * If frame is NULL, assumes signal happened while in DR and has been delayed,
 * and thus we need to provide fpstate regardless of whether the original
 * had it.  If frame is non-NULL, matches frame's amount of fpstate.
 * dcontext can be NULL if frame is non-NULL.
 *
 * The computation proceeds in these steps:
 *  1) Pick the starting stack pointer: the interrupted xsp from the frame's
 *     sigcontext, or the xsp stored in dcontext's mcontext when frame is NULL.
 *  2) If the app's alternate signal stack applies to this handler and the
 *     starting pointer is not already inside it, switch to the top of it.
 *  3) If frame is non-NULL and the starting pointer lies within the extent of
 *     that frame, move below the frame (and below an estimate of our own
 *     stack usage).
 *  4) Subtract the app frame size, any x86 Linux extra fpstate space and the
 *     redzone, align down to 16 bytes and, for x86, subtract one more slot to
 *     model the return address.
 */
static byte *
get_sigstack_frame_ptr(dcontext_t *dcontext, thread_sig_info_t *info, int sig,
                       sigframe_rt_t *frame)
{
    ASSERT(dcontext != NULL || frame != NULL);
    /* With no frame supplied, the context comes from the pending slot for sig. */
    sigcontext_t *sc = (frame == NULL) ? get_sigcontext_from_pending(info, sig)
                                       : get_sigcontext_from_rt_frame(frame);
    byte *sp;

    if (frame != NULL) {
        /* Signal happened natively or while in the cache: grab interrupted xsp. */
        sp = (byte *)sc->SC_XSP;
        LOG(THREAD, LOG_ASYNCH, 3, "get_sigstack_frame_ptr: using frame's xsp " PFX "\n",
            sp);
    } else {
        /* signal happened while in DR, use stored xsp */
        sp = (byte *)get_mcontext(dcontext)->xsp;
        LOG(THREAD, LOG_ASYNCH, 3, "get_sigstack_frame_ptr: using app xsp " PFX "\n", sp);
    }

    if (USE_APP_SIGSTACK(info, sig)) {
        /* app has own signal stack which is enabled for this handler */
        LOG(THREAD, LOG_ASYNCH, 3, "get_sigstack_frame_ptr: app has own stack " PFX "\n",
            info->app_sigstack.ss_sp);
        LOG(THREAD, LOG_ASYNCH, 3, "\tcur sp=" PFX " vs app stack " PFX "-" PFX "\n", sp,
            info->app_sigstack.ss_sp,
            info->app_sigstack.ss_sp + info->app_sigstack.ss_size);
        /* "Inside" here means strictly above ss_sp and less than ss_size bytes
         * past it.
         */
        if (sp > (byte *)info->app_sigstack.ss_sp &&
            sp - (byte *)info->app_sigstack.ss_sp < info->app_sigstack.ss_size) {
            /* we're currently in the alt stack, so use current xsp */
            LOG(THREAD, LOG_ASYNCH, 3,
                "\tinside alt stack, so using current xsp " PFX "\n", sp);
        } else {
            /* need to go to top, stack grows down */
            sp = info->app_sigstack.ss_sp + info->app_sigstack.ss_size;
            LOG(THREAD, LOG_ASYNCH, 3,
                "\tnot inside alt stack, so using base xsp " PFX "\n", sp);
        }
    }

    if (frame != NULL) {
        /* Handle DR's frame already being on the app stack.  For native delivery we
         * could try to re-use this frame, but that doesn't work with plain vs rt.
         * Instead we move below and live with the downsides of a potential stack
         * overflow.
         */
        /* Upper bound on the bytes occupied starting at frame: the rt frame, the
         * redzone, the extra fpstate space on x86 Linux when the frame carries
         * fpstate, and 64 bytes for alignment and slop.
         */
        size_t frame_sz_max = sizeof(sigframe_rt_t) + REDZONE_SIZE +
            IF_LINUX(IF_X86((sc->fpstate == NULL ? 0
                                                 : signal_frame_extra_size(
                                                       true)) +)) 64 /* align + slop */;
        if (sp > (byte *)frame && sp < (byte *)frame + frame_sz_max) {
            sp = (byte *)frame;
            /* We're probably *on* the stack right where we want to put the frame! */
            byte *cur_sp;
            GET_STACK_PTR(cur_sp);
            /* We have to copy the frame below our own stack usage high watermark
             * in the rest of the code we'll execute from execute_native_handler().
             * Kind of a mess.  For now we estimate two pages which should be plenty
             * and still not be egregious usage for most app stacks.  For the altstack
             * we check the size below.
             */
#define EXECUTE_NATIVE_STACK_USAGE (4096 * 2)
            if (sp > cur_sp && sp - frame_sz_max - EXECUTE_NATIVE_STACK_USAGE < cur_sp) {
                sp = cur_sp - frame_sz_max - EXECUTE_NATIVE_STACK_USAGE;
            }
            if (USE_APP_SIGSTACK(info, sig)) {
                /* We avoid SIGSTKSZ as it now calls sysconf in libc. */
#define APP_ALTSTACK_USAGE (4096)
                if (sp - APP_ALTSTACK_USAGE < (byte *)info->app_sigstack.ss_sp) {
                    /* There's not enough stack space.  The only solution would be to
                     * re-use DR's frame and limit our own stack usage here, which
                     * gets complex.  We bail.
                     */
                    report_unhandleable_signal_and_exit(
                        sig, "sigaltstack too small in native thread");
                    ASSERT_NOT_REACHED();
                }
            }
            LOG(THREAD, LOG_ASYNCH, 3,
                "get_sigstack_frame_ptr: moving beyond same-stack frame to %p\n", sp);
        }
    }

    /* now get frame pointer: need to go down to first field of frame */
    sp -= get_app_frame_size(info, sig);
#if defined(LINUX) && defined(X86)
    if (frame == NULL) {
        /* XXX i#641: we always include space for full xstate,
         * even if we don't use it all, which does not match what the
         * kernel does, but we're not tracking app actions to know whether
         * we can skip lazy fpstate on the delay
         */
        sp -= signal_frame_extra_size(true);
    } else {
        if (sc->fpstate != NULL) {
            /* The kernel doesn't seem to lazily include avx, so we don't either,
             * which simplifies all our frame copying: if YMM_ENABLED() and the
             * fpstate pointer is non-NULL, then we assume there's space for
             * full xstate
             */
            sp -= signal_frame_extra_size(true);
            DOCHECK(1, {
                ASSERT(YMM_ENABLED() || !ZMM_ENABLED());
                if (YMM_ENABLED()) {
                    ASSERT_CURIOSITY(sc->fpstate->sw_reserved.magic1 == FP_XSTATE_MAGIC1);
                    ASSERT(sc->fpstate->sw_reserved.extended_size <=
                           signal_frame_extra_size(true));
                }
            });
        }
    }
#endif /* LINUX && X86 */
    /* PR 369907: don't forget the redzone */
    sp -= REDZONE_SIZE;

    /* Align to 16-bytes.  The kernel does this for both 32 and 64-bit code
     * these days, so we do as well.
     */
    sp = (byte *)ALIGN_BACKWARD(sp, 16);
    IF_X86(sp -= sizeof(reg_t)); /* Model retaddr. */

    LOG(THREAD, LOG_ASYNCH, 3, "\tplacing frame at " PFX "\n", sp);
    return sp;
}

#if defined(LINUX) && !defined(X64)
/* Stores the rt-style signal mask sigmask into the plain frame f_plain:
 * word 0 goes into the frame's oldmask field and the remaining
 * _NSIG_WORDS - 1 words are copied into the frame's extra mask storage.
 */
static void
convert_rt_mask_to_nonrt(sigframe_plain_t *f_plain, kernel_sigset_t *sigmask)
{
    /* Byte count of every mask word after the first. */
    const size_t upper_words_size = (_NSIG_WORDS - 1) * sizeof(uint);
#    ifdef X86
    f_plain->sc.oldmask = sigmask->sig[0];
    memcpy(&f_plain->extramask, &sigmask->sig[1], upper_words_size);
#    elif defined(ARM)
    f_plain->uc.uc_mcontext.oldmask = sigmask->sig[0];
    memcpy(&f_plain->uc.sigset_ex, &sigmask->sig[1], upper_words_size);
#    else
#        error NYI
#    endif
}

/* Builds the plain (non-rt) frame f_new out of the rt frame f_old.
 * f_new_alloc_size is the number of bytes available starting at f_new; on
 * x86, when f_old carries fpstate, the caller must have left enough room
 * after the plain frame for a suitably aligned copy of it.
 */
static void
convert_frame_to_nonrt(dcontext_t *dcontext, int sig, sigframe_rt_t *f_old,
                       sigframe_plain_t *f_new, size_t f_new_alloc_size)
{
#    ifdef X86
    sigcontext_t *rt_sc = get_sigcontext_from_rt_frame(f_old);
    f_new->pretcode = f_old->pretcode;
    f_new->sig = f_old->sig;
    memcpy(&f_new->sc, rt_sc, sizeof(*rt_sc));
    if (rt_sc->fpstate != NULL) {
        /* The caller is responsible for reserving fpstate room past the frame. */
        const uint xsave_off = offsetof(kernel_fpstate_t, _fxsr_env);
        byte *plain_end = (byte *)f_new + sizeof(*f_new);
        /* i#5079: the xsave area begins at _fxsr_env within kernel_fpstate_t and
         * the kernel insists on it being 64-byte aligned, so we align that
         * field's address and then step back to the start of the struct.
         */
        byte *fp_dst =
            (byte *)ALIGN_FORWARD(plain_end + xsave_off, XSTATE_ALIGNMENT) - xsave_off;
        const size_t fp_size = signal_frame_extra_size(false);
        ASSERT(fp_dst + fp_size <= (byte *)f_new + f_new_alloc_size);
        memcpy(fp_dst, rt_sc->fpstate, fp_size);
        f_new->sc.fpstate = (kernel_fpstate_t *)fp_dst;
    }
    convert_rt_mask_to_nonrt(f_new, &f_old->uc.uc_sigmask);
    memcpy(&f_new->retcode, &f_old->retcode, RETCODE_SIZE);
    /* Finally set the additional field that we maintain ourselves. */
    f_new->sig_noclobber = f_new->sig;
#    elif defined(ARM)
    memcpy(&f_new->uc, &f_old->uc, sizeof(f_new->uc));
    memcpy(f_new->retcode, f_old->retcode, sizeof(f_new->retcode));
    /* Finally set the additional field that we maintain ourselves. */
    f_new->sig_noclobber = f_old->info.si_signo;
#    endif /* X86 */
    LOG(THREAD, LOG_ASYNCH, 3, "\tconverted sig=%d rt frame to non-rt frame\n",
        f_new->sig_noclobber);
}
#endif

/* For the rt signal frame f_old that was copied to f_new, updates
 * the intra-frame absolute pointers to point to the new addresses
 * in f_new.
 * Only updates the pretcode to the stored app restorer if for_app.
 * dcontext can be NULL (which is why we take in info).
 * f_new_alloc_size is the number of bytes available starting at f_new and is
 * used to check that a relocated fpstate fits.
 *
 * On x86 Linux, when f_old has a non-NULL fpstate pointer, this routine also
 * copies the fpstate currently referenced by f_new to an aligned location
 * just past the frame in f_new and points f_new at that copy.
 * On 32-bit MacOS it re-points pinfo, puc and puc->uc_mcontext into f_new.
 * For every other configuration the body does nothing beyond the assert on
 * info.
 *
 * NOTE(review): an earlier version of this comment described the routine as
 * exported for main_signal_handler's asm routine, but it is declared static
 * here.
 */
static void
fixup_rtframe_pointers(dcontext_t *dcontext, thread_sig_info_t *info, int sig,
                       sigframe_rt_t *f_old, sigframe_rt_t *f_new, bool for_app,
                       size_t f_new_alloc_size)
{
    ASSERT(info != NULL);
#if defined(X86) && defined(LINUX)
    bool has_restorer = sig_has_restorer(info, sig);
#    ifdef DEBUG
    /* Log level used for the messages below. */
    uint level = 3;
#        if !defined(HAVE_MEMINFO)
    /* avoid logging every single TRY probe fault */
    if (!dynamo_initialized)
        level = 5;
#        endif
#    endif

    if (has_restorer && for_app)
        f_new->pretcode = (char *)info->sighand->action[sig]->restorer;
    else {
#    ifdef VMX86_SERVER
        /* PR 404712: skip kernel's restorer code */
        if (for_app)
            f_new->pretcode = (char *)dynamorio_sigreturn;
#    else
#        ifdef X64
        ASSERT(!for_app || doing_detach); /* detach uses a frame to go native */
#        else
        /* only point at retcode if old one was -- with newer OS, points at
         * vsyscall page and there is no restorer, yet stack restorer code left
         * there for gdb compatibility
         */
        if (f_old->pretcode == f_old->retcode)
            f_new->pretcode = f_new->retcode;
        /* else, pointing at vsyscall, or we set it to dynamorio_sigreturn in
         * main_signal_handler
         */
        LOG(THREAD, LOG_ASYNCH, level, "\tleaving pretcode with old value\n");
#        endif
#    endif
    }
#    ifndef X64
    /* Non-X64 rt frames carry explicit pointers to their own siginfo and ucontext. */
    f_new->pinfo = &(f_new->info);
    f_new->puc = &(f_new->uc);
#    endif
    if (f_old->uc.uc_mcontext.fpstate != NULL) {
        uint frame_size = get_app_frame_size(info, sig);
        byte *frame_end = ((byte *)f_new) + frame_size;
#    ifdef X64
        byte *tgt = (byte *)ALIGN_FORWARD(frame_end, XSTATE_ALIGNMENT);
#    else
        uint offset_fpstate_xsave = offsetof(kernel_fpstate_t, _fxsr_env);
        /* i#5079: The kernel requires the xsave area to be 64-byte aligned, which
         * starts at _fxsr_env in kernel_fpstate_t.
         */
        byte *tgt =
            (byte *)ALIGN_FORWARD(frame_end + offset_fpstate_xsave, XSTATE_ALIGNMENT) -
            offset_fpstate_xsave;
#    endif
        size_t extra_size = signal_frame_extra_size(false);
        ASSERT(extra_size == f_new->uc.uc_mcontext.fpstate->sw_reserved.extended_size);
        /* XXX: It may be better to move this into memcpy_rt_frame (or another
         * routine, since copy_frame_to_pending() wants them separate), since
         * fixup_rtframe_pointers() was originally meant to just update a few
         * pointers and not do a big memcpy.  Compare to MACOS which calls it in
         * copy_frame_to_pending(): if Linux did that we'd have memory clobbering.
         */
        ASSERT(tgt + extra_size <= (byte *)f_new + f_new_alloc_size);
        /* The source is whatever f_new's fpstate pointer currently references;
         * only after the copy is the pointer switched to the new location.
         */
        memcpy(tgt, f_new->uc.uc_mcontext.fpstate, extra_size);
        f_new->uc.uc_mcontext.fpstate = (kernel_fpstate_t *)tgt;
        LOG(THREAD, LOG_ASYNCH, level + 1, "\tfpstate old=" PFX " new=" PFX "\n",
            f_old->uc.uc_mcontext.fpstate, f_new->uc.uc_mcontext.fpstate);
        /* Be sure we're keeping both magic words.  Failure to do so causes the
         * kernel to only restore x87 and SSE state and zero out all AVX+ state
         * (i#3812).
         */
        ASSERT(f_new->uc.uc_mcontext.fpstate->sw_reserved.magic1 ==
               f_old->uc.uc_mcontext.fpstate->sw_reserved.magic1);
        /* Either there is no xstate magic and the size is exactly that of
         * kernel_fpstate_t, or magic1 is set and magic2 sits in the last
         * FP_XSTATE_MAGIC2_SIZE bytes of the extended area.
         */
        ASSERT((f_new->uc.uc_mcontext.fpstate->sw_reserved.magic1 == 0 &&
                f_new->uc.uc_mcontext.fpstate->sw_reserved.extended_size ==
                    sizeof(kernel_fpstate_t)) ||
               (f_new->uc.uc_mcontext.fpstate->sw_reserved.magic1 == FP_XSTATE_MAGIC1 &&
                *(int *)((byte *)f_new->uc.uc_mcontext.fpstate +
                         f_new->uc.uc_mcontext.fpstate->sw_reserved.extended_size -
                         FP_XSTATE_MAGIC2_SIZE) == FP_XSTATE_MAGIC2));
    } else {
        /* if fpstate is not set up, we're delivering signal immediately,
         * and we shouldn't need an fpstate since DR code won't modify it;
         * only if we delayed will we need it, and when delaying we make
         * room and set up the pointer in copy_frame_to_pending.
         * xref i#641.
         */
        LOG(THREAD, LOG_ASYNCH, level + 1, "\tno fpstate needed\n");
    }
    LOG(THREAD, LOG_ASYNCH, level, "\tretaddr = " PFX "\n", f_new->pretcode);
#    ifdef RETURN_AFTER_CALL
    info->signal_restorer_retaddr = (app_pc)f_new->pretcode;
#    endif
    /* 32-bit kernel copies to aligned buf first */
    IF_X64(ASSERT(ALIGNED(f_new->uc.uc_mcontext.fpstate, 16)));
#elif defined(MACOS)
#    ifdef X64
    /* Nothing to do. */
#    else
    /* Re-point the frame's self-referencing pointers at the copy. */
    f_new->pinfo = &(f_new->info);
    f_new->puc = &(f_new->uc);
    f_new->puc->uc_mcontext =
        (IF_X64_ELSE(_STRUCT_MCONTEXT64, _STRUCT_MCONTEXT32) *)&f_new->mc;
    LOG(THREAD, LOG_ASYNCH, 3, "\tf_new=" PFX ", &handler=" PFX "\n", f_new,
        &f_new->handler);
    ASSERT(!for_app || ALIGNED(&f_new->handler, 16));
#    endif
#endif /* X86 && LINUX */
}

/* Rewrites the siginfo's si_addr in an rt frame to the pc found in the
 * frame's sigcontext, for those signals where si_addr holds a pc (SIGILL,
 * SIGTRAP and SIGFPE); other signals are left alone.
 * Works on rt frames only, so it has to run before any conversion to a plain
 * frame, and it has to run *after* the sigcontext has been translated.
 */
static void
fixup_siginfo(dcontext_t *dcontext, int sig, sigframe_rt_t *frame)
{
    switch (sig) {
    case SIGILL:
    case SIGTRAP:
    case SIGFPE:
        /* si_addr is a pc for these: fall out of the switch to translate it. */
        break;
    default:
        /* No pc in si_addr, so there is nothing to fix. */
        return;
    }
    kernel_siginfo_t *si = SIGINFO_FROM_RT_FRAME(frame);
    sigcontext_t *xl8_sc = get_sigcontext_from_rt_frame(frame);
    LOG(THREAD, LOG_ASYNCH, 3, "%s: updating si_addr from " PFX " to " PFX "\n",
        __FUNCTION__, si->si_addr, xl8_sc->SC_XIP);
    si->si_addr = (void *)xl8_sc->SC_XIP;
#ifdef LINUX
    si->si_addr_lsb = xl8_sc->SC_XIP & 0x1;
#endif
}

/* Copies the rt signal frame at frame to dst.  Exactly sizeof(sigframe_rt_t)
 * bytes are written.  from_pending only matters for 32-bit MacOS, where a
 * frame that is not coming from the pending slot is compacted while it is
 * copied.
 */
static void
memcpy_rt_frame(sigframe_rt_t *frame, byte *dst, bool from_pending)
{
#if defined(MACOS) && !defined(X64)
    if (!from_pending) {
        /* The kernel's layout has padding in the middle of the frame, which we
         * squeeze out here; alignment is re-established when the frame is later
         * copied to the app stack.  Since our sigframe_rt_t does not model that
         * variable padding, nothing from mc onward may be read through the
         * struct on a kernel-built frame: we go through puc instead.
         */
        sigframe_rt_t *out = (sigframe_rt_t *)dst;
        sigcontext_t *kernel_sc = SIGCXT_FROM_UCXT(frame->puc);
        size_t head_size = offsetof(sigframe_rt_t, puc) + sizeof(frame->puc);
        size_t tail_size = sizeof(sigframe_rt_t) - offsetof(sigframe_rt_t, mc);
        memcpy(dst, frame, head_size);
        memcpy(&out->mc, kernel_sc, tail_size);
        return;
    }
#endif
    /* Source and destination (each including extra fpstate room) must be disjoint. */
    ASSERT((byte *)frame + sizeof(*frame) + signal_frame_extra_size(true) <= dst ||
           dst + sizeof(*frame) + signal_frame_extra_size(true) <= (byte *)frame);
    memcpy(dst, frame, sizeof(*frame));
}

/* Copies frame to sp.
 * PR 304708: we now leave in rt form right up until we copy to the
 * app stack, so that we can deliver to a client at a safe spot
 * in rt form, so this routine now converts to a plain frame if necessary.
 * If no restorer, touches up pretcode
 * (and if rt_frame, touches up pinfo and puc)
 * Also touches up fpstate pointer
 * dcontext can be NULL (which is why we take in info) in which case no check
 * versus executable memory is performed (we assume this is a native frame).
 *
 * Parameters:
 *   dcontext     - thread context, or NULL for a native frame.
 *   info         - signal state of the thread.
 *   sig          - signal being delivered.
 *   frame        - source rt frame.
 *   sp           - destination address for the app-format frame.
 *   from_pending - passed through to memcpy_rt_frame().
 *
 * If the destination turns out to be unwritable (only detectable when
 * dcontext is non-NULL), the pending signal is freed and an exception is
 * forged instead; that path does not return.
 * After the copy, the saved signal mask in the new frame is set to
 * info->pre_syscall_app_sigprocmask when that is marked valid (the valid flag
 * is then cleared), and to info->app_sigblocked otherwise.
 */
static void
copy_frame_to_stack(dcontext_t *dcontext, thread_sig_info_t *info, int sig,
                    sigframe_rt_t *frame, byte *sp, bool from_pending)
{
    bool rtframe = IS_RT_FOR_APP(info, sig);
    uint frame_size = get_app_frame_size(info, sig);
#if defined(LINUX) && defined(X86_32)
    bool has_restorer = sig_has_restorer(info, sig);
#endif
    byte *flush_pc;
    bool stack_unwritable = false;
    /* Total bytes written at sp: the frame plus, on x86 Linux, fpstate room. */
    uint size = frame_size;
#if defined(LINUX) && defined(X86)
    sigcontext_t *sc = get_sigcontext_from_rt_frame(frame);
    size += (sc->fpstate == NULL ? 0 : signal_frame_extra_size(true));
#endif /* LINUX && X86 */

    LOG(THREAD, LOG_ASYNCH, 3, "copy_frame_to_stack: rt=%d, src=" PFX ", sp=" PFX "\n",
        rtframe, frame, sp);
    fixup_siginfo(dcontext, sig, frame);

    /* We avoid querying memory as it incurs global contended locks. */
    /* flush_pc is a pointer, so "no overlap to flush" is expressed as NULL;
     * the executable-area check is skipped entirely for a NULL dcontext.
     */
    flush_pc = NULL;
    if (dcontext != NULL)
        flush_pc = is_executable_area_writable_overlap(sp, sp + size);
    if (flush_pc != NULL) {
        LOG(THREAD, LOG_ASYNCH, 2,
            "\tcopy_frame_to_stack: part of stack is unwritable-by-us @" PFX "\n",
            flush_pc);
        flush_fragments_and_remove_region(dcontext, flush_pc, sp + size - flush_pc,
                                          false /* don't own initexit_lock */,
                                          false /* keep futures */);
    }
    ASSERT(!rtframe || frame_size == sizeof(*frame));
    if (dcontext == NULL) {
        /* We have no safe-read mechanism so we do the best we can: blindly copy. */
        if (rtframe)
            memcpy_rt_frame(frame, sp, from_pending);
        IF_NOT_X64(IF_LINUX(else convert_frame_to_nonrt(dcontext, sig, frame,
                                                        (sigframe_plain_t *)sp, size);));
    } else {
        /* Same copy as above, but a fault while writing is caught and recorded. */
        TRY_EXCEPT(
            dcontext, /* try */
            {
                if (rtframe)
                    memcpy_rt_frame(frame, sp, from_pending);
                IF_NOT_X64(
                    IF_LINUX(else convert_frame_to_nonrt(dcontext, sig, frame,
                                                         (sigframe_plain_t *)sp, size);));
            },
            /* except */ { stack_unwritable = true; });
    }
    if (stack_unwritable) {
        /* Override the no-nested check in record_pending_signal(): it's ok b/c
         * receive_pending_signal() calls to here at a consistent point,
         * and we won't return there.
         */
        info->nested_pending_ok = true;
        /* Just throw away this signal and deliver SIGSEGV instead with the
         * same sigcontext, like the kernel does.
         */
        free_pending_signal(info, sig);
        os_forge_exception(0, UNREADABLE_MEMORY_EXECUTION_EXCEPTION);
        ASSERT_NOT_REACHED();
    }

    /* Choose which mask gets stored in the frame for sigreturn to restore. */
    kernel_sigset_t *mask_to_restore = NULL;
    if (info->pre_syscall_app_sigprocmask_valid) {
        mask_to_restore = &info->pre_syscall_app_sigprocmask;
        info->pre_syscall_app_sigprocmask_valid = false;
    } else {
        mask_to_restore = &info->app_sigblocked;
    }

    /* if !has_restorer we do NOT add the restorer code to the exec list here,
     * to avoid removal problems (if handler never returns) and consistency problems
     * (would have to mark as selfmod right now if on stack).
     * for PROGRAM_SHEPHERDING we recognize as a pattern, and for consistency we
     * allow entire region once try to execute -- not a performance worry since should
     * very rarely be on the stack: should either be libc restorer code or with recent
     * OS in rx vsyscall page.
     */

    /* fix up pretcode, pinfo, puc, fpstate */
    if (rtframe) {
        sigframe_rt_t *f_new = (sigframe_rt_t *)sp;
        fixup_rtframe_pointers(dcontext, info, sig, frame, f_new, true /*for app*/, size);
#ifdef HAVE_SIGALTSTACK
        /* Make sure the frame's sigstack reflects the app stack, both for transparency
         * of the app examining it and for correctness if we detach mid-handler.
         */
        LOG(THREAD, LOG_ASYNCH, 3, "updated uc_stack @" PFX " to " PFX "\n",
            &f_new->uc.uc_stack, info->app_sigstack.ss_sp);
        f_new->uc.uc_stack = info->app_sigstack;
#endif

        /* Store the prior mask, for restoring in sigreturn. */
        memcpy(&f_new->uc.uc_sigmask, mask_to_restore, sizeof(info->app_sigblocked));
    } else {
#ifdef X64
        ASSERT_NOT_REACHED();
#endif
#if defined(LINUX) && !defined(X64)
        sigframe_plain_t *f_new = (sigframe_plain_t *)sp;
#    ifdef X86
#        ifndef VMX86_SERVER
        sigframe_plain_t *f_old = (sigframe_plain_t *)frame;
#        endif
        if (has_restorer)
            f_new->pretcode = (char *)info->sighand->action[sig]->restorer;
        else {
#        ifdef VMX86_SERVER
            /* PR 404712: skip kernel's restorer code */
            f_new->pretcode = (char *)dynamorio_nonrt_sigreturn;
#        else
            /* see comments in rt case above */
            if (f_old->pretcode == f_old->retcode)
                f_new->pretcode = f_new->retcode;
            else {
                /* whether we set to dynamorio_sigreturn in main_signal_handler
                 * or it's still vsyscall page, we have to convert to non-rt
                 */
                f_new->pretcode = (char *)dynamorio_nonrt_sigreturn;
            } /* else, pointing at vsyscall most likely */
            LOG(THREAD, LOG_ASYNCH, 3, "\tleaving pretcode with old value\n");
#        endif
        }
        /* convert_frame_to_nonrt*() should have updated fpstate pointer.
         * The inlined fpstate is no longer used on new kernels, and we do that
         * as well on older kernels.
         */
        ASSERT(f_new->sc.fpstate != &f_new->fpstate);
        /* 32-bit kernel copies to aligned buf so no assert on fpstate alignment */
        LOG(THREAD, LOG_ASYNCH, 3, "\tretaddr = " PFX "\n", f_new->pretcode);
        /* There is no stored alt stack in a plain frame to update. */
#        ifdef RETURN_AFTER_CALL
        info->signal_restorer_retaddr = (app_pc)f_new->pretcode;
#        endif
#    endif /* X86 */
        /* Store the prior mask, for restoring in sigreturn. */
        convert_rt_mask_to_nonrt(f_new, mask_to_restore);
#endif /* LINUX && !X64 */
    }

#if defined(MACOS) && !defined(X64)
    /* Update handler field, which is passed to the libc trampoline, to app */
    ASSERT(info->sighand->action[sig] != NULL);
    ((sigframe_rt_t *)sp)->handler = (app_pc)info->sighand->action[sig]->handler;
#endif
}

/* Stashes the rt signal frame in the calling thread's pending slot for sig
 * and records access_address alongside it.
 * PR 304708: the frame stays in rt form until it is finally copied to the
 * app stack, which lets us deliver to a client at a safe spot in rt form.
 */
static void
copy_frame_to_pending(dcontext_t *dcontext, int sig, sigframe_rt_t *frame,
                      byte *access_address)
{
    thread_sig_info_t *info = (thread_sig_info_t *)dcontext->signal_field;
    sigframe_rt_t *pending_frame = &(info->sigpending[sig]->rt_frame);
    memcpy_rt_frame(frame, (byte *)pending_frame, false /*!already pending*/);

#if defined(LINUX) && defined(X86)
    /* With lazy fpstate the kernel may have handed us a frame with no fpstate
     * even though the app went on to run fp or xmm/ymm instrs before
     * delivery.  For now fpstate is therefore always attached, just in case.
     * XXX i#641 optimization: track whether any fp/xmm/ymm
     * instrs happened and avoid this.
     */
    byte *xstate_slot = (byte *)&info->sigpending[sig]->xstate;
    kernel_fpstate_t *kernel_fp = frame->uc.uc_mcontext.fpstate;
    /* The up-to-date fpstate is written at delivery time; copying what the
     * kernel gave us now covers any fields our own retrieval might miss.
     */
    if (kernel_fp != NULL) {
        size_t fp_size = signal_frame_extra_size(false);
        ASSERT(fp_size == kernel_fp->sw_reserved.extended_size);
        memcpy(xstate_slot, kernel_fp, fp_size);
    }
    /* The pointer has to be valid from here on so save_fpstate, etc. work. */
    pending_frame->uc.uc_mcontext.fpstate = (kernel_fpstate_t *)xstate_slot;
#endif /* LINUX && X86 */

    info->sigpending[sig]->access_address = access_address;
    info->sigpending[sig]->use_sigcontext = false;

#ifdef MACOS
    /* sc is located through puc, so puc must point into the copy. */
    fixup_rtframe_pointers(dcontext, info, sig, frame, pending_frame,
                           false /*!for app*/, sizeof(*pending_frame));
#endif

    LOG(THREAD, LOG_ASYNCH, 3, "copy_frame_to_pending from " PFX "\n", frame);
    DOLOG(3, LOG_ASYNCH, {
        LOG(THREAD, LOG_ASYNCH, 3, "sigcontext:\n");
        dump_sigcontext(dcontext, get_sigcontext_from_rt_frame(pending_frame));
    });
}

/**** real work ***********************************************/

/* transfer control from signal handler to fcache return routine
 *
 * Rewrites the sigcontext inside "uc" so that it resumes at this thread's
 * fcache_return routine, with "last_exit" placed in the return-value
 * register and the register's prior value saved in a spill slot or mcontext.
 * dcontext->next_tag is set to the (canonicalized) "next_pc".
 *
 * uc:             ucontext whose sigcontext is modified in place.
 * sc_interrupted: context passed to the kernel-xfer events as the source
 *                 context; only read/temporarily modified when
 *                 is_kernel_xfer is true.
 * sig:            signal number, forwarded to the kernel-xfer events.
 * next_pc:        target to resume at; an event that returns true can
 *                 replace it with the pc left in the sigcontext.
 * last_exit:      linkstub value stored into the return-value register.
 * is_kernel_xfer: whether to raise DR_XFER_RSEQ_ABORT (if applicable) and
 *                 DR_XFER_SIGNAL_DELIVERY events.
 */
static void
transfer_from_sig_handler_to_fcache_return(dcontext_t *dcontext, kernel_ucontext_t *uc,
                                           sigcontext_t *sc_interrupted, int sig,
                                           app_pc next_pc, linkstub_t *last_exit,
                                           bool is_kernel_xfer)
{
    sigcontext_t *sc = SIGCXT_FROM_UCXT(uc);
    if (is_kernel_xfer) {
        sig_full_cxt_t sc_interrupted_full = { sc_interrupted, NULL /*not provided*/ };
        sig_full_cxt_t sc_full;
        sig_full_initialize(&sc_full, uc);
        /* Expose the target pc in the context handed to the events below. */
        sc->SC_XIP = (ptr_uint_t)next_pc;
        /* i#4041: Provide the actually-interrupted mid-rseq PC to the rseq event. */
        ptr_uint_t official_xl8 = sc_interrupted->SC_XIP;
        ptr_uint_t rseq_xl8 =
            (ptr_uint_t)translate_last_direct_translation(dcontext, (app_pc)official_xl8);
        if (rseq_xl8 != official_xl8) {
            sc_interrupted->SC_XIP = rseq_xl8;
            if (instrument_kernel_xfer(dcontext, DR_XFER_RSEQ_ABORT, sc_interrupted_full,
                                       NULL, NULL, next_pc, sc->SC_XSP, sc_full, NULL,
                                       sig))
                next_pc = (app_pc)sc->SC_XIP;
            /* The signal event has the abort handler, like the kernel passes. */
            sc_interrupted->SC_XIP = official_xl8;
        }
        if (instrument_kernel_xfer(dcontext, DR_XFER_SIGNAL_DELIVERY, sc_interrupted_full,
                                   NULL, NULL, next_pc, sc->SC_XSP, sc_full, NULL, sig))
            next_pc = (app_pc)sc->SC_XIP;
    }
    dcontext->next_tag = canonicalize_pc_target(dcontext, next_pc);

    /* Set our sigreturn context to point to fcache_return!
     * Then we'll go back through kernel, appear in fcache_return,
     * and go through d_r_dispatch & interp, without messing up dynamo stack.
     * Note that even if this is a write in the shared cache, we
     * still go to the private fcache_return for simplicity.
     */
    sc->SC_XIP = (ptr_uint_t)fcache_return_routine(dcontext);
#if defined(AARCHXX)
    /* We do not have to set dr_reg_stolen in dcontext's mcontext here
     * because dcontext's mcontext is stale and we used the mcontext
     * created from recreate_app_state_internal with the original sigcontext.
     */
    /* We restore dr_reg_stolen's app value in recreate_app_state_internal,
     * so now we need set dr_reg_stolen to hold DR's TLS before sigreturn
     * from DR's handler.
     */
    ASSERT(get_sigcxt_stolen_reg(sc) != (reg_t)*get_dr_tls_base_addr());
    /* Preserve the translated value. */
    dcontext->local_state->spill_space.reg_stolen = get_sigcxt_stolen_reg(sc);
    /* Now put DR's base in the sigcontext. */
    set_sigcxt_stolen_reg(sc, (reg_t)*get_dr_tls_base_addr());
#    ifndef AARCH64
    /* We're going to our fcache_return gencode which uses DEFAULT_ISA_MODE */
    set_pc_mode_in_cpsr(sc, DEFAULT_ISA_MODE);
#    endif
#endif

    /* Save the app's return-value register before it is overwritten with
     * last_exit below.
     */
#if defined(X64) || defined(ARM)
    /* x64 always uses shared gencode */
    dcontext->local_state->spill_space.SS_RETVAL_REG = sc->SC_RETURN_REG;
#    ifdef AARCH64
    /* X1 needs to be spilled because of br x1 in exit stubs. */
    dcontext->local_state->spill_space.r1 = sc->SC_R1;
#    endif
#else
    get_mcontext(dcontext)->MC_RETVAL_REG = sc->SC_RETURN_REG;
#endif
    LOG(THREAD, LOG_ASYNCH, 2, "\tsaved xax " PFX "\n", sc->SC_RETURN_REG);

    sc->SC_RETURN_REG = (ptr_uint_t)last_exit;
    LOG(THREAD, LOG_ASYNCH, 2, "\tset next_tag to " PFX ", resuming in fcache_return\n",
        next_pc);
    LOG(THREAD, LOG_ASYNCH, 3, "transfer_from_sig_handler_to_fcache_return\n");
    DOLOG(3, LOG_ASYNCH, {
        LOG(THREAD, LOG_ASYNCH, 3, "sigcontext @" PFX ":\n", sc);
        dump_sigcontext(dcontext, sc);
    });
}

/* Invokes the client signal event for "sig" and returns the client's chosen
 * action.  If no client signal hook is registered, returns DR_SIGNAL_DELIVER
 * without building any contexts.
 *
 * frame:          rt frame whose ucontext supplies si.mcontext; it is updated
 *                 with the client's changes on DR_SIGNAL_DELIVER or
 *                 DR_SIGNAL_REDIRECT.
 * raw_sc:         pre-translation sigcontext, or NULL if unavailable; it is
 *                 updated with the client's raw_mcontext changes on
 *                 DR_SIGNAL_SUPPRESS.
 * access_address: passed through to the client in si.access_address.
 * blocked:        passed through to the client in si.blocked.
 * fragment:       interrupted fragment if already known, else NULL, in which
 *                 case it is looked up from the raw pc when raw_sc != NULL.
 */
static dr_signal_action_t
send_signal_to_client(dcontext_t *dcontext, int sig, sigframe_rt_t *frame,
                      sigcontext_t *raw_sc, byte *access_address, bool blocked,
                      fragment_t *fragment)
{
    kernel_ucontext_t *uc = get_ucontext_from_rt_frame(frame);
    dr_siginfo_t si;
    dr_signal_action_t action;
    /* XXX #1615: we need a full ucontext to store pre-xl8 simd values.
     * Right now we share the same simd values with post-xl8.
     */
    sig_full_cxt_t raw_sc_full;
    sig_full_initialize(&raw_sc_full, uc);
    /* Replace the sigcontext pointer with the raw one (may be NULL). */
    raw_sc_full.sc = raw_sc;
    if (!dr_signal_hook_exists())
        return DR_SIGNAL_DELIVER;
    LOG(THREAD, LOG_ASYNCH, 2, "sending signal to client\n");
    si.sig = sig;
    si.drcontext = (void *)dcontext;
    /* It's safe to allocate since we do not send signals that interrupt DR.
     * With priv_mcontext_t x2 that's a little big for stack alloc.
     */
    si.mcontext = heap_alloc(dcontext, sizeof(*si.mcontext) HEAPACCT(ACCT_OTHER));
    si.raw_mcontext = heap_alloc(dcontext, sizeof(*si.raw_mcontext) HEAPACCT(ACCT_OTHER));
    dr_mcontext_init(si.mcontext);
    dr_mcontext_init(si.raw_mcontext);
    /* i#207: fragment tag and fcache start pc on fault. */
    si.fault_fragment_info.tag = NULL;
    si.fault_fragment_info.cache_start_pc = NULL;
    /* NOTE(review): is_trace and app_code_consistent are only written below
     * when a fragment is found; presumably clients must only read them when
     * tag is non-NULL -- confirm against the public API docs.
     */
    /* i#182/PR 449996: we provide the pre-translation context */
    if (raw_sc != NULL) {
        fragment_t wrapper;
        si.raw_mcontext_valid = true;
        sigcontext_to_mcontext(dr_mcontext_as_priv_mcontext(si.raw_mcontext),
                               &raw_sc_full, si.raw_mcontext->flags);
        /* i#207: fragment tag and fcache start pc on fault. */
        /* FIXME: we should avoid the fragment_pclookup since it is expensive
         * and since we already did the work of a lookup when translating
         */
        if (fragment == NULL)
            fragment = fragment_pclookup(dcontext, si.raw_mcontext->pc, &wrapper);
        if (fragment != NULL && !hide_tag_from_client(fragment->tag)) {
            si.fault_fragment_info.tag = fragment->tag;
            si.fault_fragment_info.cache_start_pc = FCACHE_ENTRY_PC(fragment);
            si.fault_fragment_info.is_trace = TEST(FRAG_IS_TRACE, fragment->flags);
            si.fault_fragment_info.app_code_consistent =
                !TESTANY(FRAG_WAS_DELETED | FRAG_SELFMOD_SANDBOXED, fragment->flags);
        }
    } else
        si.raw_mcontext_valid = false;
    /* The client has no way to calculate this when using
     * instrumentation that deliberately faults (to shift a rare event
     * out of the fastpath) so we provide it.  When raw_mcontext is
     * available the client can calculate it, but we provide it as a
     * convenience anyway.
     */
    si.access_address = access_address;
    si.blocked = blocked;
    ucontext_to_mcontext(dr_mcontext_as_priv_mcontext(si.mcontext), uc);
    /* We disallow the client calling dr_redirect_execution(), so we
     * will not leak si
     */
    action = instrument_signal(dcontext, &si);
    if (action == DR_SIGNAL_DELIVER || action == DR_SIGNAL_REDIRECT) {
        /* propagate client changes */
        CLIENT_ASSERT(si.mcontext->flags == DR_MC_ALL,
                      "signal mcontext flags cannot be changed");
        mcontext_to_ucontext(uc, dr_mcontext_as_priv_mcontext(si.mcontext));
    } else if (action == DR_SIGNAL_SUPPRESS && raw_sc != NULL) {
        /* propagate client changes */
        CLIENT_ASSERT(si.raw_mcontext->flags == DR_MC_ALL,
                      "signal mcontext flags cannot be changed");
        mcontext_to_sigcontext(&raw_sc_full,
                               dr_mcontext_as_priv_mcontext(si.raw_mcontext),
                               si.raw_mcontext->flags);
    }
    /* Both contexts are released on every path that allocated them. */
    heap_free(dcontext, si.mcontext, sizeof(*si.mcontext) HEAPACCT(ACCT_OTHER));
    heap_free(dcontext, si.raw_mcontext, sizeof(*si.raw_mcontext) HEAPACCT(ACCT_OTHER));
    return action;
}

/* Carries out the client's chosen "action" for a signal received in the cache.
 * Returns false if caller should exit (the signal was redirected, dropped, or
 * given its default action here); returns true when the caller should go on
 * to deliver the signal.
 */
static bool
handle_client_action_from_cache(dcontext_t *dcontext, int sig, dr_signal_action_t action,
                                sigframe_rt_t *our_frame, sigcontext_t *sc_orig,
                                sigcontext_t *sc_interrupted, bool blocked)
{
    thread_sig_info_t *info = (thread_sig_info_t *)dcontext->signal_field;
    kernel_ucontext_t *uc = get_ucontext_from_rt_frame(our_frame);
    sigcontext_t *sc = SIGCXT_FROM_UCXT(uc);

    /* Even signals with no app handler are routed through here, so that the
     * client gets to see them.
     */
    if (action == DR_SIGNAL_REDIRECT) {
        /* Our main_signal_handler frame already holds the mcontext that
         * send_signal_to_client copied in, so head for fcache_return using
         * that frame's state.
         */
        transfer_from_sig_handler_to_fcache_return(
            dcontext, uc, sc_interrupted, sig, (app_pc)sc->SC_XIP,
            (linkstub_t *)get_asynch_linkstub(), true);
        if (is_building_trace(dcontext)) {
            LOG(THREAD, LOG_ASYNCH, 3, "\tsquashing trace-in-progress\n");
            trace_abort(dcontext);
        }
        return false;
    }

    if (action == DR_SIGNAL_SUPPRESS ||
        (!blocked && info->sighand->action[sig] != NULL &&
         info->sighand->action[sig]->handler == (handler_t)SIG_IGN)) {
        LOG(THREAD, LOG_ASYNCH, 2, "%s: not delivering!\n",
            (action == DR_SIGNAL_SUPPRESS) ? "client suppressing signal"
                                           : "app signal handler is SIG_IGN");
        /* Put back the original, untranslated sigcontext. */
        *get_sigcontext_from_rt_frame(our_frame) = *sc_orig;
        return false;
    }

    /* Delivery proceeds when the signal is blocked (BYPASS is not honored for
     * blocked signals) or when the client did not force the default and the
     * app has a handler other than SIG_DFL.
     */
    if (blocked ||
        (action != DR_SIGNAL_BYPASS && info->sighand->action[sig] != NULL &&
         info->sighand->action[sig]->handler != (handler_t)SIG_DFL)) {
        CLIENT_ASSERT(action == DR_SIGNAL_DELIVER, "invalid signal event return value");
        return true;
    }

    LOG(THREAD, LOG_ASYNCH, 2, "%s: executing default action\n",
        (action == DR_SIGNAL_BYPASS) ? "client forcing default"
                                     : "app signal handler is SIG_DFL");
    if (execute_default_from_cache(dcontext, sig, our_frame, sc_orig, false)) {
        /* We were not terminated and were asked to put back the original
         * (untranslated) sigcontext.
         */
        sigcontext_t *restored = get_sigcontext_from_rt_frame(our_frame);
        *restored = *sc_orig;
        LOG(THREAD, LOG_ASYNCH, 2, "%s: restored xsp=" PFX ", xip=" PFX "\n",
            __FUNCTION__, restored->SC_XSP, restored->SC_XIP);
    }
    return false;
}

/* Raises the client signal event and then acts on the returned action.
 * Returns true when the signal should still be delivered by the caller;
 * otherwise returns whatever handle_client_action_from_cache() returns.
 * When "no_deliver" is set and the signal is not blocked, a DR_SIGNAL_DELIVER
 * result is reported back right away without being handled here.
 */
static bool
send_signal_to_client_and_handle_action(dcontext_t *dcontext, int sig,
                                        sigframe_rt_t *frame, sigcontext_t *sc,
                                        sigcontext_t *sc_orig, byte *access_address,
                                        bool blocked, fragment_t *fragment,
                                        bool no_deliver)
{
    sigcontext_t saved_sc;
    dr_signal_action_t verdict;

    /* Snapshot the sigcontext first: send_signal_to_client() may alter it. */
    saved_sc = *sc;
    verdict = send_signal_to_client(dcontext, sig, frame, sc_orig, access_address,
                                    blocked, fragment);
    if (!blocked) {
        /* Delivery is left to the caller in this case. */
        if (no_deliver && verdict == DR_SIGNAL_DELIVER)
            return true;
    } else {
        /* BYPASS is not permitted for the early event of a blocked signal
         * (xref i#182/PR 449996).
         */
        CLIENT_ASSERT(verdict != DR_SIGNAL_BYPASS,
                      "cannot bypass a blocked signal event");
    }
    return handle_client_action_from_cache(dcontext, sig, verdict, frame, sc_orig,
                                           &saved_sc, blocked);
}

/* Reports a fatal fault via report_dynamorio_problem(), including a register
 * dump taken from the signal frame, and then terminates the process.
 * Does not return.
 *
 * dumpcore_flag: DUMPCORE_* flag for the report; DUMPCORE_STACK_OVERFLOW is
 *                added when the fault is a SIGSEGV on a stack overflow.
 * pc:            faulting pc to report.
 * target:        faulting access address, used for stack-overflow detection.
 * sig:           signal number.
 * frame:         signal frame supplying the register values.
 * prefix, signame, where: strings inserted into the report message.
 *
 * The register section of the format string must stay in lock-step with the
 * per-architecture argument list passed to report_dynamorio_problem() below.
 */
static void
abort_on_fault(dcontext_t *dcontext, uint dumpcore_flag, app_pc pc, byte *target, int sig,
               sigframe_rt_t *frame, const char *prefix, const char *signame,
               const char *where)
{
    kernel_ucontext_t *ucxt = &frame->uc;
    sigcontext_t *sc = SIGCXT_FROM_UCXT(ucxt);
    bool stack_overflow = (sig == SIGSEGV && is_stack_overflow(dcontext, target));
#if defined(STATIC_LIBRARY) && defined(LINUX)
    thread_sig_info_t *info = (thread_sig_info_t *)dcontext->signal_field;
    uint orig_dumpcore_flag = dumpcore_flag;
    if (init_info.sighand != NULL)
        info = &init_info; /* use init-time handler */
    ASSERT(info->sighand->action != NULL);
#endif
    const char *fmt = "%s %s at PC " PFX "\n"
                      "Received SIG%s at%s pc " PFX " in thread " TIDFMT "\n"
                      "Base: " PFX "\n"
                      "Registers:"
#ifdef X86
                      "eax=" PFX " ebx=" PFX " ecx=" PFX " edx=" PFX "\n"
                      "\tesi=" PFX " edi=" PFX " esp=" PFX " ebp=" PFX "\n"
#    ifdef X64
                      "\tr8 =" PFX " r9 =" PFX " r10=" PFX " r11=" PFX "\n"
                      "\tr12=" PFX " r13=" PFX " r14=" PFX " r15=" PFX "\n"
#    endif /* X64 */
#elif defined(ARM)
#    ifndef X64
                      "  r0 =" PFX " r1 =" PFX " r2 =" PFX " r3 =" PFX "\n"
                      "\tr4 =" PFX " r5 =" PFX " r6 =" PFX " r7 =" PFX "\n"
                      "\tr8 =" PFX " r9 =" PFX " r10=" PFX " r11=" PFX "\n"
                      "\tr12=" PFX " r13=" PFX " r14=" PFX " r15=" PFX "\n"
#    else
#        error NYI on AArch64
#    endif
#elif defined(RISCV64)
                      /* One conversion per register passed below, in the same
                       * order.  There is no flags register, so no trailing
                       * newline and no eflags field follow.
                       */
                      "  pc =" PFX " ra =" PFX " sp =" PFX " gp =" PFX "\n"
                      "\ttp =" PFX " t0 =" PFX " t1 =" PFX " t2 =" PFX "\n"
                      "\ts0 =" PFX " s1 =" PFX " a0 =" PFX " a1 =" PFX "\n"
                      "\ta2 =" PFX " a3 =" PFX " a4 =" PFX " a5 =" PFX "\n"
                      "\ta6 =" PFX " a7 =" PFX " s2 =" PFX " s3 =" PFX "\n"
                      "\ts4 =" PFX " s5 =" PFX " s6 =" PFX " s7 =" PFX "\n"
                      "\ts8 =" PFX " s9 =" PFX " s10=" PFX " s11=" PFX "\n"
                      "\tt3 =" PFX " t4 =" PFX " t5 =" PFX " t6 =" PFX
#endif /* X86/ARM/RISCV64 */
        /* Matches the IF_NOT_RISCV64(sc->SC_XFLAGS) argument below. */
        IF_NOT_RISCV64("\teflags=" PFX);

#if defined(STATIC_LIBRARY) && defined(LINUX)
    /* i#2119: if we're invoking an app handler, disable a fatal coredump. */
    if (INTERNAL_OPTION(invoke_app_on_crash) && info->sighand->action[sig] != NULL &&
        IS_RT_FOR_APP(info, sig) && TEST(dumpcore_flag, DYNAMO_OPTION(dumpcore_mask)) &&
        !DYNAMO_OPTION(live_dump))
        dumpcore_flag = 0;
#endif

    report_dynamorio_problem(
        dcontext, dumpcore_flag | (stack_overflow ? DUMPCORE_STACK_OVERFLOW : 0), pc,
        (app_pc)sc->SC_FP, fmt, prefix, stack_overflow ? STACK_OVERFLOW_NAME : CRASH_NAME,
        pc, signame, where, pc, d_r_get_thread_id(), get_dynamorio_dll_start(),
#ifdef X86
        sc->SC_XAX, sc->SC_XBX, sc->SC_XCX, sc->SC_XDX, sc->SC_XSI, sc->SC_XDI,
        sc->SC_XSP, sc->SC_XBP,
#    ifdef X64
        sc->SC_FIELD(r8), sc->SC_FIELD(r9), sc->SC_FIELD(r10), sc->SC_FIELD(r11),
        sc->SC_FIELD(r12), sc->SC_FIELD(r13), sc->SC_FIELD(r14), sc->SC_FIELD(r15),
#    endif /* X64 */
#elif defined(ARM)
#    ifndef X64
        sc->SC_FIELD(arm_r0), sc->SC_FIELD(arm_r1), sc->SC_FIELD(arm_r2),
        sc->SC_FIELD(arm_r3), sc->SC_FIELD(arm_r4), sc->SC_FIELD(arm_r5),
        sc->SC_FIELD(arm_r6), sc->SC_FIELD(arm_r7), sc->SC_FIELD(arm_r8),
        sc->SC_FIELD(arm_r9), sc->SC_FIELD(arm_r10), sc->SC_FIELD(arm_fp),
        sc->SC_FIELD(arm_ip), sc->SC_FIELD(arm_sp), sc->SC_FIELD(arm_lr),
        sc->SC_FIELD(arm_pc),
#    else
#        error NYI on AArch64
#    endif /* X64 */
#elif defined(RISCV64)
        sc->SC_FIELD(sc_regs.pc), sc->SC_FIELD(sc_regs.ra), sc->SC_FIELD(sc_regs.sp),
        sc->SC_FIELD(sc_regs.gp), sc->SC_FIELD(sc_regs.tp), sc->SC_FIELD(sc_regs.t0),
        sc->SC_FIELD(sc_regs.t1), sc->SC_FIELD(sc_regs.t2), sc->SC_FIELD(sc_regs.s0),
        sc->SC_FIELD(sc_regs.s1), sc->SC_FIELD(sc_regs.a0), sc->SC_FIELD(sc_regs.a1),
        sc->SC_FIELD(sc_regs.a2), sc->SC_FIELD(sc_regs.a3), sc->SC_FIELD(sc_regs.a4),
        sc->SC_FIELD(sc_regs.a5), sc->SC_FIELD(sc_regs.a6), sc->SC_FIELD(sc_regs.a7),
        sc->SC_FIELD(sc_regs.s2), sc->SC_FIELD(sc_regs.s3), sc->SC_FIELD(sc_regs.s4),
        sc->SC_FIELD(sc_regs.s5), sc->SC_FIELD(sc_regs.s6), sc->SC_FIELD(sc_regs.s7),
        sc->SC_FIELD(sc_regs.s8), sc->SC_FIELD(sc_regs.s9), sc->SC_FIELD(sc_regs.s10),
        sc->SC_FIELD(sc_regs.s11), sc->SC_FIELD(sc_regs.t3), sc->SC_FIELD(sc_regs.t4),
        sc->SC_FIELD(sc_regs.t5),
        sc->SC_FIELD(sc_regs.t6)
#endif /* X86/ARM/RISCV64 */
        IF_NOT_RISCV64(sc->SC_XFLAGS));

#if defined(STATIC_LIBRARY) && defined(LINUX)
    /* i#2119: For static DR, the surrounding app's handler may well be
     * safe to invoke even when DR state is messed up: it's worth a try, as it
     * likely has useful reporting features for users of the app.
     * We limit to Linux and RT for simplicity: it can be expanded later if static
     * library use expands.
     */
    if (INTERNAL_OPTION(invoke_app_on_crash) && info->sighand->action[sig] != NULL &&
        IS_RT_FOR_APP(info, sig)) {
        SYSLOG(SYSLOG_WARNING, INVOKING_APP_HANDLER, 2, get_application_name(),
               get_application_pid());
        (*info->sighand->action[sig]->handler)(sig, &frame->info, ucxt);
        /* If the app handler didn't terminate, now get a fatal core. */
        if (TEST(orig_dumpcore_flag, DYNAMO_OPTION(dumpcore_mask)) &&
            !DYNAMO_OPTION(live_dump))
            os_dump_core("post-app-handler attempt at core dump");
    }
#endif

    os_terminate(dcontext, TERMINATE_PROCESS);
    ASSERT_NOT_REACHED();
}

/* Reports a fault as an internal exception (DUMPCORE_INTERNAL_EXCEPTION,
 * labeled with exception_label_core) by forwarding all arguments to
 * abort_on_fault(), which terminates the process.  Not expected to return.
 */
static void
abort_on_DR_fault(dcontext_t *dcontext, app_pc pc, byte *target, int sig,
                  sigframe_rt_t *frame, const char *signame, const char *where)
{
    abort_on_fault(dcontext, DUMPCORE_INTERNAL_EXCEPTION, pc, target, sig, frame,
                   exception_label_core, signame, where);
    ASSERT_NOT_REACHED();
}

/* Unlinks the outgoing branches of interrupted fragment "f" and, when "f"
 * contains a syscall, mangles its syscall code for interruption pc "pc".
 * Returns whether unlinked or mangled syscall.
 * Restored in receive_pending_signal.
 */
static bool
unlink_fragment_for_signal(dcontext_t *dcontext, fragment_t *f,
                           byte *pc /*interruption pc*/)
{
    /* This is reached only after interrupting a fragment in the cache or
     * transition gencode (i#2019).  That means this thread's DR state is
     * safe, so acquiring a lock should be ok.  xref PR 596069.
     *
     * A known race: when two threads take a signal in the same shared
     * fragment, the first may re-link after the second has un-linked but
     * before the second has exited, letting the second execute the syscall
     * and delaying signal delivery arbitrarily.  Without allocating global
     * memory, a static array of refcounts could be used (only a small number
     * of shared fragments should be interrupted at any one time) so that we
     * only unlink when all are done.  That is not implemented: we live with
     * the chance of long signal delays.  xref PR 596069.
     */
    bool modified = false;
    bool was_linking = is_couldbelinking(dcontext);
    if (!was_linking)
        enter_couldbelinking(dcontext, NULL, false);

    /* XXX PR 213040: unlinking coarse fragments is not supported, so we try
     * not to come here for them; but for indirect branch and other spots
     * where translation is not yet supported (since can't fault) we end up
     * w/ no bound on delivery...
     */
    if (!TEST(FRAG_COARSE_GRAIN, f->flags)) {
        /* The fragment may not be linked, e.g. due to trace_relink. */
        if (!TEST(FRAG_LINKED_OUTGOING, f->flags)) {
            LOG(THREAD, LOG_ASYNCH, 3,
                "\toutgoing already unlinked for interrupted F%d\n", f->id);
        } else {
            LOG(THREAD, LOG_ASYNCH, 3, "\tunlinking outgoing for interrupted F%d\n",
                f->id);
            SHARED_FLAGS_RECURSIVE_LOCK(f->flags, acquire, change_linking_lock);
            /* Re-test under the lock: another thread may have unlinked while
             * we were waiting for the change_linking_lock.
             */
            if (TEST(FRAG_LINKED_OUTGOING, f->flags)) {
                unlink_fragment_outgoing(dcontext, f);
                modified = true;
                /* i#4670: Fragments that use the special ibl xfer trampoline are
                 * unlinked automatically if pending signals count is greater than
                 * zero.
                 */
            }
            SHARED_FLAGS_RECURSIVE_LOCK(f->flags, release, change_linking_lock);
        }
    }

    /* Syscalls are signal barriers: the next syscall (if any) in f must not
     * execute; instead we return to d_r_dispatch right before it.
     * Syscall mangling performs several decodes but just one write, which
     * changes the target of a short jmp.  That is a one-byte and thus atomic
     * write, so the change_linking_lock is not needed.
     */
    if (TEST(FRAG_HAS_SYSCALL, f->flags) &&
        mangle_syscall_code(dcontext, f, pc, false /*do not skip exit cti*/))
        modified = true;

    if (!was_linking)
        enter_nolinking(dcontext, NULL, false);
    return modified;
}

/* Undoes the unlinking and syscall mangling recorded in info->interrupted and
 * info->interrupted_pc, and then clears both fields.
 * Does nothing if no interrupted fragment is recorded.
 */
static void
relink_interrupted_fragment(dcontext_t *dcontext, thread_sig_info_t *info)
{
    fragment_t *frag = info->interrupted;
    if (frag == NULL)
        return;

    /* i#2066: a trace being built may have already re-linked the fragment. */
    if (!TEST(FRAG_LINKED_OUTGOING, frag->flags)) {
        LOG(THREAD, LOG_ASYNCH, 3, "\tre-linking outgoing for interrupted F%d\n",
            frag->id);
        SHARED_FLAGS_RECURSIVE_LOCK(frag->flags, acquire, change_linking_lock);
        /* Re-test under the lock: another thread may have linked while we
         * were waiting for the change_linking_lock.
         */
        if (!TEST(FRAG_LINKED_OUTGOING, frag->flags))
            link_fragment_outgoing(dcontext, frag, false);
        SHARED_FLAGS_RECURSIVE_LOCK(frag->flags, release, change_linking_lock);
    }

    /* Put the syscall back.  Syscalls are a barrier to signals, so the signal
     * handler makes the current fragment exit before it performs a syscall.
     */
    if (TEST(FRAG_HAS_SYSCALL, frag->flags) && info->interrupted_pc != NULL) {
        mangle_syscall_code(dcontext, frag, info->interrupted_pc,
                            true /*skip exit cti*/);
    }

    info->interrupted = NULL;
    info->interrupted_pc = NULL;
}

/* Returns whether interruption pc "pc" in fragment "f" lies immediately
 * before or immediately after an inlined syscall instruction.
 * Always false for fragments without FRAG_HAS_SYSCALL.
 */
static bool
interrupted_inlined_syscall(dcontext_t *dcontext, fragment_t *f,
                            byte *pc /*interruption pc*/)
{
    instr_t insn;
    byte *next;
    bool at_syscall = false;

    if (!TEST(FRAG_HAS_SYSCALL, f->flags))
        return false;

    /* PR 596147: a signal arriving while the thread is in an inlined syscall
     * cannot be delayed with a bounded delivery time, so it has to be
     * delivered now.  We decode at and behind pc looking for the syscall.
     * Our translation of the interruption state is assumed fine to re-start:
     * i.e., the syscall is complete if the kernel has pc at the post-syscall
     * point, and the kernel set EINTR in eax if necessary.
     */
    /* The fcache was interrupted, so allocating memory for decode is ok. */
    instr_init(dcontext, &insn);
    next = decode(dcontext, pc, &insn);
    if (next != NULL && instr_valid(&insn) && instr_is_syscall(&insn)) {
        /* Before the syscall but past the jmp, so the syscall can't be skipped. */
        at_syscall = true;
    } else {
        size_t syslen = syscall_instr_length(FRAG_ISA_MODE(f->flags));
        instr_reset(dcontext, &insn);
        next = decode(dcontext, pc - syslen, &insn);
        if (next != NULL && instr_valid(&insn) && instr_is_syscall(&insn)) {
#if defined(X86) && !defined(MACOS)
            /* Backward decoding can mismatch, so also require the exit cti
             * jmp that precedes the syscall.
             */
            instr_reset(dcontext, &insn);
            next = decode(dcontext, pc - syslen - JMP_LONG_LENGTH, &insn);
            /* True here means post-inlined-syscall. */
            at_syscall = (next != NULL && instr_valid(&insn) &&
                          instr_get_opcode(&insn) == OP_jmp);
#else
            /* Mac and ARM have some TLS spills in between, so we simply trust
             * that this is a syscall (esp on ARM w/ aligned instrs).
             */
            at_syscall = true;
#endif
        }
    }
    instr_free(dcontext, &insn);
    return at_syscall;
}

/* i#1145: auto-restart syscalls interrupted by signals
 *
 * Rewrites "sc" so that resuming from it re-executes the interrupted syscall:
 * the return-value register is set back to "orig_retval_reg" and the pc is
 * moved back to the syscall instruction.
 *
 * Returns true if sc was adjusted for a restart.  Returns false, leaving sc
 * untouched, if the return register does not hold -EINTR, if the app has an
 * action for "sig" without SA_RESTART, or if the restart_syscalls option is
 * off.  Note that a signal with no app action (NULL) does not prevent the
 * restart.
 *
 * "f" is only consulted (for its ISA mode) when the pc is neither an
 * after-syscall address nor the vsyscall sysenter return pc.
 */
static bool
adjust_syscall_for_restart(dcontext_t *dcontext, thread_sig_info_t *info, int sig,
                           sigcontext_t *sc, fragment_t *f, reg_t orig_retval_reg)
{
    byte *pc = (byte *)sc->SC_XIP;
    int sys_inst_len;

    if (sc->SC_RETURN_REG != -EINTR) {
        /* The syscall succeeded, so no reason to interrupt.
         * Some syscalls succeed on a signal coming in.
         * E.g., SYS_wait4 on SIGCHLD, or reading from a slow device.
         * XXX: Now that we pass SA_RESTART we should never get here?
         */
        return false;
    }
    /* Don't restart if the app's handler says not to */
    if (info->sighand->action[sig] != NULL &&
        !TEST(SA_RESTART, info->sighand->action[sig]->flags)) {
        return false;
    }
    /* XXX i#1145: some syscalls are never restarted when interrupted by a signal.
     * We check those that are simple to distinguish below, but not all are.  We have
     * this under an option so it can be disabled if necessary.
     */
    if (!DYNAMO_OPTION(restart_syscalls))
        return false;

    /* Now that we use SA_RESTART we rely on that and ignore our own
     * inaccurate check sysnum_is_not_restartable(sysnum).
     * SA_RESTART also means we can just be passed in the register value to restore.
     */
    LOG(THREAD, LOG_ASYNCH, 2, "%s: restored xax/r0/a0 to %ld\n", __FUNCTION__,
        orig_retval_reg);
#ifdef X86
    sc->SC_XAX = orig_retval_reg;
#elif defined(AARCHXX)
    sc->SC_R0 = orig_retval_reg;
#elif defined(RISCV64)
    sc->SC_A0 = orig_retval_reg;
#else
#    error NYI
#endif

    /* Now adjust the pc to point at the syscall instruction instead of after it,
     * so when we resume we'll go back to the syscall.
     * Adjusting solves transparency as well: natively the kernel adjusts
     * the pc before setting up the signal frame.
     * We don't pass in the post-syscall pc provided by the kernel because
     * we want the app pc, not the raw pc.
     */
    /* Pick the ISA mode that determines the syscall instruction length. */
    dr_isa_mode_t isa_mode;
    if (is_after_syscall_address(dcontext, pc) || pc == vsyscall_sysenter_return_pc) {
        isa_mode = dr_get_isa_mode(dcontext);
    } else {
        /* We're going to walk back in the fragment, not gencode */
        ASSERT(f != NULL);
        isa_mode = FRAG_ISA_MODE(f->flags);
    }
    sys_inst_len = syscall_instr_length(isa_mode);

    if (pc == vsyscall_sysenter_return_pc) {
#ifdef X86
        sc->SC_XIP = (ptr_uint_t)(vsyscall_syscall_end_pc - sys_inst_len);
        /* To restart sysenter we must re-copy xsp into xbp, as xbp is
         * clobbered by the kernel.
         * XXX: The kernel points at the int 0x80 in vsyscall on a restart
         * and so doesn't have to do this: should we do that too?  If so we'll
         * have to avoid interpreting our own hook which is right after the
         * int 0x80.
         */
        sc->SC_XBP = sc->SC_XSP;
#else
        ASSERT_NOT_REACHED();
#endif
    } else if (is_after_syscall_address(dcontext, pc)) {
        /* We're at do_syscall: point at app syscall instr.  We want an app
         * address b/c this signal will be delayed and the delivery will use
         * a direct app context: no translation from the cache.
         * The caller sets info->sigpending[sig]->use_sigcontext for us.
         */
        sc->SC_XIP = (ptr_uint_t)(dcontext->asynch_target - sys_inst_len);
        /* Debug-only sanity check that the new pc decodes to a syscall. */
        DODEBUG({
            instr_t instr;
            dr_isa_mode_t old_mode;
            dr_set_isa_mode(dcontext, isa_mode, &old_mode);
            instr_init(dcontext, &instr);
            ASSERT(decode(dcontext, (app_pc)sc->SC_XIP, &instr) != NULL &&
                   instr_is_syscall(&instr));
            instr_free(dcontext, &instr);
            dr_set_isa_mode(dcontext, old_mode, NULL);
        });
    } else {
        ASSERT_NOT_REACHED(); /* Inlined syscalls no longer come here. */
    }
#ifdef AARCHXX
    /* dr_reg_stolen is holding DR's TLS on receiving a signal,
     * so we need to put the app's reg value into the ucontext instead.
     * The translation process normally does this for us, but here we're doing
     * a custom translation.
     */
    set_sigcxt_stolen_reg(sc, dcontext->local_state->spill_space.reg_stolen);
#endif
    LOG(THREAD, LOG_ASYNCH, 2, "%s: sigreturn pc is now " PFX "\n", __FUNCTION__,
        sc->SC_XIP);
    return true;
}

/* XXX: Better to get this code inside arch/ but we'd have to convert to an mcontext
 * which seems overkill.
 */
static fragment_t *
find_next_fragment_from_gencode(dcontext_t *dcontext, sigcontext_t *sc)
{
    fragment_t *f = NULL;
    fragment_t wrapper;
    byte *pc = (byte *)sc->SC_XIP;
    if (in_clean_call_save(dcontext, pc) || in_clean_call_restore(dcontext, pc)) {
#ifdef AARCHXX
        f = fragment_pclookup(dcontext, (cache_pc)sc->SC_LR, &wrapper);
#elif defined(X86)
        cache_pc retaddr = NULL;
        /* Get the retaddr.  We assume this is the adjustment used by
         * insert_out_of_line_context_switch().
         */
        byte *ra_slot =
            dcontext->dstack - get_clean_call_switch_stack_size() - sizeof(retaddr);
        /* The extra x86 slot is only there for save. */
        if (in_clean_call_save(dcontext, pc))
            ra_slot -= get_clean_call_temp_stack_size();
        if (d_r_safe_read(ra_slot, sizeof(retaddr), &retaddr))
            f = fragment_pclookup(dcontext, retaddr, &wrapper);
#elif defined(RISCV64)
        /* FIXME i#3544: Not implemented */
        ASSERT_NOT_IMPLEMENTED(false);
        f = fragment_pclookup(dcontext, (cache_pc)sc->SC_RA, &wrapper);
#else
#    error Unsupported arch.
#endif
    } else if (in_indirect_branch_lookup_code(dcontext, pc)) {
        /* Try to find the target if the signal arrived in the IBL.
         * We could try to be a lot more precise by hardcoding the IBL
         * sequence here but that would make the code less maintainable.
         * Instead we try the registers that hold the target app address.
         */
        /* First check for the jmp* on the hit path: that is the only place
         * in the ibl where the target tag is not sitting in a register.
         */
#if defined(X86) && defined(X64)
        /* Optimization for the common case of targeting a prefix on x86_64:
         *    ff 61 08             jmp    0x08(%rcx)[8byte]
         * The tag is in 0x0(%rcx) so we avoid a decode and pclookup.
         */
        if (*pc == 0xff && *(pc + 1) == 0x61 && *(pc + 2) == 0x08) {
            f = fragment_lookup(dcontext, *(app_pc *)sc->SC_XCX);
        }
#endif
        if (f == NULL) {
            instr_t instr;
            instr_init(dcontext, &instr);
            decode_cti(dcontext, pc, &instr);
            if (instr_is_ibl_hit_jump(&instr)) {
                priv_mcontext_t mc;
                sig_full_cxt_t sc_full = { sc, NULL /*not provided*/ };
                sigcontext_to_mcontext(&mc, &sc_full, DR_MC_INTEGER | DR_MC_CONTROL);
                byte *target;
                if (opnd_is_memory_reference(instr_get_target(&instr))) {
                    target = instr_compute_address_priv(&instr, &mc);
                    ASSERT(target != NULL);
                    if (target != NULL)
                        target = *(byte **)target;
                } else {
                    ASSERT(opnd_is_reg(instr_get_target(&instr)));
                    target = (byte *)reg_get_value_priv(
                        opnd_get_reg(instr_get_target(&instr)), &mc);
                }
                ASSERT(target != NULL);
                if (target != NULL)
                    f = fragment_pclookup(dcontext, target, &wrapper);
                /* Hit this case running client.cleancallsig in a loop on x86
                 * and x86_64, and hit it in a proprietary application with
                 * plain DR (debug build) on AArch64. Never tested for ARM. We
                 * can remove this once someone hits it and it works.
                 */
                IF_ARM(ASSERT_NOT_TESTED());
            }
            instr_free(dcontext, &instr);
        }
#ifdef AARCHXX
#    if defined(MACOS)
        /* TODO i#5383: NYI. */
        ASSERT_NOT_IMPLEMENTED(false);
#    endif
        /* The target is in r2 the whole time, w/ or w/o Thumb LSB. */
        if (f == NULL && sc->SC_R2 != 0)
            f = fragment_lookup(dcontext, ENTRY_PC_TO_DECODE_PC(sc->SC_R2));
#elif defined(X86)
        /* The target is initially in xcx but is then copied to xbx. */
        if (f == NULL && sc->SC_XBX != 0)
            f = fragment_lookup(dcontext, (app_pc)sc->SC_XBX);
        if (f == NULL && sc->SC_XCX != 0)
            f = fragment_lookup(dcontext, (app_pc)sc->SC_XCX);
#elif defined(RISCV64)
/* FIXME i#3544: Not implemented */
#else
#    error Unsupported arch.
#endif
    } else {
        /* If in fcache_enter or do_syscall*, we stored the next_tag in asynch_target
         * in d_r_dispatch.  But, we need to avoid using the asynch_target for the
         * fragment we just exited if we're in fcache_return.
         */
        if (dcontext->asynch_target != NULL && !in_fcache_return(dcontext, pc))
            f = fragment_lookup(dcontext, dcontext->asynch_target);
    }
    return f;
}

/* Handles a signal that has arrived for this thread: decides whether to drop it,
 * deliver it right away, reroute it to another thread, or queue a copy of its
 * frame on info->sigpending[sig] so it can be delivered later.
 *
 * dcontext:       the receiving thread's context; its signal_field and os_field
 *                 supply the thread_sig_info_t and os_thread_data_t used here.
 * sig:            the signal number.
 * ucxt:           the interrupted context.  Its sigcontext may be modified in place
 *                 (syscall-restart adjustment and translation).
 * frame:          the signal frame for this signal.
 * forged:         treated as delayable; asserted false on the immediate-delivery
 *                 path.  NOTE(review): presumably true for signals DR synthesizes
 *                 itself -- confirm against callers.
 * access_address: passed through unchanged to the delivery and queueing routines.
 */
static void
record_pending_signal(dcontext_t *dcontext, int sig, kernel_ucontext_t *ucxt,
                      sigframe_rt_t *frame, bool forged, byte *access_address)
{
    thread_sig_info_t *info = (thread_sig_info_t *)dcontext->signal_field;
    os_thread_data_t *ostd = (os_thread_data_t *)dcontext->os_field;
    sigcontext_t *sc = SIGCXT_FROM_UCXT(ucxt);
    /* XXX #1615: we need a full ucontext to store pre-xl8 simd values */
    sigcontext_t sc_orig;
    byte *pc = (byte *)sc->SC_XIP;
    byte *xsp = (byte *)sc->SC_XSP;
    bool receive_now = false;
    bool blocked = false;
    bool handled = false;
    bool reroute = false;
    bool at_auto_restart_syscall = false;
    int syslen = 0;
    /* Captured before any -EINTR rewrite of the return register below, so that
     * adjust_syscall_for_restart() receives the pre-rewrite value.
     */
    reg_t orig_retval_reg = sc->SC_RETURN_REG;
    sigpending_t *pend;
    fragment_t *f = NULL;
    fragment_t wrapper;

    /* We no longer block SUSPEND_SIGNAL (i#184/PR 450670) or SIGSEGV (i#193/PR 287309).
     * But we can have re-entrancy issues in this routine if the app uses the same
     * SUSPEND_SIGNAL, or the nested SIGSEGV needs to be sent to the app.  The
     * latter shouldn't happen unless the app sends SIGSEGV via SYS_kill().
     */
    if (ostd->processing_signal > 0 ||
        /* If we interrupted receive_pending_signal() we can't prepend a new
         * pending or delete an old b/c we might mess up the state so we
         * just drop this one: should only happen for alarm signal
         */
        (((info->accessing_sigpending && !info->nested_pending_ok) ||
          /* It's risky to deliver to an exiting thread: data structs are in unstable
           * states (i#4438).  Just drop if we can.
           */
          dcontext->is_exiting) &&
         /* we do want to report a crash in receive_pending_signal() */
         (can_always_delay[sig] ||
          is_sys_kill(dcontext, pc, (byte *)sc->SC_XSP, &frame->info)))) {
        LOG(THREAD, LOG_ASYNCH, 1, "nested or exit-time signal %d\n", sig);
        ASSERT(ostd->processing_signal == 0 || sig == SUSPEND_SIGNAL || sig == SIGSEGV);
        ASSERT(can_always_delay[sig] ||
               is_sys_kill(dcontext, pc, (byte *)sc->SC_XSP, &frame->info));
        /* To avoid re-entrant execution of special_heap_alloc() and of
         * prepending to the pending list we just drop this signal.
         * FIXME i#194/PR 453996: do better.
         */
        STATS_INC(num_signals_dropped);
        SYSLOG_INTERNAL_WARNING_ONCE("dropping nested/exit-time signal");
        return;
    }
    /* Every return path from here on must undo this increment. */
    ostd->processing_signal++; /* no need for atomicity: thread-private */

    /* First, check whether blocked, before we restore for sigsuspend (i#1340). */
    if (kernel_sigismember(&info->app_sigblocked, sig))
        blocked = true;

    /* syslen stays 0 unless we are at a syscall; it is used below to step
     * sc->SC_XIP past the syscall instruction.
     */
    if (get_at_syscall(dcontext))
        syslen = syscall_instr_length(dr_get_isa_mode(dcontext));

    if (info->sighand->action[sig] != NULL &&
        info->sighand->action[sig]->handler == (handler_t)SIG_IGN
        /* If a client registered a handler, put this in the queue.
         * Races between registering, queueing, and delivering are fine.
         */
        && !dr_signal_hook_exists()) {
        LOG(THREAD, LOG_ASYNCH, 3,
            "record_pending_signal (%d at pc " PFX "): action is SIG_IGN!\n", sig, pc);
        ostd->processing_signal--;
        return;
    } else if (blocked) {
        /* Signal is blocked by app, so just record it, don't receive now. */
        LOG(THREAD, LOG_ASYNCH, 2,
            "record_pending_signal(%d at pc " PFX "): signal is currently blocked\n", sig,
            pc);
        IF_LINUX(handled = notify_signalfd(dcontext, info, sig, frame));

        /* TODO i#1188: Similarly to signalfd, deliver to a sigwait-ing thread if we
         * avoid the actual sigwait syscall for handling it.
         */

        /* Another app thread may be unmasked and the kernel delivered to us because
         * DR had the real mask unmasked for this thread (i#2311).
         */
        LOG(THREAD, LOG_ASYNCH, 2,
            "blocked signal: process-wide=%d (code=%d), #-unmasked=%d\n",
            signal_is_process_wide(dcontext, &frame->info, pc, xsp), frame->info.si_code,
            atomic_aligned_read_int(&info->sighand->threads_unmasked[sig]));
        if (signal_is_process_wide(dcontext, &frame->info, pc, xsp) &&
            !signal_is_fault(dcontext, sig, pc, (byte *)sc->SC_XSP, &frame->info) &&
            atomic_aligned_read_int(&info->sighand->threads_unmasked[sig]) > 0 &&
            (!sig_is_alarm_signal(sig) ||
             (!info->in_app_handler && DYNAMO_OPTION(reroute_alarm_signals)))) {
            /* We need to re-route this but cannot acquire the locks to search the
             * threads here.  We thus start delivery and once we come back from
             * dispatch we'll do the search in reroute_to_unmasked_thread().
             * By definition as a process-wide signal this should be asynchronous
             * and delayable.
             */
            LOG(THREAD, LOG_ASYNCH, 2,
                "will reroute process-wide signal to unblocked thread\n");
            blocked = false;
            reroute = true;
        }
    }
    /* The value of blocked may have changed, so we re-start the if-else chain. */
    if (blocked) {
        /* No reroute needed so no action needed. */
    } else if (dcontext->currently_stopped) {
        /* We have no way to delay for a currently-native thread. */
        if (reroute) {
            /* We can't delay, but we can't have interrupted any DR locks, so we
             * can call this now.
             */
            ASSERT_NOT_TESTED();
            handled = reroute_to_unmasked_thread(dcontext, frame, sig);
            if (!handled)
                blocked = true;
        } else {
            LOG(THREAD, LOG_ASYNCH, 2, "Going to receive signal natively now\n");
            execute_native_handler(dcontext, sig, frame);
            handled = true;
        }
    } else if (safe_is_in_fcache(dcontext, pc, xsp)) {
        LOG(THREAD, LOG_ASYNCH, 2, "record_pending_signal(%d) from cache pc " PFX "\n",
            sig, pc);
        /* We assume a reroute can be delayed as a process-directed asynch signal. */
        if (forged || reroute || can_always_delay[sig]) {
            /* to make translation easier, want to delay if can until d_r_dispatch
             * unlink cur frag, wait for d_r_dispatch
             */
            /* check for coarse first to avoid cost of coarse pclookup */
            if (get_fcache_coarse_info(pc) != NULL) {
                /* PR 213040: we can't unlink coarse.  If we fail to translate
                 * we'll switch back to delaying, below.
                 */
                if (sig_is_alarm_signal(sig) && info->sigpending[sig] != NULL &&
                    info->sigpending[sig]->next != NULL && info->skip_alarm_xl8 > 0) {
                    /* Translating coarse fragments is very expensive so we
                     * avoid doing it when we're having trouble keeping up w/
                     * the alarm frequency (PR 213040), but we make sure we try
                     * every once in a while to avoid unbounded signal delay
                     */
                    info->skip_alarm_xl8--;
                    STATS_INC(num_signals_coarse_delayed);
                } else {
                    if (sig_is_alarm_signal(sig))
                        info->skip_alarm_xl8 = SKIP_ALARM_XL8_MAX;
                    receive_now = true;
                    LOG(THREAD, LOG_ASYNCH, 2,
                        "signal interrupted coarse fragment so delivering now\n");
                }
            } else {
                f = fragment_pclookup(dcontext, pc, &wrapper);
                ASSERT(f != NULL);
                ASSERT(!TEST(FRAG_COARSE_GRAIN, f->flags)); /* checked above */
                LOG(THREAD, LOG_ASYNCH, 2, "\tdelaying until exit F%d\n", f->id);
                if (interrupted_inlined_syscall(dcontext, f, pc)) {
                    /* PR 596147: if delayable signal arrives after syscall-skipping
                     * jmp, either at syscall or post-syscall, we deliver
                     * immediately, since we can't bound the delay
                     */
                    receive_now = true;
                    LOG(THREAD, LOG_ASYNCH, 2,
                        "signal interrupted pre/post syscall itself so delivering now\n");
                    /* We don't set at_auto_restart_syscall because we just leave
                     * the SA_RESTART kernel-supplied resumption point: with no
                     * post-syscall handler to worry about we have no need to
                     * change anything.
                     */
                } else {
                    /* could get another signal but should be in same fragment */
                    ASSERT(info->interrupted == NULL || info->interrupted == f);
                    if (info->interrupted != f) {
                        /* Just in case there's a prior, avoid leaving it unlinked. */
                        relink_interrupted_fragment(dcontext, info);
                        if (unlink_fragment_for_signal(dcontext, f, pc)) {
                            info->interrupted = f;
                            info->interrupted_pc = pc;
                        } else {
                            /* either was unlinked for trace creation, or we got another
                             * signal before exiting cache to handle 1st
                             */
                            ASSERT(info->interrupted == NULL || info->interrupted == f);
                        }
                    }
                }
            }
        } else {
            /* the signal interrupted code cache => run handler now! */
            receive_now = true;
            LOG(THREAD, LOG_ASYNCH, 2, "\tnot certain can delay so handling now\n");
        }
    } else if (in_generated_routine(dcontext, pc) ||
               /* XXX: should also check fine stubs */
               safe_is_in_coarse_stubs(dcontext, pc, xsp)) {
        /* Assumption: dynamo errors have been caught already inside
         * the main_signal_handler, thus any error in a generated routine
         * is an asynch signal that can be delayed
         */
        LOG(THREAD, LOG_ASYNCH, 2,
            "record_pending_signal(%d) from gen routine or stub " PFX "\n", sig, pc);
        if (get_at_syscall(dcontext)) {
            /* i#1206: the syscall was interrupted, so we can go back to d_r_dispatch
             * and don't need to receive it now (which complicates post-syscall handling)
             * w/o any extra delay.
             */
            /* i#2659: we now use SA_RESTART to handle interrupting native
             * auto-restart syscalls.  That means we have to adjust do_syscall
             * interruption to give us control so we can deliver the signal.  Due to
             * needing to run post-syscall handlers (we don't want to get into nested
             * dcontexts like on Windows) it's simplest to go back to d_r_dispatch, which
             * is most easily done by emulating the non-SA_RESTART behavior.
             * XXX: This all seems backward: we should revisit this model and see if
             * we can get rid of this emulation and the auto-restart emulation.
             */
            /* The get_at_syscall() check above distinguishes from just having
             * arrived at the syscall instr, but with SA_RESTART we can't distinguish
             * not-yet-executed-syscall from syscall-was-interrupted-in-the-kernel.
             * This matters for sigreturn (i#2995), whose asynch_target points somewhere
             * other than right after the syscall, so we exclude it (it can't be
             * interrupted so we know we haven't executed it yet).
             */
            if (is_after_syscall_address(dcontext, pc + syslen) &&
                !is_sigreturn_syscall_number(sc->SC_SYSNUM_REG)) {
                LOG(THREAD, LOG_ASYNCH, 2,
                    "Adjusting interrupted auto-restart syscall from " PFX " to " PFX
                    "\n",
                    pc, pc + syslen);
                at_auto_restart_syscall = true;
                sc->SC_XIP += syslen;
                sc->SC_RETURN_REG = -EINTR;
                pc = (byte *)sc->SC_XIP;
            }
        }
        /* This could come from another thread's SYS_kill (via our gen do_syscall) */
        DOLOG(1, LOG_ASYNCH, {
            if (!is_after_syscall_address(dcontext, pc) && !forged &&
                !can_always_delay[sig]) {
                LOG(THREAD, LOG_ASYNCH, 1,
                    "WARNING: signal %d in gen routine: may cause problems!\n", sig);
            }
        });
        /* i#2019: for a signal arriving in gencode before entry to a fragment,
         * we need to unlink the fragment just like for a signal arriving inside
         * the fragment itself.
         * Multiple signals should all have the same asynch_target so we should
         * only need a single info->interrupted.
         */
        if (info->interrupted == NULL && !get_at_syscall(dcontext)) {
            f = find_next_fragment_from_gencode(dcontext, sc);
            if (f != NULL && !TEST(FRAG_COARSE_GRAIN, f->flags)) {
                if (unlink_fragment_for_signal(dcontext, f, FCACHE_ENTRY_PC(f))) {
                    info->interrupted = f;
                    info->interrupted_pc = FCACHE_ENTRY_PC(f);
                }
            }
        }
    } else if (get_at_syscall(dcontext) && pc == vsyscall_sysenter_return_pc - syslen &&
               /* See i#2995 comment above: rule out sigreturn */
               !is_sigreturn_syscall_number(sc->SC_SYSNUM_REG)) {
        LOG(THREAD, LOG_ASYNCH, 2,
            "record_pending_signal(%d) from restart-vsyscall " PFX "\n", sig, pc);
        /* While the kernel points at int 0x80 for a restart, we leverage our
         * existing sysenter restart mechanism.
         */
        at_auto_restart_syscall = true;
        sc->SC_XIP = (reg_t)vsyscall_sysenter_return_pc;
        sc->SC_RETURN_REG = -EINTR;
        pc = (byte *)sc->SC_XIP;
    } else if (pc == vsyscall_sysenter_return_pc) {
        LOG(THREAD, LOG_ASYNCH, 2, "record_pending_signal(%d) from vsyscall " PFX "\n",
            sig, pc);
        /* i#1206: the syscall was interrupted but is not auto-restart, so we can go
         * back to d_r_dispatch and don't need to receive it now (which complicates
         * post-syscall handling)
         */
    } else if (thread_synch_check_state(dcontext, THREAD_SYNCH_NO_LOCKS) &&
               /* Avoid grabbing locks for xl8 while in a suspended state (i#3026). */
               ksynch_get_value(&ostd->suspended) == 0) {
        /* The signal interrupted DR or the client but it's at a safe spot so
         * deliver it now.
         */
        receive_now = true;
    } else {
        /* the signal interrupted DR itself => do not run handler now! */
        LOG(THREAD, LOG_ASYNCH, 2, "record_pending_signal(%d) from DR at pc " PFX "\n",
            sig, pc);
        if (!forged && !reroute && !can_always_delay[sig] &&
            !is_sys_kill(dcontext, pc, (byte *)sc->SC_XSP, &frame->info)) {
            /* i#195/PR 453964: don't re-execute if will just re-fault.
             * Our checks for dstack, etc. in main_signal_handler should
             * have accounted for everything
             */
            ASSERT_NOT_REACHED();
            abort_on_DR_fault(dcontext, pc, NULL, sig, frame,
                              (sig == SIGSEGV) ? "SEGV" : "other", " unknown");
        }
    }

    LOG(THREAD, LOG_ASYNCH, 3, "\taction is not SIG_IGN\n");
#if defined(X86) && defined(LINUX)
    LOG(THREAD, LOG_ASYNCH, 3, "\tretaddr = " PFX "\n",
        frame->pretcode); /* pretcode has same offs for plain */
#endif

    if (receive_now && reroute) {
        /* We can't delay, but we can't have interrupted any DR locks, so we
         * can call this now.
         */
        handled = reroute_to_unmasked_thread(dcontext, frame, sig);
        if (handled)
            receive_now = false;
        else
            blocked = true;
    }

    if (receive_now) {
        /* we need to translate sc before we know whether client wants to
         * suppress, so we need a backup copy
         */
        bool xl8_success;

        ASSERT(!at_auto_restart_syscall); /* only used for delayed delivery */

        sc_orig = *sc;
        ASSERT(!forged);
        /* cache the fragment since pclookup is expensive for coarse (i#658) */
        f = fragment_pclookup(dcontext, (cache_pc)sc->SC_XIP, &wrapper);
        xl8_success = translate_sigcontext(dcontext, ucxt, !can_always_delay[sig], f);

        if (can_always_delay[sig] && !xl8_success) {
            /* delay: we expect this for coarse fragments if alarm arrives
             * in middle of ind branch region or sthg (PR 213040)
             */
            LOG(THREAD, LOG_ASYNCH, 2,
                "signal is in un-translatable spot in coarse fragment: delaying\n");
            receive_now = false;
        }
    }

    if (receive_now) {

        /* N.B.: since we abandon the old context for synchronous signals,
         * we do not need to mark this fragment as FRAG_CANNOT_DELETE
         */
#ifdef DEBUG
        if (d_r_stats->loglevel >= 2 && (d_r_stats->logmask & LOG_ASYNCH) != 0 &&
            safe_is_in_fcache(dcontext, pc, xsp)) {
            ASSERT(f != NULL);
            LOG(THREAD, LOG_ASYNCH, 2, "Got signal at pc " PFX " in this fragment:\n",
                pc);
            disassemble_fragment(dcontext, f, false);
        }
#endif

        LOG(THREAD, LOG_ASYNCH, 2, "Going to receive signal now\n");
        /* If we end up executing the default action, we'll go native
         * since we translated the context.  If there's a handler,
         * we'll copy the context to the app stack and then adjust the
         * original on our stack so we take over.
         */
        execute_handler_from_cache(dcontext, sig, frame, &sc_orig, f, access_address);

    } else if (!handled) {

        /* i#182/PR 449996: must let client act on blocked non-delayable signals to
         * handle instrumentation faults.  Make sure we're at a safe spot: i.e.,
         * only raise for in-cache faults.  Checking forged and no-delay
         * to avoid the in-cache check for delayable signals => safer.
         */
        if (blocked && !forged && !reroute && !can_always_delay[sig] &&
            safe_is_in_fcache(dcontext, pc, xsp)) {
            /* cache the fragment since pclookup is expensive for coarse (i#658) */
            f = fragment_pclookup(dcontext, (cache_pc)sc->SC_XIP, &wrapper);
            sc_orig = *sc;
            translate_sigcontext(dcontext, ucxt, true /*shouldn't fail*/, f);

            if (!send_signal_to_client_and_handle_action(
                    dcontext, sig, frame, sc, &sc_orig, access_address, true /*blocked*/,
                    f, false /*handle deliver*/)) {
                ostd->processing_signal--;
                return;
            }
            /* restore original (untranslated) sc */
            *get_sigcontext_from_rt_frame(frame) = sc_orig;
        }

        /* i#196/PR 453847: avoid infinite loop of signals if try to re-execute */
        if (blocked && !can_always_delay[sig] &&
            !is_sys_kill(dcontext, pc, (byte *)sc->SC_XSP, &frame->info)) {
            ASSERT(default_action[sig] == DEFAULT_TERMINATE ||
                   default_action[sig] == DEFAULT_TERMINATE_CORE);
            LOG(THREAD, LOG_ASYNCH, 1,
                "blocked fatal signal %d cannot be delayed: terminating\n", sig);
            sc_orig = *sc;
            /* If forged we're likely couldbelinking, and we don't need to xl8. */
            if (forged)
                ASSERT(is_couldbelinking(dcontext));
            else
                translate_sigcontext(dcontext, ucxt, true /*shouldn't fail*/, NULL);
            /* the process should be terminated */
            execute_default_from_cache(dcontext, sig, frame, &sc_orig, forged);
            ASSERT_NOT_REACHED();
        }

        /* Happened in DR, do not translate context.  Record for later processing
         * at a safe point with a clean app state.
         */
        if (!blocked || sig >= OFFS_RT || (blocked && info->sigpending[sig] == NULL)) {
            /* only have 1 pending for blocked non-rt signals */

            /* to avoid accumulating signals if we're slow in presence of
             * a high-rate itimer we only keep 2 alarm signals (PR 596768)
             */
            if (sig_is_alarm_signal(sig)) {
                if (info->sigpending[sig] != NULL &&
                    info->sigpending[sig]->next != NULL) {
                    ASSERT(info->sigpending[sig]->next->next == NULL);
                    /* keep the oldest, replace newer w/ brand-new one, for
                     * more spread-out alarms
                     */
                    sigpending_t *temp = info->sigpending[sig];
                    info->sigpending[sig] = temp->next;
                    special_heap_free(info->sigheap, temp);
                    info->num_pending--;
                    LOG(THREAD, LOG_ASYNCH, 2, "3rd pending alarm %d => dropping 2nd\n",
                        sig);
                    STATS_INC(num_signals_dropped);
                    SYSLOG_INTERNAL_WARNING_ONCE("dropping 3rd pending alarm signal");
                }
            }
            /* special heap alloc always uses sizeof(sigpending_t) blocks */
            pend = special_heap_alloc(info->sigheap);
            ASSERT(sig > 0 && sig <= MAX_SIGNUM);
            info->num_pending++;
            if (info->num_pending > DYNAMO_OPTION(max_pending_signals) &&
                !info->multiple_pending_units)
                info->multiple_pending_units = true;
            if (info->num_pending >= DYNAMO_OPTION(max_pending_signals)) {
                /* We're at the limit of our special heap: one more and it will try to
                 * allocate a new unit, which is unsafe as it acquires locks.  We take
                 * several steps: we notify the user; we check for this on delivery as
                 * well and proactively allocate a new unit in a safer context.
                 * XXX: Perhaps we should drop some signals here?
                 */
                DO_ONCE({
                    char max_string[32];
                    snprintf(max_string, BUFFER_SIZE_ELEMENTS(max_string), "%d",
                             DYNAMO_OPTION(max_pending_signals));
                    NULL_TERMINATE_BUFFER(max_string);
                    SYSLOG(SYSLOG_WARNING, MAX_PENDING_SIGNALS, 3, get_application_name(),
                           get_application_pid(), max_string);
                });
            }

            /* Prepend: the head of sigpending[sig] is the most recently recorded. */
            pend->next = info->sigpending[sig];
            info->sigpending[sig] = pend;
            pend->unblocked_at_receipt = !blocked && !reroute;

            /* FIXME: note that for asynchronous signals we don't need to
             *  bother to record exact machine context, even entire frame,
             *  since don't want to pass dynamo pc context to app handler.
             *  only copy frame for synchronous signals?  those only
             *  happen while in cache?  but for asynch, we would have to
             *  construct our own frame...kind of a pain.
             */
            copy_frame_to_pending(dcontext, sig, frame, access_address);

            /* i#1145: check whether we should auto-restart an interrupted syscall */
            if (at_auto_restart_syscall) {
                /* Adjust the pending frame to restart the syscall, if applicable */
                /* N.B.: this local "frame" shadows the function parameter and refers
                 * to the queued entry's frame rather than the incoming one.
                 */
                sigframe_rt_t *frame = &(info->sigpending[sig]->rt_frame);
                sigcontext_t *sc_pend = get_sigcontext_from_rt_frame(frame);
                if (adjust_syscall_for_restart(dcontext, info, sig, sc_pend, f,
                                               orig_retval_reg)) {
                    /* We're going to re-start this syscall after we go
                     * back to d_r_dispatch, run the post-syscall handler (for -EINTR),
                     * and deliver the signal.  We've adjusted the sigcontext
                     * for re-start on the sigreturn, but we need to tell
                     * execute_handler_from_dispatch() to use our sigcontext
                     * and not the mcontext.
                     * A client will see a second set of pre + post handlers for
                     * the restart, which seems reasonable, given the signal in
                     * between.
                     */
                    info->sigpending[sig]->use_sigcontext = true;
                }
            }

        } else {
            /* For clients, we document that we do not pass to them
             * unless we're prepared to deliver to app.  We would have
             * to change our model to pass them non-final-translated
             * contexts for delayable signals in order to give them
             * signals as soon as they come in.  Xref i#182/PR 449996.
             */
            LOG(THREAD, LOG_ASYNCH, 3,
                "\tnon-rt signal already in queue, ignoring this one!\n");
        }

        /* Only an unblocked signal sets dcontext->signals_pending. */
        if (!blocked && dcontext->signals_pending == 0)
            dcontext->signals_pending = 1;
    }
    ostd->processing_signal--;
}

/* Tells apart a signal sent via a kill-style system call from one raised by an
 * instruction.  Returns true for the former.
 * A sender in another process cannot be identified from the interruption point,
 * but when this thread sent the signal itself the interruption point should be
 * our own post-syscall.
 * FIXME PR 368277: for other threads in same process we should set a flag
 * and identify them as well.
 * FIXME: for faults like SIGILL we could examine the interrupted pc
 * to see whether it is capable of generating such a fault (see code
 * used in handle_nudge_signal()).
 */
static bool
is_sys_kill(dcontext_t *dcontext, byte *pc, byte *xsp, kernel_siginfo_t *info)
{
#if !defined(VMX86_SERVER) /* does not use SI_KERNEL */
    /* i#133: si_code identifies user-sent signals.  Even the 2.2 Linux kernel
     * uses <=0 to mean user-sent (except SIGIO), so we assume it is reliable.
     */
    if (info->si_code <= 0)
        return true;
#endif
    /* Otherwise the interruption point must be our do_syscall... */
    if (!is_at_do_syscall(dcontext, pc, xsp))
        return false;
    /* ...and the system call being made must be one that sends a signal. */
    if (dcontext->sys_num == SYS_kill)
        return true;
#ifdef LINUX
    if (dcontext->sys_num == SYS_tkill)
        return true;
    if (dcontext->sys_num == SYS_tgkill)
        return true;
    if (dcontext->sys_num == SYS_rt_sigqueueinfo)
        return true;
    if (dcontext->sys_num == SYS_rt_tgsigqueueinfo)
        return true;
#elif defined(MACOS)
    if (dcontext->sys_num == SYS___pthread_kill)
        return true;
#endif
    return false;
}

/* Works out which memory address the instruction at instr_cache_pc was accessing
 * when it raised the SIGSEGV described by si and uc.
 *
 * The candidates are tried in this order:
 *  1) For SEGV_ACCERR with a non-NULL si_addr: si_addr itself, if it lies inside
 *     one of the instruction's memory operands.
 *  2) The first memory operand whose page is unmapped or unreadable, or is
 *     unwritable when the operand is written.
 *  3) instr_cache_pc itself, if it is unmapped or not executable.
 *
 * dcontext:       the current thread's context.
 * instr_cache_pc: address of the faulting instruction; decoded under TRY/EXCEPT.
 * uc:             the interrupted context, used to compute operand addresses.
 * si:             the kernel-supplied signal info (si_code and si_addr are read).
 * write:          must be non-NULL; filled in by instr_compute_address_ex_priv()
 *                 for each memory operand that is examined.
 *
 * Returns the computed target, or NULL if instr_cache_pc cannot be read, does not
 * decode to a valid instruction, or no candidate matches.
 */
static byte *
compute_memory_target(dcontext_t *dcontext, cache_pc instr_cache_pc,
                      kernel_ucontext_t *uc, kernel_siginfo_t *si, bool *write)
{
    sigcontext_t *sc = SIGCXT_FROM_UCXT(uc);
    byte *target = NULL;
    instr_t instr;
    priv_mcontext_t mc;
    uint memopidx, memoppos, memopsize;
    opnd_t memop;
    bool found_target = false;
    bool in_maps;
    bool use_allmem = false;
    uint prot;
    IF_ARM(dr_isa_mode_t old_mode;)

    LOG(THREAD, LOG_ALL, 2,
        "computing memory target for " PFX " causing SIGSEGV, kernel claims it is " PFX
        "\n",
        instr_cache_pc, (byte *)si->si_addr);
    /* ARM's sigcontext_t has a "fault_address" field but it also seems unreliable */
    IF_ARM(LOG(THREAD, LOG_ALL, 2, "fault_address: " PFX "\n", sc->fault_address));

    /* We used to do a memory query to check if instr_cache_pc is readable, but
     * now we use TRY/EXCEPT because we don't have the instr length and the OS
     * query is expensive.  If decoding faults, the signal handler will longjmp
     * out before it calls us recursively.
     */
    instr_init(dcontext, &instr);
    IF_ARM({
        /* Be sure to use the interrupted mode and not the last-dispatch mode */
        dr_set_isa_mode(dcontext, get_pc_mode_from_cpsr(sc), &old_mode);
    });
    TRY_EXCEPT(
        dcontext, { decode(dcontext, instr_cache_pc, &instr); },
        {
            /* instr_cache_pc was unreadable.  Put the ISA mode back before
             * bailing out, exactly as the non-faulting path does below: old_mode
             * was written before the TRY and not since, so it is still valid here.
             * NOTE(review): instr is not freed on this path, as before; presumably
             * decode faults before it allocates anything -- confirm.
             */
            IF_ARM(dr_set_isa_mode(dcontext, old_mode, NULL));
            return NULL;
        });
    IF_ARM(dr_set_isa_mode(dcontext, old_mode, NULL));

    if (!instr_valid(&instr)) {
        LOG(THREAD, LOG_ALL, 2,
            "WARNING: got SIGSEGV for invalid instr at cache pc " PFX "\n",
            instr_cache_pc);
        ASSERT_NOT_REACHED();
        instr_free(dcontext, &instr);
        return NULL;
    }

    ucontext_to_mcontext(&mc, uc);
    ASSERT(write != NULL);

    /* i#1009: If si_addr is plausibly one of the memory operands of the
     * faulting instruction, assume the target was si_addr.  If none of the
     * memops match, fall back to checking page protections, which can be racy.
     * For si_addr == NULL, we fall back to the protection check because it's
     * too likely to be a valid memop and we can live with a race on a page that
     * is typically unmapped.
     */
    if (si->si_code == SEGV_ACCERR && si->si_addr != NULL) {
        for (memopidx = 0; instr_compute_address_ex_priv(&instr, &mc, memopidx, &target,
                                                         write, &memoppos);
             memopidx++) {
            /* i#1045: check whether operand and si_addr overlap */
            memop = *write ? instr_get_dst(&instr, memoppos)
                           : instr_get_src(&instr, memoppos);
            memopsize = opnd_size_in_bytes(opnd_get_size(memop));
            LOG(THREAD, LOG_ALL, 2, "memory operand %u has address " PFX " and size %u\n",
                memopidx, target, memopsize);
            if ((byte *)si->si_addr >= target &&
                (byte *)si->si_addr < target + memopsize) {
                target = (byte *)si->si_addr;
                found_target = true;
                break;
            }
        }
    }

    /* For fcache faults, use all_memory_areas, which is faster but acquires
     * locks.  If it's possible we're in DR, go to the OS to avoid deadlock.
     */
    if (DYNAMO_OPTION(use_all_memory_areas)) {
        use_allmem = safe_is_in_fcache(dcontext, instr_cache_pc, (byte *)sc->SC_XSP);
    }
    if (!found_target) {
        if (si->si_addr != NULL) {
            LOG(THREAD, LOG_ALL, 3, "%s: falling back to racy protection checks\n",
                __FUNCTION__);
        }
        /* i#115/PR 394984: consider all memops */
        for (memopidx = 0;
             instr_compute_address_ex_priv(&instr, &mc, memopidx, &target, write, NULL);
             memopidx++) {
            if (use_allmem) {
                in_maps = get_memory_info(target, NULL, NULL, &prot);
            } else {
                in_maps = get_memory_info_from_os(target, NULL, NULL, &prot);
            }
            /* An operand is the culprit if its page is missing or unreadable, or
             * if the operand is written and the page is not writable.
             */
            if ((!in_maps || !TEST(MEMPROT_READ, prot)) ||
                (*write && !TEST(MEMPROT_WRITE, prot))) {
                found_target = true;
                break;
            }
        }
    }

    if (!found_target) {
        /* probably an NX fault: how tell whether kernel enforcing? */
        in_maps = get_memory_info_from_os(instr_cache_pc, NULL, NULL, &prot);
        if (!in_maps || !TEST(MEMPROT_EXEC, prot)) {
            target = instr_cache_pc;
            found_target = true;
        }
    }

    /* we may still not find target, e.g. for SYS_kill(SIGSEGV) */
    if (!found_target)
        target = NULL;
    DOLOG(2, LOG_ALL, {
        LOG(THREAD, LOG_ALL, 2,
            "For SIGSEGV at cache pc " PFX ", computed target %s " PFX "\n",
            instr_cache_pc, *write ? "write" : "read", target);
        d_r_loginst(dcontext, 2, &instr, "\tfaulting instr");
    });
    instr_free(dcontext, &instr);
    return target;
}

/* Handles a fault whose target lies in an executable region that was writable
 * and that we marked read-only.
 * Returns false if target is not in such a region.  Otherwise the fault is
 * handled here and true is returned, in which case the caller should simply
 * have main_signal_handler return.
 * If native_state is true, assumes the fault is not in the cache and thus
 * does not need translation but rather should always be re-executed.
 */
static bool
check_for_modified_code(dcontext_t *dcontext, cache_pc instr_cache_pc,
                        kernel_ucontext_t *uc, byte *target, bool native_state)
{
    /* Special case: a seg fault is expected on executable regions that were
     * writable and that we made read-only.  The caller had to figure out the
     * target address, as the OS tells us neither that nor whether it's a write.
     * FIXME: if sent from SYS_kill(SIGSEGV), the pc will be post-syscall,
     * and if that post-syscall instr is a write that could have faulted,
     * how can we tell the difference?
     */
    if (!was_executable_area_writable(target))
        return false;

    /* Translate instr_cache_pc to the original app pc.
     * DO NOT use translate_sigcontext: we don't want to change the signal
     * frame or else we'll lose control when we try to return to the signal pc!
     */
    app_pc translated_pc = NULL;
    fragment_t *frag = NULL;
    fragment_t frag_wrapper;
    ASSERT((cache_pc)SIGCXT_FROM_UCXT(uc)->SC_XIP == instr_cache_pc);
    if (!native_state) {
        /* Safe recreation requires either being couldbelinking or holding the
         * initexit lock (to keep someone from flushing the current fragment);
         * the initexit lock is the easier of the two.
         */
        d_r_mutex_lock(&thread_initexit_lock);
        /* Look the fragment up once and reuse it, since pclookup is expensive
         * for coarse units (i#658).
         */
        frag = fragment_pclookup(dcontext, instr_cache_pc, &frag_wrapper);
        translated_pc = recreate_app_pc(dcontext, instr_cache_pc, frag);
        ASSERT(translated_pc != NULL);
        d_r_mutex_unlock(&thread_initexit_lock);
    }

    app_pc resume_pc =
        handle_modified_code(dcontext, instr_cache_pc, translated_pc, target, frag);

    /* We are going to exit from the middle of a fragment (at the write), which
     * will mess up trace building.
     */
    if (!native_state && is_building_trace(dcontext)) {
        LOG(THREAD, LOG_ASYNCH, 3, "\tsquashing trace-in-progress\n");
        trace_abort(dcontext);
    }

    if (resume_pc != NULL) {
        ASSERT(!native_state);
        /* Do not resume execution in cache, go back to d_r_dispatch. */
        transfer_from_sig_handler_to_fcache_return(
            dcontext, uc, NULL, SIGSEGV, resume_pc,
            (linkstub_t *)get_selfmod_linkstub(), false);
    }
    /* Either we re-execute the write (resume_pc == NULL) or we redirected to
     * d_r_dispatch: both are achieved by having main_signal_handler return.
     */
    return true;
}

#ifndef HAVE_SIGALTSTACK
/* Arguments that sig_should_swap_stack() passes back for clone_and_swap_stack.
 * The exact layout of this struct is relied on in main_signal_handler()
 * in x86.asm.
 */
struct clone_and_swap_args {
    /* Copy destination: derived from the dstack, with room left for fpstate. */
    byte *stack;
    /* Top of the stack at the signal interruption point (the copy source). */
    byte *tos;
};

/* Helper function for swapping the handler to the dstack.
 * Returns false if this thread has no dcontext or if the current stack pointer
 * is already on the dstack.  Otherwise fills in args for clone_and_swap_stack
 * and returns true.
 */
bool
sig_should_swap_stack(struct clone_and_swap_args *args, kernel_ucontext_t *ucxt)
{
    dcontext_t *dcontext = get_thread_private_dcontext();
    byte *cur_sp;
    byte *dst;
    if (dcontext == NULL)
        return false;
    GET_STACK_PTR(cur_sp);
    if (is_on_dstack(dcontext, cur_sp))
        return false;
    /* Pass back the proper args to clone_and_swap_stack: we want to
     * copy to dstack from the tos at the signal interruption point.
     */
    dst = dcontext->dstack;
    /* Leave room for fpstate. */
    dst -= signal_frame_extra_size(true);
    args->stack = (byte *)ALIGN_BACKWARD(dst, XSTATE_ALIGNMENT);
    args->tos = (byte *)SIGCXT_FROM_UCXT(ucxt)->SC_XSP;
    return true;
}
#endif

/* Takes over the current thread, which was signaled via SUSPEND_SIGNAL.
 * This lives in its own routine mostly so that the priv_mcontext_t is not
 * allocated in main_signal_handler_C's frame.
 * A successful takeover is not expected to return.  If this routine does
 * return, it returns false, and the signal should be squashed.
 */
static bool
sig_take_over(kernel_ucontext_t *uc)
{
    priv_mcontext_t mc;
    ucontext_to_mcontext(&mc, uc);
    /* Pass the app's signal mask, stored in the frame, rather than our own
     * blocked signals.
     */
    if (os_thread_take_over(&mc, SIGMASK_FROM_UCXT(uc))) {
        ASSERT_NOT_REACHED(); /* shouldn't return */
        return true;          /* make compiler happy */
    }
    return false;
}

/* Returns whether the interrupted pc stored in ucxt is one that
 * is_safe_read_pc() accepts.
 */
static bool
is_safe_read_ucxt(kernel_ucontext_t *ucxt)
{
    sigcontext_t *sc = SIGCXT_FROM_UCXT(ucxt);
    return is_safe_read_pc((app_pc)sc->SC_XIP);
}

/* The main signal handler.
 * WARNING: behavior varies with different versions of the kernel!
 * sigaction support was only added with 2.2.
 *
 * sig, siginfo, and ucxt are the usual handler arguments; on X86_32 they are
 * instead read out of the frame at xsp.  xsp is passed in by the asm stub and
 * (except on 64-bit x86 MacOS, where the frame is computed from siginfo) is
 * used as the address of the signal frame.
 *
 * Rough flow:
 *  1) Recover from faults in the TLS safe-read routines (X86 only).
 *  2) Find the dcontext for this thread, falling back to GLOBAL_DCONTEXT for
 *     safe_read/TRY faults and to thread_lookup() for suspend signals and
 *     threads with the invalid TLS sentinel set.
 *  3) With no usable dcontext: drop alarms, take over on SUSPEND_SIGNAL, or
 *     run the native handler.
 *  4) Otherwise enter DR and dispatch on the signal number.
 */
#ifndef X86_32
/* stub in x86.asm passes our xsp to us */
#    ifdef MACOS
void
main_signal_handler_C(handler_t handler, int style, int sig, kernel_siginfo_t *siginfo,
                      kernel_ucontext_t *ucxt, byte *xsp)
#    else
void
main_signal_handler_C(int sig, kernel_siginfo_t *siginfo, kernel_ucontext_t *ucxt,
                      byte *xsp)
#    endif
#else
/* On ia32, adding a parameter disturbs the frame we're trying to capture, so we
 * add an intermediate frame and read the normal params off the stack directly.
 */
void
main_signal_handler_C(byte *xsp)
#endif
{
#if defined(MACOS64) && defined(X86)
    /* The kernel aligns to 16 after setting up the frame, so we instead compute
     * from the siginfo pointer.
     * XXX: 32-bit does the same thing: how was it working?!?
     */
    sigframe_rt_t *frame = (sigframe_rt_t *)((byte *)siginfo - sizeof(frame->mc));
    /* The kernel's method of aligning overshoots. */
#    define KERNEL_ALIGN_BACK(val, align) (((val)-align) & -(align))
    /* If this assert fails, we may be seeing an AVX512 frame. */
    ASSERT(KERNEL_ALIGN_BACK((ptr_uint_t)frame, 16) - 8 == (ptr_uint_t)xsp &&
           "AVX512 frames not yet supported");
#else
    sigframe_rt_t *frame = (sigframe_rt_t *)xsp;
#endif
#ifdef X86_32
    /* Read the normal arguments from the frame. */
    int sig = frame->sig;
    kernel_siginfo_t *siginfo = frame->pinfo;
    kernel_ucontext_t *ucxt = frame->puc;
#endif /* X86_32 */
    sigcontext_t *sc = SIGCXT_FROM_UCXT(ucxt);
    thread_record_t *tr;
#ifdef DEBUG
    /* Log level used for this handler's routine entry/exit messages. */
    uint level = 2;
#    if !defined(HAVE_MEMINFO)
    /* avoid logging every single TRY probe fault */
    if (!dynamo_initialized)
        level = 5;
#    endif
#endif
    /* Whether we made the thread-local heap writable and must re-protect it
     * on the way out.
     */
    bool local;
#if defined(MACOS) && !defined(X64)
    /* The kernel clears fs, so we have to re-instate our selector, if
     * it was set in the first place.
     */
    if (sc->__ss.__fs != 0)
        tls_reinstate_selector(sc->__ss.__fs);
#endif
#ifdef X86
    /* i#2089: For is_thread_tls_initialized() we need a safe_read path that does not
     * do any logging or call get_thread_private_dcontext() as those will recurse.
     * This path is global so there's no SELF_PROTECT_LOCAL and we also bypass
     * the ENTERING_DR() for this short path.
     * Each case makes the interrupted routine return 0 by redirecting the pc
     * to its matching recovery label.
     */
    if (sig == SIGSEGV && sc->SC_XIP == (ptr_uint_t)safe_read_tls_magic) {
        sc->SC_RETURN_REG = 0;
        sc->SC_XIP = (reg_t)safe_read_tls_magic_recover;
        return;
    } else if (sig == SIGSEGV && sc->SC_XIP == (ptr_uint_t)safe_read_tls_self) {
        sc->SC_RETURN_REG = 0;
        sc->SC_XIP = (reg_t)safe_read_tls_self_recover;
        return;
    } else if (sig == SIGSEGV && sc->SC_XIP == (ptr_uint_t)safe_read_tls_app_self) {
        sc->SC_RETURN_REG = 0;
        sc->SC_XIP = (reg_t)safe_read_tls_app_self_recover;
        return;
    }
#endif

    dcontext_t *dcontext = get_thread_private_dcontext();

#if defined(MACOS) && !defined(AARCH64)
#    ifdef X64
    ASSERT((YMM_ENABLED() && ucxt->uc_mcsize == sizeof(_STRUCT_MCONTEXT_AVX64)) ||
           (!YMM_ENABLED() && ucxt->uc_mcsize == sizeof(_STRUCT_MCONTEXT64)));
#    else
    ASSERT((YMM_ENABLED() && ucxt->uc_mcsize == sizeof(_STRUCT_MCONTEXT_AVX32)) ||
           (!YMM_ENABLED() && ucxt->uc_mcsize == sizeof(_STRUCT_MCONTEXT)));
#    endif
#endif

    /* i#350: To support safe_read or TRY_EXCEPT without a dcontext, use the
     * global dcontext when handling safe_read faults.  This lets us pass the
     * check for a dcontext below and causes us to use the global log.
     */
    if (dcontext == NULL && (sig == SIGSEGV || sig == SIGBUS) &&
        (is_safe_read_ucxt(ucxt) ||
         (!dynamo_initialized && global_try_except.try_except_state != NULL &&
          global_try_tid == get_sys_thread_id()))) {
        dcontext = GLOBAL_DCONTEXT;
    }

    if (dynamo_exited && d_r_get_num_threads() > 1 && sig == SIGSEGV) {
        /* PR 470957: this is almost certainly a race so just squelch it.
         * We live w/ the risk that it was holding a lock our release-build
         * exit code needs.
         */
        exit_thread_syscall(1);
    }
    /* FIXME: ensure the path for recording a pending signal does not grab any DR locks
     * that could have been interrupted
     * e.g., synchronize_dynamic_options grabs the stats_lock!
     */
    if (sig == SUSPEND_SIGNAL) {
        if (proc_get_vendor() == VENDOR_AMD) {
            /* i#3356: Work around an AMD processor bug where it does not clear the
             * hidden gs base when the gs selector is written.  Pre-4.7 Linux kernels
             * leave the prior thread's base in place on a switch due to this.
             * We can thus come here and get the wrong dcontext on attach; worse,
             * we can get NULL here but the wrong one later during init.  It's
             * safest to just set a non-zero value (the kernel ignores zero) for all
             * unknown threads here.  There are no problems for non-attach takeover.
             */
            if (dcontext == NULL || dcontext->owning_thread != get_sys_thread_id()) {
                /* tls_thread_preinit() further rules out a temp-native dcontext
                 * and avoids clobbering it, to preserve the thread_lookup() case
                 * below (which we do not want to run first as we could swap to
                 * the incorrect dcontext midway through it).
                 */
                if (!tls_thread_preinit()) {
                    SYSLOG_INTERNAL_ERROR_ONCE("ERROR: Failed to work around AMD context "
                                               "switch bug #3356: crashes or "
                                               "hangs may ensue...");
                }
                dcontext = NULL;
            }
        }
    }
    if (dcontext == NULL &&
        /* Check for a temporarily-native thread we're synch-ing with. */
        (sig == SUSPEND_SIGNAL
#ifdef X86
         ||
         (INTERNAL_OPTION(safe_read_tls_init) &&
          /* Check for whether this is a thread with its invalid sentinel magic set.
           * In this case, we assume that it is either a thread that is currently
           * temporarily-native via API like DR_EMIT_GO_NATIVE, or a thread in the
           * clone window. We know by inspection of our own code that it is safe to
           * call thread_lookup for either case thread makes a clone or was just
           * cloned. i.e. thread_lookup requires a lock that must not be held by the
           * calling thread (i#2921).
           * XXX: what is ARM doing, any special case w/ dcontext == NULL?
           */
          /* Don't do this if we're detaching as we risk a safe-read fault after
           * we've removed our handler (i#3535).
           */
          detacher_tid == INVALID_THREAD_ID && safe_read_tls_magic() == TLS_MAGIC_INVALID)
#endif
             )) {
        tr = thread_lookup(get_sys_thread_id());
        if (tr != NULL)
            dcontext = tr->dcontext;
    }
    /* No usable dcontext: either there is none, or its signal state is missing
     * or not yet fully initialized.  Every path through this block returns (or
     * does not come back) without entering DR.
     */
    if (dcontext == NULL ||
        (dcontext != GLOBAL_DCONTEXT &&
         (dcontext->signal_field == NULL ||
          !((thread_sig_info_t *)dcontext->signal_field)->fully_initialized))) {
        /* XXX i#26: this could be a signal arbitrarily sent to this thread.
         * We could try to route it to another thread, using a global queue
         * of pending signals.  But what if it was targeted to this thread
         * via SYS_{tgkill,tkill}?  Can we tell the difference, even if
         * we watch the kill syscalls: could come from another process?
         * For now we use our native thread signal delivery mechanism.
         */
        if (sig_is_alarm_signal(sig)) {
            /* Assuming an alarm during thread exit or init (xref PR 596127,
             * i#359).  Delivering it sometimes works, but we could be mid-init
             * where it's not easy and we could crash b/c of in-between state
             * (observed for alarm mid-signal_thread_init where info->sighand->action
             * is still NULL).
             */
            return;
        } else if (sig == SUSPEND_SIGNAL && dcontext == NULL) {
            /* We sent SUSPEND_SIGNAL to a thread we don't control (no
             * dcontext), which means we want to take over.
             */
            ASSERT(!doing_detach);
            if (!sig_take_over(ucxt))
                return;
            ASSERT_NOT_REACHED(); /* else, shouldn't return */
        } else {
            LOG(GLOBAL, LOG_ASYNCH, 1,
                "signal with no siginfo (tid=%d, sig=%d): attempting to deliver to "
                "native thread\n",
                get_sys_thread_id(), sig);
            DOLOG(1, LOG_ASYNCH, { dump_sigcontext(GLOBAL_DCONTEXT, sc); });
        }

        /* For can_always_delay[sig] we could just return and drop it, but we
         * try to perturb the app behavior less with a native signal frame:
         */
        execute_native_handler(dcontext, sig, frame);
        return;
    }

    /* we may be entering dynamo from code cache! */
    /* Note that this is unsafe if -single_thread_in_DR => we grab a lock =>
     * hang if signal interrupts DR: but we don't really support that option
     */
    ENTERING_DR();
    if (dcontext == GLOBAL_DCONTEXT) {
        /* With the global dcontext we skip the local self-protection toggling
         * and must look the thread record up rather than read it from the
         * dcontext.
         */
        local = false;
        tr = thread_lookup(get_sys_thread_id());
    } else {
        tr = dcontext->thread_record;
        local = local_heap_protected(dcontext);
        if (local)
            SELF_PROTECT_LOCAL(dcontext, WRITABLE);
    }
    /* i#1921: For proper native execution with re-takeover we need to propagate
     * signals to app handlers while native.  For now we do not support re-takeover
     * and we give up our handlers via signal_remove_handlers().
     */
    ASSERT(tr == NULL || tr->under_dynamo_control || IS_CLIENT_THREAD(dcontext) ||
           sig == SUSPEND_SIGNAL);

    LOG(THREAD, LOG_ASYNCH, level,
        "\nmain_signal_handler: thread=%d, sig=%d, xsp=" PFX ", retaddr=" PFX "\n",
        get_sys_thread_id(), sig, xsp, *((byte **)xsp));
    LOG(THREAD, LOG_ASYNCH, level + 1,
        "siginfo: sig = %d, pid = %d, status = %d, errno = %d, si_code = %d\n",
        siginfo->si_signo, siginfo->si_pid, siginfo->si_status, siginfo->si_errno,
        siginfo->si_code);
    DOLOG(level + 1, LOG_ASYNCH, { dump_sigcontext(dcontext, sc); });

#if defined(X86_32) && !defined(VMX86_SERVER) && defined(LINUX)
    /* FIXME case 6700: 2.6.9 (FC3) kernel sets up our frame with a pretcode
     * of 0x440.  This happens if our restorer is unspecified (though 2.6.9
     * src code shows setting the restorer to a default value in that case...)
     * or if we explicitly point at dynamorio_sigreturn.  I couldn't figure
     * out why it kept putting 0x440 there.  So we fix the issue w/ this
     * hardcoded return.
     * This hack causes vmkernel to kill the process on sigreturn due to
     * vmkernel's non-standard sigreturn semantics.  PR 404712.
     */
    *((byte **)xsp) = (byte *)dynamorio_sigreturn;
#endif

    /* N.B.:
     * ucontext_t is defined in two different places.  The one we get
     * included is /usr/include/sys/ucontext.h, which would have us
     * doing this:
     *     void *pc = (void *) ucxt->uc_mcontext.gregs[EIP];
     * However, EIP is not defined for us (used to be in older
     * RedHat version) unless we define __USE_GNU, which we don't want to do
     * for other reasons, so we'd have to also say:
     *     #define EIP 14
     * Instead we go by the ucontext_t definition in
     * /usr/include/asm/ucontext.h, which has it containing a sigcontext struct,
     * defined in /usr/include/asm/sigcontext.h.  This is the definition used
     * by the kernel.  The two definitions are field-for-field
     * identical except that the sys one has an fpstate struct at the end --
     * but the next field in the frame is an fpstate.  The only mystery
     * is why the rt frame is declared as ucontext instead of sigcontext.
     * The kernel's version of ucontext must be the asm one!
     * And the sys one grabs the next field of the frame.
     * Also note that mcontext_t.fpregs == sigcontext.fpstate is NULL if
     * floating point operations have not been used (lazy fp state saving).
     * Also, sigset_t has different sizes according to kernel (8 bytes) vs.
     * glibc (128 bytes?).
     */

    /* Dispatch on the signal number.  Paths that "break" out of the switch
     * reach the common exit code at the bottom (logging, sigstack check,
     * re-protection, EXITING_DR); paths that return or longjmp bypass it.
     */
    switch (sig) {

    case SIGBUS: /* PR 313665: look for DR crashes on unaligned memory or mmap bounds */
    case SIGSEGV: {
        /* Older kernels do NOT fill out the signal-specific fields of siginfo,
         * except for SIGCHLD.  Thus we cannot do this:
         *     void *pc = (void*) siginfo->si_addr;
         * Thus we must use the third argument, which is a ucontext_t (see above)
         */
        void *pc = (void *)sc->SC_XIP;
        bool syscall_signal = false; /* signal came from syscall? */
        bool is_write = false;
        byte *target;
        bool is_DR_exception = false;

#ifdef SIDELINE
        if (dcontext == NULL) {
            SYSLOG_INTERNAL_ERROR("seg fault in sideline thread -- NULL dcontext!");
            ASSERT_NOT_REACHED();
        }
#endif
        if (is_safe_read_ucxt(ucxt) ||
            (!dynamo_initialized && global_try_except.try_except_state != NULL) ||
            (dcontext != GLOBAL_DCONTEXT &&
             dcontext->try_except.try_except_state != NULL)) {
            /* handle our own TRY/EXCEPT */
            try_except_context_t *try_cxt;
#ifdef HAVE_MEMINFO
            /* our probe produces many of these every run */
            /* since we use for safe_*, making a _ONCE */
            SYSLOG_INTERNAL_WARNING_ONCE("(1+x) Handling our fault in a TRY at " PFX, pc);
#endif
            LOG(THREAD, LOG_ALL, level, "TRY fault at " PFX "\n", pc);
            ASSERT(dynamo_initialized || global_try_except.try_except_state == NULL ||
                   global_try_tid == get_sys_thread_id());
            if (TEST(DUMPCORE_TRY_EXCEPT, DYNAMO_OPTION(dumpcore_mask)))
                os_dump_core("try/except fault");

            if (is_safe_read_ucxt(ucxt)) {
                /* A safe_read fault recovers by redirecting the pc; no longjmp. */
                sc->SC_XIP = (reg_t)safe_read_resume_pc();
                /* Break out to log the normal return from the signal handler.
                 */
                break;
            }
            /* Use the thread's own TRY state if it has a real dcontext, else
             * the global one.
             */
            try_cxt = (dcontext != NULL && dcontext != GLOBAL_DCONTEXT)
                ? dcontext->try_except.try_except_state
                : global_try_except.try_except_state;
            ASSERT(try_cxt != NULL);

            /* The exception interception code did an ENTER so we must EXIT here */
            EXITING_DR();
            /* Since we have no sigreturn we have to restore the mask
             * manually, just like siglongjmp().  i#226/PR 492568: we rely
             * on the kernel storing the prior mask in ucxt, so we do not
             * need to store it on every setjmp.
             */
            /* Verify that there's no scenario where the mask gets changed prior
             * to a fault inside a try.  This relies on dr_setjmp_sigmask() filling
             * in the mask, which we only bother to do in debug build.
             */
            ASSERT(memcmp(&try_cxt->context.sigmask, &ucxt->uc_sigmask,
                          sizeof(ucxt->uc_sigmask)) == 0);
            sigprocmask_syscall(SIG_SETMASK, SIGMASK_FROM_UCXT(ucxt), NULL,
                                sizeof(ucxt->uc_sigmask));
            DR_LONGJMP(&try_cxt->context, LONGJMP_EXCEPTION);
            ASSERT_NOT_REACHED();
        }
        ASSERT(dcontext != GLOBAL_DCONTEXT); /* Had to be try/except. */

        target = compute_memory_target(dcontext, pc, ucxt, siginfo, &is_write);

        if (CLIENTS_EXIST() && is_in_client_lib(pc)) {
            /* i#1354: client might write to a page we made read-only.
             * If so, handle the fault and re-execute it, if it's safe to do so
             * (we document these criteria under DR_MEMPROT_PRETEND_WRITE).
             */
            if (is_write && !is_couldbelinking(dcontext) && OWN_NO_LOCKS(dcontext) &&
                check_for_modified_code(dcontext, pc, ucxt, target, true /*native*/))
                break;
#ifdef STATIC_LIBRARY
            /* i#4640: We cannot distinguish the client from DR or the app.  Though
             * it's rare, original app instructions can come here for some app
             * setups for static DR during init.  We send to the app handler.
             */
            execute_native_handler(dcontext, sig, frame);
            return;
#endif
            abort_on_fault(dcontext, DUMPCORE_CLIENT_EXCEPTION, pc, target, sig, frame,
                           exception_label_client, (sig == SIGSEGV) ? "SEGV" : "BUS",
                           " client library");
            ASSERT_NOT_REACHED();
        }

        /* For !HAVE_MEMINFO, we cannot compute the target until
         * after the try/except check b/c compute_memory_target()
         * calls get_memory_info_from_os() which does a probe: and the
         * try/except could be from a probe itself.  A try/except that
         * triggers a stack overflow should recover on the longjmp, so
         * this order should be fine.
         */

        /* Classify the fault: is_DR_exception is set when the interrupted
         * stack or pc indicates the fault happened in DR itself or in its
         * generated code rather than in the code cache.
         */
        /* FIXME: share code with Windows callback.c */
        /* FIXME PR 205795: in_fcache and is_dynamo_address do grab locks! */
        if ((is_on_dstack(dcontext, (byte *)sc->SC_XSP)
             /* PR 302951: clean call arg processing => pass to app/client.
              * Rather than call the risky in_fcache we check whereami. */
             && (dcontext->whereami != DR_WHERE_FCACHE)) ||
            is_on_alt_stack(dcontext, (byte *)sc->SC_XSP) ||
            is_on_initstack((byte *)sc->SC_XSP)) {
            /* Checks here need to cover everything that record_pending_signal()
             * thinks is non-fcache, non-gencode: else that routine will kill
             * process since can't delay or re-execute (i#195/PR 453964).
             */
            is_DR_exception = true;
        } else if (!safe_is_in_fcache(dcontext, pc, (byte *)sc->SC_XSP) &&
                   (in_generated_routine(dcontext, pc) ||
                    is_at_do_syscall(dcontext, pc, (byte *)sc->SC_XSP) ||
                    is_dynamo_address(pc))) {
            if (!in_generated_routine(dcontext, pc) &&
                !is_at_do_syscall(dcontext, pc, (byte *)sc->SC_XSP)) {
                /* PR 451074: client needs a chance to handle exceptions in its
                 * own gencode.  client_exception_event() won't return if client
                 * wants to re-execute faulting instr.
                 */
                if (!send_signal_to_client_and_handle_action(
                        dcontext, sig, frame, get_sigcontext_from_rt_frame(frame), sc,
                        target, false /*!blocked*/, NULL, true /*no_deliver*/)) {
                    /* client handled fault */
                    break;
                }
            }
            is_DR_exception = true;
        }
        if (is_DR_exception) {
            /* kill(getpid(), SIGSEGV) looks just like a SIGSEGV in the store of eax
             * to mcontext after the syscall instr in do_syscall -- try to distinguish:
             */
            if (is_sys_kill(dcontext, pc, (byte *)sc->SC_XSP, siginfo)) {
                LOG(THREAD, LOG_ALL, 2,
                    "assuming SIGSEGV at post-do-syscall is kill, not our write fault\n");
                syscall_signal = true;
            }
            if (!syscall_signal) {
                if (check_in_last_thread_vm_area(dcontext, target)) {
                    /* See comments in callback.c as well.
                     * FIXME: try to share code
                     */
                    SYSLOG_INTERNAL_WARNING("(decode) exception in last area, "
                                            "DR pc=" PFX ", app pc=" PFX,
                                            pc, target);
                    STATS_INC(num_exceptions_decode);
                    if (is_building_trace(dcontext)) {
                        LOG(THREAD, LOG_ASYNCH, 2,
                            "intercept_exception: "
                            "squashing old trace\n");
                        trace_abort(dcontext);
                    }
                    /* we do get faults when not building a bb: e.g.,
                     * ret_after_call_check does decoding (case 9396) */
                    if (dcontext->bb_build_info != NULL) {
                        /* must have been building a bb at the time */
                        bb_build_abort(dcontext, true /*clean vm area*/, true /*unlock*/);
                    }
                    /* Since we have no sigreturn we have to restore the mask manually */
                    unblock_all_signals(NULL);
                    /* Let's pass it back to the application - memory is unreadable */
                    if (TEST(DUMPCORE_FORGE_UNREAD_EXEC, DYNAMO_OPTION(dumpcore_mask)))
                        os_dump_core("Warning: Racy app execution (decode unreadable)");
                    os_forge_exception(target, UNREADABLE_MEMORY_EXECUTION_EXCEPTION);
                    ASSERT_NOT_REACHED();
                } else {
                    abort_on_DR_fault(dcontext, pc, target, sig, frame,
                                      (sig == SIGSEGV) ? "SEGV" : "BUS",
                                      in_generated_routine(dcontext, pc) ? " generated"
                                                                         : "");
                }
            }
        }
        /* if get here, pass the signal to the app */

        ASSERT(pc != 0); /* shouldn't get here */
        if (sig == SIGSEGV && !syscall_signal /*only for in-cache signals*/) {
            /* special case: we expect a seg fault for executable regions
             * that were writable and marked read-only by us.
             */
            if (is_write &&
                check_for_modified_code(dcontext, pc, ucxt, target, false /*!native*/)) {
                /* it was our signal, so don't pass to app -- return now */
                break;
            }
        }
        /* pass it to the application (or client) */
        if (dcontext->currently_stopped) {
            /* NOTE(review): presumably DR is not running this thread's code
             * from the cache at this point, so the app handler is run natively
             * instead of queueing the signal -- confirm.
             */
            execute_native_handler(dcontext, sig, frame);
            break;
        }
        LOG(THREAD, LOG_ALL, 1,
            "** Received SIG%s at cache pc " PFX " in thread " TIDFMT "\n",
            (sig == SIGSEGV) ? "SEGV" : "BUS", pc, d_r_get_thread_id());
        ASSERT(syscall_signal || safe_is_in_fcache(dcontext, pc, (byte *)sc->SC_XSP));
        /* we do not call trace_abort() here since we may need to
         * translate from a temp private bb (i#376): but all paths
         * that deliver the signal or redirect will call it
         */
        record_pending_signal(dcontext, sig, ucxt, frame, false, target);
        break;
    }

    case SIGALRM:
    case SIGVTALRM:
    case SIGPROF:
        /* Only queue the alarm for the app if handle_alarm() returns true. */
        if (handle_alarm(dcontext, sig, ucxt))
            record_pending_signal(dcontext, sig, ucxt, frame, false, NULL);
        /* else, don't deliver to app */
        break;

#ifdef SIDELINE
    case SIGCHLD: {
        int status = siginfo->si_status;
        if (siginfo->si_pid == 0) {
            /* FIXME: with older versions of linux the sigchld fields of
             * siginfo are not filled in properly!
             * This is my attempt to handle that, pid seems to be 0
             */
            break;
        }
        if (status != 0) {
            LOG(THREAD, LOG_ALL, 0, "*** Child thread died with error %d\n", status);
            ASSERT_NOT_REACHED();
        }
        break;
    }
#endif

    default: {
        /* Handle nudges and suspends, which may use the same signal. */
        if (sig == SUSPEND_SIGNAL) {
            if (handle_suspend_signal(dcontext, siginfo, ucxt, frame)) {
                /* i#1921: see comment above */
                ASSERT(tr == NULL || tr->under_dynamo_control ||
                       IS_CLIENT_THREAD(dcontext));
                record_pending_signal(dcontext, sig, ucxt, frame, false, NULL);
            }
            /* else, don't deliver to app */
            break;
        }
        /* This might be the same signal as SUSPEND_SIGNAL in which case this else() won't
         * be entered (and handle_suspend_signal() will handle nudges too).
         */
        else if (sig == NUDGESIG_SIGNUM) {
            if (handle_nudge_signal(dcontext, siginfo, ucxt))
                record_pending_signal(dcontext, sig, ucxt, frame, false, NULL);
            /* else, don't deliver to app */
            break;
        }
        /* Any other signal is simply queued for delivery. */
        record_pending_signal(dcontext, sig, ucxt, frame, false, NULL);
        break;
    }
    } /* end switch */

    LOG(THREAD, LOG_ASYNCH, level,
        "\tmain_signal_handler %d returning now to " PFX "\n\n", sig, sc->SC_XIP);

    /* Ensure we didn't get the app's sigstack into our frame.  On Mac, the kernel
     * doesn't use the frame's uc_stack, so we limit this to Linux.
     * The pointers may be different if a thread is on its way to exit, and the app's
     * sigstack was already restored (i#3369).
     */
    IF_LINUX(ASSERT(dcontext == NULL || dcontext == GLOBAL_DCONTEXT || dynamo_exited ||
                    dcontext->is_exiting ||
                    frame->uc.uc_stack.ss_sp ==
                        ((thread_sig_info_t *)dcontext->signal_field)->sigstack.ss_sp));

    /* restore protections */
    if (local)
        SELF_PROTECT_LOCAL(dcontext, READONLY);
    EXITING_DR();
}

static bool
execute_handler_from_cache(dcontext_t *dcontext, int sig, sigframe_rt_t *our_frame,
                           sigcontext_t *sc_orig, fragment_t *f, byte *access_address)
{
    thread_sig_info_t *info = (thread_sig_info_t *)dcontext->signal_field;
    /* we want to modify the sc in DR's frame */
    kernel_ucontext_t *uc = get_ucontext_from_rt_frame(our_frame);
    sigcontext_t *sc = SIGCXT_FROM_UCXT(uc);
    kernel_sigset_t blocked;
    /* Need to get xsp now before get new dcontext.
     * This is the translated xsp, so we avoid PR 306410 (cleancall arg fault
     * on dstack => handler run on dstack) that Windows hit.
     */
    byte *xsp = get_sigstack_frame_ptr(dcontext, info, sig,
                                       our_frame/* take xsp from (translated)
                                                 * interruption point */);
    if (!send_signal_to_client_and_handle_action(dcontext, sig, our_frame, sc, sc_orig,
                                                 access_address, false /*not blocked*/, f,
                                                 false /*handle deliver*/))
        return false;

    LOG(THREAD, LOG_ASYNCH, 2, "execute_handler_from_cache for signal %d\n", sig);
    RSTATS_INC(num_signals);

    /* now that we know it's not a client-involved fault, dump as app fault */
    report_app_problem(dcontext, APPFAULT_FAULT, (byte *)sc->SC_XIP, (byte *)sc->SC_FP,
                       "\nSignal %d delivered to application handler.\n", sig);

    LOG(THREAD, LOG_ASYNCH, 3, "\txsp is " PFX "\n", xsp);

    if (info->in_sigsuspend)
        terminate_sigsuspend(dcontext, info, uc);

    /* copy frame to appropriate stack and convert to non-rt if necessary */
    copy_frame_to_stack(dcontext, info, sig, our_frame, (void *)xsp, false /*!pending*/);
    LOG(THREAD, LOG_ASYNCH, 3, "\tcopied frame from " PFX " to " PFX "\n", our_frame,
        xsp);
    sigcontext_t *app_sc = get_sigcontext_from_app_frame(info, sig, (void *)xsp);

    /* Because of difficulties determining when/if a signal handler
     * returns, we do what the kernel does: abandon all of our current
     * state, copy what we might need to the handler frame if we come back,
     * and then it's ok if the handler doesn't return.
     * If it does, we start interpreting afresh when we see sigreturn().
     * This routine assumes anything needed to return has been put in the
     * frame (only needed for signals queued up while in dynamo), and goes
     * ahead and trashes the current dcontext.
     */

    /* if we were building a trace, kill it */
    if (is_building_trace(dcontext)) {
        LOG(THREAD, LOG_ASYNCH, 3, "\tsquashing trace-in-progress\n");
        trace_abort(dcontext);
    }

    /* add to set of blocked signals those in sigaction mask */
    blocked = info->sighand->action[sig]->mask;
    /* SA_NOMASK says whether to block sig itself or not */
    if ((info->sighand->action[sig]->flags & SA_NOMASK) == 0)
        kernel_sigaddset(&blocked, sig);
    set_blocked(dcontext, &blocked, false /*relative: OR these in*/);

    /* Doesn't matter what most app registers are, signal handler doesn't
     * expect anything except the frame on the stack.  We do need to set xsp,
     * only because if app wants special signal stack we need to point xsp
     * there.  (If no special signal stack, this is a nop.)
     */
    sc->SC_XSP = (ptr_uint_t)xsp;
    /* Set up args to handler: int sig, kernel_siginfo_t *siginfo,
     * kernel_ucontext_t *ucxt.
     */
#ifdef X86_64
    sc->SC_XDI = sig;
    sc->SC_XSI = (reg_t) & ((sigframe_rt_t *)xsp)->info;
    sc->SC_XDX = (reg_t) & ((sigframe_rt_t *)xsp)->uc;
#elif defined(AARCHXX)
    sc->SC_R0 = sig;
    if (IS_RT_FOR_APP(info, sig)) {
        sc->SC_R1 = (reg_t) & ((sigframe_rt_t *)xsp)->info;
        sc->SC_R2 = (reg_t) & ((sigframe_rt_t *)xsp)->uc;
    }
#    ifdef LINUX
    if (sig_has_restorer(info, sig))
        sc->SC_LR = (reg_t)info->sighand->action[sig]->restorer;
    else
#    endif
        sc->SC_LR = (reg_t)dynamorio_sigreturn;
#elif defined(RISCV64)
    /* FIXME i#3544: Check if xsp is cast correctly? */
    sc->SC_A0 = sig;
    if (IS_RT_FOR_APP(info, sig)) {
        sc->SC_A1 = (reg_t) & ((sigframe_rt_t *)xsp)->info;
        sc->SC_A2 = (reg_t) & ((sigframe_rt_t *)xsp)->uc;
    }
    if (sig_has_restorer(info, sig))
        sc->SC_RA = (reg_t)info->sighand->action[sig]->restorer;
    else
        sc->SC_RA = (reg_t)dynamorio_sigreturn;
#endif
    /* Set our sigreturn context (NOT for the app: we already copied the
     * translated context to the app stack) to point to fcache_return!
     * Then we'll go back through kernel, appear in fcache_return,
     * and go through d_r_dispatch & interp, without messing up DR stack.
     */
    transfer_from_sig_handler_to_fcache_return(
        dcontext, uc, app_sc, sig,
        /* Make sure handler is next thing we execute */
        (app_pc)SIGACT_PRIMARY_HANDLER(info->sighand->action[sig]),
        (linkstub_t *)get_asynch_linkstub(), true);

    if ((info->sighand->action[sig]->flags & SA_ONESHOT) != 0) {
        /* clear handler now -- can't delete memory since sigreturn,
         * others may look at sigaction struct, so we just set to default
         */
        info->sighand->action[sig]->handler = (handler_t)SIG_DFL;
    }
    info->in_app_handler = true;

    LOG(THREAD, LOG_ASYNCH, 3, "\tset next_tag to handler " PFX ", xsp to " PFX "\n",
        SIGACT_PRIMARY_HANDLER(info->sighand->action[sig]), xsp);
    return true;
}

/* Delivers the pending signal sig, whose frame is stored in
 * info->sigpending[sig], now that the thread is back at d_r_dispatch.
 *
 * The pending frame is first updated with the currently-interrupted context
 * (unless the pending entry asks for its sigcontext to be used as-is), then
 * handed to the client via send_signal_to_client().  Depending on the client's
 * answer and the app's registered action, this routine redirects execution,
 * drops the signal, executes the default action, or copies the frame to the
 * app's stack and sets up the mcontext so that the app's handler is the next
 * thing executed.
 *
 * Returns false if the signal was not delivered (client suppressed it or the
 * app's handler is SIG_IGN); returns true otherwise.
 */
static bool
execute_handler_from_dispatch(dcontext_t *dcontext, int sig)
{
    thread_sig_info_t *info = (thread_sig_info_t *)dcontext->signal_field;
    byte *xsp = get_sigstack_frame_ptr(dcontext, info, sig, NULL);
    sigframe_rt_t *frame = &(info->sigpending[sig]->rt_frame);
    priv_mcontext_t *mcontext = get_mcontext(dcontext);
    sigcontext_t *sc;
    kernel_ucontext_t *uc;
    kernel_sigset_t blocked;

    dr_signal_action_t action;

    LOG(THREAD, LOG_ASYNCH, 2, "execute_handler_from_dispatch for signal %d\n", sig);
    RSTATS_INC(num_signals);

    /* modify the rtframe before copying to stack so we can pass final
     * version to client, and propagate its mods
     */
    uc = get_ucontext_from_rt_frame(frame);
    sc = SIGCXT_FROM_UCXT(uc);

    /* Because of difficulties determining when/if a signal handler
     * returns, we do what the kernel does: abandon all of our current
     * state, copy what we might need to the handler frame if we come back,
     * and then it's ok if the handler doesn't return.
     * If it does, we start interpreting afresh when we see sigreturn().
     */

#ifdef DEBUG
    if (d_r_stats->loglevel >= 3 && (d_r_stats->logmask & LOG_ASYNCH) != 0) {
        LOG(THREAD, LOG_ASYNCH, 3, "original sigcontext " PFX ":\n", sc);
        dump_sigcontext(dcontext, sc);
    }
#endif
    if (info->sigpending[sig]->use_sigcontext) {
        LOG(THREAD, LOG_ASYNCH, 2,
            "%s: using sigcontext, not mcontext (syscall restart)\n", __FUNCTION__);
    } else {
        /* copy currently-interrupted-context to frame's context, so we can
         * abandon the currently-interrupted context.
         */
        mcontext_to_ucontext(uc, mcontext);
    }

    /* Sigreturn needs the target ISA mode to be set in the T bit in cpsr.
     * Since we came from d_r_dispatch, the post-signal target's mode is in dcontext.
     */
    IF_ARM(set_pc_mode_in_cpsr(sc, dr_get_isa_mode(dcontext)));
    /* mcontext does not contain fp or mmx or xmm state, which may have
     * changed since the frame was created (while finishing up interrupted
     * fragment prior to returning to d_r_dispatch).  Since DR does not touch
     * this state except for xmm on x64, we go ahead and copy the
     * current state into the frame, and then touch up xmm for x64.
     */
    /* FIXME: should this be done for all pending as soon as reach
     * d_r_dispatch?  what if get two asynch inside same frag prior to exiting
     * cache?  have issues with fpstate, but also prob with next_tag? FIXME
     */
    /* FIXME: we should clear fpstate for app handler itself as that's
     * how our own handler is executed.
     */
#if defined(LINUX) && defined(X86)
    ASSERT(sc->fpstate != NULL); /* not doing i#641 yet */
    save_fpstate(dcontext, frame);
#endif /* LINUX && X86 */
#ifdef DEBUG
    if (d_r_stats->loglevel >= 3 && (d_r_stats->logmask & LOG_ASYNCH) != 0) {
        LOG(THREAD, LOG_ASYNCH, 3, "new sigcontext " PFX ":\n", sc);
        dump_sigcontext(dcontext, sc);
        LOG(THREAD, LOG_ASYNCH, 3, "\n");
    }
    IF_AARCHXX(ASSERT(get_sigcxt_stolen_reg(sc) != (reg_t)*get_dr_tls_base_addr()));
#endif
    /* FIXME: other state?  debug regs?
     * if no syscall allowed between main_ (when frame created) and
     * receiving, then don't have to worry about debug regs, etc.
     * check for syscall when record pending, if it exists, try to
     * receive in pre_system_call or something? what if ignorable?  FIXME!
     */

    if (!info->sigpending[sig]->use_sigcontext) {
        /* for the pc we want the app pc not the cache pc */
        sc->SC_XIP = (ptr_uint_t)dcontext->next_tag;
        /* Point at the rseq abort handler if in an rseq region. */
        ptr_uint_t special_xl8 =
            (ptr_uint_t)translate_restore_special_cases(dcontext, (app_pc)sc->SC_XIP);
        if (special_xl8 != sc->SC_XIP) {
            dcontext->next_tag = (app_pc)special_xl8;
            sc->SC_XIP = special_xl8;
            LOG(THREAD, LOG_ASYNCH, 3, "set next PC to special xl8 %p\n",
                dcontext->next_tag);
        }
        LOG(THREAD, LOG_ASYNCH, 3, "\tset frame's eip to " PFX "\n", sc->SC_XIP);
    }

    /* Snapshot the interrupted context before the client gets a chance to
     * modify the frame: the redirect path below reports it as the source.
     */
    sigcontext_t sc_interrupted = *sc;
    action = send_signal_to_client(dcontext, sig, frame, NULL,
                                   info->sigpending[sig]->access_address,
                                   false /*not blocked*/, NULL);
    /* in order to pass to the client, we come all the way here for signals
     * the app has no handler for
     */
    if (action == DR_SIGNAL_REDIRECT) {
        /* send_signal_to_client copied mcontext into frame's sc */
        ucontext_to_mcontext(mcontext, uc);
        if (is_building_trace(dcontext)) {
            LOG(THREAD, LOG_ASYNCH, 3, "\tsquashing trace-in-progress\n");
            trace_abort(dcontext);
        }
        sig_full_cxt_t sc_interrupted_full = { &sc_interrupted, NULL /*not provided*/ };
        instrument_kernel_xfer(dcontext, DR_XFER_CLIENT_REDIRECT, sc_interrupted_full,
                               NULL, NULL, mcontext->pc, mcontext->xsp, osc_empty,
                               mcontext, sig);
        /* We're going back to dispatch and won't use the sigcontext.
         * canonicalize_pc_target() will set the dcontext ISA mode for us.
         */
        dcontext->next_tag = canonicalize_pc_target(dcontext, mcontext->pc);
        return true; /* don't try another signal */
    } else if (action == DR_SIGNAL_SUPPRESS ||
               (info->sighand->action[sig] != NULL &&
                info->sighand->action[sig]->handler == (handler_t)SIG_IGN)) {
        LOG(THREAD, LOG_ASYNCH, 2, "%s: not delivering!\n",
            (action == DR_SIGNAL_SUPPRESS) ? "client suppressing signal"
                                           : "app signal handler is SIG_IGN");
        return false;
    } else if (action == DR_SIGNAL_BYPASS ||
               (info->sighand->action[sig] == NULL ||
                info->sighand->action[sig]->handler == (handler_t)SIG_DFL)) {
        LOG(THREAD, LOG_ASYNCH, 2, "%s: executing default action\n",
            (action == DR_SIGNAL_BYPASS) ? "client forcing default"
                                         : "app signal handler is SIG_DFL");
        if (info->sigpending[sig]->use_sigcontext) {
            /* after the default action we want to go to the sigcontext */
            dcontext->next_tag = canonicalize_pc_target(dcontext, (app_pc)sc->SC_XIP);
            ucontext_to_mcontext(get_mcontext(dcontext), uc);
            IF_ARM(dr_set_isa_mode(dcontext, get_pc_mode_from_cpsr(sc), NULL));
        }
        execute_default_from_dispatch(dcontext, sig, frame);
        return true;
    }
    CLIENT_ASSERT(action == DR_SIGNAL_DELIVER, "invalid signal event return value");

    if (info->in_sigsuspend)
        terminate_sigsuspend(dcontext, info, uc);

    /* now that we've made all our changes and given the client a
     * chance to make changes, copy the frame to the appropriate stack
     * location and convert to non-rt if necessary
     */
    copy_frame_to_stack(dcontext, info, sig, frame, xsp, true /*pending*/);
    /* now point at the app's frame */
    sc = get_sigcontext_from_app_frame(info, sig, (void *)xsp);

    ASSERT(info->sighand->action[sig] != NULL);

    /* add to set of blocked signals those in sigaction mask */
    blocked = info->sighand->action[sig]->mask;
    /* SA_NOMASK says whether to block sig itself or not */
    if ((info->sighand->action[sig]->flags & SA_NOMASK) == 0)
        kernel_sigaddset(&blocked, sig);
    set_blocked(dcontext, &blocked, false /*relative: OR these in*/);

    /* if we were building a trace, kill it */
    if (is_building_trace(dcontext)) {
        LOG(THREAD, LOG_ASYNCH, 3, "\tsquashing trace-in-progress\n");
        trace_abort(dcontext);
    }

    /* Doesn't matter what most app registers are, signal handler doesn't
     * expect anything except the frame on the stack.  We do need to set xsp.
     */
    mcontext->xsp = (ptr_uint_t)xsp;
    /* Set up args to handler: int sig, kernel_siginfo_t *siginfo,
     * kernel_ucontext_t *ucxt.
     * These go in the mcontext, which is what we resume from at dispatch.  The
     * sigcontext sc now points at the app's frame copy and must keep holding the
     * interrupted context, so it is not used for the handler's entry state.
     */
#if defined(MACOS64) && defined(AARCH64)
    mcontext->r0 = (reg_t)info->sighand->action[sig]->handler;
    int infostyle = TEST(SA_SIGINFO, info->sighand->action[sig]->flags)
        ? SIGHAND_STYLE_UC_FLAVOR
        : SIGHAND_STYLE_UC_TRAD;
    mcontext->r1 = infostyle;
    mcontext->r2 = sig;
    mcontext->r3 = (reg_t) & ((sigframe_rt_t *)xsp)->info;
    mcontext->r4 = (reg_t) & ((sigframe_rt_t *)xsp)->uc;
    mcontext->lr = (reg_t)dynamorio_sigreturn;
#elif defined(MACOS64) && defined(X86)
    mcontext->xdi = (reg_t)info->sighand->action[sig]->handler;
    int infostyle = TEST(SA_SIGINFO, info->sighand->action[sig]->flags)
        ? SIGHAND_STYLE_UC_FLAVOR
        : SIGHAND_STYLE_UC_TRAD;
    mcontext->xsi = infostyle;
    mcontext->xdx = sig;
    mcontext->xcx = (reg_t) & ((sigframe_rt_t *)xsp)->info;
    mcontext->r8 = (reg_t) & ((sigframe_rt_t *)xsp)->uc;
#elif defined(X86_64)
    mcontext->xdi = sig;
    mcontext->xsi = (reg_t) & ((sigframe_rt_t *)xsp)->info;
    mcontext->xdx = (reg_t) & ((sigframe_rt_t *)xsp)->uc;
#elif defined(AARCHXX)
    mcontext->r0 = sig;
    if (IS_RT_FOR_APP(info, sig)) {
        mcontext->r1 = (reg_t) & ((sigframe_rt_t *)xsp)->info;
        mcontext->r2 = (reg_t) & ((sigframe_rt_t *)xsp)->uc;
    }
#    ifdef LINUX
    if (sig_has_restorer(info, sig))
        mcontext->lr = (reg_t)info->sighand->action[sig]->restorer;
    else
#    endif
        mcontext->lr = (reg_t)dynamorio_sigreturn;
#elif defined(RISCV64)
    /* FIXME i#3544: Check if xsp is cast correctly? */
    /* As on the other architectures, the handler's arguments and return address
     * are written to the mcontext rather than to the app frame's sigcontext:
     * writing them to sc would clobber the saved interrupted a0-a2/ra and leave
     * the handler's actual entry registers unset.
     */
    mcontext->a0 = sig;
    if (IS_RT_FOR_APP(info, sig)) {
        mcontext->a1 = (reg_t) & ((sigframe_rt_t *)xsp)->info;
        mcontext->a2 = (reg_t) & ((sigframe_rt_t *)xsp)->uc;
    }
    if (sig_has_restorer(info, sig))
        mcontext->ra = (reg_t)info->sighand->action[sig]->restorer;
    else
        mcontext->ra = (reg_t)dynamorio_sigreturn;
#endif
#ifdef X86
    /* Clear eflags DF (signal handler should match function entry ABI) */
    mcontext->xflags &= ~EFLAGS_DF;
#endif
    /* Make sure handler is next thing we execute */
    mcontext->pc = (app_pc)SIGACT_PRIMARY_HANDLER(info->sighand->action[sig]);

    if ((info->sighand->action[sig]->flags & SA_ONESHOT) != 0) {
        /* clear handler now -- can't delete memory since sigreturn,
         * others may look at sigaction struct, so we just set to default
         */
        info->sighand->action[sig]->handler = (handler_t)SIG_DFL;
    }
    sig_full_cxt_t sc_full = { sc, NULL /*not provided*/ };
    /* i#4041: Provide the actually-interrupted mid-rseq PC to the rseq event. */
    ptr_uint_t official_xl8 = sc->SC_XIP;
    ptr_uint_t rseq_xl8 =
        (ptr_uint_t)translate_last_direct_translation(dcontext, (app_pc)official_xl8);
    if (rseq_xl8 != official_xl8) {
        /* Temporarily expose the mid-rseq PC for the abort event only. */
        sc->SC_XIP = rseq_xl8;
        instrument_kernel_xfer(dcontext, DR_XFER_RSEQ_ABORT, sc_full, NULL, NULL,
                               mcontext->pc, mcontext->xsp, osc_empty, mcontext, sig);
        /* The signal event has the abort handler, like the kernel passes. */
        sc->SC_XIP = official_xl8;
    }
    instrument_kernel_xfer(dcontext, DR_XFER_SIGNAL_DELIVERY, sc_full, NULL, NULL,
                           mcontext->pc, mcontext->xsp, osc_empty, mcontext, sig);
    dcontext->next_tag = canonicalize_pc_target(dcontext, mcontext->pc);
    info->in_app_handler = true;

    LOG(THREAD, LOG_ASYNCH, 3, "\tset xsp to " PFX "\n", xsp);
    return true;
}

/* Sends a signal to a currently-native thread.  dcontext can be NULL.
 *
 * our_frame is the signal frame being delivered.  A copy of it is placed at the
 * location chosen by get_sigstack_frame_ptr(), and the sigcontext inside
 * our_frame is then edited in place (stack pointer, argument registers, pc) so
 * that the resumption point is the app's handler.
 *
 * If the app has no handler (NULL or SIG_DFL action), nothing is delivered: the
 * signal is dropped when can_always_delay[sig] is set or the default action is
 * to ignore, and otherwise it is reported as unhandleable.
 */
static void
execute_native_handler(dcontext_t *dcontext, int sig, sigframe_rt_t *our_frame)
{
    /* If dcontext is NULL, we use a synthetic info struct where we fill in the
     * info we need: the app's sigaction and sigstack settings.
     */
    thread_sig_info_t synthetic = {};
    sighand_info_t sighand = {};
    kernel_sigaction_t sigact_struct;
    thread_sig_info_t *info;
    LOG(THREAD, LOG_ASYNCH, 2, "%s for signal %d in thread " TIDFMT "\n", __FUNCTION__,
        sig, get_sys_thread_id());
    if (dcontext != NULL && is_thread_signal_info_initialized(dcontext) &&
        /* For the DR init thread, we don't place the app handlers into signal_info
         * until DR starts running code and is equipped to deliver a managed signal.
         * So during init, until dynamo_started is set, we deliver natively.
         */
        dynamo_started) {
        info = (thread_sig_info_t *)dcontext->signal_field;
    } else {
        if (atomic_read_bool(&multiple_handlers_present)) {
            /* See i#1921 comment up top: we don't handle this. */
            report_unhandleable_signal_and_exit(sig, "multiple native handlers");
            ASSERT_NOT_REACHED();
        }
        /* Build the synthetic info on the stack, populating only the entries
         * for sig that the code below reads.
         */
        info = &synthetic;
        synthetic.sighand = &sighand;
        synthetic.sighand->we_intercept[sig] = true;
        synthetic.sighand->action[sig] = &sigact_struct;
        /* Take a private snapshot of the app's action under the read lock. */
        d_r_read_lock(&detached_sigact_lock);
        memcpy(&sigact_struct, &detached_sigact[sig], sizeof(sigact_struct));
        d_r_read_unlock(&detached_sigact_lock);
#ifdef HAVE_SIGALTSTACK
        thread_sig_info_t *dc_info = NULL;
        if (dcontext != NULL)
            dc_info = (thread_sig_info_t *)dcontext->signal_field;
        /* DR's sigstack is set up before is_thread_signal_info_initialized().
         * If DR's is in place, the app's is stored (it's a syscall so atomic).
         */
        if (dc_info != NULL && dc_info->sigstack.ss_sp != NULL) {
            memcpy(&synthetic.app_sigstack, &dc_info->app_sigstack,
                   sizeof(synthetic.app_sigstack));
        } else {
            /* Otherwise query the currently-installed alternate stack. */
            IF_DEBUG(int rc =)
            sigaltstack_syscall(NULL, &synthetic.app_sigstack);
            ASSERT(rc == 0);
        }
#endif
        dcontext = NULL; /* Clear for the not-yet-start or not-init cases above. */
    }
    kernel_ucontext_t *uc = get_ucontext_from_rt_frame(our_frame);
    sigcontext_t *sc = SIGCXT_FROM_UCXT(uc);
    kernel_sigset_t blocked;
    byte *xsp = get_sigstack_frame_ptr(dcontext, info, sig, our_frame);

    /* We do not send the signal to clients as it is not a managed thread and we
     * cannot really take any action except send it to the native handler.
     */

    if (info->sighand->action[sig] == NULL ||
        info->sighand->action[sig]->handler == (handler_t)SIG_DFL) {
        /* TODO i#1921: We want to call:
         *   execute_default_from_cache(dcontext, sig, our_frame, sc, false)
         * But we need to pass in our synthetic info.
         * And not all that callee's code will work with a NULL dcontext yet: it needs
         * some work.
         * Plus it calls handle_free() which we need to arrange for.
         * For now we bail.
         */
        if (can_always_delay[sig]) {
            /* We are dropping asynchronous signals during detach. The thread may
             * already have lost its TLS (xref i#3535). A safe read may result in a
             * crash if DR's SIGSEGV handler is removed before the safe read's
             * SIGSEGV is delivered.
             *
             * Note that besides dropping potentially important signals, there is
             * still a small race window if the signal gets delivered after the
             * detach has finished, i.e.  doing detach is false. This is an issue in
             * particular if the app has started re-attaching.
             *
             * Signals that are not clearly asynchronous may hit corner case(s) of
             * i#3535.  (xref i#26).
             */
            if (doing_detach) {
                DOLOG(1, LOG_ASYNCH, { dump_sigcontext(GLOBAL_DCONTEXT, sc); });
                SYSLOG_INTERNAL_ERROR(
                    "Signal in native thread during detach with default action is not "
                    "fully supported: dropping and continuing (tid=%d, sig=%d)",
                    get_sys_thread_id(), sig);
            }
            return; /* Just drop the signal. */
        }
        /* A default action of "ignore" needs no work: silently drop it. */
        if (default_action[sig] == DEFAULT_IGNORE)
            return;
        report_unhandleable_signal_and_exit(sig, "default action in native thread");
        ASSERT_NOT_REACHED();
        return;
    }
    ASSERT(info->sighand->action[sig] != NULL &&
           info->sighand->action[sig]->handler != (handler_t)SIG_IGN &&
           info->sighand->action[sig]->handler != (handler_t)SIG_DFL);

    RSTATS_INC(num_signals);
    RSTATS_INC(num_native_signals);
    report_app_problem(dcontext, APPFAULT_FAULT, (byte *)sc->SC_XIP, (byte *)sc->SC_FP,
                       "\nSignal %d delivered to application handler.\n", sig);

    /* Place the app's copy of the frame at xsp before our_frame's sigcontext is
     * redirected to the handler below.
     */
    copy_frame_to_stack(dcontext, info, sig, our_frame, (void *)xsp, false /*!pending*/);

    /* Compute the signals to block while the handler runs: the action's mask
     * plus sig itself unless SA_NOMASK is set.
     */
    blocked = info->sighand->action[sig]->mask;
    if (!TEST(SA_NOMASK, (info->sighand->action[sig]->flags)))
        kernel_sigaddset(&blocked, sig);
    DOLOG(2, LOG_ASYNCH, {
        LOG(THREAD, LOG_ASYNCH, 3, "Pre-signal blocked signals, stored in our frame:\n");
        dump_sigset(dcontext, (kernel_sigset_t *)&our_frame->uc.uc_sigmask);
    });
    /* OR the handler's blocked set into the mask stored in our frame. */
    for (int i = 1; i <= MAX_SIGNUM; i++) {
        if (kernel_sigismember(&blocked, i)) {
            kernel_sigaddset((kernel_sigset_t *)&our_frame->uc.uc_sigmask, i);
        }
    }
    DOLOG(2, LOG_ASYNCH, {
        LOG(THREAD, LOG_ASYNCH, 3, "Blocked signals within app handler will be:\n");
        dump_sigset(dcontext, (kernel_sigset_t *)&our_frame->uc.uc_sigmask);
    });

    /* Now edit the resumption point when main_signal_handler returns to go
     * straight to the app handler.
     */
    sc->SC_XSP = (ptr_uint_t)xsp;
/* Set up args to handler. */
#ifdef X86_64
    sc->SC_XDI = sig;
    sc->SC_XSI = (reg_t) & ((sigframe_rt_t *)xsp)->info;
    sc->SC_XDX = (reg_t) & ((sigframe_rt_t *)xsp)->uc;
#elif defined(AARCHXX)
    sc->SC_R0 = sig;
    if (IS_RT_FOR_APP(info, sig)) {
        sc->SC_R1 = (reg_t) & ((sigframe_rt_t *)xsp)->info;
        sc->SC_R2 = (reg_t) & ((sigframe_rt_t *)xsp)->uc;
    }
    /* The link register is where the handler returns to: the app's restorer
     * if it supplied one (Linux only), else dynamorio_sigreturn.
     */
#    ifdef LINUX
    if (sig_has_restorer(info, sig))
        sc->SC_LR = (reg_t)info->sighand->action[sig]->restorer;
    else
#    endif
        sc->SC_LR = (reg_t)dynamorio_sigreturn;
#elif defined(RISCV64)
    /* FIXME i#3544: Check if xsp is cast correctly? */
    sc->SC_A0 = sig;
    if (IS_RT_FOR_APP(info, sig)) {
        sc->SC_A1 = (reg_t) & ((sigframe_rt_t *)xsp)->info;
        sc->SC_A2 = (reg_t) & ((sigframe_rt_t *)xsp)->uc;
    }
    if (sig_has_restorer(info, sig))
        sc->SC_RA = (reg_t)info->sighand->action[sig]->restorer;
    else
        sc->SC_RA = (reg_t)dynamorio_sigreturn;
#endif
    sc->SC_XIP = (reg_t)SIGACT_PRIMARY_HANDLER(info->sighand->action[sig]);

    LOG(THREAD, LOG_ASYNCH, 2, "%s: set pc to handler %p with xsp=%p\n", __FUNCTION__,
        SIGACT_PRIMARY_HANDLER(info->sighand->action[sig]), xsp);

    /* NOTE(review): this reads detached_sigact[sig].flags without holding
     * detached_sigact_lock, and consults the detached table even when info came
     * from dcontext->signal_field above -- confirm both are intended.
     */
    if (TEST(SA_ONESHOT, detached_sigact[sig].flags)) {
        /* XXX: When we remove DR's handler from the final thread we'll ignore this
         * delivery: but we can't safely access that data struct here.  The best we
         * can do is avoid delivering twice ourselves.
         */
        d_r_write_lock(&detached_sigact_lock);
        memset(&detached_sigact[sig], 0, sizeof(detached_sigact[sig]));
        d_r_write_unlock(&detached_sigact_lock);
    }
}

/* Terminates via SYS_kill.  The signal number to send must already be stored in
 * dcontext->sys_param0.
 * This routine clears the app's blocked-signal set, but the caller is
 * responsible for setting the signal's handler to default.  Does not return.
 */
static void
terminate_via_kill(dcontext_t *dcontext)
{
    thread_sig_info_t *info = (thread_sig_info_t *)dcontext->signal_field;
    ASSERT(dcontext == get_thread_private_dcontext());
    /* Ensure signal_thread_exit() will not re-block.
     * Each emulated signal the app currently has blocked gets its
     * threads_unmasked count bumped, to counteract the dec from the
     * seemingly-unblocked signals in signal_thread_exit().
     */
    for (int signum = 1; signum <= MAX_SIGNUM; signum++) {
        if (EMULATE_SIGMASK(info, signum) &&
            kernel_sigismember(&info->app_sigblocked, signum)) {
            ATOMIC_INC(int, info->sighand->threads_unmasked[signum]);
        }
    }
    DOLOG(4, LOG_ASYNCH, dump_unmasked(dcontext, __FUNCTION__););
    /* We deliberately skip sigblocked_mutex here: we are exiting, and this
     * has to work from fragile locations.
     */
    memset(&info->app_sigblocked, 0, sizeof(info->app_sigblocked));

    /* FIXME PR 541760: there can be multiple thread groups and thus
     * this may not exit all threads in the address space
     */
    block_cleanup_and_terminate(
        dcontext, SYS_kill,
        /* Pass -pid in case main thread has exited
         * in which case will get -ESRCH
         */
        IF_VMX86(os_in_vmkernel_userworld() ? -(int)get_process_id() :) get_process_id(),
        dcontext->sys_param0, true, 0, 0);
    ASSERT_NOT_REACHED();
}

/* Returns whether the current stack pointer lies within this thread's
 * info->sigstack range [ss_sp, ss_sp + ss_size).
 */
bool
is_currently_on_sigaltstack(dcontext_t *dcontext)
{
    thread_sig_info_t *info = (thread_sig_info_t *)dcontext->signal_field;
    byte *cur_sp;
    GET_STACK_PTR(cur_sp);
    byte *altstack_base = (byte *)info->sigstack.ss_sp;
    byte *altstack_end = altstack_base + info->sigstack.ss_size;
    if (cur_sp < altstack_base)
        return false;
    return cur_sp < altstack_end;
}

/* Terminates via SYS_kill of sig, regardless of which stack we are currently
 * running on.  Does not return.
 */
static void
terminate_via_kill_from_anywhere(dcontext_t *dcontext, int sig)
{
    /* terminate_via_kill() takes the SYS_kill signal argument from here. */
    dcontext->sys_param0 = sig;
    if (!is_currently_on_sigaltstack(dcontext)) {
        terminate_via_kill(dcontext);
    } else {
        /* i#1160: our sigstack cannot be cleaned up properly while we are
         * still running on it, so do the termination on the dstack instead.
         */
        call_switch_stack(dcontext, dcontext->dstack,
                          (void (*)(void *))terminate_via_kill, NULL /*!d_r_initstack */,
                          false /*no return */);
    }
    ASSERT_NOT_REACHED();
}

/* Terminates the process by raising sig against it.  Does not return.
 * With TERMINATE_CLEANUP in flags, dcontext must be non-NULL and termination
 * goes through terminate_via_kill_from_anywhere(); without it, only the config
 * state is torn down before SYS_kill is issued directly.
 * xref os_request_fatal_coredump()
 */
void
os_terminate_via_signal(dcontext_t *dcontext, terminate_flags_t flags, int sig)
{
    if (signal_is_interceptable(sig)) {
        bool set_action = false;
#if defined(STATIC_LIBRARY) && defined(LINUX)
        if (INTERNAL_OPTION(invoke_app_on_crash)) {
            /* This path is reached for asserts; faults already bypass this
             * routine.  Install the app's own action for sig, if it has an
             * rt-style one, instead of the default.
             */
            dcontext_t *my_dc = get_thread_private_dcontext();
            thread_sig_info_t *info =
                (my_dc == NULL) ? NULL : (thread_sig_info_t *)my_dc->signal_field;
            if (info != NULL && info->sighand->action[sig] != NULL &&
                IS_RT_FOR_APP(info, sig)) {
                set_action = true;
                sigaction_syscall(sig, info->sighand->action[sig], NULL);
            }
        }
#endif
        if (!set_action) {
            DEBUG_DECLARE(bool res =)
            set_default_signal_action(sig);
            ASSERT(res);
        }
    }
    if (!TEST(TERMINATE_CLEANUP, flags)) {
        /* General cleanup is unsafe here, so only the .1config file is removed. */
        d_r_config_exit();
        dynamorio_syscall(SYS_kill, 2, get_process_id(), sig);
        /* Both the SYS_kill and the immediate crash are attempted since on some
         * platforms the SIGKILL is delayed and on others the *-1 is hanging(?):
         * should investigate
         */
        if (sig == SIGSEGV) /* make doubly-sure */
            *((int *)PTR_UINT_MINUS_1) = 0;
        for (;;) {
            /* Signal delivery may be delayed, so wait...forever. */
            os_thread_yield();
        }
    } else {
        /* There are several different entry points, so rewind until the
         * top-level kstat.
         */
        KSTOP_REWIND_UNTIL(thread_measured);
        ASSERT(dcontext != NULL);
        dcontext->sys_param0 = sig;
        /* XXX: the comment in the non-cleanup branch implies some systems have
         * SYS_kill of SIGSEGV w/ no handler on oneself actually return.
         * cleanup_and_terminate won't return to us and will use global_do_syscall
         * to invoke SYS_kill, which in debug will do an inf loop (good!) but
         * in release will do SYS_exit_group -- oh well, the systems I'm testing
         * on do an immediate exit.
         */
        terminate_via_kill_from_anywhere(dcontext, sig);
    }
    ASSERT_NOT_REACHED();
}

/* Carries out the default action for signal sig on behalf of the app.
 *
 * frame:         the signal frame; its sigcontext supplies the pc and stack
 *                pointer examined below.
 * sc_orig:       sigcontext whose SC_XIP is decoded and compared against the
 *                instruction at the frame's pc on the re-execute path.  Must be
 *                non-NULL there (asserted); the from-dispatch wrapper passes
 *                NULL.
 * from_dispatch: true when called for a delayed signal from dispatch, in which
 *                case a terminating signal is always re-generated via SYS_kill.
 * forged:        when true, a terminating signal is likewise re-generated via
 *                SYS_kill rather than by re-executing the interrupted instr.
 *
 * For signals whose default action is to terminate, this either does not
 * return (termination via kill) or returns false after dynamo_process_exit()
 * so the signal is raised by re-executing the interrupted instruction.
 * In all other cases that return, the result is true.
 */
static bool
execute_default_action(dcontext_t *dcontext, int sig, sigframe_rt_t *frame,
                       sigcontext_t *sc_orig, bool from_dispatch, bool forged)
{
    thread_sig_info_t *info = (thread_sig_info_t *)dcontext->signal_field;
    sigcontext_t *sc = get_sigcontext_from_rt_frame(frame);
    byte *pc = (byte *)sc->SC_XIP;

    LOG(THREAD, LOG_ASYNCH, 3, "execute_default_action for signal %d\n", sig);

    /* should only come here for signals we catch, or signal with ONESHOT
     * that didn't sigreturn
     */
    ASSERT(info->sighand->we_intercept[sig] ||
           (info->sighand->action[sig]->flags & SA_ONESHOT) != 0);

    /* A ONESHOT action for a signal we do not intercept is released here. */
    if (info->sighand->action[sig] != NULL &&
        (info->sighand->action[sig]->flags & SA_ONESHOT) != 0) {
        if (!info->sighand->we_intercept[sig]) {
            handler_free(dcontext, info->sighand->action[sig],
                         sizeof(kernel_sigaction_t));
            info->sighand->action[sig] = NULL;
        }
    }

    /* FIXME PR 205310: we can't always perfectly emulate the default
     * behavior.  To execute the default action, we have to un-register our
     * handler, if we have one, for signals whose default action is not
     * ignore or that will just be re-raised upon returning to the
     * interrupted context -- FIXME: are any of the ignores repeated?
     * SIGURG?
     *
     * If called from execute_handler_from_cache(), our main_signal_handler()
     * is going to return directly to the translated context: which means we
     * go native to re-execute the instr, which if it does in fact generate
     * the signal again means we have a nice transparent core dump.
     *
     * If called from execute_handler_from_dispatch(), we need to generate
     * the signal ourselves.
     */
    if (default_action[sig] != DEFAULT_IGNORE) {
        DEBUG_DECLARE(bool ok =)
        set_default_signal_action(sig);
        ASSERT(ok);

        /* FIXME: to avoid races w/ shared handlers should set a flag to
         * prevent another thread from re-enabling.
         * Perhaps worse: what if this signal arrives for another thread
         * in the meantime (and the default is not terminate)?
         */
        if (info->sighand->is_shared) {
            LOG(THREAD, LOG_ASYNCH, 1,
                "WARNING: having to install SIG_DFL for thread " TIDFMT ", but will be "
                "shared!\n",
                d_r_get_thread_id());
        }
        if (default_action[sig] == DEFAULT_TERMINATE ||
            default_action[sig] == DEFAULT_TERMINATE_CORE) {
            report_app_problem(dcontext, APPFAULT_CRASH, pc, (byte *)sc->SC_FP,
                               "\nSignal %d delivered to application as default "
                               "action.\n",
                               sig);
            /* App may call sigaction to set handler SIG_DFL (unnecessary but legal),
             * in which case DR will put a handler in info->sighand->action[sig].
             * We must clear it, otherwise, signal_thread_exit may cleanup the
             * handler and set it to SIG_IGN instead.
             */
            if (info->sighand->action[sig] != NULL) {
                ASSERT(info->sighand->we_intercept[sig]);
                handler_free(dcontext, info->sighand->action[sig],
                             sizeof(kernel_sigaction_t));
                info->sighand->action[sig] = NULL;
            }
            /* N.B.: we don't have to restore our handler because the
             * default action is for the process (entire thread group for NPTL) to die!
             */
            if (from_dispatch || can_always_delay[sig] || forged ||
                is_sys_kill(dcontext, pc, (byte *)sc->SC_XSP, &frame->info)) {
                /* This must have come from SYS_kill rather than raised by
                 * a faulting instruction.  Thus we can't go re-execute the
                 * instr in order to re-raise the signal (if from_dispatch,
                 * we delayed and can't re-execute anyway).  Instead we
                 * re-generate via SYS_kill.  An alternative, if we don't
                 * care about generating a core dump, is to use SYS_exit
                 * and pass the right exit code to indicate the signal
                 * number: that would avoid races w/ the sigaction.
                 *
                 * FIXME: should have app make the syscall to get a more
                 * transparent core dump!
                 */
                LOG(THREAD, LOG_ASYNCH, 1, "Terminating via kill\n");
                if (!from_dispatch && !forged)
                    KSTOP_NOT_MATCHING_NOT_PROPAGATED(fcache_default);
                KSTOP_NOT_MATCHING_NOT_PROPAGATED(dispatch_num_exits);
                if (is_couldbelinking(dcontext)) /* won't be for SYS_kill (i#1159) */
                    enter_nolinking(dcontext, NULL, false);
                /* we could be on sigstack so call this version: */
                terminate_via_kill_from_anywhere(dcontext, sig);
                ASSERT_NOT_REACHED();
            } else {
                /* We assume that re-executing the interrupted instr will
                 * re-raise the fault.  We could easily be wrong:
                 * xref PR 363811 infinite loop due to memory we
                 * thought was unreadable and thus thought would raise
                 * a signal; xref PR 368277 to improve is_sys_kill(), and the
                 * "forged" parameter that puts us in the if() above.
                 * FIXME PR 205310: we should check whether we come out of
                 * the cache when we expected to terminate!
                 *
                 * An alternative is to abandon transparent core dumps and
                 * do the same explicit SYS_kill we do for from_dispatch.
                 * That would let us clean up DR as well.
                 * FIXME: currently we do not clean up DR for a synchronous
                 * signal death, but we do for asynch.
                 */
                /* i#552: cleanup and raise client exit event */
                int instr_sz = 0;
                /* NOTE(review): this declaration shadows the function-level info;
                 * it is assigned the same dcontext->signal_field value below.
                 */
                thread_sig_info_t *info;
                /* We are on the sigstack now, so assign it to NULL to avoid being
                 * freed during process exit cleanup
                 */
                info = (thread_sig_info_t *)dcontext->signal_field;
                info->sigstack.ss_sp = NULL;
                /* We enter from several different places, so rewind until
                 * top-level kstat.
                 */
                KSTOP_REWIND_UNTIL(thread_measured);
                /* We try to raise the same signal in app's context so a correct
                 * coredump can be generated. However, the client might change
                 * the code in a way that the corresponding app code won't
                 * raise the signal, so we first check if the app instr is the
                 * same as instr in the cache, and raise the signal (by return).
                 * Otherwise, we kill the process instead.
                 * XXX: if the PC is unreadable we'll just crash here...should check
                 * for readability safely.
                 */
                ASSERT(sc_orig != NULL);
                instr_sz = decode_sizeof(dcontext, (byte *)sc_orig->SC_XIP,
                                         NULL _IF_X86_64(NULL));
                if (instr_sz != 0 &&
                    pc != NULL && /* avoid crash on xl8 failure (i#1699) */
                    instr_sz == decode_sizeof(dcontext, pc, NULL _IF_X86_64(NULL)) &&
                    memcmp(pc, (byte *)sc_orig->SC_XIP, instr_sz) == 0) {
                    /* the app instr matches the cache instr; cleanup and raise
                     * the signal in the app context
                     */
                    LOG(THREAD, LOG_ASYNCH, 1, "Raising signal by re-executing\n");
                    dynamo_process_exit();
                    /* we cannot re-enter the cache, which is freed by now */
                    ASSERT(!from_dispatch);
                    return false;
                } else {
                    /* mismatch, cleanup and terminate */
                    LOG(THREAD, LOG_ASYNCH, 1, "Terminating via kill\n");
                    dcontext->sys_param0 = sig;
                    terminate_via_kill(dcontext);
                    ASSERT_NOT_REACHED();
                }
            }
        } else {
            /* FIXME PR 297033: in order to intercept DEFAULT_STOP /
             * DEFAULT_CONTINUE signals, we need to set sigcontext to point
             * to some kind of regain-control routine, so that when our
             * thread gets to run again we can reset our handler.  So far
             * we have no signals that fall here that we intercept.
             */
            CLIENT_ASSERT(false, "STOP/CONT signals not supported");
        }
#if defined(DEBUG) && defined(INTERNAL)
        if (sig == SIGSEGV && !dynamo_exited) {
            /* pc should be an app pc at this point (it was translated) --
             * check for bad cases here
             */
            if (safe_is_in_fcache(dcontext, pc, (byte *)sc->SC_XSP)) {
                fragment_t wrapper;
                fragment_t *f;
                LOG(THREAD, LOG_ALL, 1,
                    "Received SIGSEGV at pc " PFX " in thread " TIDFMT "\n", pc,
                    d_r_get_thread_id());
                f = fragment_pclookup(dcontext, pc, &wrapper);
                if (f)
                    disassemble_fragment(dcontext, f, false);
                ASSERT_NOT_REACHED();
            } else if (in_generated_routine(dcontext, pc)) {
                LOG(THREAD, LOG_ALL, 1,
                    "Received SIGSEGV at generated non-code-cache pc " PFX "\n", pc);
                ASSERT_NOT_REACHED();
            }
        }
#endif
    }

    /* now continue at the interruption point and re-raise the signal */
    return true;
}

/* Thin wrapper over execute_default_action() for callers that are not coming
 * from dispatch: passes false for the from-dispatch argument, forwards
 * everything else unchanged, and returns execute_default_action()'s result.
 */
static bool
execute_default_from_cache(dcontext_t *dcontext, int sig, sigframe_rt_t *frame,
                           sigcontext_t *sc_orig, bool forged)
{
    const bool from_dispatch = false;
    bool result =
        execute_default_action(dcontext, sig, frame, sc_orig, from_dispatch, forged);
    return result;
}

/* Thin wrapper over execute_default_action() for the dispatch path: there is
 * no original sigcontext to pass (NULL), the from-dispatch argument is true,
 * and the signal is not marked as forged.  The result is discarded.
 */
static void
execute_default_from_dispatch(dcontext_t *dcontext, int sig, sigframe_rt_t *frame)
{
    sigcontext_t *const no_sc_orig = NULL;
    const bool from_dispatch = true;
    const bool forged = false;
    (void)execute_default_action(dcontext, sig, frame, no_sc_orig, from_dispatch,
                                 forged);
}

/* Tries to hand a process-wide signal, which the current thread has blocked,
 * to one other application thread that does not have it blocked.
 *
 * dcontext: the current thread's context.
 * frame:    the rt frame of the pending signal; its siginfo and sigcontext are
 *           used to decide whether the signal is process-wide and, for
 *           SI_QUEUE signals, to forward the queued value.
 * sig:      the signal number.
 *
 * Returns true if a fresh copy of the signal was sent to another thread, and
 * false if the signal is not process-wide or no other app thread has it
 * unblocked.  Exactly one target is signalled: a process-wide signal must be
 * handled once, so we stop at the first thread that accepts it.
 */
static bool
reroute_to_unmasked_thread(dcontext_t *dcontext, sigframe_rt_t *frame, int sig)
{
    kernel_siginfo_t *siginfo = &frame->info;
    sigcontext_t *sc = get_sigcontext_from_rt_frame(frame);
    if (!signal_is_process_wide(dcontext, siginfo, (byte *)sc->SC_XIP,
                                (byte *)sc->SC_XSP))
        return false;
    bool found = false;
    thread_record_t **trecs;
    int num_threads;
    bool was_linking = false;
    /* Leave couldbelinking status before taking thread_initexit_lock; it is
     * restored below once the lock has been released.
     */
    if (is_couldbelinking(dcontext)) {
        was_linking = true;
        enter_nolinking(dcontext, NULL, false);
    }
    d_r_mutex_lock(&thread_initexit_lock);
    get_list_of_threads(&trecs, &num_threads);
    for (int i = 0; i < num_threads; i++) {
        if (trecs[i]->dcontext == dcontext)
            continue;
        if (IS_CLIENT_THREAD(trecs[i]->dcontext))
            continue;
        thread_sig_info_t *tgt_info =
            (thread_sig_info_t *)trecs[i]->dcontext->signal_field;
        d_r_mutex_lock(&tgt_info->sigblocked_lock);
        if (!kernel_sigismember(&tgt_info->app_sigblocked, sig)) {
            LOG(THREAD, LOG_ASYNCH, 2,
                "rerouting signal %d to thread " TIDFMT " where it is unblocked\n", sig,
                trecs[i]->id);
            /* It is simpler to send a new signal to the target thread, rather than
             * trying to copy this same signal frame over there.
             */
            bool sent = false;
            if (siginfo->si_code == SI_QUEUE) {
                /* We need SYS_rt_tgsigqueueinfo which is on kernel 2.6.31+.
                 * If it's not available we have to fall back to SYS_tgkill.
                 * XXX: We ignore custom values in other siginfo fields for someone
                 * hackily using the raw syscall (like we do for nudges).
                 * XXX: This reroute will have different si_code values than the
                 * original process-wide signal.  We live with the loss of
                 * transparency.
                 */
                sent = thread_signal_queue(get_process_id(), trecs[i]->id, sig,
                                           siginfo->si_value.sival_ptr);
            }
            if (!sent)
                thread_signal(get_process_id(), trecs[i]->id, sig);
            found = true;
            RSTATS_INC(num_signals_rerouted);
        }
        d_r_mutex_unlock(&tgt_info->sigblocked_lock);
        /* Deliver to a single thread only: continuing would duplicate the signal
         * in every other thread that has it unblocked.
         */
        if (found)
            break;
    }
    d_r_mutex_unlock(&thread_initexit_lock);
    if (was_linking)
        enter_couldbelinking(dcontext, NULL, false);
    /* The thread list was allocated for us by get_list_of_threads(). */
    global_heap_free(trecs,
                     num_threads * sizeof(thread_record_t *) HEAPACCT(ACCT_THREAD_MGT));
    return found;
}

/* Delivers at most one of the current thread's pending signals.
 *
 * Walks info->sigpending[] from signal 1 upward.  A pending signal that the app
 * has blocked (and that was not unblocked at receipt time) is either rerouted
 * to another thread and dropped from the list, or left pending.  The first
 * deliverable signal is handed to execute_handler_from_dispatch() and removed
 * from the list; if that reports that a handler is being executed, the walk
 * stops and dcontext->signals_pending is set to -1.
 *
 * dcontext->signals_pending is cleared to 0 only when the walk reached the end
 * of the signal range without stopping to execute a handler.
 */
void
receive_pending_signal(dcontext_t *dcontext)
{
    thread_sig_info_t *info = (thread_sig_info_t *)dcontext->signal_field;
    sigpending_t *temp;
    int sig;
    LOG(THREAD, LOG_ASYNCH, 3, "receive_pending_signal\n");
    if (info->interrupted != NULL) {
        relink_interrupted_fragment(dcontext, info);
    }
    /* grab first pending signal
     * XXX: start with real-time ones?
     */
    /* "lock" the array to prevent a new signal that interrupts this bit of
     * code from prepending to or deleting from the array while we're accessing it
     */
    info->accessing_sigpending = true;
    /* barrier to prevent compiler from moving the above write below the loop */
    __asm__ __volatile__("" : : : "memory");
    if (!info->multiple_pending_units &&
        info->num_pending + 2 >= DYNAMO_OPTION(max_pending_signals)) {
        /* We're close to the limit: proactively get a new unit while it's safe
         * to acquire locks.  We do that by pushing over the edge.
         * We assume that filling up a 2nd unit is too pathological to plan for.
         */
        info->multiple_pending_units = true;
        SYSLOG_INTERNAL_WARNING("many pending signals: asking for 2nd special unit");
        sigpending_t *temp1 = special_heap_alloc(info->sigheap);
        sigpending_t *temp2 = special_heap_alloc(info->sigheap);
        sigpending_t *temp3 = special_heap_alloc(info->sigheap);
        special_heap_free(info->sigheap, temp1);
        special_heap_free(info->sigheap, temp2);
        special_heap_free(info->sigheap, temp3);
    }
    for (sig = 1; sig <= MAX_SIGNUM; sig++) {
        if (info->sigpending[sig] != NULL) {
            bool executing = true;
            /* We do not re-check whether blocked if it was unblocked at receive time
             * to handle a signal arriving during a mask-changing syscall
             * (handle_pre_extended_syscall_sigmasks()).  The problem is that we exit
             * the syscall (restoring the pre-syscall mask) *before* we come here.
             * We clear the unblocked_at_receipt field below to limit this to the
             * first syscall, to avoid erroneously delivering more later (xref
             * i#4998).  We don't need this for sigsuspend because we can delay its
             * post-syscall mask restore until signal delivery
             * (terminate_sigsuspend()).
             */
            if (!info->sigpending[sig]->unblocked_at_receipt &&
                kernel_sigismember(&info->app_sigblocked, sig)) {
                if (reroute_to_unmasked_thread(dcontext, &info->sigpending[sig]->rt_frame,
                                               sig)) {
                    /* Another thread now owns this signal: drop our copy. */
                    temp = info->sigpending[sig];
                    info->sigpending[sig] = temp->next;
                    special_heap_free(info->sigheap, temp);
                    info->num_pending--;
                    continue;
                }
                LOG(THREAD, LOG_ASYNCH, 3, "\tsignal %d is blocked!\n", sig);
                continue;
            }

            LOG(THREAD, LOG_ASYNCH, 3, "\treceiving signal %d\n", sig);
            /* execute_handler_from_dispatch()'s call to copy_frame_to_stack() is
             * allowed to remove the front entry from info->sigpending[sig] and
             * jump to d_r_dispatch.
             */
            executing = execute_handler_from_dispatch(dcontext, sig);
            temp = info->sigpending[sig];
            info->sigpending[sig] = temp->next;
            special_heap_free(info->sigheap, temp);
            info->num_pending--;

            /* only one signal at a time! */
            if (executing) {
                /* Make negative so our fcache_enter check makes progress but
                 * our C code still considers there to be pending signals.
                 */
                dcontext->signals_pending = -1;
                break;
            }
        }
    }
    /* Only one signal can be delivered ignoring the back-in-dispatch blocked set.
     * This pass uses its own index: "sig" must keep the value the delivery loop
     * left in it, since it is tested below to learn whether that loop ran to
     * completion or stopped early to execute a handler.
     */
    for (int i = 1; i <= MAX_SIGNUM; i++) {
        if (info->sigpending[i] != NULL && info->sigpending[i]->unblocked_at_receipt)
            info->sigpending[i]->unblocked_at_receipt = false;
    }
    /* barrier to prevent compiler from moving the below write above the loop */
    __asm__ __volatile__("" : : : "memory");
    info->accessing_sigpending = false;

    /* we only clear this on a call to us where we find NO pending signals */
    if (sig > MAX_SIGNUM) {
        LOG(THREAD, LOG_ASYNCH, 3, "\tclearing signals_pending flag\n");
        dcontext->signals_pending = 0;
    }
}

/* Processes an app sigreturn before the syscall itself is issued (the body
 * notes that we are called from pre_syscall, prior to entering the cache).
 *
 * Steps performed here:
 *  - Locate the signal frame (and thus the signal number and sigcontext) from
 *    the app stack pointer stored in the mcontext, or from ucxt_param on Mac.
 *  - Apply the frame's alternate-stack setting (rt frames) and its saved
 *    blocked-signal mask to the app's emulated state, then overwrite those
 *    frame fields with DR's own values so that executing the real sigreturn
 *    does not change DR's signal state.
 *  - Check for newly deliverable pending signals and abort any trace that is
 *    being built.
 *  - Record the interrupted app pc in dcontext->asynch_target and, except on
 *    VMX86_SERVER, redirect the sigcontext pc to the fcache_return routine so
 *    that control comes back to DR after the kernel restores the context.
 *
 * On Linux, rt selects between an rt frame and a plain frame; on Mac the frame
 * is always treated as rt.
 *
 * Returns false if should NOT issue syscall.
 */
bool
#ifdef LINUX
handle_sigreturn(dcontext_t *dcontext, bool rt)
#else
handle_sigreturn(dcontext_t *dcontext, void *ucxt_param, int style)
#endif
{
    thread_sig_info_t *info = (thread_sig_info_t *)dcontext->signal_field;
    sigcontext_t *sc = NULL; /* initialize to satisfy Mac clang */
    kernel_ucontext_t *ucxt = NULL;
    int sig = 0;
#if defined(DEBUG) || !defined(MACOS64)
    /* xsp was put in mcontext prior to pre_system_call() */
    reg_t xsp = get_mcontext(dcontext)->xsp;
#endif
#ifdef MACOS
    bool rt = true;
#endif

    LOG(THREAD, LOG_ASYNCH, 3, "%ssigreturn()\n", rt ? "rt_" : "");
    LOG(THREAD, LOG_ASYNCH, 3, "\txsp is " PFX "\n", xsp);

#ifdef PROGRAM_SHEPHERDING
    /* if (!sig_has_restorer, region was never added to exec list,
     * allowed as pattern only and kicked off at first write via
     * selfmod detection or otherwise if vsyscall, so no worries
     * about having to remove it here
     */
#endif

    /* The easiest way to set all the non-GPR state that DR does not separately
     * preserve is to actually execute the sigreturn syscall, so we set up to do
     * that.  We do not want to change DR's signal state, however, so we set it
     * back to DR's values after processing the state for the app.
     */
    kernel_sigset_t our_mask;
    sigprocmask_syscall(SIG_SETMASK, NULL, &our_mask, sizeof(our_mask));

    /* get sigframe: it's the top thing on the stack, except the ret
     * popped off pretcode.
     * WARNING: handler for tcsh's window_change (SIGWINCH) clobbers its
     * signal # arg, so don't use frame->sig!  (kernel doesn't look at sig
     * so app can get away with it)
     */
    if (rt) {
#ifdef LINUX
        sigframe_rt_t *frame = (sigframe_rt_t *)(xsp IF_X86(-sizeof(char *)));
        /* use si_signo instead of sig, less likely to be clobbered by app */
        sig = frame->info.si_signo;
#    ifdef X86_32
        LOG(THREAD, LOG_ASYNCH, 3, "\tsignal was %d (did == param %d)\n", sig,
            frame->sig);
        if (frame->sig != sig)
            LOG(THREAD, LOG_ASYNCH, 1, "WARNING: app sig handler clobbered sig param\n");
#    endif
        sc = get_sigcontext_from_app_frame(info, sig, (void *)frame);
        ucxt = &frame->uc;
        /* Check again for the magic words. See the i#3812 comment above. */
        IF_X86(ASSERT(
            (sc->fpstate->sw_reserved.magic1 == 0 &&
             sc->fpstate->sw_reserved.extended_size == sizeof(kernel_fpstate_t)) ||
            (sc->fpstate->sw_reserved.magic1 == FP_XSTATE_MAGIC1 &&
             *(int *)((byte *)sc->fpstate + sc->fpstate->sw_reserved.extended_size -
                      FP_XSTATE_MAGIC2_SIZE) == FP_XSTATE_MAGIC2)));
#elif defined(MACOS)
        ucxt = (kernel_ucontext_t *)ucxt_param;
        if (ucxt == NULL) {
            /* On Mac the kernel seems to store state on whether the process is
             * on the altstack, so longjmp calls _sigunaltstack() which issues a
             * sigreturn syscall telling the kernel about the altstack change,
             * with a NULL context.
             */
            LOG(THREAD, LOG_ASYNCH, 3, "\tsigunalstack sigreturn: no context\n");
            return true;
        }
#    ifdef X64
        kernel_siginfo_t *siginfo = (kernel_siginfo_t *)ucxt - 1;
        sig = siginfo->si_signo;
#    else
        /* The initial frame fields on the stack are messed up due to
         * params to handler from tramp, so use params to syscall.
         * XXX: we don't have signal # though: so we have to rely on app
         * not clobbering the sig param field.
         */
        sig = *(int *)xsp;
#    endif
        LOG(THREAD, LOG_ASYNCH, 3, "\tsignal was %d\n", sig);
        sc = SIGCXT_FROM_UCXT(ucxt);
#endif
        ASSERT(sig > 0 && sig <= MAX_SIGNUM && IS_RT_FOR_APP(info, sig));
        /* Re-set sigstack from the value stored in the frame.  Silently ignore failure,
         * just like the kernel does.
         */
        uint ignored;
        /* The kernel checks for being on the stack *after* swapping stacks, so pass
         * sc->SC_XSP as the current stack.
         */
        handle_sigaltstack(dcontext, &ucxt->uc_stack, NULL, sc->SC_XSP, &ignored);
        /* Restore DR's so sigreturn syscall won't change it. */
        ucxt->uc_stack = info->sigstack;

        /* FIXME: what if handler called sigaction and requested rt
         * when itself was non-rt?
         */

        /* Discard blocked signals, re-set from prev mask stored in frame. */
        set_blocked(dcontext, SIGMASK_FROM_UCXT(ucxt), true /*absolute*/);
        /* Restore DR's so sigreturn syscall won't change it. */
        *SIGMASK_FROM_UCXT(ucxt) = our_mask;
    }
#if defined(LINUX) && !defined(X64)
    else {
        /* FIXME: libc's restorer pops prior to calling sigreturn, I have
         * no idea why, but kernel asks for xsp-8 not xsp-4...weird!
         */
        kernel_sigset_t prevset;
        sigframe_plain_t *frame = (sigframe_plain_t *)(xsp IF_X86(-8));
        /* We don't trust frame->sig (app sometimes clobbers it), and for
         * plain frame there's no other place that sig is stored,
         * so as a hack we added a new frame!
         * FIXME: this means we won't support nonstandard use of SYS_sigreturn,
         * e.g., as NtContinue, if frame didn't come from a real signal and so
         * wasn't copied to stack by us.
         */
        sig = frame->sig_noclobber;
        LOG(THREAD, LOG_ASYNCH, 3, "\tsignal was %d (did == param %d)\n", sig,
            IF_X86_ELSE(frame->sig, 0));
#    ifdef X86_32
        if (frame->sig != sig)
            LOG(THREAD, LOG_ASYNCH, 1, "WARNING: app sig handler clobbered sig param\n");
#    endif
        ASSERT(sig > 0 && sig <= MAX_SIGNUM && !IS_RT_FOR_APP(info, sig));
        sc = get_sigcontext_from_app_frame(info, sig, (void *)frame);
        /* discard blocked signals, re-set from prev mask stored in frame */
        prevset.sig[0] = frame->IF_X86_ELSE(sc.oldmask, uc.uc_mcontext.oldmask);
        if (_NSIG_WORDS > 1) {
            memcpy(&prevset.sig[1], &frame->IF_X86_ELSE(extramask, uc.sigset_ex),
                   sizeof(prevset.sig[1]));
        }
#    ifdef ARM
        ucxt = &frame->uc; /* we leave ucxt NULL for x86: not needed there */
#    endif
        set_blocked(dcontext, &prevset, true /*absolute*/);
        /* Restore DR's so sigreturn syscall won't change it. */
        convert_rt_mask_to_nonrt(frame, &our_mask);
    }
#endif /* LINUX */

    /* Make sure we deliver pending signals that are now unblocked.
     */
    check_signals_pending(dcontext, info);

    /* if we were building a trace, kill it */
    if (is_building_trace(dcontext)) {
        LOG(THREAD, LOG_ASYNCH, 3, "\tsquashing trace-in-progress\n");
        trace_abort(dcontext);
    }

    /* Defensively check for NULL.
     * XXX i#3182: It did happen but it is not clear how.
     */
    if (info->sighand->action[sig] != NULL &&
        TEST(SA_ONESHOT, info->sighand->action[sig]->flags)) {
        ASSERT(info->sighand->action[sig]->handler == (handler_t)SIG_DFL);
        if (!info->sighand->we_intercept[sig]) {
            /* let kernel do default independent of us */
            handler_free(dcontext, info->sighand->action[sig],
                         sizeof(kernel_sigaction_t));
            info->sighand->action[sig] = NULL;
        }
    }

    ASSERT(!safe_is_in_fcache(dcontext, (app_pc)sc->SC_XIP, (byte *)sc->SC_XSP));

    /* Report the transfer: the source state is the current mcontext (with its pc
     * set to next_tag) and the target is the pc/sp stored in the sigcontext.
     */
    sig_full_cxt_t sc_full = { sc, NULL /*not provided*/ };
    get_mcontext(dcontext)->pc = dcontext->next_tag;
    instrument_kernel_xfer(dcontext, DR_XFER_SIGNAL_RETURN, osc_empty, NULL,
                           get_mcontext(dcontext), (app_pc)sc->SC_XIP, sc->SC_XSP,
                           sc_full, NULL, sig);

#ifdef DEBUG
    if (d_r_stats->loglevel >= 3 && (d_r_stats->logmask & LOG_ASYNCH) != 0) {
        LOG(THREAD, LOG_ASYNCH, 3, "returning-to sigcontext " PFX ":\n", sc);
        dump_sigcontext(dcontext, sc);
    }
#endif

    /* XXX i#1206: if we interrupted a non-ignorable syscall to run the app's
     * handler, and we set up to restart the syscall, we'll come here with the
     * translated syscall pc -- thus we can't distinguish from a signal interrupting
     * the prior app instr.  So we can't simply point at do_syscall and call
     * set_at_syscall -- we have to re-interpret the syscall and re-run the
     * pre-syscall handler.  Hopefully all our pre-syscall handlers can handle that.
     */

    /* set up for d_r_dispatch */
    /* we have to use a different slot since next_tag ends up holding the do_syscall
     * entry when entered from d_r_dispatch (we're called from
     * pre_syscall, prior to entering cache)
     */
    dcontext->asynch_target = canonicalize_pc_target(
        dcontext, (app_pc)(sc->SC_XIP IF_ARM(| (TEST(EFLAGS_T, sc->SC_XFLAGS) ? 1 : 0))));
    DEBUG_DECLARE(app_pc next_pc = dcontext->asynch_target;)

#ifdef VMX86_SERVER
    /* PR 404712: kernel only restores gp regs so we do it ourselves and avoid
     * complexities of kernel's non-linux-like sigreturn semantics
     */
    /* NOTE(review): sc_full is already declared earlier in this function at the
     * same scope, so this looks like a redefinition when VMX86_SERVER is
     * defined -- confirm against a VMX86_SERVER build.
     */
    sig_full_cxt_t sc_full = { sc, NULL }; /* non-ARM so NULL ok */
    sigcontext_to_mcontext(get_mcontext(dcontext), &sc_full, DR_MC_ALL);
#else
    /* HACK to get eax put into mcontext AFTER do_syscall */
    dcontext->next_tag = (app_pc)sc->SC_RETURN_REG;
    /* use special linkstub so we know why we came out of the cache */
    sc->SC_RETURN_REG = (ptr_uint_t)get_asynch_linkstub();

    /* set our sigreturn context to point to fcache_return */
    /* We don't need PC_AS_JMP_TGT b/c the kernel uses EFLAGS_T for the mode */
    sc->SC_XIP = (ptr_uint_t)fcache_return_routine(dcontext);

    /* if we overlaid inner frame on nested signal, will end up with this
     * error -- disable in release build since this is often app's fault (stack
     * too small)
     * FIXME: how make this transparent?  what ends up happening is that we
     * get a segfault when we start interpreting d_r_dispatch, we want to make it
     * look like whatever would happen to the app...
     */
    ASSERT((app_pc)sc->SC_XIP != next_pc);
#    if defined(AARCHXX)
    ASSERT(get_sigcxt_stolen_reg(sc) != (reg_t)*get_dr_tls_base_addr());
    /* We're called from DR and are not yet in the cache, so we want to set the
     * mcontext slot, not the TLS slot, to set the stolen reg value.
     */
    set_stolen_reg_val(get_mcontext(dcontext), get_sigcxt_stolen_reg(sc));
    /* The linkstub expects DR's TLS to be in the actual register. */
    set_sigcxt_stolen_reg(sc, (reg_t)*get_dr_tls_base_addr());
#        ifdef AARCH64
    /* On entry to the do_syscall gencode, we save X1 into TLS_REG1_SLOT.
     * Then the sigreturn would redirect the flow to the fcache_return gencode.
     * In fcache_return it recovers the values of x0 and x1 from TLS_SLOT 0 and 1.
     */
    get_mcontext(dcontext)->r1 = sc->SC_FIELD_AARCH64(1);
#        else
    /* We're going to our fcache_return gencode which uses DEFAULT_ISA_MODE */
    set_pc_mode_in_cpsr(sc, DEFAULT_ISA_MODE);
#        endif
#    endif
#endif

    LOG(THREAD, LOG_ASYNCH, 3, "set next tag to " PFX ", sc->SC_XIP to " PFX "\n",
        next_pc, sc->SC_XIP);
    /* The app's handler has finished once it issues sigreturn. */
    info->in_app_handler = false;

    return IF_VMX86_ELSE(false, true);
}

/* Reports whether the bytes at pc are one of the x86 sigreturn trampolines
 * that the kernel places on the stack or in the vsyscall page:
 *
 *   non-rt frame (8 bytes, 32-bit only):
 *      0x58           popl %eax
 *      0xb8 <sysnum>  movl SYS_sigreturn, %eax
 *      0xcd 0x80      int 0x80
 *   rt frame (7 bytes):
 *      0xb8 <sysnum>  movl SYS_rt_sigreturn, %eax
 *      0xcd 0x80      int 0x80
 *
 * On a match, the pattern length is stored in *len when len is non-NULL and
 * true is returned; otherwise false is returned and *len is left alone.
 */
bool
is_signal_restorer_code(byte *pc, size_t *len)
{
    /* Each pattern is matched with two 32-bit loads.  The expected words are
     * written out in byte order and then swapped, since the loads are
     * little-endian.
     */
#define reverse(x)                                                      \
    ((((x)&0xff) << 24) | (((x)&0xff00) << 8) | (((x)&0xff0000) >> 8) | \
     (((x)&0xff000000) >> 24))
#ifdef MACOS
#    define SYS_RT_SIGRET SYS_sigreturn
#else
#    define SYS_RT_SIGRET SYS_rt_sigreturn
#endif
    /* rt bytes: b8 s4 s3 s2 s1 cd 80 XX */
    static const uint rt_lo = reverse(0xb8000000 | (reverse(SYS_RT_SIGRET) >> 8));
    static const uint rt_hi = reverse((reverse(SYS_RT_SIGRET) << 24) | 0x00cd8000);
#ifndef X64
    /* non-rt bytes: 58 b8 s4 s3 s2 s1 cd 80 */
    static const uint plain_lo = reverse(0x58b80000 | (reverse(SYS_sigreturn) >> 16));
    static const uint plain_hi = reverse((reverse(SYS_sigreturn) << 16) | 0xcd80);
#endif
    const uint lead = *((uint *)pc);
    size_t matched_len = 0;

    /* The rt form is the common one, so it is tried first.  It is only 7 bytes
     * long, so the top byte of the second word (the 8th byte in memory) is
     * masked off.  The second word is only loaded once the first one matches.
     */
    if (lead == rt_lo && (*((uint *)(pc + 4)) & 0x00ffffff) == rt_hi)
        matched_len = 7;
#ifndef X64
    else if (lead == plain_lo && *((uint *)(pc + 4)) == plain_hi)
        matched_len = 8;
#endif

    if (matched_len == 0)
        return false;
    if (len != NULL)
        *len = matched_len;
    return true;
}

/* Forges a signal for the application and transfers to dispatch so that it
 * gets delivered; this routine does not return.
 *
 * target_pc: stored both as the forged siginfo's si_addr and as the pc of the
 *            forged sigcontext.
 * type:      ILLEGAL_INSTRUCTION_EXCEPTION forges SIGILL and
 *            UNREADABLE_MEMORY_EXECUTION_EXCEPTION forges SIGSEGV; the other
 *            types assert and fall back to SIGSEGV.
 *
 * A zeroed rt frame is built on the local stack, filled from the current
 * mcontext, and queued via record_pending_signal() as a forged signal.
 */
void
os_forge_exception(app_pc target_pc, dr_exception_type_t type)
{
    /* PR 205136:
     * We want to deliver now, and the caller expects us not to return.
     * We have two alternatives:
     * 1) Emulate stack frame, and call transfer_to_dispatch() for delivery.  We
     *    may not know how to fill out every field of the frame (cr2, etc.).  Plus,
     *    we have problems w/ default actions (PR 205310) but we have to solve
     *    those long-term anyway.  We also have to create different frames based on
     *    whether app intercepts via rt or not.
     * 2) Call SYS_tgkill from a special location that our handler can
     *    recognize and know it's a signal meant for the app and that the
     *    interrupted DR can be discarded.  We'd then essentially repeat 1,
     *    but modifying the kernel-generated frame.  We'd have to always
     *    intercept SIGILL.
     * I'm going with #1 for now b/c the common case is simpler.
     */
    dcontext_t *dcontext = get_thread_private_dcontext();
#if defined(LINUX) && defined(X86)
    thread_sig_info_t *info = (thread_sig_info_t *)dcontext->signal_field;
#endif
    /* Raw storage for the forged frame; it has room for the frame itself only. */
    char frame_no_xstate[sizeof(sigframe_rt_t)];
    sigframe_rt_t *frame = (sigframe_rt_t *)frame_no_xstate;
    int sig;
    /* Remember where we were: whereami is overwritten below, and the original
     * value decides the last argument to transfer_to_dispatch().
     */
    dr_where_am_i_t cur_whereami = dcontext->whereami;
    kernel_ucontext_t *uc = get_ucontext_from_rt_frame(frame);
    sigcontext_t *sc = SIGCXT_FROM_UCXT(uc);
    switch (type) {
    case ILLEGAL_INSTRUCTION_EXCEPTION: sig = SIGILL; break;
    case UNREADABLE_MEMORY_EXECUTION_EXCEPTION: sig = SIGSEGV; break;
    case SINGLE_STEP_EXCEPTION: ASSERT_NOT_IMPLEMENTED(false); /* FIXME: i#2144 */
    case IN_PAGE_ERROR_EXCEPTION: /* fall-through: Windows only */
    default:
        ASSERT_NOT_REACHED();
        sig = SIGSEGV;
        break;
    }

    LOG(GLOBAL, LOG_ASYNCH, 1, "os_forge_exception sig=%d\n", sig);

    /* Since we always delay delivery, we always want an rt frame.  we'll convert
     * to a plain frame on delivery.
     */
    memset(frame, 0, sizeof(*frame));
    frame->info.si_signo = sig;
    /* Set si_code to match what would happen natively.  We also need this to
     * avoid the !is_sys_kill() check in record_pending_signal() to avoid an
     * infinite loop (i#3171).
     */
    frame->info.si_code = IF_LINUX_ELSE(SI_KERNEL, 0);
    frame->info.si_addr = target_pc;
#ifdef X86_32
    frame->sig = sig;
    frame->pinfo = &frame->info;
    frame->puc = (void *)&frame->uc;
#endif
#if defined(LINUX) && defined(X86)
    /* We use a TLS buffer to avoid too much stack space here. */
    sc->fpstate = (kernel_fpstate_t *)get_and_initialize_xstate_buffer(dcontext);
#endif
    mcontext_to_ucontext(uc, get_mcontext(dcontext));
    sc->SC_XIP = (reg_t)target_pc;
    /* We'll fill in fpstate at delivery time.
     * We fill in segment registers to their current values and assume they won't
     * change and that these are the right values.
     *
     * FIXME i#2095: restore the app's segment register value(s).
     *
     * XXX: it seems to work w/o filling in the other state:
     * I'm leaving cr2 and other fields all zero.
     * If this gets problematic we could switch to approach #2.
     */
    thread_set_segment_registers(sc);
#if defined(X86) && defined(LINUX)
    /* Use the app's restorer for this signal if it has one, else DR's own
     * sigreturn routine.
     */
    if (sig_has_restorer(info, sig))
        frame->pretcode = (char *)info->sighand->action[sig]->restorer;
    else
        frame->pretcode = (char *)dynamorio_sigreturn;
#endif

    /* We assume that we do not need to translate the context when forged.
     * If we did, we'd move this below enter_nolinking() (and update
     * record_pending_signal() to do the translation).
     */
    record_pending_signal(dcontext, sig, &frame->uc, frame, true /*forged*/, NULL);

    /* For most callers this is not necessary and we only do it to match
     * the Windows usage model: but for forging from our own handler,
     * this is good b/c it resets us to the base of dstack.
     */
    /* tell d_r_dispatch() why we're coming there */
    dcontext->whereami = DR_WHERE_TRAMPOLINE;
    KSTART(dispatch_num_exits);
    set_last_exit(dcontext, (linkstub_t *)get_asynch_linkstub());
    if (is_couldbelinking(dcontext))
        enter_nolinking(dcontext, NULL, false);
    transfer_to_dispatch(
        dcontext, get_mcontext(dcontext),
        cur_whereami != DR_WHERE_FCACHE && cur_whereami != DR_WHERE_SIGNAL_HANDLER
        /*full_DR_state*/);
    ASSERT_NOT_REACHED();
}

/* Logs msg as an internal error and then deliberately terminates the process
 * with SIGSEGV, requesting no cleanup, so that a core file can be produced.
 * Not expected to return.
 */
void
os_request_fatal_coredump(const char *msg)
{
    /* To enable getting a coredump just make sure that rlimits are
     * not preventing getting one, e.g. ulimit -c unlimited
     */
    SYSLOG_INTERNAL_ERROR("Crashing the process deliberately for a core dump for: |%s|",
                          msg);
    os_terminate_via_signal(NULL, 0 /*no cleanup*/, SIGSEGV);
    ASSERT_NOT_REACHED();
}

/* Requests a core dump without terminating the process.  This is only
 * forwarded to vmk_request_live_coredump() when built with VMX86_SERVER and
 * running in a vmkernel userworld; everywhere else it just logs that live
 * dumps are unsupported and returns.
 */
void
os_request_live_coredump(const char *msg)
{
#ifdef VMX86_SERVER
    if (os_in_vmkernel_userworld()) {
        vmk_request_live_coredump(msg);
        return;
    }
#endif
    /* No live-dump facility here: note it and carry on. */
    LOG(GLOBAL, LOG_ASYNCH, 1,
        "LiveCoreDump unsupported (PR 365105).  "
        "Continuing execution without a core.\n");
}

/* Produces diagnostic output for a failure described by msg, according to
 * dynamo_options.dumpcore_mask and the live_dump option:
 *  - with DUMPCORE_WAIT_FOR_DEBUGGER set, reports the pid and then spins
 *    forever so that a debugger can be attached;
 *  - with live_dump set, requests a live core dump;
 *  - with DUMPCORE_INCLUDE_STACKDUMP set, attempts a stack dump (once), and on
 *    a later call a DR callstack dump (once);
 *  - with live_dump not set, finishes by requesting a fatal core dump.
 */
void
os_dump_core(const char *msg)
{
    /* Do-once guards: a fault while dumping can bring us back in here, so each
     * kind of dump is attempted at most one time.
     */
    static bool tried_stackdump = false;
    static bool tried_calldump = false;

    /* FIXME Case 3408: fork stack dump crashes on 2.6 kernel, so the wait for a
     * debugger comes first to aid in debugging.
     */
    if (TEST(DUMPCORE_WAIT_FOR_DEBUGGER, dynamo_options.dumpcore_mask)) {
        SYSLOG_INTERNAL_ERROR("looping so you can use gdb to attach to pid %s",
                              get_application_pid());
        SYSLOG(SYSLOG_CRITICAL, WAITING_FOR_DEBUGGER, 2, get_application_name(),
               get_application_pid());
        /* getchar() can hit our own vsyscall hook (from PR 212570); typically we
         * want to attach and not continue anyway, so we spin indefinitely.
         */
        for (;;)
            os_thread_yield();
    }

    if (DYNAMO_OPTION(live_dump))
        os_request_live_coredump(msg);

    if (TEST(DUMPCORE_INCLUDE_STACKDUMP, dynamo_options.dumpcore_mask)) {
        /* First choice is fork + core + gdb for a stack dump; if that was
         * already tried, fall back to the no-symbols DR callstack dump.
         */
        if (!tried_stackdump) {
            tried_stackdump = true;
            d_r_stackdump();
        } else if (!tried_calldump) {
            tried_calldump = true;
            dump_dr_callstack(STDERR);
        }
    }

    if (DYNAMO_OPTION(live_dump))
        return;
    os_request_fatal_coredump(msg);
    ASSERT_NOT_REACHED();
}

#ifdef RETURN_AFTER_CALL
/* Decides whether a return-after-call violation is one of the known benign
 * cases.
 *
 * Returns true when target_pc is the recorded signal restorer return address,
 * or when source_fragment equals the remembered known-exception address.
 * While no known-exception address is remembered, the answer comes from
 * at_dl_runtime_resolve_ret() on source_fragment.  Otherwise returns false.
 */
bool
at_known_exception(dcontext_t *dcontext, app_pc target_pc, app_pc source_fragment)
{
    /* Two exceptions are known: signal restorers, and the Linux dynamic symbol
     * resolution routine.  The latter is assumed to be the only other recurring
     * one, so the first time we pattern match to help make sure it is indeed
     * _dl_runtime_resolve (with LD_BIND_NOW it is never called); after that we
     * compare with the known value.
     */
    static app_pc known_exception = 0;
    thread_sig_info_t *sig_info = (thread_sig_info_t *)dcontext->signal_field;

    LOG(THREAD, LOG_INTERP, 1, "RCT: testing for KNOWN exception " PFX " " PFX "\n",
        target_pc, source_fragment);

    /* Signal return?
     * FIXME: we should really get that from the frame itself.  The restorer is
     * currently grabbed only when copying a frame, so nested signals work only
     * if they all share one restorer (none other than libc's has been seen).
     */
    if (target_pc == sig_info->signal_restorer_retaddr) {
        LOG(THREAD, LOG_INTERP, 1,
            "RCT: KNOWN exception this is a signal restorer --ok \n");
        STATS_INC(ret_after_call_signal_restorer);
        return true;
    }

    if (source_fragment == known_exception) {
        LOG(THREAD, LOG_INTERP, 1,
            "RCT: KNOWN exception again _dl_runtime_resolve --ok\n");
        return true;
    }

    /* A remembered address that did not match means this is not a known case. */
    if (known_exception != 0)
        return false;

    int ret_imm;
    return at_dl_runtime_resolve_ret(dcontext, source_fragment, &ret_imm);
}
#endif /* RETURN_AFTER_CALL */

/***************************************************************************
 * ITIMERS
 *
 * We support combining an app itimer with a DR itimer for each of the 3 types
 * (PR 204556).
 */

/* Converts a timeval (seconds plus microseconds) into a single microsecond
 * count.  The seconds are widened to 64 bits before being scaled.
 */
static inline uint64
timeval_to_usec(struct timeval *t1)
{
    const uint64 whole_seconds = (uint64)(t1->tv_sec);
    return whole_seconds * 1000000 + t1->tv_usec;
}

/* Splits a microsecond count into the seconds and microseconds fields of *t1.
 *
 * The division and remainder are done on the full 64-bit value and only the
 * results are narrowed to long.  Casting usec to long first would truncate it
 * wherever long is 32 bits, corrupting any interval longer than about 35
 * minutes (2^31 microseconds).
 */
static inline void
usec_to_timeval(uint64 usec, struct timeval *t1)
{
    t1->tv_sec = (long)(usec / 1000000);
    t1->tv_usec = (long)(usec % 1000000);
}

/* Allocates and zeroes the current thread's itimer bookkeeping array and
 * initializes the lock of each entry.
 *
 * dcontext: the thread whose signal_field gets the new array.
 * first:    when true, each entry's app interval and value are seeded from
 *           getitimer_syscall(), to pick up timers that the app installed
 *           before we were loaded.
 */
static void
init_itimer(dcontext_t *dcontext, bool first)
{
    thread_sig_info_t *info = (thread_sig_info_t *)dcontext->signal_field;
    ASSERT(info != NULL);
    ASSERT(!info->shared_itimer); /* else inherit */
    LOG(THREAD, LOG_ASYNCH, 2, "thread has private itimers%s\n",
        os_itimers_thread_shared() ? " (for now)" : "");
    if (!os_itimers_thread_shared()) {
        /* Allocated up front for simplicity and symmetry with the shared case. */
        info->itimer = (thread_itimer_info_t(*)[NUM_ITIMERS])heap_alloc(
            dcontext, sizeof(*info->itimer) HEAPACCT(ACCT_OTHER));
    } else {
        /* Must be allocated now, even if no itimer is installed until later,
         * so that all child threads end up pointing at the same data.
         */
        info->itimer = (thread_itimer_info_t(*)[NUM_ITIMERS])global_heap_alloc(
            sizeof(*info->itimer) HEAPACCT(ACCT_OTHER));
    }
    memset(info->itimer, 0, sizeof(*info->itimer));
    for (int idx = 0; idx < NUM_ITIMERS; idx++) {
        ASSIGN_INIT_RECURSIVE_LOCK_FREE((*info->itimer)[idx].lock, shared_itimer_lock);
    }
    if (!first)
        return;

    /* Pick up any itimer the app set up before we were loaded. */
    struct itimerval prev;
    for (int which = 0; which < NUM_ITIMERS; which++) {
        DEBUG_DECLARE(int rc =) getitimer_syscall(which, &prev);
        ASSERT(rc == SUCCESS);
        (*info->itimer)[which].app.interval = timeval_to_usec(&prev.it_interval);
        (*info->itimer)[which].app.value = timeval_to_usec(&prev.it_value);
    }
}

/* Installs or disables the real itimer "which" through setitimer_syscall().
 *
 * When enabling, the timer is programmed from the entry's "actual" interval
 * and value.  When disabling, those two fields are zeroed and an all-zero
 * itimerval is installed.
 *
 * For shared itimers the caller must already hold the entry's lock.
 * Returns true iff the syscall reported SUCCESS.
 */
static bool
set_actual_itimer(dcontext_t *dcontext, int which, thread_sig_info_t *info, bool enable)
{
    struct itimerval val;
    int rc;
    ASSERT(info != NULL && info->itimer != NULL);
    ASSERT(which >= 0 && which < NUM_ITIMERS);
    thread_itimer_info_t *slot = &(*info->itimer)[which];
    if (!enable) {
        LOG(THREAD, LOG_ASYNCH, 2, "disabling itimer %d\n", which);
        memset(&val, 0, sizeof(val));
        slot->actual.value = 0;
        slot->actual.interval = 0;
    } else {
        LOG(THREAD, LOG_ASYNCH, 2,
            "installing itimer %d interval=" INT64_FORMAT_STRING
            ", value=" INT64_FORMAT_STRING "\n",
            which, slot->actual.interval, slot->actual.value);
        /* i#2907: no signal handlers exist until we start the app (i#2335),
         * so an itimer cannot be set up before then.
         */
        ASSERT(dynamo_initialized);
        ASSERT(!info->shared_itimer || self_owns_recursive_lock(&slot->lock));
        usec_to_timeval(slot->actual.interval, &val.it_interval);
        usec_to_timeval(slot->actual.value, &val.it_value);
    }
    rc = setitimer_syscall(which, &val, NULL);
    return (rc == SUCCESS);
}

/* Recomputes the actual (kernel-visible) settings for itimer type "which" from
 * the separately-tracked app and DR settings, and programs the kernel timer
 * when that is needed.
 *
 * "app_changed" is true when the caller just changed the app's settings and
 * false when it changed DR's; it selects which side's value is refreshed with
 * the remaining time when a timer is already in flight.
 *
 * Returns the result of set_actual_itimer() when a syscall to program the timer
 * is made, and true otherwise.
 *
 * Caller should hold lock.
 */
static bool
itimer_new_settings(dcontext_t *dcontext, int which, bool app_changed)
{
    struct itimerval val;
    bool res = true;
    thread_sig_info_t *info = (thread_sig_info_t *)dcontext->signal_field;
    ASSERT(info != NULL && info->itimer != NULL);
    ASSERT(which >= 0 && which < NUM_ITIMERS);
    ASSERT(!info->shared_itimer ||
           self_owns_recursive_lock(&(*info->itimer)[which].lock));
    /* the general strategy is to set the actual value to the smaller,
     * update the larger on each signal, and when the larger becomes
     * smaller do a one-time swap for the remaining
     */
    /* Actual interval: DR's if it is non-zero and either the app has none or
     * DR's is the smaller of the two; otherwise the app's.
     */
    if ((*info->itimer)[which].dr.interval > 0 &&
        ((*info->itimer)[which].app.interval == 0 ||
         (*info->itimer)[which].dr.interval < (*info->itimer)[which].app.interval))
        (*info->itimer)[which].actual.interval = (*info->itimer)[which].dr.interval;
    else
        (*info->itimer)[which].actual.interval = (*info->itimer)[which].app.interval;

    if ((*info->itimer)[which].actual.value > 0) {
        /* A timer is recorded as currently in flight. */
        if ((*info->itimer)[which].actual.interval == 0 &&
            (*info->itimer)[which].dr.value == 0 &&
            (*info->itimer)[which].app.value == 0) {
            /* Neither side wants a timer any longer: turn it off. */
            (*info->itimer)[which].actual.value = 0;
            res = set_actual_itimer(dcontext, which, info, false /*disabled*/);
        } else {
            /* one of app or us has an in-flight timer which we should not interrupt.
             * but, we already set the new requested value (for app or us), so we
             * need to update the actual value so we subtract properly.
             */
            DEBUG_DECLARE(int rc =) getitimer_syscall(which, &val);
            ASSERT(rc == SUCCESS);
            /* Time remaining on the kernel timer, as reported by getitimer. */
            uint64 left = timeval_to_usec(&val.it_value);
            /* The side that did NOT just change, if its recorded value equals
             * the recorded actual value, is refreshed with the remaining time.
             */
            if (!app_changed &&
                (*info->itimer)[which].actual.value == (*info->itimer)[which].app.value)
                (*info->itimer)[which].app.value = left;
            if (app_changed &&
                (*info->itimer)[which].actual.value == (*info->itimer)[which].dr.value)
                (*info->itimer)[which].dr.value = left;
            (*info->itimer)[which].actual.value = left;
        }
    } else {
        /* No timer in flight: pick the value with the same smaller-non-zero
         * rule used for the interval above, then install it.
         */
        if ((*info->itimer)[which].dr.value > 0 &&
            ((*info->itimer)[which].app.value == 0 ||
             (*info->itimer)[which].dr.value < (*info->itimer)[which].app.value))
            (*info->itimer)[which].actual.value = (*info->itimer)[which].dr.value;
        else {
            (*info->itimer)[which].actual.value = (*info->itimer)[which].app.value;
        }
        res = set_actual_itimer(dcontext, which, info, true /*enable*/);
    }
    return res;
}

/* Registers a DR itimer of type "which" that fires every "millisec"
 * milliseconds and records "func" and "func_api" as its callbacks.
 * Returns false for an invalid itimer type, or when a non-zero period is
 * requested with both callbacks NULL.  Otherwise returns true if the timer
 * installation was deferred because DR is not yet initialized, or else the
 * result of itimer_new_settings().
 */
bool
set_itimer_callback(dcontext_t *dcontext, int which, uint millisec,
                    void (*func)(dcontext_t *, priv_mcontext_t *),
                    void (*func_api)(dcontext_t *, dr_mcontext_t *))
{
    thread_sig_info_t *info = (thread_sig_info_t *)dcontext->signal_field;
    bool rc = true;
    if (!(which >= 0 && which < NUM_ITIMERS)) {
        CLIENT_ASSERT(false, "invalid itimer type");
        return false;
    }
    if (millisec != 0 && func == NULL && func_api == NULL) {
        CLIENT_ASSERT(false, "invalid function");
        return false;
    }
    ASSERT(info != NULL && info->itimer != NULL);
    if (info->shared_itimer)
        acquire_recursive_lock(&(*info->itimer)[which].lock);
    (*info->itimer)[which].cb = func;
    (*info->itimer)[which].cb_api = func_api;
    (*info->itimer)[which].dr.interval = ((uint64)millisec) * 1000;
    (*info->itimer)[which].dr.value = (*info->itimer)[which].dr.interval;
    if (dynamo_initialized) {
        rc = itimer_new_settings(dcontext, which, false /*us*/);
    } else {
        /* i#2907: signal handlers are not in place until the app is started
         * (i#2335), so the itimer cannot be armed yet.  start_itimer(), called
         * from os_thread_under_dynamo(), will enable it.
         */
        LOG(THREAD, LOG_ASYNCH, 2, "delaying itimer until attach\n");
    }
    if (info->shared_itimer)
        release_recursive_lock(&(*info->itimer)[which].lock);
    return rc;
}

/* Returns the period, in milliseconds, recorded for DR's itimer of type
 * "which", or 0 if "which" is not a valid itimer type.
 */
uint
get_itimer_frequency(dcontext_t *dcontext, int which)
{
    thread_sig_info_t *info = (thread_sig_info_t *)dcontext->signal_field;
    uint ms = 0;
    if (which >= 0 && which < NUM_ITIMERS) {
        ASSERT(info != NULL && info->itimer != NULL);
        if (info->shared_itimer)
            acquire_recursive_lock(&(*info->itimer)[which].lock);
        /* The interval is stored in microseconds. */
        ms = (*info->itimer)[which].dr.interval / 1000;
        if (info->shared_itimer)
            release_recursive_lock(&(*info->itimer)[which].lock);
    } else {
        CLIENT_ASSERT(false, "invalid itimer type");
    }
    return ms;
}

/* Maps an alarm signal number to the itimer type that generates it.
 * Returns -1 for any signal that is not SIGALRM, SIGVTALRM or SIGPROF.
 */
static int
signal_to_itimer_type(int sig)
{
    switch (sig) {
    case SIGALRM: return ITIMER_REAL;
    case SIGVTALRM: return ITIMER_VIRTUAL;
    case SIGPROF: return ITIMER_PROF;
    default: return -1;
    }
}

/* Returns whether "signal" is an alarm signal whose itimer type currently has
 * DR settings recorded (a non-zero value or interval) while the app's recorded
 * value and interval are both zero.  Returns false for non-alarm signals.
 */
static bool
alarm_signal_has_DR_only_itimer(dcontext_t *dcontext, int signal)
{
    thread_sig_info_t *info = (thread_sig_info_t *)dcontext->signal_field;
    int which = signal_to_itimer_type(signal);
    if (which == -1)
        return false;
    if (info->shared_itimer)
        acquire_recursive_lock(&(*info->itimer)[which].lock);
    bool dr_armed =
        (*info->itimer)[which].dr.value > 0 || (*info->itimer)[which].dr.interval > 0;
    bool app_idle =
        (*info->itimer)[which].app.value == 0 && (*info->itimer)[which].app.interval == 0;
    if (info->shared_itimer)
        release_recursive_lock(&(*info->itimer)[which].lock);
    return dr_armed && app_idle;
}

/* Processes the arrival of an itimer alarm signal "sig" (SIGALRM, SIGVTALRM or
 * SIGPROF) whose interrupted context is "ucxt".
 *
 * The elapsed actual timer value is subtracted from the app's and from DR's
 * outstanding values.  When DR's value reaches zero the registered DR callback
 * is invoked; when the app's value reaches zero the signal is reported as one
 * to deliver to the app.  If either side still has time outstanding, the
 * kernel timer is re-armed via itimer_new_settings().
 *
 * Returns whether the signal should be passed on to the app.
 */
static bool
handle_alarm(dcontext_t *dcontext, int sig, kernel_ucontext_t *ucxt)
{
    thread_sig_info_t *info = (thread_sig_info_t *)dcontext->signal_field;
    ASSERT(info != NULL && info->itimer != NULL);
    int which = 0;
    bool invoke_cb = false, pass_to_app = false, reset_timer_manually = false;
    bool should_release_lock = false;

    /* i#471: suppress alarms coming in after exit */
    if (dynamo_exited)
        return pass_to_app;

    which = signal_to_itimer_type(sig);
    ASSERT(which != -1);
    LOG(THREAD, LOG_ASYNCH, 2, "received alarm %d @" PFX "\n", which,
        SIGCXT_FROM_UCXT(ucxt)->SC_XIP);

    /* This alarm could have interrupted an app thread making an itimer syscall,
     * which is why we don't want to block on a lock here.
     * It can't interrupt this same thread handling a prior alarm (b/c we block
     * the signal in our handler).  It could arrive in thread B while thread A
     * is still handling a prior alarm if the alarm frequency is high and the
     * processing is slow, which is why we split the locks to be per-itimer-type.
     * We also avoid new thread setup code acquiring these itimer locks by using
     * atomic increments instead for the refcounts.  Xref i#2993.
     */
    if (info->shared_itimer) {
#ifdef DEADLOCK_AVOIDANCE
        /* i#2061: in debug build we can get an alarm while in deadlock handling
         * code that holds innermost_lock.  We just drop such alarms.
         */
        if (OWN_MUTEX(&innermost_lock))
            return pass_to_app;
#endif
        if (self_owns_recursive_lock(&(*info->itimer)[which].lock)) {
            /* What can we do?  We just go ahead and hope conflicting writes work out.
             * We don't re-acquire in case app was in middle of acquiring.
             */
        } else {
#define ALARM_LOCK_MAX_TRIES 4
            int i;
            /* Bounded, non-blocking attempts to take the lock. */
            for (i = 0; i < ALARM_LOCK_MAX_TRIES; ++i) {
                if (try_recursive_lock(&(*info->itimer)[which].lock)) {
                    should_release_lock = true;
                    break;
                }
                os_thread_yield();
            }
            if (!should_release_lock) {
                /* Heuristic: if fail N times then assume interrupted lock routine
                 * while processing an app syscall (see above: we ruled out other
                 * scenarios).  What can we do?  Just continue and hope conflicting
                 * writes work out.
                 */
            }
        }
    }
    if ((*info->itimer)[which].app.value > 0) {
        /* Alarm could have been on its way when app value changed */
        if ((*info->itimer)[which].app.value >= (*info->itimer)[which].actual.value) {
            (*info->itimer)[which].app.value -= (*info->itimer)[which].actual.value;
            /* The itimer counters are 64-bit microsecond counts, so they are
             * printed with the 64-bit format rather than %d.
             */
            LOG(THREAD, LOG_ASYNCH, 2, "\tapp value is now " INT64_FORMAT_STRING "\n",
                (*info->itimer)[which].app.value);
            if ((*info->itimer)[which].app.value == 0) {
                /* The app's timer expired: deliver and reload from its interval. */
                pass_to_app = true;
                (*info->itimer)[which].app.value = (*info->itimer)[which].app.interval;
            } else
                reset_timer_manually = true;
        }
    } else if ((*info->itimer)[which].dr.value == 0) {
        /* This is an explicitly-sent signal, rather than generated by an itimer.
         * XXX i#5017: We need to also identify such a signal when the app has no
         * itimer and DR has one as today we'll swallow it and assume it was part of
         * the DR itimer.
         */
        pass_to_app = true;
    }
    if ((*info->itimer)[which].dr.value > 0) {
        /* Alarm could have been on its way when DR value changed */
        if ((*info->itimer)[which].dr.value >= (*info->itimer)[which].actual.value) {
            (*info->itimer)[which].dr.value -= (*info->itimer)[which].actual.value;
            LOG(THREAD, LOG_ASYNCH, 2, "\tdr value is now " INT64_FORMAT_STRING "\n",
                (*info->itimer)[which].dr.value);
            if ((*info->itimer)[which].dr.value == 0) {
                /* DR's timer expired: run the callback and reload from its interval. */
                invoke_cb = true;
                (*info->itimer)[which].dr.value = (*info->itimer)[which].dr.interval;
            } else
                reset_timer_manually = true;
        }
    }
    /* for efficiency we let the kernel reset the value to interval if
     * there's only one timer
     */
    if (reset_timer_manually) {
        (*info->itimer)[which].actual.value = 0;
        itimer_new_settings(dcontext, which, true /*doesn't matter: actual.value==0*/);
    } else
        (*info->itimer)[which].actual.value = (*info->itimer)[which].actual.interval;

    if (invoke_cb) {
        /* invoke after setting new itimer value */
        /* we save stack space by allocating superset dr_mcontext_t */
        dr_mcontext_t dmc;
        dr_mcontext_init(&dmc);
        priv_mcontext_t *mc = dr_mcontext_as_priv_mcontext(&dmc);
        ucontext_to_mcontext(mc, ucxt);
        void (*cb)(dcontext_t *, priv_mcontext_t *) = (*info->itimer)[which].cb;
        void (*cb_api)(dcontext_t *, dr_mcontext_t *) = (*info->itimer)[which].cb_api;

        /* For ITIMER_VIRTUAL the lock is dropped before the callback runs. */
        if (which == ITIMER_VIRTUAL && info->shared_itimer && should_release_lock) {
            release_recursive_lock(&(*info->itimer)[which].lock);
            should_release_lock = false;
        }

        if (cb != NULL) {
            cb(dcontext, mc);
        } else {
            cb_api(dcontext, &dmc);
        }
    }
    if (info->shared_itimer && should_release_lock)
        release_recursive_lock(&(*info->itimer)[which].lock);
    return pass_to_app;
}

/* Arms DR's itimers for this thread.  For shared itimers the under-DR refcount
 * is incremented and the timers are only (re)started when this thread is the
 * first one counted.  Calling this more than once for the same thread is *not*
 * safe: the refcount would be inflated and cleanup prevented.
 */
void
start_itimer(dcontext_t *dcontext)
{
    thread_sig_info_t *info = (thread_sig_info_t *)dcontext->signal_field;
    ASSERT(info != NULL && info->itimer != NULL);
    bool first_under_dr = true;
    if (info->shared_itimer) {
        /* i#2993: the lock is deliberately not taken here, since an alarm signal
         * arriving during the lock routine (esp in debug build) causes problems.
         */
        int count_after =
            atomic_add_exchange_int((volatile int *)info->shared_itimer_underDR, 1);
        first_under_dr = (count_after == 1);
    }
    if (!first_under_dr)
        return;
    /* At least one thread in this set of threads sharing itimers is now under
     * DR control, so enable all DR itimers.
     */
    LOG(THREAD, LOG_ASYNCH, 2, "starting DR itimers from thread " TIDFMT "\n",
        d_r_get_thread_id());
    int which = 0;
    while (which < NUM_ITIMERS) {
        if (info->shared_itimer)
            acquire_recursive_lock(&(*info->itimer)[which].lock);
        /* A timer may have been registered earlier with its start delayed (i#2907). */
        if ((*info->itimer)[which].dr.interval > 0) {
            (*info->itimer)[which].dr.value = (*info->itimer)[which].dr.interval;
            itimer_new_settings(dcontext, which, false /*!app*/);
        }
        if (info->shared_itimer)
            release_recursive_lock(&(*info->itimer)[which].lock);
        which++;
    }
}

/* Disarms DR's itimers for this thread.  For shared itimers the under-DR
 * refcount is decremented and the timers are only turned off once the count
 * reaches zero.  Calling this more than once on the same thread is not safe.
 */
void
stop_itimer(dcontext_t *dcontext)
{
    thread_sig_info_t *info = (thread_sig_info_t *)dcontext->signal_field;
    ASSERT(info != NULL && info->itimer != NULL);
    bool last_under_dr = true;
    if (info->shared_itimer) {
        ASSERT(*info->shared_itimer_underDR > 0);
        int count_after =
            atomic_add_exchange_int((volatile int *)info->shared_itimer_underDR, -1);
        last_under_dr = (count_after == 0);
    }
    if (!last_under_dr)
        return;
    /* The set of threads sharing this itimer is now completely native, so
     * disable all DR itimers.
     */
    LOG(THREAD, LOG_ASYNCH, 2, "stopping DR itimers from thread " TIDFMT "\n",
        d_r_get_thread_id());
    int which = 0;
    while (which < NUM_ITIMERS) {
        if (info->shared_itimer)
            acquire_recursive_lock(&(*info->itimer)[which].lock);
        if ((*info->itimer)[which].dr.value > 0) {
            (*info->itimer)[which].dr.value = 0;
            if ((*info->itimer)[which].app.value > 0) {
                /* The app still has a timer pending: fall back to its interval. */
                (*info->itimer)[which].actual.interval =
                    (*info->itimer)[which].app.interval;
            } else {
                set_actual_itimer(dcontext, which, info, false /*disable*/);
            }
        }
        if (info->shared_itimer)
            release_recursive_lock(&(*info->itimer)[which].lock);
        which++;
    }
}

/* Handles an app setitimer syscall before it executes: records the requested
 * interval and value as the app's settings for itimer type "which" (keeping a
 * copy of the previous app settings in case the syscall fails) and recomputes
 * the actual timer via itimer_new_settings().
 *
 * Does nothing if "new_timer" is NULL, if "which" is not a valid itimer type,
 * or if "new_timer" cannot be safely read.  "prev_timer" is not used here.
 *
 * handle_pre_alarm also calls this function and passes NULL as prev_timer.
 */
void
handle_pre_setitimer(dcontext_t *dcontext, int which, const struct itimerval *new_timer,
                     struct itimerval *prev_timer)
{
    if (new_timer == NULL || which < 0 || which >= NUM_ITIMERS)
        return;
    thread_sig_info_t *info = (thread_sig_info_t *)dcontext->signal_field;
    ASSERT(info != NULL && info->itimer != NULL);
    struct itimerval val;
    if (d_r_safe_read(new_timer, sizeof(val), &val)) {
        if (info->shared_itimer)
            acquire_recursive_lock(&(*info->itimer)[which].lock);
        /* save a copy in case the syscall fails */
        (*info->itimer)[which].app_saved = (*info->itimer)[which].app;
        (*info->itimer)[which].app.interval = timeval_to_usec(&val.it_interval);
        (*info->itimer)[which].app.value = timeval_to_usec(&val.it_value);
        /* The itimer counters are 64-bit microsecond counts, so they are printed
         * with the 64-bit format rather than a size_t-width one.
         */
        LOG(THREAD, LOG_ASYNCH, 2,
            "app setitimer type=%d interval=" INT64_FORMAT_STRING
            " value=" INT64_FORMAT_STRING "\n",
            which, (*info->itimer)[which].app.interval,
            (*info->itimer)[which].app.value);
        itimer_new_settings(dcontext, which, true /*app*/);
        if (info->shared_itimer)
            release_recursive_lock(&(*info->itimer)[which].lock);
    }
}

/* Handles an app setitimer syscall after it has executed.  On failure the app
 * settings saved by the pre-syscall handler are restored and the actual timer
 * is recomputed.  On success, a non-NULL "prev_timer" output is post-processed
 * by handle_post_getitimer().  Nothing is done when "new_timer" is NULL or
 * "which" is not a valid itimer type.
 */
void
handle_post_setitimer(dcontext_t *dcontext, bool success, int which,
                      const struct itimerval *new_timer, struct itimerval *prev_timer)
{
    if (new_timer == NULL || which < 0 || which >= NUM_ITIMERS) {
        ASSERT(new_timer == NULL || !success);
        return;
    }
    thread_sig_info_t *info = (thread_sig_info_t *)dcontext->signal_field;
    ASSERT(info != NULL && info->itimer != NULL);
    ASSERT(which >= 0 && which < NUM_ITIMERS);
    if (success) {
        if (prev_timer != NULL)
            handle_post_getitimer(dcontext, success, which, prev_timer);
        return;
    }
    /* The syscall failed: put back the pre-syscall app settings. */
    if (info->shared_itimer)
        acquire_recursive_lock(&(*info->itimer)[which].lock);
    (*info->itimer)[which].app = (*info->itimer)[which].app_saved;
    itimer_new_settings(dcontext, which, true /*app*/);
    if (info->shared_itimer)
        release_recursive_lock(&(*info->itimer)[which].lock);
}

/* Rewrites the itimerval the kernel returned to the app in "cur_timer" so that
 * it reflects the app's recorded settings for itimer type "which" rather than
 * the actual kernel timer.  Does nothing unless "success" is set.
 */
void
handle_post_getitimer(dcontext_t *dcontext, bool success, int which,
                      struct itimerval *cur_timer)
{
    thread_sig_info_t *info = (thread_sig_info_t *)dcontext->signal_field;
    ASSERT(info != NULL && info->itimer != NULL);
    if (!success)
        return;
    /* The kernel's write succeeded, but we are in user mode and can race. */
    struct timeval val;
    DEBUG_DECLARE(bool ok;)
    ASSERT(which >= 0 && which < NUM_ITIMERS);
    ASSERT(cur_timer != NULL);
    if (info->shared_itimer)
        acquire_recursive_lock(&(*info->itimer)[which].lock);
    /* Report the app's own interval. */
    usec_to_timeval((*info->itimer)[which].app.interval, &val);
    IF_DEBUG(ok =)
    safe_write_ex(&cur_timer->it_interval, sizeof(val), &val, NULL);
    ASSERT(ok);
    if (!d_r_safe_read(&cur_timer->it_value, sizeof(val), &val)) {
        ASSERT_NOT_REACHED();
    } else {
        /* Time consumed so far: the last-asked-for actual value minus what the
         * kernel reports as remaining.  Take that off the app's value.
         */
        uint64 elapsed = (*info->itimer)[which].actual.value - timeval_to_usec(&val);
        uint64 left = (*info->itimer)[which].app.value - elapsed;
        usec_to_timeval(left, &val);
        IF_DEBUG(ok =)
        safe_write_ex(&cur_timer->it_value, sizeof(val), &val, NULL);
        ASSERT(ok);
    }
    if (info->shared_itimer)
        release_recursive_lock(&(*info->itimer)[which].lock);
}

/* Pre-syscall handling for the app's alarm syscall.
 * alarm shares ITIMER_REAL and is treated as a setitimer request with a
 * one-shot value of "sec" seconds and no interval.
 */
void
handle_pre_alarm(dcontext_t *dcontext, unsigned int sec)
{
    struct itimerval one_shot = {
        .it_interval = { .tv_sec = 0, .tv_usec = 0 },
        .it_value = { .tv_sec = sec, .tv_usec = 0 },
    };
    handle_pre_setitimer(dcontext, ITIMER_REAL, &one_shot, NULL);
}

/* Post-syscall handling for the app's alarm syscall.  alarm always succeeds, so
 * there is nothing to undo here; success is merely asserted.
 */
void
handle_post_alarm(dcontext_t *dcontext, bool success, unsigned int sec)
{
    ASSERT(success);
}

/***************************************************************************
 * Internal DR communication
 */

/* Arguments handed from sig_detach() to sig_detach_go_native() across the
 * switch to the app stack.
 */
typedef struct _sig_detach_info_t {
    /* Synch variable passed to notify_and_jmp_without_stack() as the
     * notification target.
     */
    KSYNCH_TYPE *detached;
    /* Location on the app stack of the copied signal frame. */
    byte *sigframe_xsp;
#ifdef HAVE_SIGALTSTACK
    /* The app's alternate signal stack, re-installed via sigaltstack_syscall()
     * just before going native.
     */
    stack_t *app_sigstack;
#endif
} sig_detach_info_t;

/* Sets "notify_var" to 1, switches the stack pointer to "xsp", and transfers
 * control to "continuation", without relying on the current stack after the
 * notification has been made.
 *
 * When ksynch_kernel_support() is true, the store of 1, the stack switch and a
 * jump to dynamorio_condvar_wake_and_jmp() (with notify_var and continuation
 * in registers) are done in inline asm.  Otherwise the value is set with
 * ksynch_set_value() and the asm only switches the stack and jumps directly to
 * "continuation".
 *
 * Not expected to be reached when DR_HOST_NOT_TARGET is defined.
 */
static void
notify_and_jmp_without_stack(KSYNCH_TYPE *notify_var, byte *continuation, byte *xsp)
{
    if (ksynch_kernel_support()) {
        /* Can't use dstack once we signal so in asm we do:
         *   futex/semaphore = 1;
         *   %xsp = xsp;
         *   dynamorio_condvar_wake_and_jmp(notify_var, continuation);
         */
#ifdef MACOS
        ASSERT(sizeof(notify_var->sem) == 4);
#endif
        /* We clobber xsp last just in case the compiler uses xsp to fetch the
         * variables we're loading into registers.
         */
#ifdef DR_HOST_NOT_TARGET
        ASSERT_NOT_REACHED();
#elif defined(X86)
#    ifndef MACOS
        /* i#2632: recent clang for 32-bit annoyingly won't do the right thing for
         * "jmp dynamorio_condvar_wake_and_jmp" and leaves relocs so we ensure it's PIC.
         * We do this first as it may end up clobbering a scratch reg like xax.
         */
        void (*asm_jmp_tgt)() = dynamorio_condvar_wake_and_jmp;
        asm("mov  %0, %%" ASM_XDX : : "m"(asm_jmp_tgt));
#    endif
        asm("mov %0, %%" ASM_XAX : : "m"(notify_var));
        asm("mov %0, %%" ASM_XCX : : "m"(continuation));
        asm("mov %0, %%" ASM_XSP : : "m"(xsp)); /* Clobber xsp last (see above). */
#    ifdef MACOS
        /* On MacOS the 4-byte store of 1 goes to offset 4 within *notify_var. */
        asm("movl $1,4(%" ASM_XAX ")");
        asm("jmp _dynamorio_condvar_wake_and_jmp");
#    else
        asm("movl $1,(%" ASM_XAX ")");
        asm("jmp  *%" ASM_XDX);
#    endif
#elif defined(AARCHXX)
        /* Here the store of 1 happens before the stack pointer is switched. */
        asm("ldr " ASM_R0 ", %0" : : "m"(notify_var));
        asm("mov " ASM_R1 ", #1");
        asm("str " ASM_R1 ",[" ASM_R0 "]");
        asm("ldr " ASM_R1 ", %0" : : "m"(continuation));
        asm("ldr " ASM_R2 ", %0" : : "m"(xsp));
        asm("mov " ASM_XSP ", " ASM_R2); /* Clobber xsp last (see above). */
#    ifdef MACOS
        asm("b _dynamorio_condvar_wake_and_jmp");
#    else
        asm("b dynamorio_condvar_wake_and_jmp");
#    endif
#elif defined(RISCV64)
        /* NOTE(review): "sd" is a 64-bit store whereas the x86 path stores 4
         * bytes with movl -- confirm KSYNCH_TYPE is wide enough on RISC-V.
         */
        asm("ld " ASM_R0 ", %0" : : "m"(notify_var));
        asm("li " ASM_R1 ", 1");
        asm("sd " ASM_R1 ",(" ASM_R0 ")");
        asm("ld " ASM_R1 ", %0" : : "m"(continuation));
        asm("ld " ASM_R2 ", %0" : : "m"(xsp)); /* Clobber xsp last (see above). */
        asm("mv " ASM_XSP ", " ASM_R2);
        asm("j dynamorio_condvar_wake_and_jmp");
#endif
    } else {
        /* No kernel synch support: set the value here, then just switch stacks
         * and jump to the continuation.
         */
        ksynch_set_value(notify_var, 1);
#ifdef DR_HOST_NOT_TARGET
        ASSERT_NOT_REACHED();
#elif defined(X86)
        asm("mov %0, %%" ASM_XAX : : "m"(continuation));
        asm("mov %0, %%" ASM_XSP : : "m"(xsp)); /* Clobber xsp last (see above). */
        asm("jmp *%" ASM_XAX);
#elif defined(AARCHXX)
        asm("ldr " ASM_R0 ", %0" : : "m"(continuation));
        asm("ldr " ASM_R1 ", %0" : : "m"(xsp)); /* Clobber xsp last (see above). */
        asm("mov " ASM_XSP ", " ASM_R1);
        asm(ASM_INDJMP " " ASM_R0);
#elif defined(RISCV64)
        asm("ld " ASM_R0 ", %0" : : "m"(continuation));
        asm("ld " ASM_R1 ", %0" : : "m"(xsp)); /* Clobber xsp last (see above). */
        asm("mv " ASM_XSP ", " ASM_R1);
        asm(ASM_INDJMP " " ASM_R0);
#endif /* X86/ARM */
    }
}

/* Final step of a detach: re-installs the app's alternate signal stack (when
 * supported), then signals "detached" and jumps to dynamorio_sigreturn with the
 * stack pointer at the copied signal frame.  Does not return.
 * This is executed on the app stack.
 */
static void
sig_detach_go_native(sig_detach_info_t *info)
{
    byte *frame_sp = info->sigframe_xsp;

#if defined(HAVE_SIGALTSTACK)
    /* sigreturn will overwrite this with the uc_stack in the frame's ucontext
     * anyway (which was already set for the app), but restore it regardless.
     */
    DEBUG_DECLARE(int rc =)
    sigaltstack_syscall(info->app_sigstack, NULL);
    ASSERT(rc == 0);
#endif

#if defined(X86)
    /* Step over the pretcode slot at the start of the frame. */
    frame_sp = frame_sp + sizeof(char *);
#endif
    notify_and_jmp_without_stack(info->detached, (byte *)dynamorio_sigreturn, frame_sp);

    ASSERT_NOT_REACHED();
}

/* Sets this (secondary) thread to detach by directly returning from the signal.
 *
 * "frame" is the signal frame being handled; it is copied to the app stack and
 * control then switches to that stack and runs sig_detach_go_native(), which
 * notifies "detached".  Does not return.
 */
static void
sig_detach(dcontext_t *dcontext, sigframe_rt_t *frame, KSYNCH_TYPE *detached)
{
    thread_sig_info_t *info = (thread_sig_info_t *)dcontext->signal_field;
    byte *xsp;
    sig_detach_info_t detach_info;

    LOG(THREAD, LOG_ASYNCH, 1, "%s: detaching\n", __FUNCTION__);
    DOLOG(3, LOG_ASYNCH,
          { dump_sigcontext(dcontext, get_sigcontext_from_rt_frame(frame)); });

    if (!atomic_read_bool(&multiple_handlers_present)) {
        /* Save the app's handlers (we only support one handler setups) for
         * execute_native_handler().
         */
        d_r_read_lock(&detached_sigact_lock);
        for (int i = 0; i < SIGARRAY_SIZE; ++i) {
            if (info->sighand->action[i] != NULL &&
                detached_sigact[i].handler == (handler_t)0) {
                /* Trade the read lock for the write lock, then re-check the
                 * slot since it may have been filled in the meantime.
                 */
                d_r_read_unlock(&detached_sigact_lock);
                d_r_write_lock(&detached_sigact_lock);
                if (detached_sigact[i].handler == (handler_t)0) {
                    memcpy(&detached_sigact[i], info->sighand->action[i],
                           sizeof(detached_sigact[i]));
                }
                d_r_write_unlock(&detached_sigact_lock);
                d_r_read_lock(&detached_sigact_lock);
            }
        }
        d_r_read_unlock(&detached_sigact_lock);
    }

    /* Update the mask of the signal frame so that the later sigreturn will
     * restore the app signal mask.
     */
    memcpy(&frame->uc.uc_sigmask, &info->app_sigblocked, sizeof(info->app_sigblocked));

    /* Copy the signal frame to the app stack.
     * XXX: We live with the transparency risk of storing the signal frame on
     * the app stack: we assume the app stack is writable where we need it to be,
     * and that we're not clobbering any app data beyond TOS.
     */
    xsp = get_sigstack_frame_ptr(dcontext, info, SUSPEND_SIGNAL, frame);
    copy_frame_to_stack(dcontext, info, SUSPEND_SIGNAL, frame, xsp, false /*!pending*/);

#ifdef HAVE_SIGALTSTACK
    /* Make sure the frame's sigstack reflects the app stack.
     * copy_frame_to_stack() should have done this for us.
     */
    ASSERT(((sigframe_rt_t *)xsp)->uc.uc_stack.ss_sp == info->app_sigstack.ss_sp);
#endif

    /* Restore app segment registers. */
    os_thread_not_under_dynamo(dcontext);
    os_tls_thread_exit(dcontext->local_state);

#ifdef HAVE_SIGALTSTACK
    /* We can't restore the app's sigstack here as that will invalidate the
     * sigstack we're currently on.
     */
    detach_info.app_sigstack = &info->app_sigstack;
#endif
    detach_info.detached = detached;
    detach_info.sigframe_xsp = xsp;

    /* Continue in sig_detach_go_native() with the stack pointer at the copied
     * frame.
     */
    call_switch_stack(&detach_info, xsp, (void (*)(void *))sig_detach_go_native,
                      false /*free_initstack*/, false /*do not return*/);

    ASSERT_NOT_REACHED();
}

/* Returns whether the signal described by "siginfo" looks like one of DR's own
 * internal signals (nudge/suspend) of number "sig_expecting", as opposed to an
 * app signal.
 */
static bool
is_signal_for_us(kernel_siginfo_t *siginfo, int sig_expecting)
{
    /* Telling a nudge/suspend apart from an app signal is much easier on Linux
     * with SYS_rt_tgsigqueueinfo support: an app going through libc sigqueue()
     * can never be mistaken for us, as libc does not expose the
     * kernel_siginfo_t and always passes 0 for si_errno.  Beyond the si_code
     * check, the only remaining concern is an app issuing a raw syscall that is
     * deliberately trying to fool us.
     * Although kernel_siginfo_t has plenty of padding, the kernel does not copy
     * it through on SYS_rt_sigqueueinfo, leaving no room for dedicated magic
     * numbers.  The client id could serve as a magic number for client nudges,
     * but killing the app because an external nudger mistyped the client id
     * does not seem desirable.
     */
    LOG(THREAD_GET, LOG_ASYNCH, 2, "%s T%d: sig=%d code=%d errno=%d\n", __FUNCTION__,
        get_sys_thread_id(), siginfo->si_signo, siginfo->si_code, siginfo->si_errno);
    if (siginfo->si_signo != sig_expecting)
        return false;
    if (is_sigqueue_supported()) {
        /* Ours are queued and carry a non-zero si_errno. */
        return siginfo->si_code == SI_QUEUE && siginfo->si_errno != 0;
    }
    /* Without sigqueue support, rule out a synchronous app signal: an si_code
     * that is <= 0 means user-sent.
     */
    return siginfo->si_code <= 0;
}

/* Handles receipt of SUSPEND_SIGNAL.
 *
 * Depending on the per-thread state in dcontext->os_field, the signal either
 * terminates this thread, takes a native thread over, or parks this thread
 * until ostd->wakeup is set, after which it may re-take-over or detach.
 * Signals that are not DR's own, or that arrive when no suspension has been
 * requested, are left for the app.
 *
 * "siginfo" and "ucxt" describe the signal and the interrupted context;
 * "frame" is the signal frame, used only on the detach path.
 *
 * Returns whether to pass on to app.
 */
static bool
handle_suspend_signal(dcontext_t *dcontext, kernel_siginfo_t *siginfo,
                      kernel_ucontext_t *ucxt, sigframe_rt_t *frame)
{
    os_thread_data_t *ostd = (os_thread_data_t *)dcontext->os_field;
    kernel_sigset_t prevmask;
    sig_full_cxt_t sc_full;
    ASSERT(ostd != NULL);

    if (!is_signal_for_us(siginfo, SUSPEND_SIGNAL))
        return true; /* pass to app */

    /* When the suspend and nudge signals share a number, a signal without the
     * suspend flag set is dispatched to the nudge handler.
     */
    if (is_sigqueue_supported() && SUSPEND_SIGNAL == NUDGESIG_SIGNUM) {
        nudge_arg_t *arg = (nudge_arg_t *)siginfo;
        if (!TEST(NUDGE_IS_SUSPEND, arg->flags))
            return handle_nudge_signal(dcontext, siginfo, ucxt);
    }

    /* We distinguish from an app signal further below from the rare case of an
     * app sending SUSPEND_SIGNAL asynchronously by looking for our particular
     * state settings that correspond to the use of this signal by DR.
     */

    if (ostd->terminate) {
        /* PR 297902: exit this thread, without using the dstack */
        /* For MacOS, we need a stack as 32-bit syscalls take args on the stack.
         * We go ahead and use it for x86 too for simpler sysenter return.
         * We don't have a lot of options: we're terminating, so we go ahead
         * and use the app stack.
         */
        byte *app_xsp;
        if (IS_CLIENT_THREAD(dcontext))
            app_xsp = (byte *)SIGCXT_FROM_UCXT(ucxt)->SC_XSP;
        else
            app_xsp = (byte *)get_mcontext(dcontext)->xsp;
        LOG(THREAD, LOG_ASYNCH, 2, "handle_suspend_signal: exiting\n");
        ASSERT(app_xsp != NULL);
        notify_and_jmp_without_stack(&ostd->terminated, (byte *)dynamorio_sys_exit,
                                     app_xsp);
        ASSERT_NOT_REACHED();
        return false;
    }

    /* A currently-native, non-client thread outside of detach is taken over. */
    if (!doing_detach && is_thread_currently_native(dcontext->thread_record) &&
        !IS_CLIENT_THREAD(dcontext) IF_APP_EXPORTS(&&!dr_api_exit)) {
        if (!sig_take_over(ucxt))
            return false;
        ASSERT_NOT_REACHED(); /* else, shouldn't return */
    }

    /* If suspend_count is 0, we are not trying to suspend this thread
     * (os_thread_resume() may have already decremented suspend_count to 0, but
     * os_thread_suspend() will not send a signal until this thread unsets
     * ostd->suspended, so not having a lock around the suspend_count read is
     * ok), so pass signal to app.
     * If we are trying or have already suspended this thread, our own
     * os_thread_suspend() will not send a 2nd suspend signal until we are
     * completely resumed, so we can distinguish app uses of SUSPEND_SIGNAL.  We
     * can't have a race between the read and write of suspended_sigcxt b/c
     * signals are blocked.  It's fine to have a race and reorder the app's
     * signal w/ DR's.
     */
    if (ostd->suspend_count == 0)
        return true; /* pass to app */
    ASSERT(ostd->suspended_sigcxt == NULL);

    /* XXX: we're not setting DR_WHERE_SIGNAL_HANDLER in enough places.
     * It's trickier than other whereamis b/c we want to resume the
     * prior whereami when we return from the handler, but there are
     * complex control paths that do not always return.
     * We try to at least do it for the ksynch_wait here.
     */
    dr_where_am_i_t prior_whereami = dcontext->whereami;
    dcontext->whereami = DR_WHERE_SIGNAL_HANDLER;

    /* Publish the interrupted context for the duration of the suspension. */
    sig_full_initialize(&sc_full, ucxt);
    ostd->suspended_sigcxt = &sc_full;

    LOG(THREAD, LOG_ASYNCH, 2, "handle_suspend_signal: suspended now\n");
    /* We cannot use mutexes here as we have interrupted DR at an
     * arbitrary point!  Thus we can't use the event_t routines.
     * However, the existing synch and check above prevent any
     * re-entrance here, and our cond vars target just a single thread,
     * so we can get away w/o a mutex.
     */
    /* Notify os_thread_suspend that it can now return, as this thread is
     * officially suspended now and is ready for thread_{get,set}_mcontext.
     */
    ASSERT(ksynch_get_value(&ostd->suspended) == 0);
    ksynch_set_value(&ostd->suspended, 1);
    ksynch_wake_all(&ostd->suspended);

    /* We're sitting on our sigaltstack w/ all signals blocked.  We're
     * going to stay here but unblock all signals so we don't lose any
     * delivered while we're waiting.  We're at a safe enough point (now
     * that we've set ostd->suspended: i#5779) to re-enter
     * main_signal_handler().  We use a mutex in thread_{suspend,resume} to
     * prevent our own re-suspension signal from arriving before we've
     * re-blocked on the resume.
     */
    sigprocmask_syscall(SIG_SETMASK, SIGMASK_FROM_UCXT(ucxt), &prevmask,
                        sizeof(ucxt->uc_sigmask));

    /* i#96/PR 295561: use futex(2) if available */
    while (ksynch_get_value(&ostd->wakeup) == 0) {
        /* Waits only if the wakeup flag is not set as 1. Return value
         * doesn't matter because the flag will be re-checked.
         */
        ksynch_wait(&ostd->wakeup, 0, 0);
        if (ksynch_get_value(&ostd->wakeup) == 0) {
            /* If it still has to wait, give up the cpu. */
            os_thread_yield();
        }
    }
    LOG(THREAD, LOG_ASYNCH, 2, "handle_suspend_signal: awake now\n");

    /* re-block so our exit from main_signal_handler is not interrupted */
    sigprocmask_syscall(SIG_SETMASK, &prevmask, NULL, sizeof(prevmask));
    ostd->suspended_sigcxt = NULL;

    /* Notify os_thread_resume that it can return now, which (assuming
     * suspend_count is back to 0) means it's then safe to re-suspend.
     */
    ksynch_set_value(&ostd->suspended, 0); /*reset prior to signalling os_thread_resume*/
    ksynch_set_value(&ostd->resumed, 1);
    ksynch_wake_all(&ostd->resumed);

    dcontext->whereami = prior_whereami;

    /* Each request flag is cleared before it is acted upon. */
    if (ostd->retakeover) {
        ostd->retakeover = false;
        sig_take_over(ucxt); /* shouldn't return for this case */
        ASSERT_NOT_REACHED();
    } else if (ostd->do_detach) {
        ostd->do_detach = false;
        sig_detach(dcontext, frame, &ostd->detached); /* no return */
        ASSERT_NOT_REACHED();
    }

    return false; /* do not pass to app */
}

/* PR 206278: try/except needs the signal mask saved at setjmp time.
 * Only DEBUG builds actually record the current mask into buf->sigmask here.
 */
void
dr_setjmp_sigmask(dr_jmp_buf_t *buf)
{
    /* i#226/PR 492568: the kernel stores the prior mask in the signal frame and
     * we rely on that, which spares us from saving it on every setjmp (a
     * potential performance hit).
     */
#if defined(DEBUG)
    sigprocmask_syscall(SIG_SETMASK, NULL, &buf->sigmask, sizeof(buf->sigmask));
#endif
}

/* i#61/PR 211530: nudge support on Linux.
 *
 * Classifies a NUDGESIG_SIGNUM signal as either a nudge directed at us or an
 * ordinary application signal.  A genuine nudge is queued through
 * nudge_add_pending() so it can be processed later at a safe point.
 *
 * Returns true when the signal should be passed on to the app, and false
 * when it has been consumed (or deliberately dropped) here.
 */
static bool
handle_nudge_signal(dcontext_t *dcontext, kernel_siginfo_t *siginfo,
                    kernel_ucontext_t *ucxt)
{
    sigcontext_t *sc = SIGCXT_FROM_UCXT(ucxt);
    /* The nudge arguments are read by viewing the siginfo as a nudge_arg_t. */
    nudge_arg_t *nudge = (nudge_arg_t *)siginfo;
    instr_t decoded;
    char code_copy[MAX_INSTR_LENGTH];
    bool real_illegal_instr = false;

    /* Anything that is not addressed to us belongs to the app. */
    if (!is_signal_for_us(siginfo, NUDGESIG_SIGNUM))
        return true; /* pass to app */
#ifndef VMX86_SERVER
    DODEBUG({
        if (TEST(NUDGE_GENERIC(client), nudge->nudge_action_mask) &&
            !is_valid_client_id(nudge->client_id)) {
            SYSLOG_INTERNAL_WARNING("received client nudge for invalid id=0x%x",
                                    nudge->client_id);
        }
    });
#endif
    if (dynamo_exited || !dynamo_initialized || dcontext == NULL) {
        /* The nudge arrived too early or too late, so drop it.
         * Xref Windows handling of such cases in nudge.c: old case 5702, etc.
         * This test must precede the illegal-instr check below, as decoding
         * is unsafe when we are too early or too late.
         */
        SYSLOG_INTERNAL_WARNING("too-early or too-late nudge: ignoring");
        return false; /* do not pass to app */
    }

    /* Extra sanity check: try to tell whether the signal was in fact raised
     * synchronously by a real illegal instruction (although si_code should
     * not be SI_QUEUE in that case).  A nudge could happen to land on a bad
     * instr before it faults, or the instr following a syscall or other wait
     * spot could be illegal; we accept that risk for nudges.  We do not
     * perform this check for suspend signals.
     */
    ASSERT(NUDGESIG_SIGNUM == SIGILL); /* else this check makes no sense */
    instr_init(dcontext, &decoded);
    if (d_r_safe_read((byte *)sc->SC_XIP, sizeof(code_copy), code_copy)) {
        if (decode(dcontext, (byte *)code_copy, &decoded) == NULL) {
            real_illegal_instr = true;
        } else if (instr_is_undefined(&decoded)) {
            /* check for ud2 (xref PR 523161) */
            real_illegal_instr = true;
        }
    }
    if (real_illegal_instr) {
        LOG(THREAD, LOG_ASYNCH, 2, "%s: real illegal instr @" PFX "\n", __FUNCTION__,
            sc->SC_XIP);
        DOLOG(2, LOG_ASYNCH,
              { disassemble_with_bytes(dcontext, (byte *)sc->SC_XIP, THREAD); });
    }
    /* Single release point for the decoded instr, whichever way we go. */
    instr_free(dcontext, &decoded);
    if (real_illegal_instr)
        return true; /* pass to app */

#ifdef VMX86_SERVER
    /* Treat as a client nudge until we have PR 477454 */
    if (siginfo->si_errno == 0) {
        nudge->version = NUDGE_ARG_CURRENT_VERSION;
        nudge->flags = 0;
        nudge->nudge_action_mask = NUDGE_GENERIC(client);
        nudge->client_id = 0;
        nudge->client_arg = 0;
    }
#endif

    LOG(THREAD, LOG_ASYNCH, 1,
        "received nudge version=%u flags=0x%x mask=0x%x id=0x%08x "
        "arg=0x" ZHEX64_FORMAT_STRING "\n",
        nudge->version, nudge->flags, nudge->nudge_action_mask, nudge->client_id,
        nudge->client_arg);
    SYSLOG_INTERNAL_INFO("received nudge mask=0x%x id=0x%08x arg=0x" ZHEX64_FORMAT_STRING,
                         nudge->nudge_action_mask, nudge->client_id, nudge->client_arg);

    /* The nudge itself has to be handled at a safe, nolinking spot. */
    if (safe_is_in_fcache(dcontext, (byte *)sc->SC_XIP, (byte *)sc->SC_XSP) &&
        dcontext->interrupted_for_nudge == NULL) {
        /* To bound nudge delivery time we unlink the interrupted fragment and
         * skip any inlined syscalls.  If one fragment is already unlinked we
         * assume that is enough and do nothing more.
         */
        fragment_t wrapper;
        fragment_t *interrupted =
            fragment_pclookup(dcontext, (byte *)sc->SC_XIP, &wrapper);
        if (interrupted != NULL &&
            unlink_fragment_for_signal(dcontext, interrupted, (byte *)sc->SC_XIP))
            dcontext->interrupted_for_nudge = interrupted;
    }

    /* No lock is needed since thread-private and this signal is blocked now */
    nudge_add_pending(dcontext, nudge);

    return false; /* do not pass to app */
}