/* **********************************************************
 * Copyright (c) 2010-2023 Google, Inc.  All rights reserved.
 * Copyright (c) 2001-2010 VMware, Inc.  All rights reserved.
 * **********************************************************/

/*
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * * Redistributions of source code must retain the above copyright notice,
 *   this list of conditions and the following disclaimer.
 *
 * * Redistributions in binary form must reproduce the above copyright notice,
 *   this list of conditions and the following disclaimer in the documentation
 *   and/or other materials provided with the distribution.
 *
 * * Neither the name of VMware, Inc. nor the names of its contributors may be
 *   used to endorse or promote products derived from this software without
 *   specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL VMWARE, INC. OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
 * DAMAGE.
 */

/* Copyright (c) 2003-2007 Determina Corp. */
/* Copyright (c) 2001-2003 Massachusetts Institute of Technology */
/* Copyright (c) 2001 Hewlett-Packard Company */

/*
 * heap.c - heap manager
 */

#include "globals.h"
#include <limits.h>

#include "fragment.h" /* for struct sizes */
#include "link.h"     /* for struct sizes */
#include "instr.h"    /* for struct sizes */
#include "fcache.h"   /* fcache_low_on_memory */
#ifdef UNIX
#    include "memquery.h"
#endif
#ifdef DEBUG
#    include "hotpatch.h" /* To handle leak for case 9593. */
#endif
#include "instrument.h"

#ifdef HEAP_ACCOUNTING
#    ifndef DEBUG
#        error HEAP_ACCOUNTING requires DEBUG
#    endif
#endif

#ifdef DEBUG_MEMORY
/* on by default but higher than general asserts */
#    define CHKLVL_MEMFILL CHKLVL_DEFAULT
#endif

extern bool vm_areas_exited;

/***************************************************************************
 * we cannot use malloc in the middle of interpreting the client program
 * because we could be in the middle of interpreting malloc, which is not
 * always reentrant
 *
 * We have a virtual memory manager which makes sure memory is
 * reserved within the application address space so that we don't have
 * to fight with the application.  We call os_heap_reserve to allocate
 * virtual space in a single consecutive region.  We later use
 * os_heap_commit to get committed memory in large chunks and manage
 * the chunks using a simple scheme of free lists of different sizes.
 * The virtual memory manager has to store out of band information about
 * used and free blocks, since of course there is no real memory to use.
 * The chunks (heap units) store in band extra information both for
 * used and free.  However, in the allocated blocks within a unit we
 * don't need to store any information since heap_free passes in the
 * size; we store the next pointers for the free lists at the start of
 * the free blocks themselves.  We have one large reservation for most of
 * our allocations, and yet another for allocations that we do not
 * plan on ever freeing up on detach - the only unavoidable tombstones
 * are those for thread private code system calls that may be stuck on
 * callbacks.  In case we run out of reserved memory we do fall back
 * on requests from the OS, but any of these may fail if we are
 * competing with the application.
 *
 * looking at dynamo behavior as of Jan 2001, most heap_alloc requests are
 * for < 128 bytes, very few for larger, so we have a bunch of fixed-size
 * blocks of small sizes
 *
 * the UINT_MAX size is a variable-length block, we keep one byte to store
 * the size (again storing the next pointer when free at the start of
 * what we pass to the user)
 */

static const uint BLOCK_SIZES[] = {
    8, /* for instr bits */
#ifndef X64
    /* for x64 future_fragment_t is 24 bytes (could be 20 if we could put flags last) */
    sizeof(future_fragment_t), /* 12 (24 x64) */
#endif
    /* we have a lot of size 16 requests for IR but they are transient */
    24, /* fcache empties and vm_area_t are now 20, vm area extras still 24 */
    /* 40 dbg / 36 rel: */
    ALIGN_FORWARD(sizeof(fragment_t) + sizeof(indirect_linkstub_t), HEAP_ALIGNMENT),
#if defined(X64)
#    ifdef DEBUG
    sizeof(fragment_t) + sizeof(direct_linkstub_t) +
        sizeof(cbr_fallthrough_linkstub_t), /* 112 dbg x64 / 104 rel x64 */
#    else
    sizeof(instr_t), /* 112 x64 */
#    endif
#else
    sizeof(fragment_t) + sizeof(direct_linkstub_t) +
        sizeof(cbr_fallthrough_linkstub_t), /* 60 dbg / 56 rel */
#    ifndef DEBUG
    sizeof(instr_t),                        /* 72 */
#    endif
#endif
    /* we keep this bucket even though only 10% or so of normal bbs
     * hit this.
     */
    ALIGN_FORWARD(sizeof(fragment_t) + 2 * sizeof(direct_linkstub_t),
                  HEAP_ALIGNMENT), /* 68 dbg / 64 rel (128 x64) */
    ALIGN_FORWARD(sizeof(trace_t) + 2 * sizeof(direct_linkstub_t) + sizeof(uint),
                  HEAP_ALIGNMENT), /* 80 dbg / 76 rel (148 x64 => 152) */
    /* FIXME: measure whether should put in indirect mixes as well */
    ALIGN_FORWARD(sizeof(trace_t) + 3 * sizeof(direct_linkstub_t) + sizeof(uint),
                  HEAP_ALIGNMENT), /* 96 dbg / 92 rel (180 x64 => 184) */
    ALIGN_FORWARD(sizeof(trace_t) + 5 * sizeof(direct_linkstub_t) + sizeof(uint),
                  HEAP_ALIGNMENT), /* 128 dbg / 124 rel (244 x64 => 248) */
    256, 512, UINT_MAX             /* variable-length */
};
#define BLOCK_TYPES (sizeof(BLOCK_SIZES) / sizeof(uint))

#ifdef DEBUG
/* FIXME: would be nice to have these stats per HEAPACCT category */
/* These are ints only b/c we used to do non-atomic adds and wanted to
 * gracefully handle underflow to negative values
 */
DECLARE_NEVERPROT_VAR(static int block_total_count[BLOCK_TYPES], { 0 });
DECLARE_NEVERPROT_VAR(static int block_count[BLOCK_TYPES], { 0 });
DECLARE_NEVERPROT_VAR(static int block_peak_count[BLOCK_TYPES], { 0 });
DECLARE_NEVERPROT_VAR(static int block_wasted[BLOCK_TYPES], { 0 });
DECLARE_NEVERPROT_VAR(static int block_peak_wasted[BLOCK_TYPES], { 0 });
DECLARE_NEVERPROT_VAR(static int block_align_pad[BLOCK_TYPES], { 0 });
DECLARE_NEVERPROT_VAR(static int block_peak_align_pad[BLOCK_TYPES], { 0 });
DECLARE_NEVERPROT_VAR(static bool out_of_vmheap_once, false);
#endif

/* variable-length: we steal one int for the size */
#define HEADER_SIZE (sizeof(size_t))
/* VARIABLE_SIZE is assignable */
#define VARIABLE_SIZE(p) (*(size_t *)((p)-HEADER_SIZE))
#define MEMSET_HEADER(p, value) VARIABLE_SIZE(p) = HEAP_TO_PTR_UINT(value)
#define GET_VARIABLE_ALLOCATION_SIZE(p) (VARIABLE_SIZE(p) + HEADER_SIZE)

/* The heap is allocated in units.
 * We start out with a small unit. Then each additional unit we
 * need doubles in size, up to a maximum.
 * We keep the initial units small for thread-private heaps, since with
 * thousands of threads the space can add up.
 */
#define HEAP_UNIT_MIN_SIZE DYNAMO_OPTION(initial_heap_unit_size)
#define HEAP_UNIT_MAX_SIZE INTERNAL_OPTION(max_heap_unit_size)
#define GLOBAL_UNIT_MIN_SIZE DYNAMO_OPTION(initial_global_heap_unit_size)

#define GUARD_PAGE_ADJUSTMENT (dynamo_options.guard_pages ? 2 * PAGE_SIZE : 0)

/* gets usable space in the unit */
#define UNITROOM(u) ((size_t)(u->end_pc - u->start_pc))
#define UNIT_RESERVED_ROOM(u) (u->reserved_end_pc - u->start_pc)
/* we keep the heap_unit_t header at top of the unit, this macro calculates
 * the committed size of the unit by adding header size to available size
 */
#define UNIT_COMMIT_SIZE(u) (UNITROOM(u) + sizeof(heap_unit_t))
#define UNIT_RESERVED_SIZE(u) (UNIT_RESERVED_ROOM(u) + sizeof(heap_unit_t))
#define UNIT_ALLOC_START(u) (u->start_pc - sizeof(heap_unit_t))
#define UNIT_GET_START_PC(u) (byte *)(((ptr_uint_t)u) + sizeof(heap_unit_t))
#define UNIT_COMMIT_END(u) (u->end_pc)
#define UNIT_RESERVED_END(u) (u->reserved_end_pc)

/* Gets the allocated size of the unit (reserved size; doesn't include guard pages
 * as those are not considered part of the usable space).
 */
#define UNITALLOC(u) (UNIT_RESERVED_SIZE(u))
/* Gets unit overhead: includes reserved and committed (sizeof(heap_unit_t)) portions. */
#define UNITOVERHEAD sizeof(heap_unit_t)

/* any alloc request larger than this needs a special unit */
#define MAXROOM (HEAP_UNIT_MAX_SIZE - UNITOVERHEAD)

/* maximum valid allocation (to guard against internal integer overflows) */
#define MAX_VALID_HEAP_ALLOCATION INT_MAX

/* thread-local heap structure
 * this struct is kept at top of unit itself, not in separate allocation
 */
typedef struct _heap_unit_t {
    heap_pc start_pc;        /* start address of heap storage */
    heap_pc end_pc;          /* open-ended end address of heap storage */
    heap_pc cur_pc;          /* open-ended current end of allocated storage */
    heap_pc reserved_end_pc; /* open-ended end of reserved (not nec committed) memory */
    bool in_vmarea_list;     /* perf opt for delayed batch vmarea updating */
    which_vmm_t which;
#ifdef DEBUG
    int id; /* # of this unit */
#endif
    struct _heap_unit_t *next_local;  /* used to link thread's units */
    struct _heap_unit_t *next_global; /* used to link all units */
    struct _heap_unit_t *prev_global; /* used to link all units */
} heap_unit_t;

#ifdef HEAP_ACCOUNTING
typedef struct _heap_acct_t {
    size_t alloc_reuse[ACCT_LAST];
    size_t alloc_new[ACCT_LAST];
    size_t cur_usage[ACCT_LAST];
    size_t max_usage[ACCT_LAST];
    size_t max_single[ACCT_LAST];
    uint num_alloc[ACCT_LAST];
} heap_acct_t;
#endif

/* FIXME (case 6336): rename to heap_t:
 *   a heap_t is a collection of units with the same properties
 * to reflect that this is used for more than just thread-private memory.
 * Also rename the "tu" vars to "h"
 */
typedef struct _thread_units_t {
    heap_unit_t *top_unit; /* start of linked list of heap units */
    heap_unit_t *cur_unit; /* current unit in heap list */
    heap_pc free_list[BLOCK_TYPES];
#ifdef DEBUG
    int num_units; /* total # of heap units */
#endif
    dcontext_t *dcontext; /* back pointer to owner */
    which_vmm_t which;
    bool writable; /* remember state of heap protection */
#ifdef HEAP_ACCOUNTING
    heap_acct_t acct;
#endif
} thread_units_t;

#define REACHABLE_HEAP() (IF_X64_ELSE(DYNAMO_OPTION(reachable_heap), true))

/* per-thread structure: */
typedef struct _thread_heap_t {
    thread_units_t *local_heap;
    /* We separate out heap memory used for fragments, linking, and vmarea multi-entries
     * both to enable resetting memory and for safety for unlink flushing in the presence
     * of clean calls out of the cache that might allocate IR memory (which does not
     * use nonpersistent heap).  Any client actions that involve fragments or linking
     * should require couldbelinking status, which makes them safe wrt unlink flushing.
     * Xref DrMi#1791.
     */
    thread_units_t *nonpersistent_heap;
    thread_units_t *reachable_heap; /* Only used if !REACHABLE_HEAP() */
#ifdef UNIX
    /* Used for -satisfy_w_xor_x. */
    heap_pc fork_copy_start;
    size_t fork_copy_size;
    vm_area_vector_t *fork_copy_areas;
#endif
} thread_heap_t;

/* global, unique thread-shared structure:
 * FIXME: give this name to thread_units_t, and name this AllHeapUnits
 */
typedef struct _heap_t {
    heap_unit_t *units; /* list of all allocated units */
    heap_unit_t *dead;  /* list of deleted units ready for re-allocation */
    /* FIXME: num_dead duplicates d_r_stats->heap_num_free, but we want num_dead
     * for release build too, so it's separate...can we do better?
     */
    uint num_dead;
} heap_t;

/* no synch needed since only written once */
static bool heap_exiting = false;

#ifdef DEBUG
DECLARE_NEVERPROT_VAR(static bool ever_beyond_vmm, false);
#endif

/* Lock used only for managing heap units, not for normal thread-local alloc.
 * Must be recursive due to circular dependencies between vmareas and global heap.
 * Furthermore, always grab dynamo_vm_areas_lock() before grabbing this lock,
 * to make DR areas update and heap alloc/free atomic!
 */
DECLARE_CXTSWPROT_VAR(static recursive_lock_t heap_unit_lock,
                      INIT_RECURSIVE_LOCK(heap_unit_lock));
/* N.B.: if these two locks are ever owned at the same time, the convention is
 * that global_alloc_lock MUST be grabbed first, to avoid deadlocks
 */
/* separate lock for global heap access to avoid contention between local unit
 * creation and global heap alloc
 * must be recursive so that heap_vmareas_synch_units can hold it and heap_unit_lock
 * up front to avoid deadlocks, and still allow vmareas to global_alloc --
 * BUT we do NOT want global_heap_alloc() to be able to recurse!
 * FIXME: either find a better solution to the heap_vmareas_synch_units deadlock
 * that is as efficient, or find a way to assert that the only recursion is
 * from heap_vmareas_synch_units to global_alloc
 */
DECLARE_CXTSWPROT_VAR(static recursive_lock_t global_alloc_lock,
                      INIT_RECURSIVE_LOCK(global_alloc_lock));

/* Used to sync low on memory event */
DECLARE_CXTSWPROT_VAR(static recursive_lock_t low_on_memory_pending_lock,
                      INIT_RECURSIVE_LOCK(low_on_memory_pending_lock));

/* Denotes whether or not low on memory event requires triggering. */
DECLARE_FREQPROT_VAR(bool low_on_memory_pending, false);

#if defined(DEBUG) && defined(HEAP_ACCOUNTING) && defined(HOT_PATCHING_INTERFACE)
static int
get_special_heap_header_size(void);
#endif

vm_area_vector_t *landing_pad_areas; /* PR 250294 */
#ifdef WINDOWS
/* i#939: we steal space from ntdll's +rx segment */
static app_pc lpad_temp_writable_start;
static size_t lpad_temp_writable_size;
static void
release_landing_pad_mem(void);
#endif

/* Indicates whether should back out of a global alloc/free and grab the
 * DR areas lock first, to retry
 */
static bool
safe_to_allocate_or_free_heap_units()
{
    return ((!self_owns_recursive_lock(&global_alloc_lock) &&
             !self_owns_recursive_lock(&heap_unit_lock)) ||
            self_owns_dynamo_vm_area_lock());
}

/* indicates a dynamo vm area remove was delayed
 * protected by the heap_unit_lock
 */
DECLARE_FREQPROT_VAR(static bool dynamo_areas_pending_remove, false);

#ifdef HEAP_ACCOUNTING
const char *whichheap_name[] = {
    /* max length for aligned output is length of "BB Fragments" */
    "BB Fragments",
    "Coarse Links",
    "Future Frag",
    "Frag Tables",
    "IBL Tables",
    "Traces",
    "FC Empties",
    "Vm Multis",
    "IR",
    "RCT Tables",
    "VM Areas",
    "Symbols",
#    ifdef SIDELINE
    "Sideline",
#    endif
    "TH Counter",
    "Tombstone",
    "Hot Patching",
    "Thread Mgt",
    "Memory Mgt",
    "Stats",
    "SpecialHeap",
    "Client",
    "Lib Dup",
    "Clean Call",
    /* NOTE: Add your heap name here */
    "Other",
};

/* Since using a lock for these stats adds a lot of contention, we
 * follow a two-pronged strategy:
 * 1) For accurate stats we add a thread's final stats to the global only
 * when it is cleaned up.  But, this prevents global stats from being
 * available in the middle of a run or if a run is not cleaned up nicely.
 * 2) We have a set of heap_accounting stats for incremental global stats
 * that are available at any time, yet racy and so may be off a little.
 */
/* all set to 0 is only initialization we need */
DECLARE_NEVERPROT_VAR(static thread_units_t global_racy_units, { 0 });

/* macro to get the type abstracted */
#    define ACCOUNT_FOR_ALLOC_HELPER(type, tu, which, alloc_sz, ask_sz)    \
        do {                                                               \
            (tu)->acct.type[which] += alloc_sz;                            \
            (tu)->acct.num_alloc[which]++;                                 \
            (tu)->acct.cur_usage[which] += alloc_sz;                       \
            if ((tu)->acct.cur_usage[which] > (tu)->acct.max_usage[which]) \
                (tu)->acct.max_usage[which] = (tu)->acct.cur_usage[which]; \
            if (ask_sz > (tu)->acct.max_single[which])                     \
                (tu)->acct.max_single[which] = ask_sz;                     \
        } while (0)

#    define ACCOUNT_FOR_ALLOC(type, tu, which, alloc_sz, ask_sz)                         \
        do {                                                                             \
            STATS_ADD_PEAK(heap_claimed, alloc_sz);                                      \
            ACCOUNT_FOR_ALLOC_HELPER(type, tu, which, alloc_sz, ask_sz);                 \
            ACCOUNT_FOR_ALLOC_HELPER(type, &global_racy_units, which, alloc_sz, ask_sz); \
        } while (0)

#    define ACCOUNT_FOR_FREE(tu, which, size)                \
        do {                                                 \
            STATS_SUB(heap_claimed, (size));                 \
            (tu)->acct.cur_usage[which] -= size;             \
            global_racy_units.acct.cur_usage[which] -= size; \
        } while (0)

#else
#    define ACCOUNT_FOR_ALLOC(type, tu, which, alloc_sz, ask_sz)
#    define ACCOUNT_FOR_FREE(tu, which, size)
#endif

typedef byte *vm_addr_t;

#ifdef X64
/* designates the closed interval within which we must allocate DR heap space */
static byte *heap_allowable_region_start = (byte *)PTR_UINT_0;
static byte *heap_allowable_region_end = (byte *)POINTER_MAX;

/* In standalone mode we do not guarantee 32-bit reachability for anything.
 * This lets apps grow beyond 4G of heap.
 */
#    define HEAP_REACHABILITY_ENABLED() (!standalone_library)

/* Used only to protect read/write access to the must_reach_* static variables
 * used in request_region_be_heap_reachable().
 */
DECLARE_CXTSWPROT_VAR(static mutex_t request_region_be_heap_reachable_lock,
                      INIT_LOCK_FREE(request_region_be_heap_reachable_lock));

/* Initialize so will be overridden on first call; protected by the
 * request_region_be_heap_reachable_lock.
 */
static byte *must_reach_region_start = (byte *)POINTER_MAX;
static byte *must_reach_region_end = (byte *)PTR_UINT_0; /* closed */

static void
reset_heap_reachable_bounds(void)
{
    heap_allowable_region_start = (byte *)PTR_UINT_0;
    heap_allowable_region_end = (byte *)POINTER_MAX;
    must_reach_region_start = (byte *)POINTER_MAX;
    must_reach_region_end = (byte *)PTR_UINT_0; /* closed */
}

/* Request that the supplied region be 32bit offset reachable from the DR heap.  Should
 * be called before vmm_heap_init() so we can place the DR heap to meet these constraints.
 * Can also be called post vmm_heap_init() but at that point acts as an assert that the
 * supplied region is reachable since the heap is already reserved.
 *
 * Must be called at least once up front, for the -heap_in_lower_4GB code here
 * to kick in!
 */
void
request_region_be_heap_reachable(byte *start, size_t size)
{
    if (!HEAP_REACHABILITY_ENABLED())
        return;

    LOG(GLOBAL, LOG_HEAP, 2,
        "Adding must-be-reachable-from-heap region " PFX "-" PFX "\n"
        "Existing must-be-reachable region " PFX "-" PFX "\n"
        "Existing allowed range " PFX "-" PFX "\n",
        start, start + size, must_reach_region_start, must_reach_region_end,
        heap_allowable_region_start, heap_allowable_region_end);
    ASSERT(!POINTER_OVERFLOW_ON_ADD(start, size));
    ASSERT(size > 0);

    d_r_mutex_lock(&request_region_be_heap_reachable_lock);
    if (start < must_reach_region_start) {
        byte *allowable_end_tmp;
        SELF_UNPROTECT_DATASEC(DATASEC_RARELY_PROT);
        must_reach_region_start = start;
        allowable_end_tmp =
            REACHABLE_32BIT_END(must_reach_region_start, must_reach_region_end);
        /* PR 215395 - add in absolute address reachability */
        if (DYNAMO_OPTION(heap_in_lower_4GB) &&
            allowable_end_tmp > (byte *)POINTER_MAX_32BIT) {
            allowable_end_tmp = (byte *)POINTER_MAX_32BIT;
        }
        /* Write assumed to be atomic so we don't have to hold a lock to use
         * heap_allowable_region_end. */
        heap_allowable_region_end = allowable_end_tmp;
        SELF_PROTECT_DATASEC(DATASEC_RARELY_PROT);
    }
    if (start + size - 1 > must_reach_region_end) {
        SELF_UNPROTECT_DATASEC(DATASEC_RARELY_PROT);
        must_reach_region_end = start + size - 1; /* closed */
        /* Write assumed to be atomic so we don't have to hold a lock to use
         * heap_allowable_region_start. */
        heap_allowable_region_start =
            REACHABLE_32BIT_START(must_reach_region_start, must_reach_region_end);
        SELF_PROTECT_DATASEC(DATASEC_RARELY_PROT);
    }
    ASSERT(must_reach_region_start <= must_reach_region_end); /* correctness check */
    /* verify can be addressed absolutely (if required), correctness check */
    ASSERT(!DYNAMO_OPTION(heap_in_lower_4GB) ||
           heap_allowable_region_end <= (byte *)POINTER_MAX_32BIT);
    d_r_mutex_unlock(&request_region_be_heap_reachable_lock);

    LOG(GLOBAL, LOG_HEAP, 1,
        "Added must-be-reachable-from-heap region " PFX "-" PFX "\n"
        "New must-be-reachable region " PFX "-" PFX "\n"
        "New allowed range " PFX "-" PFX "\n",
        start, start + size, must_reach_region_start, must_reach_region_end,
        heap_allowable_region_start, heap_allowable_region_end);

    /* Reachability checks (xref PR 215395, note since we currently can't directly
     * control where DR/client dlls are loaded these could fire if rebased). */
    ASSERT(heap_allowable_region_start <= must_reach_region_start &&
           "x64 reachability contraints not satisfiable");
    ASSERT(must_reach_region_end <= heap_allowable_region_end &&
           "x64 reachability contraints not satisfiable");

    /* Handle release build failure. */
    if (heap_allowable_region_start > must_reach_region_start ||
        must_reach_region_end > heap_allowable_region_end) {
        /* FIXME - in a released product we may want to detach or something else less
         * drastic than triggering a FATAL_USAGE_ERROR. */
        FATAL_USAGE_ERROR(HEAP_CONTRAINTS_UNSATISFIABLE, 2, get_application_name(),
                          get_application_pid());
    }
}

void
vmcode_get_reachable_region(byte **region_start OUT, byte **region_end OUT)
{
    /* We track sub-page for more accuracy on additional constraints, and
     * align when asked about it.
     */
    if (region_start != NULL)
        *region_start = (byte *)ALIGN_FORWARD(heap_allowable_region_start, PAGE_SIZE);
    if (region_end != NULL)
        *region_end = (byte *)ALIGN_BACKWARD(heap_allowable_region_end, PAGE_SIZE);
}
#endif

/* forward declarations of static functions */
static void
threadunits_init(dcontext_t *dcontext, thread_units_t *tu, size_t size, bool reachable);
/* dcontext only used for debugging */
static void
threadunits_exit(thread_units_t *tu, dcontext_t *dcontext);
static void *
common_heap_alloc(thread_units_t *tu, size_t size HEAPACCT(which_heap_t which));
static bool
common_heap_free(thread_units_t *tu, void *p, size_t size HEAPACCT(which_heap_t which));
static void
release_real_memory(void *p, size_t size, bool remove_vm, which_vmm_t which);
static void
release_guarded_real_memory(vm_addr_t p, size_t size, bool remove_vm, bool guarded,
                            which_vmm_t which);

typedef enum {
    /* I - Init, Interop - first allocation failed
     *    check for incompatible kernel drivers
     */
    OOM_INIT = 0x1,
    /* R - Reserve - out of virtual reservation *
     *    increase -vm_size to reserve more memory
     */
    OOM_RESERVE = 0x2,
    /* C - Commit - systemwide page file limit, or current process job limit hit
     * Increase pagefile size, check for memory leak in any application.
     *
     * FIXME: possible automatic actions
     *    if systemwide failure we may want to wait if transient
     *    FIXME: if in a job latter we want to detect and just die
     *    (though after freeing as much memory as we can)
     */
    OOM_COMMIT = 0x4,
    /* E - Extending Commit - same reasons as Commit
     *    as a possible workaround increasing -heap_commit_increment
     *    may make expose us to commit-ing less frequently,
     *    On the other hand committing smaller chunks has a higher
     *    chance of getting through when there is very little memory.
     *
     *    FIXME: not much more informative than OOM_COMMIT
     */
    OOM_EXTEND = 0x8,
} oom_source_t;

static void
report_low_on_memory(which_vmm_t which, oom_source_t source,
                     heap_error_code_t os_error_code);

#define MAX_VMCODE_SIZE (2ULL * 1024 * 1024 * 1024)
#define MAX_VMHEAP_SIZE (IF_X64_ELSE(128ULL, (4ULL - 1)) * 1024 * 1024 * 1024)

/* We should normally have only one large unit, so this is in fact
 * the maximum we should count on in one process
 */

/* minimum will be used only if an invalid option is set */
#define MIN_VMM_HEAP_UNIT_SIZE DYNAMO_OPTION(vmm_block_size)

typedef struct {
    vm_addr_t start_addr;  /* base virtual address */
    vm_addr_t end_addr;    /* noninclusive virtual memory range [start,end) */
    vm_addr_t alloc_start; /* base allocation virtual address */
    size_t alloc_size;     /* allocation size */
    /* for 64-bit do we want to shift to size_t to allow a larger region?
     * if so must update the bitmap_t routines
     */
    uint num_blocks; /* total number of blocks in virtual allocation */

    mutex_t lock; /* write access to the rest of the fields is protected */
    /* We make an assumption about the bitmap_t implementation being
       static therefore we don't grab locks on read accesses.  Anyways,
       currently the bitmap_t is used with no write intent only for ASSERTs. */
    uint num_free_blocks; /* currently free blocks */
    const char *name;
    /* We dynamically allocate the bitmap to allow for different sizes for
     * vmcode and vmheap and to allow for large vmheap sizes.
     * We place it at start_addr, or the writable equivalent for vmcode.
     */
    bitmap_element_t *blocks;
} vm_heap_t;

/* We keep our heap management structs on the heap for selfprot (case 8074).
 * Note that we do have static structs for bootstrapping and we later move
 * the data here.
 */
typedef struct _heap_management_t {
    /* We split our 32-bit-displacement-reachable memory, which is mostly our
     * code cache and thus is called "vmcode", from our heap which can go anywhere,
     * "vmheap".
     * For each, we reserve a single vm_heap_t for guaranteed allocation.
     * We fall back to the OS when run out of reservation space.
     * If REACHABLE_HEAP() we do not use vmheap and put everything in
     * vmcode.
     */
    vm_heap_t vmheap;
    /* We only need a single 32-bit-displacement-reachable region since it cannot
     * be larger than 2G anyway.
     * XXX i#1132: for 64-bit, we make it 2G by default so we never have to
     * fall back to the OS!  We'll have to make room to load client libs inside
     * the VMM-managed space is all.
     * For 32-bit it will have to remain smaller and handle falling back to the OS.
     */
    vm_heap_t vmcode;
    /* A writable mirror of read-only vmcode for -satisfy_w_xor_x. */
    file_t dual_map_file;
    vm_addr_t vmcode_writable_base;
    vm_addr_t vmcode_writable_alloc;
    heap_t heap;
    /* thread-shared heaps: */
    thread_units_t global_units;
    /* Separate non-persistent heap.  See thread_heap_t.nonpersisent_heap comment. */
    thread_units_t global_nonpersistent_units;
    bool global_heap_writable;
    thread_units_t global_unprotected_units;
    thread_units_t global_reachable_units; /* Used if !REACHABLE_HEAP() */
} heap_management_t;

/* For bootstrapping until we can allocate our real heapmgt (case 8074).
 * temp_heapmgt.lock is initialized in vmm_heap_unit_init().
 */
static heap_management_t temp_heapmgt;
static heap_management_t *heapmgt = &temp_heapmgt; /* initial value until alloced */

static bool vmm_heap_exited = false; /* FIXME: used only to thwart stack_free from trying,
                                        should change the interface for the last stack
                                     */

#define MEMORY_FILE_NAME "dynamorio_dual_map"

static vm_addr_t
vmm_heap_reserve_blocks(vm_heap_t *vmh, size_t size_in, byte *base, which_vmm_t which);

static bool
vmm_heap_commit(vm_addr_t p, size_t size, uint prot, heap_error_code_t *error_code,
                which_vmm_t which);

static inline uint
vmm_addr_to_block(vm_heap_t *vmh, vm_addr_t p)
{
    ASSERT(
        CHECK_TRUNCATE_TYPE_uint((p - vmh->start_addr) / DYNAMO_OPTION(vmm_block_size)));
    return (uint)((p - vmh->start_addr) / DYNAMO_OPTION(vmm_block_size));
}

static inline vm_addr_t
vmm_block_to_addr(vm_heap_t *vmh, uint block)
{
    ASSERT(block >= 0 && block < vmh->num_blocks);
    return (vm_addr_t)(vmh->start_addr + block * DYNAMO_OPTION(vmm_block_size));
}

static bool
vmm_in_same_block(vm_heap_t *vmh, vm_addr_t p1, vm_addr_t p2)
{
    return vmm_addr_to_block(vmh, p1) == vmm_addr_to_block(vmh, p2);
}

#if defined(DEBUG) && defined(INTERNAL)
static void
vmm_dump_map(vm_heap_t *vmh)
{
    uint i;
    bitmap_element_t *b = vmh->blocks;
    uint bitmap_size = vmh->num_blocks;
    uint last_i = 0;
    bool is_used = bitmap_test(b, 0) == 0;

    LOG(GLOBAL, LOG_HEAP, 3, "vmm_dump_map(" PFX ")\n", vmh);
    /* We used to do raw dumps but with the shift to 4K blocks, this is just way
     * too big.  We disable but leave the capability to enable one-off use.
     */
    DOLOG(20, LOG_HEAP, {
        dump_buffer_as_bytes(GLOBAL, b,
                             BITMAP_INDEX(bitmap_size) * sizeof(bitmap_element_t),
                             DUMP_RAW | DUMP_ADDRESS);
    });

    LOG(GLOBAL, LOG_HEAP, 1, "\nvmm_dump_map(" PFX ") virtual regions\n", vmh);
#    define VMM_DUMP_MAP_LOG(i, last_i)                                        \
        LOG(GLOBAL, LOG_HEAP, 1, PFX "-" PFX " size=%d %s\n",                  \
            vmm_block_to_addr(vmh, last_i),                                    \
            vmm_block_to_addr(vmh, i - 1) + DYNAMO_OPTION(vmm_block_size) - 1, \
            (i - last_i) * DYNAMO_OPTION(vmm_block_size),                      \
            is_used ? "reserved" : "free");

    for (i = 0; i < bitmap_size; i++) {
        /* start counting at free/used boundaries */
        if (is_used != (bitmap_test(b, i) == 0)) {
            VMM_DUMP_MAP_LOG(i, last_i);
            is_used = (bitmap_test(b, i) == 0);
            last_i = i;
        }
    }
    VMM_DUMP_MAP_LOG(bitmap_size, last_i);
}
#endif /* DEBUG */

static inline void
print_vmh_data(vm_heap_t *vmh, file_t outf)
{
    d_r_mutex_lock(&vmh->lock);
    print_file(outf, "VM heap: addr range " PFX "--" PFX ", # free blocks %d\n",
               vmh->start_addr, vmh->end_addr, vmh->num_free_blocks);
    d_r_mutex_unlock(&vmh->lock);
}

void
print_vmm_heap_data(file_t outf)
{
    if (heapmgt->vmheap.start_addr != NULL)
        print_vmh_data(&heapmgt->vmheap, outf);
    if (heapmgt->vmcode.start_addr != NULL)
        print_vmh_data(&heapmgt->vmcode, outf);
}

static inline void
vmm_heap_initialize_unusable(vm_heap_t *vmh)
{
    vmh->start_addr = vmh->end_addr = NULL;
    vmh->num_free_blocks = vmh->num_blocks = 0;
}

static void
report_w_xor_x_fatal_error_and_exit(void)
{
    REPORT_FATAL_ERROR_AND_EXIT(FAILED_TO_SATISFY_W_XOR_X, 2, get_application_name(),
                                get_application_pid());
    ASSERT_NOT_REACHED();
}

static void
vmm_place_vmcode(vm_heap_t *vmh, size_t size, heap_error_code_t *error_code)
{
    ptr_uint_t preferred = 0;
#ifdef X64
    /* -heap_in_lower_4GB takes top priority and has already set heap_allowable_region_*.
     * Next comes -vm_base_near_app.  It will fail for -vm_size=2G, which we document.
     */
    if (DYNAMO_OPTION(vm_base_near_app)) {
        /* Required for STATIC_LIBRARY: must be near app b/c clients are there.
         * Non-static: still a good idea for fewer rip-rel manglings.
         * Asking for app base means we'll prefer before the app, which
         * has less of an impact on its heap.
         */
        app_pc app_base = get_application_base();
        app_pc app_end = get_application_end();
        /* To avoid ignoring -vm_base and -vm_max_offset we fall through to that
         * code if the app base is near -vm_base.
         */
        if (!REL32_REACHABLE(app_base, (app_pc)DYNAMO_OPTION(vm_base)) ||
            !REL32_REACHABLE(app_base,
                             (app_pc)DYNAMO_OPTION(vm_base) +
                                 DYNAMO_OPTION(vm_max_offset)) ||
            ((app_pc)DYNAMO_OPTION(vm_base) < app_end &&
             (app_pc)DYNAMO_OPTION(vm_base) + DYNAMO_OPTION(vm_max_offset) > app_base)) {
            byte *reach_base = MAX(REACHABLE_32BIT_START(app_base, app_end),
                                   heap_allowable_region_start);
            byte *reach_end =
                MIN(REACHABLE_32BIT_END(app_base, app_end), heap_allowable_region_end);
            if (reach_base < reach_end) {
                size_t add_for_align = DYNAMO_OPTION(vmm_block_size);
                if (DYNAMO_OPTION(vmm_block_size) == PAGE_SIZE) {
                    /* No need for extra space for alignment. */
                    add_for_align = 0;
                }
                vmh->alloc_start = os_heap_reserve_in_region(
                    (void *)ALIGN_FORWARD(reach_base, PAGE_SIZE),
                    (void *)ALIGN_BACKWARD(reach_end, PAGE_SIZE), size + add_for_align,
                    error_code, true /*+x*/);
                if (vmh->alloc_start != NULL) {
                    vmh->start_addr = (heap_pc)ALIGN_FORWARD(
                        vmh->alloc_start, DYNAMO_OPTION(vmm_block_size));
                    if (add_for_align == 0) {
                        ASSERT(ALIGNED(vmh->alloc_start, DYNAMO_OPTION(vmm_block_size)));
                        ASSERT(vmh->start_addr == vmh->alloc_start);
                    }
                    request_region_be_heap_reachable(app_base, app_end - app_base);
                    LOG(GLOBAL, LOG_HEAP, 1, "vmm_heap_unit_init: placed %s near app\n",
                        vmh->name);
                }
            }
        }
    }
#endif /* X64 */

    /* Next we try the -vm_base value plus a random offset. */
    if (vmh->start_addr == NULL) {
        /* Out of 32 bits = 12 bits are page offset, windows wastes 4 more
         * since its allocation base is 64KB, and if we want to stay
         * safely in say 0x20000000-0x2fffffff we're left with only 12
         * bits of randomness - which may be too little.  On the other
         * hand changing any of the lower 16 bits will make our bugs
         * non-deterministic. */
        /* Make sure we don't waste the lower bits from our random number */
        preferred = (DYNAMO_OPTION(vm_base) +
                     get_random_offset(DYNAMO_OPTION(vm_max_offset) /
                                       DYNAMO_OPTION(vmm_block_size)) *
                         DYNAMO_OPTION(vmm_block_size));
        preferred = ALIGN_FORWARD(preferred, OS_ALLOC_GRANULARITY);
        /* overflow check: w/ vm_base shouldn't happen so debug-only check */
        ASSERT(!POINTER_OVERFLOW_ON_ADD(preferred, size));
        /* let's assume a single chunk is sufficient to reserve */
#ifdef X64
        if ((byte *)preferred < heap_allowable_region_start ||
            (byte *)preferred + size > heap_allowable_region_end) {
            *error_code = HEAP_ERROR_NOT_AT_PREFERRED;
            LOG(GLOBAL, LOG_HEAP, 1,
                "vmm_heap_unit_init preferred=" PFX " too far from " PFX "-" PFX "\n",
                preferred, heap_allowable_region_start, heap_allowable_region_end);
        } else {
#endif
            vmh->alloc_start =
                os_heap_reserve((void *)preferred, size, error_code, true /*+x*/);
            vmh->start_addr = vmh->alloc_start;
            LOG(GLOBAL, LOG_HEAP, 1,
                "vmm_heap_unit_init preferred=" PFX " got start_addr=" PFX "\n",
                preferred, vmh->start_addr);
#ifdef X64
        }
#endif
    }
    while (vmh->start_addr == NULL && DYNAMO_OPTION(vm_allow_not_at_base)) {
        /* Since we prioritize low-4GB or near-app over -vm_base, we do not
         * syslog or assert here
         */
        /* need extra size to ensure alignment */
        vmh->alloc_size = size + DYNAMO_OPTION(vmm_block_size);
#ifdef X64
        /* PR 215395, make sure allocation satisfies heap reachability contraints */
        vmh->alloc_start = os_heap_reserve_in_region(
            (void *)ALIGN_FORWARD(heap_allowable_region_start, PAGE_SIZE),
            (void *)ALIGN_BACKWARD(heap_allowable_region_end, PAGE_SIZE),
            size + DYNAMO_OPTION(vmm_block_size), error_code, true /*+x*/);
#else
        vmh->alloc_start = (heap_pc)os_heap_reserve(
            NULL, size + DYNAMO_OPTION(vmm_block_size), error_code, true /*+x*/);
#endif
        vmh->start_addr =
            (heap_pc)ALIGN_FORWARD(vmh->alloc_start, DYNAMO_OPTION(vmm_block_size));
        LOG(GLOBAL, LOG_HEAP, 1,
            "vmm_heap_unit_init unable to allocate at preferred=" PFX
            " letting OS place sz=%dM addr=" PFX "\n",
            preferred, size / (1024 * 1024), vmh->start_addr);
        if (vmh->alloc_start == NULL && DYNAMO_OPTION(vm_allow_smaller)) {
            /* Just a little smaller might fit */
            size_t sub = (size_t)ALIGN_FORWARD(size / 16, 1024 * 1024);
            SYSLOG_INTERNAL_WARNING_ONCE("Full size vmm heap allocation failed");
            if (size > sub)
                size -= sub;
            else
                break;
        } else
            break;
    }
#ifdef X64
    if (DYNAMO_OPTION(satisfy_w_xor_x)) {
        /* Rather than replacing the 3 os_heap_reserve* calls above with os_map_file
         * whose MAP_FILE_REACHABLE relies on VMM (us!) being initialized, which is
         * tricky, we simply do the standard reserve above and then map our file
         * on top.  TODO i#3566: We need a different strategy on Windows.
         */
        /* Ensure os_map_file ignores vmcode: */
        ASSERT(!is_vmm_reserved_address(vmh->start_addr, size, NULL, NULL));
        size_t map_size = vmh->alloc_size;
        byte *map_base =
            os_map_file(heapmgt->dual_map_file, &map_size, 0, vmh->alloc_start,
                        MEMPROT_NONE, MAP_FILE_VMM_COMMIT | MAP_FILE_FIXED);
        if (map_base != vmh->alloc_start || map_size != vmh->alloc_size) {
            report_w_xor_x_fatal_error_and_exit();
            ASSERT_NOT_REACHED();
        }
    }
    /* ensure future out-of-block heap allocations are reachable from this allocation */
    if (vmh->start_addr != NULL) {
        ASSERT(vmh->start_addr >= heap_allowable_region_start &&
               !POINTER_OVERFLOW_ON_ADD(vmh->start_addr, size) &&
               vmh->start_addr + size <= heap_allowable_region_end);
        request_region_be_heap_reachable(vmh->start_addr, size);
    }
#endif
    ASSERT(ALIGNED(vmh->start_addr, DYNAMO_OPTION(vmm_block_size)));
}

/* Does not return. */
static void
vmm_heap_unit_init_failed(vm_heap_t *vmh, heap_error_code_t error_code, const char *name)
{
    LOG(GLOBAL, LOG_HEAP, 1, "vmm_heap_unit_init %s: failed to allocate memory!\n", name);
    vmm_heap_initialize_unusable(vmh);
    /* We couldn't even reserve initial virtual memory - we're out of luck. */
    report_low_on_memory(VMM_HEAP, OOM_INIT, error_code);
    ASSERT_NOT_REACHED();
}

static void
vmm_heap_unit_init(vm_heap_t *vmh, size_t size, bool is_vmcode, const char *name)
{
    heap_error_code_t error_code = 0;
    ASSIGN_INIT_LOCK_FREE(vmh->lock, vmh_lock);
    /* We need to get the lock into the process list before we copy out of
     * temp_heapmgt, else it will point to freed memory when we go back to temp_heapmgt
     * for lock cleanup code.
     */
    d_r_mutex_lock(&vmh->lock);
    d_r_mutex_unlock(&vmh->lock);
    size = ALIGN_FORWARD(size, DYNAMO_OPTION(vmm_block_size));
    vmh->alloc_size = size;
    vmh->start_addr = NULL;
    vmh->name = name;

    if (size == 0) {
        vmm_heap_initialize_unusable(vmh);
        return;
    }

    if (is_vmcode) {
        /* This is our must-be-reachable alloc whose placement matters and is
         * controlled by runtime options.
         */
        if (DYNAMO_OPTION(satisfy_w_xor_x)) {
            heapmgt->dual_map_file = os_create_memory_file(MEMORY_FILE_NAME, size);
            if (heapmgt->dual_map_file == INVALID_FILE) {
                report_w_xor_x_fatal_error_and_exit();
                ASSERT_NOT_REACHED();
            }
        }
        vmm_place_vmcode(vmh, size, &error_code);
        if (DYNAMO_OPTION(satisfy_w_xor_x)) {
            size_t map_size = vmh->alloc_size;
            heapmgt->vmcode_writable_alloc =
                os_map_file(heapmgt->dual_map_file, &map_size, 0, NULL, MEMPROT_NONE, 0);
            ASSERT(map_size == vmh->alloc_size);
            if (heapmgt->vmcode_writable_alloc == 0) {
                LOG(GLOBAL, LOG_HEAP, 1,
                    "vmm_heap_unit_init %s: failed to allocate writable vmcode!\n");
                vmm_heap_initialize_unusable(vmh);
                report_low_on_memory(VMM_CACHE | VMM_REACHABLE, OOM_INIT, error_code);
                ASSERT_NOT_REACHED();
            }
            heapmgt->vmcode_writable_base = (heap_pc)ALIGN_FORWARD(
                heapmgt->vmcode_writable_alloc, DYNAMO_OPTION(vmm_block_size));
            LOG(GLOBAL, LOG_HEAP, 1,
                "vmm_heap_unit_init vmcode+w reservation: [" PFX "," PFX ")\n",
                heapmgt->vmcode_writable_base, heapmgt->vmcode_writable_base + size);
        }
    } else {
        /* These days every OS provides ASLR, so we do not bother to do our own
         * for this second reservation and rely on the OS.
         */
        vmh->alloc_size = size + DYNAMO_OPTION(vmm_block_size);
        vmh->alloc_start = (heap_pc)os_heap_reserve(
            NULL, size + DYNAMO_OPTION(vmm_block_size), &error_code, false /*-x*/);
        vmh->start_addr =
            (heap_pc)ALIGN_FORWARD(vmh->alloc_start, DYNAMO_OPTION(vmm_block_size));
    }

    if (vmh->start_addr == 0) {
        vmm_heap_unit_init_failed(vmh, error_code, name);
        ASSERT_NOT_REACHED();
    }
    vmh->end_addr = vmh->start_addr + size;
    ASSERT_TRUNCATE(vmh->num_blocks, uint, size / DYNAMO_OPTION(vmm_block_size));
    vmh->num_blocks = (uint)(size / DYNAMO_OPTION(vmm_block_size));
    size_t blocks_sz_bytes = BITMAP_INDEX(vmh->num_blocks) * sizeof(bitmap_element_t);
    blocks_sz_bytes = ALIGN_FORWARD(blocks_sz_bytes, DYNAMO_OPTION(vmm_block_size));
    /* We place the bitmap at the start of the (writable) vmm region. */
    vmh->blocks = (bitmap_element_t *)vmh->start_addr;
    if (is_vmcode)
        vmh->blocks = (bitmap_element_t *)vmcode_get_writable_addr((byte *)vmh->blocks);
    vmh->num_free_blocks = vmh->num_blocks;
    LOG(GLOBAL, LOG_HEAP, 1,
        "vmm_heap_unit_init %s reservation: [" PFX "," PFX ") total=%d free=%d\n", name,
        vmh->start_addr, vmh->end_addr, vmh->num_blocks, vmh->num_free_blocks);

    /* Make sure the vmm area is properly aligned on block boundaries.
     * The size was aligned above.
     */
    ASSERT(ALIGNED(vmh->blocks, DYNAMO_OPTION(vmm_block_size)));

    which_vmm_t which = VMM_HEAP | (is_vmcode ? VMM_REACHABLE : 0);
    /* We have to commit first which our code does support. */
    vmm_heap_commit((vm_addr_t)vmh->blocks, blocks_sz_bytes, MEMPROT_READ | MEMPROT_WRITE,
                    &error_code, which);
    if (error_code != 0) {
        vmm_heap_unit_init_failed(vmh, error_code, name);
        ASSERT_NOT_REACHED();
    }
    bitmap_initialize_free(vmh->blocks, vmh->num_blocks);
    vmm_heap_reserve_blocks(vmh, blocks_sz_bytes, vmh->start_addr, which);
    DOLOG(1, LOG_HEAP, { vmm_dump_map(vmh); });
    ASSERT(bitmap_check_consistency(vmh->blocks, vmh->num_blocks, vmh->num_free_blocks));
}

static void
vmm_heap_unit_exit(vm_heap_t *vmh)
{
    LOG(GLOBAL, LOG_HEAP, 1, "vmm_heap_unit_exit %s [" PFX "," PFX ") total=%d free=%d\n",
        vmh->name, vmh->start_addr, vmh->end_addr, vmh->num_blocks, vmh->num_free_blocks);
    /* we assume single thread in DR at this point */
    DELETE_LOCK(vmh->lock);

    if (vmh->start_addr == NULL)
        return;

    DOLOG(1, LOG_HEAP, { vmm_dump_map(vmh); });
    ASSERT(bitmap_check_consistency(vmh->blocks, vmh->num_blocks, vmh->num_free_blocks));
    ASSERT(vmh->num_blocks * DYNAMO_OPTION(vmm_block_size) ==
           (ptr_uint_t)(vmh->end_addr - vmh->start_addr));

    /* In case there are no tombstones we can just free the unit and
     * that is what we'll do, otherwise it will stay up forever.
     */
    bool free_heap = vmh->num_free_blocks == vmh->num_blocks;
#ifdef UNIX
    /* On unix there's no fear of leftover tombstones, and as long as we're
     * doing a detach we can be sure our stack is not actually in the heap.
     */
    if (doing_detach) {
        DODEBUG({
            byte *sp;
            GET_STACK_PTR(sp);
            ASSERT(!(sp >= vmh->start_addr && sp < vmh->end_addr));
        });
        free_heap = true;
    }
#endif
    if (free_heap) {
        heap_error_code_t error_code;
        os_heap_free(vmh->alloc_start, vmh->alloc_size, &error_code);
        ASSERT(error_code == HEAP_ERROR_SUCCESS);
        if (DYNAMO_OPTION(satisfy_w_xor_x) && vmh == &heapmgt->vmcode) {
            os_heap_free(heapmgt->vmcode_writable_alloc, vmh->alloc_size, &error_code);
            ASSERT(error_code == HEAP_ERROR_SUCCESS);
            os_delete_memory_file(MEMORY_FILE_NAME, heapmgt->dual_map_file);
            heapmgt->dual_map_file = INVALID_FILE;
        }
    } else {
        /* FIXME: doing nothing for now - we only care about this in
         * detach scenarios where we should try to clean up from the
         * virtual address space
         */
    }
    vmm_heap_initialize_unusable(vmh);
}

/* Returns whether within the region we reserved from the OS for doling
 * out internally via our vm_heap_t; asserts that the address was also
 * logically reserved within the vm_heap_t.
 */
static bool
vmm_is_reserved_unit(vm_heap_t *vmh, vm_addr_t p, size_t size)
{
    size = ALIGN_FORWARD(size, DYNAMO_OPTION(vmm_block_size));
    if (p < vmh->start_addr || vmh->end_addr < p /*overflow*/ ||
        vmh->end_addr < (p + size))
        return false;
    ASSERT(CHECK_TRUNCATE_TYPE_uint(size / DYNAMO_OPTION(vmm_block_size)));
    ASSERT(bitmap_are_reserved_blocks(vmh->blocks, vmh->num_blocks,
                                      vmm_addr_to_block(vmh, p),
                                      (uint)(size / DYNAMO_OPTION(vmm_block_size))));
    return true;
}

static inline bool
is_vmh_reserved_address(vm_heap_t *vmh, byte *pc, size_t size, OUT byte **region_start,
                        OUT byte **region_end)
{
    /* Case 10293: we don't call vmm_is_reserved_unit to avoid its
     * assert, which we want to maintain for callers only dealing with
     * DR-allocated addresses, while this routine is called w/ random
     * addresses
     */
    if (pc >= vmh->start_addr && !POINTER_OVERFLOW_ON_ADD(pc, size) &&
        (pc + size) <= vmh->end_addr) {
        if (region_start != NULL)
            *region_start = vmh->start_addr;
        if (region_end != NULL)
            *region_end = vmh->end_addr;
        return true;
    }
    return false;
}

/* Returns whether entirely within a region we reserve from the OS for doling
 * out internally via our vm_heap_t.  Optionally returns the bounds of the region.
 * Does not consider memory we allocate once we run out of our original reservations.
 */
bool
is_vmm_reserved_address(byte *pc, size_t size, OUT byte **region_start,
                        OUT byte **region_end)
{
    ASSERT(heapmgt != NULL);
    if (heapmgt->vmheap.start_addr != NULL &&
        is_vmh_reserved_address(&heapmgt->vmheap, pc, size, region_start, region_end))
        return true;
    if (heapmgt->vmcode.start_addr != NULL &&
        is_vmh_reserved_address(&heapmgt->vmcode, pc, size, region_start, region_end))
        return true;
    if (heapmgt->vmcode_writable_base != NULL &&
        is_vmh_reserved_address(&heapmgt->vmcode, vmcode_get_executable_addr(pc), size,
                                region_start, region_end)) {
        if (region_start != NULL)
            *region_start = vmcode_get_writable_addr(*region_start);
        if (region_end != NULL)
            *region_end = vmcode_get_writable_addr(*region_end);
        return true;
    }
    return false;
}

byte *
vmcode_get_start(void)
{
    if (heapmgt->vmcode.start_addr != NULL)
        return heapmgt->vmcode.start_addr;
    if (heapmgt->vmheap.start_addr != NULL)
        return heapmgt->vmheap.start_addr;
    return NULL;
}

byte *
vmcode_get_end(void)
{
    if (heapmgt->vmcode.start_addr != NULL)
        return heapmgt->vmcode.end_addr;
    if (heapmgt->vmheap.start_addr != NULL)
        return heapmgt->vmheap.end_addr;
    return NULL;
}

static vm_heap_t *
vmheap_for_which(which_vmm_t which)
{
    if (TEST(VMM_REACHABLE, which) || REACHABLE_HEAP())
        return &heapmgt->vmcode;
    else
        return &heapmgt->vmheap;
}

byte *
vmcode_get_writable_addr(byte *exec_addr)
{
    /* XXX i#5383: Audit these calls and ensure they cover all scenarios, are placed
     * at the most efficient level, and are always properly paired.
     */
    PTHREAD_JIT_WRITE();
    if (!DYNAMO_OPTION(satisfy_w_xor_x))
        return exec_addr;
    /* If we want this to be an assert instead to catch superfluous calls, we'll need
     * to change things like set_selfmod_sandbox_offsets()'s call to
     * encode_with_patch_list() into a stack buffer.
     */
    if (exec_addr < heapmgt->vmcode.start_addr || exec_addr >= heapmgt->vmcode.end_addr)
        return exec_addr;
    return (exec_addr - heapmgt->vmcode.start_addr) + heapmgt->vmcode_writable_base;
}

byte *
vmcode_get_executable_addr(byte *write_addr)
{
    if (!DYNAMO_OPTION(satisfy_w_xor_x))
        return write_addr;
    if (write_addr < heapmgt->vmcode_writable_base ||
        write_addr >= heapmgt->vmcode_writable_base +
                (heapmgt->vmcode.end_addr - heapmgt->vmcode.start_addr))
        return write_addr;
    return (write_addr - heapmgt->vmcode_writable_base) + heapmgt->vmcode.start_addr;
}

#ifdef DEBUG_MEMORY
static inline byte *
vmm_get_writable_addr(byte *exec_addr, which_vmm_t which)
{
    vm_heap_t *vmh = vmheap_for_which(which);
    if (vmh == &heapmgt->vmcode)
        return vmcode_get_writable_addr(exec_addr);
    return exec_addr;
}
#endif

/* The caller must first ensure this is a vmcode address.  Returns p_writable. */
static inline vm_addr_t
vmm_normalize_addr(vm_heap_t *vmh, INOUT vm_addr_t *p_exec)
{
    vm_addr_t p = *p_exec;
    if (p < vmh->start_addr || p >= vmh->end_addr) {
        /* This is a writable addr. */
        p = (p - heapmgt->vmcode_writable_base) + vmh->start_addr;
        *p_exec = p;
    }
    return (p - vmh->start_addr) + heapmgt->vmcode_writable_base;
}

#ifdef WINDOWS
static byte *
vmheap_get_start(void)
{
    if (heapmgt->vmheap.start_addr != NULL)
        return heapmgt->vmheap.start_addr;
    if (heapmgt->vmcode.start_addr != NULL)
        return heapmgt->vmcode.start_addr;
    return NULL;
}
#endif

static inline bool
has_guard_pages(which_vmm_t which)
{
    if (!DYNAMO_OPTION(guard_pages))
        return false;
    if (TEST(VMM_PER_THREAD, which) && !DYNAMO_OPTION(per_thread_guard_pages))
        return false;
    return true;
}

void
iterate_vmm_regions(void (*cb)(byte *region_start, byte *region_end, void *user_data),
                    void *user_data)
{
    if (heapmgt->vmcode.start_addr != NULL)
        (*cb)(heapmgt->vmcode.start_addr, heapmgt->vmcode.end_addr, user_data);
    if (heapmgt->vmheap.start_addr != NULL)
        (*cb)(heapmgt->vmheap.start_addr, heapmgt->vmheap.end_addr, user_data);
    if (heapmgt->vmcode_writable_base != NULL) {
        (*cb)(heapmgt->vmcode_writable_base,
              heapmgt->vmcode_writable_base +
                  (heapmgt->vmcode.end_addr - heapmgt->vmcode.start_addr),
              user_data);
    }
}

byte *
vmcode_unreachable_pc(void)
{
#ifdef X86_64
    /* This is used to indicate something that is unreachable from *everything*
     * for DR_CLEANCALL_INDIRECT, so ideally we want to not just provide an
     * address that vmcode can't reach.
     * We use a non-canonical address for x86_64.
     */
    return (byte *)0x8000000100000000ULL;
#else
    /* This is not really used for aarch* so we just go with vmcode reachability. */
    ptr_uint_t start = (ptr_uint_t)vmcode_get_start();
    ptr_uint_t end = (ptr_uint_t)vmcode_get_end();
    if (start > INT_MAX)
        return NULL;
    else {
        /* We do not use -1 to avoid wraparound from thinking it's reachable. */
        return (byte *)end + INT_MAX + PAGE_SIZE;
    }
#endif
}

bool
rel32_reachable_from_vmcode(byte *tgt)
{
#ifdef X64
    /* To handle beyond-vmm-reservation allocs, we must compare to the allowable
     * heap range and not just the vmcode range (i#1479).
     */
    ptr_int_t new_offs = (tgt > heap_allowable_region_start)
        ? (tgt - heap_allowable_region_start)
        : (heap_allowable_region_end - tgt);
    ASSERT(vmcode_get_start() >= heap_allowable_region_start ||
           !DYNAMO_OPTION(vm_reserve));
    ASSERT(vmcode_get_end() <= heap_allowable_region_end + 1 /*closed*/ ||
           !DYNAMO_OPTION(vm_reserve));
    return REL32_REACHABLE_OFFS(new_offs);
#else
    return true;
#endif
}

bool
rel32_reachable_from_current_vmcode(byte *tgt)
{
#ifdef X64
    ptr_int_t new_offs = (tgt > must_reach_region_start) ? (tgt - must_reach_region_start)
                                                         : (must_reach_region_end - tgt);
    return REL32_REACHABLE_OFFS(new_offs);
#else
    return true;
#endif
}

static inline void
vmm_update_block_stats(which_vmm_t which, uint num_blocks, bool add)
{
    /* We do not split the stats for cache (always reachable) nor stack (never reachable).
     * We confirm our assumptions here.
     */
    ASSERT(!TESTALL(VMM_REACHABLE | VMM_STACK, which) &&
           (TEST(VMM_REACHABLE, which) || !TEST(VMM_CACHE, which)));
    /* XXX: find some way to make a stats array */
    if (add) {
        if (TEST(VMM_HEAP, which)) {
            if (TEST(VMM_REACHABLE, which))
                RSTATS_ADD_PEAK(vmm_blocks_reach_heap, num_blocks);
            else
                RSTATS_ADD_PEAK(vmm_blocks_unreach_heap, num_blocks);
        } else if (TEST(VMM_CACHE, which))
            RSTATS_ADD_PEAK(vmm_blocks_reach_cache, num_blocks);
        else if (TEST(VMM_STACK, which))
            RSTATS_ADD_PEAK(vmm_blocks_unreach_stack, num_blocks);
        else if (TEST(VMM_SPECIAL_HEAP, which)) {
            if (TEST(VMM_REACHABLE, which))
                RSTATS_ADD_PEAK(vmm_blocks_reach_special_heap, num_blocks);
            else
                RSTATS_ADD_PEAK(vmm_blocks_unreach_special_heap, num_blocks);
        } else if (TEST(VMM_SPECIAL_MMAP, which)) {
            if (TEST(VMM_REACHABLE, which))
                RSTATS_ADD_PEAK(vmm_blocks_reach_special_mmap, num_blocks);
            else
                RSTATS_ADD_PEAK(vmm_blocks_unreach_special_mmap, num_blocks);
        }
    } else {
        if (TEST(VMM_HEAP, which)) {
            if (TEST(VMM_REACHABLE, which))
                RSTATS_SUB(vmm_blocks_reach_heap, num_blocks);
            else
                RSTATS_SUB(vmm_blocks_unreach_heap, num_blocks);
        } else if (TEST(VMM_CACHE, which))
            RSTATS_SUB(vmm_blocks_reach_cache, num_blocks);
        else if (TEST(VMM_STACK, which))
            RSTATS_SUB(vmm_blocks_unreach_stack, num_blocks);
        else if (TEST(VMM_SPECIAL_HEAP, which)) {
            if (TEST(VMM_REACHABLE, which))
                RSTATS_SUB(vmm_blocks_reach_special_heap, num_blocks);
            else
                RSTATS_SUB(vmm_blocks_unreach_special_heap, num_blocks);
        } else if (TEST(VMM_SPECIAL_MMAP, which)) {
            if (TEST(VMM_REACHABLE, which))
                RSTATS_SUB(vmm_blocks_reach_special_mmap, num_blocks);
            else
                RSTATS_SUB(vmm_blocks_unreach_special_mmap, num_blocks);
        }
    }
}

/* Reservations here are done with DYNAMO_OPTION(vmm_block_size) alignment
 * (e.g. 64KB) but the caller is not forced to request at that
 * alignment.  We explicitly synchronize reservations and decommits
 * within the vm_heap_t.

 * Returns NULL if the VMMHeap is full or too fragmented to satisfy
 * the request.
 */
static vm_addr_t
vmm_heap_reserve_blocks(vm_heap_t *vmh, size_t size_in, byte *base, which_vmm_t which)
{
    vm_addr_t p;
    uint request;
    uint first_block;
    size_t size;
    uint must_start;

    size = ALIGN_FORWARD(size_in, DYNAMO_OPTION(vmm_block_size));
    ASSERT_TRUNCATE(request, uint, size / DYNAMO_OPTION(vmm_block_size));
    request = (uint)(size / DYNAMO_OPTION(vmm_block_size));

    if (base != NULL)
        must_start = vmm_addr_to_block(vmh, base);
    else
        must_start = UINT_MAX;

    LOG(GLOBAL, LOG_HEAP, 2,
        "vmm_heap_reserve_blocks %s: size=%d => %d in blocks=%d free_blocks=%d\n",
        vmh->name, size_in, size, request, vmh->num_free_blocks);

    d_r_mutex_lock(&vmh->lock);
    if (vmh->num_free_blocks < request) {
        d_r_mutex_unlock(&vmh->lock);
        return NULL;
    }
    first_block =
        bitmap_allocate_blocks(vmh->blocks, vmh->num_blocks, request, must_start);
    if (first_block != BITMAP_NOT_FOUND) {
        vmh->num_free_blocks -= request;
    }
    d_r_mutex_unlock(&vmh->lock);

    if (first_block != BITMAP_NOT_FOUND) {
        p = vmm_block_to_addr(vmh, first_block);
        RSTATS_ADD_PEAK(vmm_vsize_used, size);
        STATS_ADD_PEAK(vmm_vsize_blocks_used, request);
        STATS_ADD_PEAK(vmm_vsize_wasted, size - size_in);
        vmm_update_block_stats(which, request, true /*add*/);
        DOSTATS({
            if (request > 1) {
                STATS_INC(vmm_multi_block_allocs);
                STATS_ADD(vmm_multi_blocks, request);
            }
        });
    } else {
        p = NULL;
    }
    LOG(GLOBAL, LOG_HEAP, 2,
        "vmm_heap_reserve_blocks %s: size=%d blocks=%d p=" PFX " index=%u\n", vmh->name,
        size, request, p, first_block);
    DOLOG(5, LOG_HEAP, { vmm_dump_map(vmh); });
    return p;
}

/* We explicitly synchronize reservations and decommits within the vm_heap_t.
 * Update bookkeeping information about the freed region.
 */
static void
vmm_heap_free_blocks(vm_heap_t *vmh, vm_addr_t p, size_t size_in, which_vmm_t which)
{
    uint first_block = vmm_addr_to_block(vmh, p);
    uint request;
    size_t size;

    size = ALIGN_FORWARD(size_in, DYNAMO_OPTION(vmm_block_size));
    ASSERT_TRUNCATE(request, uint, size / DYNAMO_OPTION(vmm_block_size));
    request = (uint)(size / DYNAMO_OPTION(vmm_block_size));

    LOG(GLOBAL, LOG_HEAP, 2, "vmm_heap_free_blocks %s: size=%d blocks=%d p=" PFX "\n",
        vmh->name, size, request, p);

    d_r_mutex_lock(&vmh->lock);
    bitmap_free_blocks(vmh->blocks, vmh->num_blocks, first_block, request);
    vmh->num_free_blocks += request;
    d_r_mutex_unlock(&vmh->lock);

    ASSERT(vmh->num_free_blocks <= vmh->num_blocks);
    RSTATS_SUB(vmm_vsize_used, size);
    STATS_SUB(vmm_vsize_blocks_used, request);
    vmm_update_block_stats(which, request, false /*sub*/);
    STATS_SUB(vmm_vsize_wasted, size - size_in);
}

/* This is the proper interface for the rest of heap.c to the os_heap_* functions */

/* place all the local-scope static vars (from DO_THRESHOLD) into .fspdata to avoid
 * protection changes */
START_DATA_SECTION(FREQ_PROTECTED_SECTION, "w");

static bool
at_reset_at_vmm_limit(vm_heap_t *vmh)
{
    return (DYNAMO_OPTION(reset_at_vmm_percent_free_limit) != 0 &&
            100 * vmh->num_free_blocks <
                DYNAMO_OPTION(reset_at_vmm_percent_free_limit) * vmh->num_blocks) ||
        (DYNAMO_OPTION(reset_at_vmm_free_limit) != 0 &&
         vmh->num_free_blocks * DYNAMO_OPTION(vmm_block_size) <
             DYNAMO_OPTION(reset_at_vmm_free_limit));
}

static void
reached_beyond_vmm(which_vmm_t which)
{
    DODEBUG(ever_beyond_vmm = true;);
    /* Stats can be very useful to diagnose why we hit OOM. */
    if (INTERNAL_OPTION(rstats_to_stderr))
        dump_global_rstats_to_stderr();
    char message[256];
    if (DYNAMO_OPTION(satisfy_w_xor_x) &&
        (TEST(VMM_REACHABLE, which) || REACHABLE_HEAP())) {
        /* We do not bother to try to mirror separate from-OS allocs: the user
         * should set -vm_size 2G instead and take the rip-rel mangling hit
         * (see i#3570).
         */
        snprintf(
            message, BUFFER_SIZE_ELEMENTS(message),
            "Alloc type: 0x%x.  -satisfy_w_xor_x requires VMM memory: try '-vm_size 2G'",
            which);
        NULL_TERMINATE_BUFFER(message);
        REPORT_FATAL_ERROR_AND_EXIT(OUT_OF_VMM_CANNOT_USE_OS, 3, get_application_name(),
                                    get_application_pid(), message);
        ASSERT_NOT_REACHED();
    } else {
        snprintf(message, BUFFER_SIZE_ELEMENTS(message), "Alloc type: 0x%x.", which);
        NULL_TERMINATE_BUFFER(message);
        SYSLOG(SYSLOG_WARNING, OUT_OF_VMM_CANNOT_USE_OS, 3, get_application_name(),
               get_application_pid(), message);
    }
}

void
vmm_heap_handle_pending_low_on_memory_event_trigger()
{
    bool trigger = false;

    acquire_recursive_lock(&low_on_memory_pending_lock);
    if (low_on_memory_pending) {
        bool value = false;
        ATOMIC_1BYTE_WRITE(&low_on_memory_pending, value, false);
        trigger = true;
    }
    release_recursive_lock(&low_on_memory_pending_lock);

    if (trigger)
        instrument_low_on_memory();
}

static void
schedule_low_on_memory_event_trigger()
{
    bool value = true;
    ATOMIC_1BYTE_WRITE(&low_on_memory_pending, value, false);
}

/* Reserve virtual address space without committing swap space for it */
static vm_addr_t
vmm_heap_reserve(size_t size, heap_error_code_t *error_code, bool executable,
                 which_vmm_t which)
{
    vm_addr_t p;
    vm_heap_t *vmh = vmheap_for_which(which);
    /* should only be used on sizable aligned pieces */
    ASSERT(size > 0 && ALIGNED(size, PAGE_SIZE));
    ASSERT(!OWN_MUTEX(&reset_pending_lock));

    if (DYNAMO_OPTION(vm_reserve)) {
        /* FIXME: should we make this an external option? */
        if (INTERNAL_OPTION(vm_use_last) ||
            (DYNAMO_OPTION(switch_to_os_at_vmm_reset_limit) &&
             at_reset_at_vmm_limit(vmh))) {
            DO_ONCE({
                if (DYNAMO_OPTION(reset_at_switch_to_os_at_vmm_limit)) {
                    schedule_reset(RESET_ALL);
                }
                schedule_low_on_memory_event_trigger();
                DOCHECK(1, {
                    if (!INTERNAL_OPTION(vm_use_last)) {
                        ASSERT_CURIOSITY(false && "running low on vm reserve");
                    }
                });
                /* FIXME - for our testing would be nice to have some release build
                 * notification of this ... */
            });
            reached_beyond_vmm(which);
#ifdef X64
            if (TEST(VMM_REACHABLE, which) || REACHABLE_HEAP()) {
                /* PR 215395, make sure allocation satisfies heap reachability
                 * contraints */
                p = os_heap_reserve_in_region(
                    (void *)ALIGN_FORWARD(heap_allowable_region_start, PAGE_SIZE),
                    (void *)ALIGN_BACKWARD(heap_allowable_region_end, PAGE_SIZE), size,
                    error_code, executable);
                /* ensure future heap allocations are reachable from this allocation */
                if (p != NULL)
                    request_region_be_heap_reachable(p, size);
            } else
                p = os_heap_reserve(NULL, size, error_code, executable);
#else
            p = os_heap_reserve(NULL, size, error_code, executable);
#endif
            if (p != NULL)
                return p;
            LOG(GLOBAL, LOG_HEAP, 1, "vmm_heap_reserve %s: failed " PFX "\n", vmh->name,
                *error_code);
        }

        if (at_reset_at_vmm_limit(vmh)) {
            /* We're running low on our reservation, trigger a reset */
            schedule_low_on_memory_event_trigger();
            if (schedule_reset(RESET_ALL)) {
                STATS_INC(reset_low_vmm_count);
                DO_THRESHOLD_SAFE(
                    DYNAMO_OPTION(report_reset_vmm_threshold), FREQ_PROTECTED_SECTION,
                    { /* < max - nothing */ },
                    { /* >= max */
                      /* FIXME - do we want to report more then once to give some idea of
                       * how much thrashing there is? */
                      DO_ONCE({
                          SYSLOG_CUSTOM_NOTIFY(SYSLOG_WARNING, MSG_LOW_ON_VMM_MEMORY, 2,
                                               "Potentially thrashing on low virtual "
                                               "memory resetting.",
                                               get_application_name(),
                                               get_application_pid());
                          /* want QA to notice */
                          ASSERT_CURIOSITY(false && "vmm heap limit reset thrashing");
                      });
                    });
            }
        }

        p = vmm_heap_reserve_blocks(vmh, size, NULL, which);
        LOG(GLOBAL, LOG_HEAP, 2, "vmm_heap_reserve %s: size=%d p=" PFX "\n", vmh->name,
            size, p);

        if (p != NULL) {
            if (DYNAMO_OPTION(satisfy_w_xor_x) && vmh == &heapmgt->vmcode &&
                !executable) {
                /* Pass back the writable address, not the executable.
                 * Then things like reachable heap do not need to convert to
                 * writable all over the place.
                 */
                p = (p - vmh->start_addr) + heapmgt->vmcode_writable_base;
            }
            return p;
        }
        DO_ONCE({
            DODEBUG({ out_of_vmheap_once = true; });
            if (!INTERNAL_OPTION(skip_out_of_vm_reserve_curiosity)) {
                /* this maybe unsafe for early services w.r.t. case 666 */
                SYSLOG_INTERNAL_WARNING("Out of %s reservation - reserving %dKB. "
                                        "Falling back onto OS allocation",
                                        (TEST(VMM_REACHABLE, which) || REACHABLE_HEAP())
                                            ? "vmcode"
                                            : "vmheap",
                                        size / 1024);
                ASSERT_CURIOSITY(false && "Out of vmheap reservation");
            }
            /* This actually-out trigger is only trying to help issues like a
             * thread-private configuration being a memory hog (and thus we use up
             * our reserve). Reset needs memory, and this is asynchronous, so no
             * guarantees here anyway (the app may have already reserved all memory
             * beyond our reservation, see sqlsrvr.exe and cisvc.exe for ex.) which is
             * why we have -reset_at_vmm_threshold to make reset success more likely. */
            if (DYNAMO_OPTION(reset_at_vmm_full)) {
                schedule_reset(RESET_ALL);
            }
        });
    }
    /* if we fail to allocate from our reservation we fall back to the OS */
    reached_beyond_vmm(which);
#ifdef X64
    if (TEST(VMM_REACHABLE, which) || REACHABLE_HEAP()) {
        /* PR 215395, make sure allocation satisfies heap reachability contraints */
        p = os_heap_reserve_in_region(
            (void *)ALIGN_FORWARD(heap_allowable_region_start, PAGE_SIZE),
            (void *)ALIGN_BACKWARD(heap_allowable_region_end, PAGE_SIZE), size,
            error_code, executable);
        /* ensure future heap allocations are reachable from this allocation */
        if (p != NULL)
            request_region_be_heap_reachable(p, size);
    } else
        p = os_heap_reserve(NULL, size, error_code, executable);
#else
    p = os_heap_reserve(NULL, size, error_code, executable);
#endif
    return p;
}

/* Commit previously reserved pages, returns false when out of memory
 * This is here just to complement the vmm interface, in fact it is
 * almost an alias for os_heap_commit.  (If we had strict types then
 * here we'd convert a vm_addr_t into a heap_pc.)
 */
static inline bool
vmm_heap_commit(vm_addr_t p, size_t size, uint prot, heap_error_code_t *error_code,
                which_vmm_t which)
{
    bool res = true;
    vm_heap_t *vmh = vmheap_for_which(which);
    LOG(GLOBAL, LOG_HEAP, 3, "vmm_heap_commit %s: size=%d p=" PFX " prot=%x\n", vmh->name,
        size, p, prot);
    if (DYNAMO_OPTION(satisfy_w_xor_x) && vmh == &heapmgt->vmcode) {
        vm_addr_t p_writable = vmm_normalize_addr(vmh, &p);
        /* We blindly shadow even if prot is -w to simplify de-alloc.  -w is rare. */
        uint shadow_prot = prot & ~(MEMPROT_EXEC);
        res = os_heap_commit(p_writable, size, shadow_prot, error_code);
        prot &= ~(MEMPROT_WRITE);
        if (res) {
            /* We use mmap instead of mprotect since W^X policies often only allow
             * execution from regions allocated executable, not changed to executable.
             * There is a downside: IMA policies can cause a significant (~5s) delay
             * while a hash is computed of our vmcode region on the first +x mmap.
             * Today os_create_memory_file() does a temporary +x mmap for us, avoiding
             * any cost here.
             */
            size_t map_size = size;
            size_t map_offs = p - vmh->start_addr;
            vm_addr_t map_addr =
                os_map_file(heapmgt->dual_map_file, &map_size, map_offs, p, prot,
                            MAP_FILE_VMM_COMMIT | MAP_FILE_FIXED);
            ASSERT(map_size == size);
            res = (map_addr != NULL);
            ASSERT(map_addr == NULL || map_addr == p);
        }
    } else
        res = os_heap_commit(p, size, prot, error_code);
    size_t commit_used, commit_limit;
    ASSERT(!OWN_MUTEX(&reset_pending_lock));
    if ((DYNAMO_OPTION(reset_at_commit_percent_free_limit) != 0 ||
         DYNAMO_OPTION(reset_at_commit_free_limit) != 0) &&
        os_heap_get_commit_limit(&commit_used, &commit_limit)) {
        size_t commit_left = commit_limit - commit_used;
        ASSERT(commit_used <= commit_limit);
        /* FIXME - worry about overflow in the multiplies below? With 4kb pages isn't
         * an issue till 160GB of committable memory. */
        if ((DYNAMO_OPTION(reset_at_commit_free_limit) != 0 &&
             commit_left < DYNAMO_OPTION(reset_at_commit_free_limit) / PAGE_SIZE) ||
            (DYNAMO_OPTION(reset_at_commit_percent_free_limit) != 0 &&
             100 * commit_left <
                 DYNAMO_OPTION(reset_at_commit_percent_free_limit) * commit_limit)) {
            /* Machine is getting low on memory, trigger a reset */
            /* FIXME - if we aren't the ones hogging committed memory (rougue app) then
             * do we want a version of reset that doesn't de-commit our already grabbed
             * memory to avoid someone else stealing it (or perhaps keep just a minimal
             * level to ensure we make some progress)? */
            /* FIXME - the commit limit is for the whole system; we have no good way of
             * telling if we're running in a job and if so what the commit limit for the
             * job is. */
            /* FIXME - if a new process is started under dr while the machine is already
             * past the threshold we will just spin resetting here and not make any
             * progress, may be better to only reset when we have a reasonable amount of
             * non-persistent memory to free (so that we can at least make some progress
             * before resetting again). */
            /* FIXME - the threshold is calculated at the current page file size, but
             * it's possible that the pagefile is expandable (dependent on disk space of
             * course) and thus we're preventing a potentially beneficial (to us)
             * upsizing of the pagefile here.  See "HKLM\SYSTEM\CCS\ControlSession /
             * Manager\Memory Management" for the initial/max size of the various page
             * files (query SystemPafefileInformation only gets you the current size). */
            /* xref case 345 on fixmes (and link to wiki discussion) */
            if (schedule_reset(RESET_ALL)) {
                STATS_INC(reset_low_commit_count);
                DO_THRESHOLD_SAFE(
                    DYNAMO_OPTION(report_reset_commit_threshold), FREQ_PROTECTED_SECTION,
                    { /* < max - nothing */ },
                    { /* >= max */
                      /* FIXME - do we want to report more then once to give some idea of
                       * how much thrashing there is? */
                      DO_ONCE({
                          SYSLOG_CUSTOM_NOTIFY(
                              SYSLOG_WARNING, MSG_LOW_ON_COMMITTABLE_MEMORY, 2,
                              "Potentially thrashing on low committable "
                              "memory resetting.",
                              get_application_name(), get_application_pid());
                          /* want QA to notice */
                          ASSERT_CURIOSITY(false && "commit limit reset thrashing");
                      });
                    });
            }
        }
    }
    if (!res && DYNAMO_OPTION(oom_timeout) != 0 &&
        !(DYNAMO_OPTION(satisfy_w_xor_x) && vmh == &heapmgt->vmcode)) {
        DEBUG_DECLARE(heap_error_code_t old_error_code = *error_code;)
        ASSERT(old_error_code != HEAP_ERROR_SUCCESS);

        /* check whether worth retrying */
        if (!os_heap_systemwide_overcommit(*error_code)) {
            /* FIXME: we should check whether current process is the hog */
            /* unless we have used the memory, there is still a
             * miniscule chance another thread will free up some or
             * will attempt suicide, so could retry even if current
             * process has a leak */
            ASSERT_NOT_IMPLEMENTED(false);
            /* retry */
        }

        SYSLOG_INTERNAL_WARNING("vmm_heap_commit oom: timeout and retry");
        /* let's hope a memory hog dies in the mean time */
        os_timeout(DYNAMO_OPTION(oom_timeout));

        res = os_heap_commit(p, size, prot, error_code);
        DODEBUG({
            if (res) {
                SYSLOG_INTERNAL_WARNING("vmm_heap_commit retried, got away!  old=" PFX
                                        " new=" PFX "\n",
                                        old_error_code, *error_code);
            } else {
                SYSLOG_INTERNAL_WARNING("vmm_heap_commit retrying, no luck.  old=" PFX
                                        " new=" PFX "\n",
                                        old_error_code, *error_code);
            }
        });
    }

    return res;
}

/* back to normal section */
END_DATA_SECTION()

/* Free previously reserved and possibly committed memory.  Check if
 * it is within the memory managed by the virtual memory manager we
 * only decommit back to the OS, and we remove the vmm reservation.
 * Keep in mind that this can be called on units that are not fully
 * committed, e.g. guard pages are added to this - as long as the
 * os_heap_decommit interface can handle this we're OK
 */
static void
vmm_heap_free(vm_addr_t p, size_t size, heap_error_code_t *error_code, which_vmm_t which)
{
    vm_heap_t *vmh = vmheap_for_which(which);
    LOG(GLOBAL, LOG_HEAP, 2, "vmm_heap_free %s: size=%d p=" PFX " is_reserved=%d\n",
        vmh->name, size, p, vmm_is_reserved_unit(vmh, p, size));
    vm_addr_t p_writable = p;
    if (DYNAMO_OPTION(satisfy_w_xor_x) && vmh == &heapmgt->vmcode)
        p_writable = vmm_normalize_addr(vmh, &p);

    /* The memory doesn't have to be within our VM reserve if it
     * was allocated as an extra OS call when if we ran out.
     */
    if (DYNAMO_OPTION(vm_reserve)) {
        if (vmm_is_reserved_unit(vmh, p, size)) {
            if (DYNAMO_OPTION(satisfy_w_xor_x) && vmh == &heapmgt->vmcode)
                os_heap_decommit(p_writable, size, error_code);
            os_heap_decommit(p, size, error_code);
            vmm_heap_free_blocks(vmh, p, size, which);
            LOG(GLOBAL, LOG_HEAP, 2, "vmm_heap_free %s: freed size=%d p=" PFX "\n",
                vmh->name, size, p);
            return;
        } else {
            /* FIXME: check if this is stack_free getting in the way, then ignore it */
            /* FIXME: could do this by overriding the meaning of the vmheap fields
               after cleanup to a different combination that start_pc = end_pc = NULL
             */
            /* FIXME: see vmm_heap_unit_exit for the current stack_free problem */
            if (vmm_heap_exited) {
                *error_code = HEAP_ERROR_SUCCESS;
                return;
            }
        }
    }
    if (DYNAMO_OPTION(satisfy_w_xor_x) && vmh == &heapmgt->vmcode)
        os_heap_free(p_writable, size, error_code);
    os_heap_free(p, size, error_code);
}

static void
vmm_heap_decommit(vm_addr_t p, size_t size, heap_error_code_t *error_code,
                  which_vmm_t which)
{
    LOG(GLOBAL, LOG_HEAP, 2, "vmm_heap_decommit: size=%d p=" PFX " is_reserved=%d\n",
        size, p, is_vmm_reserved_address(p, size, NULL, NULL));
    if (DYNAMO_OPTION(satisfy_w_xor_x)) {
        vm_heap_t *vmh = vmheap_for_which(which);
        if (vmh == &heapmgt->vmcode) {
            vm_addr_t p_writable = vmm_normalize_addr(vmh, &p);
            os_heap_decommit(p_writable, size, error_code);
        }
    }
    os_heap_decommit(p, size, error_code);
    /* nothing to be done to vmm blocks */
}

/* Caller is required to handle thread synchronization and to update dynamo vm areas.
 * size must be PAGE_SIZE-aligned.
 * Returns NULL if fails to allocate memory!
 */
static void *
vmm_heap_alloc(size_t size, uint prot, heap_error_code_t *error_code, which_vmm_t which)
{
    vm_addr_t p = vmm_heap_reserve(size, error_code, TEST(MEMPROT_EXEC, prot), which);
    if (!p)
        return NULL; /* out of reserved memory */

    if (!vmm_heap_commit(p, size, prot, error_code, which))
        return NULL; /* out of committed memory */
    return p;
}

/* virtual memory manager initialization */
void
vmm_heap_init()
{
    IF_WINDOWS(ASSERT(ALIGNED(OS_ALLOC_GRANULARITY, DYNAMO_OPTION(vmm_block_size))));
#ifdef X64
    /* add reachable regions before we allocate the heap, xref PR 215395 */
    /* i#774, i#901: we no longer need the DR library nor ntdll.dll to be
     * reachable by the vmheap reservation.  But, for -heap_in_lower_4GB,
     * we must call request_region_be_heap_reachable() up front.
     * This is a hard requirement so we set it prior to locating the vmm region.
     */
    if (DYNAMO_OPTION(heap_in_lower_4GB))
        request_region_be_heap_reachable(0, 0x80000000);
#endif
    if (DYNAMO_OPTION(vm_reserve)) {
        vmm_heap_unit_init(&heapmgt->vmcode, DYNAMO_OPTION(vm_size), true, "vmcode");
        if (!REACHABLE_HEAP()) {
            vmm_heap_unit_init(
                &heapmgt->vmheap,
                /* Use vmheap_size_wow64 if target is WoW64 windows process. */
                IF_WINDOWS_ELSE(IF_X64_ELSE(is_wow64_process(NT_CURRENT_PROCESS)
                                                ? DYNAMO_OPTION(vmheap_size_wow64)
                                                : DYNAMO_OPTION(vmheap_size),
                                            DYNAMO_OPTION(vmheap_size)),
                                DYNAMO_OPTION(vmheap_size)),
                false, "vmheap");
        }
    }
}

static void
vmh_exit(vm_heap_t *vmh, bool contains_stacks)
{
    /* We have three regions that are not explicitly deallocated: current stack, init
     * stack, global_do_syscall.
     */
    DOCHECK(1, {
        uint perstack =
            (uint)(ALIGN_FORWARD_UINT(
                       DYNAMO_OPTION(stack_size) +
                           (has_guard_pages(VMM_STACK | VMM_PER_THREAD)
                                ? (2 * PAGE_SIZE)
                                : (DYNAMO_OPTION(stack_guard_pages) ? PAGE_SIZE : 0)),
                       DYNAMO_OPTION(vmm_block_size)) /
                   DYNAMO_OPTION(vmm_block_size));
        uint unfreed_blocks;
        if (!contains_stacks || standalone_library)
            unfreed_blocks = 0;
        else {
            unfreed_blocks = perstack * 1 /* d_r_initstack */ +
                /* current stack */
                perstack * ((doing_detach IF_APP_EXPORTS(|| dr_api_exit)) ? 0 : 1);
        }
        /* Our bitmap does not get freed. */
        size_t blocks_sz_bytes =
            ALIGN_FORWARD_UINT(BITMAP_INDEX(vmh->num_blocks) * sizeof(bitmap_element_t),
                               DYNAMO_OPTION(vmm_block_size));
        unfreed_blocks += (uint)(blocks_sz_bytes / DYNAMO_OPTION(vmm_block_size));
        /* XXX: On detach, arch_thread_exit should explicitly mark as
         * left behind all TPCs needed so then we can assert even for
         * detach.
         */
        ASSERT(IF_WINDOWS(doing_detach ||) /* not deterministic when detaching */
                   vmh->num_free_blocks == vmh->num_blocks - unfreed_blocks ||
               /* >=, not ==, b/c if we hit the vmm limit the cur dstack
                * could be outside of vmm (i#1164).
                */
               ((ever_beyond_vmm
                     /* This also happens for dstacks up high for DrMi#1723. */
                     IF_WINDOWS(|| get_os_version() >= WINDOWS_VERSION_8_1)) &&
                vmh->num_free_blocks >= vmh->num_blocks - unfreed_blocks));
    });
    /* On process exit we are currently executing off a
     * stack in this region so we cannot free the whole allocation.

     * XXX: Any tombstone allocations will have to use a
     * different interface than the generic heap_mmap() which is
     * sometimes used to leave things behind.  FIXME: Currently
     * we'll leave behind the whole vm unit if any tombstones are
     * left - which in fact is always the case, no matter whether
     * thread private code needs to be left or not.

     * global_do_syscall 32 byte allocation should be part of our
     * dll and won't have to be left.

     * The current stack is the main problem because it is later
     * cleaned up in cleanup_and_terminate by calling stack_free which
     * in turn gets all the way to vmm_heap_free.  Therefore we add an
     * explicit test for vmm_heap_exited, so that we can otherwise free
     * bookkeeping information and delete the lock now.

     * Potential solution to most of these problems is to have
     * cleanup_and_terminate call vmm_heap_exit when cleaning up
     * the process, or to just leave the vm mapping behind and
     * simply pass a different argument to stack_free.
     */
    vmm_heap_unit_exit(vmh);
}

void
vmm_heap_exit()
{
    /* virtual memory manager exit */
    if (DYNAMO_OPTION(vm_reserve)) {
        if (heapmgt->vmcode.start_addr != NULL)
            vmh_exit(&heapmgt->vmcode, heapmgt->vmheap.start_addr == NULL);
        if (heapmgt->vmheap.start_addr != NULL)
            vmh_exit(&heapmgt->vmheap, true);
        vmm_heap_exited = true;
    }
}

#ifdef UNIX
void
vmm_heap_fork_pre(dcontext_t *dcontext)
{
    if (!DYNAMO_OPTION(satisfy_w_xor_x))
        return;
    /* The child wants a private copy of our dual mapping setup, rather than
     * sharing the parent's.  Unfortunately that requires copying the entire
     * vmcode contents into new mappings.  To avoid a race while the child makes
     * this copy from our live mappings, we create a temp copy now.  The
     * disadvantage is that we need a bunch of free memory (and address space:
     * but this is 64-bit-only).  The alternative is to have the parent wait for
     * the child but that seems too disruptive to scheduling.
     */
    thread_heap_t *th = (thread_heap_t *)dcontext->heap_field;
    heap_error_code_t error_code;
    /* We store in a dcontext field to avoid races with other threads doing forks. */
    th->fork_copy_size = heapmgt->vmcode.alloc_size;
    th->fork_copy_start =
        os_heap_reserve(NULL, th->fork_copy_size, &error_code, true /*+x*/);
    if (th->fork_copy_start == NULL) {
        report_w_xor_x_fatal_error_and_exit();
        ASSERT_NOT_REACHED();
    }

    /* Copy each mapping.  We also need to record the +*x protections (because some
     * are +rw (ELF data segments), some are +rx, and some are +r (reachable
     * (non-exec) heap)).  We can't use the actual page prot of the copy to store
     * what the vmcode prot should be, because some W^X implementations remove +x
     * from a +wx region, and we require +w to make our copy.  Thus we store the
     * mapping prots in a vmvector.
     */
    VMVECTOR_ALLOC_VECTOR(th->fork_copy_areas, dcontext,
                          VECTOR_NEVER_MERGE | VECTOR_NO_LOCK, innermost_lock);
    memquery_iter_t iter;
    if (!memquery_iterator_start(&iter, heapmgt->vmcode.alloc_start,
                                 true /*using heap*/)) {
        report_w_xor_x_fatal_error_and_exit();
        ASSERT_NOT_REACHED();
    }
    while (memquery_iterator_next(&iter) && iter.vm_start < heapmgt->vmcode.end_addr) {
        if (iter.vm_start < heapmgt->vmcode.alloc_start || iter.prot == MEMPROT_NONE)
            continue;
        byte *new_start =
            iter.vm_start - heapmgt->vmcode.alloc_start + th->fork_copy_start;
        vmvector_add(th->fork_copy_areas, new_start,
                     new_start + (iter.vm_end - iter.vm_start),
                     (void *)(ptr_uint_t)iter.prot);
        if (!os_heap_commit(new_start, iter.vm_end - iter.vm_start,
                            MEMPROT_READ | MEMPROT_WRITE, &error_code)) {
            report_w_xor_x_fatal_error_and_exit();
            ASSERT_NOT_REACHED();
        }
        memcpy(new_start, iter.vm_start, iter.vm_end - iter.vm_start);
        LOG(GLOBAL, LOG_HEAP, 2, "%s: copied %p-%p %x to %p-%p\n", __FUNCTION__,
            iter.vm_start, iter.vm_end, iter.prot, new_start,
            new_start + (iter.vm_end - iter.vm_start));
    }
    memquery_iterator_stop(&iter);
}

void
vmm_heap_fork_post(dcontext_t *dcontext, bool parent)
{
    if (!DYNAMO_OPTION(satisfy_w_xor_x) || !parent)
        return;
    thread_heap_t *th = (thread_heap_t *)dcontext->heap_field;
    heap_error_code_t error_code;
    os_heap_free(th->fork_copy_start, th->fork_copy_size, &error_code);
    if (error_code != HEAP_ERROR_SUCCESS) {
        report_w_xor_x_fatal_error_and_exit();
        ASSERT_NOT_REACHED();
    }
    th->fork_copy_start = NULL;
    th->fork_copy_size = 0;
    vmvector_reset_vector(dcontext, th->fork_copy_areas);
    vmvector_delete_vector(dcontext, th->fork_copy_areas);
    th->fork_copy_areas = NULL;
}

void
vmm_heap_fork_init(dcontext_t *dcontext)
{
    if (!DYNAMO_OPTION(satisfy_w_xor_x))
        return;
    /* We want a private copy of our dual mapping setup, rather than sharing the
     * parent's.  Unfortunately that requires copying the entire vmcode contents
     * into new mappings.  The parent has made a temp copy for us to avoid races
     * if we tried to copy its live memory.
     */

    /* First, make a new file. */
    int old_fd = heapmgt->dual_map_file;
    heapmgt->dual_map_file =
        os_create_memory_file(MEMORY_FILE_NAME, heapmgt->vmcode.alloc_size);
    if (heapmgt->dual_map_file == INVALID_FILE)
        goto vmm_heap_fork_init_failed;
    LOG(GLOBAL, LOG_HEAP, 2, "%s: new dual_map_file is %d\n", __FUNCTION__,
        heapmgt->dual_map_file);

    /* Second, make a new +w region and copy the old protections and contents. */
    size_t map_size = heapmgt->vmcode.alloc_size;
    byte *map_base =
        os_map_file(heapmgt->dual_map_file, &map_size, 0, heapmgt->vmcode_writable_alloc,
                    MEMPROT_NONE, MAP_FILE_VMM_COMMIT | MAP_FILE_FIXED);
    if (map_base != heapmgt->vmcode_writable_alloc ||
        map_size != heapmgt->vmcode.alloc_size)
        goto vmm_heap_fork_init_failed;
    heap_error_code_t error_code;
    thread_heap_t *th = (thread_heap_t *)dcontext->heap_field;
    vmvector_iterator_t vmvi;
    vmvector_iterator_start(th->fork_copy_areas, &vmvi);
    while (vmvector_iterator_hasnext(&vmvi)) {
        byte *start, *end;
        uint prot = (uint)(ptr_uint_t)vmvector_iterator_next(&vmvi, &start, &end);
        byte *new_start = start - th->fork_copy_start + heapmgt->vmcode_writable_alloc;
        uint new_prot = (prot & ~(MEMPROT_EXEC)) | MEMPROT_WRITE;
        if (!os_heap_commit(new_start, end - start, new_prot, &error_code))
            goto vmm_heap_fork_init_failed;
        memcpy(new_start, start, end - start);
        LOG(GLOBAL, LOG_HEAP, 2, "%s: re-mapped %p-%p %x; copied from %p-%p %x\n",
            __FUNCTION__, new_start, new_start + (end - start), new_prot, start, end,
            prot);
    }
    vmvector_iterator_stop(&vmvi);

    /* Third, make a new +x region and set up the right protections and mappings. */
    map_size = heapmgt->vmcode.alloc_size;
    map_base =
        os_map_file(heapmgt->dual_map_file, &map_size, 0, heapmgt->vmcode.alloc_start,
                    MEMPROT_NONE, MAP_FILE_VMM_COMMIT | MAP_FILE_FIXED);
    if (map_base != heapmgt->vmcode.alloc_start || map_size != heapmgt->vmcode.alloc_size)
        goto vmm_heap_fork_init_failed;
    vmvector_iterator_start(th->fork_copy_areas, &vmvi);
    while (vmvector_iterator_hasnext(&vmvi)) {
        byte *start, *end;
        uint prot = (uint)(ptr_uint_t)vmvector_iterator_next(&vmvi, &start, &end);
        byte *new_start = start - th->fork_copy_start + heapmgt->vmcode.alloc_start;
        map_size = end - start;
        map_base =
            os_map_file(heapmgt->dual_map_file, &map_size, start - th->fork_copy_start,
                        new_start, prot, MAP_FILE_VMM_COMMIT | MAP_FILE_FIXED);
        if (map_base != new_start || map_size != end - start)
            goto vmm_heap_fork_init_failed;
        LOG(GLOBAL, LOG_HEAP, 2, "%s: re-mapped %p-%p %x\n", __FUNCTION__, new_start,
            new_start + map_size, prot);
    }
    vmvector_iterator_stop(&vmvi);

    os_heap_free(th->fork_copy_start, th->fork_copy_size, &error_code);
    if (error_code != HEAP_ERROR_SUCCESS)
        goto vmm_heap_fork_init_failed;
    th->fork_copy_start = NULL;
    th->fork_copy_size = 0;
    vmvector_reset_vector(dcontext, th->fork_copy_areas);
    vmvector_delete_vector(dcontext, th->fork_copy_areas);
    th->fork_copy_areas = NULL;

    /* XXX: We don't want to unlink any tmpfs file so we don't use
     * os_delete_memory_file(). This may not work on Windows if that function needs to do
     * more.
     */
    os_close_protected(old_fd);
    return;

vmm_heap_fork_init_failed:
    report_w_xor_x_fatal_error_and_exit();
    ASSERT_NOT_REACHED();
}
#endif

/* checks for compatibility among heap options, returns true if
 * modified the value of any options to make them compatible
 */
bool
heap_check_option_compatibility()
{
    bool ret = false;

    ret = check_param_bounds(&dynamo_options.vm_size, MIN_VMM_HEAP_UNIT_SIZE,
                             MAX_VMCODE_SIZE, "vm_size") ||
        ret;
    ret = check_param_bounds(&dynamo_options.vmheap_size, MIN_VMM_HEAP_UNIT_SIZE,
                             MAX_VMHEAP_SIZE, "vmheap_size") ||
        ret;
#ifdef INTERNAL
    /* if max_heap_unit_size is too small you may get a funny message
     * "initial_heap_unit_size must be >= 8229 and <= 4096" but in
     * release build we will take the min and then complain about
     * max_heap_unit_size and set it to the min also, so it all works
     * out w/o needing an extra check() call.
     */
    /* case 7626: don't short-circuit checks, as later ones may be needed */
    ret = check_param_bounds(&dynamo_options.initial_heap_unit_size,
                             /* if have units smaller than a page we end up
                              * allocating 64KB chunks for "oversized" units
                              * for just about every alloc!  so round up to
                              * at least a page.
                              */
                             ALIGN_FORWARD(UNITOVERHEAD + 1, (uint)PAGE_SIZE),
                             HEAP_UNIT_MAX_SIZE, "initial_heap_unit_size") ||
        ret;
    ret = check_param_bounds(&dynamo_options.initial_global_heap_unit_size,
                             ALIGN_FORWARD(UNITOVERHEAD + 1, (uint)PAGE_SIZE),
                             HEAP_UNIT_MAX_SIZE, "initial_global_heap_unit_size") ||
        ret;
    ret = check_param_bounds(&dynamo_options.max_heap_unit_size,
                             MAX(HEAP_UNIT_MIN_SIZE, GLOBAL_UNIT_MIN_SIZE), INT_MAX,
                             "max_heap_unit_size") ||
        ret;
#endif
    return ret;
}

/* thread-shared initialization that should be repeated after a reset */
void
heap_reset_init()
{
    threadunits_init(GLOBAL_DCONTEXT, &heapmgt->global_nonpersistent_units,
                     GLOBAL_UNIT_MIN_SIZE, false);
}

/* initialization */
void
d_r_heap_init()
{
    int i;
    DEBUG_DECLARE(uint prev_sz = 0;)

    LOG(GLOBAL, LOG_TOP | LOG_HEAP, 2, "Heap bucket sizes are:\n");
    /* make sure we'll preserve alignment */
    ASSERT(ALIGNED(HEADER_SIZE, HEAP_ALIGNMENT));
    /* make sure free list pointers will fit */
    ASSERT(BLOCK_SIZES[0] >= sizeof(heap_pc *));
    /* since sizes depend on size of structs, make sure they're in order */
    for (i = 0; i < BLOCK_TYPES; i++) {
        ASSERT(BLOCK_SIZES[i] > prev_sz);
        /* we assume all of our heap allocs are aligned */
        ASSERT(i == BLOCK_TYPES - 1 || ALIGNED(BLOCK_SIZES[i], HEAP_ALIGNMENT));
        DODEBUG(prev_sz = BLOCK_SIZES[i];);
        LOG(GLOBAL, LOG_TOP | LOG_HEAP, 2, "\t%d bytes\n", BLOCK_SIZES[i]);
    }

    /* we assume writes to some static vars are atomic,
     * i.e., the vars don't cross cache lines.  they shouldn't since
     * they should all be 4-byte-aligned in the data segment.
     * FIXME: ensure that release build aligns ok?
     * I would be quite surprised if static vars were not 4-byte-aligned!
     */
    ASSERT(ALIGN_BACKWARD(&heap_exiting, CACHE_LINE_SIZE()) ==
           ALIGN_BACKWARD(&heap_exiting + 1, CACHE_LINE_SIZE()));
    ASSERT(ALIGN_BACKWARD(&heap_unit_lock.owner, CACHE_LINE_SIZE()) ==
           ALIGN_BACKWARD(&heap_unit_lock.owner + 1, CACHE_LINE_SIZE()));

    /* For simplicity we go through our normal heap mechanism to allocate
     * our post-init heapmgt struct
     */
    ASSERT(heapmgt == &temp_heapmgt);
    heapmgt->global_heap_writable = true; /* this is relied on in global_heap_alloc */
    threadunits_init(GLOBAL_DCONTEXT, &heapmgt->global_units, GLOBAL_UNIT_MIN_SIZE,
                     false);

    heapmgt =
        HEAP_TYPE_ALLOC(GLOBAL_DCONTEXT, heap_management_t, ACCT_MEM_MGT, PROTECTED);
    ASSERT(sizeof(temp_heapmgt) == sizeof(*heapmgt));
    memcpy(heapmgt, &temp_heapmgt, sizeof(temp_heapmgt));

    threadunits_init(GLOBAL_DCONTEXT, &heapmgt->global_unprotected_units,
                     GLOBAL_UNIT_MIN_SIZE, false);
    if (!REACHABLE_HEAP()) { /* If off, all heap is reachable. */
        threadunits_init(GLOBAL_DCONTEXT, &heapmgt->global_reachable_units,
                         GLOBAL_UNIT_MIN_SIZE, true);
    }
    heap_reset_init();

#ifdef WINDOWS
    /* PR 250294: As part of 64-bit hook work, hook reachability was addressed
     * using landing pads (see win32/callback.c for more explanation).  Landing
     * pad areas are a type of special heap, so they should be initialized
     * during heap init.
     * Each landing pad area has its own allocation pointer, so they shouldn't
     * be merged automatically.
     */
    VMVECTOR_ALLOC_VECTOR(landing_pad_areas, GLOBAL_DCONTEXT,
                          VECTOR_SHARED | VECTOR_NEVER_MERGE, landing_pad_areas_lock);
#endif
}

/* need to not remove from vmareas on process exit -- vmareas has already exited! */
static void
really_free_unit(heap_unit_t *u)
{
    RSTATS_SUB(heap_capacity, UNIT_COMMIT_SIZE(u));
    STATS_ADD(heap_reserved_only,
              (stats_int_t)(UNIT_COMMIT_SIZE(u) - UNIT_RESERVED_SIZE(u)));
    /* remember that u itself is inside unit, not separately allocated */
    release_guarded_real_memory((vm_addr_t)u, UNIT_RESERVED_SIZE(u),
                                false /*do not update DR areas now*/, true, u->which);
}

/* Free all thread-shared state not critical to forward progress;
 * heap_reset_init() will be called before continuing.
 */
void
heap_reset_free()
{
    heap_unit_t *u, *next_u;
    /* FIXME: share some code w/ heap_exit -- currently only called by reset */
    ASSERT(DYNAMO_OPTION(enable_reset));

    /* we must grab this lock before heap_unit_lock to avoid rank
     * order violations when freeing
     */
    dynamo_vm_areas_lock();

    /* for combining stats into global_units we need this lock
     * FIXME: remove if we go to separate stats sum location
     */
    DODEBUG({ acquire_recursive_lock(&global_alloc_lock); });

    acquire_recursive_lock(&heap_unit_lock);

    LOG(GLOBAL, LOG_HEAP, 1, "Pre-reset, global heap unit stats:\n");
    /* FIXME: free directly rather than putting on dead list first */
    threadunits_exit(&heapmgt->global_nonpersistent_units, GLOBAL_DCONTEXT);

    /* free all dead units */
    u = heapmgt->heap.dead;
    while (u != NULL) {
        next_u = u->next_global;
        LOG(GLOBAL, LOG_HEAP, 1, "\tfreeing dead unit " PFX "-" PFX " [-" PFX "]\n", u,
            UNIT_COMMIT_END(u), UNIT_RESERVED_END(u));
        RSTATS_DEC(heap_num_free);
        really_free_unit(u);
        u = next_u;
    }
    heapmgt->heap.dead = NULL;
    heapmgt->heap.num_dead = 0;
    release_recursive_lock(&heap_unit_lock);
    DODEBUG({ release_recursive_lock(&global_alloc_lock); });
    dynamo_vm_areas_unlock();
}

/* atexit cleanup */
void
d_r_heap_exit()
{
    heap_unit_t *u, *next_u;
    heap_management_t *temp;

    heap_exiting = true;
    /* FIXME: we shouldn't need either lock if executed last */
    dynamo_vm_areas_lock();
    acquire_recursive_lock(&heap_unit_lock);

#ifdef WINDOWS
    release_landing_pad_mem(); /* PR 250294 */
#endif

    LOG(GLOBAL, LOG_HEAP, 1, "Global unprotected heap unit stats:\n");
    threadunits_exit(&heapmgt->global_unprotected_units, GLOBAL_DCONTEXT);
    LOG(GLOBAL, LOG_HEAP, 1, "Global nonpersistent heap unit stats:\n");
    threadunits_exit(&heapmgt->global_nonpersistent_units, GLOBAL_DCONTEXT);
    if (!REACHABLE_HEAP()) { /* If off, all heap is reachable. */
        LOG(GLOBAL, LOG_HEAP, 1, "Global reachable heap unit stats:\n");
        threadunits_exit(&heapmgt->global_reachable_units, GLOBAL_DCONTEXT);
    }

    /* Now we need to go back to the static struct to clean up */
    ASSERT(heapmgt != &temp_heapmgt);
    /* We need to maintain the lock process list which was using the temp_heapmgt
     * lock structure.
     */
    mutex_t temp_vmcode = temp_heapmgt.vmcode.lock;
    mutex_t temp_vmheap = temp_heapmgt.vmheap.lock;
    memcpy(&temp_heapmgt, heapmgt, sizeof(temp_heapmgt));
    temp_heapmgt.vmcode.lock = temp_vmcode;
    temp_heapmgt.vmheap.lock = temp_vmheap;
    temp = heapmgt;
    heapmgt = &temp_heapmgt;
    HEAP_TYPE_FREE(GLOBAL_DCONTEXT, temp, heap_management_t, ACCT_MEM_MGT, PROTECTED);

    LOG(GLOBAL, LOG_HEAP, 1, "Global heap unit stats:\n");
    threadunits_exit(&heapmgt->global_units, GLOBAL_DCONTEXT);

    /* free heap for all unfreed units */
    LOG(GLOBAL, LOG_HEAP, 1, "Unfreed units:\n");
    u = heapmgt->heap.units;
    while (u != NULL) {
        next_u = u->next_global;
        LOG(GLOBAL, LOG_HEAP, 1, "\tfreeing live unit " PFX "-" PFX " [-" PFX "]\n", u,
            UNIT_COMMIT_END(u), UNIT_RESERVED_END(u));
        RSTATS_DEC(heap_num_live);
        really_free_unit(u);
        u = next_u;
    }
    heapmgt->heap.units = NULL;
    u = heapmgt->heap.dead;
    while (u != NULL) {
        next_u = u->next_global;
        LOG(GLOBAL, LOG_HEAP, 1, "\tfreeing dead unit " PFX "-" PFX " [-" PFX "]\n", u,
            UNIT_COMMIT_END(u), UNIT_RESERVED_END(u));
        RSTATS_DEC(heap_num_free);
        really_free_unit(u);
        u = next_u;
    }
    heapmgt->heap.dead = NULL;
    heapmgt->global_heap_writable = false; /* This is relied on in global_heap_alloc. */
    release_recursive_lock(&heap_unit_lock);
    dynamo_vm_areas_unlock();

    DELETE_RECURSIVE_LOCK(heap_unit_lock);
    DELETE_RECURSIVE_LOCK(global_alloc_lock);
    DELETE_RECURSIVE_LOCK(low_on_memory_pending_lock);

#ifdef X64
    DELETE_LOCK(request_region_be_heap_reachable_lock);
#endif

    if (doing_detach) {
        heapmgt = &temp_heapmgt;
        IF_X64(reset_heap_reachable_bounds());
    }
}

void
heap_post_exit()
{
    heap_exiting = false;
}

/* FIXME:
 * detect if the app is who we're fighting for memory, if so, don't
 * free memory, else the app will just keep grabbing more.
 * need a test for hitting 2GB (or 3GB!) user mode limit.
 */
static void
heap_low_on_memory()
{
    /* free some memory! */
    heap_unit_t *u, *next_u;
    DEBUG_DECLARE(size_t freed = 0;)
    LOG(GLOBAL, LOG_CACHE | LOG_STATS, 1,
        "heap_low_on_memory: about to free dead list units\n");
    /* WARNING: this routine is called at arbitrary allocation failure points,
     * so we have to be careful what locks we grab
     * However, no allocation site can hold a lock weaker in rank than
     * heap_unit_lock, b/c it could deadlock on the allocation itself!
     * So we're safe.
     */
    /* must grab this lock prior to heap_unit_lock if affecting DR vm areas
     * this is recursive so ok if we ran out of memory while holding DR vm area lock
     */
    ASSERT(safe_to_allocate_or_free_heap_units());
    dynamo_vm_areas_lock();
    acquire_recursive_lock(&heap_unit_lock);
    u = heapmgt->heap.dead;
    while (u != NULL) {
        next_u = u->next_global;
        DODEBUG(freed += UNIT_COMMIT_SIZE(u););
        /* FIXME: if out of committed pages only, could keep our reservations */
        LOG(GLOBAL, LOG_HEAP, 1, "\tfreeing dead unit " PFX "-" PFX " [-" PFX "]\n", u,
            UNIT_COMMIT_END(u), UNIT_RESERVED_END(u));
        RSTATS_DEC(heap_num_free);
        really_free_unit(u);
        u = next_u;
        heapmgt->heap.num_dead--;
    }
    heapmgt->heap.dead = NULL;
    release_recursive_lock(&heap_unit_lock);
    dynamo_vm_areas_unlock();
    LOG(GLOBAL, LOG_CACHE | LOG_STATS, 1, "heap_low_on_memory: freed %d KB\n",
        freed / 1024);
    /* FIXME: we don't keep a list of guard pages, which we may decide to throw
     * out or compact at this time.
     */
    /* FIXME: should also fix up the allocator to look in other free lists
     * of sizes larger than asked for, we may have plenty of memory available
     * in other lists!  see comments in common_heap_alloc
     */
}

static const char *
get_oom_source_name(oom_source_t source)
{
    /* currently only single character codenames,
     * (still as a string though)
     */
    const char *code_name = "?";

    switch (source) {
    case OOM_INIT: code_name = "I"; break;
    case OOM_RESERVE: code_name = "R"; break;
    case OOM_COMMIT: code_name = "C"; break;
    case OOM_EXTEND: code_name = "E"; break;
    default: ASSERT_NOT_REACHED();
    }
    return code_name;
}

static bool
silent_oom_for_process(oom_source_t source)
{
    if (TESTANY(OOM_COMMIT | OOM_EXTEND, source) &&
        !IS_STRING_OPTION_EMPTY(silent_commit_oom_list)) {
        bool onlist;
        const char *process_name = get_short_name(get_application_name());
        string_option_read_lock();
        onlist = check_filter_with_wildcards(DYNAMO_OPTION(silent_commit_oom_list),
                                             process_name);
        string_option_read_unlock();

        if (onlist) {
            SYSLOG_INTERNAL_WARNING("not reporting last words of executable %s",
                                    process_name);
            return true;
        }
    }
    return false;
}

/* oom_source_t identifies the action we were taking, os_error_code is
 * the returned value from the last system call - opaque at this OS
 * independent layer.
 */
static void
report_low_on_memory(which_vmm_t which, oom_source_t source,
                     heap_error_code_t os_error_code)
{
    if (TESTANY(DYNAMO_OPTION(silent_oom_mask), source) ||
        silent_oom_for_process(source)) {
        SYSLOG_INTERNAL_WARNING("Mostly silent OOM: %s " PFX ".\n",
                                get_oom_source_name(source), os_error_code);
        /* still produce an ldmp for internal use */
        if (TEST(DUMPCORE_OUT_OF_MEM_SILENT, DYNAMO_OPTION(dumpcore_mask)))
            os_dump_core("Out of memory, silently aborting program.");
    } else {
        const char *oom_source_code = get_oom_source_name(source);
        char type_hex[19];
        snprintf(type_hex, BUFFER_SIZE_ELEMENTS(type_hex), PFX, which);
        NULL_TERMINATE_BUFFER(type_hex);
        char status_hex[19];
        snprintf(status_hex, BUFFER_SIZE_ELEMENTS(status_hex), PFX, os_error_code);
        NULL_TERMINATE_BUFFER(status_hex);
        /* SYSLOG first */
        SYSLOG(SYSLOG_CRITICAL, OUT_OF_MEMORY, 4, get_application_name(),
               get_application_pid(), oom_source_code, type_hex, status_hex);
        /* Stats can be very useful to diagnose why we hit OOM. */
        if (INTERNAL_OPTION(rstats_to_stderr))
            dump_global_rstats_to_stderr();

        /* XXX: case 7296 - ldmp even if we have decided not to produce an event above */
        if (TEST(DUMPCORE_OUT_OF_MEM, DYNAMO_OPTION(dumpcore_mask)))
            os_dump_core("Out of memory, aborting program.");

        /* pass only status code to XML where we should have a stack dump and callstack */
        report_diagnostics("Out of memory", status_hex, NO_VIOLATION_BAD_INTERNAL_STATE);
    }
    os_terminate(NULL, TERMINATE_PROCESS);
    ASSERT_NOT_REACHED();
}

/* update statistics for committed memory, and add to vm_areas */
static inline void
account_for_memory(void *p, size_t size, uint prot, bool add_vm,
                   bool image _IF_DEBUG(const char *comment))
{
    RSTATS_ADD_PEAK(memory_capacity, size);

    /* case 3045: areas inside the vmheap reservation are not added to the list
     * for clients that use DR-allocated memory, we have get_memory_info()
     * query from the OS to see inside
     */
    if (is_vmm_reserved_address(p, size, NULL, NULL)) {
        return;
    }

    if (add_vm) {
        add_dynamo_vm_area(p, ((app_pc)p) + size, prot, image _IF_DEBUG(comment));
    } else {
        /* due to circular dependencies bet vmareas and global heap we do not call
         * add_dynamo_vm_area here, instead we indicate that something has changed
         */
        mark_dynamo_vm_areas_stale();
        /* NOTE: 'prot' info is lost about this region, but is needed in
         * heap_vmareas_synch_units to update all_memory_areas.  Currently
         * heap_create_unit is the only place that passes 'false' with prot rw-.
         */
        ASSERT(TESTALL(MEMPROT_READ | MEMPROT_WRITE, prot));
    }
}

/* remove_vm MUST be false iff this is heap memory, which is updated separately */
static void
update_dynamo_areas_on_release(app_pc start, app_pc end, bool remove_vm)
{
    if (!vm_areas_exited && !heap_exiting) { /* avoid problems when exiting */
        /* case 3045: areas inside the vmheap reservation are not added to the list
         * for clients that use DR-allocated memory, we have get_memory_info()
         * query from the OS to see inside
         */
        if (is_vmm_reserved_address(start, end - start, NULL, NULL)) {
            return;
        }
        if (remove_vm) {
            remove_dynamo_vm_area(start, end);
        } else {
            /* Due to cyclic dependencies bet heap and vmareas we cannot remove
             * incrementally.  The pending set is protected by the same lock
             * needed to synch the vm areas, so we will never mis-identify free
             * memory as DR memory.
             */
            mark_dynamo_vm_areas_stale();
            dynamo_areas_pending_remove = true;
        }
    }
}

bool
lockwise_safe_to_allocate_memory()
{
    /* check whether it's safe to hold a lock that normally can be held
     * for memory allocation -- i.e., check whether we hold the
     * global_alloc_lock
     */
    return !self_owns_recursive_lock(&global_alloc_lock);
}

/* Reserves space inside the VMM region which can be used by the caller for mapping
 * mapping a file.  First attempts to reserve at "preferred" but if that fails it
 * attempts at any available location.
 */
byte *
heap_reserve_for_external_mapping(byte *preferred, size_t size, which_vmm_t which)
{
#ifdef WINDOWS
    /* TODO i#3570: Add Windows support, which is complex as we cannot map a file
     * on top of an existing reservation; nor can we un-reserve a piece of a
     * reservation.  See the issue for solution ideas.
     */
    ASSERT_NOT_IMPLEMENTED(false && "i#3570");
    return NULL;
#endif
    vm_addr_t p = NULL;
    vm_heap_t *vmh = vmheap_for_which(which);
    ASSERT(size > 0);
    size = ALIGN_FORWARD(size, PAGE_SIZE);
    if (!DYNAMO_OPTION(vm_reserve))
        return NULL;
    if (preferred >= vmh->start_addr && preferred + size <= vmh->end_addr)
        p = vmm_heap_reserve_blocks(vmh, size, preferred, which);
    if (p == NULL)
        p = vmm_heap_reserve_blocks(vmh, size, NULL, which);
    LOG(GLOBAL, LOG_HEAP, 2, "%s %s: size=%d p=" PFX "\n", __FUNCTION__, vmh->name, size,
        p);
    return p;
}

/* Before calling this function, the caller must restore [p,p+size) to
 * its state from before heap_reserve_for_external_mapping() was
 * called: reserved but not committed.
 */
bool
heap_unreserve_for_external_mapping(byte *p, size_t size, which_vmm_t which)
{
#ifdef WINDOWS
    /* TODO i#3570: Add Windows support, which is complex as we cannot map a file
     * on top of an existing reservation; nor can we un-reserve a piece of a
     * reservation.  See the issue for solution ideas.
     */
    ASSERT_NOT_IMPLEMENTED(false && "i#3570");
    return false;
#endif
    vm_heap_t *vmh = vmheap_for_which(which);
    ASSERT(size > 0);
    size = ALIGN_FORWARD(size, PAGE_SIZE);
    if (!DYNAMO_OPTION(vm_reserve) || !is_vmm_reserved_address(p, size, NULL, NULL))
        return false;
    vmm_heap_free_blocks(vmh, p, size, which);
    LOG(GLOBAL, LOG_HEAP, 2, "%s %s: size=%d p=" PFX "\n", __FUNCTION__, vmh->name, size,
        p);
    return true;
}

/* we indirect all os memory requests through here so we have a central place
 * to handle the out-of-memory condition.
 * add_vm MUST be false iff this is heap memory, which is updated separately.
 */
static void *
get_real_memory(size_t size, uint prot, bool add_vm,
                which_vmm_t which _IF_DEBUG(const char *comment))
{
    void *p;
    heap_error_code_t error_code;
    /* must round up to page sizes, else vmm_heap_alloc assert triggers */
    size = ALIGN_FORWARD(size, PAGE_SIZE);

    /* memory alloc/dealloc and updating DR list must be atomic */
    dynamo_vm_areas_lock(); /* if already hold lock this is a nop */

    p = vmm_heap_alloc(size, prot, &error_code, which);
    if (p == NULL) {
        SYSLOG_INTERNAL_WARNING_ONCE("Out of memory -- cannot reserve or "
                                     "commit %dKB.  Trying to recover.",
                                     size / 1024);
        /* we should be ok here, shouldn't come in here holding global_alloc_lock
         * or heap_unit_lock w/o first having grabbed DR areas lock
         */
        ASSERT(safe_to_allocate_or_free_heap_units());
        heap_low_on_memory();
        fcache_low_on_memory();
        /* try again
         * FIXME: have more sophisticated strategy of freeing a little, then getting
         * more drastic with each subsequent failure
         * FIXME: can only free live fcache units for current thread w/ current
         * impl...should we wait a while and try again if out of memory, hoping
         * other threads have freed some?!?!
         */
        p = vmm_heap_alloc(size, prot, &error_code, which);
        if (p == NULL) {
            report_low_on_memory(which, OOM_RESERVE, error_code);
        }
        SYSLOG_INTERNAL_WARNING_ONCE("Out of memory -- but still alive after "
                                     "emergency free.");
    }

    account_for_memory(p, size, prot, add_vm, false _IF_DEBUG(comment));
    dynamo_vm_areas_unlock();

    return p;
}

static void
release_memory_and_update_areas(app_pc p, size_t size, bool decommit, bool remove_vm,
                                which_vmm_t which)
{
    heap_error_code_t error_code;
    /* these two operations need to be atomic wrt DR area updates */
    dynamo_vm_areas_lock(); /* if already hold lock this is a nop */
    /* ref case 3035, we must remove from dynamo_areas before we free in case
     * we end up allocating memory in the process of removing the area
     * (we don't want to end up getting the memory we just freed since that
     *  would lead to errors in the list when we finally did remove it)
     */
    update_dynamo_areas_on_release(p, p + size, remove_vm);
    if (decommit)
        vmm_heap_decommit(p, size, &error_code, which);
    else
        vmm_heap_free(p, size, &error_code, which);
    ASSERT(error_code == HEAP_ERROR_SUCCESS);
    dynamo_vm_areas_unlock();
}

/* remove_vm MUST be false iff this is heap memory, which is updated separately */
static void
release_real_memory(void *p, size_t size, bool remove_vm, which_vmm_t which)
{
    /* must round up to page sizes for vmm_heap_free */
    size = ALIGN_FORWARD(size, PAGE_SIZE);

    release_memory_and_update_areas((app_pc)p, size, false /*free*/, remove_vm, which);

    /* avoid problem w/ being called by cleanup_and_terminate after dynamo_process_exit */
    if (IF_DEBUG_ELSE(!dynamo_exited_log_and_stats, true))
        RSTATS_SUB(memory_capacity, size);
}

static void
extend_commitment(vm_addr_t p, size_t size, uint prot, bool initial_commit,
                  which_vmm_t which)
{
    heap_error_code_t error_code;
    ASSERT(ALIGNED(p, PAGE_SIZE));
    size = ALIGN_FORWARD(size, PAGE_SIZE);
    if (!vmm_heap_commit(p, size, prot, &error_code, which)) {
        SYSLOG_INTERNAL_WARNING_ONCE("Out of memory - cannot extend commit "
                                     "%dKB. Trying to recover.",
                                     size / 1024);
        heap_low_on_memory();
        fcache_low_on_memory();
        /* see low-memory ideas in get_real_memory */
        if (!vmm_heap_commit(p, size, prot, &error_code, which)) {
            report_low_on_memory(which, initial_commit ? OOM_COMMIT : OOM_EXTEND,
                                 error_code);
        }

        SYSLOG_INTERNAL_WARNING_ONCE("Out of memory in extend - still alive "
                                     "after emergency free.");
    }
}

/* A wrapper around get_real_memory that adds a guard page on each side of the
 * requested unit.  These should consume only uncommitted virtual address and
 * should not use any physical memory.
 * add_vm MUST be false iff this is heap memory, which is updated separately.
 * Non-NULL min_addr is only supported for stack allocations (DrMi#1723).
 */
static vm_addr_t
get_guarded_real_memory(size_t reserve_size, size_t commit_size, uint prot, bool add_vm,
                        bool guarded, byte *min_addr,
                        which_vmm_t which _IF_DEBUG(const char *comment))
{
    vm_addr_t p = NULL;
    uint guard_size = (uint)PAGE_SIZE;
    heap_error_code_t error_code;
    bool try_vmm = true;
    ASSERT(reserve_size >= commit_size);
    if (!guarded || !has_guard_pages(which)) {
        if (reserve_size == commit_size)
            return get_real_memory(reserve_size, prot, add_vm, which _IF_DEBUG(comment));
        guard_size = 0;
    }

    reserve_size = ALIGN_FORWARD(reserve_size, PAGE_SIZE);
    commit_size = ALIGN_FORWARD(commit_size, PAGE_SIZE);

    reserve_size += 2 * guard_size; /* add top and bottom guards */

    /* memory alloc/dealloc and updating DR list must be atomic */
    dynamo_vm_areas_lock(); /* if already hold lock this is a nop */

#ifdef WINDOWS
    /* DrMi#1723: if we swap TEB stack fields, a client (or a DR app mem touch)
     * can trigger an app guard
     * page.  We have to ensure that the kernel will update TEB.StackLimit in that
     * case, which requires our dstack to be higher than the app stack.
     * This results in more fragmentation and larger dynamo_areas so we avoid
     * if we can.  We could consider a 2nd vm_reserve region just for stacks.
     */
    if (SWAP_TEB_STACKBASE() && (!DYNAMO_OPTION(vm_reserve) && min_addr > NULL) ||
        (DYNAMO_OPTION(vm_reserve) && min_addr > vmheap_get_start())) {
        try_vmm = false;
    }
#endif

    if (try_vmm)
        p = vmm_heap_reserve(reserve_size, &error_code, TEST(MEMPROT_EXEC, prot), which);

#ifdef WINDOWS
    if (!try_vmm || p < (vm_addr_t)min_addr) {
        if (p != NULL)
            vmm_heap_free(p, reserve_size, &error_code, which);
        p = os_heap_reserve_in_region((void *)ALIGN_FORWARD(min_addr, PAGE_SIZE),
                                      (void *)PAGE_START(POINTER_MAX), reserve_size,
                                      &error_code, TEST(MEMPROT_EXEC, prot));
        /* No reason to update heap-reachable b/c stack doesn't need to reach
         * (min_addr != NULL assumed to be stack).
         */
        ASSERT(!DYNAMO_OPTION(stack_shares_gencode)); /* would break reachability */
        /* If it fails we can't do much: we fall back to within-vmm, if possible,
         * and rely on our other best-effort TEB.StackLimit updating checks
         * (check_app_stack_limit()).
         */
        if (p == NULL) {
            SYSLOG_INTERNAL_WARNING_ONCE("Unable to allocate dstack above app stack");
            if (!try_vmm) {
                p = vmm_heap_reserve(reserve_size, &error_code, TEST(MEMPROT_EXEC, prot),
                                     which);
            }
        }
    }
#endif

    if (p == NULL) {
        /* Very unlikely to happen: we have to reach at least 2GB reserved memory. */
        SYSLOG_INTERNAL_WARNING_ONCE("Out of memory - cannot reserve %dKB. "
                                     "Trying to recover.",
                                     reserve_size / 1024);
        heap_low_on_memory();
        fcache_low_on_memory();

        p = vmm_heap_reserve(reserve_size, &error_code, TEST(MEMPROT_EXEC, prot), which);
        if (p == NULL) {
            report_low_on_memory(which, OOM_RESERVE, error_code);
        }

        SYSLOG_INTERNAL_WARNING_ONCE("Out of memory on reserve - but still "
                                     "alive after emergency free.");
    }
    /* includes guard pages if add_vm -- else, heap_vmareas_synch_units() will
     * add guard pages in by assuming one page on each side of every heap unit
     * if dynamo_options.guard_pages
     */
    account_for_memory((void *)p, reserve_size, prot, add_vm, false _IF_DEBUG(comment));
    dynamo_vm_areas_unlock();

    STATS_ADD_PEAK(reserved_memory_capacity, reserve_size);
    STATS_ADD_PEAK(guard_pages, 2);

    p += guard_size;
    extend_commitment(p, commit_size, prot, true /* initial commit */, which);

    return p;
}

/* A wrapper around get_release_memory that also frees the guard pages on each
 * side of the requested unit.  remove_vm MUST be false iff this is heap memory,
 * which is updated separately.
 */
static void
release_guarded_real_memory(vm_addr_t p, size_t size, bool remove_vm, bool guarded,
                            which_vmm_t which)
{
    if (!guarded || !has_guard_pages(which)) {
        release_real_memory(p, size, remove_vm, which);
        return;
    }

    size = ALIGN_FORWARD(size, PAGE_SIZE);
    size += PAGE_SIZE * 2; /* add top and bottom guards */
    p -= PAGE_SIZE;

    release_memory_and_update_areas((app_pc)p, size, false /*free*/, remove_vm, which);

    /* avoid problem w/ being called by cleanup_and_terminate after dynamo_process_exit */
    if (IF_DEBUG_ELSE(!dynamo_exited_log_and_stats, true)) {
        RSTATS_SUB(memory_capacity, size);
        STATS_SUB(reserved_memory_capacity, size);
        STATS_ADD(guard_pages, -2);
    }
}

/* use heap_mmap to allocate large chunks of executable memory
 * it's mainly used to allocate our fcache units
 */
void *
heap_mmap_ex(size_t reserve_size, size_t commit_size, uint prot, bool guarded,
             which_vmm_t which)
{
    void *p = get_guarded_real_memory(reserve_size, commit_size, prot, true, guarded,
                                      NULL, which _IF_DEBUG("heap_mmap"));
#ifdef DEBUG_MEMORY
    if (TEST(MEMPROT_WRITE, prot))
        memset(vmm_get_writable_addr(p, which), HEAP_ALLOCATED_BYTE, commit_size);
#endif
    /* We rely on this for freeing _post_stack in absence of dcontext */
    ASSERT(!DYNAMO_OPTION(vm_reserve) || !DYNAMO_OPTION(stack_shares_gencode) ||
           (ptr_uint_t)p - (guarded ? (GUARD_PAGE_ADJUSTMENT / 2) : 0) ==
               ALIGN_BACKWARD(p, DYNAMO_OPTION(vmm_block_size)) ||
           at_reset_at_vmm_limit(vmheap_for_which(which)));
    LOG(GLOBAL, LOG_HEAP, 2, "heap_mmap: %d bytes [/ %d] @ " PFX "\n", commit_size,
        reserve_size, p);
    STATS_ADD_PEAK(mmap_capacity, commit_size);
    STATS_ADD_PEAK(mmap_reserved_only, (reserve_size - commit_size));
    return p;
}

/* Use heap_mmap to allocate large chunks of memory. */
void *
heap_mmap_reserve(size_t reserve_size, size_t commit_size, uint prot, which_vmm_t which)
{
    return heap_mmap_ex(reserve_size, commit_size, prot, true, which);
}

/* It is up to the caller to ensure commit_size is a page size multiple,
 * and that it does not extend beyond the initial reservation.
 */
void
heap_mmap_extend_commitment(void *p, size_t commit_size, which_vmm_t which)
{
    extend_commitment(p, commit_size, MEMPROT_EXEC | MEMPROT_READ | MEMPROT_WRITE,
                      false /*not initial commit*/, which);
    STATS_SUB(mmap_reserved_only, commit_size);
    STATS_ADD_PEAK(mmap_capacity, commit_size);
#ifdef DEBUG_MEMORY
    memset(vmm_get_writable_addr(p, which), HEAP_ALLOCATED_BYTE, commit_size);
#endif
}

/* De-commits from a committed region. */
void
heap_mmap_retract_commitment(void *retract_start, size_t decommit_size, which_vmm_t which)
{
    heap_error_code_t error_code;
    ASSERT(ALIGNED(decommit_size, PAGE_SIZE));
    vmm_heap_decommit(retract_start, decommit_size, &error_code, which);
    STATS_ADD(mmap_reserved_only, decommit_size);
    STATS_ADD_PEAK(mmap_capacity, -(stats_int_t)decommit_size);
}

/* Allocates executable memory in the same allocation region as this thread's
 * stack, to save address space (case 9474).
 * Doing so is only supported for -reachable_heap (but for x64 we don't have
 * private gencode in any case).
 */
void *
heap_mmap_reserve_post_stack(dcontext_t *dcontext, size_t reserve_size,
                             size_t commit_size, uint prot, which_vmm_t which)
{
    void *p;
    byte *stack_reserve_end = NULL;
    heap_error_code_t error_code;
    size_t available = 0;
    uint cur_prot = 0; /* avoid compiler warning */
    bool known_stack = false;
    vm_heap_t *vmh = vmheap_for_which(which);
    ASSERT(reserve_size > 0 && commit_size < reserve_size);
    /* 1.5 * guard page adjustment since we'll share the middle one */
    if (DYNAMO_OPTION(stack_size) + reserve_size + GUARD_PAGE_ADJUSTMENT +
            GUARD_PAGE_ADJUSTMENT / 2 >
        DYNAMO_OPTION(vmm_block_size)) {
        /* there's not enough room to share the allocation block, stack is too big */
        LOG(GLOBAL, LOG_HEAP, 1,
            "Not enough room to allocate 0x%08x bytes post stack "
            "of size 0x%08x\n",
            reserve_size, DYNAMO_OPTION(stack_size));
        return heap_mmap_reserve(reserve_size, commit_size, prot, which);
    }
    if (DYNAMO_OPTION(stack_shares_gencode) &&
        /* FIXME: we could support this w/o vm_reserve, or when beyond
         * the reservation, but we don't bother */
        DYNAMO_OPTION(vm_reserve) && dcontext != GLOBAL_DCONTEXT && dcontext != NULL) {
        stack_reserve_end = dcontext->dstack + GUARD_PAGE_ADJUSTMENT / 2;
#if defined(UNIX) && !defined(HAVE_MEMINFO)
        if (!dynamo_initialized) {
            /* memory info is not yet set up.  since so early we only support
             * post-stack if inside vmm (won't be true only for pathological
             * tiny vmm sizes)
             */
            if (vmm_is_reserved_unit(vmh, stack_reserve_end, reserve_size)) {
                known_stack = true;
                available = reserve_size;
            } else
                known_stack = false;
        } else
#elif defined(UNIX)
        /* the all_memory_areas list doesn't keep details inside vmheap */
        known_stack =
            get_memory_info_from_os(stack_reserve_end, NULL, &available, &cur_prot);
#else
        known_stack = get_memory_info(stack_reserve_end, NULL, &available, &cur_prot);
#endif
            /* If ever out of vmheap, then may have free space beyond stack,
             * which we could support but don't (see FIXME above) */
            ASSERT(out_of_vmheap_once ||
                   (known_stack && available >= reserve_size && cur_prot == 0));
    }
    if (!known_stack ||
        /* if -no_vm_reserve will short-circuit so no vmh deref danger */
        !vmm_in_same_block(vmh, dcontext->dstack,
                           /* we do want a guard page at the end */
                           stack_reserve_end + reserve_size) ||
        available < reserve_size) {
        ASSERT(!DYNAMO_OPTION(stack_shares_gencode) || !DYNAMO_OPTION(vm_reserve) ||
               out_of_vmheap_once);
        DOLOG(1, LOG_HEAP, {
            if (known_stack && available < reserve_size) {
                LOG(GLOBAL, LOG_HEAP, 1,
                    "heap_mmap_reserve_post_stack: avail %d < needed %d\n", available,
                    reserve_size);
            }
        });
        STATS_INC(mmap_no_share_stack_region);
        return heap_mmap_reserve(reserve_size, commit_size, prot, which);
    }
    ASSERT(DYNAMO_OPTION(vm_reserve));
    ASSERT(REACHABLE_HEAP());
    ASSERT(stack_reserve_end != NULL);
    /* memory alloc/dealloc and updating DR list must be atomic */
    dynamo_vm_areas_lock(); /* if already hold lock this is a nop */
    /* We share the stack's end guard page as our start guard page */
    if (vmm_is_reserved_unit(vmh, stack_reserve_end, reserve_size)) {
        /* Memory is already reserved with OS */
        p = stack_reserve_end;
    } else {
        p = os_heap_reserve(stack_reserve_end, reserve_size, &error_code, true /*+x*/);
#ifdef X64
        /* ensure future heap allocations are reachable from this allocation
         * (this will also verify that this region meets reachability requirements) */
        if (p != NULL)
            request_region_be_heap_reachable(p, reserve_size);
#endif
        if (p == NULL) {
            ASSERT_NOT_REACHED();
            LOG(GLOBAL, LOG_HEAP, 1,
                "heap_mmap_reserve_post_stack: reserve failed " PFX "\n", error_code);
            dynamo_vm_areas_unlock();
            STATS_INC(mmap_no_share_stack_region);
            return heap_mmap_reserve(reserve_size, commit_size, prot, which);
        }
        ASSERT(error_code == HEAP_ERROR_SUCCESS);
    }
    if (!vmm_heap_commit(p, commit_size, prot, &error_code, which)) {
        ASSERT_NOT_REACHED();
        LOG(GLOBAL, LOG_HEAP, 1, "heap_mmap_reserve_post_stack: commit failed " PFX "\n",
            error_code);
        if (!vmm_is_reserved_unit(vmh, stack_reserve_end, reserve_size)) {
            os_heap_free(p, reserve_size, &error_code);
            ASSERT(error_code == HEAP_ERROR_SUCCESS);
        }
        dynamo_vm_areas_unlock();
        STATS_INC(mmap_no_share_stack_region);
        return heap_mmap_reserve(reserve_size, commit_size, prot, which);
    }
    account_for_memory(p, reserve_size, prot, true /*add now*/,
                       false _IF_DEBUG("heap_mmap_reserve_post_stack"));
    dynamo_vm_areas_unlock();
    /* We rely on this for freeing in absence of dcontext */
    ASSERT((ptr_uint_t)p - GUARD_PAGE_ADJUSTMENT / 2 !=
           ALIGN_BACKWARD(p, DYNAMO_OPTION(vmm_block_size)));
#ifdef DEBUG_MEMORY
    memset(vmm_get_writable_addr(p, which), HEAP_ALLOCATED_BYTE, commit_size);
#endif
    LOG(GLOBAL, LOG_HEAP, 2, "heap_mmap w/ stack: %d bytes [/ %d] @ " PFX "\n",
        commit_size, reserve_size, p);
    STATS_ADD_PEAK(mmap_capacity, commit_size);
    STATS_ADD_PEAK(mmap_reserved_only, (reserve_size - commit_size));
    STATS_INC(mmap_share_stack_region);
    return p;
}

/* De-commits memory that was allocated in the same allocation region as this
 * thread's stack (case 9474).
 */
void
heap_munmap_post_stack(dcontext_t *dcontext, void *p, size_t reserve_size,
                       which_vmm_t which)
{
    /* We would require a valid dcontext and compare to the stack reserve end,
     * but on detach we have no dcontext, so we instead use block alignment.
     */
    DOCHECK(1, {
        if (dcontext != NULL && dcontext != GLOBAL_DCONTEXT &&
            DYNAMO_OPTION(vm_reserve) && DYNAMO_OPTION(stack_shares_gencode)) {
            bool at_stack_end = (p == dcontext->dstack + GUARD_PAGE_ADJUSTMENT / 2);
            bool at_block_start = ((ptr_uint_t)p - GUARD_PAGE_ADJUSTMENT / 2 ==
                                   ALIGN_BACKWARD(p, DYNAMO_OPTION(vmm_block_size)));
            ASSERT((at_stack_end && !at_block_start) ||
                   (!at_stack_end && at_block_start));
        }
    });
    if (!DYNAMO_OPTION(vm_reserve) || !DYNAMO_OPTION(stack_shares_gencode) ||
        (ptr_uint_t)p - GUARD_PAGE_ADJUSTMENT / 2 ==
            ALIGN_BACKWARD(p, DYNAMO_OPTION(vmm_block_size))) {
        heap_munmap(p, reserve_size, which);
    } else {
        /* Detach makes it a pain to pass in the commit size so
         * we use the reserve size, which works fine.
         */
        release_memory_and_update_areas((app_pc)p, reserve_size, true /*decommit*/,
                                        true /*update now*/, which);
        LOG(GLOBAL, LOG_HEAP, 2, "heap_munmap_post_stack: %d bytes @ " PFX "\n",
            reserve_size, p);
        STATS_SUB(mmap_capacity, reserve_size);
        STATS_SUB(mmap_reserved_only, reserve_size);
    }
}

/* Use heap_mmap to allocate large chunks of memory. */
void *
heap_mmap(size_t size, uint prot, which_vmm_t which)
{
    return heap_mmap_ex(size, size, prot, true, which);
}

/* free memory-mapped storage */
void
heap_munmap_ex(void *p, size_t size, bool guarded, which_vmm_t which)
{
#ifdef DEBUG_MEMORY
    /* can't set to HEAP_UNALLOCATED_BYTE since really not in our address
     * space anymore */
#endif
    release_guarded_real_memory((vm_addr_t)p, size, true /*update DR areas immediately*/,
                                guarded, which);

    DOSTATS({
        /* avoid problem w/ being called by cleanup_and_terminate after
         * dynamo_process_exit
         */
        if (!dynamo_exited_log_and_stats) {
            LOG(GLOBAL, LOG_HEAP, 2, "heap_munmap: %d bytes @ " PFX "\n", size, p);
            STATS_SUB(mmap_capacity, size);
            STATS_SUB(mmap_reserved_only, size);
        }
    });
}

/* free memory-mapped storage */
void
heap_munmap(void *p, size_t size, which_vmm_t which)
{
    heap_munmap_ex(p, size, true /*guarded*/, which);
}

/* use stack_alloc to build a stack -- it returns TOS
 * For -stack_guard_pages, also allocates an extra page
 * on the bottom and uses it to detect overflows when accessed.
 */
void *
stack_alloc(size_t size, byte *min_addr)
{
    void *p;
    /* we reserve and commit at once for now
     * FIXME case 2330: commit-on-demand could allow larger max sizes w/o
     * hurting us in the common case
     */
    size_t alloc_size = size;
    if (!has_guard_pages(VMM_STACK | VMM_PER_THREAD) && DYNAMO_OPTION(stack_guard_pages))
        alloc_size += PAGE_SIZE;
    p = get_guarded_real_memory(alloc_size, alloc_size, MEMPROT_READ | MEMPROT_WRITE,
                                true, true, min_addr,
                                VMM_STACK | VMM_PER_THREAD _IF_DEBUG("stack_alloc"));
    if (!has_guard_pages(VMM_STACK | VMM_PER_THREAD) && DYNAMO_OPTION(stack_guard_pages))
        p = (byte *)p + PAGE_SIZE;
#ifdef DEBUG_MEMORY
    memset(p, HEAP_ALLOCATED_BYTE, size);
#endif

    if (DYNAMO_OPTION(stack_guard_pages)) {
        /* XXX: maybe we should this option a count of how many pages, to catch
         * overflow that uses a large stride and skips over one page (UNIX-only
         * since Windows code always uses chkstk to trigger guard pages).
         */
        /* We place a guard on UNIX signal stacks too: although we can't report
         * such overflows, we'd rather have a clear crash than memory corruption
         * from clobbering whatever memory is below the stack.
         */
        /* mark the bottom page non-accessible to trap stack overflow */
        byte *guard = (byte *)p - PAGE_SIZE;
#ifdef WINDOWS
        /* Only a committed page can be a guard page. */
        /* XXX: this doesn't work well with -vm_reserve where the kernel will
         * auto-expand the stack into adjacent allocations below the stack.
         */
        heap_error_code_t error_code;
        if (vmm_heap_commit(guard, PAGE_SIZE, MEMPROT_READ | MEMPROT_WRITE, &error_code,
                            VMM_STACK | VMM_PER_THREAD))
            mark_page_as_guard(guard);
#else
        /* For UNIX we just mark it as inaccessible. */
        if (!has_guard_pages(VMM_STACK | VMM_PER_THREAD))
            set_protection(guard, PAGE_SIZE, MEMPROT_READ);
#endif
    }

    RSTATS_ADD_PEAK(stack_capacity, size);
    /* stack grows from high to low */
    return (void *)((ptr_uint_t)p + size);
}

/* free stack storage */
void
stack_free(void *p, size_t size)
{
    size_t alloc_size;
    if (size == 0)
        size = DYNAMORIO_STACK_SIZE;
    alloc_size = size;
    p = (void *)((vm_addr_t)p - size);
    if (!has_guard_pages(VMM_STACK | VMM_PER_THREAD) &&
        DYNAMO_OPTION(stack_guard_pages)) {
        alloc_size += PAGE_SIZE;
        p = (byte *)p - PAGE_SIZE;
    }
    release_guarded_real_memory((vm_addr_t)p, alloc_size,
                                true /*update DR areas immediately*/, true,
                                VMM_STACK | VMM_PER_THREAD);
    if (IF_DEBUG_ELSE(!dynamo_exited_log_and_stats, true))
        RSTATS_SUB(stack_capacity, size);
}

/* only checks d_r_initstack and current dcontext
 * does not check any dstacks on the callback stack (win32) */
bool
is_stack_overflow(dcontext_t *dcontext, byte *sp)
{
    /* ASSUMPTION: size of stack is DYNAMORIO_STACK_SIZE = dynamo_options.stack_size
     * Currently sideline violates that for a thread stack, and we have separated
     * -signal_stack_size, but all dstacks and d_r_initstack should be this size.
     */
    byte *bottom = dcontext->dstack - DYNAMORIO_STACK_SIZE;
    if (!DYNAMO_OPTION(stack_guard_pages) && !DYNAMO_OPTION(per_thread_guard_pages))
        return false;
    /* see if in bottom guard page of dstack */
    if (sp >= bottom - PAGE_SIZE && sp < bottom)
        return true;
    /* now check the d_r_initstack */
    bottom = d_r_initstack - DYNAMORIO_STACK_SIZE;
    if (sp >= bottom - PAGE_SIZE && sp < bottom)
        return true;
    return false;
}

byte *
d_r_map_file(file_t f, size_t *size INOUT, uint64 offs, app_pc addr, uint prot,
             map_flags_t map_flags)
{
    byte *view;
    /* memory alloc/dealloc and updating DR list must be atomic */
    dynamo_vm_areas_lock(); /* if already hold lock this is a nop */
    view = os_map_file(f, size, offs, addr, prot, map_flags);
    if (view != NULL) {
        STATS_ADD_PEAK(file_map_capacity, *size);
        account_for_memory((void *)view, *size, prot, true /*add now*/,
                           true /*image*/
                           _IF_DEBUG("map_file"));
    }
    dynamo_vm_areas_unlock();
    return view;
}

bool
d_r_unmap_file(byte *map, size_t size)
{
    bool success;
    ASSERT(map != NULL && ALIGNED(map, PAGE_SIZE));
    size = ALIGN_FORWARD(size, PAGE_SIZE);
    /* memory alloc/dealloc and updating DR list must be atomic */
    dynamo_vm_areas_lock(); /* if already hold lock this is a nop */
    success = os_unmap_file(map, size);
    if (success) {
        /* Only update the all_memory_areas on success.
         * It should still be atomic to the outside observers.
         */
        update_dynamo_areas_on_release(map, map + size, true /*remove now*/);
        STATS_SUB(file_map_capacity, size);
    }
    dynamo_vm_areas_unlock();
    return success;
}

/* We cannot incrementally keep dynamo vm area list up to date due to
 * circular dependencies bet vmareas and global heap (trust me, I've tried
 * to support it with reentrant routines and recursive locks, the hard part
 * is getting add_vm_area to be reentrant or to queue up adding areas,
 * I think this solution is much more elegant, plus it avoids race conditions
 * between DR memory allocation and the vmareas list by ensuring the list
 * is up to date at the exact time of each query).
 * Instead we on-demand walk the units.
 * Freed units can usually be removed incrementally, except when we
 * hold the heap_unit_lock when we run out of memory -- when we set
 * a flag telling the caller of this routine to remove all heap areas
 * from the vm list prior to calling us to add the real ones back in.
 * Re-adding everyone is the simplest policy, so we don't have to keep
 * track of who's been added.
 * The caller is assumed to hold the dynamo vm areas write lock.
 */
void
heap_vmareas_synch_units()
{
    heap_unit_t *u, *next;
    /* we again have circular dependence w/ vmareas if it happens to need a
     * new unit in the course of adding these areas, so we use a recursive lock!
     * furthermore, we need to own the global lock now, to avoid deadlock with
     * another thread who does global_alloc and then needs a new unit!
     * which means that the global_alloc lock must be recursive since vmareas
     * may need to global_alloc...
     */
    /* if chance could own both locks, must grab both now
     * always grab global_alloc first, then we won't have deadlocks
     */
    acquire_recursive_lock(&global_alloc_lock);
    acquire_recursive_lock(&heap_unit_lock);
    if (dynamo_areas_pending_remove) {
        dynamo_areas_pending_remove = false;
        remove_dynamo_heap_areas();

        /* When heap units are removed from the dynamo_area, they should be
         * marked so.  See case 4196.
         */
        for (u = heapmgt->heap.units; u != NULL; u = u->next_global)
            u->in_vmarea_list = false;
        for (u = heapmgt->heap.dead; u != NULL; u = u->next_global)
            u->in_vmarea_list = false;
    }
    for (u = heapmgt->heap.units; u != NULL; u = next) {
        /* Make sure to add any guard page on each side, as well. */
        uint offs = has_guard_pages(u->which) ? (uint)PAGE_SIZE : 0;
        app_pc start = (app_pc)u - offs;
        /* support un-aligned heap reservation end: PR 415269 (though as
         * part of that PR we shouldn't have un-aligned anymore)
         */
        app_pc end_align = (app_pc)ALIGN_FORWARD(UNIT_RESERVED_END(u), PAGE_SIZE);
        app_pc end = end_align + offs;
        /* u can be moved to dead list, so cache the next link; case 4196. */
        next = u->next_global;
        /* case 3045: areas inside the vmheap reservation are not added to the list */
        if (!u->in_vmarea_list &&
            !is_vmm_reserved_address(start, end - start, NULL, NULL)) {
            /* case 4196 if next is used by dynamo_vmareas then next
             * may become dead if vector is resized, then u should be
             * alive and u->next_global should be reset AFTER add  */
            bool next_may_die =
                /* keep breaking abstractions */
                is_dynamo_area_buffer(UNIT_GET_START_PC(next));
            /* dynamo_areas.buf vector may get resized and u can either
             * go to the dead unit list, or it can be released back to
             * the OS.  We'll mark it as being in vmarea list to avoid
             * re-adding when going through dead one's, and we'll mark
             * _before_ the potential free.  If dynamo_areas.buf is
             * freed back to the OS we'll have another iteration in
             * update_dynamo_vm_areas() until we get fully
             * synchronized, so we don't need to worry about the
             * inconsistency.
             */
            u->in_vmarea_list = true;
            add_dynamo_heap_vm_area(start, end, true, false _IF_DEBUG("heap unit"));
            /* NOTE: Since we could mark_dynamo_vm_areas_stale instead of adding to
             * it, we may lose prot info about this unit.
             * FIXME: Currently, this is done only at one place, which allocates unit
             * as MEMPROT_READ | MEMPROT_WRITE.  If other places are added, then this
             * needs to change.
             */
            update_all_memory_areas((app_pc)u, end_align, MEMPROT_READ | MEMPROT_WRITE,
                                    DR_MEMTYPE_DATA); /* unit */
            if (offs != 0) {
                /* guard pages */
                update_all_memory_areas((app_pc)u - offs, (app_pc)u, MEMPROT_NONE,
                                        DR_MEMTYPE_DATA);
                update_all_memory_areas(end_align, end, MEMPROT_NONE, DR_MEMTYPE_DATA);
            }
            if (next_may_die) {
                STATS_INC(num_vmareas_resize_synch);
                /* if next was potentially on dead row, then current
                 * should still be live and point to the next live
                 */
                next = u->next_global;
            }
        }
    }
    for (u = heapmgt->heap.dead; u != NULL; u = next) {
        uint offs = has_guard_pages(u->which) ? (uint)PAGE_SIZE : 0;
        app_pc start = (app_pc)u - offs;
        /* support un-aligned heap reservation end: PR 415269 (though as
         * part of that PR we shouldn't have un-aligned anymore)
         */
        app_pc end_align = (app_pc)ALIGN_FORWARD(UNIT_RESERVED_END(u), PAGE_SIZE);
        app_pc end = end_align + offs;
        /* u can be moved to live list, so cache the next link; case 4196. */
        next = u->next_global;
        /* case 3045: areas inside the vmheap reservation are not added to the list */
        if (!u->in_vmarea_list &&
            !is_vmm_reserved_address(start, end - start, NULL, NULL)) {
            u->in_vmarea_list = true;
            add_dynamo_heap_vm_area(start, end, true, false _IF_DEBUG("dead heap unit"));
            update_all_memory_areas((app_pc)u, end_align, MEMPROT_READ | MEMPROT_WRITE,
                                    DR_MEMTYPE_DATA); /* unit */
            if (offs != 0) {
                /* guard pages */
                update_all_memory_areas(start, (app_pc)u, MEMPROT_NONE, DR_MEMTYPE_DATA);
                update_all_memory_areas(end_align, end, MEMPROT_NONE, DR_MEMTYPE_DATA);
            }
            /* case 4196 if next was put back on live list for
             * dynamo_areas.buf vector, then next will no longer be a
             * valid iterator over dead list
             */
            /* keep breaking abstractions */
            if (is_dynamo_area_buffer(UNIT_GET_START_PC(next))) {
                STATS_INC(num_vmareas_resize_synch);
                ASSERT_NOT_TESTED();
                next = u->next_global;
            }
        }
    }
    release_recursive_lock(&heap_unit_lock);
    release_recursive_lock(&global_alloc_lock);
}

/* shared between global and global_unprotected */
static void *
common_global_heap_alloc(thread_units_t *tu, size_t size HEAPACCT(which_heap_t which))
{
#ifdef STATIC_LIBRARY
    if (standalone_library) {
        /* i#3316: Use regular malloc for better multi-thread performance and better
         * interoperability with tools like sanitizers.
         * We limit this to static DR b/c we can have a direct call to libc malloc
         * there and b/c that is the common use case for standalone mode these days.
         */
        return malloc(size);
    }
#endif
    void *p;
    acquire_recursive_lock(&global_alloc_lock);
    p = common_heap_alloc(tu, size HEAPACCT(which));
    release_recursive_lock(&global_alloc_lock);
    if (p == NULL) {
        /* circular dependence solution: we need to hold DR lock before
         * global alloc lock -- so we back out, grab it, and retry
         */
        dynamo_vm_areas_lock();
        acquire_recursive_lock(&global_alloc_lock);
        p = common_heap_alloc(tu, size HEAPACCT(which));
        release_recursive_lock(&global_alloc_lock);
        dynamo_vm_areas_unlock();
    }
    ASSERT(p != NULL);
    return p;
}

/* shared between global and global_unprotected */
static void
common_global_heap_free(thread_units_t *tu, void *p,
                        size_t size HEAPACCT(which_heap_t which))
{
#ifdef STATIC_LIBRARY
    if (standalone_library) {
        /* i#3316: Use regular malloc for better multi-thread performance and better
         * interoperability with tools like sanitizers.
         * We limit this to static DR b/c we can have a direct call to libc malloc
         * there and b/c that is the common use case for standalone mode these days.
         */
        free(p);
        return;
    }
#endif
    bool ok;
    if (p == NULL) {
        ASSERT(false && "attempt to free NULL");
        return;
    }

    acquire_recursive_lock(&global_alloc_lock);
    ok = common_heap_free(tu, p, size HEAPACCT(which));
    release_recursive_lock(&global_alloc_lock);
    if (!ok) {
        /* circular dependence solution: we need to hold DR lock before
         * global alloc lock -- so we back out, grab it, and retry
         */
        dynamo_vm_areas_lock();
        acquire_recursive_lock(&global_alloc_lock);
        ok = common_heap_free(tu, p, size HEAPACCT(which));
        release_recursive_lock(&global_alloc_lock);
        dynamo_vm_areas_unlock();
    }
    ASSERT(ok);
}

/* these functions use the global heap instead of a thread's heap: */
void *
global_heap_alloc(size_t size HEAPACCT(which_heap_t which))
{
    void *p;
    /* We pay the cost of this branch to support using DR's decode routines from the
     * regular DR library and not just drdecode, to support libraries that would use
     * drdecode but that also have to work with full DR (i#2499).
     */
    if (heapmgt == &temp_heapmgt &&
        /* We prevent recrusion by checking for a field that heap_init writes. */
        !heapmgt->global_heap_writable) {
        /* TODO i#2499: We have no control point currently to call standalone_exit().
         * We need to develop a solution with atexit() or ELF destructors or sthg.
         */
        standalone_init();
    }
    p = common_global_heap_alloc(&heapmgt->global_units, size HEAPACCT(which));
    ASSERT(p != NULL);
    LOG(GLOBAL, LOG_HEAP, 6, "\nglobal alloc: " PFX " (%d bytes)\n", p, size);
    return p;
}

void
global_heap_free(void *p, size_t size HEAPACCT(which_heap_t which))
{
    common_global_heap_free(&heapmgt->global_units, p, size HEAPACCT(which));
    LOG(GLOBAL, LOG_HEAP, 6, "\nglobal free: " PFX " (%d bytes)\n", p, size);
}

/* reallocate area
   allocates new_num elements of element_size
   if ptr is NULL acts like global_heap_alloc,
   copies an old_num elements of given size in the new area */
/* FIXME: do a heap_realloc and a special_heap_realloc too */
void *
global_heap_realloc(void *ptr, size_t old_num, size_t new_num,
                    size_t element_size HEAPACCT(which_heap_t which))
{
    void *new_area = global_heap_alloc(new_num * element_size HEAPACCT(which));
    if (ptr) {
        memcpy(new_area, ptr, (old_num < new_num ? old_num : new_num) * element_size);
        global_heap_free(ptr, old_num * element_size HEAPACCT(which));
    }
    return new_area;
}

/* size does not include guard pages (if any) and is reserved, but only
 * DYNAMO_OPTION(heap_commit_increment) is committed up front
 */
static heap_unit_t *
heap_create_unit(thread_units_t *tu, size_t size, bool must_be_new)
{
    heap_unit_t *u = NULL, *dead = NULL, *prev_dead = NULL;

    /* we do not restrict size to unit max as we have to make larger-than-max
     * units for oversized requests
     */

    /* modifying heap list and DR areas must be atomic, and must grab
     * DR area lock before heap_unit_lock
     */
    ASSERT(safe_to_allocate_or_free_heap_units());
    dynamo_vm_areas_lock();
    /* take from dead list if possible */
    acquire_recursive_lock(&heap_unit_lock);

    /* FIXME: need to unprotect units that we're going to perform
     * {next,prev}_global assignments too -- but need to know whether
     * to re-protect -- do all at once, or each we need?  add a writable
     * flag to heap_unit_t?
     */

    if (!must_be_new) {
        for (dead = heapmgt->heap.dead; dead != NULL &&
             (UNIT_RESERVED_SIZE(dead) < size || dead->which != tu->which);
             prev_dead = dead, dead = dead->next_global)
            ; /* nothing */
    }
    if (dead != NULL) {
        if (prev_dead == NULL)
            heapmgt->heap.dead = dead->next_global;
        else
            prev_dead->next_global = dead->next_global;
        u = dead;
        heapmgt->heap.num_dead--;
        RSTATS_DEC(heap_num_free);
        release_recursive_lock(&heap_unit_lock);
        LOG(GLOBAL, LOG_HEAP, 2,
            "Re-using dead heap unit: " PFX "-" PFX " %d KB (need %d KB)\n", u,
            ((byte *)u) + size, UNIT_RESERVED_SIZE(u) / 1024, size / 1024);
    } else {
        size_t commit_size = DYNAMO_OPTION(heap_commit_increment);
        release_recursive_lock(&heap_unit_lock); /* do not hold while asking for memory */
        /* create new unit */
        /* Just cap the commit size to this unit's size, to support raising the
         * commit size w/o setting a lot of different unit size parameters.
         */
        if (commit_size > size)
            commit_size = size;
        u = (heap_unit_t *)get_guarded_real_memory(size, commit_size,
                                                   MEMPROT_READ | MEMPROT_WRITE, false,
                                                   true, NULL, tu->which _IF_DEBUG(""));
        /* FIXME: handle low memory conditions by freeing units, + fcache units? */
        ASSERT(u);
        LOG(GLOBAL, LOG_HEAP, 2, "New heap unit: " PFX "-" PFX "\n", u,
            ((byte *)u) + size);
        /* u is kept at top of unit itself, so displace start pc */
        u->start_pc = (heap_pc)(((ptr_uint_t)u) + sizeof(heap_unit_t));
        u->end_pc = ((heap_pc)u) + commit_size;
        u->reserved_end_pc = ((heap_pc)u) + size;
        u->in_vmarea_list = false;
        u->which = tu->which;
        RSTATS_ADD_PEAK(heap_capacity, commit_size);
        /* FIXME: heap sizes are not always page-aligned so stats will be off */
        STATS_ADD_PEAK(heap_reserved_only, (u->reserved_end_pc - u->end_pc));
    }
    RSTATS_ADD_PEAK(heap_num_live, 1);

    ASSERT(u->which == tu->which);
    u->cur_pc = u->start_pc;
    u->next_local = NULL;
    DODEBUG({
        u->id = tu->num_units;
        tu->num_units++;
    });

    acquire_recursive_lock(&heap_unit_lock);
    u->next_global = heapmgt->heap.units;
    if (heapmgt->heap.units != NULL)
        heapmgt->heap.units->prev_global = u;
    u->prev_global = NULL;
    heapmgt->heap.units = u;
    release_recursive_lock(&heap_unit_lock);
    dynamo_vm_areas_unlock();

#ifdef DEBUG_MEMORY
    DOCHECK(CHKLVL_MEMFILL,
            memset(u->start_pc, HEAP_UNALLOCATED_BYTE, u->end_pc - u->start_pc););
#endif
    return u;
}

/* dcontext only used to determine whether a global unit or not */
static void
heap_free_unit(heap_unit_t *unit, dcontext_t *dcontext)
{
    heap_unit_t *u, *prev_u;
#ifdef DEBUG_MEMORY
    /* Unit should already be set to all HEAP_UNALLOCATED by the individual
     * frees and the free list cleanup, verify. */
    /* NOTE - this assert fires if any memory in the unit wasn't freed. This
     * would include memory allocated ACCT_TOMBSTONE (which we don't currently
     * use). Using ACCT_TOMBSTONE is dangerous since we will still free the
     * unit here (say at proc or thread exit) even if there are ACCT_TOMBSTONE
     * allocations in it. */
    /* Note, this memset check is done only on the special heap unit header,
     * not on the unit itself - FIXME: case 10434.  Maybe we should embed the
     * special heap unit header in the first special heap unit itself. */
    /* The hotp_only leak relaxation below is for case 9588 & 9593.  */
    DOCHECK(CHKLVL_MEMFILL, {
        CLIENT_ASSERT(IF_HOTP(hotp_only_contains_leaked_trampoline(
                                  unit->start_pc, unit->end_pc - unit->start_pc) ||)
                              /* i#157: private loader => system lib allocs come here =>
                               * they don't always clean up.  we have to relax here, but
                               * our threadunits_exit checks should find all leaks anyway.
                               */
                              heapmgt->global_units.acct.cur_usage[ACCT_LIBDUP] > 0 ||
                          is_region_memset_to_char(unit->start_pc,
                                                   unit->end_pc - unit->start_pc,
                                                   HEAP_UNALLOCATED_BYTE)
                          /* don't assert when client does premature exit as it's
                           * hard for Extension libs, etc. to clean up in such situations
                           */
                          || client_requested_exit,
                      "memory leak detected");
    });
#endif
    /* modifying heap list and DR areas must be atomic, and must grab
     * DR area lock before heap_unit_lock
     */
    ASSERT(safe_to_allocate_or_free_heap_units());
    dynamo_vm_areas_lock();
    acquire_recursive_lock(&heap_unit_lock);

    /* FIXME: need to unprotect units that we're going to perform
     * {next,prev}_global assignments too -- but need to know whether
     * to re-protect -- do all at once, or each we need?  add a writable
     * flag to heap_unit_t?
     */

    /* remove from live list */
    if (unit->prev_global != NULL) {
        unit->prev_global->next_global = unit->next_global;
    } else
        heapmgt->heap.units = unit->next_global;
    if (unit->next_global != NULL) {
        unit->next_global->prev_global = unit->prev_global;
    }
    /* prev_global is not used in the dead list */
    unit->prev_global = NULL;
    RSTATS_DEC(heap_num_live);

    /* heuristic: don't keep around more dead units than max(5, 1/4 num threads)
     * FIXME: share the policy with the fcache dead unit policy
     * also, don't put special larger-than-max units on free list -- though
     * we do now have support for doing so (after PR 415269)
     */
    if (UNITALLOC(unit) <= HEAP_UNIT_MAX_SIZE &&
        (heapmgt->heap.num_dead < 5 ||
         heapmgt->heap.num_dead * 4U <= (uint)d_r_get_num_threads())) {
        /* Keep dead list sorted small-to-large to avoid grabbing large
         * when can take small and then needing to allocate when only
         * have small left.  Helps out with lots of small threads.
         */
        for (u = heapmgt->heap.dead, prev_u = NULL;
             u != NULL && UNIT_RESERVED_SIZE(u) < UNIT_RESERVED_SIZE(unit);
             prev_u = u, u = u->next_global)
            ;
        if (prev_u == NULL) {
            unit->next_global = heapmgt->heap.dead;
            heapmgt->heap.dead = unit;
        } else {
            unit->next_global = u;
            prev_u->next_global = unit;
        }
        heapmgt->heap.num_dead++;
        release_recursive_lock(&heap_unit_lock);
        RSTATS_ADD_PEAK(heap_num_free, 1);
    } else {
        /* don't need to hold this while freeing since still hold DR areas lock */
        release_recursive_lock(&heap_unit_lock);
        LOG(GLOBAL, LOG_HEAP, 1,
            "\tfreeing excess dead unit " PFX "-" PFX " [-" PFX "]\n", unit,
            UNIT_COMMIT_END(unit), UNIT_RESERVED_END(unit));
        really_free_unit(unit);
    }
    /* FIXME: shrink lock-held path if we see contention */
    dynamo_vm_areas_unlock();
}

#ifdef DEBUG_MEMORY
static heap_unit_t *
find_heap_unit(thread_units_t *tu, heap_pc p, size_t size)
{
    /* FIXME (case 6198): this is a perf hit in debug builds.  But, we can't use
     * a new vmvector b/c of circular dependences.  Proposal: use custom data
     * field of vm_area_t in dynamo_areas list for heap entries to store a pointer
     * to the heap_unit_t struct, and add a backpointer to the owning thread_units_t
     * in heap_unit_t.  Then have to make sure it's ok lock-wise to query the
     * dynamo_areas in the middle of an alloc or a free.  It should be but for
     * global alloc and free we will have to grab the dynamo_areas lock up front
     * every time instead of the very rare times now when we need a new unit.
     */
    heap_unit_t *unit;
    ASSERT(!POINTER_OVERFLOW_ON_ADD(p, size)); /* should not overflow */
    for (unit = tu->top_unit;
         unit != NULL && (p < unit->start_pc || p + size > unit->end_pc);
         unit = unit->next_local)
        ;
    return unit;
}
#endif

static void
threadunits_init(dcontext_t *dcontext, thread_units_t *tu, size_t size, bool reachable)
{
    int i;
    DODEBUG({ tu->num_units = 0; });
    tu->which = VMM_HEAP | (reachable ? VMM_REACHABLE : 0);
    if (dcontext != GLOBAL_DCONTEXT) {
        /* Tradeoff (i#4424): no guard pages on per-thread units, to
         * save space for many-threaded apps.  These units are rarely used.
         * Note that this also precludes sharing dead units between thread-private
         * and shared heaps due to the different "which" value.
         */
        tu->which |= VMM_PER_THREAD;
    }
    tu->top_unit = heap_create_unit(tu, size, false /*can reuse*/);
    tu->cur_unit = tu->top_unit;
    tu->dcontext = dcontext;
    tu->writable = true;
#ifdef HEAP_ACCOUNTING
    memset(&tu->acct, 0, sizeof(tu->acct));
#endif
    for (i = 0; i < BLOCK_TYPES; i++)
        tu->free_list[i] = NULL;
}

#ifdef HEAP_ACCOUNTING
#    define MAX_5_DIGIT 99999
static void
print_tu_heap_statistics(thread_units_t *tu, file_t logfile, const char *prefix)
{
    int i;
    size_t total = 0, cur = 0;
    LOG(logfile, LOG_HEAP | LOG_STATS, 1, "%s heap breakdown:\n", prefix);
    for (i = 0; i < ACCT_LAST; i++) {
        /* print out cur since this is done periodically, not just at end */
        LOG(logfile, LOG_HEAP | LOG_STATS, 1,
            "%12s: cur=%5" SZFC "K, max=%5" SZFC "K, #=%7d, 1=", whichheap_name[i],
            tu->acct.cur_usage[i] / 1024, tu->acct.max_usage[i] / 1024,
            tu->acct.num_alloc[i]);
        if (tu->acct.max_single[i] <= MAX_5_DIGIT)
            LOG(logfile, LOG_HEAP | LOG_STATS, 1, "%5" SZFC, tu->acct.max_single[i]);
        else {
            LOG(logfile, LOG_HEAP | LOG_STATS, 1, "%4" SZFC "K",
                tu->acct.max_single[i] / 1024);
        }
        LOG(logfile, LOG_HEAP | LOG_STATS, 1, ", new=%5" SZFC "K, re=%5" SZFC "K\n",
            tu->acct.alloc_new[i] / 1024, tu->acct.alloc_reuse[i] / 1024);
        total += tu->acct.max_usage[i];
        cur += tu->acct.cur_usage[i];
    }
    LOG(logfile, LOG_HEAP | LOG_STATS, 1, "Total cur usage: %6" SZFC " KB\n", cur / 1024);
    LOG(logfile, LOG_HEAP | LOG_STATS, 1,
        "Total max (not nec. all used simult.): %6" SZFC " KB\n", total / 1024);
}

void
print_heap_statistics()
{
    /* just do cur thread, don't try to walk all threads */
    dcontext_t *dcontext = get_thread_private_dcontext();
    DOSTATS({
        uint i;
        LOG(GLOBAL, LOG_STATS, 1, "Heap bucket usage counts and wasted memory:\n");
        for (i = 0; i < BLOCK_TYPES; i++) {
            LOG(GLOBAL, LOG_STATS | LOG_HEAP, 1,
                "%2d %3d count=%9u peak_count=%9u peak_wasted=%9u peak_align=%9u\n", i,
                BLOCK_SIZES[i], block_total_count[i], block_peak_count[i],
                block_peak_wasted[i], block_peak_align_pad[i]);
        }
    });
    if (dcontext != NULL) {
        thread_heap_t *th = (thread_heap_t *)dcontext->heap_field;
        if (th != NULL) { /* may not be initialized yet */
            print_tu_heap_statistics(th->local_heap, THREAD, "Thread");
            ASSERT(th->nonpersistent_heap != NULL);
            print_tu_heap_statistics(th->nonpersistent_heap, THREAD,
                                     "Thread non-persistent");
            if (!REACHABLE_HEAP()) { /* If off, all heap is reachable. */
                ASSERT(th->reachable_heap != NULL);
                print_tu_heap_statistics(th->reachable_heap, THREAD, "Thread reachable");
            }
        }
    }
    print_tu_heap_statistics(&heapmgt->global_nonpersistent_units, GLOBAL,
                             "Non-persistent global units");
    if (!REACHABLE_HEAP()) { /* If off, all heap is reachable. */
        print_tu_heap_statistics(&heapmgt->global_reachable_units, GLOBAL,
                                 "Reachable global units");
    }
    print_tu_heap_statistics(&heapmgt->global_unprotected_units, GLOBAL,
                             "Uprotected global units");
    print_tu_heap_statistics(&global_racy_units, GLOBAL, "Racy Up-to-date Process");
    print_tu_heap_statistics(&heapmgt->global_units, GLOBAL,
                             "Updated-at-end Process (max is total of maxes)");
}

static void
add_heapacct_to_global_stats(heap_acct_t *acct)
{
    /* add this thread's stats to the accurate (non-racy) global stats
     * FIXME: this gives a nice in-one-place total, but loses the
     * global-heap-only stats -- perhaps should add a total_units stats
     * to capture total and leave global alone here?
     */
    uint i;
    acquire_recursive_lock(&global_alloc_lock);
    for (i = 0; i < ACCT_LAST; i++) {
        heapmgt->global_units.acct.alloc_reuse[i] += acct->alloc_reuse[i];
        heapmgt->global_units.acct.alloc_new[i] += acct->alloc_new[i];
        heapmgt->global_units.acct.cur_usage[i] += acct->cur_usage[i];
        /* FIXME: these maxes are now not simultaneous max but sum-of-maxes */
        heapmgt->global_units.acct.max_usage[i] += acct->max_usage[i];
        heapmgt->global_units.acct.max_single[i] += acct->max_single[i];
        heapmgt->global_units.acct.num_alloc[i] += acct->num_alloc[i];
    }
    release_recursive_lock(&global_alloc_lock);
}
#endif

/* dcontext only used for debugging */
static void
threadunits_exit(thread_units_t *tu, dcontext_t *dcontext)
{
    heap_unit_t *u, *next_u;
#ifdef DEBUG
    size_t total_heap_used = 0;
#    ifdef HEAP_ACCOUNTING
    int j;
#    endif
#endif
#ifdef DEBUG_MEMORY
    /* verify and clear (for later asserts) the free list */
    uint i;
    for (i = 0; i < BLOCK_TYPES; i++) {
        heap_pc p, next_p;
        for (p = tu->free_list[i]; p != NULL; p = next_p) {
            next_p = *(heap_pc *)p;
            /* clear the pointer to the next free for later asserts */
            *(heap_pc *)p = (heap_pc)HEAP_UNALLOCATED_PTR_UINT;
            DOCHECK(CHKLVL_MEMFILL, {
                if (i < BLOCK_TYPES - 1) {
                    CLIENT_ASSERT(is_region_memset_to_char(p, BLOCK_SIZES[i],
                                                           HEAP_UNALLOCATED_BYTE),
                                  "memory corruption detected");
                } else {
                    /* variable sized blocks */
                    CLIENT_ASSERT(is_region_memset_to_char(p, VARIABLE_SIZE(p),
                                                           HEAP_UNALLOCATED_BYTE),
                                  "memory corruption detected");
                    /* clear the header for later asserts */
                    MEMSET_HEADER(p, HEAP_UNALLOCATED);
                }
            });
        }
        tu->free_list[i] = NULL;
    }
#endif
    u = tu->top_unit;
    while (u != NULL) {
        DOLOG(1, LOG_HEAP | LOG_STATS, {
            size_t num_used = u->cur_pc - u->start_pc;
            total_heap_used += num_used;
            LOG(THREAD, LOG_HEAP | LOG_STATS, 1,
                "Heap unit %d @" PFX "-" PFX " [-" PFX "] (" SZFMT " [/" SZFMT
                "] KB): used " SZFMT " bytes\n",
                u->id, u, UNIT_COMMIT_END(u), UNIT_RESERVED_END(u),
                (UNIT_COMMIT_SIZE(u)) / 1024, (UNIT_RESERVED_SIZE(u)) / 1024, num_used);
        });
        next_u = u->next_local;
        heap_free_unit(u, dcontext);
        u = next_u;
    }
    LOG(THREAD, LOG_HEAP | LOG_STATS, 1, "\tTotal heap used: " SZFMT " KB\n",
        total_heap_used / 1024);
#if defined(DEBUG) && defined(HEAP_ACCOUNTING)
    /* FIXME: separate scopes: smaller functions for DEBUG_MEMORY x HEAP_ACCOUNTING */
    for (j = 0; j < ACCT_LAST; j++) {
        size_t usage = tu->acct.cur_usage[j];
        if (usage > 0) {
            LOG(THREAD, LOG_HEAP | LOG_STATS, 1,
                "WARNING: %s " SZFMT " bytes not freed!\n", whichheap_name[j],
                tu->acct.cur_usage[j]);

#    ifdef HOT_PATCHING_INTERFACE /* known leaks for case 9593 */
            if (DYNAMO_OPTION(hotp_only) &&
                ((j == ACCT_SPECIAL && usage == (size_t)hotp_only_tramp_bytes_leaked) ||
                 /* +4 is for the allocation's header; internal to heap mgt. */
                 (j == ACCT_MEM_MGT &&
                  usage == (size_t)(get_special_heap_header_size() + 4) &&
                  hotp_only_tramp_bytes_leaked > 0)))
                continue;
#    endif
            if (j != ACCT_TOMBSTONE /* known leak */ &&
                /* i#157: private loader => system lib allocs come here =>
                 * they don't always clean up
                 */
                j != ACCT_LIBDUP && INTERNAL_OPTION(heap_accounting_assert)) {
                SYSLOG_INTERNAL_ERROR("memory leak: %s " SZFMT " bytes not freed",
                                      whichheap_name[j], tu->acct.cur_usage[j]);
                /* Don't assert when client does premature exit as it's
                 * hard for Extension libs, etc. to clean up in such situations:
                 */
                CLIENT_ASSERT(client_requested_exit || false, "memory leak detected");
            }
        }
    }
    if (tu != &heapmgt->global_units)
        add_heapacct_to_global_stats(&tu->acct);

    DOLOG(1, LOG_HEAP | LOG_STATS, {
        print_tu_heap_statistics(tu, THREAD,
                                 dcontext == GLOBAL_DCONTEXT ? "Process" : "Thread");
    });
#endif /* defined(DEBUG) && defined(HEAP_ACCOUNTING) */
}

void
heap_thread_reset_init(dcontext_t *dcontext)
{
    thread_heap_t *th = (thread_heap_t *)dcontext->heap_field;
    ASSERT(th->nonpersistent_heap != NULL);
    threadunits_init(dcontext, th->nonpersistent_heap,
                     DYNAMO_OPTION(initial_heap_nonpers_size), false);
}

void
heap_thread_init(dcontext_t *dcontext)
{
    thread_heap_t *th =
        (thread_heap_t *)global_heap_alloc(sizeof(thread_heap_t) HEAPACCT(ACCT_MEM_MGT));
    dcontext->heap_field = (void *)th;
    th->local_heap = (thread_units_t *)global_heap_alloc(sizeof(thread_units_t)
                                                             HEAPACCT(ACCT_MEM_MGT));
    threadunits_init(dcontext, th->local_heap, HEAP_UNIT_MIN_SIZE, false);
    th->nonpersistent_heap = (thread_units_t *)global_heap_alloc(
        sizeof(thread_units_t) HEAPACCT(ACCT_MEM_MGT));
    if (!REACHABLE_HEAP()) { /* If off, all heap is reachable. */
        th->reachable_heap = (thread_units_t *)global_heap_alloc(
            sizeof(thread_units_t) HEAPACCT(ACCT_MEM_MGT));
        threadunits_init(dcontext, th->reachable_heap, HEAP_UNIT_MIN_SIZE, true);
    } else
        th->reachable_heap = NULL;
    heap_thread_reset_init(dcontext);
#ifdef UNIX
    th->fork_copy_start = NULL;
    th->fork_copy_size = 0;
#endif
}

void
heap_thread_reset_free(dcontext_t *dcontext)
{
    thread_heap_t *th = (thread_heap_t *)dcontext->heap_field;
    ASSERT(th->nonpersistent_heap != NULL);
    /* FIXME: free directly rather than sending to dead list for
     * heap_reset_free() to free!
     * FIXME: for reset, don't free last unit so don't have to
     * recreate in reset_init()
     */
    threadunits_exit(th->nonpersistent_heap, dcontext);
}

void
heap_thread_exit(dcontext_t *dcontext)
{
    thread_heap_t *th = (thread_heap_t *)dcontext->heap_field;
    threadunits_exit(th->local_heap, dcontext);
    heap_thread_reset_free(dcontext);
    global_heap_free(th->local_heap, sizeof(thread_units_t) HEAPACCT(ACCT_MEM_MGT));
    ASSERT(th->nonpersistent_heap != NULL);
    global_heap_free(th->nonpersistent_heap,
                     sizeof(thread_units_t) HEAPACCT(ACCT_MEM_MGT));
    if (!REACHABLE_HEAP()) { /* If off, all heap is reachable. */
        ASSERT(th->reachable_heap != NULL);
        threadunits_exit(th->reachable_heap, dcontext);
        global_heap_free(th->reachable_heap,
                         sizeof(thread_units_t) HEAPACCT(ACCT_MEM_MGT));
    }
    global_heap_free(th, sizeof(thread_heap_t) HEAPACCT(ACCT_MEM_MGT));
}

#if defined(DEBUG_MEMORY) && defined(DEBUG)
void
print_free_list(thread_units_t *tu, int i)
{
    void *p;
    int len = 0;
    dcontext_t *dcontext = tu->dcontext;
    LOG(THREAD, LOG_HEAP, 1, "Free list for size %d (== %d bytes):\n", i, BLOCK_SIZES[i]);
    p = (void *)tu->free_list[i];
    while (p != NULL) {
        LOG(THREAD, LOG_HEAP, 1, "\tp = " PFX "\n", p);
        len++;
        p = *((char **)p);
    }
    LOG(THREAD, LOG_HEAP, 1, "Total length is %d\n", len);
}
#endif

/* Used for both heap_unit_t and special_heap_unit_t.
 * Returns the amount it increased the unit by, so caller should increment
 * end_pc.
 * Both end_pc and reserved_end_pc are assumed to be open-ended!
 */
static size_t
common_heap_extend_commitment(heap_pc cur_pc, heap_pc end_pc, heap_pc reserved_end_pc,
                              size_t size_need, uint prot, which_vmm_t which)
{
    if (end_pc < reserved_end_pc && !POINTER_OVERFLOW_ON_ADD(cur_pc, size_need)) {
        /* extend commitment if have more reserved */
        size_t commit_size = DYNAMO_OPTION(heap_commit_increment);
        /* simpler to just not support taking very last page in address space */
        if (POINTER_OVERFLOW_ON_ADD(end_pc, commit_size))
            return 0;
        if (cur_pc + size_need > end_pc + commit_size) {
            commit_size =
                ALIGN_FORWARD(cur_pc + size_need - (ptr_uint_t)end_pc, PAGE_SIZE);
        }
        if (end_pc + commit_size > reserved_end_pc ||
            POINTER_OVERFLOW_ON_ADD(end_pc,
                                    commit_size) /*overflow seen in PR 518644 */) {
            /* commit anyway before caller moves on to new unit so that
             * we keep an invariant that all units but the current one
             * are fully committed, so our algorithm for looking at the end
             * of prior units holds
             */
            commit_size = reserved_end_pc - end_pc;
        }
        ASSERT(!POINTER_OVERFLOW_ON_ADD(end_pc, commit_size) &&
               end_pc + commit_size <= reserved_end_pc);
        extend_commitment(end_pc, commit_size, prot, false /* extension */, which);
#ifdef DEBUG_MEMORY
        memset(vmcode_get_writable_addr(end_pc), HEAP_UNALLOCATED_BYTE, commit_size);
#endif
        /* caller should do end_pc += commit_size */
        RSTATS_ADD_PEAK(heap_capacity, commit_size);
        /* FIXME: heap sizes are not always page-aligned so stats will be off */
        STATS_SUB(heap_reserved_only, commit_size);
        ASSERT(end_pc <= reserved_end_pc);
        return commit_size;
    } else
        return 0;
}

static void
heap_unit_extend_commitment(heap_unit_t *u, size_t size_need, uint prot)
{
    u->end_pc += common_heap_extend_commitment(u->cur_pc, u->end_pc, u->reserved_end_pc,
                                               size_need, prot, u->which);
}

/* allocate storage on the DR heap
 * returns NULL iff caller needs to grab dynamo_vm_areas_lock() and retry
 */
static void *
common_heap_alloc(thread_units_t *tu, size_t size HEAPACCT(which_heap_t which))
{
    heap_unit_t *u = tu->cur_unit;
    heap_pc p = NULL;
    int bucket = 0;
    size_t alloc_size, aligned_size;
#if defined(DEBUG_MEMORY) && defined(DEBUG)
    size_t check_alloc_size;
    dcontext_t *dcontext = tu->dcontext;
    /* DrMem i#999: private libs can be heap-intensive and our checks here
     * can have a prohibitive perf cost!
     */
    uint chklvl = CHKLVL_MEMFILL + (IF_HEAPACCT_ELSE(which == ACCT_LIBDUP ? 1 : 0, 0));
    ASSERT_CURIOSITY(which != ACCT_TOMBSTONE &&
                     "Do you really need to use ACCT_TOMBSTONE? (potentially dangerous)");
#endif
    ASSERT(size > 0); /* we don't want to pay check cost in release */
    ASSERT(size < MAX_VALID_HEAP_ALLOCATION && "potential integer overflow");
    /* we prefer to crash than having heap overflows */
    if (size > MAX_VALID_HEAP_ALLOCATION) {
        /* This routine can currently accommodate without integer
         * overflows sizes up to UINT_MAX - sizeof(heap_unit_t), but
         * INT_MAX should be more than enough.
         *
         * Caller will likely crash, but that is better than a heap
         * overflow, where a crash would be the best we can hope for.
         */
        return NULL;
    }

    /* NOTE - all of our buckets are sized to preserve alignment, so this can't change
     * which bucket is used. */
    aligned_size = ALIGN_FORWARD(size, HEAP_ALIGNMENT);
    while (aligned_size > BLOCK_SIZES[bucket])
        bucket++;
    if (bucket == BLOCK_TYPES - 1)
        alloc_size = aligned_size + HEADER_SIZE;
    else
        alloc_size = BLOCK_SIZES[bucket];
    ASSERT(size <= alloc_size);
#ifdef DEBUG_MEMORY
    /* case 10292: use original calculated size for later check */
    check_alloc_size = alloc_size;
#endif
    if (alloc_size > MAXROOM) {
        /* too big for normal unit, build a special unit just for this allocation */
        /* don't need alloc_size or even aligned_size, just need size */
        heap_unit_t *new_unit, *prev;
        /* we page-align to avoid wasting space if unit gets reused later */
        size_t unit_size = ALIGN_FORWARD(size + sizeof(heap_unit_t), PAGE_SIZE);
        ASSERT(size < unit_size && "overflow");

        if (!safe_to_allocate_or_free_heap_units()) {
            /* circular dependence solution: we need to hold DR lock before
             * global alloc lock -- so we back out, grab it, and then come back
             */
            return NULL;
        }

        /* Can reuse a dead unit if large enough: we'll just not use any
         * excess size until this is freed and put back on dead list.
         * (Currently we don't put oversized units on dead list though.)
         */
        new_unit = heap_create_unit(tu, unit_size, false /*can be reused*/);
        /* we want to commit the whole alloc right away */
        heap_unit_extend_commitment(new_unit, size, MEMPROT_READ | MEMPROT_WRITE);
        prev = tu->top_unit;
        alloc_size = size; /* should we include page-alignment? */
        /* insert prior to cur unit (new unit will be full, so keep cur unit
         * where it is)
         */
        while (prev != u && prev->next_local != u) {
            ASSERT(prev != NULL && prev->next_local != NULL);
            prev = prev->next_local;
        }
        if (prev == u) {
            ASSERT(prev == tu->top_unit);
            tu->top_unit = new_unit;
        } else
            prev->next_local = new_unit;
        new_unit->next_local = u;
#ifdef DEBUG_MEMORY
        LOG(THREAD, LOG_HEAP, 3, "\tCreating new oversized heap unit %d (%d [/%d] KB)\n",
            new_unit->id, UNIT_COMMIT_SIZE(new_unit) / 1024,
            UNIT_RESERVED_SIZE(new_unit) / 1024);
#endif
        p = new_unit->start_pc;
        new_unit->cur_pc += size;
        ACCOUNT_FOR_ALLOC(alloc_new, tu, which, size, size); /* use alloc_size? */
        goto done_allocating;
    }
    if (tu->free_list[bucket] != NULL) {
        if (bucket == BLOCK_TYPES - 1) {
            /* variable-length blocks, try to find one big enough */
            size_t sz;
            heap_pc next = tu->free_list[bucket];
            heap_pc prev;
            do {
                prev = p;
                p = next;
                /* aligned_size is written right _before_ next pointer */
                sz = VARIABLE_SIZE(next);
                next = *((heap_pc *)p);
            } while (aligned_size > sz && next != NULL);
            if (aligned_size <= sz) {
                ASSERT(ALIGNED(next, HEAP_ALIGNMENT));
                /* found one, extract from free list */
                if (p == tu->free_list[bucket])
                    tu->free_list[bucket] = next;
                else
                    *((heap_pc *)prev) = next;
#ifdef DEBUG_MEMORY
                LOG(THREAD, LOG_HEAP, 2,
                    "Variable-size block: allocating " PFX " (%d bytes [%d aligned] in "
                    "%d block)\n",
                    p, size, aligned_size, sz);
                /* ensure memory we got from the free list is in a heap unit */
                DOCHECK(CHKLVL_DEFAULT, { /* expensive check */
                                          ASSERT(find_heap_unit(tu, p, sz) != NULL);
                });
#endif
                ASSERT(ALIGNED(sz, HEAP_ALIGNMENT));
                alloc_size = sz + HEADER_SIZE;
                ACCOUNT_FOR_ALLOC(alloc_reuse, tu, which, alloc_size, aligned_size);
            } else {
                /* no free block big enough available */
                p = NULL;
            }
        } else {
            /* fixed-length free block available */
            p = tu->free_list[bucket];
            tu->free_list[bucket] = *((heap_pc *)p);
            ASSERT(ALIGNED(tu->free_list[bucket], HEAP_ALIGNMENT));
#ifdef DEBUG_MEMORY
            /* ensure memory we got from the free list is in a heap unit */
            DOCHECK(CHKLVL_DEFAULT, { /* expensive check */
                                      ASSERT(find_heap_unit(tu, p, alloc_size) != NULL);
            });
#endif
            ACCOUNT_FOR_ALLOC(alloc_reuse, tu, which, alloc_size, aligned_size);
        }
    }
    if (p == NULL) {
        /* no free blocks, grab a new one */
        /* FIXME: if no more heap but lots of larger blocks available,
         * should use the larger blocks instead of failing! */

        /* see if room for allocation size */
        ASSERT(ALIGNED(u->cur_pc, HEAP_ALIGNMENT));
        ASSERT(ALIGNED(alloc_size, HEAP_ALIGNMENT));
        if (u->cur_pc + alloc_size > u->end_pc ||
            POINTER_OVERFLOW_ON_ADD(u->cur_pc, alloc_size) /*xref PR 495961*/) {
            /* We either have to extend the current unit or, failing that,
             * allocate a new unit. */
            if (!safe_to_allocate_or_free_heap_units()) {
                /* circular dependence solution: we need to hold dynamo areas
                 * lock before global alloc lock in case we end up adding a new
                 * unit or we hit oom (which may free units) while extending
                 * the commmitment -- so we back out, grab it, and then come
                 * back. */
                return NULL;
            }
            /* try to extend if possible */
            heap_unit_extend_commitment(u, alloc_size, MEMPROT_READ | MEMPROT_WRITE);
            /* check again after extending commit */
            if (u->cur_pc + alloc_size > u->end_pc ||
                POINTER_OVERFLOW_ON_ADD(u->cur_pc, alloc_size) /*xref PR 495961*/) {
                /* no room, look for room at end of previous units
                 * FIXME: instead should put end of unit space on free list!
                 */
                heap_unit_t *prev = tu->top_unit;
                while (1) {
                    /* make sure we do NOT steal space from oversized units,
                     * who though they may have extra space from alignment
                     * may be freed wholesale when primary alloc is freed
                     */
                    if (UNITALLOC(prev) <= HEAP_UNIT_MAX_SIZE &&
                        !POINTER_OVERFLOW_ON_ADD(prev->cur_pc, alloc_size) &&
                        prev->cur_pc + alloc_size <= prev->end_pc) {
                        tu->cur_unit = prev;
                        u = prev;
                        break;
                    }
                    if (prev->next_local == NULL) {
                        /* no room anywhere, so create new unit
                         * double size of old unit (until hit max size)
                         */
                        heap_unit_t *new_unit;
                        size_t unit_size;

                        unit_size = UNITALLOC(u) * 2;
                        while (unit_size < alloc_size + UNITOVERHEAD)
                            unit_size *= 2;
                        if (unit_size > HEAP_UNIT_MAX_SIZE)
                            unit_size = HEAP_UNIT_MAX_SIZE;
                        ASSERT(unit_size > UNITOVERHEAD);
                        new_unit = heap_create_unit(tu, unit_size, false /*can reuse*/);
                        prev->next_local = new_unit;
#ifdef DEBUG_MEMORY
                        LOG(THREAD, LOG_HEAP, 2,
                            "\tCreating new heap unit %d (%d [/%d] KB)\n", new_unit->id,
                            UNIT_COMMIT_SIZE(new_unit) / 1024,
                            UNIT_RESERVED_SIZE(new_unit) / 1024);
#endif
                        /* use new unit for all future non-free-list allocations
                         * we'll try to use the free room at the end of the old unit(s)
                         * only when we next run out of room
                         */
                        tu->cur_unit = new_unit;
                        u = new_unit;
                        /* may need to extend now if alloc_size is large */
                        heap_unit_extend_commitment(u, alloc_size,
                                                    MEMPROT_READ | MEMPROT_WRITE);
                        /* otherwise would have been bigger than MAXROOM */
                        ASSERT(alloc_size <= (ptr_uint_t)(u->end_pc - u->cur_pc));
                        break;
                    }
                    prev = prev->next_local;
                }
            }
        }

        p = u->cur_pc;
        if (bucket == BLOCK_TYPES - 1) {
            /* we keep HEADER_SIZE bytes to store the size */
            p += HEADER_SIZE;
            VARIABLE_SIZE(p) = aligned_size;
        }
        u->cur_pc += alloc_size;

        ACCOUNT_FOR_ALLOC(alloc_new, tu, which, alloc_size, aligned_size);
    }
    DOSTATS({
        /* do this before done_allocating: want to ignore special-unit allocs */
        ATOMIC_ADD(int, block_count[bucket], 1);
        ATOMIC_ADD(int, block_total_count[bucket], 1);
        /* FIXME: should atomically store inc-ed val in temp to avoid races w/ max */
        ATOMIC_MAX(int, block_peak_count[bucket], block_count[bucket]);
        ASSERT(CHECK_TRUNCATE_TYPE_uint(alloc_size - aligned_size));
        ATOMIC_ADD(int, block_wasted[bucket], (int)(alloc_size - aligned_size));
        /* FIXME: should atomically store val in temp to avoid races w/ max */
        ATOMIC_MAX(int, block_peak_wasted[bucket], block_wasted[bucket]);
        if (aligned_size > size) {
            ASSERT(CHECK_TRUNCATE_TYPE_uint(aligned_size - size));
            ATOMIC_ADD(int, block_align_pad[bucket], (int)(aligned_size - size));
            /* FIXME: should atomically store val in temp to avoid races w/ max */
            ATOMIC_MAX(int, block_peak_align_pad[bucket], block_align_pad[bucket]);
            STATS_ADD_PEAK(heap_align, aligned_size - size);
            LOG(GLOBAL, LOG_STATS, 5,
                "alignment mismatch: %s ask %d, aligned is %d -> %d pad\n",
                IF_HEAPACCT_ELSE(whichheap_name[which], ""), size, aligned_size,
                aligned_size - size);
        }
        if (bucket == BLOCK_TYPES - 1) {
            STATS_ADD(heap_headers, HEADER_SIZE);
            STATS_INC(heap_allocs_variable);
        } else {
            STATS_INC(heap_allocs_buckets);
            if (alloc_size > aligned_size) {
                STATS_ADD_PEAK(heap_bucket_pad, alloc_size - aligned_size);
                LOG(GLOBAL, LOG_STATS, 5,
                    "bucket mismatch: %s ask (aligned) %d, got %d, -> %d\n",
                    IF_HEAPACCT_ELSE(whichheap_name[which], ""), aligned_size, alloc_size,
                    alloc_size - aligned_size);
            }
        }
    });
done_allocating:
#ifdef DEBUG_MEMORY
    if (bucket == BLOCK_TYPES - 1 && check_alloc_size <= MAXROOM) {
        /* verify is unallocated memory, skip possible free list next pointer */
        DOCHECK(chklvl, {
            CLIENT_ASSERT(
                is_region_memset_to_char(p + sizeof(heap_pc *),
                                         (alloc_size - HEADER_SIZE) - sizeof(heap_pc *),
                                         HEAP_UNALLOCATED_BYTE),
                "memory corruption detected");
        });
        LOG(THREAD, LOG_HEAP, 6,
            "\nalloc var " PFX "-" PFX " %d bytes, ret " PFX "-" PFX " %d bytes\n",
            p - HEADER_SIZE, p - HEADER_SIZE + alloc_size, alloc_size, p, p + size, size);
        /* there can only be extra padding if we took off of the free list */
        DOCHECK(chklvl,
                memset(p + size, HEAP_PAD_BYTE, (alloc_size - HEADER_SIZE) - size););
    } else {
        /* verify is unallocated memory, skip possible free list next pointer */
        DOCHECK(chklvl, {
            CLIENT_ASSERT(is_region_memset_to_char(p + sizeof(heap_pc *),
                                                   alloc_size - sizeof(heap_pc *),
                                                   HEAP_UNALLOCATED_BYTE),
                          "memory corruption detected");
        });
        LOG(THREAD, LOG_HEAP, 6,
            "\nalloc fix or oversize " PFX "-" PFX " %d bytes, ret " PFX "-" PFX
            " %d bytes\n",
            p, p + alloc_size, alloc_size, p, p + size, size);
        DOCHECK(chklvl, memset(p + size, HEAP_PAD_BYTE, alloc_size - size););
    }
    DOCHECK(chklvl, memset(p, HEAP_ALLOCATED_BYTE, size););
#    ifdef HEAP_ACCOUNTING
    LOG(THREAD, LOG_HEAP, 6, "\t%s\n", whichheap_name[which]);
#    endif
#endif
    return (void *)p;
}

/* allocate storage on the thread's private heap */
void *
heap_alloc(dcontext_t *dcontext, size_t size HEAPACCT(which_heap_t which))
{
    thread_units_t *tu;
    void *ret_val;
    if (dcontext == GLOBAL_DCONTEXT)
        return global_heap_alloc(size HEAPACCT(which));
    tu = ((thread_heap_t *)dcontext->heap_field)->local_heap;
    ret_val = common_heap_alloc(tu, size HEAPACCT(which));
    ASSERT(ret_val != NULL);
    return ret_val;
}

/* free heap storage
 * returns false if caller needs to grab dynamo_vm_areas_lock() and retry
 */
static bool
common_heap_free(thread_units_t *tu, void *p_void,
                 size_t size HEAPACCT(which_heap_t which))
{
    int bucket = 0;
    heap_pc p = (heap_pc)p_void;
#if defined(DEBUG) && (defined(DEBUG_MEMORY) || defined(HEAP_ACCOUNTING))
    dcontext_t *dcontext = tu->dcontext;
    /* DrMem i#999: private libs can be heap-intensive and our checks here
     * can have a prohibitive perf cost!
     * XXX: b/c of re-use we have to memset on free.  Perhaps we should
     * have a separate heap pool for private libs.  But, the overhead
     * from that final memset is small compared to what we've already
     * saved, so maybe not worth it.
     */
    uint chklvl = CHKLVL_MEMFILL + (IF_HEAPACCT_ELSE(which == ACCT_LIBDUP ? 1 : 0, 0));
#endif
    size_t alloc_size, aligned_size = ALIGN_FORWARD(size, HEAP_ALIGNMENT);
    ASSERT(size > 0); /* we don't want to pay check cost in release */
    ASSERT(p != NULL);
#ifdef DEBUG_MEMORY
    /* FIXME i#417: This curiosity assertion is trying to make sure we don't
     * perform a double free, but it can fire if we ever free a data structure
     * that has the 0xcdcdcdcd bitpattern in the first or last 4 bytes.  This
     * has happened a few times:
     *
     * - case 8802: App's eax is 0xcdcdcdcd (from an app dbg memset) and we have
     *   dcontext->allocated_start==dcontext.
     * - i#417: On Linux x64 we get rax == 0xcdcdcdcd from a memset, and
     *   opnd_create_reg() only updates part of the register before returning by
     *   value in RAX:RDX.  We initialize to zero in debug mode to work around
     *   this.
     * - i#540: On Win7 x64 we see this assert when running the TSan tests in
     *   NegativeTests.WindowsRegisterWaitForSingleObjectTest.
     *
     * For now, we've downgraded this to a curiosity, but if it fires too much
     * in the future we should maintain a separate data structure in debug mode
     * to perform this check.  We accept objects that start with 0xcdcdcdcd so
     * long as the second four bytes are not also 0xcdcdcdcd.
     */
    DOCHECK(chklvl, {
        ASSERT_CURIOSITY(
            (*(uint *)p != HEAP_UNALLOCATED_UINT ||
             (size >= 2 * sizeof(uint) && *(((uint *)p) + 1) != HEAP_UNALLOCATED_UINT)) &&
            *(uint *)(p + size - sizeof(int)) != HEAP_UNALLOCATED_UINT &&
            "attempting to free memory containing HEAP_UNALLOCATED pattern, "
            "possible double free!");
    });
#endif

    while (aligned_size > BLOCK_SIZES[bucket])
        bucket++;
    if (bucket == BLOCK_TYPES - 1)
        alloc_size = aligned_size + HEADER_SIZE;
    else
        alloc_size = BLOCK_SIZES[bucket];

    if (alloc_size > MAXROOM) {
        /* we must have used a special unit just for this allocation */
        heap_unit_t *u = tu->top_unit, *prev = NULL;

#ifdef DEBUG_MEMORY
        /* ensure we are freeing memory in a proper unit */
        DOCHECK(CHKLVL_DEFAULT, { /* expensive check */
                                  ASSERT(find_heap_unit(tu, p, size) != NULL);
        });
#endif

        if (!safe_to_allocate_or_free_heap_units()) {
            /* circular dependence solution: we need to hold DR lock before
             * global alloc lock -- so we back out, grab it, and then come back
             */
            return false;
        }

        while (u != NULL && u->start_pc != p) {
            prev = u;
            u = u->next_local;
        }
        ASSERT(u != NULL);
        /* remove this unit from this thread's list, move to dead list
         * for future use -- no problems will be caused by it being
         * larger than normal
         */
        if (prev == NULL)
            tu->top_unit = u->next_local;
        else
            prev->next_local = u->next_local;
            /* just retire the unit # */
#ifdef DEBUG_MEMORY
        LOG(THREAD, LOG_HEAP, 3, "\tFreeing oversized heap unit %d (%d KB)\n", u->id,
            size / 1024);
        /* go ahead and set unallocated, even though we are just going to free
         * the unit, is needed for an assert in heap_free_unit anyways */
        DOCHECK(CHKLVL_MEMFILL, memset(p, HEAP_UNALLOCATED_BYTE, size););
#endif
        ASSERT(size <= UNITROOM(u));
        heap_free_unit(u, tu->dcontext);
        ACCOUNT_FOR_FREE(tu, which, size);
        return true;
    } else if (bucket == BLOCK_TYPES - 1) {
        ASSERT(GET_VARIABLE_ALLOCATION_SIZE(p) >= alloc_size);
        alloc_size = GET_VARIABLE_ALLOCATION_SIZE(p);
        ASSERT(alloc_size - HEADER_SIZE >= aligned_size);
    }

#if defined(DEBUG) || defined(DEBUG_MEMORY) || defined(HEAP_ACCOUNTING)
    if (bucket == BLOCK_TYPES - 1) {
#    ifdef DEBUG_MEMORY
        LOG(THREAD, LOG_HEAP, 6,
            "\nfree var " PFX "-" PFX " %d bytes, asked " PFX "-" PFX " %d bytes\n",
            p - HEADER_SIZE, p - HEADER_SIZE + alloc_size, alloc_size, p, p + size, size);
        ASSERT_MESSAGE(chklvl, "heap overflow",
                       is_region_memset_to_char(
                           p + size, (alloc_size - HEADER_SIZE) - size, HEAP_PAD_BYTE));
        /* ensure we are freeing memory in a proper unit */
        DOCHECK(CHKLVL_DEFAULT,
                { /* expensive check */
                  ASSERT(find_heap_unit(tu, p, alloc_size - HEADER_SIZE) != NULL);
                });
        /* set used and padding memory back to unallocated */
        DOCHECK(CHKLVL_MEMFILL,
                memset(p, HEAP_UNALLOCATED_BYTE, alloc_size - HEADER_SIZE););
#    endif
        STATS_SUB(heap_headers, HEADER_SIZE);
    } else {
#    ifdef DEBUG_MEMORY
        LOG(THREAD, LOG_HEAP, 6,
            "\nfree fix " PFX "-" PFX " %d bytes, asked " PFX "-" PFX " %d bytes\n", p,
            p + alloc_size, alloc_size, p, p + size, size);
        ASSERT_MESSAGE(
            chklvl, "heap overflow",
            is_region_memset_to_char(p + size, alloc_size - size, HEAP_PAD_BYTE));
        /* ensure we are freeing memory in a proper unit */
        DOCHECK(CHKLVL_DEFAULT, { /* expensive check */
                                  ASSERT(find_heap_unit(tu, p, alloc_size) != NULL);
        });
        /* set used and padding memory back to unallocated */
        DOCHECK(CHKLVL_MEMFILL, memset(p, HEAP_UNALLOCATED_BYTE, alloc_size););
#    endif
        STATS_SUB(heap_bucket_pad, (alloc_size - aligned_size));
    }
    STATS_SUB(heap_align, (aligned_size - size));
    DOSTATS({
        ATOMIC_ADD(int, block_count[bucket], -1);
        ATOMIC_ADD(int, block_wasted[bucket], -(int)(alloc_size - aligned_size));
        ATOMIC_ADD(int, block_align_pad[bucket], -(int)(aligned_size - size));
    });
#    ifdef HEAP_ACCOUNTING
    LOG(THREAD, LOG_HEAP, 6, "\t%s\n", whichheap_name[which]);
    ACCOUNT_FOR_FREE(tu, which, alloc_size);
#    endif
#endif

    /* write next pointer */
    *((heap_pc *)p) = tu->free_list[bucket];
    ASSERT(ALIGNED(tu->free_list[bucket], HEAP_ALIGNMENT));
    tu->free_list[bucket] = p;
    ASSERT(ALIGNED(tu->free_list[bucket], HEAP_ALIGNMENT));
    return true;
}

/* free heap storage */
void
heap_free(dcontext_t *dcontext, void *p, size_t size HEAPACCT(which_heap_t which))
{
    thread_units_t *tu;
    DEBUG_DECLARE(bool ok;)
    if (dcontext == GLOBAL_DCONTEXT) {
        global_heap_free(p, size HEAPACCT(which));
        return;
    }
    tu = ((thread_heap_t *)dcontext->heap_field)->local_heap;
    DEBUG_DECLARE(ok =) common_heap_free(tu, p, size HEAPACCT(which));
    ASSERT(ok);
}

bool
local_heap_protected(dcontext_t *dcontext)
{
    thread_heap_t *th = (thread_heap_t *)dcontext->heap_field;
    return (!th->local_heap->writable ||
            (th->nonpersistent_heap != NULL && !th->nonpersistent_heap->writable) ||
            (th->reachable_heap != NULL && !th->reachable_heap->writable));
}

static inline void
protect_local_units_helper(heap_unit_t *u, bool writable)
{
    /* win32 does not allow single protection change call on units that
     * were allocated with separate calls so we don't try to combine
     * adjacent units here
     */
    while (u != NULL) {
        change_protection(UNIT_ALLOC_START(u), UNIT_COMMIT_SIZE(u), writable);
        u = u->next_local;
    }
}

static void
protect_threadunits(thread_units_t *tu, bool writable)
{
    ASSERT(TEST(SELFPROT_LOCAL, dynamo_options.protect_mask));
    if (tu->writable == writable)
        return;
    protect_local_units_helper(tu->top_unit, writable);
    tu->writable = writable;
}

void
protect_local_heap(dcontext_t *dcontext, bool writable)
{
    thread_heap_t *th = (thread_heap_t *)dcontext->heap_field;
    protect_threadunits(th->local_heap, writable);
    protect_threadunits(th->nonpersistent_heap, writable);
    if (!REACHABLE_HEAP()) /* If off, all heap is reachable. */
        protect_threadunits(th->reachable_heap, writable);
}

/* assumption: vmm_heap_alloc only gets called for HeapUnits themselves, which
 * are protected by us here, so ignore os heap
 */
void
protect_global_heap(bool writable)
{
    ASSERT(TEST(SELFPROT_GLOBAL, dynamo_options.protect_mask));

    acquire_recursive_lock(&global_alloc_lock);

    if (heapmgt->global_heap_writable == writable) {
        release_recursive_lock(&global_alloc_lock);
        return;
    }
    /* win32 does not allow single protection change call on units that
     * were allocated with separate calls so we don't try to combine
     * adjacent units here

     * FIXME: That may no longer be true for our virtual memory manager that
     * will in fact be allocated as a single unit.  It is only in case
     * we have run out of that initial allocation that we may have to
     * keep a separate list of allocations.
     */

    if (!writable) {
        ASSERT(heapmgt->global_heap_writable);
        heapmgt->global_heap_writable = writable;
    }

    protect_local_units_helper(heapmgt->global_units.top_unit, writable);
    protect_local_units_helper(heapmgt->global_nonpersistent_units.top_unit, writable);

    if (writable) {
        ASSERT(!heapmgt->global_heap_writable);
        heapmgt->global_heap_writable = writable;
    }

    release_recursive_lock(&global_alloc_lock);
}

/* FIXME: share some code...right now these are identical to protected
 * versions except the unit used
 */
void *
global_unprotected_heap_alloc(size_t size HEAPACCT(which_heap_t which))
{
    void *p = common_global_heap_alloc(&heapmgt->global_unprotected_units,
                                       size HEAPACCT(which));
    ASSERT(p != NULL);
    LOG(GLOBAL, LOG_HEAP, 6, "\nglobal unprotected alloc: " PFX " (%d bytes)\n", p, size);
    return p;
}

void
global_unprotected_heap_free(void *p, size_t size HEAPACCT(which_heap_t which))
{
    common_global_heap_free(&heapmgt->global_unprotected_units, p, size HEAPACCT(which));
    LOG(GLOBAL, LOG_HEAP, 6, "\nglobal unprotected free: " PFX " (%d bytes)\n", p, size);
}

void *
nonpersistent_heap_alloc(dcontext_t *dcontext, size_t size HEAPACCT(which_heap_t which))
{
    void *p;
    if (dcontext == GLOBAL_DCONTEXT) {
        p = common_global_heap_alloc(&heapmgt->global_nonpersistent_units,
                                     size HEAPACCT(which));
        LOG(GLOBAL, LOG_HEAP, 6, "\nglobal nonpersistent alloc: " PFX " (%d bytes)\n", p,
            size);
    } else {
        thread_units_t *nph = ((thread_heap_t *)dcontext->heap_field)->nonpersistent_heap;
        p = common_heap_alloc(nph, size HEAPACCT(which));
    }
    ASSERT(p != NULL);
    return p;
}

void
nonpersistent_heap_free(dcontext_t *dcontext, void *p,
                        size_t size HEAPACCT(which_heap_t which))
{
    if (dcontext == GLOBAL_DCONTEXT) {
        common_global_heap_free(&heapmgt->global_nonpersistent_units, p,
                                size HEAPACCT(which));
        LOG(GLOBAL, LOG_HEAP, 6, "\nglobal nonpersistent free: " PFX " (%d bytes)\n", p,
            size);
    } else {
        thread_units_t *nph = ((thread_heap_t *)dcontext->heap_field)->nonpersistent_heap;
        DEBUG_DECLARE(bool ok =) common_heap_free(nph, p, size HEAPACCT(which));
        ASSERT(ok);
    }
}

void *
heap_reachable_alloc(dcontext_t *dcontext, size_t size HEAPACCT(which_heap_t which))
{
    void *p;
    /* We pay the cost of this branch to support using DR's decode routines from the
     * regular DR library and not just drdecode, to support libraries that would use
     * drdecode but that also have to work with full DR (i#2499).
     */
    if (heapmgt == &temp_heapmgt &&
        /* We prevent recursion by checking for a field that d_r_heap_init() sets and
         * d_r_heap_exit() clears.
         */
        !heapmgt->global_heap_writable) {
        /* XXX: We have no control point to call standalone_exit(). */
        standalone_init();
    }
    if (!REACHABLE_HEAP()) { /* If off, all heap is reachable. */
        if (dcontext == GLOBAL_DCONTEXT) {
            p = common_global_heap_alloc(&heapmgt->global_reachable_units,
                                         size HEAPACCT(which));
            LOG(GLOBAL, LOG_HEAP, 6, "\nglobal reachable alloc: " PFX " (%d bytes)\n", p,
                size);
        } else {
            thread_units_t *units =
                ((thread_heap_t *)dcontext->heap_field)->reachable_heap;
            p = common_heap_alloc(units, size HEAPACCT(which));
        }
    } else {
        p = heap_alloc(dcontext, size HEAPACCT(which));
    }
    ASSERT(p != NULL);
    return p;
}

void
heap_reachable_free(dcontext_t *dcontext, void *p,
                    size_t size HEAPACCT(which_heap_t which))
{
    if (!REACHABLE_HEAP()) { /* If off, all heap is reachable. */
        if (dcontext == GLOBAL_DCONTEXT) {
            common_global_heap_free(&heapmgt->global_reachable_units, p,
                                    size HEAPACCT(which));
            LOG(GLOBAL, LOG_HEAP, 6, "\nglobal reachable free: " PFX " (%d bytes)\n", p,
                size);
        } else {
            thread_units_t *units =
                ((thread_heap_t *)dcontext->heap_field)->reachable_heap;
            DEBUG_DECLARE(bool ok =) common_heap_free(units, p, size HEAPACCT(which));
            ASSERT(ok);
        }
    } else {
        heap_free(dcontext, p, size HEAPACCT(which));
    }
}

/****************************************************************************
 * SPECIAL SINGLE-ALLOC-SIZE HEAP SERVICE
 */

/* Assumptions:
 *   All allocations are of a single block size
 *   If use_lock is false, no synchronization is needed or even safe
 */

/* We use our own unit struct to give us flexibility.
 * 1) We don't always allocate the header inline.
 * 2) We are sometimes executed from and so need pc prof support.
 * 3) We don't need all the fields of heap_unit_t.
 */
typedef struct _special_heap_unit_t {
    heap_pc alloc_pc;        /* start of allocation region */
    heap_pc start_pc;        /* first address we'll give out for storage */
    heap_pc end_pc;          /* open-ended address of heap storage */
    heap_pc cur_pc;          /* current end (open) of allocated storage */
    heap_pc reserved_end_pc; /* (open) end of reserved (not nec committed) memory */
#ifdef WINDOWS_PC_SAMPLE
    profile_t *profile;
#endif
#ifdef DEBUG
    int id; /* # of this unit */
#endif
    struct _special_heap_unit_t *next;
} special_heap_unit_t;

#define SPECIAL_UNIT_COMMIT_SIZE(u) ((u)->end_pc - (u)->alloc_pc)
#define SPECIAL_UNIT_RESERVED_SIZE(u) ((u)->reserved_end_pc - (u)->alloc_pc)
#define SPECIAL_UNIT_HEADER_INLINE(u) ((u)->alloc_pc != (u)->start_pc)
#define SPECIAL_UNIT_ALLOC_SIZE(u) (SPECIAL_UNIT_RESERVED_SIZE(u))

/* the cfree list stores a next ptr and a count */
typedef struct _cfree_header {
    struct _cfree_header *next_cfree;
    uint count;
} cfree_header_t;

typedef struct _special_units_t {
    special_heap_unit_t *top_unit; /* start of linked list of heap units */
    special_heap_unit_t *cur_unit; /* current unit in heap list */
    uint block_size;               /* all blocks are this size */
    uint block_alignment;
    heap_pc free_list;
    cfree_header_t *cfree_list;
#ifdef DEBUG
    int num_units; /* total # of heap units */
#endif
    bool writable : 1; /* remember state of heap protection */
    bool executable : 1;
    /* if use_lock is false, grabbing _any_ lock may be hazardous!
     * (this isn't just an optimization, it's for correctness)
     */
    bool use_lock : 1;
    bool in_iterator : 1;
    bool persistent : 1;
    bool per_thread : 1;
    mutex_t lock;

    /* Yet another feature added: pclookup, but across multiple heaps,
     * so it's via a passed-in vector and passed-in data
     */
    vm_area_vector_t *heap_areas;
    void *lookup_retval;

#ifdef WINDOWS_PC_SAMPLE
    struct _special_units_t *next;
#endif
#ifdef HEAP_ACCOUNTING
    /* we only need one bucket for SpecialHeap but to re-use code we waste space */
    heap_acct_t acct;
#endif
} special_units_t;

#if defined(WINDOWS_PC_SAMPLE) && !defined(DEBUG)
/* For fast exit path we need a quick way to walk all the units */
DECLARE_CXTSWPROT_VAR(static mutex_t special_units_list_lock,
                      INIT_LOCK_FREE(special_units_list_lock));
/* This is only used for profiling so we don't bother to protect it */
DECLARE_CXTSWPROT_VAR(static special_units_t *special_units_list, NULL);
#endif

#if defined(DEBUG) && defined(HEAP_ACCOUNTING) && defined(HOT_PATCHING_INTERFACE)
/* To get around the problem of the special_units_t "module" being defined after
 * the heap module in the same file.  Part of fix for case 9593 that required
 * leaking memory.
 */
static int
get_special_heap_header_size(void)
{
    return sizeof(special_units_t);
}
#endif

#ifdef WINDOWS_PC_SAMPLE
static inline bool
special_heap_profile_enabled()
{
    return (dynamo_options.profile_pcs && dynamo_options.prof_pcs_stubs >= 2 &&
            dynamo_options.prof_pcs_stubs <= 32);
}
#endif

static inline uint
get_prot(special_units_t *su)
{
    return (su->executable ? MEMPROT_READ | MEMPROT_WRITE | MEMPROT_EXEC
                           : MEMPROT_READ | MEMPROT_WRITE);
}

static inline which_vmm_t
get_which(special_units_t *su)
{
    which_vmm_t which = VMM_SPECIAL_HEAP;
    /* We assume that +x special heap must be reachable. */
    if (su->executable)
        which |= VMM_REACHABLE;
    if (su->per_thread)
        which |= VMM_PER_THREAD;
    return which;
}

static inline byte *
special_heap_get_writable_addr(special_units_t *su, byte *addr)
{
    if (su->executable)
        return vmcode_get_writable_addr(addr);
    return addr;
}

static inline byte *
special_heap_get_executable_addr(special_units_t *su, byte *addr)
{
    if (su->executable)
        return vmcode_get_executable_addr(addr);
    return addr;
}

static void
special_unit_extend_commitment(special_units_t *su, special_heap_unit_t *u,
                               size_t size_need, uint prot)
{
    u->end_pc += common_heap_extend_commitment(u->cur_pc, u->end_pc, u->reserved_end_pc,
                                               size_need, prot, get_which(su));
}

/* If pc is NULL, allocates memory and stores the header inside it;
 * If pc is non-NULL, allocates separate memory for the header, and
 * uses pc for the heap region (assuming size is fully committed).
 * unit_full only applies to the non-NULL case, indicating whether
 * to continue to allocate from this unit.
 */
static special_heap_unit_t *
special_heap_create_unit(special_units_t *su, byte *pc, size_t size, bool unit_full)
{
    special_heap_unit_t *u;
    size_t commit_size;
    uint prot = get_prot(su);
    ASSERT_OWN_MUTEX(su->use_lock, &su->lock);

    if (pc != NULL) {
        u = HEAP_TYPE_ALLOC(GLOBAL_DCONTEXT, special_heap_unit_t, ACCT_MEM_MGT,
                            PROTECTED);
        ASSERT(u != NULL);
        u->start_pc = pc;
        u->alloc_pc = pc;
        commit_size = size;
        /* caller should arrange alignment */
        ASSERT(su->block_alignment == 0 || ALIGNED(u->start_pc, su->block_alignment));
    } else {
        ASSERT(ALIGNED(size, PAGE_SIZE));
        commit_size = DYNAMO_OPTION(heap_commit_increment);
        /* Allow the general commit size to be larger than a special unit. */
        if (commit_size > size)
            commit_size = size;
        /* create new unit */
        /* Since vmm lock, dynamo_vm_areas lock, all_memory_areas lock (on
         * linux), etc. will be acquired, and presumably !su->use_lock means
         * user can't handle ANY lock being acquired, we warn here: xref PR
         * 596768.  In release build, we try to acqure the memory anyway.  I'm
         * worried about pcprofile: can only fit ~1K in one unit and so will
         * easily run out...should it allocate additional units up front?
         * => PR 596808.
         */
        DODEBUG({
            if (su->top_unit != NULL /*init*/ && !su->use_lock) {
                SYSLOG_INTERNAL_WARNING_ONCE("potentially unsafe: allocating a new "
                                             "fragile special heap unit!");
            }
        });
        u = (special_heap_unit_t *)get_guarded_real_memory(
            size, commit_size, prot, true, true, NULL,
            get_which(su) _IF_DEBUG("special_heap"));
        ASSERT(u != NULL);
        /* Unlike gencode and code cache memory, we store the writable, since there
         * is a much narrower interface for executable addresses: just returning
         * pointers on alloc, while we have many write points.
         */
        u = (special_heap_unit_t *)special_heap_get_writable_addr(su, (byte *)u);
        u->alloc_pc = (heap_pc)u;
        /* u is kept at top of unit itself, so displace start pc */
        u->start_pc = (heap_pc)(((ptr_uint_t)u) + sizeof(special_heap_unit_t));
        if (su->block_alignment != 0) {
            STATS_ADD(heap_special_align,
                      ALIGN_FORWARD(u->start_pc, su->block_alignment) -
                          (ptr_uint_t)u->start_pc);
            u->start_pc = (heap_pc)ALIGN_FORWARD(u->start_pc, su->block_alignment);
        }
    }
    u->end_pc = u->alloc_pc + commit_size;
    u->reserved_end_pc = u->alloc_pc + size;
    if (pc != NULL && unit_full) {
        ASSERT(u->reserved_end_pc == u->end_pc);
        u->cur_pc = u->end_pc;
    } else
        u->cur_pc = u->start_pc;
    u->next = NULL;
    DODEBUG({
        u->id = su->num_units;
        su->num_units++;
    });

#ifdef WINDOWS_PC_SAMPLE
    if (special_heap_profile_enabled()) {
        u->profile = create_profile((app_pc)PAGE_START(u->start_pc), u->reserved_end_pc,
                                    dynamo_options.prof_pcs_stubs, NULL);
        start_profile(u->profile);
    } else
        u->profile = NULL;
#endif

    /* N.B.: if STATS macros ever change to grab a mutex, we could deadlock
     * if !su->use_lock!
     */
    RSTATS_ADD_PEAK(heap_capacity, commit_size);
    RSTATS_ADD_PEAK(heap_special_capacity, commit_size);
    STATS_ADD_PEAK(heap_special_units, 1);
    STATS_ADD_PEAK(heap_reserved_only, (u->reserved_end_pc - u->end_pc));

    if (su->heap_areas != NULL) {
        vmvector_add(su->heap_areas, u->alloc_pc, u->reserved_end_pc, su->lookup_retval);
    }

#ifdef DEBUG_MEMORY
    /* Don't clobber already-allocated memory */
    DOCHECK(CHKLVL_MEMFILL, {
        if (pc == NULL) {
            memset(u->start_pc, HEAP_UNALLOCATED_BYTE, u->end_pc - u->start_pc);
        }
    });
#endif
    return u;
}

/* caller must store the special_units_t *, which is opaque */
static void *
special_heap_init_internal(uint block_size, uint block_alignment, bool use_lock,
                           bool executable, bool persistent, vm_area_vector_t *vector,
                           void *vector_data, byte *heap_region, size_t heap_size,
                           bool unit_full)
{
    special_units_t *su;
    size_t unit_size = heap_size;
    if (block_alignment != 0)
        block_size = ALIGN_FORWARD(block_size, block_alignment);
    if (unit_size == 0) {
        /* Our main uses (stubs, whether global or coarse, and signal
         * pending queue) don't need a lot of space, so we have a smaller min size
         * than regular heap units which use HEAP_UNIT_MIN_SIZE.
         */
        unit_size = (block_size * 16 > PAGE_SIZE) ? (block_size * 16) : PAGE_SIZE;
    }
    if (heap_region == NULL) {
        unit_size = (size_t)ALIGN_FORWARD(unit_size, PAGE_SIZE);
    }
    su = (special_units_t *)(persistent
                                 ? global_heap_alloc(sizeof(special_units_t)
                                                         HEAPACCT(ACCT_MEM_MGT))
                                 : nonpersistent_heap_alloc(GLOBAL_DCONTEXT,
                                                            sizeof(special_units_t)
                                                                HEAPACCT(ACCT_MEM_MGT)));
    memset(su, 0, sizeof(*su));
    ASSERT(block_size >= sizeof(heap_pc *) && "need room for free list ptrs");
    ASSERT(block_size >= sizeof(heap_pc *) + sizeof(uint) &&
           "need room for cfree list ptrs");
    su->block_size = block_size;
    su->block_alignment = block_alignment;
    su->executable = executable;
    su->persistent = persistent;
    /* We assume that a lockless heap is a per-thread heap. */
    su->per_thread = !use_lock;
    su->writable = true;
    su->free_list = NULL;
    su->cfree_list = NULL;
    DODEBUG({ su->num_units = 0; });
    ASSERT((vector == NULL) == (vector_data == NULL));
    su->heap_areas = vector;
    su->lookup_retval = vector_data;
    su->in_iterator = false;
    if (use_lock)
        ASSIGN_INIT_LOCK_FREE(su->lock, special_heap_lock);
    /* For persistent cache loading we hold executable_areas lock and so
     * cannot acquire special_heap_lock -- so we do not acquire
     * for the initial unit creation, which is safe since su is still
     * private to this routine.
     */
    su->use_lock = false; /* we set to real value below */
    su->top_unit = special_heap_create_unit(su, heap_region, unit_size, unit_full);
    su->use_lock = use_lock;
#ifdef HEAP_ACCOUNTING
    memset(&su->acct, 0, sizeof(su->acct));
#endif
    su->cur_unit = su->top_unit;

#if defined(WINDOWS_PC_SAMPLE) && !defined(DEBUG)
    if (special_heap_profile_enabled()) {
        /* Add to the global main list, which requires a lock */
        d_r_mutex_lock(&special_units_list_lock);
        su->next = special_units_list;
        special_units_list = su;
        d_r_mutex_unlock(&special_units_list_lock);
    }
#endif

    return su;
}

/* Typical usage */
void *
special_heap_init(uint block_size, bool use_lock, bool executable, bool persistent)
{
    uint alignment = 0;
    /* Some users expect alignment; not much of a space loss for those who don't.
     * XXX: find those users and have them call special_heap_init_aligned()
     * and removed this.
     */
    if (IS_POWER_OF_2(block_size))
        alignment = block_size;
    return special_heap_init_internal(block_size, alignment, use_lock, executable,
                                      persistent, NULL, NULL, NULL, 0, false);
}

void *
special_heap_init_aligned(uint block_size, uint alignment, bool use_lock, bool executable,
                          bool persistent, size_t initial_unit_size)
{
    return special_heap_init_internal(block_size, alignment, use_lock, executable,
                                      persistent, NULL, NULL, NULL, initial_unit_size,
                                      false);
}

/* Special heap w/ a vector for lookups.  Also supports a pre-created heap region
 * (heap_region, heap_region+heap_size) whose fullness is unit_full. */
void *
special_heap_pclookup_init(uint block_size, bool use_lock, bool executable,
                           bool persistent, vm_area_vector_t *vector, void *vector_data,
                           byte *heap_region, size_t heap_size, bool unit_full)
{
    uint alignment = 0;
    /* XXX: see comment in special_heap_init() */
    if (IS_POWER_OF_2(block_size))
        alignment = block_size;
    return special_heap_init_internal(block_size, alignment, use_lock, executable,
                                      persistent, vector, vector_data, heap_region,
                                      heap_size, unit_full);
}

/* Sets the vector data for the lookup vector used by the special heap */
void
special_heap_set_vector_data(void *special, void *vector_data)
{
    special_units_t *su = (special_units_t *)special;
    special_heap_unit_t *u;
    ASSERT(su->heap_areas != NULL);
    /* FIXME: more efficient to walk the vector, but no interface
     * to set the data: we'd need to expose the iterator index or
     * the vmarea struct rather than than the clean copy we have now
     */
    for (u = su->top_unit; u != NULL; u = u->next) {
        vmvector_modify_data(su->heap_areas, u->alloc_pc, u->reserved_end_pc,
                             vector_data);
    }
}

/* Returns false if the special heap has more than one unit or has a
 * non-externally-allocated unit.
 * Sets the cur pc for the only unit to end_pc.
 */
bool
special_heap_set_unit_end(void *special, byte *end_pc)
{
    special_units_t *su = (special_units_t *)special;
    if (su->top_unit->next != NULL || SPECIAL_UNIT_HEADER_INLINE(su->top_unit) ||
        end_pc < su->top_unit->start_pc || end_pc > su->top_unit->end_pc)
        return false;
    su->top_unit->cur_pc = end_pc;
    return true;
}

#ifdef WINDOWS_PC_SAMPLE
static void
special_heap_profile_stop(special_heap_unit_t *u)
{
    int sum;
    ASSERT(special_heap_profile_enabled());
    stop_profile(u->profile);
    sum = sum_profile(u->profile);
    if (sum > 0) {
        d_r_mutex_lock(&profile_dump_lock);
        print_file(profile_file, "\nDumping special heap unit profile\n%d hits\n", sum);
        dump_profile(profile_file, u->profile);
        d_r_mutex_unlock(&profile_dump_lock);
    }
}
#endif

#if defined(WINDOWS_PC_SAMPLE) && !defined(DEBUG)
/* for fast exit path only, normal path taken care of */
void
special_heap_profile_exit()
{
    special_heap_unit_t *u;
    special_units_t *su;
    ASSERT(special_heap_profile_enabled()); /* will never be compiled in I guess :) */
    d_r_mutex_lock(&special_units_list_lock);
    for (su = special_units_list; su != NULL; su = su->next) {
        if (su->use_lock)
            d_r_mutex_lock(&su->lock);
        for (u = su->top_unit; u != NULL; u = u->next) {
            if (u->profile != NULL)
                special_heap_profile_stop(u);
            /* fast exit path: do not bother to free */
        }
        if (su->use_lock)
            d_r_mutex_unlock(&su->lock);
    }
    d_r_mutex_unlock(&special_units_list_lock);
}
#endif

void
special_heap_exit(void *special)
{
    special_units_t *su = (special_units_t *)special;
    special_heap_unit_t *u, *next_u;
#ifdef DEBUG
    size_t total_heap_used = 0;
#endif
    u = su->top_unit;
    while (u != NULL) {
        /* Assumption: it's ok to use print_lock even if !su->use_lock */
        DOLOG(1, LOG_HEAP | LOG_STATS, {
            size_t num_used = u->cur_pc - u->start_pc;
            total_heap_used += num_used;
            LOG(THREAD_GET, LOG_HEAP | LOG_STATS, 1,
                "Heap unit " SZFMT " (size " SZFMT " [/" SZFMT "] KB): used " SZFMT
                " KB\n",
                u->id, (SPECIAL_UNIT_COMMIT_SIZE(u)) / 1024,
                SPECIAL_UNIT_RESERVED_SIZE(u) / 1024, num_used / 1024);
        });
        next_u = u->next;
#ifdef WINDOWS_PC_SAMPLE
        if (u->profile != NULL) {
            ASSERT(special_heap_profile_enabled());
            special_heap_profile_stop(u);
            free_profile(u->profile);
            u->profile = NULL;
        }
#endif
        STATS_ADD(heap_special_units, -1);
        RSTATS_SUB(heap_special_capacity, SPECIAL_UNIT_COMMIT_SIZE(u));
        if (su->heap_areas != NULL) {
            vmvector_remove(su->heap_areas, u->alloc_pc, u->reserved_end_pc);
        }
        if (!SPECIAL_UNIT_HEADER_INLINE(u)) {
            HEAP_TYPE_FREE(GLOBAL_DCONTEXT, u, special_heap_unit_t, ACCT_MEM_MGT,
                           PROTECTED);
            /* up to creator to free the heap region */
        } else {
            release_guarded_real_memory((vm_addr_t)u, SPECIAL_UNIT_RESERVED_SIZE(u),
                                        true /*update DR areas immediately*/, true,
                                        get_which(su));
        }
        u = next_u;
    }
#ifdef HEAP_ACCOUNTING
    add_heapacct_to_global_stats(&su->acct);
#endif
    LOG(THREAD_GET, LOG_HEAP | LOG_STATS, 1, "\tTotal heap used: " SZFMT " KB\n",
        total_heap_used / 1024);
#if defined(WINDOWS_PC_SAMPLE) && !defined(DEBUG)
    if (special_heap_profile_enabled()) {
        /* Removed this special_units_t from the main list */
        d_r_mutex_lock(&special_units_list_lock);
        if (special_units_list == su)
            special_units_list = su->next;
        else {
            special_units_t *prev = special_units_list;
            ASSERT(prev != NULL);
            for (; prev->next != NULL && prev->next != su; prev = prev->next)
                ; /*nothing*/
            ASSERT(prev->next == su);
            prev->next = su->next;
        }
        d_r_mutex_unlock(&special_units_list_lock);
    }
#endif
    if (su->use_lock)
        DELETE_LOCK(su->lock);
    /* up to caller to free the vector, which is typically multi-heap */
    if (su->persistent) {
        global_heap_free(su, sizeof(special_units_t) HEAPACCT(ACCT_MEM_MGT));
    } else {
        nonpersistent_heap_free(GLOBAL_DCONTEXT, su,
                                sizeof(special_units_t) HEAPACCT(ACCT_MEM_MGT));
    }
}

void *
special_heap_calloc(void *special, uint num)
{
#ifdef DEBUG
    dcontext_t *dcontext = get_thread_private_dcontext();
#endif
    special_units_t *su = (special_units_t *)special;
    special_heap_unit_t *u;
    void *p = NULL;
    bool took_free = false;
    ASSERT(num > 0);
    if (su->use_lock)
        d_r_mutex_lock(&su->lock);
    u = su->cur_unit;
    if (su->free_list != NULL && num == 1) {
        p = (void *)su->free_list;
        su->free_list = *((heap_pc *)p);
        took_free = true;
    } else if (su->cfree_list != NULL && num > 1) {
        /* FIXME: take a piece of cfree if num == 1?
         * seems better to save the bigger pieces
         */
        cfree_header_t *cfree = su->cfree_list, *prev = NULL;
        while (cfree != NULL && cfree->count < num) {
            prev = cfree;
            cfree = cfree->next_cfree;
        }
        if (cfree != NULL) {
            ASSERT(cfree->count >= num);
            took_free = true;
            if (cfree->count == num) {
                /* take it out of list altogether */
                if (prev == NULL)
                    su->cfree_list = cfree->next_cfree;
                else
                    prev->next_cfree = cfree->next_cfree;
                p = (void *)cfree;
            } else if (cfree->count == num + 1) {
                /* add single-size piece to normal free list */
                heap_pc tail = ((heap_pc)cfree) + num * su->block_size;
                *((heap_pc *)tail) = su->free_list;
                su->free_list = tail;
                p = (void *)cfree;
            } else {
                /* if take tail don't have to change free list ptrs at all */
                cfree->count -= num;
                p = (void *)(((heap_pc)cfree) + (cfree->count - num) * su->block_size);
            }
        }
    }
    if (!took_free) {
        /* no free blocks, grab a new one */
        if (u->cur_pc + su->block_size * num > u->end_pc ||
            POINTER_OVERFLOW_ON_ADD(u->cur_pc, su->block_size * num)) {
            /* simply extend commitment, if possible */
            size_t pre_commit_size = SPECIAL_UNIT_COMMIT_SIZE(u);
            special_unit_extend_commitment(su, u, su->block_size * num, get_prot(su));
            RSTATS_ADD_PEAK(heap_special_capacity,
                            SPECIAL_UNIT_COMMIT_SIZE(u) - pre_commit_size);
            /* check again after extending commit */
            if (u->cur_pc + su->block_size * num > u->end_pc ||
                POINTER_OVERFLOW_ON_ADD(u->cur_pc, su->block_size * num)) {
                /* no room, need new unit */
                special_heap_unit_t *new_unit;
                special_heap_unit_t *prev = su->top_unit;
                size_t size = SPECIAL_UNIT_ALLOC_SIZE(u);
                ASSERT(ALIGNED(size, PAGE_SIZE));
                while (prev->next != NULL)
                    prev = prev->next;
                /* create new unit double size of old unit (until hit max size) */
                if (size * 2 <= HEAP_UNIT_MAX_SIZE)
                    size *= 2;
                /* we don't support arbitrarily long sequences */
                ASSERT(su->block_size * num < size);
                new_unit = special_heap_create_unit(su, NULL, size, false /*empty*/);
                prev->next = new_unit;
                if (su->use_lock) {
                    /* if synch bad so is printing */
                    LOG(THREAD, LOG_HEAP, 3,
                        "%s: Creating new heap unit %d " PFX "-" PFX "-" PFX "\n",
                        __FUNCTION__, new_unit->id, new_unit->alloc_pc, new_unit->end_pc,
                        new_unit->reserved_end_pc);
                }
                su->cur_unit = new_unit;
                u = new_unit;
                ASSERT(u->cur_pc + su->block_size * num <= u->end_pc &&
                       !POINTER_OVERFLOW_ON_ADD(u->cur_pc, su->block_size * num));
            }
        }

        p = (void *)u->cur_pc;
        u->cur_pc += su->block_size * num;
        ACCOUNT_FOR_ALLOC(alloc_new, su, ACCT_SPECIAL, su->block_size * num,
                          su->block_size * num);
    } else {
        ACCOUNT_FOR_ALLOC(alloc_reuse, su, ACCT_SPECIAL, su->block_size * num,
                          su->block_size * num);
    }
    if (su->use_lock)
        d_r_mutex_unlock(&su->lock);

#ifdef DEBUG_MEMORY
    DOCHECK(CHKLVL_MEMFILL, memset(p, HEAP_ALLOCATED_BYTE, su->block_size * num););
#endif
    ASSERT(p != NULL);
    return (void *)special_heap_get_executable_addr(su, p);
}

void *
special_heap_alloc(void *special)
{
    return special_heap_calloc(special, 1);
}

void
special_heap_cfree(void *special, void *p, uint num)
{
    special_units_t *su = (special_units_t *)special;
    ASSERT(num > 0);
    ASSERT(p != NULL);
    /* Allow freeing while iterating w/o deadlock (iterator holds lock) */
    ASSERT(!su->in_iterator || OWN_MUTEX(&su->lock));
    if (su->use_lock && !su->in_iterator)
        d_r_mutex_lock(&su->lock);
    p = (void *)special_heap_get_writable_addr(su, p);
#ifdef DEBUG_MEMORY
    /* FIXME: ensure that p is in allocated state */
    DOCHECK(CHKLVL_MEMFILL, memset(p, HEAP_UNALLOCATED_BYTE, su->block_size * num););
#endif
    if (num == 1) {
        /* write next pointer */
        *((heap_pc *)p) = su->free_list;
        su->free_list = (heap_pc)p;
    } else {
        cfree_header_t *cfree = (cfree_header_t *)p;
        cfree->next_cfree = su->cfree_list;
        cfree->count = num;
        su->cfree_list = (cfree_header_t *)p;
    }
#ifdef HEAP_ACCOUNTING
    ACCOUNT_FOR_FREE(su, ACCT_SPECIAL, su->block_size * num);
#endif
    if (su->use_lock && !su->in_iterator)
        d_r_mutex_unlock(&su->lock);
}

void
special_heap_free(void *special, void *p)
{
    special_heap_cfree(special, p, 1);
}

bool
special_heap_can_calloc(void *special, uint num)
{
    special_units_t *su = (special_units_t *)special;
    bool can_calloc = false;

    ASSERT(num > 0);
    if (su->use_lock)
        d_r_mutex_lock(&su->lock);
    if (su->free_list != NULL && num == 1) {
        can_calloc = true;
    } else if (su->cfree_list != NULL && num > 1) {
        cfree_header_t *cfree = su->cfree_list;
        while (cfree != NULL) {
            if (cfree->count >= num) {
                can_calloc = true;
                break;
            }
            cfree = cfree->next_cfree;
        }
    }
    if (!can_calloc) {
        special_heap_unit_t *u = su->cur_unit; /* what if more units are available? */
        can_calloc = (u->cur_pc + su->block_size * num <= u->reserved_end_pc &&
                      !POINTER_OVERFLOW_ON_ADD(u->cur_pc, su->block_size * num));
    }
    if (su->use_lock)
        d_r_mutex_unlock(&su->lock);

    return can_calloc;
}

/* Special heap iterator.  Initialized with special_heap_iterator_start(), which
 * grabs the heap lock (regardless of whether synch is used for allocs), and
 * destroyed with special_heap_iterator_stop() to release the lock.
 * If the special heap uses no lock for alloc, it is up to the caller
 * to prevent race conditions causing problems.
 * Accessor special_heap_iterator_next() should be called only when
 * predicate special_heap_iterator_hasnext() is true.
 * Any mutation of the heap while iterating will result in a deadlock
 * for heaps that use locks for alloc, except for individual freeing,
 * which will proceed w/o trying to grab the lock a second time.
 * FIXME: could generalize to regular heaps if a use arises.
 */
void
special_heap_iterator_start(void *heap, special_heap_iterator_t *shi)
{
    special_units_t *su = (special_units_t *)heap;
    ASSERT(heap != NULL);
    ASSERT(shi != NULL);
    d_r_mutex_lock(&su->lock);
    shi->heap = heap;
    shi->next_unit = (void *)su->top_unit;
    su->in_iterator = true;
}

bool
special_heap_iterator_hasnext(special_heap_iterator_t *shi)
{
    ASSERT(shi != NULL);
    DOCHECK(1, {
        special_units_t *su = (special_units_t *)shi->heap;
        ASSERT(su != NULL);
        ASSERT_OWN_MUTEX(true, &su->lock);
    });
    return (shi->next_unit != NULL);
}

/* Iterator accessor:
 * Has to be initialized with special_heap_iterator_start, and should be
 * called only when special_heap_iterator_hasnext() is true.
 * Sets the area boundaries in area_start and area_end.
 */
void
special_heap_iterator_next(special_heap_iterator_t *shi /* IN/OUT */,
                           app_pc *heap_start /* OUT */, app_pc *heap_end /* OUT */)
{
    special_units_t *su;
    special_heap_unit_t *u;
    ASSERT(shi != NULL);
    su = (special_units_t *)shi->heap;
    ASSERT(su != NULL);
    ASSERT_OWN_MUTEX(true, &su->lock);
    u = (special_heap_unit_t *)shi->next_unit;
    ASSERT(u != NULL);
    if (u != NULL) { /* caller error, but paranoid */
        if (heap_start != NULL)
            *heap_start = special_heap_get_executable_addr(su, u->start_pc);
        ASSERT(u->cur_pc <= u->end_pc);
        if (heap_end != NULL)
            *heap_end = special_heap_get_executable_addr(su, u->cur_pc);
        shi->next_unit = (void *)u->next;
    }
}

void
special_heap_iterator_stop(special_heap_iterator_t *shi)
{
    special_units_t *su;
    ASSERT(shi != NULL);
    su = (special_units_t *)shi->heap;
    ASSERT(su != NULL);
    ASSERT_OWN_MUTEX(true, &su->lock);
    su->in_iterator = false;
    d_r_mutex_unlock(&su->lock);
    DODEBUG({
        shi->heap = NULL;
        shi->next_unit = NULL;
    });
}

#if defined(DEBUG) && defined(HOT_PATCHING_INTERFACE)
/* We leak hotp trampolines as part of fix for case 9593; so, during a detach
 * we can't delete the trampoline heap.  However if that heap's lock isn't
 * deleted, we'll assert.  This routine is used only for that.  Normally, we
 * should call special_heap_exit() which deletes the lock. */
void
special_heap_delete_lock(void *special)
{
    special_units_t *su = (special_units_t *)special;

    /* No one calls this routine unless they have a lock to delete. */
    ASSERT(su != NULL);
    if (su == NULL)
        return;

    ASSERT(su->use_lock);
    if (su->use_lock)
        DELETE_LOCK(su->lock);
}
#endif

/*----------------------------------------------------------------------------*/
#ifdef WINDOWS /* currently not used on linux */
/* Landing pads (introduced as part of work for PR 250294). */

/* landing_pad_areas is a vmvector made up of regions of memory called
 * landing pad areas, each of which contains multiple landing pads.  Landing
 * pads are small trampolines used to jump from the hook point to the main
 * trampoline.  This is used in both 32-bit and 64-bit DR.  In both cases it
 * will handle the problem of hook chaining by 3rd party software and us having
 * to release our hooks (we'll nop the landing pad and free the trampoline).
 * In 64-bit it also solves the problem of reachability of the 5-byte rel jmp
 * we use for hooking, i.e., that 5-byte rel jmp may not reach the main
 * trampoline in DR heap.  We have to maintain the hook a 5-byte jmp because
 * hotp_only assumes it (see PR 250294).
 *
 * A landing pad will have nothing more than a jump (5-byte rel for 32-bit DR
 * and 64-bit abs ind jmp for 64-bit DR) to the trampoline and a 5-byte rel jmp
 * back to the next instruction after the hook, plus the displaced app instrs.
 *
 * To handle hook chaining landing pads won't be released till process exit
 * (not on a detach), their first jump will just be nop'ed.  As landing pads
 * aren't released till exit, all landing pads are just incrementally allocated
 * in a landing pad area.
 *
 * Note: Landing pad areas don't necessarily have to fall within the vm_reserve
 * region or capacity, so aren't accounted by our vmm.
 *
 * Note: If in future other needs for such region specific allocation should
 * arise, then we should convert this into special_heap_alloc_in_region().  For
 * now, landing pads are the only consumers, so was decided to be acceptable.
 *
 * See win32/callback.c for emit_landing_pad_code() and landing pad usage.
 */

typedef struct {
    byte *start;      /* start of reserved region */
    byte *end;        /* end of reserved region */
    byte *commit_end; /* end of committed memory in the reserved region */
    byte *cur_ptr;    /* pointer to next allocatable landing pad memory */
    bool allocated;   /* allocated, or stolen from an app dll? */
} landing_pad_area_t;

/* Allocates a landing pad so that a hook inserted at addr_to_hook can reach
 * its trampoline via the landing pad.  The landing pad will reachable by a
 * 32-bit relative jmp from addr_to_hook.
 * Note: we may want to generalize this at some point such that the size of the
 *       landing pad is passed as an argument.
 *
 * For Windows we assume that landing_pads_to_executable_areas(true) will be
 * called once landing pads are finished being created.
 */
byte *
alloc_landing_pad(app_pc addr_to_hook)
{
    app_pc hook_region_start, hook_region_end;
    app_pc alloc_region_start, alloc_region_end;
    app_pc lpad_area_start = NULL, lpad_area_end;
    app_pc lpad = NULL;
    landing_pad_area_t *lpad_area = NULL;

    /* Allocate the landing pad area such that any hook from within the module
     * or memory region containing addr_to_hook can use the same area for a
     * landing pad.  Makes it more efficient. */
    hook_region_start = get_allocation_base(addr_to_hook);
    if (hook_region_start == NULL) { /* to support raw virtual address hooks */
        ASSERT_CURIOSITY("trying to hook raw or unallocated memory?");
        hook_region_start = addr_to_hook;
        hook_region_end = addr_to_hook;
    } else {
        hook_region_end =
            hook_region_start + get_allocation_size(hook_region_start, NULL);
        ASSERT(hook_region_end > hook_region_start); /* check overflow */
        /* If region size is > 2 GB, then it isn't an image; PE32{,+} restrict
         * images to 2 GB.  Also, if region is > 2 GB the reachability macros
         * called below will return a region smaller (and with start and end
         * inverted) than the region from which the reachability is desired,
         * i.e., some of the areas in [hook_region_start, hook_region_end)
         * won't be able to reach the region computed.
         *
         * A better choice is to pick something smaller (100 MB) because if the
         * region is close to 2 GB in size then we might not be able to
         * allocate memory for a landing pad that is reachable.
         */
        if (hook_region_end - hook_region_start > 100 * 1024 * 1024) {
            /* Try a smaller region of 100 MB around the address to hook. */
            ASSERT_CURIOSITY(false && "seeing patch region > 100 MB - DGC?");
            hook_region_start = MIN(
                addr_to_hook, MAX(hook_region_start, addr_to_hook - 50 * 1024 * 1024));
            hook_region_end =
                MAX(addr_to_hook, MIN(hook_region_end, addr_to_hook + 50 * 1024 * 1024));
        }
    }

    /* Define the region that can be reached from anywhere within the
     * hook region with a 32-bit rel jmp.
     */
    alloc_region_start = REACHABLE_32BIT_START(hook_region_start, hook_region_end);
    alloc_region_end = REACHABLE_32BIT_END(hook_region_start, hook_region_end);
    ASSERT(alloc_region_start < alloc_region_end);

    /* Check if there is an existing landing pad area within the reachable
     * region for the hook location.  If so use it, else allocate one.
     */
    d_r_write_lock(&landing_pad_areas->lock);
    if (vmvector_overlap(landing_pad_areas, alloc_region_start, alloc_region_end)) {
        /* Now we have to get that landing pad area that is FULLY contained
         * within alloc_region_start and alloc_region_end.  If a landing pad
         * area is only partially within the alloc region, then a landing pad
         * created there won't be able to reach the addr_to_hook.  BTW, that
         * landing pad area should have enough space to allocate a landing pad!
         * If these conditions are met allocate a landing pad.
         */
        vmvector_iterator_t lpad_area_iter;
        vmvector_iterator_start(landing_pad_areas, &lpad_area_iter);
        while (vmvector_iterator_hasnext(&lpad_area_iter)) {
            lpad_area =
                vmvector_iterator_next(&lpad_area_iter, &lpad_area_start, &lpad_area_end);
            if (lpad_area_start < alloc_region_end &&
                lpad_area_end > alloc_region_start &&
                (lpad_area->cur_ptr + LANDING_PAD_SIZE) < lpad_area_end) {
                /* See if enough memory in this landing pad area has been
                 * committed, if not commit more memory.
                 */
                if ((lpad_area->cur_ptr + LANDING_PAD_SIZE) >= lpad_area->commit_end) {
                    ASSERT(lpad_area->allocated);
                    extend_commitment(lpad_area->commit_end, PAGE_SIZE,
                                      MEMPROT_READ | MEMPROT_EXEC,
                                      false /* not initial commit */, VMM_SPECIAL_MMAP);
                    lpad_area->commit_end += PAGE_SIZE;
                }

                /* Update the current pointer for the landing pad area, i.e.,
                 * allocate the landing pad.
                 */
                lpad = lpad_area->cur_ptr;
                lpad_area->cur_ptr += LANDING_PAD_SIZE;
                break;
            }
        }
        vmvector_iterator_stop(&lpad_area_iter);
    }

    /* If a landing pad area wasn't found because there wasn't any in the
     * allocation region or none fully contained within the allocation region,
     * then create a new one within the allocation region.  Then allocate a
     * landing pad in it.
     */
    if (lpad == NULL) {
        bool allocated = true;
        heap_error_code_t heap_error;
        lpad_area_end = NULL;
        lpad_area_start = os_heap_reserve_in_region(
            (void *)ALIGN_FORWARD(alloc_region_start, PAGE_SIZE),
            (void *)ALIGN_BACKWARD(alloc_region_end, PAGE_SIZE), LANDING_PAD_AREA_SIZE,
            &heap_error, true /*+x*/);
        if (lpad_area_start == NULL || heap_error == HEAP_ERROR_CANT_RESERVE_IN_REGION) {
            /* Should retry with using just the aligned target address - we may
             * have made the region so large that there's nothing nearby to
             * reserve.
             */
            lpad_area_start = os_heap_reserve(
                (void *)ALIGN_FORWARD(addr_to_hook, LANDING_PAD_AREA_SIZE),
                LANDING_PAD_AREA_SIZE, &heap_error, true /*+x*/);
#    ifdef WINDOWS
            if (lpad_area_start == NULL &&
                /* We can only do this once w/ current interface.
                 * XXX: support multiple "allocs" inside libs.
                 */
                vmvector_empty(landing_pad_areas) &&
                os_find_free_code_space_in_libs(&lpad_area_start, &lpad_area_end)) {
                if (lpad_area_end - lpad_area_start >= LANDING_PAD_SIZE &&
                    /* Mark writable until we're done creating landing pads */
                    make_hookable(lpad_area_start, lpad_area_end - lpad_area_start,
                                  NULL)) {
                    /* Let's take it */
                    allocated = false;
                    /* We assume that landing_pads_to_executable_areas(true) will be
                     * called once landing pads are finished being created and we
                     * can restore to +rx there.
                     */
                    lpad_temp_writable_start = lpad_area_start;
                    lpad_temp_writable_size = lpad_area_end - lpad_area_start;
                } else
                    lpad_area_start = NULL; /* not big enough */
            }
#    endif
            if (lpad_area_start == NULL) {
                /* Even at startup when there will be enough memory,
                 * theoretically 2 GB of dlls might get packed together before
                 * we get control (very unlikely), so we can fail.  If it does,
                 * then say 'oom' and exit.
                 */
                SYSLOG_INTERNAL_WARNING("unable to reserve memory for landing pads");
                report_low_on_memory(VMM_SPECIAL_MMAP | VMM_REACHABLE, OOM_RESERVE,
                                     heap_error);
            }
        }

        /* Allocate the landing pad area as rx, allocate a landing pad in it
         * and add it to landing_pad_areas vector.  Note, we only commit 4k
         * initially even though we reserve 64k (LANDING_PAD_AREA_SIZE), to
         * avoid wastage.
         */
        if (allocated) {
            extend_commitment(lpad_area_start, PAGE_SIZE, MEMPROT_READ | MEMPROT_EXEC,
                              true /* initial commit */, VMM_SPECIAL_MMAP);
        }

        lpad_area =
            HEAP_TYPE_ALLOC(GLOBAL_DCONTEXT, landing_pad_area_t, ACCT_VMAREAS, PROTECTED);
        lpad_area->start = lpad_area_start;
        lpad_area->end = (lpad_area_end == NULL ? lpad_area_start + LANDING_PAD_AREA_SIZE
                                                : lpad_area_end);
        lpad_area->commit_end = lpad_area_start + PAGE_SIZE;
        lpad_area->cur_ptr = lpad_area_start;
        lpad_area->allocated = allocated;
        lpad = lpad_area->cur_ptr;
        lpad_area->cur_ptr += LANDING_PAD_SIZE;

        vmvector_add(landing_pad_areas, lpad_area->start, lpad_area->end, lpad_area);
        STATS_INC(num_landing_pad_areas);
    }

    /* Landing pads aren't added to executable_areas here because not all
     * landing pads should be added.  Only the ones used for DR hooks should be
     * added to executable areas (which is done using
     * landing_pads_to_executable_areas() at the end of inserting DR hooks).
     * hotp_only related landing pads shouldn't be added to executable areas as
     * their trampolines aren't added to executable_areas.  This is why landing
     * pads aren't added to executable_areas here, at the point of allocation.
     */

    LOG(GLOBAL, LOG_ALL, 3, "%s: used " PIFX " bytes in " PFX "-" PFX "\n", __FUNCTION__,
        lpad_area->cur_ptr - lpad_area->start, lpad_area->start, lpad_area->end);

    /* Boundary check to make sure the allocation is within the landing pad area. */
    ASSERT(lpad_area->cur_ptr <= lpad_area->end);
    d_r_write_unlock(&landing_pad_areas->lock);
    return lpad;
}

/* Attempts to save space in the landing pad region by trimming the most
 * recently allocated landing pad to the actual space used.
 * Will fail if another landing pad was allocated between lpad_start
 * being allocated and this routine being called.
 */
bool
trim_landing_pad(byte *lpad_start, size_t space_used)
{
    landing_pad_area_t *lpad_area = NULL;
    bool res = false;
    d_r_write_lock(&landing_pad_areas->lock);
    if (vmvector_lookup_data(landing_pad_areas, lpad_start, NULL, NULL, &lpad_area)) {
        if (lpad_start == lpad_area->cur_ptr - LANDING_PAD_SIZE) {
            lpad_area->cur_ptr -= (LANDING_PAD_SIZE - space_used);
            res = true;
        }
    }
    d_r_write_unlock(&landing_pad_areas->lock);
    return res;
}

/* Adds or removes all landing pads from executable_areas by adding whole
 * landing pad areas.  This is done to prevent bb building from considering
 * landing pads to be selfmod code; as such, these don't have to be
 * {add,remov}ed from executable_areas for hotp_only or for thin_client mode.
 */
void
landing_pads_to_executable_areas(bool add)
{
    vmvector_iterator_t lpad_area_iter;
    app_pc lpad_area_start, lpad_area_end;
    DEBUG_DECLARE(landing_pad_area_t * lpad_area;)
    uint lpad_area_size;

    if (RUNNING_WITHOUT_CODE_CACHE())
        return;

#    ifdef WINDOWS
    if (add && lpad_temp_writable_start != NULL) {
        make_unhookable(lpad_temp_writable_start, lpad_temp_writable_size, true);
        lpad_temp_writable_start = NULL;
    }
#    endif

    /* With code cache, there should be only one landing pad area, just for
     * dr hooks in ntdll.  For 64-bit, the image entry hook will result in a
     * new landing pad.
     */
    IF_X64_ELSE(, ASSERT(landing_pad_areas->length == 1);)

    /* Just to be safe, walk through all areas in release build. */
    vmvector_iterator_start(landing_pad_areas, &lpad_area_iter);
    while (vmvector_iterator_hasnext(&lpad_area_iter)) {

        DEBUG_DECLARE(lpad_area =)
        vmvector_iterator_next(&lpad_area_iter, &lpad_area_start, &lpad_area_end);
        lpad_area_size = (uint)(lpad_area_end - lpad_area_start);
        ASSERT(lpad_area_size <= LANDING_PAD_AREA_SIZE);
        /* Current ptr should be within area. */
        ASSERT(lpad_area->cur_ptr < lpad_area_end);
        if (add) {
            add_executable_region(lpad_area_start,
                                  lpad_area_size _IF_DEBUG(
                                      "add landing pad areas after inserting dr hooks"));
        } else {
            remove_executable_region(lpad_area_start, lpad_area_size,
                                     false /* no lock */);
        }
    }
    vmvector_iterator_stop(&lpad_area_iter);
}

/* Delete landing_pad_areas and the landing_pad_area_t allocated for each
 * landing pad area.  However, release all landing pads only on process exit;
 * for detach leave the landing pads in (in case some one hooks after us they
 * shouldn't crash if they chain correctly).
 */
static void
release_landing_pad_mem(void)
{
    vmvector_iterator_t lpad_area_iter;
    app_pc lpad_area_start, lpad_area_end;
    landing_pad_area_t *lpad_area;
    heap_error_code_t heap_error;

    vmvector_iterator_start(landing_pad_areas, &lpad_area_iter);
    while (vmvector_iterator_hasnext(&lpad_area_iter)) {
        bool allocated;
        lpad_area =
            vmvector_iterator_next(&lpad_area_iter, &lpad_area_start, &lpad_area_end);
        allocated = lpad_area->allocated;
        HEAP_TYPE_FREE(GLOBAL_DCONTEXT, lpad_area, landing_pad_area_t, ACCT_VMAREAS,
                       PROTECTED);
        if (!doing_detach && /* On normal exit release the landing pads. */
            allocated)
            os_heap_free(lpad_area_start, LANDING_PAD_AREA_SIZE, &heap_error);
    }
    vmvector_iterator_stop(&lpad_area_iter);
    vmvector_delete_vector(GLOBAL_DCONTEXT, landing_pad_areas);
}
#endif /* WINDOWS */
/*----------------------------------------------------------------------------*/