"""Extracts the unwind tables in from breakpad symbol files
Runs dump_syms on the given binary file and extracts the CFI data into the
given output file.
The output file is a binary file containing CFI rows ordered based on function
address. The output file only contains rows that match the most popular rule
type in CFI table, to reduce the output size and specify data in compact format.
See doc https://github.com/google/breakpad/blob/main/docs/symbol_files.md.
1. The CFA rules should be of postfix form "SP <val> +".
2. The RA rules should be of postfix form "CFA <val> + ^".
Note: breakpad represents dereferencing address with '^' operator.
The output file has 2 tables UNW_INDEX and UNW_DATA, inspired from ARM EHABI
format. The first table contains function addresses and an index into the
UNW_DATA table. The second table contains one or more rows for the function
unwind information.
The output file starts with 4 bytes counting the number of entries in UNW_INDEX.
Then UNW_INDEX table and UNW_DATA table.
UNW_INDEX contains two columns of N rows each, where N is the number of
functions.
1. First column 4 byte rows of all the function start address as offset from
start of the binary, in sorted order.
2. For each function addr, the second column contains 2 byte indices in order.
The indices are offsets (in count of 2 bytes) of the CFI data from start of
UNW_DATA.
The last entry in the table always contains CANT_UNWIND index to specify the
end address of the last function.
UNW_DATA contains data of all the functions. Each function data contains N rows.
The data found at the address pointed from UNW_INDEX will be:
2 bytes: N - number of rows that belong to current function.
N * 4 bytes: N rows of data. 16 bits : Address offset from function start.
14 bits : CFA offset / 4.
2 bits : RA offset / 4.
The function is not added to the unwind table in following conditions:
C1. If length of the function code (number of instructions) is greater than
0xFFFF (2 byte address span). This is because we use 16 bits to refer to
offset of instruction from start of the address.
C2. If the function moves the SP by more than 0xFFFF bytes. This is because we
use 14 bits to denote CFA offset (last 2 bits are 0).
C3. If the Return Address is stored at an offset >= 16 from the CFA. Some
functions which have variable arguments can have offset upto 16.
TODO(ssid): We can actually store offset 16 by subtracting 1 from RA/4 since
we never have 0.
C4: Some functions do not have unwind information defined in dwarf info. These
functions have index value CANT_UNWIND(0xFFFF) in UNW_INDEX table.
Usage:
extract_unwind_tables.py --input_path [root path to unstripped chrome.so]
--output_path [output path] --dump_syms_path [path to dump_syms binary]
"""
import argparse
import re
import struct
import subprocess
import sys
import tempfile
_CFA_REG = '.cfa'
_RA_REG = '.ra'
_ADDR_ENTRY = 0
_LENGTH_ENTRY = 1
_CANT_UNWIND = 0xFFFF
def _Write4Bytes(output_file, val):
"""Writes a 32 bit unsigned integer to the given output file."""
output_file.write(struct.pack('<L', val));
def _Write2Bytes(output_file, val):
"""Writes a 16 bit unsigned integer to the given output file."""
output_file.write(struct.pack('<H', val));
def _FindRuleForRegister(cfi_row, reg):
"""Returns the postfix expression as string for a given register.
Breakpad CFI row format specifies rules for unwinding each register in postfix
expression form separated by space. Each rule starts with register name and a
colon. Eg: "CFI R1: <rule> R2: <rule>".
"""
out = []
found_register = False
for part in cfi_row:
if found_register:
if part[-1] == ':':
break
out.append(part)
elif part == reg + ':':
found_register = True
return ' '.join(out)
def _GetCfaAndRaOffset(cfi_row):
"""Returns a tuple with 2 numbers (cfa_offset, ra_offset).
Returns right values if rule matches the predefined criteria. Returns (0, 0)
otherwise. The criteria for CFA rule is postfix form "SP <val> +" and RA rule
is postfix form "CFA -<val> + ^".
"""
cfa_offset = 0
ra_offset = 0
cfa_rule = _FindRuleForRegister(cfi_row, _CFA_REG)
ra_rule = _FindRuleForRegister(cfi_row, _RA_REG)
if cfa_rule and re.match(r'sp [0-9]+ \+', cfa_rule):
cfa_offset = int(cfa_rule.split()[1], 10)
if ra_rule:
if not re.match(r'.cfa -[0-9]+ \+ \^', ra_rule):
return (0, 0)
ra_offset = -1 * int(ra_rule.split()[1], 10)
return (cfa_offset, ra_offset)
def _GetAllCfiRows(symbol_file):
"""Returns parsed CFI data from given symbol_file.
Each entry in the cfi data dictionary returned is a map from function start
address to array of function rows, starting with FUNCTION type, followed by
one or more CFI rows.
"""
cfi_data = {}
current_func = []
for line in symbol_file:
line = line.decode('utf8')
if 'STACK CFI' not in line:
continue
parts = line.split()
data = {}
if parts[2] == 'INIT':
if len(current_func) > 1:
cfi_data[current_func[0][_ADDR_ENTRY]] = current_func
current_func = []
data[_ADDR_ENTRY] = int(parts[3], 16)
data[_LENGTH_ENTRY] = int(parts[4], 16)
if data[_LENGTH_ENTRY] == 0 or data[_LENGTH_ENTRY] > 0xffff:
continue
else:
if len(current_func) == 0:
continue
data[_ADDR_ENTRY] = int(parts[2], 16)
(data[_CFA_REG], data[_RA_REG]) = _GetCfaAndRaOffset(parts)
if data[_CFA_REG] == 0 or data[_RA_REG] >= 16 or data[_CFA_REG] > 0xffff:
current_func = []
continue
assert data[_CFA_REG] % 4 == 0
assert data[_ADDR_ENTRY] - current_func[0][_ADDR_ENTRY] < 0xffff
if data[_ADDR_ENTRY] == 0:
current_func = []
continue
assert data[_ADDR_ENTRY] % 2 == 0
current_func.append(data)
if len(current_func) > 1:
cfi_data[current_func[0][_ADDR_ENTRY]] = current_func
return cfi_data
def _WriteCfiData(cfi_data, out_file):
"""Writes the CFI data in defined format to out_file."""
unw_data = []
data_to_index = {}
func_addr_to_index = {}
previous_func_end = 0
for addr, function in sorted(cfi_data.items()):
if previous_func_end != 0 and addr - previous_func_end > 4:
func_addr_to_index[previous_func_end + 2] = _CANT_UNWIND
previous_func_end = addr + cfi_data[addr][0][_LENGTH_ENTRY]
assert len(function) > 1
func_data_arr = []
func_data = 0
for row in function[1:]:
addr_offset = row[_ADDR_ENTRY] - addr
cfa_offset = (row[_CFA_REG]) | (row[_RA_REG] // 4)
func_data_arr.append(addr_offset)
func_data_arr.append(cfa_offset)
for data in func_data_arr:
func_data = (func_data << 16) | data
row_count = len(func_data_arr) // 2
if func_data not in data_to_index:
index = len(unw_data)
data_to_index[func_data] = index
unw_data.append(row_count)
for row in func_data_arr:
unw_data.append(row)
else:
index = data_to_index[func_data]
assert row_count == unw_data[index]
func_addr_to_index[addr] = data_to_index[func_data]
func_addr_to_index[previous_func_end + 2] = _CANT_UNWIND
_Write4Bytes(out_file, len(func_addr_to_index))
sorted_unw_index = sorted(func_addr_to_index.items())
for addr, index in sorted_unw_index:
_Write4Bytes(out_file, addr)
for addr, index in sorted_unw_index:
_Write2Bytes(out_file, index)
for data in unw_data:
_Write2Bytes(out_file, data)
def main():
parser = argparse.ArgumentParser()
parser.add_argument(
'--input_path', required=True,
help='The input path of the unstripped binary')
parser.add_argument(
'--output_path', required=True,
help='The path of the output file')
parser.add_argument(
'--dump_syms_path', required=True,
help='The path of the dump_syms binary')
args = parser.parse_args()
cmd = ['./' + args.dump_syms_path, args.input_path, '-v']
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE)
cfi_data = _GetAllCfiRows(proc.stdout)
if proc.wait():
sys.stderr.write('dump_syms exited with code {} after {} symbols\n'.format(
proc.returncode, len(cfi_data)))
sys.exit(proc.returncode)
with open(args.output_path, 'wb') as out_file:
_WriteCfiData(cfi_data, out_file)
if __name__ == '__main__':
main()