"""Utilities to get and manipulate symbols from a binary."""
import collections
import json
import logging
import os
import re
import subprocess
# Symbol name that _SymbolInfosFromStream() always records, placing it in
# '.text' with size 0 regardless of what the readelf entry says.
START_OF_TEXT_SYMBOL = 'linker_script_start_of_text'

# Absolute path of the directory two levels above this file's directory.
_SRC_PATH = os.path.abspath(
    os.path.join(os.path.dirname(__file__), os.pardir, os.pardir))

# Path prefix shared by the LLVM tools; callers append the tool name
# (e.g. 'readelf') to obtain the full executable path.
_TOOL_PREFIX = os.path.join(
    _SRC_PATH, 'third_party', 'llvm-build', 'Release+Asserts', 'bin', 'llvm-')

_MAX_WARNINGS_TO_PRINT = 200

# One symbol of a binary: its name, offset, size and containing section.
SymbolInfo = collections.namedtuple('SymbolInfo', 'name offset size section')
def _IsExpectedSectionForInstrumentedCode(section):
  """Returns True iff |section| is '.text' or 'malloc_hook'.

  Args:
    section: (str) Section name to check.

  Returns:
    (bool) Whether the section is one of the two accepted names.
  """
  expected_sections = ('.text', 'malloc_hook')
  return section in expected_sections
def _SymbolInfosFromStream(input_file):
  """Builds SymbolInfo records from llvm-readelf JSON symbol output.

  The JSON document must be a one-element list whose single item has a
  'Symbols' key. The START_OF_TEXT_SYMBOL entry is always kept (recorded in
  '.text' with size 0). Otherwise only defined symbols of type 'Function' or
  'GNU_IFunc' are kept. Names found at more than one offset are reported
  through logging.warning.

  Args:
    input_file: a .json file handle containing the readelf output.

  Returns:
    A list of SymbolInfo.
  """
  documents = json.load(input_file)
  assert len(documents) == 1
  document = documents[0]
  assert 'Symbols' in document

  function_types = ('Function', 'GNU_IFunc')
  known_bindings = ('Local', 'Global', 'Weak')
  valid_name = re.compile('^[a-zA-Z0-9_.][a-zA-Z0-9_.$]*$')

  offsets_by_name = collections.defaultdict(list)
  result = []
  for entry in document['Symbols']:
    record = entry['Symbol']
    name = record['Name']['Name']
    offset = record['Value']
    size = record['Size']
    section = record['Section']['Name']
    binding = record['Binding']['Name']

    # The start-of-text marker is kept unconditionally, before any of the
    # type/section filtering below.
    if name == START_OF_TEXT_SYMBOL:
      result.append(
          SymbolInfo(name=name, offset=offset, section='.text', size=0))
      continue

    kind = record['Type']['Name']
    if kind == 'None':
      continue
    if section == 'Undefined':
      assert binding != 'Local', name
      continue
    if kind not in function_types:
      continue

    assert _IsExpectedSectionForInstrumentedCode(section), (
        f'Symbol {name} in unexpected section "{section}"')
    assert binding in known_bindings
    assert valid_name.match(name), name

    info = SymbolInfo(name=name, offset=offset, section=section, size=size)
    # '__ThumbV7PILongThunk_' symbols are still returned, but are left out of
    # the repeated-offset bookkeeping so they never trigger the warning.
    if not info.name.startswith('__ThumbV7PILongThunk_'):
      offsets_by_name[info.name].append(info.offset)
    result.append(info)

  # Names seen at several offsets; 'OUTLINED_FUNCTION_' names are not reported.
  repeated = [
      sym for sym, offsets in offsets_by_name.items()
      if not sym.startswith('OUTLINED_FUNCTION_') and len(offsets) > 1
  ]
  if repeated:
    # Only the first 10 names, with at most 5 offsets each, are printed.
    details = '\n '.join(
        '{} {}'.format(sym, ' '.join(
            str(offset) for offset in offsets_by_name[sym][:5]))
        for sym in repeated[:10])
    logging.warning('%d symbols repeated with multiple offsets:\n %s',
                    len(repeated), details)
  return result
def SymbolInfosFromBinary(binary_filename):
  """Runs llvm-readelf to get all the symbols from a binary.

  Args:
    binary_filename: path to the binary.

  Returns:
    A list of SymbolInfo from the binary.

  Raises:
    OSError: if llvm-readelf could not be started. The failure is logged
      before being re-raised.
  """
  command = [
      _TOOL_PREFIX + 'readelf', '--syms', '--elf-output-style=JSON',
      '--pretty-print', binary_filename
  ]
  try:
    process = subprocess.Popen(command,
                               stdout=subprocess.PIPE,
                               universal_newlines=True)
  except OSError:
    logging.error('Failed to execute the command: path=%s, binary_filename=%s',
                  command[0], binary_filename)
    raise
  # Leaving the `with` block closes the stdout pipe and then waits for the
  # child, on both the success and the error path, so the pipe's file
  # descriptor is not leaked.
  with process:
    return _SymbolInfosFromStream(process.stdout)
# Matches one line of llvm-nm output: "<address> <type> <name>".
# re.VERBOSE ignores bare whitespace in the pattern, hence the explicit '[ ]'
# character classes for the separating spaces.
_LLVM_NM_LINE_RE = re.compile(
    r'^[\-0-9a-f]{8,16}'  # 8 to 16 lowercase hex digits or '-' characters.
    r'[ ](?P<symbol_type>.)'  # Single-character symbol type.
    r'[ ](?P<name>.*)$',  # Symbol name: the rest of the line.
    re.VERBOSE)
def _SymbolInfosFromLlvmNm(lines):
  """Collects the names of defined symbols from llvm-nm output lines.

  Despite the function's name, plain symbol names are returned, not SymbolInfo
  tuples. Only lines whose symbol type is 't', 'T', 'w' or 'W' contribute.
  Every line must match _LLVM_NM_LINE_RE; a non-matching line trips an
  assertion.

  Args:
    lines: Iterable of lines.

  Returns:
    [str] A list of symbol names, can be empty.
  """
  wanted_types = ('t', 'T', 'w', 'W')
  names = []
  for nm_line in lines:
    parsed = _LLVM_NM_LINE_RE.match(nm_line)
    assert parsed is not None, nm_line
    if parsed.group('symbol_type') in wanted_types:
      names.append(parsed.group('name'))
  return names
# Full path of the llvm-nm executable. It lives in the same directory as the
# other LLVM tools, so it is derived from the shared tool prefix.
_NM_PATH = _TOOL_PREFIX + 'nm'
def CheckLlvmNmExists():
  """Verifies that llvm-nm is present at _NM_PATH.

  Raises:
    AssertionError: if _NM_PATH does not exist.
  """
  # Raised explicitly instead of through an `assert` statement, so the check
  # still runs when Python is started with -O (which strips asserts). The
  # exception type is the one an assert would have raised.
  if not os.path.exists(_NM_PATH):
    raise AssertionError(
        'llvm-nm not found. Please run '
        '//tools/clang/scripts/update.py --package=objdump to install it.')
def SymbolNamesFromLlvmBitcodeFile(filename):
  """Extracts all defined symbols names from an LLVM bitcode file.

  Runs llvm-nm with --defined-only on the file and parses its output.

  Args:
    filename: (str) File to parse.

  Returns:
    [str] A list of symbol names, can be empty.

  Raises:
    AssertionError: if llvm-nm exits with a non-zero status, or if one of its
      output lines does not have the expected format.
  """
  command = (_NM_PATH, '--defined-only', filename)
  with subprocess.Popen(command,
                        shell=False,
                        stdout=subprocess.PIPE,
                        stderr=subprocess.PIPE,
                        text=True) as process:
    # communicate() drains stdout and stderr together. Reading only stdout
    # while stderr is also piped can deadlock once the child fills the stderr
    # pipe buffer.
    stdout, stderr = process.communicate()

  # Checked explicitly (not with `assert`) so the check survives python -O;
  # AssertionError is kept as the exception type for existing callers.
  if process.returncode != 0:
    raise AssertionError('{} exited with status {} for {}: {}'.format(
        _NM_PATH, process.returncode, filename, stderr.strip()))

  # Split on '\n' only, which is how iterating the text-mode pipe delimited
  # lines; the empty piece after the final newline is not a line.
  lines = stdout.split('\n')
  if lines and not lines[-1]:
    lines.pop()

  result = _SymbolInfosFromLlvmNm(lines)
  if not result:
    file_size = os.stat(filename).st_size
    logging.warning('No symbols for %s (size %d)', filename, file_size)
  return result
def GroupSymbolInfosByOffset(symbol_infos):
  """Groups symbols by their offset.

  Several symbols can share one offset, so each offset maps to a list. Within
  a list, symbols keep the order in which they were supplied.

  Args:
    symbol_infos: iterable of SymbolInfo instances

  Returns:
    a dict {offset: [symbol_info1, ...], ...}
  """
  grouped = {}
  for info in symbol_infos:
    grouped.setdefault(info.offset, []).append(info)
  return grouped