"""Utilities to extract string literals from object files.
LookupElfRodataInfo():
Runs readelf to extract and return .rodata section spec of an ELF file.
ReadFileChunks():
Reads raw data from a file, given a list of ranges in the file.
ReadStringLiterals():
Reads the ELF file to find the string contents of a list of string literals.
ResolveStringPiecesIndirect():
BulkForkAndCall() target: Given {path: [string addresses]} and
[raw_string_data for each string_section]:
- Reads {path: [src_strings]}.
- For each path, searches for src_strings in at most 1 raw_string_data over
each string_section. If found, translates to string_range and annotates it
to the string_section.
- Returns [{path: [string_ranges]} for each string_section].
ResolveStringPieces():
BulkForkAndCall() target: Given {path: [strings]} and
[raw_string_data for each string_section]:
- For each path, searches for src_strings in at most 1 raw_string_data over
each string_section. If found, translates to string_range and annotates it
to the string_section.
- Returns [{path: [string_ranges]} for each string_section].
GetNameOfStringLiteralBytes():
Converts string literal bytes to printable form, useful for assigning
full_name of string literal symbols. If any non-printable character is found
then returns models.STRING_LITERAL_NAME. Otherwise the returned string is
quoted, and may be truncated (with "[...]" appended).
"""
import ast
import collections
import itertools
import logging
import os
import string
import subprocess
import ar
import models
import parallel
import path_util
_STRING_LITERAL_LENGTH_CUTOFF = 30
def LookupElfRodataInfo(elf_path):
"""Returns (address, offset, size) for the .rodata section."""
args = [path_util.GetReadElfPath(), '-S', '--wide', elf_path]
output = subprocess.check_output(args).decode('ascii')
lines = output.splitlines()
for line in lines:
if '.rodata ' in line:
fields = line[line.index(models.SECTION_RODATA):].split()
return int(fields[2], 16), int(fields[3], 16), int(fields[4], 16)
raise AssertionError('No .rodata for command: ' + repr(args))
def ReadFileChunks(path, section_ranges):
"""Returns a list of raw data from |path|, specified by |section_ranges|.
Args:
section_ranges: List of (offset, size).
"""
ret = []
if not section_ranges:
return ret
with open(path, 'rb') as f:
for offset, size in section_ranges:
f.seek(offset)
ret.append(f.read(size))
return ret
def _ExtractArchivePath(path):
if path.endswith(')'):
start_idx = path.index('(')
return path[:start_idx]
return None
def _LookupStringSectionPositions(target, output_directory):
"""Returns a dict of object_path -> [(offset, size)...] of .rodata sections.
Args:
target: An archive path string (e.g., "foo.a") or a list of object paths.
"""
is_archive = isinstance(target, str)
args = [path_util.GetReadElfPath(), '-S', '--wide']
if is_archive:
args.append(target)
else:
path = target[0]
args.extend(target)
output = subprocess.check_output(args, cwd=output_directory).decode('ascii')
lines = output.splitlines()
section_positions_by_path = {}
cur_offsets = []
for line in lines:
if line.startswith('File: '):
if cur_offsets:
section_positions_by_path[path] = cur_offsets
cur_offsets = []
path = line[6:]
elif '.rodata.' in line:
progbits_idx = line.find('PROGBITS ')
if progbits_idx != -1:
fields = line[progbits_idx:].split()
position = (int(fields[2], 16), int(fields[3], 16))
if fields[-1] == '1':
cur_offsets.insert(0, position)
else:
cur_offsets.append(position)
if cur_offsets:
section_positions_by_path[path] = cur_offsets
return section_positions_by_path
def _ReadStringSections(target, output_directory, positions_by_path):
"""Returns a dict of object_path -> [string...] of .rodata chunks.
Args:
target: An archive path string (e.g., "foo.a") or a list of object paths.
positions_by_path: A dict of object_path -> [(offset, size)...]
"""
is_archive = isinstance(target, str)
string_sections_by_path = {}
if is_archive:
for subpath, chunk in ar.IterArchiveChunks(
os.path.join(output_directory, target)):
path = '{}({})'.format(target, subpath)
positions = positions_by_path.get(path)
if positions:
string_sections_by_path[path] = (
[chunk[offset:offset + size] for offset, size in positions])
else:
for path in target:
positions = positions_by_path.get(path)
if positions:
string_sections_by_path[path] = ReadFileChunks(
os.path.join(output_directory, path), positions)
return string_sections_by_path
def _IterStringLiterals(path, addresses, obj_sections):
"""Yields all string literals (including \0) for the given object path.
Args:
path: Object file path.
addresses: List of string offsets encoded as hex strings.
obj_sections: List of contents of .rodata.str sections read from the given
object file.
"""
next_offsets = sorted(int(a, 16) for a in addresses)
if not obj_sections:
logging.warning('Object has %d strings but no string sections: %s',
len(addresses), path)
return
for section_data in obj_sections:
cur_offsets = next_offsets
next_offsets = [0]
prev_offset = 0
for offset in cur_offsets[1:]:
if offset >= len(section_data):
next_offsets.append(offset)
continue
if offset == prev_offset or section_data[offset - 1] != 0:
next_offsets.append(offset)
continue
yield section_data[prev_offset:offset]
prev_offset = offset
if prev_offset < len(section_data):
yield section_data[prev_offset:]
def _AnnotateStringData(string_data, path_value_gen):
"""Annotates each |string_data| section data with paths and ranges.
Args:
string_data: [raw_string_data for each string_section] from an ELF file.
path_value_gen: A generator of (path, value) pairs, where |path|
is the path to an object file and |value| is a string to annotate.
Returns:
[{path: [string_ranges]} for each string_section].
"""
ret = [collections.defaultdict(list) for _ in string_data]
for path, value in path_value_gen:
first_match = -1
first_match_dict = None
for target_dict, data in zip(ret, string_data):
offset = -len(value)
while True:
offset = data.find(value, offset + len(value))
if offset == -1:
break
if offset == 0 or data[offset - 1] == 0:
break
if first_match == -1:
first_match = offset
first_match_dict = target_dict
if offset != -1:
break
if offset == -1:
offset = first_match
target_dict = first_match_dict
if offset != -1:
target_dict[path].append(str(offset) + ':' + str(len(value)))
return ret
def ResolveStringPiecesIndirect(encoded_string_addresses_by_path, string_data,
output_directory):
string_addresses_by_path = parallel.DecodeDictOfLists(
encoded_string_addresses_by_path)
any_path = next(iter(string_addresses_by_path.keys()))
target = _ExtractArchivePath(any_path)
if not target:
target = list(string_addresses_by_path.keys())
section_positions_by_path = _LookupStringSectionPositions(
target, output_directory)
string_sections_by_path = _ReadStringSections(
target, output_directory, section_positions_by_path)
def GeneratePathAndValues():
for path, object_addresses in string_addresses_by_path.items():
for value in _IterStringLiterals(
path, object_addresses, string_sections_by_path.get(path)):
yield path, value
ret = _AnnotateStringData(string_data, GeneratePathAndValues())
return [parallel.EncodeDictOfLists(x) for x in ret]
def ResolveStringPieces(encoded_strings_by_path, string_data):
strings_by_path = parallel.DecodeDictOfLists(
encoded_strings_by_path, value_transform=ast.literal_eval)
def GeneratePathAndValues():
for path, strings in strings_by_path.items():
for value in strings:
yield path, value
ret = _AnnotateStringData(string_data, GeneratePathAndValues())
return [parallel.EncodeDictOfLists(x) for x in ret]
def ReadStringLiterals(symbols, elf_path, all_rodata=False):
"""Returns an iterable of (symbol, data) for all string literal symbols.
Emitted string literal data are null-terminated bytes.
Args:
symbols: An iterable of Symbols
elf_path: Path to the executable containing the symbols.
all_rodata: Assume every symbol within .rodata that ends with a \0 is a
string literal.
"""
address, offset, _ = LookupElfRodataInfo(elf_path)
adjust = offset - address
with open(elf_path, 'rb') as f:
for symbol in symbols:
if symbol.section != 'r':
continue
f.seek(symbol.address + adjust)
data = f.read(symbol.size_without_padding)
if symbol.IsStringLiteral() or (all_rodata and data and data[-1] == 0):
yield ((symbol, data))
def GetNameOfStringLiteralBytes(b):
"""Converts string literal bytes to printable form, may be truncated."""
b = b[:_STRING_LITERAL_LENGTH_CUTOFF + 1]
if not b.isascii():
return models.STRING_LITERAL_NAME
b = b.replace(b'\r', b'').replace(b'\n', b'').replace(b'\t', b'')
b = b.rstrip(b'\00')
s = b.decode('ascii')
if s.isprintable():
if len(s) > _STRING_LITERAL_LENGTH_CUTOFF:
return f'"{s[:_STRING_LITERAL_LENGTH_CUTOFF - 3]}"...'
return f'"{s}"'
return models.STRING_LITERAL_NAME