"""Runs bcanalyzer to extract data from LLVM Bitcode (BC) files.
IsBitcodeFile():
Reads the magic header of a file to quickly decide whether it is a BC file.
ParseTag():
Heuristically parses a single-line tag from bcanalyzer dump (exported for
testing).
RunBcAnalyzerOnIntermediates():
BulkForkAndCall() target: Given BC file [paths], runs (llvm-)bcanalyzer on
each path, parses the output, extracts strings, and returns {path: [strings]}.
This file can also be run stand-alone in order to test out the logic on smaller
sample sizes.
"""
import argparse
import os
import re
import subprocess
import parallel
import path_util
# Arrays whose element width (in bytes) exceeds this limit are skipped by
# _ParseBcAnalyzer(). main() may override it through --char-width-limit.
_CHAR_WIDTH_LIMIT = 2

# Matches "=<digits>"; used to pull the numeric values out of op#=# attributes.
_RE_SPLIT = re.compile(r'=(\d+)')

# Child tags of <TYPE_BLOCK_ID> that _BcTypeInfo.Feed() ignores, i.e. that do
# not advance the running type id.
_NON_TYPE_TAGS = {'NUMENTRY', 'STRUCT_NAME'}

# Tag kinds returned by ParseTag(). They are bit flags: a self-closing tag is
# both an opening and a closing tag.
OPENING_TAG = 1
CLOSING_TAG = 2
SELF_CLOSING_TAG = OPENING_TAG | CLOSING_TAG
def _IsOpeningTag(tag_type):
  """Returns a nonzero int iff |tag_type| has the opening bit (value 1) set."""
  opening_bit = 0x1
  return tag_type & opening_bit
def _IsClosingTag(tag_type):
  """Returns a nonzero int iff |tag_type| has the closing bit (value 2) set."""
  closing_bit = 0x2
  return tag_type & closing_bit
def IsBitcodeFile(path):
  """Returns whether the file at |path| starts with the 4-byte BC magic.

  Only the first four bytes are read. A file that cannot be opened or read
  (IOError) is reported as not being a bitcode file.
  """
  bitcode_magic = b'BC\xc0\xde'
  try:
    with open(path, 'rb') as stream:
      header = stream.read(len(bitcode_magic))
  except IOError:
    return False
  return header == bitcode_magic
def ParseTag(line):
  """Heuristically parses a single-line tag from bcanalyzer dump.

  The input is machine-generated, so "good enough" parsing that favors
  simplicity is sufficient. For example, '</FOO/>' is accepted.

  Examples:
    '<TYPE_BLOCK_ID>'   ==> (OPENING_TAG, 'TYPE_BLOCK_ID', 14)
    '<ARRAY abbrevid=9 op0=1 op1=7/> Trailing text!'
                        ==> (SELF_CLOSING_TAG, 'ARRAY', 6)
    '</TYPE_BLOCK_ID>'  ==> (CLOSING_TAG, 'TYPE_BLOCK_ID', 15)

  Args:
    line: Stripped line that may have a single-line tag with trailing text.

  Returns:
    (tag_type, tag, attrib_pos) if successful, else (None, None, None).
    Details:
      tag_type: One of {OPENING_TAG, CLOSING_TAG, SELF_CLOSING_TAG}.
      tag: The tag name.
      attrib_pos: Position in |line| to start parsing attributes.
  """
  no_tag = (None, None, None)
  if len(line) < 2 or line[0] != '<':
    return no_tag

  if line[1] == '/':
    kind, name_start = CLOSING_TAG, 2
  else:
    kind, name_start = OPENING_TAG, 1

  # Advance over the tag name, which consists of alphanumerics and '_'.
  line_len = len(line)
  name_end = name_start
  while name_end < line_len and (line[name_end].isalnum() or
                                 line[name_end] == '_'):
    name_end += 1

  # The name must be non-empty and followed by one of ' ', '>' or '/'.
  if name_end == line_len or name_end == name_start:
    return no_tag
  if line[name_end] not in ' >/':
    return no_tag

  close_pos = line.find('>', name_end)
  if close_pos < 0:
    return no_tag
  # A '/' right before the first '>' marks the tag as self-closing, whether or
  # not it also started with '</'.
  if line[close_pos - 1] == '/':
    kind = SELF_CLOSING_TAG
  return (kind, line[name_start:name_end], name_end)
def _ParseOpItems(line, pos):
  """Heuristically extracts op0=# op1=# ... values from a single-line tag.

  This is a generator, so nothing is parsed until the first item is requested.

  Args:
    line: Line holding a single-line tag, e.g. '<SETTYPE abbrevid=4 op0=42/>'.
    pos: Position in |line| from which to search for the first ' op'. The
      ' op' prefix keeps other attributes such as abbrevid=# out of the result.

  Yields:
    Each "=<digits>" value found between the first ' op' and the next '>', as
    int, in order of appearance.

  Raises:
    ValueError: If ' op' or the following '>' is not found (raised by
      str.index() when the first item is requested).
  """
  ops_begin = line.index(' op', pos)
  ops_end = line.index('>', ops_begin)
  attribs = line[ops_begin:ops_end]
  yield from (int(m.group(1)) for m in _RE_SPLIT.finditer(attribs))
def _UnpackUint16ListToBytes(items):
  """Yields the low two bytes of each int in |items|, least significant first."""
  for value in items:
    for shift in (0, 8):
      yield (value >> shift) & 0xFF
def _UnpackUint32ListToBytes(items):
  """Yields the low four bytes of each int in |items|, least significant first."""
  for value in items:
    for shift in (0, 8, 16, 24):
      yield (value >> shift) & 0xFF
class _BcIntArrayType:
  """The specs of an integer array type.

  Attributes:
    length: Number of elements in the array.
    width: Size of each element, in bytes.
  """

  # Dedicated unpackers for the common element widths. Any other width (e.g. 8,
  # which becomes reachable when --char-width-limit is raised) is handled by
  # _UnpackGeneric() instead of failing with KeyError.
  _UNPACKER_MAP = {
      1: iter,
      2: _UnpackUint16ListToBytes,
      4: _UnpackUint32ListToBytes,
  }

  def __init__(self, length, width):
    self.length = length
    self.width = width

  def _UnpackGeneric(self, items):
    """Yields the low |self.width| bytes of each item, little-endian."""
    mask = (1 << (8 * self.width)) - 1
    for item in items:
      yield from (item & mask).to_bytes(self.width, 'little')

  def ParseOpItemsAsBytes(self, line, attrib_pos, add_null_at_end):
    """Reads op0=# op1=# ... values and returns them as bytes.

    Interprets each op0=# op1=# ... value as a |self.width|-byte integer,
    splits them into component bytes (little-endian), and returns the result
    as a bytes object.

    Args:
      line: Stripped line of single-line tag with op0=# op1=# ... data.
      attrib_pos: Position in |line| where attribute list starts.
      add_null_at_end: Whether to append |b'\x00' * self.width|.

    Returns:
      A bytes object of exactly |self.length * self.width| bytes.

    Raises:
      ValueError: If |line| has no op attributes, or if a 1-byte item does not
        fit in a byte.
      AssertionError: If the decoded size does not match the array type.
    """
    items = _ParseOpItems(line, attrib_pos)
    unpacker = _BcIntArrayType._UNPACKER_MAP.get(self.width)
    if unpacker is None:
      unpacker = self._UnpackGeneric
    s = bytes(unpacker(items))
    if add_null_at_end:
      s += b'\x00' * self.width
    assert len(s) == self.length * self.width, (
        'Decoded %d bytes, expected %d' % (len(s), self.length * self.width))
    return s
class _BcTypeInfo:
  """Stateful parser of <TYPE_BLOCK_ID>, specialized for integer arrays.

  Type ids are assigned in feed order: every fed tag that is not listed in
  _NON_TYPE_TAGS consumes the next id, whether or not it is recorded.
  """

  def __init__(self):
    # Id that the next fed type tag will receive.
    self.cur_type_id = 0
    # {type id: bit width} for INTEGER types.
    self.int_types = {}
    # {type id: _BcIntArrayType} for ARRAY types whose element is a known
    # INTEGER type.
    self.int_array_types = {}

  def Feed(self, line, tag, attrib_pos):
    """Parses a single-line tag and stores integer and integer array types.

    Args:
      line: Stripped line of single-line tag with op0=# op1=# ... data.
      tag: The tag type in |line| (child tag of <TYPE_BLOCK_ID>).
      attrib_pos: Position in |line| where attribute list starts.
    """
    if tag in _NON_TYPE_TAGS:
      return

    if tag == 'INTEGER':
      # op0 is the integer's width in bits.
      self.int_types[self.cur_type_id] = next(_ParseOpItems(line, attrib_pos))
    elif tag == 'ARRAY':
      # Exactly two ops are expected: element count, then element type id.
      size, item_type_id = tuple(_ParseOpItems(line, attrib_pos))
      elem_bits = self.int_types.get(item_type_id)
      if elem_bits is not None:
        array_type = _BcIntArrayType(size, elem_bits // 8)
        self.int_array_types[self.cur_type_id] = array_type

    # Advance only after successful parsing, including for unrecorded tags.
    self.cur_type_id += 1

  def GetArrayType(self, idx):
    """Returns the _BcIntArrayType stored under type id |idx|, or None."""
    return self.int_array_types.get(idx, None)
def _ParseBcAnalyzer(lines):
  """A generator to extract bytes() from bcanalyzer dump of a BC file.

  Only the first <TYPE_BLOCK_ID> and the first <CONSTANTS_BLOCK> that follows
  it are examined; parsing stops when that constants block closes.

  Args:
    lines: Iterable of text lines from a bcanalyzer dump.

  Yields:
    (array_type, data) for every CSTRING / STRING / DATA record whose current
    type is an integer array no wider than _CHAR_WIDTH_LIMIT bytes per
    element. |array_type| is a _BcIntArrayType and |data| is bytes.
  """
  OUTSIDE, IN_TYPE_BLOCK, IN_CONST_BLOCK = range(3)
  phase = OUTSIDE
  types = None
  active_array_type = None

  for raw_line in lines:
    text = raw_line.lstrip()
    kind, name, attrib_pos = ParseTag(text)
    if kind is None:
      continue

    if phase == OUTSIDE:
      if not _IsOpeningTag(kind):
        continue
      if name == 'TYPE_BLOCK_ID' and types is None:
        # Only the first type block is parsed.
        types = _BcTypeInfo()
        phase = IN_TYPE_BLOCK
      elif name == 'CONSTANTS_BLOCK' and types is not None:
        # Constants are meaningless until the types are known.
        phase = IN_CONST_BLOCK
      continue

    if phase == IN_TYPE_BLOCK:
      if _IsClosingTag(kind) and name == 'TYPE_BLOCK_ID':
        phase = OUTSIDE
      else:
        types.Feed(text, name, attrib_pos)
      continue

    # phase == IN_CONST_BLOCK from here on.
    if _IsClosingTag(kind) and name == 'CONSTANTS_BLOCK':
      # Done: everything after the first constants block is skipped.
      return

    if name == 'SETTYPE':
      try:
        type_id = next(_ParseOpItems(text, attrib_pos))
      except StopIteration:
        return
      # None when the type is not a known integer array.
      active_array_type = types.GetArrayType(type_id)
      continue

    if not active_array_type or active_array_type.width > _CHAR_WIDTH_LIMIT:
      continue
    if name in ('CSTRING', 'STRING', 'DATA'):
      # Only CSTRING has its terminating NUL appended here.
      data = active_array_type.ParseOpItemsAsBytes(text, attrib_pos,
                                                   name == 'CSTRING')
      yield (active_array_type, data)
class _BcAnalyzerRunner:
  """Helper to run bcanalyzer and extract output lines."""

  def __init__(self, output_directory):
    """Prepares the command prefix.

    Args:
      output_directory: Directory used as the working directory of every
        bcanalyzer invocation.
    """
    bcanalyzer_path = path_util.GetBcAnalyzerPath()
    self._args = [bcanalyzer_path, '--dump', '--disable-histogram']
    self._output_directory = output_directory

  def RunOnFile(self, obj_file):
    """Runs bcanalyzer on |obj_file| and returns its stdout as a list of lines.

    The output is decoded as ASCII. subprocess.check_output() raises
    CalledProcessError if the tool exits with a nonzero status.
    """
    command = [*self._args, obj_file]
    raw_output = subprocess.check_output(command, cwd=self._output_directory)
    return raw_output.decode('ascii').splitlines()
def RunBcAnalyzerOnIntermediates(target, output_directory):
  """Calls bcanalyzer and returns encoded map from path to strings.

  Args:
    target: A list of BC file paths.
    output_directory: Working directory for the bcanalyzer invocations.

  Returns:
    The result of parallel.EncodeDictOfLists() applied to
    {path: [bytes, ...]}, with each bytes value passed through repr().
  """
  assert isinstance(target, list)
  runner = _BcAnalyzerRunner(output_directory)
  strings_by_path = {
      path: [data for _, data in _ParseBcAnalyzer(runner.RunOnFile(path))]
      for path in target
  }
  return parallel.EncodeDictOfLists(strings_by_path, value_transform=repr)
def main():
  """Stand-alone mode: prints the strings extracted from each given BC file.

  Command-line arguments:
    --output-directory: Working directory for bcanalyzer, and the base against
      which printed paths are made relative (default '.').
    --char-width-limit: If given, overrides the module-level _CHAR_WIDTH_LIMIT.
    objects: One or more BC file paths (resolved with os.path.realpath).
  """
  global _CHAR_WIDTH_LIMIT

  arg_parser = argparse.ArgumentParser()
  arg_parser.add_argument('--output-directory', default='.')
  arg_parser.add_argument('--char-width-limit', type=int)
  arg_parser.add_argument('objects', type=os.path.realpath, nargs='+')
  options = arg_parser.parse_args()

  base_path = os.path.normpath(options.output_directory)
  runner = _BcAnalyzerRunner(options.output_directory)
  if options.char_width_limit is not None:
    _CHAR_WIDTH_LIMIT = options.char_width_limit

  for obj_path in options.objects:
    print('File: %s' % os.path.relpath(obj_path, base_path))
    for array_type, data in _ParseBcAnalyzer(runner.RunOnFile(obj_path)):
      bits_per_char = array_type.width * 8
      print('    char%d[%d]: %r' % (bits_per_char, array_type.length, data))
    print('')


# Run the stand-alone mode only when executed as a script.
if __name__ == '__main__':
  main()