"""Parses R8 disassembly outputs."""
import argparse
import code
import collections
import dataclasses
import logging
import readline
import typing
@dataclasses.dataclass
class DexMethod:
  """Record for a single method taken from an R8 DEX dump.

  Only |name| and |class_name| are required; the remaining fields default to
  None when they are not supplied.
  """
  # Method name, without class or signature.
  name: str
  # Name of the class that the method is listed under.
  class_name: str
  # Parameter type names, in declaration order.
  param_types: typing.Optional[list] = None
  # Return type name.
  return_type: typing.Optional[str] = None
  # Lines of disassembled code.
  bytecode: typing.Optional[list] = None
class DexClass:
  """A named class together with the list of methods collected for it."""

  def __init__(self, name):
    self.methods = []
    self.name = name

  def FindMethodByteCode(self, class_name, method_name, param_types,
                         return_type):
    """Looks up the bytecode of a method by its full description.

    Args:
      class_name: Expected |class_name| attribute of the method.
      method_name: Expected |name| attribute of the method.
      param_types: Expected |param_types| attribute of the method.
      return_type: Expected |return_type| attribute of the method.

    Returns:
      The |bytecode| attribute of the first entry in |self.methods| for which
      all four attributes compare equal, or None if there is no such entry.
    """
    matching_bytecode = (candidate.bytecode for candidate in self.methods
                         if candidate.name == method_name
                         and candidate.class_name == class_name
                         and candidate.return_type == return_type
                         and candidate.param_types == param_types)
    return next(matching_bytecode, None)
class _WrapPeekableNoNewLine:
  """Line iterator decorator with peek(), and strips new line.

  End of input is reported by returning None from peek() and next(), not by
  raising StopIteration.
  """

  def __init__(self, it):
    self._it = it
    # 1-based number of the line that peek() would return.
    self._peek_lineno = 1
    # Line read ahead by peek() and not yet handed out by next().
    self._buf = None
    # Line most recently returned by next().
    self._cur = None

  def _next_internal(self):
    """Reads one line from the wrapped iterator, or None when exhausted."""
    raw_line = next(self._it, None)
    if raw_line is None:
      return None
    return raw_line.rstrip('\n')

  def __next__(self):
    """Consumes and returns the next line (None at end of input)."""
    if self._buf is not None:
      # Hand out the line that an earlier peek() already fetched.
      self._cur = self._buf
      self._buf = None
    else:
      self._cur = self._next_internal()
    self._peek_lineno += 1
    return self._cur

  def peek(self):
    """Returns the next line without consuming it (None at end of input)."""
    if self._buf is None:
      self._buf = self._next_internal()
    return self._buf

  def format_error(self, expected, is_peek=False):
    """Builds a "Line N: Expected ..., got ..." message.

    Args:
      expected: Description of what the parser expected to see.
      is_peek: If True, report on the line that peek() returns; otherwise
        report on the line most recently returned by next().

    Returns:
      The formatted message string.
    """
    if is_peek:
      lineno = self._peek_lineno
      got = self.peek()
    else:
      lineno = self._peek_lineno - 1
      got = self._cur
    return 'Line %d: Expected %s, got %r.' % (lineno, expected, got)
def _ExtractMethodInfo(it):
  """Extracts coarse method data from R8 DEX dump.

  "Coarse" meaning detailed Method info data are left unparsed. Reading stops
  at end of input or at the next "# Class:" line, which is left unconsumed.

  Each method is expected to be laid out as:
    # Method: '<method_str>'
    # <access flags>
    # Residual: '<residual_str>'    (optional)
    #
    <blank line>
    <signature>                     (missing for methods without code)
    registers: ...
    <lines of disassembled code>

  Args:
    it: _WrapPeekableNoNewLine iterator for lines of R8 DEX dump.

  Yields:
    Tuples of (method_str, residual_str, signature, byte_code), where:
    method_str: De-quoted string after "# Method:".
    residual_str: De-quoted string after "# Residual:", or '' if absent.
    signature: Signature string, or '' for methods without code.
    byte_code: Lines of disassembled code (blank lines removed).

  Raises:
    AssertionError: If the lines do not follow the expected layout.
  """

  def is_end():
    line = it.peek()
    return line is None or line.startswith('# Class:')

  def check(condition, expected, is_peek=False):
    # Raised explicitly instead of using `assert`, so that validation is not
    # stripped under `python -O`. The message is only built on failure.
    if not condition:
      raise AssertionError(it.format_error(expected, is_peek))

  def dequote(line, expected):
    # Returns the text between the first pair of single quotes in |line|.
    parts = line.split('\'')
    check(len(parts) > 1, expected)
    return parts[1]

  while True:
    method_str = ''
    residual_str = ''
    signature = ''
    byte_code = []

    # Skip ahead to the next "# Method:" header of the current class.
    while not is_end():
      if it.peek().startswith('# Method:'):
        break
      next(it)
    else:
      # No more methods: |method_str| is empty, so nothing more is yielded.
      break

    method_str = dequote(next(it), 'quoted method after "# Method:"')
    line = next(it)
    check(line is not None and line.startswith('# '),
          'comment with access flags')
    line = next(it)
    if line is not None and line.startswith('# Residual:'):
      residual_str = dequote(line, 'quoted method after "# Residual:"')
      line = next(it)
    check(line == '#', 'empty comment')
    # The blank line must be consumed whether or not validation is active, so
    # read it outside of the check.
    line = next(it)
    check(line == '', 'empty line')

    if it.peek() is None:
      # Input ends right after the header; the method is yielded below.
      break
    if it.peek().startswith('#'):
      # Another comment follows directly: this method has no signature/code.
      yield (method_str, residual_str, signature, byte_code)
      continue

    check(it.peek() != '', 'comment or signature', True)
    signature = next(it)
    check((it.peek() or '').startswith('registers:'),
          'registers/inputs/outputs', True)

    # Collect non-blank lines until the next comment, class, or end of input.
    while not is_end():
      if it.peek().startswith('#'):
        break
      line = next(it)
      if line:
        byte_code.append(line + '\n')
    else:
      # End of class or input: the method is yielded below.
      break
    yield (method_str, residual_str, signature, byte_code)

  # Flush the method that was being parsed when the loop was left.
  if method_str:
    yield (method_str, residual_str, signature, byte_code)
def _ExtractClassNameAndMethodInfos(r8_lines_it):
  """Extracts Class and coarse Method info of R8 DEX dump.

  Lines are skipped until one starting with "# Class:" is found; the lines
  that follow it are then handed to _ExtractMethodInfo().

  Args:
    r8_lines_it: Iterator for lines of R8 DEX dump.

  Yields:
    (class_name, method_infos) pairs, where:
    class_name: De-quoted string after "# Class:".
    method_infos: List of coarse Method info from _ExtractMethodInfo().
  """
  peekable = _WrapPeekableNoNewLine(r8_lines_it)
  while peekable.peek() is not None:
    header = next(peekable)
    if header.startswith('# Class:'):
      class_name = header.split("'")[1]
      yield class_name, list(_ExtractMethodInfo(peekable))
def _SplitMethod(signature):
  """Splits a method signature into components.

  The last character of |signature| is dropped unconditionally; it is expected
  to be the closing ")".

  Args:
    signature: "{return_type} {class_name}.{method_name}({param_types})".

  Returns:
    return_type: String for "{return_type}".
    class_name: String for "{class_name}", or '' if the name has no ".".
    method_name: String for "{method_name}".
    param_types: List of strings for "{param_types}", split on ", ".
  """
  head, param_str = signature[:-1].split('(')
  return_type, qualified_name = head.split(' ')
  # Everything before the last "." is the class; without a "." the class is ''.
  class_name, _, method_name = qualified_name.rpartition('.')
  if param_str:
    param_types = param_str.split(', ')
  else:
    param_types = []
  return return_type, class_name, method_name, param_types
def _ExtractMethodLegacy(method_name, signature):
  """Extracts method info for legacy format.

  Everything except the method name is taken from |signature|; the method
  name parsed from |signature| is discarded in favor of |method_name|.

  Returns: Same as _SplitMethod().
  """
  parts = _SplitMethod(signature)
  return parts[0], parts[1], method_name, parts[3]
def _CompareSignatures(return_type1, param_types1, return_type2, param_types2):
  """Computes similarity score of two method signatures.

  The return types contribute 10% of the score (all or nothing) and the
  parameter lists contribute 90%.
  """

  def param_similarity():
    if param_types1 == param_types2:
      return 1.0
    overlap = min(len(param_types1), len(param_types2))
    if overlap == 0:
      return 0.0
    # Fraction of the overlapping positions that hold the same type.
    same_position = sum(
        lhs == rhs for lhs, rhs in zip(param_types1, param_types2)) / overlap
    # Fraction of types in common when order is ignored.
    shared = (collections.Counter(param_types1)
              & collections.Counter(param_types2))
    same_multi_set = sum(shared.values()) / overlap
    # Small penalty for each parameter by which the lengths differ.
    length_diff = abs(len(param_types1) - len(param_types2))
    return same_position * 0.6 + same_multi_set * 0.3 - length_diff * 0.02

  ret_score = int(return_type1 == return_type2)
  param_score = param_similarity()
  return ret_score * 0.1 + param_score * 0.9
def _ExtractMethod(method_str, residual_str):
  """Extracts method info for updated format.

  |method_str| may hold several candidate signatures separated by " <OR> ".
  In that case the candidate most similar to the residual signature (as
  scored by _CompareSignatures()) is chosen; ties go to the later candidate.
  If |method_str| is not a signature, the residual signature is used instead.

  Args:
    method_str: De-quoted string after "# Method:".
    residual_str: De-quoted string after "# Residual:", or '' if absent.

  Returns: Same as _SplitMethod().

  Raises:
    AssertionError: If the candidates are inconsistent, or if neither
      |method_str| nor |residual_str| provides a signature.
  """
  # Failures raise AssertionError explicitly instead of using `assert`, so
  # that they are not stripped under `python -O`.
  residual = _SplitMethod(residual_str) if residual_str else None
  alternatives = method_str.split(' <OR> ')
  if alternatives[0].endswith(')'):
    if not all(alt.endswith(')') for alt in alternatives):
      raise AssertionError(
          'Expected all alternatives to be signatures: %r' % method_str)
    method_parts = [_SplitMethod(alt) for alt in alternatives]
    if residual is None:
      # Without a residual signature there is nothing to rank the candidates
      # against, so fall back to the first one.
      return method_parts[0]
    scores = [(_CompareSignatures(parts[0], parts[3], residual[0],
                                  residual[3]), i)
              for i, parts in enumerate(method_parts)]
    return method_parts[max(scores)[1]]
  if len(alternatives) != 1:
    raise AssertionError(
        'Expected a single non-signature entry: %r' % method_str)
  if residual:
    return residual
  raise AssertionError('Failed to extract method.')
def Parse(lines):
  """Parses R8 disassembly lines into DexClass.

  Only methods that have disassembled code are stored. If a class name occurs
  more than once, the later DexClass replaces the earlier one in the result.

  Args:
    lines: Iterator for lines of R8 DEX dump.

  Returns:
    class_obj_map: Dict from class name to DexClass instances.
    anomalies: List [(class_name, extracted_class_name, method_name)] with
      unexplained mismatch between "{class_name}" from "Class:" and class name
      extracted from method signatures.
  """
  class_obj_map = {}
  anomalies = []
  # Tallies that are reported once at the end via logging.debug().
  num_methods = 0
  num_with_code = 0
  num_synthetics = 0
  # Number of times each kind of warning may still be emitted.
  signature_warnings_left = 8
  synthetic_warnings_left = 8

  for class_name, method_infos in _ExtractClassNameAndMethodInfos(lines):
    class_obj = DexClass(class_name)
    class_obj_map[class_name] = class_obj
    base_class_name = class_name.split('$$')[0]

    for method_str, residual_str, signature, byte_code in method_infos:
      num_methods += 1
      if not byte_code:
        continue
      num_with_code += 1

      if ' ' in method_str:
        # Updated format: "# Method:" holds full signature(s).
        mismatched = signature not in ('', method_str)
        if mismatched and signature_warnings_left > 0:
          signature_warnings_left -= 1
          logging.warning('Found signature not matching "# Method":')
          logging.warning(' %s', signature)
          logging.warning(' %s', method_str)
        method_parts = _ExtractMethod(method_str, residual_str)
      else:
        # Legacy format: "# Method:" holds a name without spaces.
        method_parts = _ExtractMethodLegacy(method_str, signature)
      return_type, extracted_class_name, method_name, param_types = (
          method_parts)

      # A differing class name that has no "." is ignored.
      if extracted_class_name != class_name and '.' in extracted_class_name:
        if extracted_class_name.split('$$')[0] == base_class_name:
          # The names agree up to the first "$$".
          if 'Synthetic' not in class_name and synthetic_warnings_left > 0:
            synthetic_warnings_left -= 1
            logging.warning('Found "$$" in non-synthetic class: %s',
                            class_name)
          num_synthetics += 1
        else:
          anomalies.append((class_name, extracted_class_name, method_name))

      class_obj.methods.append(
          DexMethod(method_name, class_name, param_types, return_type,
                    byte_code))

  logging.debug(
      'R8 disassembler method stats: Found %d total, %d with code, '
      '%d affected by synthetics, %d anomalies.', num_methods, num_with_code,
      num_synthetics, len(anomalies))
  return class_obj_map, anomalies
def main():
  """Parses the R8 disassembly file given on the command line.

  The results are then made available in an interactive console as the
  variables |class_obj_map| and |anomalies|.
  """
  parser = argparse.ArgumentParser()
  parser.add_argument('input',
                      type=str,
                      help='File containing outputs of deobfuscated R8 '
                      'disassembly output.')
  args = parser.parse_args()

  with open(args.input, 'rt', encoding='utf-8') as fh:
    class_obj_map, anomalies = Parse(fh)

  variables = {'class_obj_map': class_obj_map, 'anomalies': anomalies}
  # Parse() keys |class_obj_map| by class name, so the banner says so.
  banner = '\n'.join([
      '=' * 80,
      'class_obj_map: {class_name: DexClass obj}',
      'anomalies: [(class_name, extracted_class_name, method_name)]',
  ])
  code.InteractiveConsole(variables).interact(banner)


if __name__ == '__main__':
  main()