"""Utilities for ARSC file parsing.
This file provides tools to perform shallow parsing for binary size analysis at
chunk level, without comprehensive error detection.
ARSC file format details are extracted from:
https://android.googlesource.com/platform/frameworks/base/+/master/libs/androidfw/include/androidfw/ResourceTypes.h
https://android.googlesource.com/platform/frameworks/base/+/master/libs/androidfw/ResourceTypes.cpp
"""
import argparse
import collections
import dataclasses
import logging
import functools
import os
import re
import struct
import sys
import zipfile
import stream_reader
# Chunk type constants. These are compared against the |type| value read from
# a chunk header to decide which chunk class parses the chunk.
_RES_NULL_TYPE = 0x0000
_RES_STRING_POOL_TYPE = 0x0001
_RES_TABLE_TYPE = 0x0002
_RES_XML_TYPE = 0x0003
# XML chunk types. Note that _RES_XML_FIRST_CHUNK_TYPE and
# _RES_XML_START_NAMESPACE_TYPE share the value 0x0100.
_RES_XML_FIRST_CHUNK_TYPE = 0x0100
_RES_XML_START_NAMESPACE_TYPE = 0x0100
_RES_XML_END_NAMESPACE_TYPE = 0x0101
_RES_XML_START_ELEMENT_TYPE = 0x0102
_RES_XML_END_ELEMENT_TYPE = 0x0103
_RES_XML_CDATA_TYPE = 0x0104
_RES_XML_LAST_CHUNK_TYPE = 0x017f
_RES_XML_RESOURCE_MAP_TYPE = 0x0180
# Resource table chunk types.
_RES_TABLE_PACKAGE_TYPE = 0x0200
_RES_TABLE_TYPE_TYPE = 0x0201
_RES_TABLE_TYPE_SPEC_TYPE = 0x0202
_RES_TABLE_LIBRARY_TYPE = 0x0203
_RES_TABLE_OVERLAYABLE_TYPE = 0x0204
_RES_TABLE_OVERLAYABLE_POLICY_TYPE = 0x0205
_RES_TABLE_STAGED_ALIAS_TYPE = 0x0206
# Per-string record produced by ArscStringPool: |enc_size| is the byte size
# attributed to the string in the pool, |data| is its raw (undecoded) bytes.
_StringInfo = collections.namedtuple('StringInfo', 'enc_size,data')
class _ArscStreamReader(stream_reader.StreamReader):
  """StreamReader extended with ARSC-specific primitives and chunk dispatch."""

  # Lazily built {chunk type -> chunk factory} table shared by all readers.
  # Kept at class level (instead of functools.lru_cache on the method) so the
  # cache never holds a reference to a reader instance or its data.
  _res_type_to_class_map = None

  def Clone(self):
    """Returns a new reader over the same data, at the same position."""
    ret = _ArscStreamReader(self._data)
    ret.Seek(self.Tell())
    return ret

  def NextArscEncodedLengthUtf8(self):
    """Reads a 1- or 2-byte length; high bit of byte 1 selects 2-byte form."""
    byte1 = self.NextUByte()
    if byte1 & 0x80:
      return ((byte1 & 0x7F) << 8) | self.NextUByte()
    return byte1

  def NextArscEncodedLengthWide(self):
    """Reads a 1- or 2-short length; high bit of short 1 selects long form."""
    short1 = self.NextUShort()
    if short1 & 0x8000:
      return ((short1 & 0x7FFF) << 16) | self.NextUShort()
    return short1

  def NextCString(self, n):
    """Reads |n| bytes and returns the latin1 text before the first NUL."""
    t = self.NextBytes(n)
    term = t.find(0)
    t = t if term < 0 else t[:term]
    return t.decode('latin1')

  def PeekArscHeaderType(self):
    """Returns the chunk type at the current position without advancing."""
    pos = self.Tell()
    ret = self.NextUShort()
    self.Seek(pos)
    return ret

  def GetArscResTypeToClassMap(self):
    """Returns a lookup table to map chunk type to class.

    Requires all classes in this file to be parsed, before calling. The table
    is built on first use and then shared by every reader.
    """
    cached = _ArscStreamReader._res_type_to_class_map
    if cached is not None:
      return cached

    def MakeGeneric(type_name):
      return lambda reader, parent: ArscGeneric(type_name, reader, parent)

    cached = {
        _RES_STRING_POOL_TYPE: ArscStringPool,
        _RES_TABLE_TYPE: ArscResTable,
        _RES_XML_TYPE: MakeGeneric('XML'),
        # Same key as _RES_XML_START_NAMESPACE_TYPE below; the later entry
        # is the one that ends up in the table.
        _RES_XML_FIRST_CHUNK_TYPE: MakeGeneric('XML_FIRST_CHUNK'),
        _RES_XML_START_NAMESPACE_TYPE: MakeGeneric('XML_START_NAMESPACE'),
        _RES_XML_END_NAMESPACE_TYPE: MakeGeneric('XML_END_NAMESPACE'),
        _RES_XML_START_ELEMENT_TYPE: MakeGeneric('XML_START_ELEMENT'),
        _RES_XML_END_ELEMENT_TYPE: MakeGeneric('XML_END_ELEMENT'),
        _RES_XML_CDATA_TYPE: MakeGeneric('XML_CDATA'),
        _RES_XML_LAST_CHUNK_TYPE: MakeGeneric('XML_LAST_CHUNK'),
        _RES_XML_RESOURCE_MAP_TYPE: MakeGeneric('XML_RESOURCE_MAP'),
        _RES_TABLE_PACKAGE_TYPE: ArscResTablePackage,
        _RES_TABLE_TYPE_TYPE: ArscResTableType,
        _RES_TABLE_TYPE_SPEC_TYPE: ArscResTableTypeSpec,
        _RES_TABLE_LIBRARY_TYPE: MakeGeneric('LIBRARY'),
        _RES_TABLE_OVERLAYABLE_TYPE: MakeGeneric('OVERLAYABLE'),
        _RES_TABLE_OVERLAYABLE_POLICY_TYPE: MakeGeneric('OVERLAYABLE_POLICY'),
        _RES_TABLE_STAGED_ALIAS_TYPE: MakeGeneric('STAGED_ALIAS'),
    }
    _ArscStreamReader._res_type_to_class_map = cached
    return cached

  def NextArscChunk(self, parent=None):
    """Parses the chunk at the current position and seeks to its end.

    Unknown chunk types are parsed as plain ArscChunk.
    """
    chunk_type = self.PeekArscHeaderType()
    arsc_class = self.GetArscResTypeToClassMap().get(chunk_type) or ArscChunk
    chunk = arsc_class(self, parent=parent)
    self.Seek(chunk.end_addr)
    return chunk
def _SplitBits(value, *widths):
for width in widths:
yield value & ((1 << width) - 1)
value >>= width
class ResTableConfig:
  """Structure to specify |config| in ArscResTableType.

  Reads a fixed 64-byte config record from |reader| and can render it as a
  '-'-joined qualifier string (see EmitAllTokens()).
  """

  def __init__(self, reader):
    self.size = reader.NextUInt()
    assert self.size == 64
    self.imsi = reader.NextUInt()
    self.locale = reader.NextUInt()
    self.screen_type = reader.NextUInt()
    self.input = reader.NextUInt()
    self.screen_size = reader.NextUInt()
    self.version = reader.NextUInt()
    self.screen_config = reader.NextUInt()
    self.screen_size_dp = reader.NextUInt()
    self.locale_script = reader.NextCString(4)
    self.locale_variant = reader.NextCString(8)
    self.screen_config2 = reader.NextUInt()
    self.locale_script_was_computed = bool(reader.NextUByte())
    self.locale_numbering_system = reader.NextCString(8)
    # Trailing bytes that bring the record up to |size| (64) bytes.
    reader.Skip(3)

  def EmitField(self, v, start, width, fmt, lookup=None):
    """Formats and yields bit range from an integer if non-0.

    Args:
      v: Integer holding the bit field.
      start: Index of the lowest bit of the field.
      width: Number of bits in the field.
      fmt: %-format string used when |lookup| has no name for the value.
      lookup: Optional {value -> token} dict of predefined names.
    """
    bits = (v >> start) & ((1 << width) - 1)
    if bits == 0:
      return
    named = lookup.get(bits) if lookup else None
    yield named or fmt % bits

  def DecodeLanguageOrRegion(self, code, base):
    """Decodes a 2-byte language or region code into a string.

    |code| holds the two stored bytes with the first byte in the low 8 bits.
    If the first byte has its high bit clear, the two bytes are the characters
    themselves. Otherwise three 5-bit values are packed across the two bytes
    (mirroring unpackLanguageOrRegion() in ResourceTypes.cpp), each an offset
    from |base|.

    Args:
      code: 16-bit integer with the first stored byte in bits [0, 8).
      base: Code point added to each packed 5-bit value.
    """
    byte0 = code & 0xff
    byte1 = (code >> 8) & 0xff
    if byte0 & 0x80 == 0:
      return (chr(byte0) + chr(byte1)).rstrip('\0')
    first = byte1 & 0x1f
    second = (byte1 >> 5) | ((byte0 & 0x03) << 3)
    third = (byte0 >> 2) & 0x1f
    return ''.join(chr(base + v) for v in (first, second, third))

  def EmitLocaleString(self):
    """Emits locale data as formatted string if non-0."""
    if self.locale == 0:
      return
    language, country = _SplitBits(self.locale, 16, 16)
    script_was_provided = (self.locale_script
                           and not self.locale_script_was_computed)
    ret = self.DecodeLanguageOrRegion(language, ord('a'))
    if (not script_was_provided and not self.locale_variant
        and not self.locale_numbering_system):
      # Legacy "lang-rREGION" form.
      if country != 0:
        ret += '-r' + self.DecodeLanguageOrRegion(country, ord('0'))
    else:
      # "b+"-prefixed form with '+'-separated subtags.
      ret = 'b+' + ret
      if script_was_provided:
        ret += '+' + self.locale_script
      if country != 0:
        ret += '+' + self.DecodeLanguageOrRegion(country, ord('0'))
      if self.locale_variant:
        ret += '+' + self.locale_variant
      if self.locale_numbering_system:
        ret += '+u+nu+' + self.locale_numbering_system
    yield ret

  def EmitAllTokens(self):
    """Emits all non-0 data as string tokens.

    The formatting follows ResTable_config::toString(). Typically each value is
    extracted as bit ranges in some member variable. If a value matches a
    predefined constant (e.g., 4 = "xlarge") then the token is taken as the
    string. Otherwise a fallback (e.g., "screenLayoutSize=5") is used instead.
    """
    yield from self.EmitField(self.imsi, 0, 16, 'mcc%d')
    yield from self.EmitField(self.imsi, 16, 16, 'mnc%d')
    yield from self.EmitLocaleString()
    yield from self.EmitField(self.screen_config, 6, 2, 'layoutDir=%d', {
        1: 'ldltr',
        2: 'ldrtl',
    })
    yield from self.EmitField(self.screen_config, 16, 16, 'sw%ddp')
    yield from self.EmitField(self.screen_size_dp, 0, 16, 'w%ddp')
    yield from self.EmitField(self.screen_size_dp, 16, 16, 'h%ddp')
    yield from self.EmitField(self.screen_config, 0, 4, 'screenLayoutSize=%d', {
        1: 'small',
        2: 'normal',
        3: 'large',
        4: 'xlarge',
    })
    yield from self.EmitField(self.screen_config, 4, 2, 'screenLayoutLong=%d', {
        1: 'notlong',
        2: 'long',
    })
    yield from self.EmitField(self.screen_config2, 0, 2, 'screenRound=%d', {
        1: 'notround',
        2: 'round',
    })
    yield from self.EmitField(self.screen_config2, 8, 2, 'wideColorGamut=%d', {
        1: 'nowidecg',
        2: 'widecg',
    })
    yield from self.EmitField(self.screen_config2, 10, 2, 'hdr=%d', {
        1: 'lowdr',
        2: 'highdr',
    })
    yield from self.EmitField(self.screen_type, 0, 8, 'orientation=%d', {
        1: 'port',
        2: 'land',
        3: 'square',
    })
    yield from self.EmitField(self.screen_config, 8, 4, 'uiModeType=%d', {
        1: 'normal',
        2: 'desk',
        3: 'car',
        4: 'television',
        5: 'appliance',
        6: 'watch',
        7: 'vrheadset',
    })
    yield from self.EmitField(self.screen_config, 12, 2, 'uiModeNight=%d', {
        1: 'notnight',
        2: 'night',
    })
    yield from self.EmitField(self.screen_type, 16, 16, '%ddpi', {
        120: 'ldpi',
        160: 'mdpi',
        213: 'tvdpi',
        240: 'hdpi',
        320: 'xhdpi',
        480: 'xxhdpi',
        640: 'xxxhdpi',
        0xffff: 'nodpi',
        0xfffe: 'anydpi',
    })
    # Android defines TOUCHSCREEN_STYLUS = 2 and TOUCHSCREEN_FINGER = 3.
    yield from self.EmitField(self.screen_type, 8, 8, 'touchscreen=%d', {
        1: 'notouch',
        2: 'stylus',
        3: 'finger',
    })
    yield from self.EmitField(self.input, 16, 2, '(keys=%d)', {
        1: 'keysexposed',
        2: 'keyshidden',
        3: 'keyssoft',
    })
    yield from self.EmitField(self.input, 0, 8, 'keyboard=%d', {
        1: 'nokeys',
        2: 'qwerty',
        3: '12key',
    })
    yield from self.EmitField(self.input, 18, 2, 'inputFlagsNavHidden=%d', {
        1: 'navexposed',
        2: 'navhidden',
    })
    yield from self.EmitField(self.input, 8, 8, 'navigation=%d', {
        1: 'nonav',
        2: 'dpad',
        3: 'trackball',
        4: 'wheel',
    })
    if self.screen_size != 0:
      yield '%dx%d' % tuple(_SplitBits(self.screen_size, 16, 16))
    if self.version != 0:
      sdk, minor = _SplitBits(self.version, 16, 16)
      yield f'v{sdk}' + (f'.{minor}' if minor else '')

  def __str__(self):
    """Returns all tokens joined by '-'; empty string for an all-0 config."""
    return '-'.join(self.EmitAllTokens())
class ArscChunk:
  """Common base for every ARSC chunk: tree links plus the shared header.

  Fields:
    addr: Absolute start address of chunk in the ARSC file.
    parent: Reference to the parent chunk (None for root).
    children: References to the children chunks.
    type: (In header) Chunk type specified by a _RES_*_TYPE constant.
    header_size: (In header) Byte size of the header, which is |type|-dependent.
    size: (In header) Byte size of the chunk, including header.
    placeholder: Number of placeholder bytes.
  """

  def __init__(self, reader, parent):
    # The chunk starts wherever the reader currently is; the header fields
    # are then consumed in file order.
    self.addr = reader.Tell()
    self.parent = parent
    self.children = []
    self.type = reader.NextUShort()
    self.header_size = reader.NextUShort()
    self.size = reader.NextUInt()

  @property
  def payload_addr(self):
    """Absolute address of the first byte after the header."""
    return self.header_size + self.addr

  @property
  def end_addr(self):
    """Absolute address just past the end of the chunk."""
    return self.size + self.addr

  @property
  def placeholder(self):
    """Type-dependent placeholder byte count; subclasses may override."""
    return 0

  def StrHelper(self, name, fields):
    """Formats "[start, end): <indent>name: k=v, ..." for this chunk.

    The indent is one space per ancestor of the chunk.
    """
    ancestor_count = 0
    node = self.parent
    while node:
      node = node.parent
      ancestor_count += 1
    addr_range = '[%08X, %08X)' % (self.addr, self.addr + self.size)
    pairs = [f'{k}={v}' for (k, v) in fields.items()]
    indent = ' ' * ancestor_count
    return '%s: %s%s: %s' % (addr_range, indent, name, ', '.join(pairs))

  def __str__(self):
    return self.StrHelper('GENERIC', {'type': self.type})

  def symbol_name(self):
    """Returns the name used to identify this chunk."""
    return f'GENERIC: type={self.type}'

  def labelled_children(self):
    """Yields (label, child) for each child; the base class has no labels."""
    for child in self.children:
      yield (None, child)
class ArscGeneric(ArscChunk):
  """Chunk that is only identified by a display name; payload is not parsed."""

  def __init__(self, type_name, reader, parent):
    # Only the common header is read; |type_name| is kept for display.
    ArscChunk.__init__(self, reader, parent)
    self.type_name = type_name

  def __str__(self):
    no_fields = {}
    return self.StrHelper(self.type_name, no_fields)

  def symbol_name(self):
    """Returns the display name given at construction."""
    return self.type_name
class ArscStringPool(ArscChunk):
  """_RES_STRING_POOL_TYPE chunk for string storage.

  Strings are decoded lazily, on first access to |string_infos| or
  |string_items|, using a private clone of the reader.

  Fields:
    string_count: Number of strings in chunk.
    style_count: Number of styles in chunk (styles are not parsed).
    is_utf8: Whether strings are stored as UTF-8 (else UTF-16).
    role: Optional label assigned by the owner (e.g., 'root', 'types', 'keys').
    string_addrs: Absolute address of each string's encoded record.
  """
  SORTED_FLAG = 1 << 0
  UTF8_FLAG = 1 << 8

  def __init__(self, reader, parent=None):
    super().__init__(reader, parent)
    self.string_count = reader.NextUInt()
    self.style_count = reader.NextUInt()
    self.flags = reader.NextUInt()
    self.string_start = reader.NextUInt()
    self.style_start = reader.NextUInt()
    self.is_utf8 = bool(self.flags & ArscStringPool.UTF8_FLAG)
    assert reader.Tell() == self.payload_addr
    self.role = ''
    # The offset table stores positions relative to |string_start|.
    base = self.addr + self.string_start
    self.string_addrs = [
        base + reader.NextUInt() for _ in range(self.string_count)
    ]
    # Private reader for lazy decoding, so the caller's reader can move on.
    self.reader = reader.Clone()

  def __str__(self):
    return self.StrHelper('STRING_POOL', {
        'string_count': self.string_count,
        'style_count': self.style_count
    })

  def symbol_name(self):
    return f'STRING_POOL_{self.role}' if self.role else 'STRING_POOL'

  # cached_property stores the result on the instance itself. Unlike
  # functools.lru_cache on a method, it can never be evicted (which would
  # force a re-read after |self.reader| was released) and does not keep the
  # instance alive from a module-level cache.
  @functools.cached_property
  def string_infos(self):
    """List of _StringInfo(enc_size, data), one per string, in pool order."""
    ret = []
    if self.is_utf8:
      for offset in self.string_addrs:
        self.reader.Seek(offset)
        # Two lengths precede the data; only the second (the byte length of
        # the encoded data) is needed here.
        self.reader.NextArscEncodedLengthUtf8()
        enc_size = self.reader.NextArscEncodedLengthUtf8()
        data = self.reader.NextBytes(enc_size)
        # Lengths + data, plus 1 byte (presumably the NUL terminator).
        enc_size = self.reader.Tell() - offset + 1
        ret.append(_StringInfo(enc_size, data))
    else:
      for offset in self.string_addrs:
        self.reader.Seek(offset)
        str_size = self.reader.NextArscEncodedLengthWide()
        enc_size = str_size * 2
        data = self.reader.NextBytes(enc_size)
        # Length + data, plus 2 bytes (presumably the 16-bit NUL terminator).
        enc_size = self.reader.Tell() - offset + 2
        ret.append(_StringInfo(enc_size, data))
    return ret

  @functools.cached_property
  def string_items(self):
    """List of decoded strings, in pool order."""
    # Explicit little-endian: plain 'utf-16' would strip a leading U+FEFF as
    # a BOM and pick the byte order from it (or from the host) instead.
    encoding = 'utf-8' if self.is_utf8 else 'utf-16-le'
    ret = [
        info.data.decode(encoding, errors='surrogatepass')
        for info in self.string_infos
    ]
    # Everything has been read; release the reader (and its data reference).
    self.reader = None
    return ret

  def GetString(self, idx):
    """Returns the decoded string at index |idx|."""
    return self.string_items[idx]

  def GetEncodedSize(self, idx):
    """Returns the encoded byte size of the string at index |idx|."""
    return self.string_infos[idx].enc_size
class ArscResTable(ArscChunk):
  """_RES_TABLE_TYPE chunk: the root of an ARSC file.

  Children comprise of:
    ArscStringPool
    ArscResTablePackage 1
    ...
    ArscResTablePackage n

  Fields:
    package_count: Number of ArscResTablePackage entries (typically 1).
    string_pool: The ArscStringPool child (role 'root').
    packages: List of ArscResTablePackage children, in file order.
  """

  def __init__(self, reader, parent=None):
    super().__init__(reader, parent)
    self.package_count = reader.NextUInt()

    # The first child must be the string pool.
    assert reader.PeekArscHeaderType() == _RES_STRING_POOL_TYPE
    pool = reader.NextArscChunk(parent=self)
    pool.role = 'root'
    self.string_pool = pool
    self.children.append(pool)

    # Packages follow back-to-back, each starting where the previous child
    # ended.
    self.packages = []
    next_addr = pool.end_addr
    for _ in range(self.package_count):
      reader.Seek(next_addr)
      assert reader.PeekArscHeaderType() == _RES_TABLE_PACKAGE_TYPE
      package = reader.NextArscChunk(parent=self)
      self.children.append(package)
      self.packages.append(package)
      next_addr = package.end_addr

  def __str__(self):
    fields = {'package_count': self.package_count}
    return self.StrHelper('TABLE', fields)

  def symbol_name(self):
    return 'res_table'

  def labelled_children(self):
    """Yields (label, child); packages are labelled with their name."""
    for chunk in self.children:
      is_package = (chunk is not self.string_pool
                    and isinstance(chunk, ArscResTablePackage))
      yield (chunk.name if is_package else None, chunk)
class ArscResTablePackage(ArscChunk):
  """_RES_TABLE_PACKAGE_TYPE chunk containing a package's resources.

  Children *normally* comprise of:
    ArscStringPool: For types (small, for resource types).
    ArscStringPool: For keys (resource names)
    ArscResTableTypeSpec 1: For resource type 1.
    ArscResTableType 1.1: Config used by the type, has the same |entry_count|.
    ...
    ArscResTableType 1.n_1: Here n_1 depends on type 1.
    ArscResTableTypeSpec 2: For resource type 2.
    ArscResTableType 2.1
    ...
    ArscResTableType 2.n_2
    ...

  Fields:
    id: Package ID (asserted to be below 0x100).
    name: Android app package name for the resource.
    type_pool: First ArscStringPool child (role 'types'), or None.
    key_pool: Second ArscStringPool child (role 'keys'), or None.
  """

  def __init__(self, reader, parent=None):
    super().__init__(reader, parent)
    self.id = reader.NextUInt()
    assert self.id < 0x100, 'Package ID is too big'
    # Fixed 256-byte, NUL-padded UTF-16 field. Decode explicitly as
    # little-endian (plain 'utf-16' would sniff/strip a BOM) and cut at the
    # first NUL, as NextCString() does for byte strings.
    raw_name = reader.NextBytes(256).decode('utf-16-le')
    self.name = raw_name.split('\0', 1)[0]
    self.type_strings = reader.NextUInt()
    self.last_public_type = reader.NextUInt()
    self.key_strings = reader.NextUInt()
    self.last_public_key = reader.NextUInt()
    self.type_id_offset = reader.NextUInt()
    assert reader.Tell() == self.payload_addr
    self.type_pool = None
    self.key_pool = None
    # Children are parsed back-to-back until the end of this chunk. Type and
    # type-spec children look up |type_pool| / |key_pool| on this object while
    # being constructed, so the pools must be assigned as soon as seen.
    cur_addr = self.payload_addr
    while cur_addr < self.end_addr:
      reader.Seek(cur_addr)
      chunk = reader.NextArscChunk(parent=self)
      self.children.append(chunk)
      if isinstance(chunk, ArscStringPool):
        if self.type_pool is None:
          self.type_pool = chunk
          self.type_pool.role = 'types'
        elif self.key_pool is None:
          self.key_pool = chunk
          self.key_pool.role = 'keys'
        else:
          logging.warning('Unexpected string pool at %08X.', chunk.addr)
      cur_addr = chunk.end_addr

  def __str__(self):
    return self.StrHelper('PACKAGE', {'name': self.name})

  def symbol_name(self):
    return 'package'

  def labelled_children(self):
    """Yields (label, child); type and type-spec chunks use |type_str|."""
    for chunk in self.children:
      if isinstance(chunk, (ArscResTableType, ArscResTableTypeSpec)):
        yield (chunk.type_str, chunk)
      else:
        yield (None, chunk)
class ArscResTableType(ArscChunk):
  """_RES_TABLE_TYPE_TYPE chunk for resources of a common type and config.

  Following the header, the struct consists of a (relative) pointer table for
  resource entries, followed by resource data (not parsed). The pointer table
  can be dense or sparse.
  * Dense tables use NO_ENTRY to mark resources unavailable for |config|. These
    pointers are counted in |placeholder|.
  * Sparse tables stores sorted (index, pointer) pairs and uses binary search.
    Currently we don't support these.

  Fields:
    type_str: Name of the common type, e.g., "drawable", "layout", "string".
    config: ResTableConfig instance.
    entry_count: Number of resources contained.
  """
  NO_ENTRY = 0xffffffff
  FLAG_SPARSE = 0x01

  def __init__(self, reader, parent=None):
    super().__init__(reader, parent)
    # The parent package must already have seen both string pools.
    assert parent
    assert parent.type_pool, 'Missing type string pool.'
    assert parent.key_pool, 'Missing key string pool.'

    self.id = reader.NextUByte()
    assert self.id != 0, 'ResTable_type has invalid id.'
    self.flags = reader.NextUByte()
    self.is_sparse = (self.flags & ArscResTableType.FLAG_SPARSE) != 0
    self.reserved = reader.NextUShort()
    self.entry_count = reader.NextUInt()
    self.entries_start = reader.NextUInt()
    assert self.entries_start < self.size
    self.config = ResTableConfig(reader)
    assert reader.Tell() == self.payload_addr

    self.entry_placeholder = 0
    # Type IDs are 1-based indices into the package's type string pool.
    self.type_str = parent.type_pool.GetString(self.id - 1)
    entries_start_addr = self.addr + self.entries_start
    if not self.is_sparse:
      # Dense table: one 4-byte offset per entry; count the unused slots.
      offsets = [reader.NextUInt() for _ in range(self.entry_count)]
      unused_slots = offsets.count(ArscResTableType.NO_ENTRY)
      self.entry_placeholder += 4 * unused_slots
    assert entries_start_addr >= reader.Tell()

  @property
  def placeholder(self):
    """Bytes taken by NO_ENTRY slots of a dense pointer table."""
    return self.entry_placeholder

  def __str__(self):
    fields = {
        'type_str': self.type_str,
        'entry_count': self.entry_count,
        'is_sparse': self.is_sparse,
        'size': self.size,
        'placeholder': self.entry_placeholder,
        'config': str(self.config),
    }
    return self.StrHelper('TYPE', fields)

  def symbol_name(self):
    """Returns the config string, or 'default' when the config is empty."""
    config_str = str(self.config)
    return config_str if config_str else 'default'
class ArscResTableTypeSpec(ArscChunk):
  """_RES_TABLE_TYPE_SPEC_TYPE chunk for info on resource of the same type.

  The info stored is independent of config.

  Fields:
    type_str: Name of the common type, e.g., "drawable", "layout", "string".
    entry_count: Number of resources contained.
  """

  def __init__(self, reader, parent=None):
    super().__init__(reader, parent)
    # The type name is looked up in the parent package's type string pool.
    assert parent
    assert parent.type_pool, 'Missing type string pool.'

    self.id = reader.NextUByte()
    assert self.id != 0, 'ResTable_typeSpec has invalid id.'
    self.res0 = reader.NextUByte()
    self.res1 = reader.NextUShort()
    self.entry_count = reader.NextUInt()
    assert self.entry_count < 0x10000, 'ResTable_typeSpec has too many entries.'

    # Type IDs are 1-based indices into the type string pool.
    type_index = self.id - 1
    self.type_str = parent.type_pool.GetString(type_index)
    # The rest of the chunk is not parsed.
    reader.Seek(self.end_addr)

  def __str__(self):
    fields = {
        'type_str': self.type_str,
        'entry_count': self.entry_count
    }
    return self.StrHelper('TYPE_SPEC', fields)

  def symbol_name(self):
    return 'TYPE_SPEC'
class ArscFile:
  """Represents a single ARSC file.

  Shallowly parses an ARSC file into nested ArscChunk for binary size analysis.

  Fields:
    table: The root chunk that contains all other chunks.
  """

  def __init__(self, data):
    arsc_reader = _ArscStreamReader(data)
    # The file must start with the resource table chunk.
    root_type = arsc_reader.PeekArscHeaderType()
    assert root_type == _RES_TABLE_TYPE
    self.table = arsc_reader.NextArscChunk()

  def VisitPreOrder(self):
    """Depth-first pre-order visitor of all (path, chunk).

    |path| is a string to establish context of |chunk|, consisting of non-None
    labels of ancestral and current chunks joined by '/'.
    """
    labels = []

    def Walk(chunk):
      # Visit the chunk itself before any of its descendants.
      yield '/'.join(labels), chunk
      for label, child in chunk.labelled_children():
        if label:
          labels.append(label)
        yield from Walk(child)
        # Drop the child's label once its whole subtree has been visited.
        if label:
          labels.pop()

    yield from Walk(self.table)
def ParseRtxt(path):
  """Reads the R.txt file at |path| and returns its {resource id -> name} map.

  Each matching line contributes the word before " 0x" as the name and the
  hexadecimal text after "0x" as the id. Later lines win on duplicate ids.
  """
  with open(path, encoding='utf-8') as f:
    contents = f.read()
  line_pattern = re.compile(r'^.*? (\w+) 0x(.+)', re.MULTILINE)
  names_by_id = {}
  for match in line_pattern.finditer(contents):
    name, hex_id = match.group(1), match.group(2)
    names_by_id[int(hex_id, 16)] = name
  return names_by_id
def _DumpArscChunks(arsc_data, names_by_id, dump_strings):
  """Prints every chunk of |arsc_data|, one line each, in pre-order.

  Args:
    arsc_data: Raw bytes of an ARSC file.
    names_by_id: Optional {resource id -> name}; when truthy, each type-spec
      chunk is followed by one line per entry with its name.
    dump_strings: When truthy, each string pool is followed by its strings
      (truncated for display) and their sizes.
  """
  # ID of the most recently visited package; used to build resource IDs.
  current_package_id = None
  for _, chunk in ArscFile(arsc_data).VisitPreOrder():
    print(str(chunk))
    if isinstance(chunk, ArscResTablePackage):
      current_package_id = chunk.id
      continue
    if isinstance(chunk, ArscResTableTypeSpec) and names_by_id:
      for entry_idx in range(chunk.entry_count):
        # Resource ID layout: package in bits 24+, type in bits 16-23, entry
        # index in the low 16 bits.
        res_id = (current_package_id << 24) | (chunk.id << 16) | entry_idx
        print(f'- {chunk.type_str}:', names_by_id.get(res_id, '<unnamed>'))
      continue
    if isinstance(chunk, ArscStringPool) and dump_strings:
      for str_idx in range(chunk.string_count):
        value = chunk.GetString(str_idx)
        if len(value) > 63:
          value = value[:60] + '...'
        print(f'- "{value}" size={chunk.GetEncodedSize(str_idx) + 4}')
def main():
  """Command-line entry point: dumps chunks of an .arsc file or archive."""
  parser = argparse.ArgumentParser(description='Dump ARSC contents to stdout.')
  parser.add_argument('--rtxt-path', help='R.txt that maps IDs -> names')
  parser.add_argument('--strings', action='store_true')
  parser.add_argument('input',
                      help='Input (.arsc, .apk, .jar, .zip) file path.')
  args = parser.parse_args()

  # Without an explicit --rtxt-path, fall back to "<input>.R.txt" if present.
  rtxt_path = args.rtxt_path
  if not rtxt_path:
    sibling_rtxt = f'{args.input}.R.txt'
    if os.path.exists(sibling_rtxt):
      rtxt_path = sibling_rtxt
  names_by_id = ParseRtxt(rtxt_path) if rtxt_path else None

  input_ext = os.path.splitext(args.input)[1]
  if input_ext not in ('.apk', '.jar', '.zip'):
    # Anything that is not a known archive is treated as a raw ARSC file.
    with open(args.input, 'rb') as fh:
      _DumpArscChunks(fh.read(), names_by_id, args.strings)
    return

  with zipfile.ZipFile(args.input) as z:
    arsc_file_paths = []
    for member in z.namelist():
      if os.path.splitext(member)[1] == '.arsc':
        arsc_file_paths.append(member)
    if not arsc_file_paths:
      print('Error: {} does not contain .arsc files.'.format(args.input))
      sys.exit(1)
    for path in arsc_file_paths:
      _DumpArscChunks(z.read(path), names_by_id, args.strings)
# Run the command-line dump only when executed as a script, not on import.
if __name__ == '__main__':
  main()