"""This script is used to analyze #include graphs.
It produces the .js file that accompanies include-analysis.html.
Usage:
$ gn gen --args="show_includes=true symbol_level=0 enable_precompiled_headers=false" out/Debug
$ autoninja -C out/Debug -v chrome | tee /tmp/build_log
$ analyze_includes.py --target=chrome --revision=$(git rev-parse --short HEAD) \
--json-out=/tmp/include-analysis.js /tmp/build_log
(If you have reclient access, add use_reclient=true to the gn args, but not on
Windows due to crbug.com/1223741#c9)
The script takes roughly half an hour on a fast machine for the chrome build
target, which is considered fast enough for batch job purposes for now. It can
be sped up significantly by using multiple processes with the --proccesses option,
but it will also use significantly more memory as a result (OOM is a risk).
If --json-out is not provided, the script exits after printing some statistics
to stdout. This is significantly faster than generating the full JSON data. For
example:
$ autoninja -C out/Debug -v chrome | analyze_includes.py - 2>/dev/null
build_size 270237664463
"""
import argparse
import concurrent.futures
import functools
import json
import math
import os
import pathlib
import re
import sys
import unittest
from collections import defaultdict
from datetime import datetime, timezone
from itertools import islice
def parse_build(build_log, root_filter=None):
"""Parse the build_log (generated as in the Usage note above) to capture the
include graph. Returns a (roots, includes) pair, where roots is a list of root
nodes in the graph (the source files) and includes is a dict from filename to
list of filenames included by that filename."""
build_dir = '.'
file_stack = []
includes = {}
roots = set()
@functools.cache
def norm(fn):
x = fn.replace('\\\\', '\\')
p = pathlib.Path(os.path.join(build_dir, x)).resolve()
x = os.path.relpath(p)
return x.replace(os.path.sep, '/')
@functools.cache
def parse_include_line(line):
m = INCLUDE_RE.match(line)
if m:
depth = len(m.group(1))
filename = norm(m.group(2))
return filename, depth
ENTER_DIR_RE = re.compile(r'ninja: Entering directory `(.*?)\'$')
COMPILE_RE = re.compile(r'\[\d+/\d+\] (.*[/\\])?clang.* [/-]c (\S*)')
INCLUDE_RE = re.compile(r'(\.+) (.*)$')
skipping_root = False
for line in build_log:
if (parsed := parse_include_line(line)) and not parsed[0].endswith('.pcm'):
if skipping_root:
continue
prev_depth = len(file_stack) - 1
filename, depth = parsed
if filename not in includes:
includes[filename] = set()
if depth > prev_depth:
if sys.platform != 'win32':
assert depth == prev_depth + 1
elif depth > prev_depth + 1:
print('missing include under', file_stack[0])
continue
else:
del file_stack[-(prev_depth - depth + 1):]
includes[file_stack[-1]].add(filename)
file_stack.append(filename)
continue
if (m := COMPILE_RE.match(line)) and not m.group(2).endswith('.modulemap'):
skipping_root = False
filename = norm(m.group(2))
if root_filter and not root_filter.match(filename):
skipping_root = True
continue
roots.add(filename)
file_stack = [filename]
includes.setdefault(filename, set())
continue
if (m := ENTER_DIR_RE.match(line)):
build_dir = m.group(1)
continue
if line.startswith('['):
skipping_root = True
continue
return roots, includes
class TestParseBuild(unittest.TestCase):
def test_basic(self):
x = [
'ninja: Entering directory `out/foo\'',
'[1/3] clang -c ../../a.cc -o a.o',
'. ../../a.h',
'[2/3] clang -c gen/c.c -o a.o',
]
(roots, includes) = parse_build(x)
self.assertEqual(roots, set(['a.cc', 'out/foo/gen/c.c']))
self.assertEqual(set(includes.keys()),
set(['a.cc', 'a.h', 'out/foo/gen/c.c']))
self.assertEqual(includes['a.cc'], set(['a.h']))
self.assertEqual(includes['a.h'], set())
self.assertEqual(includes['out/foo/gen/c.c'], set())
def test_more(self):
x = [
'ninja: Entering directory `out/foo\'',
'[20/99] clang -c ../../a.cc -o a.o',
'. ../../a.h',
'. ../../b.h',
'.. ../../c.h',
'... ../../d.h',
'. ../../e.h',
]
(roots, includes) = parse_build(x)
self.assertEqual(roots, set(['a.cc']))
self.assertEqual(includes['a.cc'], set(['a.h', 'b.h', 'e.h']))
self.assertEqual(includes['b.h'], set(['c.h']))
self.assertEqual(includes['c.h'], set(['d.h']))
self.assertEqual(includes['d.h'], set())
self.assertEqual(includes['e.h'], set())
def test_multiple(self):
x = [
'ninja: Entering directory `out/foo\'',
'[123/234] clang -c ../../a.cc -o a.o',
'. ../../a.h',
'[124/234] clang -c ../../b.cc -o b.o',
'. ../../b.h',
]
(roots, includes) = parse_build(x)
self.assertEqual(roots, set(['a.cc', 'b.cc']))
self.assertEqual(includes['a.cc'], set(['a.h']))
self.assertEqual(includes['b.cc'], set(['b.h']))
def test_root_filter(self):
x = [
'ninja: Entering directory `out/foo\'',
'[9/100] clang -c ../../a.cc -o a.o',
'. ../../a.h',
'[10/100] clang -c ../../b.cc -o b.o',
'. ../../b.h',
]
(roots, includes) = parse_build(x, re.compile(r'^a.cc$'))
self.assertEqual(roots, set(['a.cc']))
self.assertEqual(set(includes.keys()), set(['a.cc', 'a.h']))
self.assertEqual(includes['a.cc'], set(['a.h']))
def test_windows(self):
x = [
'ninja: Entering directory `out/foo\'',
'[1/3] path\\clang-cl.exe /c ../../a.cc /Foa.o',
'. ../../a.h',
'[2/3] clang-cl.exe /c gen/c.c /Foa.o',
]
(roots, includes) = parse_build(x)
self.assertEqual(roots, set(['a.cc', 'out/foo/gen/c.c']))
self.assertEqual(set(includes.keys()),
set(['a.cc', 'a.h', 'out/foo/gen/c.c']))
self.assertEqual(includes['a.cc'], set(['a.h']))
self.assertEqual(includes['a.h'], set())
self.assertEqual(includes['out/foo/gen/c.c'], set())
def test_bindgen(self):
x = [
'ninja: Entering directory `out/foo\'',
'[123/234] clang -c ../../a.cc -o a.o',
'. ../../a.h',
'[124/234] bindgen -c ../../b.cc -o b.o',
'. ../../b.h',
'[125/234] clang -c ../../c.cc -o c.o',
'. ../../c.h',
]
(roots, includes) = parse_build(x)
self.assertEqual(roots, set(['a.cc', 'c.cc']))
self.assertEqual(includes['a.cc'], set(['a.h']))
self.assertEqual(includes['c.cc'], set(['c.h']))
def test_modules(self):
x = [
'ninja: Entering directory `out/foo\'',
'[123/234] clang -x c++ -Xclang -emit-module -c ../../a.modulemap -o a.pcm',
'[124/234] clang -fmodule-file=a.pcm -c ../../a.cc -o a.o',
'. a.pcm',
'. ../../a.h',
]
(roots, includes) = parse_build(x)
self.assertEqual(roots, {'a.cc'})
self.assertEqual(includes, {'a.cc': {'a.h'}, 'a.h': set()})
def post_order_nodes(root, child_nodes):
"""Generate the nodes reachable from root (including root itself) in
post-order traversal order. child_nodes maps each node to its children."""
visited = set()
def walk(n):
if n in visited:
return
visited.add(n)
for c in child_nodes[n]:
for x in walk(c):
yield x
yield n
return walk(root)
def compute_doms(root, includes):
"""Compute the dominators for all nodes reachable from root. Node A dominates
node B if all paths from the root to B go through A. Returns a dict from
filename to the set of dominators of that filename (including itself).
The implementation follows the "simple" version of Lengauer & Tarjan "A Fast
Algorithm for Finding Dominators in a Flowgraph" (TOPLAS 1979).
"""
parent = {}
ancestor = {}
vertex = []
label = {}
semi = {}
pred = defaultdict(list)
bucket = defaultdict(list)
dom = {}
def dfs(v):
semi[v] = len(vertex)
vertex.append(v)
label[v] = v
for w in includes[v]:
if w not in semi:
parent[w] = v
dfs(w)
pred[w].append(v)
def compress(v):
if ancestor[v] in ancestor:
compress(ancestor[v])
if semi[label[ancestor[v]]] < semi[label[v]]:
label[v] = label[ancestor[v]]
ancestor[v] = ancestor[ancestor[v]]
def evaluate(v):
if v not in ancestor:
return v
compress(v)
return label[v]
def link(v, w):
ancestor[w] = v
dfs(root)
for w in reversed(vertex[1:]):
for v in pred[w]:
u = evaluate(v)
if semi[u] < semi[w]:
semi[w] = semi[u]
bucket[vertex[semi[w]]].append(w)
link(parent[w], w)
for v in bucket[parent[w]]:
u = evaluate(v)
dom[v] = u if semi[u] < semi[v] else parent[w]
bucket[parent[w]] = []
for w in vertex[1:]:
if dom[w] != vertex[semi[w]]:
dom[w] = dom[dom[w]]
all_doms = {}
all_doms[root] = {root}
def dom_set(node):
if node not in all_doms:
all_doms[node] = {node}
all_doms[node].update(dom_set(dom[node]))
return all_doms[node]
return {n: dom_set(n) for n in vertex}
def compute_added_sizes(args):
"""Helper to compute added sizes from the given root."""
roots, includes, sizes = args
added_sizes = {node: 0 for node in includes}
for root in roots:
doms = compute_doms(root, includes)
for node in doms:
if node not in sizes:
continue
for dom in doms[node]:
added_sizes[dom] += sizes[node]
return added_sizes
class TestComputeDoms(unittest.TestCase):
def test_basic(self):
includes = {}
includes[1] = [2]
includes[2] = [1]
includes[3] = [2]
includes[4] = [1]
includes[5] = [4, 3]
root = 5
doms = compute_doms(root, includes)
self.assertEqual(doms[1], set([5, 1]))
self.assertEqual(doms[2], set([5, 2]))
self.assertEqual(doms[3], set([5, 3]))
self.assertEqual(doms[4], set([5, 4]))
self.assertEqual(doms[5], set([5]))
def test_larger(self):
includes = {}
includes['a'] = ['d']
includes['b'] = ['a', 'd', 'e']
includes['c'] = ['f', 'g']
includes['d'] = ['l']
includes['e'] = ['h']
includes['f'] = ['i']
includes['g'] = ['i', 'j']
includes['h'] = ['k', 'e']
includes['i'] = ['k']
includes['j'] = ['i']
includes['k'] = ['i', 'r']
includes['l'] = ['h']
includes['r'] = ['a', 'b', 'c']
root = 'r'
doms = compute_doms(root, includes)
self.assertEqual(doms['a'], set(['a', 'r']))
self.assertEqual(doms['b'], set(['b', 'r']))
self.assertEqual(doms['c'], set(['c', 'r']))
self.assertEqual(doms['d'], set(['d', 'r']))
self.assertEqual(doms['e'], set(['e', 'r']))
self.assertEqual(doms['f'], set(['f', 'c', 'r']))
self.assertEqual(doms['g'], set(['g', 'c', 'r']))
self.assertEqual(doms['h'], set(['h', 'r']))
self.assertEqual(doms['i'], set(['i', 'r']))
self.assertEqual(doms['j'], set(['j', 'g', 'c', 'r']))
self.assertEqual(doms['k'], set(['k', 'r']))
self.assertEqual(doms['l'], set(['l', 'd', 'r']))
self.assertEqual(doms['r'], set(['r']))
def log(*args, **kwargs):
"""Log output to stderr."""
print(*args, file=sys.stderr, **kwargs)
def batched(iterable, n):
if n < 1:
raise ValueError('n must be at least one')
iterator = iter(iterable)
while batch := tuple(islice(iterator, n)):
yield batch
def analyze(target, revision, build_log_file, json_file, root_filter, processes=1):
log('Parsing build log...')
(roots, includes) = parse_build(build_log_file, root_filter)
log('Getting file sizes...')
sizes = {name: os.path.getsize(name) for name in includes}
log('Computing transitive sizes and prevalence...')
build_size = 0
trans_sizes = {name: 0 for name in includes}
prevalence = {name: 0 for name in includes}
for n in includes:
for node in post_order_nodes(n, includes):
trans_sizes[n] += sizes[node]
if n in roots:
prevalence[node] += 1
if n in roots:
build_size += trans_sizes[n]
print('build_size', build_size)
if json_file is None:
log('--json-out not set; exiting.')
return 0
log('Building reverse include map...')
included_by = {k: set() for k in includes}
for k in includes:
for i in includes[k]:
included_by[i].add(k)
log('Computing added sizes...')
augmented_includes = {}
for src in includes:
augmented_includes[src] = set()
for dst in includes[src]:
augmented_includes[src].add((src, dst))
augmented_includes[(src, dst)] = {dst}
if processes > 1:
added_sizes = {node: 0 for node in augmented_includes}
chunk_size = math.ceil(float(len(roots)) / processes)
chunked = list(batched(roots, chunk_size))
with concurrent.futures.ProcessPoolExecutor(max_workers=processes) as pool:
for computed_added_sizes in pool.map(
compute_added_sizes,
((chunk, augmented_includes, sizes) for chunk in chunked),
):
for dom, size in computed_added_sizes.items():
added_sizes[dom] += size
else:
added_sizes = compute_added_sizes((roots, augmented_includes, sizes))
names = []
name2nr = {}
for n in sorted(includes.keys()):
name2nr[n] = len(names)
names.append(n)
def nr(name):
return name2nr[name]
log('Writing output...')
json_file.write('data = ')
json.dump(
{
'target': target,
'revision': revision,
'date': datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC'),
'files': names,
'roots': [nr(x) for x in sorted(roots)],
'includes': [[nr(x) for x in sorted(includes[n])] for n in names],
'included_by': [[nr(x) for x in sorted(included_by[n])] for n in names],
'sizes': [sizes[n] for n in names],
'tsizes': [trans_sizes[n] for n in names],
'asizes': [added_sizes[n] for n in names],
'esizes': [[added_sizes[(s, d)] for d in sorted(includes[s])]
for s in names],
'prevalence': [prevalence[n] for n in names],
}, json_file)
log('All done!')
def main():
result = unittest.main(argv=sys.argv[:1], exit=False, verbosity=2).result
if len(result.failures) > 0 or len(result.errors) > 0:
return 1
parser = argparse.ArgumentParser(description='Analyze an #include graph.')
parser.add_argument('build_log',
type=argparse.FileType('r', errors='replace'),
help='The build log to analyze (- for stdin).')
parser.add_argument('--target',
help='The target that was built (e.g. chrome).')
parser.add_argument('--revision',
help='The revision that was built (e.g. 016588d4ee20).')
parser.add_argument(
'--json-out',
type=argparse.FileType('w'),
help='Write full analysis data to a JSON file (- for stdout).')
parser.add_argument('--root-filter',
help='Regex to filter which root files are analyzed.')
parser.add_argument('--processes',
action="store",
type=int,
default=1,
help="Use multiple processes to speed up the analysis - "
"note that this scales memory usage significantly")
args = parser.parse_args()
if args.json_out and not (args.target and args.revision):
print('error: --json-out requires both --target and --revision to be set')
return 1
try:
root_filter = re.compile(args.root_filter) if args.root_filter else None
except Exception:
print('error: --root-filter is not a valid regex')
return 1
analyze(args.target, args.revision, args.build_log, args.json_out,
root_filter, processes=args.processes)
if __name__ == '__main__':
sys.exit(main())