chromium_src/tools/clang/scripts/analyze_includes.py · OpenHarmony-TPC/chromium_src - AtomGit

#!/usr/bin/env python3
# Copyright 2021 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""This script is used to analyze #include graphs.

It produces the .js file that accompanies include-analysis.html.

Usage:

$ gn gen --args="show_includes=true symbol_level=0 enable_precompiled_headers=false" out/Debug
$ autoninja -C out/Debug -v chrome | tee /tmp/build_log
$ analyze_includes.py --target=chrome --revision=$(git rev-parse --short HEAD) \
    --json-out=/tmp/include-analysis.js /tmp/build_log

(If you have reclient access, add use_reclient=true to the gn args, but not on
Windows due to crbug.com/1223741#c9)

The script takes roughly half an hour on a fast machine for the chrome build
target, which is considered fast enough for batch job purposes for now. It can
be sped up significantly by using multiple processes with the --proccesses option,
but it will also use significantly more memory as a result (OOM is a risk).

If --json-out is not provided, the script exits after printing some statistics
to stdout. This is significantly faster than generating the full JSON data. For
example:

$ autoninja -C out/Debug -v chrome | analyze_includes.py - 2>/dev/null
build_size 270237664463
"""

import argparse
import concurrent.futures
import functools
import json
import math
import os
import pathlib
import re
import sys
import unittest
from collections import defaultdict
from datetime import datetime, timezone
from itertools import islice


def parse_build(build_log, root_filter=None):
  """Parse the build_log (generated as in the Usage note above) to capture the
  include graph. Returns a (roots, includes) pair, where roots is a list of root
  nodes in the graph (the source files) and includes is a dict from filename to
  list of filenames included by that filename."""
  build_dir = '.'
  file_stack = []
  includes = {}
  roots = set()

  # Note: A file might include different files for different compiler
  # invocations depending on -D flags. For such cases, includes[file] will be
  # the union of those includes.

  @functools.cache
  def norm(fn):
    x = fn.replace('\\\\', '\\')
    # Use Path.resolve() rather than path.realpath() to get the canonical
    # upper/lower-case version of the path on Windows.
    p = pathlib.Path(os.path.join(build_dir, x)).resolve()
    x = os.path.relpath(p)
    return x.replace(os.path.sep, '/')

  @functools.cache
  def parse_include_line(line):
    m = INCLUDE_RE.match(line)

    if m:
      depth = len(m.group(1))
      filename = norm(m.group(2))

      return filename, depth

  # ninja: Entering directory `out/foo'
  ENTER_DIR_RE = re.compile(r'ninja: Entering directory `(.*?)\'$')
  # [M/N] clang... -c foo.cc -o foo.o ...
  # [M/N] .../clang... -c foo.cc -o foo.o ...
  # [M/N] clang-cl.exe /c foo.cc /Fofoo.o ...
  # [M/N] ...\clang-cl.exe /c foo.cc /Fofoo.o ...
  COMPILE_RE = re.compile(r'\[\d+/\d+\] (.*[/\\])?clang.* [/-]c (\S*)')
  # . a.h
  # .. b.h
  # . c.h
  INCLUDE_RE = re.compile(r'(\.+) (.*)$')

  skipping_root = False

  for line in build_log:
    # TODO(https://crbug.com/435303792): Ignore precompiled modules (.pcm) until
    # an appropriate way to calculate their size for include analysis is found.
    if (parsed := parse_include_line(line)) and not parsed[0].endswith('.pcm'):
      if skipping_root:
        continue
      prev_depth = len(file_stack) - 1
      filename, depth = parsed
      if filename not in includes:
        includes[filename] = set()

      if depth > prev_depth:
        if sys.platform != 'win32':
          # TODO(crbug.com/40187759): Always assert.
          assert depth == prev_depth + 1
        elif depth > prev_depth + 1:
          # Until the bug is fixed, skip these includes.
          print('missing include under', file_stack[0])
          continue
      else:
        del file_stack[-(prev_depth - depth + 1):]

      includes[file_stack[-1]].add(filename)
      file_stack.append(filename)
      continue

    # Clang module compile .modulemap files, so skip that from include analysis.
    if (m := COMPILE_RE.match(line)) and not m.group(2).endswith('.modulemap'):
      skipping_root = False
      filename = norm(m.group(2))
      if root_filter and not root_filter.match(filename):
        skipping_root = True
        continue
      roots.add(filename)
      file_stack = [filename]
      includes.setdefault(filename, set())
      continue

    if (m := ENTER_DIR_RE.match(line)):
      build_dir = m.group(1)
      continue

    if line.startswith('['):
      # Some tool other than clang is running. Ignore its output.
      skipping_root = True
      continue

  return roots, includes


class TestParseBuild(unittest.TestCase):
  def test_basic(self):
    x = [
        'ninja: Entering directory `out/foo\'',
        '[1/3] clang -c ../../a.cc -o a.o',
        '. ../../a.h',
        '[2/3] clang -c gen/c.c -o a.o',
    ]
    (roots, includes) = parse_build(x)
    self.assertEqual(roots, set(['a.cc', 'out/foo/gen/c.c']))
    self.assertEqual(set(includes.keys()),
                     set(['a.cc', 'a.h', 'out/foo/gen/c.c']))
    self.assertEqual(includes['a.cc'], set(['a.h']))
    self.assertEqual(includes['a.h'], set())
    self.assertEqual(includes['out/foo/gen/c.c'], set())

  def test_more(self):
    x = [
        'ninja: Entering directory `out/foo\'',
        '[20/99] clang -c ../../a.cc -o a.o',
        '. ../../a.h',
        '. ../../b.h',
        '.. ../../c.h',
        '... ../../d.h',
        '. ../../e.h',
    ]
    (roots, includes) = parse_build(x)
    self.assertEqual(roots, set(['a.cc']))
    self.assertEqual(includes['a.cc'], set(['a.h', 'b.h', 'e.h']))
    self.assertEqual(includes['b.h'], set(['c.h']))
    self.assertEqual(includes['c.h'], set(['d.h']))
    self.assertEqual(includes['d.h'], set())
    self.assertEqual(includes['e.h'], set())

  def test_multiple(self):
    x = [
        'ninja: Entering directory `out/foo\'',
        '[123/234] clang -c ../../a.cc -o a.o',
        '. ../../a.h',
        '[124/234] clang -c ../../b.cc -o b.o',
        '. ../../b.h',
    ]
    (roots, includes) = parse_build(x)
    self.assertEqual(roots, set(['a.cc', 'b.cc']))
    self.assertEqual(includes['a.cc'], set(['a.h']))
    self.assertEqual(includes['b.cc'], set(['b.h']))

  def test_root_filter(self):
    x = [
        'ninja: Entering directory `out/foo\'',
        '[9/100] clang -c ../../a.cc -o a.o',
        '. ../../a.h',
        '[10/100] clang -c ../../b.cc -o b.o',
        '. ../../b.h',
    ]
    (roots, includes) = parse_build(x, re.compile(r'^a.cc$'))
    self.assertEqual(roots, set(['a.cc']))
    self.assertEqual(set(includes.keys()), set(['a.cc', 'a.h']))
    self.assertEqual(includes['a.cc'], set(['a.h']))

  def test_windows(self):
    x = [
        'ninja: Entering directory `out/foo\'',
        '[1/3] path\\clang-cl.exe /c ../../a.cc /Foa.o',
        '. ../../a.h',
        '[2/3] clang-cl.exe /c gen/c.c /Foa.o',
    ]
    (roots, includes) = parse_build(x)
    self.assertEqual(roots, set(['a.cc', 'out/foo/gen/c.c']))
    self.assertEqual(set(includes.keys()),
                     set(['a.cc', 'a.h', 'out/foo/gen/c.c']))
    self.assertEqual(includes['a.cc'], set(['a.h']))
    self.assertEqual(includes['a.h'], set())
    self.assertEqual(includes['out/foo/gen/c.c'], set())

  def test_bindgen(self):
    x = [
        'ninja: Entering directory `out/foo\'',
        '[123/234] clang -c ../../a.cc -o a.o',
        '. ../../a.h',
        '[124/234] bindgen -c ../../b.cc -o b.o',
        '. ../../b.h',
        '[125/234] clang -c ../../c.cc -o c.o',
        '. ../../c.h',
    ]
    (roots, includes) = parse_build(x)
    self.assertEqual(roots, set(['a.cc', 'c.cc']))
    self.assertEqual(includes['a.cc'], set(['a.h']))
    self.assertEqual(includes['c.cc'], set(['c.h']))

  def test_modules(self):
    x = [
        'ninja: Entering directory `out/foo\'',
        '[123/234] clang -x c++ -Xclang -emit-module -c ../../a.modulemap -o a.pcm',
        '[124/234] clang -fmodule-file=a.pcm -c ../../a.cc -o a.o',
        '. a.pcm',
        '. ../../a.h',
    ]
    (roots, includes) = parse_build(x)
    self.assertEqual(roots, {'a.cc'})
    self.assertEqual(includes, {'a.cc': {'a.h'}, 'a.h': set()})


def post_order_nodes(root, child_nodes):
  """Generate the nodes reachable from root (including root itself) in
  post-order traversal order. child_nodes maps each node to its children."""
  visited = set()

  def walk(n):
    if n in visited:
      return
    visited.add(n)

    for c in child_nodes[n]:
      for x in walk(c):
        yield x
    yield n

  return walk(root)


def compute_doms(root, includes):
  """Compute the dominators for all nodes reachable from root. Node A dominates
  node B if all paths from the root to B go through A. Returns a dict from
  filename to the set of dominators of that filename (including itself).

  The implementation follows the "simple" version of Lengauer & Tarjan "A Fast
  Algorithm for Finding Dominators in a Flowgraph" (TOPLAS 1979).
  """

  parent = {}
  ancestor = {}
  vertex = []
  label = {}
  semi = {}
  pred = defaultdict(list)
  bucket = defaultdict(list)
  dom = {}

  def dfs(v):
    semi[v] = len(vertex)
    vertex.append(v)
    label[v] = v

    for w in includes[v]:
      if w not in semi:
        parent[w] = v
        dfs(w)
      pred[w].append(v)

  def compress(v):
    if ancestor[v] in ancestor:
      compress(ancestor[v])
      if semi[label[ancestor[v]]] < semi[label[v]]:
        label[v] = label[ancestor[v]]
      ancestor[v] = ancestor[ancestor[v]]

  def evaluate(v):
    if v not in ancestor:
      return v
    compress(v)
    return label[v]

  def link(v, w):
    ancestor[w] = v

  # Step 1: Initialization.
  dfs(root)

  for w in reversed(vertex[1:]):
    # Step 2: Compute semidominators.
    for v in pred[w]:
      u = evaluate(v)
      if semi[u] < semi[w]:
        semi[w] = semi[u]

    bucket[vertex[semi[w]]].append(w)
    link(parent[w], w)

    # Step 3: Implicitly define the immediate dominator for each node.
    for v in bucket[parent[w]]:
      u = evaluate(v)
      dom[v] = u if semi[u] < semi[v] else parent[w]
    bucket[parent[w]] = []

  # Step 4: Explicitly define the immediate dominator for each node.
  for w in vertex[1:]:
    if dom[w] != vertex[semi[w]]:
      dom[w] = dom[dom[w]]

  # Get the full dominator set for each node.
  all_doms = {}
  all_doms[root] = {root}

  def dom_set(node):
    if node not in all_doms:
      # node's dominators is itself and the dominators of its immediate
      # dominator.
      all_doms[node] = {node}
      all_doms[node].update(dom_set(dom[node]))

    return all_doms[node]

  return {n: dom_set(n) for n in vertex}


def compute_added_sizes(args):
  """Helper to compute added sizes from the given root."""

  roots, includes, sizes = args
  added_sizes = {node: 0 for node in includes}

  for root in roots:
    doms = compute_doms(root, includes)

    for node in doms:
      if node not in sizes:
        # Skip the (src,dst) pseudo nodes.
        continue
      for dom in doms[node]:
        added_sizes[dom] += sizes[node]

  return added_sizes


class TestComputeDoms(unittest.TestCase):
  def test_basic(self):
    includes = {}
    includes[1] = [2]
    includes[2] = [1]
    includes[3] = [2]
    includes[4] = [1]
    includes[5] = [4, 3]
    root = 5

    doms = compute_doms(root, includes)

    self.assertEqual(doms[1], set([5, 1]))
    self.assertEqual(doms[2], set([5, 2]))
    self.assertEqual(doms[3], set([5, 3]))
    self.assertEqual(doms[4], set([5, 4]))
    self.assertEqual(doms[5], set([5]))

  def test_larger(self):
    # Fig. 1 in the Lengauer-Tarjan paper.
    includes = {}
    includes['a'] = ['d']
    includes['b'] = ['a', 'd', 'e']
    includes['c'] = ['f', 'g']
    includes['d'] = ['l']
    includes['e'] = ['h']
    includes['f'] = ['i']
    includes['g'] = ['i', 'j']
    includes['h'] = ['k', 'e']
    includes['i'] = ['k']
    includes['j'] = ['i']
    includes['k'] = ['i', 'r']
    includes['l'] = ['h']
    includes['r'] = ['a', 'b', 'c']
    root = 'r'

    doms = compute_doms(root, includes)

    # Fig. 2 in the Lengauer-Tarjan paper.
    self.assertEqual(doms['a'], set(['a', 'r']))
    self.assertEqual(doms['b'], set(['b', 'r']))
    self.assertEqual(doms['c'], set(['c', 'r']))
    self.assertEqual(doms['d'], set(['d', 'r']))
    self.assertEqual(doms['e'], set(['e', 'r']))
    self.assertEqual(doms['f'], set(['f', 'c', 'r']))
    self.assertEqual(doms['g'], set(['g', 'c', 'r']))
    self.assertEqual(doms['h'], set(['h', 'r']))
    self.assertEqual(doms['i'], set(['i', 'r']))
    self.assertEqual(doms['j'], set(['j', 'g', 'c', 'r']))
    self.assertEqual(doms['k'], set(['k', 'r']))
    self.assertEqual(doms['l'], set(['l', 'd', 'r']))
    self.assertEqual(doms['r'], set(['r']))


def log(*args, **kwargs):
  """Log output to stderr."""
  print(*args, file=sys.stderr, **kwargs)


# TODO: Use itertools.batched after updating the Python version on bots to 3.12.
def batched(iterable, n):
  # batched('ABCDEFG', 2) → AB CD EF G
  if n < 1:
    raise ValueError('n must be at least one')
  iterator = iter(iterable)
  while batch := tuple(islice(iterator, n)):
    yield batch


def analyze(target, revision, build_log_file, json_file, root_filter, processes=1):
  log('Parsing build log...')
  (roots, includes) = parse_build(build_log_file, root_filter)

  log('Getting file sizes...')
  sizes = {name: os.path.getsize(name) for name in includes}

  log('Computing transitive sizes and prevalence...')
  build_size = 0
  trans_sizes = {name: 0 for name in includes}
  prevalence = {name: 0 for name in includes}

  for n in includes:
    for node in post_order_nodes(n, includes):
      # Compute the transitive size of a file, i.e. the size of the
      # file itself and all its transitive includes.
      trans_sizes[n] += sizes[node]

      if n in roots:
        prevalence[node] += 1

    # Total build size is the sum of the transitive size of all roots.
    if n in roots:
      build_size += trans_sizes[n]

  print('build_size', build_size)

  if json_file is None:
    log('--json-out not set; exiting.')
    return 0

  # Map from file to files that include it.
  log('Building reverse include map...')
  included_by = {k: set() for k in includes}
  for k in includes:
    for i in includes[k]:
      included_by[i].add(k)

  log('Computing added sizes...')

  # Split each src -> dst edge in includes into src -> (src,dst) -> dst, so that
  # we can compute how much each include graph edge adds to the size by doing
  # dominance analysis on the (src,dst) nodes.
  augmented_includes = {}
  for src in includes:
    augmented_includes[src] = set()
    for dst in includes[src]:
      augmented_includes[src].add((src, dst))
      augmented_includes[(src, dst)] = {dst}

  if processes > 1:
    added_sizes = {node: 0 for node in augmented_includes}

    # Break up the list of roots into chunks based on the number of processes and pass them
    # to each worker. Giving each worker a complete chunk of roots to work on rather than
    # having workers pull as they go minimizes contention and gives better parallelization.
    chunk_size = math.ceil(float(len(roots)) / processes)
    chunked = list(batched(roots, chunk_size))

    with concurrent.futures.ProcessPoolExecutor(max_workers=processes) as pool:
      for computed_added_sizes in pool.map(
        compute_added_sizes,
        ((chunk, augmented_includes, sizes) for chunk in chunked),
      ):
        for dom, size in computed_added_sizes.items():
          added_sizes[dom] += size
  else:
    added_sizes = compute_added_sizes((roots, augmented_includes, sizes))

  # Assign a number to each filename for tighter JSON representation.
  names = []
  name2nr = {}
  for n in sorted(includes.keys()):
    name2nr[n] = len(names)
    names.append(n)

  def nr(name):
    return name2nr[name]

  log('Writing output...')

  # Provide a JS object for convenient inclusion in the HTML file.
  # If someone really wants a proper JSON file, maybe we can reconsider this.
  json_file.write('data = ')

  json.dump(
      {
          'target': target,
          'revision': revision,
          'date': datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S UTC'),
          'files': names,
          'roots': [nr(x) for x in sorted(roots)],
          'includes': [[nr(x) for x in sorted(includes[n])] for n in names],
          'included_by': [[nr(x) for x in sorted(included_by[n])] for n in names],
          'sizes': [sizes[n] for n in names],
          'tsizes': [trans_sizes[n] for n in names],
          'asizes': [added_sizes[n] for n in names],
          'esizes': [[added_sizes[(s, d)] for d in sorted(includes[s])]
                     for s in names],
          'prevalence': [prevalence[n] for n in names],
      }, json_file)

  log('All done!')


def main():
  result = unittest.main(argv=sys.argv[:1], exit=False, verbosity=2).result
  if len(result.failures) > 0 or len(result.errors) > 0:
    return 1

  parser = argparse.ArgumentParser(description='Analyze an #include graph.')
  parser.add_argument('build_log',
                      type=argparse.FileType('r', errors='replace'),
                      help='The build log to analyze (- for stdin).')
  parser.add_argument('--target',
                      help='The target that was built (e.g. chrome).')
  parser.add_argument('--revision',
                      help='The revision that was built (e.g. 016588d4ee20).')
  parser.add_argument(
      '--json-out',
      type=argparse.FileType('w'),
      help='Write full analysis data to a JSON file (- for stdout).')
  parser.add_argument('--root-filter',
                      help='Regex to filter which root files are analyzed.')
  parser.add_argument('--processes',
                      action="store",
                      type=int,
                      default=1,
                      help="Use multiple processes to speed up the analysis - "
                           "note that this scales memory usage significantly")
  args = parser.parse_args()

  if args.json_out and not (args.target and args.revision):
    print('error: --json-out requires both --target and --revision to be set')
    return 1

  try:
    root_filter = re.compile(args.root_filter) if args.root_filter else None
  except Exception:
    print('error: --root-filter is not a valid regex')
    return 1

  analyze(args.target, args.revision, args.build_log, args.json_out,
          root_filter, processes=args.processes)


if __name__ == '__main__':
  sys.exit(main())