910e62b5创建于 1月15日历史提交
#!/usr/bin/env python3
# Copyright 2017 The Chromium Authors
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Parser for linker map files.

The format of a linker map file depends on the linker that generates it. This
file uses "coded linker name" to identify formats and variants:

  'gold': The gold linker (usage is being deprecated by Chrome).
  'lld_v0': LLD linker (no LTO), old format.
  'lld-lto_v0': LLD linker with ThinLTO, old format.
  'lld_v1': LLD linker (no LTO), new format.
  'lld-lto_v1': LLD linker with ThinLTO, new format.
"""

import argparse
import code
import collections
import gzip
import itertools
import logging
import os
import re
import readline
import sys

import archive_util
import demangle
import models

# About linker maps:
# * "Discarded input sections" include symbols merged with other symbols
#   (aliases), so the information there is not actually a list of unused things.
# * Linker maps include symbols that do not have names (with object path),
#   whereas "nm" skips over these (they don't account for much though).
# * The parse time for compressed linker maps is dominated by ungzipping.

_STRIP_NAME_PREFIX = {
    models.FLAG_STARTUP: 8,
    models.FLAG_UNLIKELY: 9,
    models.FLAG_REL_LOCAL: 10,
    models.FLAG_REL: 4,
    models.FLAG_HOT: 4,
}

# Sections that we want to create individual symbols from.
_USEFUL_SECTIONS = frozenset(models.BSS_SECTIONS + (
    models.SECTION_DATA,
    models.SECTION_DATA_REL_RO,
    models.SECTION_DATA_REL_RO_LOCAL,
    models.SECTION_RODATA,
    models.SECTION_TDATA,
    models.SECTION_TEXT,
))

def _OpenMaybeGzAsText(path):
  """Calls `gzip.open()` if |path| ends in ".gz", otherwise calls `open()`."""
  if path.endswith('.gz'):
    return gzip.open(path, 'rt')
  return open(path, 'rt')


def _FlagsFromMangledName(name):
  # Currently, lld map files have section = '.text.startup' and put the symbol
  # name in the section break-down ("level 3 symbols").
  if name.startswith('startup.') or name == 'startup':
    return models.FLAG_STARTUP
  if name.startswith('unlikely.'):
    return models.FLAG_UNLIKELY
  if name.startswith('rel.local.'):
    return models.FLAG_REL_LOCAL
  if name.startswith('rel.'):
    return models.FLAG_REL
  if name.startswith('hot.'):
    return models.FLAG_HOT
  return 0


def _NormalizeName(name):
  # Outlined functions have names like OUTLINED_FUNCTION_0, which can
  # appear 1000+ time, and can cause false aliasing. We treat these as
  # special cases by designating them as a placeholder symbols and
  # renaming them to '** outlined function'.
  if name.startswith('OUTLINED_FUNCTION_'):
    return '** outlined function'
  if name.startswith('.L.str'):
    return models.STRING_LITERAL_NAME
  if name.endswith(' (.cfi)'):
    return name[:-7]
  return name


class MapFileParserGold:
  """Parses a linker map file from gold linker."""
  # Map file writer for gold linker:
  # https://github.com/gittup/binutils/blob/HEAD/gold/mapfile.cc

  def __init__(self):
    self._common_symbols = []
    self._symbols = []
    self._section_ranges = {}
    self._lines = None

  def Parse(self, lines):
    """Parses a linker map file.

    Args:
      lines: Iterable of lines, the first of which has been consumed to
      identify file type.

    Returns:
      A tuple of (section_ranges, symbols, extras).
    """
    self._lines = iter(lines)
    logging.debug('Scanning for Header')

    while True:
      line = self._SkipToLineWithPrefix('Common symbol', 'Memory map')
      if line.startswith('Common symbol'):
        self._common_symbols = self._ParseCommonSymbols()
        logging.debug('.bss common entries: %d', len(self._common_symbols))
        continue
      if line.startswith('Memory map'):
        self._ParseSections()
      break
    return self._section_ranges, self._symbols, {}

  def _SkipToLineWithPrefix(self, prefix, prefix2=None):
    for l in self._lines:
      if l.startswith(prefix) or (prefix2 and l.startswith(prefix2)):
        return l
    return None

  def _ParsePossiblyWrappedParts(self, line, count):
    parts = line.split(None, count - 1)
    if not parts:
      return None
    if len(parts) != count:
      line = next(self._lines)
      parts.extend(line.split(None, count - len(parts) - 1))
      assert len(parts) == count, 'parts: ' + ' '.join(parts)
    parts[-1] = parts[-1].rstrip()
    return parts

  def _ParseCommonSymbols(self):
    # Common symbol       size              file
    #
    # ff_cos_131072       0x40000           obj/third_party/<snip>
    # ff_cos_131072_fixed
    #                     0x20000           obj/third_party/<snip>
    ret = []
    next(self._lines)  # Skip past blank line

    name, size_str, path = None, None, None
    for l in self._lines:
      parts = self._ParsePossiblyWrappedParts(l, 3)
      if not parts:
        break
      name, size_str, path = parts
      sym = models.Symbol(
          models.SECTION_BSS,
          int(size_str[2:], 16),
          full_name=name,
          object_path=path)
      ret.append(sym)
    return ret

  def _ParseSections(self):
    # .text           0x0028c600  0x22d3468
    #  .text.startup._GLOBAL__sub_I_bbr_sender.cc
    #                 0x0028c600       0x38 obj/net/net/bbr_sender.o
    #  .text._reset   0x00339d00       0xf0 obj/third_party/icu/icuuc/ucnv.o
    #  ** fill        0x0255fb00   0x02
    #  .text._ZN4base8AutoLockD2Ev
    #                 0x00290710        0xe obj/net/net/file_name.o
    #                 0x00290711                base::AutoLock::~AutoLock()
    #                 0x00290711                base::AutoLock::~AutoLock()
    # .text._ZNK5blink15LayoutBlockFlow31mustSeparateMarginAfterForChildERK...
    #                 0xffffffffffffffff       0x46 obj/...
    #                 0x006808e1                blink::LayoutBlockFlow::...
    # .text.OUTLINED_FUNCTION_0
    #                 0x002a2000       0x20 obj/net/net/tag.o
    # .bss
    #  .bss._ZGVZN11GrProcessor11initClassIDI10LightingFPEEvvE8kClassID
    #                 0x02d4b294        0x4 obj/skia/skia/SkLightingShader.o
    #                 0x02d4b294   guard variable for void GrProcessor::ini...
    # .data           0x0028c600  0x22d3468
    #  .data.rel.ro._ZTVN3gvr7android19ScopedJavaGlobalRefIP12_jfloatArrayEE
    #                 0x02d1e668       0x10 ../third_party/.../libfoo.a(bar.o)
    #                 0x02d1e668   vtable for gvr::android::GlobalRef<_jflo...
    #  ** merge strings
    #                 0x0255fb00   0x1f2424
    #  ** merge constants
    #                 0x0255fb00   0x8
    # ** common       0x02db5700   0x13ab48
    syms = self._symbols
    while True:
      line = self._SkipToLineWithPrefix('.')
      if not line:
        break
      section_name = None
      prev_section_name = None
      try:
        # Parse section name and size.
        parts = self._ParsePossiblyWrappedParts(line, 3)
        if not parts:
          break
        section_name, section_address_str, section_size_str = parts
        section_address = int(section_address_str[2:], 16)

        # .debug sections have address=0, and always come at the end.
        if syms and section_address == 0:
          logging.info('Stopped parsing at %s', section_name)
          break

        section_size = int(section_size_str[2:], 16)

        # E.g. Merge user-defined sections. e.g.: malloc_hook, protected_memory.
        if not section_name.startswith('.'):
          logging.info('Merged %s into %s', section_name, prev_section_name)
          section_name = prev_section_name
          archive_util.ExtendSectionRangeAdjacent(section_ranges, section_name,
                                                  section_address, section_size)
        else:
          prev_section_name = section_name
          self._section_ranges[section_name] = (section_address, section_size)

        if section_name in _USEFUL_SECTIONS:
          logging.info('Parsing %s', section_name)
          if section_name in models.BSS_SECTIONS:
            # Common symbols have no address.
            syms.extend(self._common_symbols)
          prefix_len = len(section_name) + 1  # + 1 for the trailing .
          symbol_gap_count = 0
          merge_symbol_start_address = section_address
          sym_count_at_start = len(syms)
          line = next(self._lines)
          # Parse section symbols.
          while True:
            if not line or line.isspace():
              break
            if line.startswith(' **'):
              zero_index = line.find('0')
              if zero_index == -1:
                # Line wraps.
                name = line.strip()
                line = next(self._lines)
              else:
                # Line does not wrap.
                name = line[:zero_index].strip()
                line = line[zero_index:]
              address_str, size_str = self._ParsePossiblyWrappedParts(line, 2)
              line = next(self._lines)
              # These bytes are already accounted for.
              if name == '** common':
                continue
              address = int(address_str[2:], 16)
              size = int(size_str[2:], 16)
              path = None
              sym = models.Symbol(section_name, size, address=address,
                                  full_name=name, object_path=path)
              syms.append(sym)
              if merge_symbol_start_address > 0:
                merge_symbol_start_address += size
            else:
              # A normal symbol entry.
              subsection_name, address_str, size_str, path = (
                  self._ParsePossiblyWrappedParts(line, 4))
              size = int(size_str[2:], 16)
              assert subsection_name.startswith(section_name), (
                  'subsection name was: ' + subsection_name)
              mangled_name = subsection_name[prefix_len:]
              name = None
              address_str2 = None
              while True:
                line = next(self._lines).rstrip()
                if not line or line.startswith(' .'):
                  break
                # clang includes ** fill, but gcc does not.
                if line.startswith(' ** fill'):
                  # Alignment explicitly recorded in map file. Rather than
                  # record padding based on these entries, we calculate it
                  # using addresses. We do this because fill lines are not
                  # present when compiling with gcc (only for clang).
                  continue
                if line.startswith(' **'):
                  break
                if name is None:
                  address_str2, name = self._ParsePossiblyWrappedParts(line, 2)

              if address_str == '0xffffffffffffffff':
                # The section needs special handling (e.g., a merge section)
                # It also generally has a large offset after it, so don't
                # penalize the subsequent symbol for this gap (e.g. a 50kb gap).
                # There seems to be no corelation between where these gaps occur
                # and the symbols they come in-between.
                # TODO(agrieve): Learn more about why this happens.
                if address_str2:
                  address = int(address_str2[2:], 16) - 1
                elif syms and syms[-1].address > 0:
                  # Merge sym with no second line showing real address.
                  address = syms[-1].end_address
                else:
                  logging.warning('First symbol of section had address -1')
                  address = 0

                merge_symbol_start_address = address + size
              else:
                address = int(address_str[2:], 16)
                # Finish off active address gap / merge section.
                if merge_symbol_start_address:
                  merge_size = address - merge_symbol_start_address
                  merge_symbol_start_address = 0
                  if merge_size > 0:
                    # merge_size == 0 for the initial symbol generally.
                    logging.debug('Merge symbol of size %d found at:\n  %r',
                                  merge_size, syms[-1])
                    # Set size=0 so that it will show up as padding.
                    sym = models.Symbol(
                        section_name, 0,
                        address=address,
                        full_name='** symbol gap %d' % symbol_gap_count)
                    symbol_gap_count += 1
                    syms.append(sym)

              #  .text.res_findResource_60
              #                 0x00178de8       0x12a obj/...
              #                 0x00178de9                res_findResource_60
              #  .text._ZN3url6ParsedC2Ev
              #                 0x0021ad62       0x2e obj/url/url/url_parse.o
              #                 0x0021ad63                url::Parsed::Parsed()
              #  .text.unlikely._ZN4base3CPUC2Ev
              #                 0x003f9d3c       0x48 obj/base/base/cpu.o
              #                 0x003f9d3d                base::CPU::CPU()
              full_name = name or mangled_name
              if mangled_name and (not name or mangled_name.startswith('_Z') or
                                   '._Z' in mangled_name):
                full_name = mangled_name

              flags = _FlagsFromMangledName(mangled_name)
              if full_name:
                if flags:
                  full_name = full_name[_STRIP_NAME_PREFIX[flags]:]
                else:
                  full_name = _NormalizeName(full_name)

              sym = models.Symbol(section_name, size, address=address,
                                  full_name=full_name, object_path=path,
                                  flags=flags)
              syms.append(sym)
          logging.debug('Symbol count for %s: %d', section_name,
                        len(syms) - sym_count_at_start)
      except:
        logging.error('Problem line: %r', line)
        logging.error('In section: %r', section_name)
        raise


class MapFileParserLld:
  """Parses a linker map file from LLD."""
  # Map file writer for LLD linker (for ELF):
  # https://github.com/llvm-mirror/lld/blob/HEAD/ELF/MapFile.cpp
  _LINE_RE_V0 = re.compile(r'([0-9a-f]+)\s+([0-9a-f]+)\s+(\d+) ( *)(.*)')
  _LINE_RE_V1 = re.compile(
      r'\s*[0-9a-f]+\s+([0-9a-f]+)\s+([0-9a-f]+)\s+(\d+) ( *)(.*)')
  _LINE_RE = [_LINE_RE_V0, _LINE_RE_V1]

  def __init__(self, linker_name):
    self._linker_name = linker_name
    self._common_symbols = []
    self._section_ranges = {}

  @staticmethod
  def ParseArmAnnotations(tok):
    """Decides whether a Level 3 token is an annotation.

    Returns:
      A 2-tuple (is_annotation, next_thumb2_mode):
        is_annotation: Whether |tok| is an annotation.
        next_thumb2_mode: New |thumb2_mode| value, or None if keep old value.
    """
    # Annotations for ARM match '$t', '$d.1', but not '$_21::invoke'.
    if tok.startswith('$') and (len(tok) == 2 or
                                (len(tok) >= 3 and tok[2] == '.')):
      if tok.startswith('$t'):
        return True, True  # Is annotation, enter Thumb2 mode.
      if tok.startswith('$a'):
        return True, False  # Is annotation, enter ARM32 mode.
      return True, None  # Is annotation, keep old |thumb2_mode| value.
    return False, None  # Not annotation, keep old |thumb2_mode| value.

  def Tokenize(self, lines):
    """Generator to filter and tokenize linker map lines."""
    # Extract e.g., 'lld_v0' -> 0, or 'lld-lto_v1' -> 1.
    map_file_version = int(self._linker_name.split('_v')[1])
    pattern = MapFileParserLld._LINE_RE[map_file_version]

    # A Level 3 symbol can have |size == 0| in some situations (e.g., assembly
    # code symbols). To provided better size estimates in this case, the "span"
    # of a Level 3 symbol is computed as:
    #  (A) The |address| difference compared to the next Level 3 symbol.
    #  (B) If the Level 3 symbol is the last among Level 3 lines nested under a
    #      Level 2 line: The difference between the Level 3 symbol's |address|
    #      and the containing Level 2 line's end address.
    # To handle (A), |lines| is visited using a one-step lookahead, using
    # |sentinel| to handle the last line. To handle (B), |level2_end_address| is
    # computed for each Level 2 line.
    sentinel = '0 0 0 0 THE_END'
    assert pattern.match(sentinel)
    level2_end_address = None
    thumb2_mode = False
    (line, address, size, level, tok) = (None, None, None, None, None)
    for next_line in itertools.chain(lines, (sentinel,)):
      m = pattern.match(next_line)
      if m is None:
        continue
      next_address = int(m.group(1), 16)
      next_size = int(m.group(2), 16)
      next_level = (len(m.group(4)) // 8) + 1  # Add 1 to agree with comments.
      next_tok = m.group(5)

      if next_level == 3:
        assert level >= 2, 'Cannot jump from Level 1 to Level 3.'
        # Detect annotations. If found, maybe update |thumb2_mode|, then skip.
        (is_annotation, next_thumb2_mode) = (
            MapFileParserLld.ParseArmAnnotations(next_tok))
        if is_annotation:
          if next_thumb2_mode:
            thumb2_mode = next_thumb2_mode
          continue  # Skip annotations.
        if thumb2_mode:
          # Adjust odd address to even. Alignment is not guanteed for all
          # symbols (e.g., data, or x86), so this is judiciously applied.
          next_address &= ~1
      else:
        thumb2_mode = False  # Resets on leaving Level 3.

      if address is not None:
        span = None
        if level == 3:
          span = next_address if next_level == 3 else level2_end_address
          span -= address
        elif level == 2:
          level2_end_address = address + size
        yield (line, address, size, level, span, tok)

      line = next_line
      address = next_address
      size = next_size
      level = next_level
      tok = next_tok

  def Parse(self, lines):
    """Parses a linker map file.

    Args:
      lines: Iterable of lines, the first of which has been consumed to
      identify file type.

    Returns:
      A tuple of (section_ranges, symbols).
    """
    # Newest format:
    #     VMA      LMA     Size Align Out     In      Symbol
    #     194      194       13     1 .interp
    #     194      194       13     1         <internal>:(.interp)
    #     1a8      1a8     22d8     4 .ARM.exidx
    #     1b0      1b0        8     4         obj/sandbox/syscall.o:(.ARM.exidx)
    #     400      400   123400    64 .text
    #     600      600       14     4         ...:(.text.OUTLINED_FUNCTION_0)
    #     600      600        0     1                 $x.3
    #     600      600       14     1                 OUTLINED_FUNCTION_0
    #    3f00     3f00      700     4 malloc_hook
    #    3f00     3f00      700     1         ...:o:(malloc_hook.foo)
    #    3f00     3f00      700     1                 foo (.llvm.1234)
    #  123800   123800    20000   256 .rodata
    #  123800   123800        4     4         ...:o:(.rodata._ZN3fooE.llvm.1234)
    #  123800   123800        4     1                 foo (.llvm.1234)
    #  123804   123804        4     4         ...:o:(.rodata.bar.llvm.1234)
    #  123804   123804        4     1                 bar.llvm.1234
    # Older format:
    # Address          Size             Align Out     In      Symbol
    # 00000000002002a8 000000000000001c     1 .interp
    # 00000000002002a8 000000000000001c     1         <internal>:(.interp)
    # ...
    # 0000000000201000 0000000000000202    16 .text
    # 0000000000201000 000000000000002a     1         /[...]/crt1.o:(.text)
    # 0000000000201000 0000000000000000     0                 _start
    # 000000000020102a 0000000000000000     1         /[...]/crti.o:(.text)
    # 0000000000201030 00000000000000bd    16         /[...]/crtbegin.o:(.text)
    # 0000000000201030 0000000000000000     0             deregister_tm_clones
    # 0000000000201060 0000000000000000     0             register_tm_clones
    # 00000000002010a0 0000000000000000     0             __do_global_dtors_aux
    # 00000000002010c0 0000000000000000     0             frame_dummy
    # 00000000002010ed 0000000000000071     1         a.o:(.text)
    # 00000000002010ed 0000000000000071     0             main
    syms = []
    cur_section = None
    cur_section_is_useful = False
    promoted_name_count = 0
    # |is_partial| indicates that an eligible Level 3 line should be used to
    # update |syms[-1].full_name| instead of creating a new symbol.
    is_partial = False
    # Assembly code can create consecutive Level 3 lines with |size == 0|. These
    # lines can represent
    #  (1) assembly functions (should form symbol), or
    #  (2) assembly labels (should NOT form symbol).
    # It seems (2) correlates with the presence of a leading Level 3 line with
    # |size > 0|. This gives rise to the following strategy: Each symbol S from
    # a Level 3 line suppresses Level 3 lines with |address| less than
    # |next_usable_address := S.address + S.size|.
    next_usable_address = 0

    # For Thin-LTO, a map from each address to the Thin-LTO cache file. This
    # provides hints downstream to identify object_paths for .L.ref.tmp symbols,
    # but is not useful in the final output. Therefore it's stored separately,
    # instead of being in Symbol.
    thin_map = {}

    tokenizer = self.Tokenize(lines)

    in_jump_table = False
    jump_tables_count = 0
    jump_entries_count = 0
    prev_section_end = 0
    prev_section_name = None

    for (line, address, size, level, span, tok) in tokenizer:
      # Level 1 data match the "Out" column. They specify sections or
      # PROVIDE_HIDDEN lines.
      if level == 1:
        # .debug sections have address=0, and always come at the end.
        # Once we've hit a partition, we've finished the main library.
        # Ideally we'd also break down symbols in partitions, but we're likely
        # to stop using them soon anyways.
        if (syms and address == 0 or tok.endswith('_partition')
            or tok.startswith('PROVIDE_HIDDEN')):
          logging.info('Stopped parsing at %s', tok)
          break

        cur_section = tok
        assert address >= prev_section_end, (
            f'Section {cur_section} has start address within previous section: '
            f'{address}\n{self._section_ranges}')

        # E.g. Merge user-defined sections. e.g.: malloc_hook, protected_memory.
        if not cur_section.startswith('.'):
          logging.info('Merged %s into %s', cur_section, prev_section_name)
          cur_section = prev_section_name
          archive_util.ExtendSectionRangeAdjacent(self._section_ranges,
                                                  cur_section, address, size)
        else:
          prev_section_name = cur_section
          self._section_ranges[cur_section] = (address, size)

        if cur_section not in models.BSS_SECTIONS:
          prev_section_end = address + size

        # E.g., Want to convert "(.text._name)" -> "_name" later.
        mangled_start_idx = len(cur_section) + 2
        cur_section_is_useful = cur_section in _USEFUL_SECTIONS

      elif cur_section_is_useful:
        # Level 2 data match the "In" column. They specify object paths and
        # section names within objects, or '<internal>:...'.
        if level == 2:
          # E.g., 'path.o:(.text._name)' => ['path.o', '(.text._name)'].
          cur_obj, paren_value = tok.split(':')

          in_jump_table = '.L.cfi.jumptable' in paren_value
          if in_jump_table:
            # Store each CFI jump table as a Level 2 symbol, whose Level 3
            # details are discarded.
            jump_tables_count += 1
            cur_obj = ''  # Replaces 'lto.tmp' to prevent problem later.
            mangled_name = '** CFI jump table'
          else:
            # E.g., '(.text.unlikely._name)' -> '_name'.
            mangled_name = paren_value[mangled_start_idx:-1]
            cur_flags = _FlagsFromMangledName(mangled_name)
            is_partial = True
            # As of 2017/11 LLD does not distinguish merged strings from other
            # merged data. Feature request is filed under:
            # https://bugs.llvm.org/show_bug.cgi?id=35248
            if cur_obj == '<internal>':
              if cur_section == '.rodata':
                # Treat all <internal> sections within .rodata as as string
                # literals. Some may hold numeric constants or other data, but
                # there is currently no way to distinguish them.
                mangled_name = '** lld merge strings'
              else:
                # e.g. <internal>:(.text.thunk)
                mangled_name = '** ' + paren_value.strip('()')

              is_partial = False
              cur_obj = None
            elif (cur_obj == 'lto.tmp' or 'thinlto-cache' in cur_obj
                  or '.lto.' in cur_obj):
              thin_map[address] = os.path.basename(cur_obj)
              cur_obj = None

          # Create a symbol here since there may be no ensuing Level 3 lines.
          # But if there are, then the symbol can be modified later as sym[-1].
          sym = models.Symbol(cur_section, size, address=address,
                              full_name=mangled_name, object_path=cur_obj,
                              flags=cur_flags)
          syms.append(sym)

          # Level 3 |address| is nested under Level 2, don't add |size|.
          next_usable_address = address

        # Level 3 data match the "Symbol" column. They specify symbol names or
        # special names such as '.L_MergeGlobals'. Annotations such as '$d',
        # '$t.42' also appear at Level 3, but they are consumed by |tokenizer|,
        # so don't appear hear.
        elif level == 3:
          # Handle .L.cfi.jumptable.
          if in_jump_table:
            # Level 3 entries in CFI jump tables are thunks with mangled names.
            # Extracting them as symbols is not worthwhile; we only store the
            # Level 2 symbol, and print the count for verbose output. For
            # counting, '__typeid_' entries are excluded since they're likely
            # just annotations.
            if not tok.startswith('__typeid_'):
              jump_entries_count += 1
            continue

          # Ignore anything with '.L_MergedGlobals' prefix. This seems to only
          # happen for ARM (32-bit) builds.
          if tok.startswith('.L_MergedGlobals'):
            continue

          # Use |span| to decide whether to use a Level 3 line for Symbols. This
          # is useful for two purposes:
          # * This is a better indicator than |size|, which can be 0 for
          #   assembly functions.
          # * If multiple Level 3 lines have the same starting address, this
          #   cause all but the last line to have |span > 0|. This dedups lines
          #   with identical symbol names (why do they exist?). Note that this
          #   also skips legitimate aliases, but that's desired because nm.py
          #   (downstream) assumes no aliases already exist.
          if span > 0:
            stripped_tok = demangle.StripLlvmPromotedGlobalNames(tok)
            if len(tok) != len(stripped_tok):
              promoted_name_count += 1
              tok = stripped_tok
            tok = _NormalizeName(tok)

            # Handle special case where a partial symbol consumes bytes before
            # the first Level 3 symbol.
            if is_partial and syms[-1].address < address:
              # Truncate the partial symbol and leave it without |full_name|.
              # The data from the current line will form a new symbol.
              syms[-1].size = address - syms[-1].address
              next_usable_address = address
              is_partial = False

            if is_partial:
              syms[-1].full_name = tok
              syms[-1].size = size if size > 0 else min(syms[-1].size, span)
              next_usable_address = address + syms[-1].size
              is_partial = False
            elif address >= next_usable_address:
              if tok.startswith('__typeid_'):
                assert size == 1
                if tok.endswith('_byte_array'):
                  # CFI byte array table: |size| is inaccurate, so use |span|.
                  size_to_use = span
                else:
                  # Likely '_global_addr' or '_unique_member'. These should be:
                  # * Skipped since they're in CFI tables.
                  # * Suppressed (via |next_usable_address|) by another Level 3
                  #   symbol.
                  # Anything that makes it here would be an anomaly worthy of
                  # investigation, so print warnings.
                  logging.warning('Unrecognized __typeid_ symbol at %08X',
                                  address)
                  continue
              else:
                # Prefer |size|, and only fall back to |span| if |size == 0|.
                size_to_use = size if size > 0 else span
              sym = models.Symbol(cur_section, size_to_use, address=address,
                                  full_name=tok, flags=cur_flags)
              syms.append(sym)

              # Suppress symbols with overlapping |address|. This eliminates
              # labels from assembly sources.
              next_usable_address = address + size_to_use
              if cur_obj is not None:
                syms[-1].object_path = cur_obj

        else:
          logging.error('Problem line: %r', line)

    if promoted_name_count:
      logging.info('Found %d promoted global names', promoted_name_count)
    if jump_tables_count:
      logging.info('Found %d CFI jump tables with %d total entries',
                   jump_tables_count, jump_entries_count)
    return self._section_ranges, syms, {'thin_map': thin_map}


def _DetectLto(lines):
  """Scans LLD linker map file and returns whether LTO was used."""
  # It's assumed that the first line in |lines| was consumed to determine that
  # LLD was used. Seek 'thinlto-cache' prefix or the string '.lto' within an
  # "indicator section" as indicator for LTO.
  found_indicator_section = False
  # Potential names of "main section". Only one gets used.
  indicator_section_set = set(['.rodata', '.ARM.exidx'])
  start_pos = -1
  for line in lines:
    # Shortcut to avoid regex: The first line seen (second line in file) should
    # start a section, and start with '.', e.g.:
    #     194      194       13     1 .interp
    # Assign |start_pos| as position of '.', and trim everything before!
    if start_pos < 0:
      start_pos = line.index('.')
    if len(line) < start_pos:
      continue
    line = line[start_pos:]
    tok = line.lstrip()  # Allow whitespace at right.
    indent_size = len(line) - len(tok)
    if indent_size == 0:  # Section change.
      if found_indicator_section:  # Exit if just visited "main section".
        break
      if tok.strip() in indicator_section_set:
        found_indicator_section = True
    elif indent_size == 8:
      if found_indicator_section:
        if tok.startswith('thinlto-cache') or '.lto.' in tok:
          return True
  return False


def _DetectLinkerName(lines):
  """Heuristic linker detection from partial scan of the linker map.

  Args:
    lines: Iterable of lines from the linker map.

  Returns:
    A coded linker name.
  """
  first_line = next(lines)

  if first_line.startswith('Address'):
    return 'lld-lto_v0' if _DetectLto(lines) else 'lld_v0'

  if first_line.lstrip().startswith('VMA'):
    return 'lld-lto_v1' if _DetectLto(lines) else 'lld_v1'

  if first_line.startswith('Archive member'):
    return 'gold'

  raise Exception('Invalid map file: ' + first_line)


def ParseLines(lines):
  """Parses a linker map file given an iterable of its lines.

  Returns:
    A tuple of (section_ranges, symbols, extras).
  """
  # Buffer 1000 lines for format detection.
  header = list(itertools.islice(lines, 1000))
  lines = itertools.chain(header, lines)
  linker_name = _DetectLinkerName(iter(header))
  logging.info('Detected map file of type %s', linker_name)
  if linker_name.startswith('lld'):
    inner_parser = MapFileParserLld(linker_name)
  elif linker_name == 'gold':
    inner_parser = MapFileParserGold()
  else:
    raise Exception('.map file is from a unsupported linker.')

  next(lines)  # Consume the first line of headers.
  section_ranges, syms, extras = inner_parser.Parse(lines)
  for sym in syms:
    if sym.object_path and not sym.object_path.endswith(')'):
      # Don't want '' to become '.'.
      # Thin archives' paths will get fixed in |ar.CreateThinObjectPath|.
      sym.object_path = os.path.normpath(sym.object_path)
  return section_ranges, syms, extras


def ParseFile(path):
  """Parses a linker map file pointed to by |path|.

  Returns:
    A tuple of (section_ranges, symbols, extras).
  """
  with _OpenMaybeGzAsText(path) as f:
    return ParseLines(f)


def DeduceObjectPathsFromThinMap(raw_symbols, extras):
  """Uses Thin-LTO object paths to find object_paths of symbols. """
  thin_map = extras.get('thin_map', None)  # |address| -> |thin_obj|
  if not thin_map:  # None or empty.
    logging.info('No thin-object-path found: Skipping object path deduction.')
    return

  # Build map of |thin_obj| -> |object_paths|.
  thin_obj_to_object_paths = collections.defaultdict(set)
  logging.info('Building map of thin-object-path -> object path.')
  for symbol in raw_symbols:
    if symbol.object_path:
      thin_obj = thin_map.get(symbol.address, None)
      if thin_obj:
        thin_obj_to_object_paths[thin_obj].add(symbol.object_path)

  # For each symbol without |object_path|, translate |address| -> |thin_obj| ->
  # |object_paths|. If unique, then assign to symbol. Stats are kept, keyed on
  # |len(object_paths)|.
  # Example symbols this happens with: ".L.ref.tmp", "** outlined function".
  logging.info('Assigning object paths to using ThinLTO paths.')
  ref_tmp_popu = [0] * 3
  ref_tmp_pss = [0] * 3
  for symbol in raw_symbols:
    if not symbol.object_path:
      thin_obj = thin_map.get(symbol.address)
      # Ignore non-native symbols.
      if thin_obj:
        count = 0
        object_paths = thin_obj_to_object_paths.get(thin_obj)
        if object_paths is not None:
          count = min(len(object_paths), 2)  # 2+ maps to 2.
          # We could create path aliases when count > 1, but it wouldn't
          # necessarily be correct. That occurs when *another* symbol from the
          # same .o file contains a path alias, but not necessarily this symbol.
          if count == 1:
            symbol.object_path = next(iter(object_paths))
        ref_tmp_popu[count] += 1
        ref_tmp_pss[count] += symbol.pss

  # As of Mar 2019:
  #   No match: 2 symbols with total PSS = 20
  #   Assigned (1 object path): 1098 symbols with total PSS = 55454
  #   Ambiguous (2+ object paths): 2315 symbols with total PSS = 41941
  logging.info('Object path deduction results for pathless symbols:')
  logging.info('  No match: %d symbols with total PSS = %d', ref_tmp_popu[0],
               ref_tmp_pss[0])
  logging.info('  Assigned (1 object path): %d symbols with total PSS = %d',
               ref_tmp_popu[1], ref_tmp_pss[1])
  logging.info('  Ambiguous (2+ object paths): %d symbols with total PSS = %d',
               ref_tmp_popu[2], ref_tmp_pss[2])


def main():
  parser = argparse.ArgumentParser()
  parser.add_argument('linker_file', type=os.path.realpath)
  parser.add_argument(
      '-v',
      '--verbose',
      default=0,
      action='count',
      help='Verbose level (multiple times for more)')
  parser.add_argument('--dump', action='store_true')
  args = parser.parse_args()

  logging.basicConfig(
      level=logging.WARNING - args.verbose * 10,
      format='%(levelname).1s %(relativeCreated)6d %(message)s')

  section_ranges, syms, extras = ParseFile(args.linker_file)

  if args.dump:
    print(section_ranges)
    for sym in syms:
      print(sym)
  else:
    # Enter interactive shell.
    readline.parse_and_bind('tab: complete')
    variables = {
        'section_ranges': section_ranges,
        'syms': syms,
        'extras': extras
    }
    banner_lines = [
        '*' * 80,
        'Variables:',
        '  section_ranges: Map from section name to (address, size).',
        '  syms: Raw symbols parsed from the linker map file.',
        '  extras: Format-specific extra data.',
        '*' * 80,
    ]
    code.InteractiveConsole(variables).interact('\n'.join(banner_lines))


if __name__ == '__main__':
  main()