0

valgrind/asan: update asan_symbolize.py

We need to update this for https://reviews.llvm.org/D60529

Bug: 1288169
Change-Id: I3c8f5b3f173be0ce5771e279f6a402deaafb9501
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/3301851
Auto-Submit: Takuto Ikuta <tikuta@chromium.org>
Reviewed-by: Nico Weber <thakis@chromium.org>
Commit-Queue: Nico Weber <thakis@chromium.org>
Cr-Commit-Position: refs/heads/main@{#964541}
This commit is contained in:
Takuto Ikuta
2022-01-28 13:08:04 +00:00
committed by Chromium LUCI CQ
parent d18700ce0d
commit a6a7b65891
4 changed files with 683 additions and 138 deletions

@ -4,6 +4,7 @@
# Third-party code.
third_party/blink/tools/blinkpy/third_party/*
third_party/blink/web_tests/external/wpt/*
tools/valgrind/asan/third_party/asan_symbolize.py
# TODO(crbug.com/1116155): Enable this for formatting by yapf.
tools/json_schema_compiler/*

@ -100,30 +100,6 @@ def find_inode_at_path(inode, path):
return ret
# Create a binary name filter that works around https://crbug.com/444835.
# When running tests on OSX swarming servers, ASan sometimes prints paths to
# files in cache (ending with SHA1 filenames) instead of paths to hardlinks to
# those files in the product dir.
# For a given |binary_path| chrome_osx_binary_name_filter() returns one of the
# hardlinks to the same inode in |product_dir_path|.
def make_chrome_osx_binary_name_filter(product_dir_path=''):
def chrome_osx_binary_name_filter(binary_path):
basename = os.path.basename(binary_path)
if is_hash_name(basename) and product_dir_path:
inode = os.stat(binary_path).st_ino
new_binary_path = find_inode_at_path(inode, product_dir_path)
if new_binary_path:
return new_binary_path
return binary_path
return chrome_osx_binary_name_filter
def make_sysroot_filter(sysroot):
"""Creates a binary name filter for a symbol tree of a remote system."""
return lambda binary_path: sysroot + binary_path
# Construct a path to the .dSYM bundle for the given binary.
# There are three possible cases for binary location in Chromium:
# 1. The binary is a standalone executable or dynamic library in the product
@ -182,6 +158,11 @@ class JSONTestRunSymbolizer(object):
def symbolize(self, test_run):
original_snippet = base64.b64decode(
test_run['output_snippet_base64']).decode('utf-8', 'replace')
# replace non-ascii character with '?'.
original_snippet = ''.join(i if i <= u'~' else u'?'
for i in original_snippet)
symbolized_snippet = self.symbolize_snippet(original_snippet)
if symbolized_snippet == original_snippet:
# No sanitizer reports in snippet.
@ -211,6 +192,26 @@ def symbolize_snippets_in_json(filename, symbolization_loop):
json.dump(json_data, f, indent=3, sort_keys=True)
class macOSBinaryNameFilterPlugin(asan_symbolize.AsanSymbolizerPlugIn):
  """
  Binary path filter that works around https://crbug.com/444835.

  When running tests on OSX swarming servers, ASan sometimes prints paths to
  files in cache (ending with SHA1 filenames) instead of paths to hardlinks
  to those files in the product dir. For such a path this plug-in returns one
  of the hardlinks to the same inode found under |product_dir_path|.
  """

  def __init__(self):
    # Directory searched for hardlinks; an empty value disables the filter.
    self.product_dir_path = ''

  def filter_binary_path(self, binary_path):
    """
    Returns the path that should be used to symbolize |binary_path|.

    The input is returned unchanged unless its basename is a hash name,
    |product_dir_path| is set, and a file with the same inode is found there.
    """
    leaf_name = os.path.basename(binary_path)
    if not (is_hash_name(leaf_name) and self.product_dir_path):
      return binary_path
    hardlink_path = find_inode_at_path(os.stat(binary_path).st_ino,
                                       self.product_dir_path)
    return hardlink_path if hardlink_path else binary_path
def main():
parser = argparse.ArgumentParser(description='Symbolize sanitizer reports.')
parser.add_argument('--test-summary-json-file',
@ -234,22 +235,27 @@ def main():
# Most source paths for Chromium binaries start with
# /path/to/src/out/Release/../../
asan_symbolize.fix_filename_patterns.append('Release/../../')
binary_name_filter = None
if args.sysroot:
binary_name_filter = make_sysroot_filter(args.sysroot)
elif platform.uname()[0] == 'Darwin':
binary_name_filter = make_chrome_osx_binary_name_filter(
chrome_product_dir_path(args.executable_path))
loop = asan_symbolize.SymbolizationLoop(
binary_name_filter=binary_name_filter,
dsym_hint_producer=chrome_dsym_hints)
if args.test_summary_json_file:
symbolize_snippets_in_json(args.test_summary_json_file, loop)
else:
# Process stdin.
asan_symbolize.logfile = sys.stdin
loop.process_logfile()
with asan_symbolize.AsanSymbolizerPlugInProxy() as plugin_proxy:
if args.sysroot:
sysroot_filter = asan_symbolize.SysRootFilterPlugIn()
sysroot_filter.sysroot_path = args.sysroot
plugin_proxy.add_plugin(sysroot_filter)
elif platform.uname()[0] == 'Darwin':
macos_filter = macOSBinaryNameFilterPlugin()
macos_filter.product_dir_path = args.executable_path
plugin_proxy.add_plugin(macos_filter)
loop = asan_symbolize.SymbolizationLoop(
plugin_proxy=plugin_proxy, dsym_hint_producer=chrome_dsym_hints)
if args.test_summary_json_file:
symbolize_snippets_in_json(args.test_summary_json_file, loop)
else:
# Process stdin.
asan_symbolize.logfile = sys.stdin
loop.process_logfile()
if __name__ == '__main__':
main()

@ -1,7 +1,7 @@
Name: asan_symbolize.py
License: University of Illinois Open Source License.
Version: r227327
URL: http://llvm.org/viewvc/llvm-project/compiler-rt/trunk/lib/asan/scripts/asan_symbolize.py?view=co&content-type=text%2Fplain
License: Apache-2.0 WITH LLVM-exception
Version: a81e45b8bcb8eb274ad73357e10e2cdf8a314a8c
URL: https://github.com/llvm/llvm-project/blob/a81e45b8bcb8eb274ad73357e10e2cdf8a314a8c/compiler-rt/lib/asan/scripts/asan_symbolize.py
Security Critical: no
asan_symbolize.py is a verbatim copy of asan_symbolize.py in the LLVM trunk.

@ -1,26 +1,38 @@
#!/usr/bin/env python
#===- lib/asan/scripts/asan_symbolize.py -----------------------------------===#
#
# The LLVM Compiler Infrastructure
#
# This file is distributed under the University of Illinois Open Source
# License. See LICENSE.TXT for details.
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
#===------------------------------------------------------------------------===#
"""
Example of use:
asan_symbolize.py -c "$HOME/opt/cross/bin/arm-linux-gnueabi-" -s "$HOME/SymbolFiles" < asan.log
PLUGINS
This script provides a way for external plug-ins to hook into the behaviour of
various parts of this script (see `--plugins`). This is useful for situations
where it is necessary to handle site-specific quirks (e.g. binaries with debug
symbols only accessible via a remote service) without having to modify the
script itself.
"""
import argparse
import bisect
import errno
import getopt
import logging
import os
import re
import subprocess
import sys
from distutils.spawn import find_executable
symbolizers = {}
DEBUG = False
demangle = False
binutils_prefix = None
sysroot_path = None
binary_name_filter = None
fix_filename_patterns = None
logfile = sys.stdin
allow_system_symbolizer = True
@ -31,16 +43,14 @@ def fix_filename(file_name):
if fix_filename_patterns:
for path_to_cut in fix_filename_patterns:
file_name = re.sub('.*' + path_to_cut, '', file_name)
file_name = re.sub('.*asan_[a-z_]*.cc:[0-9]*', '_asan_rtl_', file_name)
file_name = re.sub('.*asan_[a-z_]*.(cc|cpp):[0-9]*', '_asan_rtl_', file_name)
file_name = re.sub('.*crtstuff.c:0', '???:0', file_name)
return file_name
def sysroot_path_filter(binary_name):
return sysroot_path + binary_name
def is_valid_arch(s):
return s in ["i386", "x86_64", "x86_64h", "arm", "armv6", "armv7", "armv7s",
"armv7k", "arm64", "powerpc64", "powerpc64le", "s390x", "s390"]
"armv7k", "arm64", "powerpc64", "powerpc64le", "s390x", "s390",
"riscv64"]
def guess_arch(addr):
# Guess which arch we're running. 10 = len('0x') + 8 hex digits.
@ -79,20 +89,15 @@ class LLVMSymbolizer(Symbolizer):
self.pipe = self.open_llvm_symbolizer()
def open_llvm_symbolizer(self):
cmd = [
self.symbolizer_path,
'--use-symbol-table=true',
# TODO(https://crbug.com/1132789) remove this temporary fix.
'--%sdemangle' % ("" if demangle else "no-"),
'--functions=linkage',
'--inlining=true',
'--default-arch=%s' % self.default_arch
]
cmd = [self.symbolizer_path,
('--demangle' if demangle else '--no-demangle'),
'--functions=linkage',
'--inlines',
'--default-arch=%s' % self.default_arch]
if self.system == 'Darwin':
for hint in self.dsym_hints:
cmd.append('--dsym-hint=%s' % hint)
if DEBUG:
print(' '.join(cmd))
logging.debug(' '.join(cmd))
try:
result = subprocess.Popen(cmd, stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
@ -109,8 +114,7 @@ class LLVMSymbolizer(Symbolizer):
result = []
try:
symbolizer_input = '"%s" %s' % (binary, offset)
if DEBUG:
print(symbolizer_input)
logging.debug(symbolizer_input)
self.pipe.stdin.write("%s\n" % symbolizer_input)
while True:
function_name = self.pipe.stdout.readline().rstrip()
@ -151,12 +155,12 @@ class Addr2LineSymbolizer(Symbolizer):
addr2line_tool = 'addr2line'
if binutils_prefix:
addr2line_tool = binutils_prefix + addr2line_tool
logging.debug('addr2line binary is %s' % find_executable(addr2line_tool))
cmd = [addr2line_tool, '-fi']
if demangle:
cmd += ['--demangle']
cmd += ['-e', self.binary]
if DEBUG:
print(' '.join(cmd))
logging.debug(' '.join(cmd))
return subprocess.Popen(cmd,
stdin=subprocess.PIPE, stdout=subprocess.PIPE,
bufsize=0,
@ -173,14 +177,43 @@ class Addr2LineSymbolizer(Symbolizer):
is_first_frame = True
while True:
function_name = self.pipe.stdout.readline().rstrip()
logging.debug("read function_name='%s' from addr2line" % function_name)
# If llvm-symbolizer is installed as addr2line, older versions of
# llvm-symbolizer will print -1 when presented with -1 and not print
# a second line. In that case we will block for ever trying to read the
# file name. This also happens for non-existent files, in which case GNU
# addr2line exits immediately, but llvm-symbolizer does not (see
# https://llvm.org/PR42754).
if function_name == '-1':
logging.debug("got function '-1' -> no more input")
break
file_name = self.pipe.stdout.readline().rstrip()
logging.debug("read file_name='%s' from addr2line" % file_name)
if is_first_frame:
is_first_frame = False
elif function_name in ['', '??']:
assert file_name == function_name
elif function_name == '??':
assert file_name == '??:0', file_name
logging.debug("got function '??' -> no more input")
break
lines.append((function_name, file_name));
except Exception:
elif not function_name:
assert not file_name, file_name
logging.debug("got empty function name -> no more input")
break
if not function_name and not file_name:
logging.debug("got empty function and file name -> unknown function")
function_name = '??'
file_name = '??:0'
lines.append((function_name, file_name))
except IOError as e:
# EPIPE happens if addr2line exits early (which some implementations do
# if an invalid file is passed).
if e.errno == errno.EPIPE:
logging.debug("addr2line exited early (broken pipe), returncode=%d" % self.pipe.poll())
else:
logging.debug("unexpected I/O exception communicating with addr2line", exc_info=e)
lines.append(('??', '??:0'))
except Exception as e:
logging.debug("got unknown exception communicating with addr2line", exc_info=e)
lines.append(('??', '??:0'))
return ['%s in %s %s' % (addr, function, fix_filename(file)) for (function, file) in lines]
@ -225,8 +258,7 @@ class DarwinSymbolizer(Symbolizer):
self.open_atos()
def open_atos(self):
if DEBUG:
print('atos -o %s -arch %s' % (self.binary, self.arch))
logging.debug('atos -o %s -arch %s', self.binary, self.arch)
cmdline = ['atos', '-o', self.binary, '-arch', self.arch]
self.atos = UnbufferedLineConverter(cmdline, close_stderr=True)
@ -234,17 +266,23 @@ class DarwinSymbolizer(Symbolizer):
"""Overrides Symbolizer.symbolize."""
if self.binary != binary:
return None
if not os.path.exists(binary):
# If the binary doesn't exist atos will exit which will lead to IOError
# exceptions being raised later on so just don't try to symbolize.
return ['{} ({}:{}+{})'.format(addr, binary, self.arch, offset)]
atos_line = self.atos.convert('0x%x' % int(offset, 16))
while "got symbolicator for" in atos_line:
atos_line = self.atos.readline()
# A well-formed atos response looks like this:
# foo(type1, type2) (in object.name) (filename.cc:80)
# NOTE:
# * For C functions atos omits parentheses and argument types.
# * For C++ functions the function name (i.e., `foo` above) may contain
# templates which may contain parentheses.
match = re.match('^(.*) \(in (.*)\) \((.*:\d*)\)$', atos_line)
if DEBUG:
print('atos_line: ', atos_line)
logging.debug('atos_line: %s', atos_line)
if match:
function_name = match.group(1)
function_name = re.sub('\(.*?\)', '', function_name)
file_name = fix_filename(match.group(3))
return ['%s in %s %s' % (addr, function_name, file_name)]
else:
@ -283,7 +321,7 @@ def BreakpadSymbolizerFactory(binary):
def SystemSymbolizerFactory(system, addr, binary, arch):
if system == 'Darwin':
return DarwinSymbolizer(addr, binary, arch)
elif system == 'Linux' or system == 'FreeBSD':
elif system in ['Linux', 'FreeBSD', 'NetBSD', 'SunOS']:
return Addr2LineSymbolizer(binary)
@ -362,7 +400,8 @@ class BreakpadSymbolizer(Symbolizer):
class SymbolizationLoop(object):
def __init__(self, binary_name_filter=None, dsym_hint_producer=None):
def __init__(self, plugin_proxy=None, dsym_hint_producer=None):
self.plugin_proxy = plugin_proxy
if sys.platform == 'win32':
# ASan on Windows uses dbghelp.dll to symbolize in-process, which works
# even in sandboxed processes. Nothing needs to be done here.
@ -370,16 +409,16 @@ class SymbolizationLoop(object):
else:
# Used by clients who may want to supply a different binary name.
# E.g. in Chrome several binaries may share a single .dSYM.
self.binary_name_filter = binary_name_filter
self.dsym_hint_producer = dsym_hint_producer
self.system = os.uname()[0]
if self.system not in ['Linux', 'Darwin', 'FreeBSD']:
if self.system not in ['Linux', 'Darwin', 'FreeBSD', 'NetBSD','SunOS']:
raise Exception('Unknown system')
self.llvm_symbolizers = {}
self.last_llvm_symbolizer = None
self.dsym_hints = set([])
self.frame_no = 0
self.process_line = self.process_line_posix
self.using_module_map = plugin_proxy.has_plugin(ModuleMapPlugIn.get_name())
def symbolize_address(self, addr, binary, offset, arch):
# On non-Darwin (i.e. on platforms without .dSYM debug info) always use
@ -402,7 +441,7 @@ class SymbolizationLoop(object):
use_new_symbolizer = bool(dsym_hints_for_binary - self.dsym_hints)
self.dsym_hints |= dsym_hints_for_binary
if self.last_llvm_symbolizer and not use_new_symbolizer:
self.llvm_symbolizers[binary] = self.last_llvm_symbolizer
self.llvm_symbolizers[binary] = self.last_llvm_symbolizer
else:
self.last_llvm_symbolizer = LLVMSymbolizerFactory(
self.system, arch, self.dsym_hints)
@ -427,10 +466,13 @@ class SymbolizationLoop(object):
assert result
return result
def get_symbolized_lines(self, symbolized_lines):
def get_symbolized_lines(self, symbolized_lines, inc_frame_counter=True):
if not symbolized_lines:
if inc_frame_counter:
self.frame_no += 1
return [self.current_line]
else:
assert inc_frame_counter
result = []
for symbolized_frame in symbolized_lines:
result.append(' #%s %s' % (str(self.frame_no), symbolized_frame.rstrip()))
@ -439,13 +481,7 @@ class SymbolizationLoop(object):
def process_logfile(self):
self.frame_no = 0
# We use readline directly instead of an iterator ("for line in logfile")
# because the latter introduces extra buffering that is undesirable for
# interactive use.
while 1:
line = logfile.readline()
if not line:
break
for line in logfile:
processed = self.process_line(line)
print('\n'.join(processed))
@ -454,15 +490,29 @@ class SymbolizationLoop(object):
def process_line_posix(self, line):
self.current_line = line.rstrip()
#0 0x7f6e35cf2e45 (/blah/foo.so+0x11fe45)
# Unsymbolicated:
# #0 0x7f6e35cf2e45 (/blah/foo.so+0x11fe45)
# Partially symbolicated:
# #0 0x7f6e35cf2e45 in foo (foo.so+0x11fe45)
# NOTE: We have to be very liberal with symbol
# names in the regex because it could be an
# Objective-C or C++ demangled name.
stack_trace_line_format = (
'^( *#([0-9]+) *)(0x[0-9a-f]+) *\((.*)\+(0x[0-9a-f]+)\)')
'^( *#([0-9]+) *)(0x[0-9a-f]+) *(?:in *.+)? *\((.*)\+(0x[0-9a-f]+)\)')
match = re.match(stack_trace_line_format, line)
if not match:
return [self.current_line]
if DEBUG:
print(line)
logging.debug('Line "{}" does not match regex'.format(line))
# Not a frame line so don't increment the frame counter.
return self.get_symbolized_lines(None, inc_frame_counter=False)
logging.debug(line)
_, frameno_str, addr, binary, offset = match.groups()
if not self.using_module_map and not os.path.isabs(binary):
# Do not try to symbolicate if the binary is just the module file name
# and a module map is unavailable.
# FIXME(dliew): This is currently necessary for reports on Darwin that are
# partially symbolicated by `atos`.
return self.get_symbolized_lines(None)
arch = ""
# Arch can be embedded in the filename, e.g.: "libabc.dylib:x86_64h"
colon_pos = binary.rfind(":")
@ -477,52 +527,540 @@ class SymbolizationLoop(object):
# Assume that frame #0 is the first frame of new stack trace.
self.frame_no = 0
original_binary = binary
if self.binary_name_filter:
binary = self.binary_name_filter(binary)
binary = self.plugin_proxy.filter_binary_path(binary)
if binary is None:
# The binary filter has told us this binary can't be symbolized.
logging.debug('Skipping symbolication of binary "%s"', original_binary)
return self.get_symbolized_lines(None)
symbolized_line = self.symbolize_address(addr, binary, offset, arch)
if not symbolized_line:
if original_binary != binary:
symbolized_line = self.symbolize_address(addr, binary, offset, arch)
symbolized_line = self.symbolize_address(addr, original_binary, offset, arch)
return self.get_symbolized_lines(symbolized_line)
class AsanSymbolizerPlugInProxy(object):
  """
  Owns the active plug-ins and forwards hook calls to them.

  - Manages the lifetime of plug-ins (must be used in a `with` statement so
    that every plug-in still registered gets `destroy()` called on exit).
  - Provides the interface this script uses to call into plug-ins.
  """
  def __init__(self):
    # Plug-ins in registration order; filters are applied in this order.
    self._plugins = []
    # Names of the registered plug-ins, for `has_plugin()` lookups.
    self._plugin_names = set()

  def _load_plugin_from_file_impl_py_gt_2(self, file_path, globals_space):
    """
    Executes the plug-in source in `file_path` with `globals_space` as its
    global namespace (Python 3 replacement for `execfile`).
    """
    with open(file_path, 'r') as plugin_file:
      plugin_source = plugin_file.read()
      exec(plugin_source, globals_space, None)

  def load_plugin_from_file(self, file_path):
    """
    Runs the plug-in script at `file_path`.

    The script sees a copy of this module's globals plus a
    `register_plugin(plugin)` function that adds `plugin` to this proxy.
    """
    logging.info('Loading plugins from "{}"'.format(file_path))

    # Function handed to the plug-in script so it can register plug-ins.
    def register_plugin(plugin):
      logging.info('Registering plugin %s', plugin.get_name())
      self.add_plugin(plugin)

    globals_space = dict(globals(), register_plugin=register_plugin)
    if sys.version_info.major >= 3:
      # Indirection here is to avoid a bug in older Python 2 versions:
      # `SyntaxError: unqualified exec is not allowed in function ...`
      self._load_plugin_from_file_impl_py_gt_2(file_path, globals_space)
    else:
      execfile(file_path, globals_space, None)

  def add_plugin(self, plugin):
    """
    Registers `plugin` and hands it a reference to this proxy.
    """
    assert isinstance(plugin, AsanSymbolizerPlugIn)
    self._plugins.append(plugin)
    self._plugin_names.add(plugin.get_name())
    plugin._receive_proxy(self)

  def remove_plugin(self, plugin):
    """
    Unregisters `plugin` and calls its `destroy()` hook.
    """
    assert isinstance(plugin, AsanSymbolizerPlugIn)
    self._plugins.remove(plugin)
    self._plugin_names.remove(plugin.get_name())
    logging.debug('Removing plugin %s', plugin.get_name())
    plugin.destroy()

  def has_plugin(self, name):
    """
    Returns true iff a plug-in with this name is currently
    being managed by AsanSymbolizerPlugInProxy.
    """
    return name in self._plugin_names

  def register_cmdline_args(self, parser):
    """
    Lets every registered plug-in add its options to `parser`.
    """
    for plugin in list(self._plugins):
      plugin.register_cmdline_args(parser)

  def process_cmdline_args(self, pargs):
    """
    Passes the parsed arguments to every plug-in and removes the plug-ins
    that report (by returning `False`) that they should not be used.
    """
    # Iterate over a snapshot so plug-ins can be removed while looping.
    for plugin in list(self._plugins):
      keep = plugin.process_cmdline_args(pargs)
      assert isinstance(keep, bool)
      if keep:
        continue
      self.remove_plugin(plugin)

  def __enter__(self):
    return self

  def __exit__(self, exc_type, exc_val, exc_tb):
    for plugin in self._plugins:
      plugin.destroy()
    # Returning False means raised exceptions are not suppressed.
    return False

  def _filter_single_value(self, function_name, input_value):
    """
    Helper for filter style plug-in functions: threads `input_value` through
    the `function_name` hook of each plug-in in turn. Stops and returns
    `None` as soon as any plug-in returns `None`.
    """
    current_value = input_value
    for plugin in self._plugins:
      hook = getattr(plugin, function_name)
      filtered = hook(current_value)
      if filtered is None:
        return None
      current_value = filtered
    return current_value

  def filter_binary_path(self, binary_path):
    """
    Consult available plug-ins to filter the path to a binary
    to make it suitable for symbolication.

    Returns `None` if symbolication should not be attempted for this
    binary.
    """
    return self._filter_single_value('filter_binary_path', binary_path)

  def filter_module_desc(self, module_desc):
    """
    Consult available plug-ins to determine the module
    description suitable for symbolication.

    Returns `None` if symbolication should not be attempted for this module.
    """
    assert isinstance(module_desc, ModuleDesc)
    return self._filter_single_value('filter_module_desc', module_desc)
class AsanSymbolizerPlugIn(object):
  """
  Base class defining the hooks that `asan_symbolize.py` calls on plug-ins.
  Every hook has a default implementation that changes nothing.
  """
  @classmethod
  def get_name(cls):
    """
    Returns the plug-in's name, which is the name of its class.
    """
    return cls.__name__

  def _receive_proxy(self, proxy):
    # Remember the managing proxy so the plug-in can call back into it.
    assert isinstance(proxy, AsanSymbolizerPlugInProxy)
    self.proxy = proxy

  def register_cmdline_args(self, parser):
    """
    Hook for adding command line arguments that will later be
    consumed in `process_cmdline_args()`.

    `parser` - Instance of `argparse.ArgumentParser`.
    """

  def process_cmdline_args(self, pargs):
    """
    Hook for reacting to the parsed command line. Implementations
    should not modify `pargs`.

    `pargs` - Instance of `argparse.Namespace` containing
              parsed command line arguments.

    Return `True` if the plug-in should be used, otherwise
    return `False`.
    """
    return True

  def destroy(self):
    """
    Hook called when the plug-in is about to be destroyed.
    Implementations should free any allocated resources here.
    """

  # Symbolization hooks
  def filter_binary_path(self, binary_path):
    """
    Given a binary path return a binary path suitable for symbolication.

    Implementations should return `None` if symbolication of this binary
    should be skipped. The default returns `binary_path` unchanged.
    """
    return binary_path

  def filter_module_desc(self, module_desc):
    """
    Given a ModuleDesc object (`module_desc`) return
    a ModuleDesc suitable for symbolication.

    Implementations should return `None` if symbolication of this binary
    should be skipped. The default returns `module_desc` unchanged.
    """
    return module_desc
class ModuleDesc(object):
  """
  Description of one loaded module: name, architecture, address range,
  path(s) and UUID. Construction asserts that the description is valid.
  """
  def __init__(self, name, arch, start_addr, end_addr, module_path, uuid):
    self.name = name
    self.arch = arch
    self.start_addr = start_addr
    self.end_addr = end_addr
    # Module path from an ASan report.
    self.module_path = module_path
    # Module for performing symbolization, by default same as above.
    self.module_path_for_symbolization = module_path
    self.uuid = uuid
    assert self.is_valid()

  def __str__(self):
    assert self.is_valid()
    if self.module_path == self.module_path_for_symbolization:
      displayed_path = self.module_path
    else:
      # Show the symbolization path first and the reported path in brackets.
      displayed_path = '{} ({})'.format(self.module_path_for_symbolization,
                                        self.module_path)
    return "{name} {arch} {start_addr:#016x}-{end_addr:#016x} {module_path} {uuid}".format(
      name=self.name,
      arch=self.arch,
      start_addr=self.start_addr,
      end_addr=self.end_addr,
      module_path=displayed_path,
      uuid=self.uuid
    )

  def is_valid(self):
    """
    Returns `True` iff all text fields are `str`, both paths are absolute,
    both addresses are `int`, `start_addr` is non-negative and `end_addr`
    is strictly greater than `start_addr`.
    """
    path_fields = (self.module_path, self.module_path_for_symbolization)
    text_fields = (self.name, self.arch, self.uuid) + path_fields
    if not all(isinstance(field, str) for field in text_fields):
      return False
    if not (isinstance(self.start_addr, int) and isinstance(self.end_addr, int)):
      return False
    if self.start_addr < 0 or self.end_addr <= self.start_addr:
      return False
    return all(os.path.isabs(path) for path in path_fields)
class GetUUIDFromBinaryException(Exception):
  """
  Raised by `get_uuid_from_binary()` when a UUID cannot be determined.
  """
  def __init__(self, msg):
    super(GetUUIDFromBinaryException, self).__init__(msg)

# Maps (path_to_binary, arch) to the UUID found by a previous successful
# lookup so that otool is run at most once per binary/arch pair.
_get_uuid_from_binary_cache = dict()

def get_uuid_from_binary(path_to_binary, arch=None):
  """
  Returns the LC_UUID of `path_to_binary` as reported by `otool -l`.

  `path_to_binary` - Path to the binary to inspect.
  `arch` - Optional architecture passed to otool via `-arch`.

  Raises `GetUUIDFromBinaryException` if the binary does not exist, otool
  cannot be run or exits with an error, or no UUID can be parsed from its
  output. Successful results are cached.
  """
  cache_key = (path_to_binary, arch)
  cached_value = _get_uuid_from_binary_cache.get(cache_key)
  if cached_value:
    return cached_value
  if not os.path.exists(path_to_binary):
    raise GetUUIDFromBinaryException('Binary "{}" does not exist'.format(path_to_binary))
  cmd = [ '/usr/bin/otool', '-l']
  if arch:
    cmd.extend(['-arch', arch])
  cmd.append(path_to_binary)
  try:
    output = subprocess.check_output(cmd, stderr=subprocess.STDOUT)
  except (OSError, subprocess.CalledProcessError) as e:
    # Callers only handle GetUUIDFromBinaryException, so report a missing or
    # failing otool through it instead of letting the raw error escape.
    raise GetUUIDFromBinaryException(
      'Failed to run otool on "{}": {}'.format(path_to_binary, e))
  # Look for this output:
  # cmd LC_UUID
  # cmdsize 24
  # uuid 4CA778FE-5BF9-3C45-AE59-7DF01B2BE83F
  if isinstance(output, str):
    output_str = output
  else:
    assert isinstance(output, bytes)
    output_str = output.decode()
  assert isinstance(output_str, str)
  lines = output_str.split('\n')
  uuid = None
  for index, line in enumerate(lines):
    stripped_line = line.strip()
    if not stripped_line.startswith('cmd LC_UUID'):
      continue
    # The uuid is expected two lines below the `cmd LC_UUID` line.
    if index + 2 >= len(lines):
      raise GetUUIDFromBinaryException(
        'Malformed output: LC_UUID load command is truncated')
    uuid_line = lines[index+2].strip()
    if not uuid_line.startswith('uuid'):
      raise GetUUIDFromBinaryException('Malformed output: "{}"'.format(uuid_line))
    split_uuid_line = uuid_line.split()
    if len(split_uuid_line) < 2:
      raise GetUUIDFromBinaryException('Malformed output: "{}"'.format(uuid_line))
    uuid = split_uuid_line[1]
    break
  if uuid is None:
    logging.error('Failed to retrieve UUID from binary {}'.format(path_to_binary))
    logging.error('otool output was:\n{}'.format(output_str))
    raise GetUUIDFromBinaryException('Failed to retrieve UUID from binary "{}"'.format(path_to_binary))
  else:
    # Update cache
    _get_uuid_from_binary_cache[cache_key] = uuid
  return uuid
class ModuleMap(object):
  """
  Collection of `ModuleDesc` objects indexed by module name.
  """
  def __init__(self):
    self._module_name_to_description_map = dict()

  def add_module(self, desc):
    """
    Adds `desc` to the map; its name must not already be present.
    """
    assert isinstance(desc, ModuleDesc)
    assert desc.name not in self._module_name_to_description_map
    self._module_name_to_description_map[desc.name] = desc

  def find_module_by_name(self, name):
    """
    Returns the description registered under `name`, or `None`.
    """
    return self._module_name_to_description_map.get(name, None)

  def __str__(self):
    # One header line followed by one line per module, ordered by start address.
    ordered = sorted(self._module_name_to_description_map.values(), key=lambda v: v.start_addr)
    header = '{} modules:\n'.format(self.num_modules)
    return header + ''.join(str(module_desc) + '\n' for module_desc in ordered)

  @property
  def num_modules(self):
    return len(self._module_name_to_description_map)

  @property
  def modules(self):
    return set(self._module_name_to_description_map.values())

  def get_module_path_for_symbolication(self, module_name, proxy, validate_uuid):
    """
    Returns the path to use for symbolizing `module_name`, or `None` if the
    module is unknown, a plug-in vetoes it, or UUID validation fails.

    `proxy` - Plug-in proxy whose `filter_module_desc()` is consulted.
    `validate_uuid` - If true, the UUID of the binary on disk must match
                      the UUID recorded in the module description.
    """
    found = self.find_module_by_name(module_name)
    if found is None:
      return None
    # Allow a plug-in to change the module description to make it
    # suitable for symbolication or avoid symbolication altogether.
    module_desc = proxy.filter_module_desc(found)
    if module_desc is None:
      return None
    symbolization_path = module_desc.module_path_for_symbolization
    if not validate_uuid:
      logging.warning('Skipping validation of UUID of {}'.format(symbolization_path))
      return module_desc.module_path_for_symbolization
    logging.debug('Validating UUID of {}'.format(symbolization_path))
    try:
      uuid = get_uuid_from_binary(module_desc.module_path_for_symbolization, arch = module_desc.arch)
    except GetUUIDFromBinaryException as e:
      logging.error('Failed to get binary from UUID: %s', str(e))
      return None
    if uuid != module_desc.uuid:
      logging.warning("Detected UUID mismatch {} != {}".format(uuid, module_desc.uuid))
      # UUIDs don't match. Tell client to not symbolize this.
      return None
    return module_desc.module_path_for_symbolization

  @staticmethod
  def parse_from_file(module_map_path):
    """
    Parses the module map found in the text file `module_map_path`.

    Lines before `Process module map:` are ignored. Parsing stops at a line
    starting with `End of module map`. Returns a `ModuleMap`, or `None` if
    the file contains no `Process module map:` line. Raises `Exception` if
    the file does not exist or a line inside the map cannot be parsed
    (including reaching end of file before the end marker).
    """
    if not os.path.exists(module_map_path):
      raise Exception('module map "{}" does not exist'.format(module_map_path))
    with open(module_map_path, 'r') as f:
      # E.g.
      # 0x2db4000-0x102ddc000 /path/to (arm64) <0D6BBDE0-FF90-3680-899D-8E6F9528E04C>
      hex_regex = lambda name: r'0x(?P<' + name + r'>[0-9a-f]+)'
      module_path_regex = r'(?P<path>.+)'
      arch_regex = r'\((?P<arch>.+)\)'
      uuid_regex = r'<(?P<uuid>[0-9A-Z-]+)>'
      line_regex = r'^{}-{}\s+{}\s+{}\s+{}'.format(
        hex_regex('start_addr'),
        hex_regex('end_addr'),
        module_path_regex,
        arch_regex,
        uuid_regex
      )
      matcher = re.compile(line_regex)
      mm = None
      line_num = 0
      while True:
        line = f.readline()
        line_num += 1
        if mm is None:
          # Still looking for the start marker.
          if line.startswith('Process module map:'):
            mm = ModuleMap()
          elif line == '':
            # End of file without ever seeing a module map.
            break
          continue
        if line.startswith('End of module map'):
          break
        # Inside the map every line (even the empty string returned at end
        # of file) must describe a module.
        m_obj = matcher.match(line)
        if not m_obj:
          raise Exception('Failed to parse line {} "{}"'.format(line_num, line))
        module_path = m_obj.group('path')
        mm.add_module(ModuleDesc(
          name=os.path.basename(module_path),
          arch=m_obj.group('arch'),
          start_addr=int(m_obj.group('start_addr'), base=16),
          end_addr=int(m_obj.group('end_addr'), base=16),
          module_path=module_path,
          uuid=m_obj.group('uuid')
        ))
      if mm is not None:
        logging.debug('Loaded Module map from "{}":\n{}'.format(
          f.name,
          str(mm))
        )
      return mm
class SysRootFilterPlugIn(AsanSymbolizerPlugIn):
  """
  Plug-in that prepends a sys root prefix to every binary path
  used for symbolication.
  """
  def __init__(self):
    # Prefix added to binary paths; empty until configured.
    self.sysroot_path = ""

  def register_cmdline_args(self, parser):
    """
    Adds the `-s SYSROOT` option (stored as `sys_root`) to `parser`.
    """
    parser.add_argument('-s', dest='sys_root', metavar='SYSROOT',
                        help='set path to sysroot for sanitized binaries')

  def process_cmdline_args(self, pargs):
    """
    Takes the sys root from `pargs.sys_root`. Returns `False` (asking to be
    removed) when no sys root was given, otherwise `True`.
    """
    sys_root = pargs.sys_root
    if sys_root is not None:
      self.sysroot_path = sys_root
      return True
    # Not being used so remove ourselves.
    return False

  def filter_binary_path(self, path):
    """
    Returns `path` with the sys root prefix prepended.
    """
    return self.sysroot_path + path
class ModuleMapPlugIn(AsanSymbolizerPlugIn):
  """
  Plug-in that maps module names or binary paths from a report to the
  module paths recorded in a module map file (see `--module-map`).
  """
  def __init__(self):
    # Parsed module map; set by `process_cmdline_args()`.
    self._module_map = None
    # Whether module UUIDs are checked before a path is returned.
    self._uuid_validation = True

  def register_cmdline_args(self, parser):
    """
    Adds the `--module-map` and `--skip-uuid-validation` options to `parser`.
    """
    parser.add_argument('--module-map',
                        help='Path to text file containing module map '
                        'output. See print_module_map ASan option.')
    parser.add_argument('--skip-uuid-validation',
                        default=False,
                        action='store_true',
                        help='Skips validating UUID of modules using otool.')

  def process_cmdline_args(self, pargs):
    """
    Loads the module map named by `pargs.module_map`.

    Returns `False` (asking to be removed) when no module map was given,
    otherwise `True`. Raises `Exception` if the file contains no module map.
    """
    if not pargs.module_map:
      return False
    # Read the path from the namespace we were handed rather than from a
    # module-level `args`, which only exists when this file runs as a script.
    self._module_map = ModuleMap.parse_from_file(pargs.module_map)
    if self._module_map is None:
      msg = 'Failed to find module map'
      logging.error(msg)
      raise Exception(msg)
    self._uuid_validation = not pargs.skip_uuid_validation
    return True

  def filter_binary_path(self, binary_path):
    """
    Returns the module path to symbolize `binary_path` with, or `None` if
    the module map lookup says it should not be symbolized.
    """
    if os.path.isabs(binary_path):
      # This is a binary path so transform into
      # a module name
      module_name = os.path.basename(binary_path)
    else:
      module_name = binary_path
    return self._module_map.get_module_path_for_symbolication(
      module_name,
      self.proxy,
      self._uuid_validation
    )
def add_logging_args(parser):
  """
  Adds the `--log-dest` and `--log-level` options to `parser`.
  """
  level_names = ['debug', 'info', 'warning', 'error', 'critical']
  parser.add_argument(
    '--log-dest',
    help='Destination path for script logging (default stderr).',
    default=None,
  )
  parser.add_argument(
    '--log-level',
    help='Log level for script (default: %(default)s).',
    default='info',
    choices=level_names,
  )
def setup_logging():
  """
  Configures the `logging` module from the logging options on the command
  line and returns the command line arguments that were not consumed.
  """
  # Logging has to be configured before the main argument parsing happens,
  # so a dedicated parser that only knows the logging options is used here.
  logging_parser = argparse.ArgumentParser(add_help=False)
  add_logging_args(logging_parser)
  pargs, unparsed_args = logging_parser.parse_known_args()

  log_level = getattr(logging, pargs.log_level.upper())
  # Debug output additionally records where each message came from.
  log_format = '%(levelname)s: %(message)s'
  if log_level == logging.DEBUG:
    log_format = '%(levelname)s: [%(funcName)s() %(filename)s:%(lineno)d] %(message)s'

  basic_config = dict(level=log_level, format=log_format)
  log_dest = pargs.log_dest
  if log_dest:
    basic_config['filename'] = log_dest
  logging.basicConfig(**basic_config)

  destination = 'stderr' if log_dest is None else log_dest
  logging.debug('Logging level set to "{}" and directing output to "{}"'.format(
    pargs.log_level,
    destination)
  )
  return unparsed_args
def add_load_plugin_args(parser):
  """
  Adds the `-p`/`--plugins` option (one or more values) to `parser`.
  """
  parser.add_argument(
    '-p', '--plugins',
    nargs='+',
    default=[],
    help='Load plug-in',
  )
def setup_plugins(plugin_proxy, args):
  """
  Loads the plug-in files requested with `-p`/`--plugins` and registers the
  built-in plug-ins with `plugin_proxy`.

  `plugin_proxy` - Proxy that receives the plug-ins.
  `args` - Command line arguments to examine. `None` means `sys.argv[1:]`.

  Returns the arguments from `args` that were not consumed here.
  """
  parser = argparse.ArgumentParser(add_help=False)
  add_load_plugin_args(parser)
  # Parse the arguments we were given; the parameter used to be ignored in
  # favour of re-reading `sys.argv`.
  pargs, unparsed_args = parser.parse_known_args(args)
  for plugin_path in pargs.plugins:
    plugin_proxy.load_plugin_from_file(plugin_path)
  # Add built-in plugins.
  plugin_proxy.add_plugin(ModuleMapPlugIn())
  plugin_proxy.add_plugin(SysRootFilterPlugIn())
  return unparsed_args
if __name__ == '__main__':
parser = argparse.ArgumentParser(
formatter_class=argparse.RawDescriptionHelpFormatter,
description='ASan symbolization script',
epilog='Example of use:\n'
'asan_symbolize.py -c "$HOME/opt/cross/bin/arm-linux-gnueabi-" '
'-s "$HOME/SymbolFiles" < asan.log')
parser.add_argument('path_to_cut', nargs='*',
help='pattern to be cut from the result file path ')
parser.add_argument('-d','--demangle', action='store_true',
help='demangle function names')
parser.add_argument('-s', metavar='SYSROOT',
help='set path to sysroot for sanitized binaries')
parser.add_argument('-c', metavar='CROSS_COMPILE',
help='set prefix for binutils')
parser.add_argument('-l','--logfile', default=sys.stdin,
type=argparse.FileType('r'),
help='set log file name to parse, default is stdin')
parser.add_argument('--force-system-symbolizer', action='store_true',
help='don\'t use llvm-symbolizer')
args = parser.parse_args()
if args.path_to_cut:
fix_filename_patterns = args.path_to_cut
if args.demangle:
demangle = True
if args.s:
binary_name_filter = sysroot_path_filter
sysroot_path = args.s
if args.c:
binutils_prefix = args.c
if args.logfile:
logfile = args.logfile
else:
logfile = sys.stdin
if args.force_system_symbolizer:
force_system_symbolizer = True
if force_system_symbolizer:
assert(allow_system_symbolizer)
loop = SymbolizationLoop(binary_name_filter)
loop.process_logfile()
remaining_args = setup_logging()
with AsanSymbolizerPlugInProxy() as plugin_proxy:
remaining_args = setup_plugins(plugin_proxy, remaining_args)
parser = argparse.ArgumentParser(
formatter_class=argparse.RawDescriptionHelpFormatter,
description='ASan symbolization script',
epilog=__doc__)
parser.add_argument('path_to_cut', nargs='*',
help='pattern to be cut from the result file path ')
parser.add_argument('-d','--demangle', action='store_true',
help='demangle function names')
parser.add_argument('-c', metavar='CROSS_COMPILE',
help='set prefix for binutils')
parser.add_argument('-l','--logfile', default=sys.stdin,
type=argparse.FileType('r'),
help='set log file name to parse, default is stdin')
parser.add_argument('--force-system-symbolizer', action='store_true',
help='don\'t use llvm-symbolizer')
# Add logging arguments so that `--help` shows them.
add_logging_args(parser)
# Add load plugin arguments so that `--help` shows them.
add_load_plugin_args(parser)
plugin_proxy.register_cmdline_args(parser)
args = parser.parse_args(remaining_args)
plugin_proxy.process_cmdline_args(args)
if args.path_to_cut:
fix_filename_patterns = args.path_to_cut
if args.demangle:
demangle = True
if args.c:
binutils_prefix = args.c
if args.logfile:
logfile = args.logfile
else:
logfile = sys.stdin
if args.force_system_symbolizer:
force_system_symbolizer = True
if force_system_symbolizer:
assert(allow_system_symbolizer)
loop = SymbolizationLoop(plugin_proxy)
loop.process_logfile()