
Add scripts for using cygprofile to the repository

These scripts are used to create a link order file for Linux-based
builds (Linux, Android, ChromeOS). This commit simply moves them from a
private Google web page where they have lived for the last couple of years.

The scripts are:
mergetraces.py - merge multiple cygprofile traces into a single trace
symbolize.py - convert traces into an order file with symbols
patch_orderfile.py - complete the order file with symbols from Chrome.

I will separately add a page on creating an order file to the Chromium
web site.

Review URL: https://chromiumcodereview.appspot.com/16151006

git-svn-id: svn://svn.chromium.org/chrome/trunk/src@203143 0039d316-1c4b-4281-b951-d872f2087c98
Author: aberent@chromium.org
Date: 2013-05-30 14:51:56 +00:00
Parent: adbed22511
Commit: da0ac0fd05
3 changed files with 554 additions and 0 deletions
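
For orientation, here is a rough sketch of how the three scripts described in the commit message might be chained together. It is illustrative only: the trace, library and order-file names (cyglog.*, libchromeview.so, chrome.orderfile, and so on) are placeholders rather than paths defined by this commit, the scripts are invoked via 'python' with repository-relative paths (an assumption about the working directory), and which instrumented/uninstrumented library to pass to symbolize.py and patch_orderfile.py depends on the build being profiled.

#!/usr/bin/python
# Illustrative driver only -- the file names below are placeholders, not
# paths from this commit.
import glob
import subprocess

traces = sorted(glob.glob('cyglog.*'))        # per-process cygprofile traces
instrumented_lib = 'libchromeview.so'         # build that produced the traces
uninstrumented_lib = 'libchromeview.uninstrumented.so'  # assumed name

# 1. Merge the per-process traces into a single, time-ordered trace.
with open('merged_cyglog', 'w') as out:
  subprocess.check_call(['python', 'tools/cygprofile/mergetraces.py'] + traces,
                        stdout=out)

# 2. Turn the merged trace into an (unpatched) order file of .text.* symbols.
with open('unpatched.orderfile', 'w') as out:
  subprocess.check_call(['python', 'tools/cygprofile/symbolize.py', '-t',
                         'orderfile', 'merged_cyglog', instrumented_lib],
                        stdout=out)

# 3. Complete the order file against the uninstrumented library.
with open('chrome.orderfile', 'w') as out:
  subprocess.check_call(['python', 'tools/cygprofile/patch_orderfile.py',
                         'unpatched.orderfile', uninstrumented_lib],
                        stdout=out)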

186
tools/cygprofile/mergetraces.py Executable file

@@ -0,0 +1,186 @@
#!/usr/bin/python
# Copyright 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

# Use: ../mergetraces.py `ls cyglog.* -Sr` > merged_cyglog

"""Merge multiple log files from different processes into a single log.

Given two log files of execution traces, merge the traces into a single trace.
Merging will use timestamps (i.e. the first two columns of logged calls) to
create a single log that is an ordered trace of calls by both processes.
"""

import optparse
import os
import string
import subprocess
import sys

def ParseLogLines(lines):
  """Parse log file lines.

  Args:
    lines: lines from the log file produced by the profiled run

    Below is an example of a small log file:
    5086e000-52e92000 r-xp 00000000 b3:02 51276 libchromeview.so
    secs msecs pid:threadid func
    START
    1314897086 795828 3587:1074648168 0x509e105c
    1314897086 795874 3587:1074648168 0x509e0eb4
    1314897086 796326 3587:1074648168 0x509e0e3c
    1314897086 796552 3587:1074648168 0x509e07bc
    END

  Returns:
    tuple consisting of 1) an ordered list of the logged calls, as an array of
    fields, 2) the virtual start address of the library, used to compute the
    offset of the symbol in the library and 3) the virtual end address
  """
  call_lines = []
  vm_start = 0
  vm_end = 0
  dash_index = lines[0].find ('-')
  space_index = lines[0].find (' ')
  vm_start = int (lines[0][:dash_index], 16)
  vm_end = int (lines[0][dash_index+1:space_index], 16)
  for line in lines[2:]:
    line = line.strip()
    # print hex (vm_start)
    fields = line.split()
    call_lines.append (fields)
  return (call_lines, vm_start, vm_end)

def HasDuplicates(calls):
  """Sanity check to make sure that calls are only logged once.

  Args:
    calls: list of calls logged

  Returns:
    boolean indicating if calls has duplicate calls
  """
  seen = []
  for call in calls:
    if call[3] in seen:
      return True
    else:
      seen.append(call[3])
  return False

def CheckTimestamps(calls):
  """Prints warning to stderr if the call timestamps are not in order.

  Args:
    calls: list of calls logged
  """
  index = 0
  last_timestamp_secs = -1
  last_timestamp_ms = -1
  while (index < len (calls)):
    timestamp_secs = int (calls[index][0])
    timestamp_ms = int (calls[index][1])
    timestamp = (timestamp_secs * 1000000) + timestamp_ms
    last_timestamp = (last_timestamp_secs * 1000000) + last_timestamp_ms
    if (timestamp < last_timestamp):
      sys.stderr.write("WARNING: last_timestamp: " + str(last_timestamp_secs)
                       + " " + str(last_timestamp_ms) + " timestamp: "
                       + str(timestamp_secs) + " " + str(timestamp_ms) + "\n")
    last_timestamp_secs = timestamp_secs
    last_timestamp_ms = timestamp_ms
    index = index + 1

def Convert (call_lines, startAddr, endAddr):
  """Converts the call addresses to static offsets and removes invalid calls.

  Removes profiled calls not in the shared library using the start and end
  virtual addresses, converts strings to integer values, and converts virtual
  addresses to addresses in the shared library.

  Returns:
    list of calls as tuples (sec, msec, pid:tid, callee)
  """
  converted_calls = []
  call_addresses = []
  for fields in call_lines:
    secs = int (fields[0])
    msecs = int (fields[1])
    callee = int (fields[3], 16)
    # print ("callee: " + hex (callee) + " start: " + hex (startAddr) +
    #        " end: " + hex (endAddr))
    if (callee >= startAddr and callee < endAddr
        and (not callee in call_addresses)):
      converted_calls.append((secs, msecs, fields[2], (callee - startAddr)))
      call_addresses.append(callee)
  return converted_calls

def Timestamp(trace_entry):
  return int (trace_entry[0]) * 1000000 + int(trace_entry[1])

def AddTrace (tracemap, trace):
  """Adds a trace to the tracemap.

  Adds entries in the trace to the tracemap. All new calls will be added to
  the tracemap. If the calls already exist in the tracemap then they will be
  replaced if they happened sooner in the new trace.

  Args:
    tracemap: the tracemap
    trace: the trace
  """
  for trace_entry in trace:
    call = trace_entry[3]
    if (not call in tracemap) or (
        Timestamp(tracemap[call]) > Timestamp(trace_entry)):
      tracemap[call] = trace_entry

def main():
  """Merge traces for code in the specified library and write to stdout.

  Merges the traces and converts the virtual addresses to the offsets in the
  library. The first line of the merged trace has a dummy virtual address of
  0-ffffffff so that symbolizing the addresses uses the addresses in the log,
  since the addresses have already been converted to static offsets.
  """
  parser = optparse.OptionParser('usage: %prog trace1 ... traceN')
  (_, args) = parser.parse_args()
  if len(args) <= 1:
    parser.error('expected at least the following args: trace1 trace2')

  step = 0
  tracemap = dict()
  for trace_file in args:
    step += 1
    sys.stderr.write(" " + str(step) + "/" + str(len(args)) +
                     ": " + trace_file + ":\n")
    trace_lines = map(string.rstrip, open(trace_file).readlines())
    (trace_calls, trace_start, trace_end) = ParseLogLines(trace_lines)
    CheckTimestamps(trace_calls)
    sys.stderr.write("Len: " + str(len(trace_calls)) +
                     ". Start: " + hex(trace_start) +
                     ", end: " + hex(trace_end) + '\n')
    trace_calls = Convert(trace_calls, trace_start, trace_end)
    sys.stderr.write("Converted len: " + str(len(trace_calls)) + "\n")
    AddTrace(tracemap, trace_calls)
    sys.stderr.write("Merged len: " + str(len(tracemap)) + "\n")

  # Extract the resulting trace from the tracemap.
  merged_trace = []
  for call in tracemap:
    merged_trace.append(tracemap[call])
  merged_trace.sort(key=Timestamp)

  print "0-ffffffff r-xp 00000000 xx:00 00000 ./"
  print "secs\tmsecs\tpid:threadid\tfunc"
  for call in merged_trace:
    print (str(call[0]) + "\t" + str(call[1]) + "\t" + call[2] + "\t" +
           hex(call[3]))

if __name__ == '__main__':
  main()
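
As a side note, the merge rule implemented by Timestamp() and AddTrace() above can be illustrated with a minimal, self-contained sketch: the earliest occurrence of each callee across all traces wins, and the merged output is then sorted by timestamp. The trace tuples below are invented for the example and are not taken from any real log.

def timestamp(entry):
  return int(entry[0]) * 1000000 + int(entry[1])

# Two invented traces; tuples are (secs, msecs, pid:tid, callee offset).
trace_a = [(100, 10, '1:1', 0x100), (100, 20, '1:1', 0x200)]
trace_b = [(100, 5, '2:2', 0x200), (100, 30, '2:2', 0x300)]

tracemap = {}
for trace in (trace_a, trace_b):
  for entry in trace:
    call = entry[3]
    # Keep whichever occurrence of this callee happened first.
    if call not in tracemap or timestamp(tracemap[call]) > timestamp(entry):
      tracemap[call] = entry

merged = sorted(tracemap.values(), key=timestamp)
# merged == [(100, 5, '2:2', 0x200), (100, 10, '1:1', 0x100),
#            (100, 30, '2:2', 0x300)]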

117
tools/cygprofile/patch_orderfile.py Executable file

@@ -0,0 +1,117 @@
#!/usr/bin/python
# Copyright 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import commands
import os
import sys

orderfile = sys.argv[1]
uninstrumented_shlib = sys.argv[2]
nmlines_uninstrumented = commands.getoutput ('nm -S -n ' +
    uninstrumented_shlib + ' | egrep "( t )|( W )|( T )"').split('\n')

nmlines = []
for nmline in nmlines_uninstrumented:
  if (len(nmline.split()) == 4):
    nmlines.append(nmline)

# Map addresses to the list of functions at that address. There are multiple
# functions at an address because of aliasing.
nm_index = 0
uniqueAddrs = []
addressMap = {}
while nm_index < len(nmlines):
  if (len(nmlines[nm_index].split()) == 4):
    nm_int = int (nmlines[nm_index].split()[0], 16)
    size = int (nmlines[nm_index].split()[1], 16)
    fnames = [nmlines[nm_index].split()[3]]
    nm_index = nm_index + 1
    while nm_index < len(nmlines) and nm_int == int (
        nmlines[nm_index].split()[0], 16):
      fnames.append(nmlines[nm_index].split()[3])
      nm_index = nm_index + 1
    addressMap[nm_int] = fnames
    uniqueAddrs.append((nm_int, size))
  else:
    nm_index = nm_index + 1

def binary_search (addr, start, end):
  # print "addr: " + str(addr) + " start: " + str(start) + " end: " + str(end)
  if start >= end or start == end - 1:
    (nm_addr, size) = uniqueAddrs[start]
    if not (addr >= nm_addr and addr < nm_addr + size):
      sys.stderr.write ("ERROR: did not find function in binary: addr: " +
          hex(addr) + " nm_addr: " + str(nm_addr) + " start: " + str(start) +
          " end: " + str(end) + "\n")
      raise Exception("Address " + hex(addr) + " not found in binary")
    return (addressMap[nm_addr], size)
  else:
    halfway = start + ((end - start) / 2)
    (nm_addr, size) = uniqueAddrs[halfway]
    # print "nm_addr: " + str(nm_addr) + " halfway: " + str(halfway)
    if (addr >= nm_addr and addr < nm_addr + size):
      return (addressMap[nm_addr], size)
    elif (addr < nm_addr):
      return binary_search (addr, start, halfway)
    elif (addr >= nm_addr + size):
      return binary_search (addr, halfway, end)
    else:
      raise Exception("ERROR: did not expect this case")

f = open (orderfile)
lines = f.readlines()
profiled_list = []
for line in lines:
  if (line.strip() == ''):
    continue
  functionName = line.replace('.text.', '').split('.clone.')[0].strip()
  profiled_list.append (functionName)

# Symbol names are not unique. Since the order file uses symbol names, the
# patched order file pulls in all symbols with the same name. Multiple function
# addresses for the same function name may also be due to ".clone" symbols,
# since the substring is stripped.
functions = []
functionAddressMap = {}
for line in nmlines:
  try:
    functionName = line.split()[3]
  except IndexError:
    functionName = line.split()[2]
  functionName = functionName.split('.clone.')[0]
  functionAddress = int (line.split()[0].strip(), 16)
  try:
    functionAddressMap[functionName].append(functionAddress)
  except KeyError:
    functionAddressMap[functionName] = [functionAddress]
    functions.append(functionName)

sys.stderr.write ("profiled list size: " + str(len(profiled_list)) + "\n")
addresses = []
symbols_found = 0
for function in profiled_list:
  try:
    addrs = functionAddressMap[function]
    symbols_found = symbols_found + 1
  except KeyError:
    addrs = []
    # sys.stderr.write ("WARNING: could not find symbol " + function + "\n")
  for addr in addrs:
    if not (addr in addresses):
      addresses.append(addr)
sys.stderr.write ("symbols found: " + str(symbols_found) + "\n")
sys.stderr.write ("number of addresses: " + str(len(addresses)) + "\n")

total_size = 0
for addr in addresses:
  # if (count % 500 == 0):
  #   print "current count: " + str(count)
  (functions, size) = binary_search (addr, 0, len(uniqueAddrs))
  total_size = total_size + size
  for function in functions:
    print ".text." + function
  print ""
sys.stderr.write ("total_size: " + str(total_size) + "\n")
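
To make the expansion above concrete, here is a toy sketch with invented symbols and addresses (not from any real binary): each name from the unpatched order file maps to every address it has in the uninstrumented library, and each address expands back to all symbols aliased there, so aliases stay grouped in the patched order file.

# Invented nm-style data: two addresses, one of them with an alias.
address_map = {0x1000: ['foo', 'foo_alias'], 0x2000: ['bar']}
name_to_addrs = {'foo': [0x1000], 'foo_alias': [0x1000], 'bar': [0x2000]}

profiled = ['foo', 'bar']   # names taken from the unpatched order file
seen = []
for name in profiled:
  for addr in name_to_addrs.get(name, []):
    if addr not in seen:
      seen.append(addr)

for addr in seen:
  for symbol in address_map[addr]:
    print('.text.' + symbol)   # .text.foo, .text.foo_alias, .text.bar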

251
tools/cygprofile/symbolize.py Executable file

@@ -0,0 +1,251 @@
#!/usr/bin/python
# Copyright 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Symbolize a log file produced by cygprofile instrumentation.

Given a log file and the binary being profiled (e.g. executable, shared
library), the script can produce three different outputs: 1) symbols for the
addresses, 2) function and line numbers for the addresses, or 3) an order file.
"""

import optparse
import os
import string
import subprocess
import sys

def ParseLogLines(log_file_lines):
  """Parse a log file produced by the profiled run of clank.

  Args:
    log_file_lines: array of lines in the log file produced by the profiled run

    Below is an example of a small log file:
    5086e000-52e92000 r-xp 00000000 b3:02 51276 libchromeview.so
    secs msecs pid:threadid func
    START
    1314897086 795828 3587:1074648168 0x509e105c
    1314897086 795874 3587:1074648168 0x509e0eb4
    1314897086 796326 3587:1074648168 0x509e0e3c
    1314897086 796552 3587:1074648168 0x509e07bc
    END

  Returns:
    call_info list with list of tuples of the format (sec, msec, call id,
    function address called)
  """
  call_lines = []
  has_started = False
  vm_start = 0
  line = log_file_lines[0]
  assert("r-xp" in line)
  end_index = line.find('-')
  vm_start = int(line[:end_index], 16)
  for line in log_file_lines[2:]:
    # print hex(vm_start)
    fields = line.split()
    if len(fields) == 4:
      call_lines.append(fields)

  # Convert strings to int in fields.
  call_info = []
  for call_line in call_lines:
    (sec_timestamp, msec_timestamp) = map(int, call_line[0:2])
    callee_id = call_line[2]
    addr = int(call_line[3], 16)
    if vm_start < addr:
      addr -= vm_start
      call_info.append((sec_timestamp, msec_timestamp, callee_id, addr))

  return call_info

def ParseLibSymbols(lib_file):
  """Get output from running nm and grepping for text symbols.

  Args:
    lib_file: the library or executable that contains the profiled code

  Returns:
    list of sorted unique addresses and corresponding sizes of function symbols
    in lib_file, and a map of addresses to all symbols at a particular address
  """
  cmd = ['nm', '-S', '-n', lib_file]
  nm_p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
  output = nm_p.communicate()[0]
  nm_lines = output.split('\n')

  nm_symbols = []
  for nm_line in nm_lines:
    if any(str in nm_line for str in (' t ', ' W ', ' T ')):
      nm_symbols.append(nm_line)

  nm_index = 0
  unique_addrs = []
  address_map = {}
  while nm_index < len(nm_symbols):
    # If the length of the split line is not 4, then it does not contain all
    # the information needed to symbolize (i.e. address, size and symbol name).
    if len(nm_symbols[nm_index].split()) == 4:
      (addr, size) = [int(x, 16) for x in nm_symbols[nm_index].split()[0:2]]
      # Multiple symbols may be at the same address. This is due to aliasing
      # done by the compiler. Since there is no way to be sure which one was
      # called in the profiled run, we symbolize to include all symbol names
      # at a particular address.
      fnames = []
      while (nm_index < len(nm_symbols) and
             addr == int(nm_symbols[nm_index].split()[0], 16)):
        if len(nm_symbols[nm_index].split()) == 4:
          fnames.append(nm_symbols[nm_index].split()[3])
        nm_index += 1
      address_map[addr] = fnames
      unique_addrs.append((addr, size))
    else:
      nm_index += 1

  return (unique_addrs, address_map)

class SymbolNotFoundException(Exception):
  def __init__(self, value):
    self.value = value

  def __str__(self):
    return repr(self.value)

def BinarySearchAddresses(addr, start, end, arr):
  """Find the starting address of the symbol at a particular address.

  The reason we can not directly use the address provided by the log file is
  that the log file may give an address after the start of the symbol. The
  logged address is often one byte after the start. Using this search function
  rather than just subtracting one from the logged address allows the logging
  instrumentation to log any address in a function.

  Args:
    addr: the address being searched for
    start: the starting index for the binary search
    end: the ending index for the binary search
    arr: the list being searched, containing tuples of address and size

  Returns:
    the starting address of the symbol at address addr

  Raises:
    SymbolNotFoundException: if the address is not found. The function expects
    all logged addresses to be found.
  """
  # print "addr: " + str(addr) + " start: " + str(start) + " end: " + str(end)
  if start >= end or start == end - 1:
    # arr[i] is a tuple of address and size. Check if addr is inside the range.
    if addr >= arr[start][0] and addr < arr[start][0] + arr[start][1]:
      return arr[start][0]
    elif addr >= arr[end][0] and addr < arr[end][0] + arr[end][1]:
      return arr[end][0]
    else:
      raise SymbolNotFoundException(addr)
  else:
    halfway = (start + end) / 2
    (nm_addr, size) = arr[halfway]
    # print "nm_addr: " + str(nm_addr) + " halfway: " + str(halfway)
    if addr >= nm_addr and addr < nm_addr + size:
      return nm_addr
    elif addr < nm_addr:
      return BinarySearchAddresses(addr, start, halfway-1, arr)
    else:
      # The condition (addr >= nm_addr + size) must be true here.
      return BinarySearchAddresses(addr, halfway+1, end, arr)

def FindFunctions(addr, unique_addrs, address_map):
  """Find function symbol names at address addr."""
  return address_map[BinarySearchAddresses(addr, 0, len(unique_addrs) - 1,
                                           unique_addrs)]

def AddrToLine(addr, lib_file):
  """Use addr2line to determine the line info of a particular address."""
  cmd = ['addr2line', '-f', '-e', lib_file, hex(addr)]
  p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
  output = (p.communicate()[0]).split('\n')
  line = output[0]
  index = 1
  while index < len(output):
    line = line + ':' + output[index]
    index += 1
  return line

def main():
  """Write output for the profiled run to standard out.

  The format of the output depends on the output type specified as the third
  command line argument. The default output type is to symbolize the addresses
  of the functions called.
  """
  parser = optparse.OptionParser('usage: %prog [options] log_file lib_file')
  parser.add_option('-t', '--outputType', dest='output_type',
                    default='symbolize', type='string',
                    help='lineize or symbolize or orderfile')

  # Option for output type. The log file and lib file arguments are required
  # by the script and therefore are not options.
  (options, args) = parser.parse_args()
  if len(args) != 2:
    parser.error('expected 2 args: log_file lib_file')
  (log_file, lib_file) = args
  output_type = options.output_type

  lib_name = lib_file.split('/')[-1].strip()
  log_file_lines = map(string.rstrip, open(log_file).readlines())
  call_info = ParseLogLines(log_file_lines)
  (unique_addrs, address_map) = ParseLibSymbols(lib_file)

  # Check for duplicate addresses in the log file, and print a warning if
  # duplicates are found. The instrumentation that produces the log file
  # should only print the first time a function is entered.
  addr_list = []
  for call in call_info:
    addr = call[3]
    if addr not in addr_list:
      addr_list.append(addr)
    else:
      print('WARNING: Address ' + hex(addr) + ' (line= ' +
            AddrToLine(addr, lib_file) + ') already profiled.')

  for call in call_info:
    if output_type == 'lineize':
      symbol = AddrToLine(call[3], lib_file)
      print(str(call[0]) + ' ' + str(call[1]) + '\t' + str(call[2]) + '\t'
            + symbol)
    elif output_type == 'orderfile':
      try:
        symbols = FindFunctions(call[3], unique_addrs, address_map)
        for symbol in symbols:
          print '.text.' + symbol
        print ''
      except SymbolNotFoundException:
        sys.stderr.write('WARNING: Did not find function in binary. addr: '
                         + hex(call[3]) + '\n')
    else:
      try:
        symbols = FindFunctions(call[3], unique_addrs, address_map)
        print(str(call[0]) + ' ' + str(call[1]) + '\t' + str(call[2]) + '\t'
              + symbols[0])
        first_symbol = True
        for symbol in symbols:
          if not first_symbol:
            print '\t\t\t\t\t' + symbol
          else:
            first_symbol = False
      except SymbolNotFoundException:
        sys.stderr.write('WARNING: Did not find function in binary. addr: '
                         + hex(call[3]) + '\n')

if __name__ == '__main__':
  main()
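
As a closing note, the range lookup performed by BinarySearchAddresses() in symbolize.py can be illustrated with a small self-contained example over an invented symbol table: logged addresses usually point somewhere inside a function rather than at its start, so the search is over (start, size) ranges. The iterative helper below is only a re-statement of the idea, not the script's code.

# Invented symbol table: (start address, size) pairs, sorted by address.
unique_addrs = [(0x1000, 0x40), (0x1040, 0x20), (0x1060, 0x100)]
address_map = {0x1000: ['A'], 0x1040: ['B'], 0x1060: ['C']}

def find_start(addr, arr):
  lo, hi = 0, len(arr) - 1
  while lo <= hi:
    mid = (lo + hi) // 2
    start, size = arr[mid]
    if addr < start:
      hi = mid - 1
    elif addr >= start + size:
      lo = mid + 1
    else:
      return start            # addr falls inside [start, start + size)
  raise ValueError('address not covered by any symbol: ' + hex(addr))

print(address_map[find_start(0x1044, unique_addrs)])   # ['B']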