Add IDL Lexer

This lexer understands the standard IDL tokens, which are similar to those of
'C': INT, HEX, FLOAT, STRING and SYMBOL. A SYMBOL can also be promoted to a
KEYWORD such as enum, interface, struct, typedef...

R=ncbray@google.com
BUG=76237
TEST=python idl_lexer.py --test_expect --test_same test_lex.in

Review URL: http://codereview.chromium.org/6697028

git-svn-id: svn://svn.chromium.org/chrome/trunk/src@79169 0039d316-1c4b-4281-b951-d872f2087c98
noelallen@google.com
2011-03-23 20:01:40 +00:00
parent 030ea0b2e4
commit bd3f4b3da8
2 changed files with 321 additions and 0 deletions
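As a quick usage sketch (not part of the commit): assuming the idl_lexer.py added below is importable (for example, when run from ppapi/generators), the lexer can be driven directly through its ply lexobj. The IDL fragment and file name here are made up for illustration.

# Hypothetical example, not in this commit: tokenize a small IDL fragment
# and print each token's type and value.
import sys
from idl_lexer import IDLLexer

lexer = IDLLexer()
lexer.SetData('example.idl', 'interface Foo { int32_t bar; };')
while 1:
  tok = lexer.lexobj.token()
  if tok is None: break
  # Keywords such as 'interface' come back as their keyword type; other
  # identifiers come back as SYMBOL, and single characters as literals.
  sys.stdout.write('%s %s\n' % (tok.type, tok.value))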
ppapi/generators/idl_lexer.py

@@ -0,0 +1,293 @@
#!/usr/bin/python
#
# Copyright (c) 2011 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
""" Lexer for PPAPI IDL """
import getopt
import os.path
import re
import sys
#
# Try to load the ply module, if not, then assume it is in the third_party
# directory, relative to ppapi
#
try:
from ply import lex
except:
module_path, module_name = os.path.split(__file__)
third_party = os.path.join(module_path, '..', '..', 'third_party')
sys.path.append(third_party)
from ply import lex

#
# IDL Lexer
#
class IDLLexer(object):
  # 'tokens' is a value required by lex which specifies the complete list
  # of valid token types.
  tokens = [
    # Symbol and keyword types
    'COMMENT',
    'DESCRIBE',
    'ENUM',
    'SYMBOL',
    'INTERFACE',
    'READONLY',
    'STRUCT',
    'TYPEDEF',

    # Data types
    'FLOAT',
    'INT',
    'HEX',
    'STRING',

    # Operators
    'LSHIFT'
  ]

  # 'keywords' is a map of string to token type.  All SYMBOL tokens are
  # matched against keywords, to determine if the token is actually a keyword.
  keywords = {
    'describe' : 'DESCRIBE',
    'enum' : 'ENUM',
    'interface' : 'INTERFACE',
    'readonly' : 'READONLY',
    'struct' : 'STRUCT',
    'typedef' : 'TYPEDEF',
  }

  # 'literals' is a value expected by lex which specifies a list of valid
  # literal tokens, meaning the token type and token value are identical.
  literals = '"*.(){}[],;:=+-'

  # Token definitions
  #
  # Lex assumes any value or function in the form of 't_<TYPE>' represents a
  # regular expression where a match will emit a token of type <TYPE>.  In the
  # case of a function, the function is called when a match is made.

  # 't_ignore' is a special match of items to ignore
  t_ignore = ' \t'

  # Constant values
  t_FLOAT = r'-?(\d+\.\d*|\d*\.\d+)([Ee][+-]?\d+)?|-?\d+[Ee][+-]?\d+'
  t_HEX = r'0x[a-fA-F0-9]+'
  t_INT = r'-?\d+'
  t_LSHIFT = r'<<'

  # A line ending '\n'; we use this to increment the line number.
  def t_LINE_END(self, t):
    r'\n+'
    self.AddLines(len(t.value))

  # We do not process escapes in IDL strings.  Strings are exclusively
  # used for attributes, and not used as typical 'C' constants.
  def t_STRING(self, t):
    r'"[^"]*"'
    t.value = t.value[1:-1]
    self.AddLines(t.value.count('\n'))
    return t

  # A C or C++ style comment:  /* xxx */ or //
  def t_COMMENT(self, t):
    r'(/\*(.|\n)*?\*/)|(//.*)'
    self.AddLines(t.value.count('\n'))
    # C++ comments should keep the newline
    if t.value[:2] == '//': t.value += '\n'
    return t

  # A symbol or keyword.
  def t_KEYWORD_SYMBOL(self, t):
    r'[A-Za-z][A-Za-z_0-9]*'
    # All non-keywords are assumed to be symbols
    t.type = self.keywords.get(t.value, 'SYMBOL')
    return t

  def t_ANY_error(self, t):
    line = self.lexobj.lineno
    pos = self.lexobj.lexpos - self.index[line]
    file = self.lexobj.filename
    out = self.ErrorMessage(file, line, pos, "Unrecognized input")
    sys.stderr.write(out + '\n')

  def AddLines(self, count):
    # Set the lexer position for the beginning of the next line.  In the case
    # of multiple lines, tokens can not exist on any of the lines except the
    # last one, so the recorded values for previous lines are unused.  We still
    # fill the array, however, to make sure the line count is correct.
    self.lexobj.lineno += count
    for i in range(count):
      self.index.append(self.lexobj.lexpos)

  def FileLineMsg(self, file, line, msg):
    if file: return "%s(%d) : %s" % (file, line + 1, msg)
    return "<BuiltIn> : %s" % msg

  def SourceLine(self, file, line, pos):
    caret = '\t^'.expandtabs(pos)
    return "%s\n%s" % (self.lines[line], caret)

  def ErrorMessage(self, file, line, pos, msg):
    return "\n%s\n%s" % (
        self.FileLineMsg(file, line, msg),
        self.SourceLine(file, line, pos))

  def SetData(self, filename, data):
    self.lexobj.filename = filename
    self.lexobj.lineno = 0
    self.lines = data.split('\n')
    self.index = [0]
    self.lexobj.input(data)

  def __init__(self):
    self.lexobj = lex.lex(object=self, lextab=None, optimize=0)

#
# FilesToTokens
#
# From a set of source file names, generate a list of tokens.
#
def FilesToTokens(filenames, verbose=False):
  lexer = IDLLexer()
  outlist = []
  for filename in filenames:
    data = open(filename).read()
    lexer.SetData(filename, data)
    if verbose: sys.stdout.write('  Loaded %s...\n' % filename)
    while 1:
      t = lexer.lexobj.token()
      if t is None: break
      outlist.append(t)
  return outlist

#
# TextToTokens
#
# From a block of text, generate a list of token values.
#
def TextToTokens(source):
  lexer = IDLLexer()
  outlist = []
  lexer.SetData('AUTO', source)
  while 1:
    t = lexer.lexobj.token()
    if t is None: break
    outlist.append(t.value)
  return outlist

#
# TestSame
#
# From a set of token values, generate a new source text by joining with a
# single space.  The new source is then tokenized and compared against the
# old set.
#
def TestSame(values, output=False, verbose=False):
  src1 = ' '.join(values)
  src2 = ' '.join(TextToTokens(src1))

  if output:
    sys.stdout.write('Generating original.txt and tokenized.txt\n')
    open('original.txt', 'w').write(src1)
    open('tokenized.txt', 'w').write(src2)

  if src1 == src2:
    sys.stdout.write('Same: Pass\n')
    return 0

  sys.stdout.write('Same: Failed\n')
  return -1

#
# TestExpect
#
# From a set of token pairs, verify that the type field of the second token
# matches the value of the first, so that:
#   INT 123 FLOAT 1.1
# will generate a passing test, where the first token is the SYMBOL INT,
# the second token is the INT 123, the third token is the SYMBOL FLOAT and
# the fourth is the FLOAT 1.1, etc...
#
def TestExpect(tokens):
  count = len(tokens)
  index = 0
  errors = 0
  while index < count:
    type = tokens[index].value
    token = tokens[index + 1]
    index += 2

    if type != token.type:
      sys.stderr.write('Mismatch: Expected %s, but got %s = %s.\n' %
                       (type, token.type, token.value))
      errors += 1

  if not errors:
    sys.stdout.write('Expect: Pass\n')
    return 0

  sys.stdout.write('Expect: Failed\n')
  return -1

def Main(args):
  try:
    long_opts = ['output', 'verbose', 'test_expect', 'test_same']
    usage = 'Usage: idl_lexer.py %s [<src.idl> ...]' % ' '.join(
        ['--%s' % opt for opt in long_opts])
    opts, filenames = getopt.getopt(args, '', long_opts)
  except getopt.error, e:
    sys.stderr.write('Illegal option: %s\n%s\n' % (str(e), usage))
    return 1

  output = False
  test_same = False
  test_expect = False
  verbose = False

  for opt, val in opts:
    if opt == '--output':
      output = True

    if opt == '--test_expect':
      test_expect = True

    if opt == '--test_same':
      test_same = True

    if opt == '--verbose':
      verbose = True

  try:
    tokens = FilesToTokens(filenames, verbose)
    values = [tok.value for tok in tokens]
    if output: sys.stdout.write(' <> '.join(values) + '\n')

    if test_same:
      if TestSame(values, output=output, verbose=verbose):
        return -1

    if test_expect:
      if TestExpect(tokens):
        return -1
    return 0

  except lex.LexError as le:
    sys.stderr.write('%s\n' % str(le))
    return -1


if __name__ == '__main__':
  sys.exit(Main(sys.argv[1:]))

ppapi/generators/test_lex.in

@@ -0,0 +1,28 @@
INT 1 INT 123 INT 12345
SYMBOL A123 SYMBOL A_A
COMMENT //abc
COMMENT // abc
COMMENT // abc
COMMENT //abc def
COMMENT // abc def
COMMENT // abc def
COMMENT /*abc*/ COMMENT /* abc */ COMMENT /* abc
*/
COMMENT /* abc def */ COMMENT /* abc def
*/ COMMENT // abc def
FLOAT 1.1
FLOAT 1e1
FLOAT -1.1
FLOAT -1e1
FLOAT 1e-1
FLOAT -1e-1
HEX 0x1
HEX 0x0
HEX 0x10
HEX 0x112312