llvm-6502/utils/lit/lit/ShUtil.py
Daniel Dunbar bd4fa2efd3 lit: Fix a sh lexing bug which caused annotate-token.m to fail when run with the
internal shell parser; we weren't lexing the quotes in a command like::

  clang -DFOO='hello'

correctly.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@103652 91177308-0d34-0410-b5e6-96231b3b80d8
2010-05-12 21:47:58 +00:00

354 lines
12 KiB
Python

import itertools
import Util
from ShCommands import Command, Pipeline, Seq
class ShLexer:
def __init__(self, data, win32Escapes = False):
self.data = data
self.pos = 0
self.end = len(data)
self.win32Escapes = win32Escapes
def eat(self):
c = self.data[self.pos]
self.pos += 1
return c
def look(self):
return self.data[self.pos]
def maybe_eat(self, c):
"""
maybe_eat(c) - Consume the character c if it is the next character,
returning True if a character was consumed. """
if self.data[self.pos] == c:
self.pos += 1
return True
return False
def lex_arg_fast(self, c):
# Get the leading whitespace free section.
chunk = self.data[self.pos - 1:].split(None, 1)[0]
# If it has special characters, the fast path failed.
if ('|' in chunk or '&' in chunk or
'<' in chunk or '>' in chunk or
"'" in chunk or '"' in chunk or
'\\' in chunk):
return None
self.pos = self.pos - 1 + len(chunk)
return chunk
def lex_arg_slow(self, c):
if c in "'\"":
str = self.lex_arg_quoted(c)
else:
str = c
while self.pos != self.end:
c = self.look()
if c.isspace() or c in "|&":
break
elif c in '><':
# This is an annoying case; we treat '2>' as a single token so
# we don't have to track whitespace tokens.
# If the parse string isn't an integer, do the usual thing.
if not str.isdigit():
break
# Otherwise, lex the operator and convert to a redirection
# token.
num = int(str)
tok = self.lex_one_token()
assert isinstance(tok, tuple) and len(tok) == 1
return (tok[0], num)
elif c == '"':
self.eat()
str += self.lex_arg_quoted('"')
elif c == "'":
self.eat()
str += self.lex_arg_quoted("'")
elif not self.win32Escapes and c == '\\':
# Outside of a string, '\\' escapes everything.
self.eat()
if self.pos == self.end:
Util.warning("escape at end of quoted argument in: %r" %
self.data)
return str
str += self.eat()
else:
str += self.eat()
return str
def lex_arg_quoted(self, delim):
str = ''
while self.pos != self.end:
c = self.eat()
if c == delim:
return str
elif c == '\\' and delim == '"':
# Inside a '"' quoted string, '\\' only escapes the quote
# character and backslash, otherwise it is preserved.
if self.pos == self.end:
Util.warning("escape at end of quoted argument in: %r" %
self.data)
return str
c = self.eat()
if c == '"': #
str += '"'
elif c == '\\':
str += '\\'
else:
str += '\\' + c
else:
str += c
Util.warning("missing quote character in %r" % self.data)
return str
def lex_arg_checked(self, c):
pos = self.pos
res = self.lex_arg_fast(c)
end = self.pos
self.pos = pos
reference = self.lex_arg_slow(c)
if res is not None:
if res != reference:
raise ValueError,"Fast path failure: %r != %r" % (res, reference)
if self.pos != end:
raise ValueError,"Fast path failure: %r != %r" % (self.pos, end)
return reference
def lex_arg(self, c):
return self.lex_arg_fast(c) or self.lex_arg_slow(c)
def lex_one_token(self):
"""
lex_one_token - Lex a single 'sh' token. """
c = self.eat()
if c in ';!':
return (c,)
if c == '|':
if self.maybe_eat('|'):
return ('||',)
return (c,)
if c == '&':
if self.maybe_eat('&'):
return ('&&',)
if self.maybe_eat('>'):
return ('&>',)
return (c,)
if c == '>':
if self.maybe_eat('&'):
return ('>&',)
if self.maybe_eat('>'):
return ('>>',)
return (c,)
if c == '<':
if self.maybe_eat('&'):
return ('<&',)
if self.maybe_eat('>'):
return ('<<',)
return (c,)
return self.lex_arg(c)
def lex(self):
while self.pos != self.end:
if self.look().isspace():
self.eat()
else:
yield self.lex_one_token()
###
class ShParser:
def __init__(self, data, win32Escapes = False):
self.data = data
self.tokens = ShLexer(data, win32Escapes = win32Escapes).lex()
def lex(self):
try:
return self.tokens.next()
except StopIteration:
return None
def look(self):
next = self.lex()
if next is not None:
self.tokens = itertools.chain([next], self.tokens)
return next
def parse_command(self):
tok = self.lex()
if not tok:
raise ValueError,"empty command!"
if isinstance(tok, tuple):
raise ValueError,"syntax error near unexpected token %r" % tok[0]
args = [tok]
redirects = []
while 1:
tok = self.look()
# EOF?
if tok is None:
break
# If this is an argument, just add it to the current command.
if isinstance(tok, str):
args.append(self.lex())
continue
# Otherwise see if it is a terminator.
assert isinstance(tok, tuple)
if tok[0] in ('|',';','&','||','&&'):
break
# Otherwise it must be a redirection.
op = self.lex()
arg = self.lex()
if not arg:
raise ValueError,"syntax error near token %r" % op[0]
redirects.append((op, arg))
return Command(args, redirects)
def parse_pipeline(self):
negate = False
if self.look() == ('!',):
self.lex()
negate = True
commands = [self.parse_command()]
while self.look() == ('|',):
self.lex()
commands.append(self.parse_command())
return Pipeline(commands, negate)
def parse(self):
lhs = self.parse_pipeline()
while self.look():
operator = self.lex()
assert isinstance(operator, tuple) and len(operator) == 1
if not self.look():
raise ValueError, "missing argument to operator %r" % operator[0]
# FIXME: Operator precedence!!
lhs = Seq(lhs, operator[0], self.parse_pipeline())
return lhs
###
import unittest
class TestShLexer(unittest.TestCase):
def lex(self, str, *args, **kwargs):
return list(ShLexer(str, *args, **kwargs).lex())
def test_basic(self):
self.assertEqual(self.lex('a|b>c&d<e'),
['a', ('|',), 'b', ('>',), 'c', ('&',), 'd',
('<',), 'e'])
def test_redirection_tokens(self):
self.assertEqual(self.lex('a2>c'),
['a2', ('>',), 'c'])
self.assertEqual(self.lex('a 2>c'),
['a', ('>',2), 'c'])
def test_quoting(self):
self.assertEqual(self.lex(""" 'a' """),
['a'])
self.assertEqual(self.lex(""" "hello\\"world" """),
['hello"world'])
self.assertEqual(self.lex(""" "hello\\'world" """),
["hello\\'world"])
self.assertEqual(self.lex(""" "hello\\\\world" """),
["hello\\world"])
self.assertEqual(self.lex(""" he"llo wo"rld """),
["hello world"])
self.assertEqual(self.lex(""" a\\ b a\\\\b """),
["a b", "a\\b"])
self.assertEqual(self.lex(""" "" "" """),
["", ""])
self.assertEqual(self.lex(""" a\\ b """, win32Escapes = True),
['a\\', 'b'])
class TestShParse(unittest.TestCase):
def parse(self, str):
return ShParser(str).parse()
def test_basic(self):
self.assertEqual(self.parse('echo hello'),
Pipeline([Command(['echo', 'hello'], [])], False))
self.assertEqual(self.parse('echo ""'),
Pipeline([Command(['echo', ''], [])], False))
self.assertEqual(self.parse("""echo -DFOO='a'"""),
Pipeline([Command(['echo', '-DFOO=a'], [])], False))
self.assertEqual(self.parse('echo -DFOO="a"'),
Pipeline([Command(['echo', '-DFOO=a'], [])], False))
def test_redirection(self):
self.assertEqual(self.parse('echo hello > c'),
Pipeline([Command(['echo', 'hello'],
[((('>'),), 'c')])], False))
self.assertEqual(self.parse('echo hello > c >> d'),
Pipeline([Command(['echo', 'hello'], [(('>',), 'c'),
(('>>',), 'd')])], False))
self.assertEqual(self.parse('a 2>&1'),
Pipeline([Command(['a'], [(('>&',2), '1')])], False))
def test_pipeline(self):
self.assertEqual(self.parse('a | b'),
Pipeline([Command(['a'], []),
Command(['b'], [])],
False))
self.assertEqual(self.parse('a | b | c'),
Pipeline([Command(['a'], []),
Command(['b'], []),
Command(['c'], [])],
False))
self.assertEqual(self.parse('! a'),
Pipeline([Command(['a'], [])],
True))
def test_list(self):
self.assertEqual(self.parse('a ; b'),
Seq(Pipeline([Command(['a'], [])], False),
';',
Pipeline([Command(['b'], [])], False)))
self.assertEqual(self.parse('a & b'),
Seq(Pipeline([Command(['a'], [])], False),
'&',
Pipeline([Command(['b'], [])], False)))
self.assertEqual(self.parse('a && b'),
Seq(Pipeline([Command(['a'], [])], False),
'&&',
Pipeline([Command(['b'], [])], False)))
self.assertEqual(self.parse('a || b'),
Seq(Pipeline([Command(['a'], [])], False),
'||',
Pipeline([Command(['b'], [])], False)))
self.assertEqual(self.parse('a && b || c'),
Seq(Seq(Pipeline([Command(['a'], [])], False),
'&&',
Pipeline([Command(['b'], [])], False)),
'||',
Pipeline([Command(['c'], [])], False)))
if __name__ == '__main__':
unittest.main()