tenfourfox/python/compare-locales/compare_locales/parser.py
Cameron Kaiser c9b2922b70 hello FPR
2017-04-19 00:56:45 -07:00

522 lines
16 KiB
Python

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
import re
import codecs
import logging
from HTMLParser import HTMLParser
__constructors = []
class Entity(object):
'''
Abstraction layer for a localizable entity.
Currently supported are grammars of the form:
1: pre white space
2: pre comments
3: entity definition
4: entity key (name)
5: entity value
6: post comment (and white space) in the same line (dtd only)
<--[1]
<!-- pre comments --> <--[2]
<!ENTITY key "value"> <!-- comment -->
<-------[3]---------><------[6]------>
'''
def __init__(self, contents, pp,
span, pre_ws_span, pre_comment_span, def_span,
key_span, val_span, post_span):
self.contents = contents
self.span = span
self.pre_ws_span = pre_ws_span
self.pre_comment_span = pre_comment_span
self.def_span = def_span
self.key_span = key_span
self.val_span = val_span
self.post_span = post_span
self.pp = pp
pass
# getter helpers
def get_all(self):
return self.contents[self.span[0]:self.span[1]]
def get_pre_ws(self):
return self.contents[self.pre_ws_span[0]:self.pre_ws_span[1]]
def get_pre_comment(self):
return self.contents[self.pre_comment_span[0]:
self.pre_comment_span[1]]
def get_def(self):
return self.contents[self.def_span[0]:self.def_span[1]]
def get_key(self):
return self.contents[self.key_span[0]:self.key_span[1]]
def get_val(self):
return self.pp(self.contents[self.val_span[0]:self.val_span[1]])
def get_raw_val(self):
return self.contents[self.val_span[0]:self.val_span[1]]
def get_post(self):
return self.contents[self.post_span[0]:self.post_span[1]]
# getters
all = property(get_all)
pre_ws = property(get_pre_ws)
pre_comment = property(get_pre_comment)
definition = property(get_def)
key = property(get_key)
val = property(get_val)
raw_val = property(get_raw_val)
post = property(get_post)
def __repr__(self):
return self.key
class Junk(object):
'''
An almost-Entity, representing junk data that we didn't parse.
This way, we can signal bad content as stuff we don't understand.
And the either fix that, or report real bugs in localizations.
'''
junkid = 0
def __init__(self, contents, span):
self.contents = contents
self.span = span
self.pre_ws = self.pre_comment = self.definition = self.post = ''
self.__class__.junkid += 1
self.key = '_junk_%d_%d-%d' % (self.__class__.junkid, span[0], span[1])
# getter helpers
def get_all(self):
return self.contents[self.span[0]:self.span[1]]
# getters
all = property(get_all)
val = property(get_all)
def __repr__(self):
return self.key
class Parser:
canMerge = True
def __init__(self):
if not hasattr(self, 'encoding'):
self.encoding = 'utf-8'
pass
def readFile(self, file):
f = codecs.open(file, 'r', self.encoding)
try:
self.contents = f.read()
except UnicodeDecodeError, e:
(logging.getLogger('locales')
.error("Can't read file: " + file + '; ' + str(e)))
self.contents = u''
f.close()
def readContents(self, contents):
(self.contents, length) = codecs.getdecoder(self.encoding)(contents)
def parse(self):
l = []
m = {}
for e in self:
m[e.key] = len(l)
l.append(e)
return (l, m)
def postProcessValue(self, val):
return val
def __iter__(self):
contents = self.contents
offset = 0
self.header, offset = self.getHeader(contents, offset)
self.footer = ''
entity, offset = self.getEntity(contents, offset)
while entity:
yield entity
entity, offset = self.getEntity(contents, offset)
f = self.reFooter.match(contents, offset)
if f:
self.footer = f.group()
offset = f.end()
if len(contents) > offset:
yield Junk(contents, (offset, len(contents)))
pass
def getHeader(self, contents, offset):
header = ''
h = self.reHeader.match(contents)
if h:
header = h.group()
offset = h.end()
return (header, offset)
def getEntity(self, contents, offset):
m = self.reKey.match(contents, offset)
if m:
offset = m.end()
entity = self.createEntity(contents, m)
return (entity, offset)
# first check if footer has a non-empy match,
# 'cause then we don't find junk
m = self.reFooter.match(contents, offset)
if m and m.end() > offset:
return (None, offset)
m = self.reKey.search(contents, offset)
if m:
# we didn't match, but search, so there's junk between offset
# and start. We'll match() on the next turn
junkend = m.start()
return (Junk(contents, (offset, junkend)), junkend)
return (None, offset)
def createEntity(self, contents, m):
return Entity(contents, self.postProcessValue,
*[m.span(i) for i in xrange(7)])
def getParser(path):
for item in __constructors:
if re.search(item[0], path):
return item[1]
raise UserWarning("Cannot find Parser")
# Subgroups of the match will:
# 1: pre white space
# 2: pre comments
# 3: entity definition
# 4: entity key (name)
# 5: entity value
# 6: post comment (and white space) in the same line (dtd only)
# <--[1]
# <!-- pre comments --> <--[2]
# <!ENTITY key "value"> <!-- comment -->
#
# <-------[3]---------><------[6]------>
class DTDParser(Parser):
# http://www.w3.org/TR/2006/REC-xml11-20060816/#NT-NameStartChar
# ":" | [A-Z] | "_" | [a-z] |
# [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF]
# | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] |
# [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] |
# [#x10000-#xEFFFF]
CharMinusDash = u'\x09\x0A\x0D\u0020-\u002C\u002E-\uD7FF\uE000-\uFFFD'
XmlComment = '<!--(?:-?[%s])*?-->' % CharMinusDash
NameStartChar = u':A-Z_a-z\xC0-\xD6\xD8-\xF6\xF8-\u02FF' + \
u'\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F' + \
u'\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD'
# + \U00010000-\U000EFFFF seems to be unsupported in python
# NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 |
# [#x0300-#x036F] | [#x203F-#x2040]
NameChar = NameStartChar + ur'\-\.0-9' + u'\xB7\u0300-\u036F\u203F-\u2040'
Name = '[' + NameStartChar + '][' + NameChar + ']*'
reKey = re.compile('(?:(?P<pre>\s*)(?P<precomment>(?:' + XmlComment +
'\s*)*)(?P<entity><!ENTITY\s+(?P<key>' + Name +
')\s+(?P<val>\"[^\"]*\"|\'[^\']*\'?)\s*>)'
'(?P<post>[ \t]*(?:' + XmlComment + '\s*)*\n?)?)',
re.DOTALL)
# add BOM to DTDs, details in bug 435002
reHeader = re.compile(u'^\ufeff?'
u'(\s*<!--.*(http://mozilla.org/MPL/2.0/|'
u'LICENSE BLOCK)([^-]+-)*[^-]+-->)?', re.S)
reFooter = re.compile('\s*(<!--([^-]+-)*[^-]+-->\s*)*$')
rePE = re.compile('(?:(\s*)((?:' + XmlComment + '\s*)*)'
'(<!ENTITY\s+%\s+(' + Name +
')\s+SYSTEM\s+(\"[^\"]*\"|\'[^\']*\')\s*>\s*%' + Name +
';)([ \t]*(?:' + XmlComment + '\s*)*\n?)?)')
def getEntity(self, contents, offset):
'''
Overload Parser.getEntity to special-case ParsedEntities.
Just check for a parsed entity if that method claims junk.
<!ENTITY % foo SYSTEM "url">
%foo;
'''
entity, inneroffset = Parser.getEntity(self, contents, offset)
if (entity and isinstance(entity, Junk)) or entity is None:
m = self.rePE.match(contents, offset)
if m:
inneroffset = m.end()
entity = Entity(contents, self.postProcessValue,
*[m.span(i) for i in xrange(7)])
return (entity, inneroffset)
def createEntity(self, contents, m):
valspan = m.span('val')
valspan = (valspan[0]+1, valspan[1]-1)
return Entity(contents, self.postProcessValue, m.span(),
m.span('pre'), m.span('precomment'),
m.span('entity'), m.span('key'), valspan,
m.span('post'))
class PropertiesParser(Parser):
escape = re.compile(r'\\((?P<uni>u[0-9a-fA-F]{1,4})|'
'(?P<nl>\n\s*)|(?P<single>.))', re.M)
known_escapes = {'n': '\n', 'r': '\r', 't': '\t', '\\': '\\'}
def __init__(self):
self.reKey = re.compile('^(\s*)'
'((?:[#!].*?\n\s*)*)'
'([^#!\s\n][^=:\n]*?)\s*[:=][ \t]*', re.M)
self.reHeader = re.compile('^\s*([#!].*\s*)+')
self.reFooter = re.compile('\s*([#!].*\s*)*$')
self._escapedEnd = re.compile(r'\\+$')
self._trailingWS = re.compile(r'[ \t]*$')
Parser.__init__(self)
def getHeader(self, contents, offset):
header = ''
h = self.reHeader.match(contents, offset)
if h:
candidate = h.group()
if 'http://mozilla.org/MPL/2.0/' in candidate or \
'LICENSE BLOCK' in candidate:
header = candidate
offset = h.end()
return (header, offset)
def getEntity(self, contents, offset):
# overwritten to parse values line by line
m = self.reKey.match(contents, offset)
if m:
offset = m.end()
while True:
endval = nextline = contents.find('\n', offset)
if nextline == -1:
endval = offset = len(contents)
break
# is newline escaped?
_e = self._escapedEnd.search(contents, offset, nextline)
offset = nextline + 1
if _e is None:
break
# backslashes at end of line, if 2*n, not escaped
if len(_e.group()) % 2 == 0:
break
# strip trailing whitespace
ws = self._trailingWS.search(contents, m.end(), offset)
if ws:
endval -= ws.end() - ws.start()
entity = Entity(contents, self.postProcessValue,
(m.start(), offset), # full span
m.span(1), # leading whitespan
m.span(2), # leading comment span
(m.start(3), offset), # entity def span
m.span(3), # key span
(m.end(), endval), # value span
(offset, offset)) # post comment span, empty
return (entity, offset)
m = self.reKey.search(contents, offset)
if m:
# we didn't match, but search, so there's junk between offset
# and start. We'll match() on the next turn
junkend = m.start()
return (Junk(contents, (offset, junkend)), junkend)
return (None, offset)
def postProcessValue(self, val):
def unescape(m):
found = m.groupdict()
if found['uni']:
return unichr(int(found['uni'][1:], 16))
if found['nl']:
return ''
return self.known_escapes.get(found['single'], found['single'])
val = self.escape.sub(unescape, val)
return val
class DefinesParser(Parser):
# can't merge, #unfilter needs to be the last item, which we don't support
canMerge = False
def __init__(self):
self.reKey = re.compile('^(\s*)((?:^#(?!define\s).*\s*)*)'
'(#define[ \t]+(\w+)[ \t]+(.*?))([ \t]*$\n?)',
re.M)
self.reHeader = re.compile('^\s*(#(?!define\s).*\s*)*')
self.reFooter = re.compile('\s*(#(?!define\s).*\s*)*$', re.M)
Parser.__init__(self)
class IniParser(Parser):
'''
Parse files of the form:
# initial comment
[cat]
whitespace*
#comment
string=value
...
'''
def __init__(self):
self.reHeader = re.compile('^((?:\s*|[;#].*)\n)*\[.+?\]\n', re.M)
self.reKey = re.compile('(\s*)((?:[;#].*\n\s*)*)((.+?)=(.*))(\n?)')
self.reFooter = re.compile('\s*')
Parser.__init__(self)
DECL, COMMENT, START, END, CONTENT = range(5)
class BookmarksParserInner(HTMLParser):
class Token(object):
_type = None
content = ''
def __str__(self):
return self.content
class DeclToken(Token):
_type = DECL
def __init__(self, decl):
self.content = decl
pass
def __str__(self):
return '<!%s>' % self.content
pass
class CommentToken(Token):
_type = COMMENT
def __init__(self, comment):
self.content = comment
pass
def __str__(self):
return '<!--%s-->' % self.content
pass
class StartToken(Token):
_type = START
def __init__(self, tag, attrs, content):
self.tag = tag
self.attrs = dict(attrs)
self.content = content
pass
pass
class EndToken(Token):
_type = END
def __init__(self, tag):
self.tag = tag
pass
def __str__(self):
return '</%s>' % self.tag.upper()
pass
class ContentToken(Token):
_type = CONTENT
def __init__(self, content):
self.content = content
pass
pass
def __init__(self):
HTMLParser.__init__(self)
self.tokens = []
def parse(self, contents):
self.tokens = []
self.feed(contents)
self.close()
return self.tokens
# Called when we hit an end DL tag to reset the folder selections
def handle_decl(self, decl):
self.tokens.append(self.DeclToken(decl))
# Called when we hit an end DL tag to reset the folder selections
def handle_comment(self, comment):
self.tokens.append(self.CommentToken(comment))
def handle_starttag(self, tag, attrs):
self.tokens.append(self.StartToken(tag, attrs,
self.get_starttag_text()))
# Called when text data is encountered
def handle_data(self, data):
if self.tokens[-1]._type == CONTENT:
self.tokens[-1].content += data
else:
self.tokens.append(self.ContentToken(data))
def handle_charref(self, data):
self.handle_data('&#%s;' % data)
def handle_entityref(self, data):
self.handle_data('&%s;' % data)
# Called when we hit an end DL tag to reset the folder selections
def handle_endtag(self, tag):
self.tokens.append(self.EndToken(tag))
class BookmarksParser(Parser):
canMerge = False
class BMEntity(object):
def __init__(self, key, val):
self.key = key
self.val = val
def __iter__(self):
p = BookmarksParserInner()
tks = p.parse(self.contents)
i = 0
k = []
for i in xrange(len(tks)):
t = tks[i]
if t._type == START:
k.append(t.tag)
keys = t.attrs.keys()
keys.sort()
for attrname in keys:
yield self.BMEntity('.'.join(k) + '.@' + attrname,
t.attrs[attrname])
if i + 1 < len(tks) and tks[i+1]._type == CONTENT:
i += 1
t = tks[i]
v = t.content.strip()
if v:
yield self.BMEntity('.'.join(k), v)
elif t._type == END:
k.pop()
__constructors = [('\\.dtd$', DTDParser()),
('\\.properties$', PropertiesParser()),
('\\.ini$', IniParser()),
('\\.inc$', DefinesParser()),
('bookmarks\\.html$', BookmarksParser())]