mirror of
https://github.com/classilla/tenfourfox.git
synced 2024-11-04 10:05:51 +00:00
522 lines
16 KiB
Python
522 lines
16 KiB
Python
# This Source Code Form is subject to the terms of the Mozilla Public
|
|
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
|
|
|
import re
|
|
import codecs
|
|
import logging
|
|
from HTMLParser import HTMLParser
|
|
|
|
# Placeholder for the (path regex, parser) registry read by getParser(); it
# is rebound to the populated list at the bottom of this module, once the
# parser classes have been defined.
__constructors = []
|
|
|
|
|
|
class Entity(object):
    '''
    Abstraction layer for a localizable entity.

    An entity never copies text out of the file it came from. It keeps a
    reference to the complete ``contents`` plus one ``(start, end)`` span
    per part, and slices the text on demand.

    Currently supported are grammars of the form:

    1: pre white space
    2: pre comments
    3: entity definition
    4: entity key (name)
    5: entity value
    6: post comment (and white space) in the same line (dtd only)

                                             <--[1]
    <!-- pre comments -->                    <--[2]
    <!ENTITY key "value"> <!-- comment -->

    <-------[3]---------><------[6]------>
    '''
    def __init__(self, contents, pp,
                 span, pre_ws_span, pre_comment_span, def_span,
                 key_span, val_span, post_span):
        '''
        contents -- the complete text every span indexes into
        pp -- callable applied to the raw value text when ``val`` is read
        *_span -- ``(start, end)`` offsets into ``contents`` for each part
        '''
        # source text and value post-processor
        self.contents = contents
        self.pp = pp
        # the whole entity, then its parts in file order
        self.span = span
        self.pre_ws_span = pre_ws_span
        self.pre_comment_span = pre_comment_span
        self.def_span = def_span
        self.key_span = key_span
        self.val_span = val_span
        self.post_span = post_span

    def _text(self, span):
        '''Return the slice of ``contents`` delimited by ``span``.'''
        return self.contents[span[0]:span[1]]

    # getter helpers

    def get_all(self):
        '''Text of the whole entity.'''
        return self._text(self.span)

    def get_pre_ws(self):
        '''White space preceding the entity.'''
        return self._text(self.pre_ws_span)

    def get_pre_comment(self):
        '''Comments preceding the entity definition.'''
        return self._text(self.pre_comment_span)

    def get_def(self):
        '''Text of the entity definition itself.'''
        return self._text(self.def_span)

    def get_key(self):
        '''The entity key (name).'''
        return self._text(self.key_span)

    def get_val(self):
        '''The entity value after running it through ``pp``.'''
        raw = self._text(self.val_span)
        return self.pp(raw)

    def get_raw_val(self):
        '''The entity value exactly as it appears in ``contents``.'''
        return self._text(self.val_span)

    def get_post(self):
        '''Text covered by the post span.'''
        return self._text(self.post_span)

    # getters

    all = property(get_all)
    pre_ws = property(get_pre_ws)
    pre_comment = property(get_pre_comment)
    definition = property(get_def)
    key = property(get_key)
    val = property(get_val)
    raw_val = property(get_raw_val)
    post = property(get_post)

    def __repr__(self):
        # an entity is represented by its key
        return self.key
|
|
|
|
|
|
class Junk(object):
    '''
    An almost-Entity, representing junk data that we didn't parse.
    This way, we can signal bad content as stuff we don't understand,
    and then either fix that, or report real bugs in localizations.
    '''
    # running count of Junk objects, used to build their keys
    junkid = 0

    def __init__(self, contents, span):
        '''
        contents -- the complete text ``span`` indexes into
        span -- ``(start, end)`` offsets of the unparsed region
        '''
        self.contents = contents
        self.span = span
        # junk has none of the structured parts an Entity has
        for part in ('pre_ws', 'pre_comment', 'definition', 'post'):
            setattr(self, part, '')
        # bump the counter stored on this instance's class
        owner = self.__class__
        owner.junkid += 1
        self.key = '_junk_%d_%d-%d' % (owner.junkid, span[0], span[1])

    # getter helpers
    def get_all(self):
        '''Return the unparsed text covered by ``span``.'''
        start, end = self.span[0], self.span[1]
        return self.contents[start:end]

    # getters
    all = property(get_all)
    val = property(get_all)

    def __repr__(self):
        return self.key
|
|
|
|
|
|
class Parser:
    '''
    Base class of the regular-expression driven file parsers.

    The methods here read ``self.reHeader``, ``self.reKey`` and
    ``self.reFooter``, which this class does not define; they are expected
    to be provided by subclasses as compiled regular expressions.
    Iterating over a parser yields Entity and Junk objects in file order.
    '''
    canMerge = True

    def __init__(self):
        # a subclass may set its own encoding before calling us
        if not hasattr(self, 'encoding'):
            self.encoding = 'utf-8'

    def readFile(self, file):
        '''
        Load ``file`` into ``self.contents``, decoded with ``self.encoding``.

        If the file cannot be decoded, the error is logged to the
        'locales' logger and ``self.contents`` is set to an empty string.
        The file is closed on every exit path.
        '''
        f = codecs.open(file, 'r', self.encoding)
        try:
            self.contents = f.read()
        except UnicodeDecodeError as e:
            (logging.getLogger('locales')
             .error("Can't read file: " + file + '; ' + str(e)))
            self.contents = u''
        finally:
            # runs for decode errors and for any other failure of read()
            f.close()

    def readContents(self, contents):
        '''Decode ``contents`` with ``self.encoding`` into ``self.contents``.'''
        decode = codecs.getdecoder(self.encoding)
        # the decoder also returns the consumed length, which is not needed
        self.contents = decode(contents)[0]

    def parse(self):
        '''
        Parse ``self.contents`` completely.

        Returns ``(entities, index)``: the list of everything the iterator
        yields, and a dict mapping each key to its position in that list.
        '''
        entities = []
        index = {}
        for entity in self:
            index[entity.key] = len(entities)
            entities.append(entity)
        return (entities, index)

    def postProcessValue(self, val):
        '''Hook applied to raw entity values; the base class returns them as is.'''
        return val

    def __iter__(self):
        '''
        Yield the entities (and Junk) found in ``self.contents``.

        As a side effect ``self.header`` and ``self.footer`` are set.
        Anything left after the footer is yielded as a final Junk.
        '''
        contents = self.contents
        self.header, offset = self.getHeader(contents, 0)
        self.footer = ''
        entity, offset = self.getEntity(contents, offset)
        while entity:
            yield entity
            entity, offset = self.getEntity(contents, offset)
        f = self.reFooter.match(contents, offset)
        if f:
            self.footer = f.group()
            offset = f.end()
        if len(contents) > offset:
            yield Junk(contents, (offset, len(contents)))

    def getHeader(self, contents, offset):
        '''
        Return ``(header, offset)`` for a header at the start of ``contents``.

        The match is always attempted at the beginning of ``contents``;
        ``offset`` is only what is handed back when no header is found.
        '''
        header = ''
        h = self.reHeader.match(contents)
        if h:
            header = h.group()
            offset = h.end()
        return (header, offset)

    def getEntity(self, contents, offset):
        '''
        Return ``(entity, new_offset)`` for whatever starts at ``offset``.

        The first element is an Entity if ``reKey`` matches right at
        ``offset``, a Junk covering the text up to the next ``reKey``
        match otherwise, or None if only the footer or nothing parsable
        is left.
        '''
        m = self.reKey.match(contents, offset)
        if m:
            offset = m.end()
            entity = self.createEntity(contents, m)
            return (entity, offset)
        # first check if footer has a non-empty match,
        # 'cause then we don't find junk
        m = self.reFooter.match(contents, offset)
        if m and m.end() > offset:
            return (None, offset)
        m = self.reKey.search(contents, offset)
        if m:
            # we didn't match, but search, so there's junk between offset
            # and start. We'll match() on the next turn
            junkend = m.start()
            return (Junk(contents, (offset, junkend)), junkend)
        return (None, offset)

    def createEntity(self, contents, m):
        '''
        Build an Entity from a ``reKey`` match.

        Groups 0 to 6 of the match are passed, in order, as the span
        arguments of Entity.
        '''
        return Entity(contents, self.postProcessValue,
                      *[m.span(i) for i in range(7)])
|
|
|
|
|
|
def getParser(path):
    '''
    Return the parser registered for ``path``.

    The registry entries are tried in order and the parser of the first
    entry whose pattern is found in ``path`` wins. Raises UserWarning if
    no entry matches.
    '''
    for entry in __constructors:
        pattern, parser = entry[0], entry[1]
        if re.search(pattern, path) is None:
            continue
        return parser
    raise UserWarning("Cannot find Parser")
|
|
|
|
|
|
# Subgroups of the match will be:
# 1: pre white space
# 2: pre comments
# 3: entity definition
# 4: entity key (name)
# 5: entity value
# 6: post comment (and white space) in the same line (dtd only)
#                                            <--[1]
# <!-- pre comments -->                      <--[2]
# <!ENTITY key "value"> <!-- comment -->
#
# <-------[3]---------><------[6]------>
|
|
|
|
|
|
class DTDParser(Parser):
    '''
    Parser for DTD files made of ``<!ENTITY key "value">`` declarations.

    Besides regular entities, parameter entity declarations that are
    immediately followed by their reference are recognized, see getEntity.
    '''
    # http://www.w3.org/TR/2006/REC-xml11-20060816/#NT-NameStartChar
    # ":" | [A-Z] | "_" | [a-z] |
    # [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF] | [#x370-#x37D] | [#x37F-#x1FFF]
    # | [#x200C-#x200D] | [#x2070-#x218F] | [#x2C00-#x2FEF] |
    # [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD] |
    # [#x10000-#xEFFFF]
    CharMinusDash = u'\x09\x0A\x0D\u0020-\u002C\u002E-\uD7FF\uE000-\uFFFD'
    XmlComment = '<!--(?:-?[%s])*?-->' % CharMinusDash
    NameStartChar = u':A-Z_a-z\xC0-\xD6\xD8-\xF6\xF8-\u02FF' + \
        u'\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F' + \
        u'\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD'
    # + \U00010000-\U000EFFFF seems to be unsupported in python

    # NameChar ::= NameStartChar | "-" | "." | [0-9] | #xB7 |
    #     [#x0300-#x036F] | [#x203F-#x2040]
    # The middle literal spells its backslashes out instead of using the
    # ur'' prefix, which only Python 2 accepts; the value is unchanged.
    NameChar = NameStartChar + u'\\-\\.0-9' + u'\xB7\u0300-\u036F\u203F-\u2040'
    Name = '[' + NameStartChar + '][' + NameChar + ']*'
    # The patterns below have the same values as before. Pieces holding
    # only regex escapes are raw strings; pieces that embed a literal tab
    # or newline stay non-raw, with the regex backslashes doubled.
    reKey = re.compile(r'(?:(?P<pre>\s*)(?P<precomment>(?:' + XmlComment +
                       r'\s*)*)(?P<entity><!ENTITY\s+(?P<key>' + Name +
                       r')\s+(?P<val>"[^"]*"|' r"'[^']*'?" r')\s*>)'
                       '(?P<post>[ \t]*(?:' + XmlComment + '\\s*)*\n?)?)',
                       re.DOTALL)
    # add BOM to DTDs, details in bug 435002
    reHeader = re.compile(u'^\ufeff?'
                          u'(\\s*<!--.*(http://mozilla.org/MPL/2.0/|'
                          u'LICENSE BLOCK)([^-]+-)*[^-]+-->)?', re.S)
    reFooter = re.compile(r'\s*(<!--([^-]+-)*[^-]+-->\s*)*$')
    rePE = re.compile(r'(?:(\s*)((?:' + XmlComment + r'\s*)*)'
                      r'(<!ENTITY\s+%\s+(' + Name +
                      r')\s+SYSTEM\s+("[^"]*"|' r"'[^']*'" r')\s*>\s*%' +
                      Name +
                      ';)([ \t]*(?:' + XmlComment + '\\s*)*\n?)?)')

    def getEntity(self, contents, offset):
        '''
        Overload Parser.getEntity to special-case ParsedEntities.
        Just check for a parsed entity if that method claims junk.

        <!ENTITY % foo SYSTEM "url">
        %foo;

        Returns ``(entity, new_offset)`` like Parser.getEntity. For a
        parsed entity the Entity is built from the positional groups of
        ``rePE``, so its value span still includes the quotes.
        '''
        entity, inneroffset = Parser.getEntity(self, contents, offset)
        if entity is None or isinstance(entity, Junk):
            m = self.rePE.match(contents, offset)
            if m:
                inneroffset = m.end()
                # rePE has no named groups, so use the base class'
                # positional construction, not our own createEntity
                entity = Parser.createEntity(self, contents, m)
        return (entity, inneroffset)

    def createEntity(self, contents, m):
        '''
        Build an Entity from a ``reKey`` match using its named groups.

        The value span is narrowed by one character on each side so that
        it excludes the surrounding quotes.
        '''
        # NOTE(review): reKey makes the closing single quote optional; in
        # that case the last character trimmed here is not a quote.
        # Confirm whether that is intended.
        valspan = m.span('val')
        valspan = (valspan[0]+1, valspan[1]-1)
        return Entity(contents, self.postProcessValue, m.span(),
                      m.span('pre'), m.span('precomment'),
                      m.span('entity'), m.span('key'), valspan,
                      m.span('post'))
|
|
|
|
|
|
class PropertiesParser(Parser):
    '''
    Parser for .properties files with ``key = value`` or ``key: value``
    lines, ``#`` or ``!`` comments and backslash line continuations.
    '''
    # a backslash followed by a \u escape, a line break or any character
    escape = re.compile(r'\\((?P<uni>u[0-9a-fA-F]{1,4})|'
                        '(?P<nl>\n\s*)|(?P<single>.))', re.M)
    # escapes with a special meaning; any other escaped character is
    # replaced by itself
    known_escapes = {'n': '\n', 'r': '\r', 't': '\t', '\\': '\\'}

    def __init__(self):
        Parser.__init__(self)
        self.reHeader = re.compile('^\s*([#!].*\s*)+')
        self.reFooter = re.compile('\s*([#!].*\s*)*$')
        # groups: 1 leading white space, 2 leading comments, 3 key
        self.reKey = re.compile('^(\s*)'
                                '((?:[#!].*?\n\s*)*)'
                                '([^#!\s\n][^=:\n]*?)\s*[:=][ \t]*', re.M)
        # helpers for finding where a value ends
        self._trailingWS = re.compile(r'[ \t]*$')
        self._escapedEnd = re.compile(r'\\+$')

    def getHeader(self, contents, offset):
        '''
        Return ``(header, offset)`` for a leading comment block.

        The comment block only counts as the header if it contains one
        of the license markers; otherwise an empty header and the
        unchanged ``offset`` are returned.
        '''
        found = self.reHeader.match(contents, offset)
        if not found:
            return ('', offset)
        candidate = found.group()
        markers = ('http://mozilla.org/MPL/2.0/', 'LICENSE BLOCK')
        if not any(marker in candidate for marker in markers):
            return ('', offset)
        return (candidate, found.end())

    def getEntity(self, contents, offset):
        '''
        Return ``(entity, new_offset)`` for whatever starts at ``offset``.

        Overwritten to parse values line by line: a value runs to the end
        of its line, and a line ending in an odd number of backslashes
        continues on the next one. Spaces and tabs at the very end of the
        value are left out of the value span.
        '''
        keymatch = self.reKey.match(contents, offset)
        if not keymatch:
            upcoming = self.reKey.search(contents, offset)
            if not upcoming:
                return (None, offset)
            # we didn't match, but search, so there's junk between offset
            # and start. We'll match() on the next turn
            junkend = upcoming.start()
            return (Junk(contents, (offset, junkend)), junkend)

        valstart = keymatch.end()
        cursor = valstart
        while True:
            newline = contents.find('\n', cursor)
            if newline == -1:
                # last line of the file, without a line break
                endval = cursor = len(contents)
                break
            endval = newline
            # is newline escaped?
            slashes = self._escapedEnd.search(contents, cursor, newline)
            cursor = newline + 1
            # backslashes at end of line, if 2*n, not escaped
            if slashes is None or len(slashes.group()) % 2 == 0:
                break
        # strip trailing whitespace
        trailing = self._trailingWS.search(contents, valstart, cursor)
        if trailing:
            endval -= trailing.end() - trailing.start()
        entity = Entity(contents, self.postProcessValue,
                        (keymatch.start(), cursor),   # full span
                        keymatch.span(1),             # leading whitespan
                        keymatch.span(2),             # leading comment span
                        (keymatch.start(3), cursor),  # entity def span
                        keymatch.span(3),             # key span
                        (valstart, endval),           # value span
                        (cursor, cursor))             # post comment span, empty
        return (entity, cursor)

    def postProcessValue(self, val):
        '''
        Resolve the backslash escapes in ``val``.

        \\u escapes become the character with that code, an escaped line
        break is dropped together with the white space following it, and
        other escaped characters are looked up in ``known_escapes``,
        falling back to the character itself.
        '''
        def replace(match):
            codepoint = match.group('uni')
            if codepoint:
                return unichr(int(codepoint[1:], 16))
            if match.group('nl'):
                return ''
            char = match.group('single')
            return self.known_escapes.get(char, char)

        return self.escape.sub(replace, val)
|
|
|
|
|
|
class DefinesParser(Parser):
    '''
    Parser for files of ``#define key value`` lines, in which every other
    line starting with ``#`` is treated as a comment.
    '''
    # can't merge, #unfilter needs to be the last item, which we don't support
    canMerge = False

    def __init__(self):
        Parser.__init__(self)
        # header and footer: runs of '#' lines that are not #define lines
        self.reHeader = re.compile('^\s*(#(?!define\s).*\s*)*')
        self.reFooter = re.compile('\s*(#(?!define\s).*\s*)*$', re.M)
        # groups: 1 leading white space, 2 leading comments, 3 definition,
        # 4 key, 5 value, 6 rest of the line
        self.reKey = re.compile('^(\s*)((?:^#(?!define\s).*\s*)*)'
                                '(#define[ \t]+(\w+)[ \t]+(.*?))([ \t]*$\n?)',
                                re.M)
|
|
|
|
|
|
class IniParser(Parser):
    '''
    Parse files of the form:
    # initial comment
    [cat]
    whitespace*
    #comment
    string=value
    ...
    '''
    def __init__(self):
        Parser.__init__(self)
        # header: blank and comment lines up to and including a [section] line
        self.reHeader = re.compile('^((?:\s*|[;#].*)\n)*\[.+?\]\n', re.M)
        # footer: white space only
        self.reFooter = re.compile('\s*')
        # groups: 1 leading white space, 2 leading comments, 3 definition,
        # 4 key, 5 value, 6 line break
        self.reKey = re.compile('(\s*)((?:[;#].*\n\s*)*)((.+?)=(.*))(\n?)')
|
|
|
|
|
|
# Token kinds; every token class in BookmarksParserInner carries one of
# these as its _type.
DECL, COMMENT, START, END, CONTENT = range(5)
|
|
|
|
|
|
class BookmarksParserInner(HTMLParser):
    '''
    HTMLParser subclass that turns a document into a flat list of tokens.

    Each handler appends one token object to ``self.tokens``; consecutive
    pieces of text, character references and entity references are merged
    into a single ContentToken.
    '''

    class Token(object):
        '''Base token; ``_type`` is one of the module's token kinds.'''
        _type = None
        content = ''

        def __str__(self):
            return self.content

    class DeclToken(Token):
        '''A ``<!...>`` declaration.'''
        _type = DECL

        def __init__(self, decl):
            self.content = decl

        def __str__(self):
            return '<!%s>' % self.content

    class CommentToken(Token):
        '''A ``<!--...-->`` comment.'''
        _type = COMMENT

        def __init__(self, comment):
            self.content = comment

        def __str__(self):
            return '<!--%s-->' % self.content

    class StartToken(Token):
        '''A start tag; ``content`` holds the tag text as found in the source.'''
        _type = START

        def __init__(self, tag, attrs, content):
            self.tag = tag
            self.attrs = dict(attrs)
            self.content = content

    class EndToken(Token):
        '''An end tag.'''
        _type = END

        def __init__(self, tag):
            self.tag = tag

        def __str__(self):
            return '</%s>' % self.tag.upper()

    class ContentToken(Token):
        '''Text between tags.'''
        _type = CONTENT

        def __init__(self, content):
            self.content = content

    def __init__(self):
        HTMLParser.__init__(self)
        self.tokens = []

    def parse(self, contents):
        '''Tokenize ``contents`` and return the list of tokens.'''
        self.tokens = []
        self.feed(contents)
        self.close()
        return self.tokens

    # Called for a <!...> declaration
    def handle_decl(self, decl):
        self.tokens.append(self.DeclToken(decl))

    # Called for a comment
    def handle_comment(self, comment):
        self.tokens.append(self.CommentToken(comment))

    # Called for a start tag
    def handle_starttag(self, tag, attrs):
        self.tokens.append(self.StartToken(tag, attrs,
                                           self.get_starttag_text()))

    # Called when text data is encountered
    def handle_data(self, data):
        # Text can come before any tag or declaration, in which case
        # there is no previous token to look at yet.
        if self.tokens and self.tokens[-1]._type == CONTENT:
            self.tokens[-1].content += data
        else:
            self.tokens.append(self.ContentToken(data))

    # Character references are kept in their source form, as text
    def handle_charref(self, data):
        self.handle_data('&#%s;' % data)

    # Entity references are kept in their source form, as text
    def handle_entityref(self, data):
        self.handle_data('&%s;' % data)

    # Called for an end tag
    def handle_endtag(self, tag):
        self.tokens.append(self.EndToken(tag))
|
|
|
|
|
|
class BookmarksParser(Parser):
    '''
    Parser for bookmarks.html files.

    Instead of the regular expressions of the other parsers, this one
    tokenizes the contents with BookmarksParserInner and yields one
    BMEntity per tag attribute and per non-empty text following a start
    tag.
    '''
    canMerge = False

    class BMEntity(object):
        '''Minimal entity, just a key and a value.'''
        def __init__(self, key, val):
            self.key = key
            self.val = val

    def __iter__(self):
        '''
        Yield BMEntity objects for ``self.contents``.

        Keys are the dot-joined names of the currently open tags; the key
        for an attribute has ``.@`` and the attribute name appended.
        Attributes of one tag are yielded in sorted order.
        '''
        tokens = BookmarksParserInner().parse(self.contents)
        # names of the tags opened and not yet closed
        path = []
        for position, token in enumerate(tokens):
            if token._type == START:
                path.append(token.tag)
                prefix = '.'.join(path)
                for attrname in sorted(token.attrs):
                    yield self.BMEntity(prefix + '.@' + attrname,
                                        token.attrs[attrname])
                # Peek at the next token without consuming it; when the
                # loop gets there, a content token matches neither branch.
                following = tokens[position + 1:position + 2]
                if following and following[0]._type == CONTENT:
                    text = following[0].content.strip()
                    if text:
                        yield self.BMEntity(prefix, text)
            elif token._type == END:
                # pops the innermost open tag, whatever its name is
                path.pop()
|
|
|
|
|
|
# Registry consulted by getParser(): (path regex, parser instance) pairs,
# tried in order. Each parser is instantiated once, right here, so
# getParser() hands out the same instance every time a pattern matches.
__constructors = [('\\.dtd$', DTDParser()),
                  ('\\.properties$', PropertiesParser()),
                  ('\\.ini$', IniParser()),
                  ('\\.inc$', DefinesParser()),
                  ('bookmarks\\.html$', BookmarksParser())]
|