mirror of
https://github.com/classilla/tenfourfox.git
synced 2025-01-06 09:29:35 +00:00
421 lines
17 KiB
Python
421 lines
17 KiB
Python
# This Source Code Form is subject to the terms of the Mozilla Public
|
|
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
|
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
|
|
|
import re
|
|
from difflib import SequenceMatcher
|
|
from xml import sax
|
|
try:
|
|
from cStringIO import StringIO
|
|
except ImportError:
|
|
from StringIO import StringIO
|
|
|
|
from compare_locales.parser import DTDParser, PropertiesParser
|
|
|
|
|
|
class Checker(object):
    '''Base class for the checks that are run per file type.

    Subclasses set `pattern` to a compiled regular expression and
    override `check`.
    '''
    # Compiled regular expression matched against the file path in `use`;
    # subclasses have to replace the None placeholder.
    pattern = None

    @classmethod
    def use(cls, file):
        '''Tell whether this checker applies to `file`.

        `file` is an object whose `file` attribute holds the path to test.
        Returns the result of `cls.pattern.match`, i.e. a match object
        if the checker applies and None otherwise.
        '''
        return cls.pattern.match(file.file)

    def check(self, refEnt, l10nEnt):
        '''Compare a localized entity against its reference entity.

        This is a generator. Implementations yield tuples of
        - "warning" or "error", depending on what should be reported,
        - position of the problem within the string (the checkers in
          this file yield either an offset or a (line, column) tuple),
        - description string to be shown in the report,
        - short category string for the kind of check.

        The base implementation raises NotImplementedError as soon as the
        generator is iterated.
        '''
        raise NotImplementedError("Need to subclass")
        # Never reached. The yield statement is what turns this method
        # into a generator, so the error shows up on iteration just like
        # the results of a real checker would.
        yield ("error", (0, 0), "This is an example error", "example")
|
|
|
|
|
|
class PrintfException(Exception):
    '''Error found while scanning a string for printf-style specifiers.

    Attributes:
    msg -- description of the problem
    pos -- position that the code raising the exception associates with
           the problem
    '''

    def __init__(self, msg, pos):
        # Hand both values to Exception as well. Without this, `args` is
        # empty, so repr() and tracebacks carry no information and the
        # exception cannot be re-created from its args (copy/pickle).
        super(PrintfException, self).__init__(msg, pos)
        self.pos = pos
        self.msg = msg

    def __str__(self):
        # Show only the description; the default would render the whole
        # (msg, pos) args tuple.
        return '%s' % (self.msg,)
|
|
|
|
|
|
class PropertiesChecker(Checker):
    '''Checks for .properties files.

    Covers plural-form variables (#1, #2, ...), unknown backslash escapes
    and printf-style format specifiers.
    '''
    pattern = re.compile('.*\.properties$')
    # A '%' followed by either another '%' or an optional "N$" argument
    # number, optional width and precision, and a conversion character.
    # If nothing follows the '%', the `good` group stays unmatched.
    printf = re.compile(r'%(?P<good>%|'
                        r'(?:(?P<number>[1-9][0-9]*)\$)?'
                        r'(?P<width>\*|[0-9]+)?'
                        r'(?P<prec>\.(?:\*|[0-9]+)?)?'
                        r'(?P<spec>[duxXosScpfg]))?')

    def check(self, refEnt, l10nEnt):
        '''Yield findings about the variables used in the localized value.

        If the comment before the reference entity mentions
        'Localization_and_Plurals', only the #N plural variables are
        compared. Otherwise unknown escapes in the raw localized value
        are reported, followed by the printf comparison when the
        reference value contains printf specifiers.
        '''
        ref_text, loc_text = refEnt.val, l10nEnt.val
        if 'Localization_and_Plurals' in refEnt.pre_comment:
            # PluralForm.jsm strings: the common variable pattern is #1.
            variable = re.compile('#([0-9]+)')
            wanted = set(int(n) for n in variable.findall(ref_text))
            if not wanted:
                return
            found = set(int(n) for n in variable.findall(loc_text))
            if wanted - found:
                yield ('warning', 0, 'not all variables used in l10n',
                       'plural')
            elif found - wanted:
                yield ('error', 0, 'unreplaced variables in l10n',
                       'plural')
            return
        # Look for backslash escapes that the parser does not know.
        for hit in PropertiesParser.escape.finditer(l10nEnt.raw_val):
            single = hit.group('single')
            if single and single not in PropertiesParser.known_escapes:
                yield ('warning', hit.start(),
                       'unknown escape sequence, \\' + single,
                       'escape')
        try:
            ref_specs = self.getPrintfSpecs(ref_text)
        except PrintfException:
            # The reference itself is not a valid printf string, so
            # there is nothing to compare the localization against.
            return
        if ref_specs:
            for finding in self.checkPrintf(ref_specs, loc_text):
                yield finding

    def checkPrintf(self, refSpecs, l10nValue):
        '''Yield findings for printf specifiers differing from refSpecs.

        refSpecs is the list returned by getPrintfSpecs for the reference
        value. A malformed localized value yields a single error. Missing
        trailing arguments yield a warning, any other difference is
        collected into one error.
        '''
        try:
            l10nSpecs = self.getPrintfSpecs(l10nValue)
        except PrintfException as e:
            yield ('error', e.pos, e.msg, 'printf')
            return
        if refSpecs == l10nSpecs:
            return
        matcher = SequenceMatcher()
        matcher.set_seqs(refSpecs, l10nSpecs)
        errors = []
        trailing = None
        ref_count = len(refSpecs)
        for action, i1, i2, j1, j2 in matcher.get_opcodes():
            if action == 'delete':
                # argument missing in l10n
                if i2 == ref_count:
                    # only trailing specs are missing, just warn
                    trailing = ', '.join(
                        'trailing argument %d `%s` missing' %
                        (i+1, refSpecs[i])
                        for i in range(i1, i2))
                else:
                    errors.extend(
                        'argument %d `%s` missing' % (i+1, refSpecs[i])
                        for i in range(i1, i2))
            elif action == 'insert':
                # obsolete argument in l10n
                errors.extend(
                    'argument %d `%s` obsolete' % (j+1, l10nSpecs[j])
                    for j in range(j1, j2))
            elif action == 'replace':
                errors.extend(
                    'argument %d `%s` should be `%s`' %
                    (j+1, l10nSpecs[j], refSpecs[i])
                    for i, j in zip(range(i1, i2), range(j1, j2)))
        if errors:
            yield ('error', 0, ', '.join(errors), 'printf')
        if trailing is not None:
            yield ('warning', 0, trailing, 'printf')

    def getPrintfSpecs(self, val):
        '''Return the conversion characters used in val, by argument.

        Unnumbered specifiers are listed in order of appearance, numbered
        ones ("%1$S") are stored at index number-1. '%%' is skipped.

        Raises PrintfException for a lone '%', for a mix of numbered and
        unnumbered specifiers, for an argument number used twice and for
        gaps in the numbered arguments.
        '''
        specs = []
        numbered = False
        for match in self.printf.finditer(val):
            good = match.group('good')
            if good is None:
                # just a '%' with nothing valid after it
                raise PrintfException('Found single %', match.start())
            if good == '%':
                # escaped %
                continue
            number = match.group('number')
            has_number = number is not None
            # The first specifier decides the style, every later one
            # has to follow it.
            if numbered != has_number and (numbered or specs):
                raise PrintfException('Mixed ordered and non-ordered args',
                                      match.start())
            numbered = has_number
            spec = match.group('spec')
            if not numbered:
                specs.append(spec)
                continue
            index = int(number) - 1
            if index >= len(specs):
                # pad with None up to the slot of this argument
                specs.extend([None] * (index - len(specs)))
                specs.append(spec)
            elif specs[index] is None:
                specs[index] = spec
            else:
                raise PrintfException('Double ordered argument %d' %
                                      (index+1),
                                      match.start())
        # None entries left over from padding are missing arguments
        if numbered and not all(specs):
            raise PrintfException('Ordered argument missing', 0)
        return specs
|
|
|
|
|
|
class DTDChecker(Checker):
    """Checks for DTD files.

    Uses xml.sax for the heavy lifting of xml parsing.

    Entity values are parsed as the content of a dummy element. The
    entities referenced by a value are found with a regular expression
    and declared as empty <!ENTITY key ""> definitions in the DOCTYPE,
    so that references to them do not break the parsing.

    Also checks for some CSS and number heuristics in the values.
    """
    pattern = re.compile('.*\.dtd$')

    # reference to a general entity, group 1 is the entity name
    eref = re.compile('&(%s);' % DTDParser.Name)
    # document to parse: first slot takes entity declarations, second
    # slot the content of the dummy element
    tmpl = ('<!DOCTYPE elem [%s]>\n'
            '<elem>%s</elem>\n')
    # entities predefined by XML, never reported as unknown
    xmllist = set(('amp', 'lt', 'gt', 'apos', 'quot'))

    def __init__(self, reference):
        '''Keep `reference`, an iterable of reference entities or None.'''
        self.reference = reference
        # filled lazily by known_entities()
        self.__known_entities = None

    def known_entities(self, refValue):
        '''Return the set of entity names localizations may reference.

        With a reference given to the constructor, that's every entity
        used in any of its values, computed once and cached. Without
        one, it's the entities used in refValue.
        '''
        if self.__known_entities is not None:
            return self.__known_entities
        if self.reference is None:
            return self.entities_for_value(refValue)
        self.__known_entities = collected = set()
        for ent in self.reference:
            collected.update(self.entities_for_value(ent.val))
        return collected

    def entities_for_value(self, value):
        '''Return the utf-8 encoded names of the entities referenced in
        value, without the ones predefined by XML.
        '''
        names = set(m.group(1).encode('utf-8')
                    for m in self.eref.finditer(value))
        return names - self.xmllist

    # Setup for XML parser, with default and text-only content handler
    class TextContent(sax.handler.ContentHandler):
        '''Content handler accumulating all character data.'''
        textcontent = ''

        def characters(self, content):
            self.textcontent += content

    defaulthandler = sax.handler.ContentHandler()
    texthandler = TextContent()

    numPattern = r'([0-9]+|[0-9]*\.[0-9]+)'
    num = re.compile('^%s$' % numPattern)
    lengthPattern = '%s(em|px|ch|cm|in)' % numPattern
    length = re.compile('^%s$' % lengthPattern)
    spec = re.compile(r'((?:min\-)?(?:width|height))\s*:\s*%s' %
                      lengthPattern)
    style = re.compile(r'^%(spec)s\s*(;\s*%(spec)s\s*)*;?$' %
                       {'spec': spec.pattern})

    # Hook for subclasses: if not None, it's called with the text content
    # of the parsed localized value and its results are yielded by check.
    processContent = None

    def check(self, refEnt, l10nEnt):
        """Yield findings for the localized entity.

        Both the reference and the localized value are parsed inside a
        dummy element, with the referenced entities declared as empty.
        A reference value that does not parse is a warning, a localized
        value that does not parse is an error. References to entities
        that are not known from the reference are warnings. After that,
        the number and CSS heuristics are applied, and finally the
        processContent hook, if set.
        """
        refValue, l10nValue = refEnt.val, l10nEnt.val
        # entities the reference uses, declared for the parser
        reflist = self.known_entities(refValue)
        known_names = sorted(reflist)
        entities = ''.join('<!ENTITY %s "">' % name for name in known_names)
        parser = sax.make_parser()
        parser.setFeature(sax.handler.feature_external_ges, False)

        parser.setContentHandler(self.defaulthandler)
        try:
            ref_doc = self.tmpl % (entities, refValue.encode('utf-8'))
            parser.parse(StringIO(ref_doc))
            # also catch stray %: parse the complete entity definition
            # as part of the DOCTYPE and reference it in the element
            ref_doc = self.tmpl % (refEnt.all.encode('utf-8') + entities,
                                   '&%s;' % refEnt.key.encode('utf-8'))
            parser.parse(StringIO(ref_doc))
        except sax.SAXParseException:
            yield ('warning',
                   (0, 0),
                   "can't parse en-US value", 'xmlparse')

        # entities only the localization uses get declared as well,
        # they are reported separately below
        l10nlist = self.entities_for_value(l10nValue)
        missing = sorted(l10nlist - reflist)
        _entities = entities + ''.join('<!ENTITY %s "">' % name
                                       for name in missing)
        warntmpl = u'Referencing unknown entity `%s`'
        if reflist:
            warntmpl += ' (%s known)' % ', '.join(known_names)
        if self.processContent is not None:
            # collect the text of the localized value for the hook
            self.texthandler.textcontent = ''
            parser.setContentHandler(self.texthandler)
        try:
            l10n_doc = self.tmpl % (_entities, l10nValue.encode('utf-8'))
            parser.parse(StringIO(l10n_doc))
            # also catch stray %
            # if this fails, we need to subtract the entity definition
            parser.setContentHandler(self.defaulthandler)
            l10n_doc = self.tmpl % (
                l10nEnt.all.encode('utf-8') + _entities,
                '&%s;' % l10nEnt.key.encode('utf-8'))
            parser.parse(StringIO(l10n_doc))
        except sax.SAXParseException as e:
            # xml parse error, yield error
            # sometimes, the error is reported on our fake closing
            # element, make that the end of the last line
            line = e.getLineNumber() - 1
            value_lines = l10nValue.splitlines()
            if line > len(value_lines):
                line = len(value_lines)
                column = len(value_lines[line-1])
            else:
                column = e.getColumnNumber()
                if line == 1:
                    # first line of the value follows <elem>, subtract
                    column -= len("<elem>")
                elif line == 0:
                    # first line of the document is the DOCTYPE
                    column -= len("<!DOCTYPE elem [")
            yield ('error', (line, column), ' '.join(e.args), 'xmlparse')

        for key in missing:
            yield ('warning', (0, 0), warntmpl % key.decode('utf-8'),
                   'xmlparse')

        # Number check
        if self.num.match(refValue) and not self.num.match(l10nValue):
            yield ('warning', 0, 'reference is a number', 'number')
        # CSS checks
        # just a length, width="100em"
        if self.length.match(refValue) and not self.length.match(l10nValue):
            yield ('error', 0, 'reference is a CSS length', 'css')
        # real CSS spec, style="width:100px;"
        if self.style.match(refValue):
            if not self.style.match(l10nValue):
                yield ('error', 0, 'reference is a CSS spec', 'css')
            else:
                # warn if different properties or units
                ref_units = dict((prop, unit) for prop, _, unit in
                                 self.spec.findall(refValue))
                notes = []
                for prop, _, unit in self.spec.findall(l10nValue):
                    if prop in ref_units:
                        ref_unit = ref_units.pop(prop)
                        if unit != ref_unit:
                            notes.append("units for %s don't match "
                                         "(%s != %s)" %
                                         (prop, unit, ref_unit))
                    else:
                        notes.insert(0, '%s only in l10n' % prop)
                # whatever was not popped is unused by the localization
                for prop in ref_units:
                    notes.insert(0, '%s only in reference' % prop)
                if notes:
                    yield ('warning', 0, ', '.join(notes), 'css')

        if self.processContent is not None:
            for finding in self.processContent(
                    self.texthandler.textcontent):
                yield finding
|
|
|
|
|
|
class PrincessAndroid(DTDChecker):
    """Checker for the string values that Android puts into an XML container.

    http://developer.android.com/guide/topics/resources/string-resource.html#FormattingAndStyling  # noqa
    has more info. Check for unescaped apostrophes and bad unicode escapes.
    """
    # value completely wrapped in a pair of matching quotes
    quoted = re.compile("(?P<q>[\"']).*(?P=q)$")

    def unicode_escape(self, str):
        """Helper method to try to decode all unicode escapes in a string.

        The standard python unicode-escape decoder needs ascii input, so
        the unicode string is first encoded to ascii with
        backslashreplace, i.e., all non-ascii chars get unicode escaped,
        and then the result is decoded again.
        An error raised while decoding comes from the original string,
        but its positions refer to the encoded string, which can differ
        a lot from the original. The positions are recomputed for the
        original string and the UnicodeDecodeError is raised again with
        the adjusted arguments.
        See also the last check in TestAndroid.test_android_dtd, with a
        lengthy chinese string.
        """
        escaped = str.encode('ascii', 'backslashreplace')
        try:
            escaped.decode('unicode-escape')
        except UnicodeDecodeError as e:
            encoding, data, start, end, reason = e.args
            bad_length = len(data[start:end])
            # What precedes the error decodes fine, the decoded length
            # is the position in the original string.
            start = len(data[:start].decode('unicode-escape'))
            raise UnicodeDecodeError(encoding, data, start,
                                     start + bad_length, reason)

    @classmethod
    def use(cls, file):
        """Use this Checker only for DTD files in the embedding/android
        and mobile/android/base modules.
        """
        if file.module not in ("embedding/android",
                               "mobile/android/base"):
            return False
        return cls.pattern.match(file.file)

    def processContent(self, val):
        """Actual check code.

        Check for unicode escapes and unescaped quotes and apostrophes,
        if string's not quoted. Yields an error for each problem found.
        """
        # first, try to decode unicode escapes
        try:
            self.unicode_escape(val)
        except UnicodeDecodeError as e:
            yield ('error', e.args[2], e.args[4], 'android')
        # check for unescaped single or double quotes.
        # first, see if the complete string is single or double quoted,
        # that changes the rules
        wrapped = self.quoted.match(val)
        if wrapped:
            # only the wrapping kind of quote needs escaping inside
            quote = wrapped.group('q')
            offset = 0
            val = val[1:-1]  # strip quotes
        else:
            quote = "[\"']"
            offset = -1
        # a quote together with all the backslashes in front of it
        stray_quot = re.compile(r"[\\\\]*(%s)" % quote)
        quote_msg = u"Quotes in Android DTDs need escaping with \\\" "\
                    u"or \\u0022, or put string in apostrophes."
        apos_msg = u"Apostrophes in Android DTDs need escaping with "\
                   u"\\' or \\u0027, or use \u2019, or put string in "\
                   u"quotes."

        for hit in stray_quot.finditer(val):
            if len(hit.group(0)) % 2 == 0:
                # odd number of backslashes, the quote is escaped
                continue
            # found an unescaped single or double quote, which message?
            msg = quote_msg if hit.group(1) == '"' else apos_msg
            yield ('error', hit.end(0)+offset, msg, 'android')
|
|
|
|
|
|
def getChecker(file, reference=None):
    '''Return a checker instance for `file`, or None if no checker applies.

    The checker classes are asked in the order PropertiesChecker,
    PrincessAndroid, DTDChecker, and the first one whose `use` accepts
    the file is instantiated. `reference` is handed to the constructors
    of the two DTD checkers, PropertiesChecker is created without
    arguments.
    '''
    if PropertiesChecker.use(file):
        checker = PropertiesChecker()
    elif PrincessAndroid.use(file):
        # has to be asked before DTDChecker, which it subclasses
        checker = PrincessAndroid(reference)
    elif DTDChecker.use(file):
        checker = DTDChecker(reference)
    else:
        checker = None
    return checker
|