From c919e57cbf378122e1f646de6e73cfe4c581f825 Mon Sep 17 00:00:00 2001 From: Jordan Rose Date: Fri, 12 Sep 2014 16:46:05 +0000 Subject: [PATCH] [lit] Parse all strings as UTF-8 rather than ASCII. As far as I can tell UTF-8 has been supported since the beginning of Python's codec support, and it's the de facto standard for text these days, at least for primarily-English text. This allows us to put Unicode into lit RUN lines. rdar://problem/18311663 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@217688 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/Other/lit-unicode.txt | 3 +++ utils/lit/lit/ProgressBar.py | 8 ++++---- utils/lit/lit/TestRunner.py | 26 ++++++++++++++++++-------- utils/lit/lit/formats/googletest.py | 2 +- utils/lit/lit/util.py | 9 +++++++-- 5 files changed, 33 insertions(+), 15 deletions(-) create mode 100644 test/Other/lit-unicode.txt diff --git a/test/Other/lit-unicode.txt b/test/Other/lit-unicode.txt new file mode 100644 index 00000000000..ca92c991d17 --- /dev/null +++ b/test/Other/lit-unicode.txt @@ -0,0 +1,3 @@ +REQUIRES: shell +RUN: echo "ようこそ" | FileCheck %s +CHECK: {{^}}ようこそ{{$}} diff --git a/utils/lit/lit/ProgressBar.py b/utils/lit/lit/ProgressBar.py index e3644f1fa63..3ad704d16c9 100644 --- a/utils/lit/lit/ProgressBar.py +++ b/utils/lit/lit/ProgressBar.py @@ -6,8 +6,8 @@ import sys, re, time def to_bytes(str): - # Encode to Latin1 to get binary data. - return str.encode('ISO-8859-1') + # Encode to UTF-8 to get binary data. + return str.encode('utf-8') class TerminalController: """ @@ -136,7 +136,7 @@ class TerminalController: def _tparm(self, arg, index): import curses - return curses.tparm(to_bytes(arg), index).decode('ascii') or '' + return curses.tparm(to_bytes(arg), index).decode('utf-8') or '' def _tigetstr(self, cap_name): # String capabilities can include "delays" of the form "$<2>". @@ -147,7 +147,7 @@ class TerminalController: if cap is None: cap = '' else: - cap = cap.decode('ascii') + cap = cap.decode('utf-8') return re.sub(r'\$<\d+>[/*]?', '', cap) def render(self, template): diff --git a/utils/lit/lit/TestRunner.py b/utils/lit/lit/TestRunner.py index 97524179988..af6e383ae4f 100644 --- a/utils/lit/lit/TestRunner.py +++ b/utils/lit/lit/TestRunner.py @@ -192,6 +192,11 @@ def executeShCmd(cmd, cfg, cwd, results): f.seek(0, 0) procData[i] = (procData[i][0], f.read()) + def to_string(bytes): + if isinstance(bytes, str): + return bytes + return bytes.encode('utf-8') + exitCode = None for i,(out,err) in enumerate(procData): res = procs[i].wait() @@ -201,11 +206,11 @@ def executeShCmd(cmd, cfg, cwd, results): # Ensure the resulting output is always of string type. try: - out = str(out.decode('ascii')) + out = to_string(out.decode('utf-8')) except: out = str(out) try: - err = str(err.decode('ascii')) + err = to_string(err.decode('utf-8')) except: err = str(err) @@ -314,13 +319,18 @@ def parseIntegratedTestScriptCommands(source_path): # Python2 and bytes in Python3. # # Once we find a match, we do require each script line to be decodable to - # ascii, so we convert the outputs to ascii before returning. This way the + # UTF-8, so we convert the outputs to UTF-8 before returning. This way the # remaining code can work with "strings" agnostic of the executing Python # version. def to_bytes(str): - # Encode to Latin1 to get binary data. - return str.encode('ISO-8859-1') + # Encode to UTF-8 to get binary data. + return str.encode('utf-8') + def to_string(bytes): + if isinstance(bytes, str): + return bytes + return to_bytes(bytes) + keywords = ('RUN:', 'XFAIL:', 'REQUIRES:', 'END.') keywords_re = re.compile( to_bytes("(%s)(.*)\n" % ("|".join(k for k in keywords),))) @@ -341,13 +351,13 @@ def parseIntegratedTestScriptCommands(source_path): match_position) last_match_position = match_position - # Convert the keyword and line to ascii strings and yield the + # Convert the keyword and line to UTF-8 strings and yield the # command. Note that we take care to return regular strings in # Python 2, to avoid other code having to differentiate between the # str and unicode types. keyword,ln = match.groups() - yield (line_number, str(keyword[:-1].decode('ascii')), - str(ln.decode('ascii'))) + yield (line_number, to_string(keyword[:-1].decode('utf-8')), + to_string(ln.decode('utf-8'))) finally: f.close() diff --git a/utils/lit/lit/formats/googletest.py b/utils/lit/lit/formats/googletest.py index 3d14b729ed0..1b5b7856456 100644 --- a/utils/lit/lit/formats/googletest.py +++ b/utils/lit/lit/formats/googletest.py @@ -31,7 +31,7 @@ class GoogleTest(TestFormat): try: lines = lit.util.capture([path, '--gtest_list_tests'], env=localConfig.environment) - lines = lines.decode('ascii') + lines = lines.decode('utf-8') if kIsWindows: lines = lines.replace('\r', '') lines = lines.split('\n') diff --git a/utils/lit/lit/util.py b/utils/lit/lit/util.py index 72a8b4848e0..cce620cabcd 100644 --- a/utils/lit/lit/util.py +++ b/utils/lit/lit/util.py @@ -156,13 +156,18 @@ def executeCommand(command, cwd=None, env=None): if exitCode == -signal.SIGINT: raise KeyboardInterrupt + def to_string(bytes): + if isinstance(bytes, str): + return bytes + return bytes.encode('utf-8') + # Ensure the resulting output is always of string type. try: - out = str(out.decode('ascii')) + out = to_string(out.decode('utf-8')) except: out = str(out) try: - err = str(err.decode('ascii')) + err = to_string(err.decode('utf-8')) except: err = str(err)