Move generic isPrint and columnWidth implementations to a separate header/source to allow using both generic and system-dependent versions on win32.

Summary:
This is needed so we can use the generic columnWidthUTF8 in clang-format on
win32 simultaneously with separate system-dependent implementations of
isPrint/columnWidth in TextDiagnostic.cpp, to avoid attempts to print Unicode
characters using narrow-character interfaces (which is not supported on
Windows, and we'll have to figure out how to handle this).

Reviewers: jordan_rose

Reviewed By: jordan_rose

CC: llvm-commits, klimek

Differential Revision: http://llvm-reviews.chandlerc.com/D1559

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@189952 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
Alexander Kornienko 2013-09-04 16:00:12 +00:00
parent 9127334dad
commit 280e5eef43
9 changed files with 197 additions and 220 deletions

View File

@ -0,0 +1,62 @@
//===- llvm/Support/Unicode.h - Unicode character properties -*- C++ -*-=====//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines functions that allow querying certain properties of Unicode
// characters.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_SUPPORT_UNICODE_H
#define LLVM_SUPPORT_UNICODE_H

#include "llvm/ADT/StringRef.h"

namespace llvm {
namespace sys {
namespace unicode {

/// Error codes that columnWidthUTF8() returns in place of a column count.
/// Both values are negative, so they cannot be mistaken for a valid width.
enum ColumnWidthErrors {
  ErrorInvalidUTF8 = -2,
  ErrorNonPrintableCharacter = -1
};

/// Determines if a character is likely to be displayed correctly on the
/// terminal. Exact implementation would have to depend on the specific
/// terminal, so we define the semantic that should be suitable for generic case
/// of a terminal capable to output Unicode characters.
///
/// All characters from the Unicode code point range are considered printable
/// except for:
///   * C0 and C1 control character ranges;
///   * default ignorable code points as per 5.21 of
///     http://www.unicode.org/versions/Unicode6.2.0/UnicodeStandard-6.2.pdf
///     except for U+00AD SOFT HYPHEN, as it's actually displayed on most
///     terminals;
///   * format characters (category = Cf);
///   * surrogates (category = Cs);
///   * unassigned characters (category = Cn).
///
/// \param UCS the Unicode code point to classify.
/// \return true if the character is considered printable.
bool isPrintable(int UCS);

/// Gets the number of positions the UTF8-encoded \p Text is likely to occupy
/// when output on a terminal ("character width"). This depends on the
/// implementation of the terminal, and there's no standard definition of
/// character width.
///
/// The implementation defines it in a way that is expected to be compatible
/// with a generic Unicode-capable terminal.
///
/// \param Text the UTF-8 encoded string to measure.
/// \return Character width:
///   * ErrorInvalidUTF8 (-2) if \p Text is not a valid UTF-8 string;
///   * ErrorNonPrintableCharacter (-1) if \p Text contains non-printable
///     characters (as identified by isPrintable);
///   * 0 for each non-spacing and enclosing combining mark;
///   * 2 for each CJK character excluding halfwidth forms;
///   * 1 for each of the remaining characters.
int columnWidthUTF8(StringRef Text);

} // namespace unicode
} // namespace sys
} // namespace llvm

#endif // LLVM_SUPPORT_UNICODE_H

View File

@ -54,6 +54,7 @@ add_llvm_library(LLVMSupport
ToolOutputFile.cpp
Triple.cpp
Twine.cpp
Unicode.cpp
YAMLParser.cpp
YAMLTraits.cpp
raw_os_ostream.cpp

View File

@ -1,10 +1,31 @@
#include "llvm/Support/Locale.h"
#include "llvm/Config/config.h"
#include "llvm/Support/Unicode.h"
#ifdef __APPLE__
#include "LocaleXlocale.inc"
#elif LLVM_ON_WIN32
#include "LocaleWindows.inc"
namespace llvm {
namespace sys {
namespace locale {
/// Returns the column width of \p Text. When LLVM_ON_WIN32 is set the result
/// is simply Text.size(); otherwise the call is forwarded to
/// llvm::sys::unicode::columnWidthUTF8().
int columnWidth(StringRef Text) {
#if LLVM_ON_WIN32
return Text.size();
#else
// NOTE(review): an #include inside a function body looks like a removed line
// of the diff that was interleaved into this listing -- confirm against the
// committed file.
#include "LocaleGeneric.inc"
return llvm::sys::unicode::columnWidthUTF8(Text);
#endif
}
/// Returns true if the code point \p UCS is considered printable. When
/// LLVM_ON_WIN32 is set only ' ' through '~' qualify; otherwise the decision
/// is delegated to llvm::sys::unicode::isPrintable().
bool isPrint(int UCS) {
#if LLVM_ON_WIN32
// Restrict characters that we'll try to print to the lower part of ASCII
// except for the control characters (0x20 - 0x7E). In general one can not
// reliably output code points U+0080 and higher using narrow character C/C++
// output functions in Windows, because the meaning of the upper 128 codes is
// determined by the active code page in the console.
return ' ' <= UCS && UCS <= '~';
#else
return llvm::sys::unicode::isPrintable(UCS);
#endif
}
} // namespace locale
} // namespace sys
} // namespace llvm

View File

@ -1,15 +0,0 @@
namespace llvm {
namespace sys {
namespace locale {

/// Column width of \p s, taken to be s.size() without inspecting the contents.
int columnWidth(StringRef s) {
  return static_cast<int>(s.size());
}

/// True only when \p c lies in the range ' ' through '~' inclusive.
bool isPrint(int c) {
  if (c < ' ')
    return false;
  return c <= '~';
}

} // namespace locale
} // namespace sys
} // namespace llvm

View File

@ -1,61 +0,0 @@
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/ManagedStatic.h"
#include <cassert>
#include <xlocale.h>
namespace {
/// Holds a locale_t created with newlocale() for "en_US.UTF-8" (LC_CTYPE_MASK,
/// based on LC_GLOBAL_LOCALE) and releases it with freelocale() on
/// destruction. The width and printability queries below run against it.
struct locale_holder {
locale_holder()
: l(newlocale(LC_CTYPE_MASK,"en_US.UTF-8",LC_GLOBAL_LOCALE))
{
// Creation failure is only detected through this assertion.
assert(NULL!=l);
}
~locale_holder() {
freelocale(l);
}
/// Converts \p s to wide characters with mbstowcs_l() and returns the result
/// of wcswidth_l() on them; returns 0 when the conversion yields no
/// characters. Conversion failures are only checked by assertions.
/// NOTE(review): \p s is taken by value, so every call copies the string.
int mbswidth(llvm::SmallString<16> s) const {
// this implementation assumes no '\0' in s
assert(s.size()==strlen(s.c_str()));
// First pass with a NULL destination only obtains the wide-character count.
size_t size = mbstowcs_l(NULL,s.c_str(),0,l);
assert(size!=(size_t)-1);
if (size==0)
return 0;
// Second pass performs the conversion into a buffer of exactly that size.
llvm::SmallVector<wchar_t,200> ws(size);
size = mbstowcs_l(&ws[0],s.c_str(),ws.size(),l);
assert(ws.size()==size);
return wcswidth_l(&ws[0],ws.size(),l);
}
/// Returns the result of iswprint_l() for \p c in the held locale.
int isprint(int c) const {
return iswprint_l(c,l);
}
private:
// The locale handle owned by this object.
locale_t l;
};
// File-level instance wrapped in llvm::ManagedStatic. Inside locale_holder the
// name `l` refers to the locale_t member, not to this object.
llvm::ManagedStatic<locale_holder> l;
}
namespace llvm {
namespace sys {
namespace locale {

/// Column width of \p s as reported by mbswidth() on the file-level
/// locale_holder. A negative result trips the assertion.
int columnWidth(StringRef s) {
  const int Columns = l->mbswidth(s);
  assert(Columns >= 0);
  return Columns;
}

/// Whether \p c is printable according to isprint() on the file-level
/// locale_holder.
bool isPrint(int c) {
  const int Printable = l->isprint(c);
  return Printable != 0;
}

} // namespace locale
} // namespace sys
} // namespace llvm

View File

@ -1,4 +1,4 @@
//===- llvm/Support/LocaleGeneric.inc - Locale-dependent stuff -*- C++ -*-===//
//===- llvm/Support/Unicode.cpp - Unicode character properties -*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@ -7,41 +7,20 @@
//
//===----------------------------------------------------------------------===//
//
// This file implements llvm::sys::locale::columnWidth and
// llvm::sys::locale::isPrint functions for UTF-8 locales.
// This file implements functions that allow querying certain properties of
// Unicode characters.
//
//===----------------------------------------------------------------------===//
#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/Unicode.h"
#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/UnicodeCharRanges.h"
namespace llvm {
namespace sys {
namespace locale {
namespace unicode {
enum ColumnWidthErrors {
ErrorInvalidUTF8 = -2,
ErrorNonPrintableCharacter = -1
};
/// Determines if a character is likely to be displayed correctly on the
/// terminal. Exact implementation would have to depend on the specific
/// terminal, so we define the semantic that should be suitable for generic case
/// of a terminal capable to output Unicode characters.
/// All characters from the Unicode codepoint range are considered printable
/// except for:
/// * C0 and C1 control character ranges;
/// * default ignorable code points as per 5.21 of
/// http://www.unicode.org/versions/Unicode6.2.0/UnicodeStandard-6.2.pdf
/// except for U+00AD SOFT HYPHEN, as it's actually displayed on most
/// terminals;
/// * format characters (category = Cf);
/// * surrogates (category = Cs);
/// * unassigned characters (category = Cn).
/// \return true if the character is considered printable.
bool isPrint(int UCS) {
bool isPrintable(int UCS) {
// Sorted list of non-overlapping intervals of code points that are not
// supposed to be printable.
static const UnicodeCharRange NonPrintableRanges[] = {
@ -241,13 +220,13 @@ bool isPrint(int UCS) {
/// with a generic Unicode-capable terminal.
/// \return Character width:
/// * ErrorNonPrintableCharacter (-1) for non-printable characters (as
/// identified by isPrint);
/// identified by isPrintable);
/// * 0 for non-spacing and enclosing combining marks;
/// * 2 for CJK characters excluding halfwidth forms;
/// * 1 for all remaining characters.
static inline int charWidth(int UCS)
{
if (!isPrint(UCS))
if (!isPrintable(UCS))
return ErrorNonPrintableCharacter;
// Sorted list of non-spacing and enclosing combining mark intervals as
@ -361,7 +340,7 @@ static inline int charWidth(int UCS)
return 1;
}
int columnWidth(StringRef Text) {
int columnWidthUTF8(StringRef Text) {
unsigned ColumnWidth = 0;
unsigned Length;
for (size_t i = 0, e = Text.size(); i < e; i += Length) {
@ -382,6 +361,7 @@ int columnWidth(StringRef Text) {
return ColumnWidth;
}
}
}
}
} // namespace unicode
} // namespace sys
} // namespace llvm

View File

@ -19,7 +19,6 @@ add_llvm_unittest(SupportTests
FileOutputBufferTest.cpp
IntegersSubsetTest.cpp
LeakDetectorTest.cpp
LocaleTest.cpp
LockFileManagerTest.cpp
ManagedStatic.cpp
MathExtrasTest.cpp
@ -32,6 +31,7 @@ add_llvm_unittest(SupportTests
RegexTest.cpp
SwapByteOrderTest.cpp
TimeValueTest.cpp
UnicodeTest.cpp
ValueHandleTest.cpp
YAMLIOTest.cpp
YAMLParserTest.cpp

View File

@ -1,104 +0,0 @@
//===- unittests/Support/LocaleTest.cpp - Locale.h tests ------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
#include "llvm/Support/Locale.h"
#include "gtest/gtest.h"
namespace llvm {
namespace sys {
namespace locale {
namespace {
// FIXME: WIN32 implementation is incorrect. We should consider using the one
// from LocaleGeneric.inc for WIN32.
#ifndef _WIN32
// Checks columnWidth() on ASCII text, non-printable characters (expected -1),
// combining marks, CJK characters and malformed UTF-8 (expected -2). The whole
// body is compiled out on __APPLE__.
TEST(Locale, columnWidth) {
// FIXME: This test fails with MacOSX implementation of columnWidth.
#ifndef __APPLE__
EXPECT_EQ(0, columnWidth(""));
EXPECT_EQ(1, columnWidth(" "));
EXPECT_EQ(1, columnWidth("a"));
EXPECT_EQ(1, columnWidth("~"));
EXPECT_EQ(6, columnWidth("abcdef"));
EXPECT_EQ(-1, columnWidth("\x01"));
EXPECT_EQ(-1, columnWidth("aaaaaaaaaa\x01"));
EXPECT_EQ(-1, columnWidth("\342\200\213")); // 200B ZERO WIDTH SPACE
// 00AD SOFT HYPHEN is displayed on most terminals as a space or a dash. Some
// text editors display it only when a line is broken at it, some use it as a
// line-break hint, but don't display. We choose terminal-oriented
// interpretation.
EXPECT_EQ(1, columnWidth("\302\255"));
EXPECT_EQ(0, columnWidth("\314\200")); // 0300 COMBINING GRAVE ACCENT
EXPECT_EQ(1, columnWidth("\340\270\201")); // 0E01 THAI CHARACTER KO KAI
EXPECT_EQ(2, columnWidth("\344\270\200")); // CJK UNIFIED IDEOGRAPH-4E00
EXPECT_EQ(4, columnWidth("\344\270\200\344\270\200"));
EXPECT_EQ(3, columnWidth("q\344\270\200"));
EXPECT_EQ(3, columnWidth("\314\200\340\270\201\344\270\200"));
// Invalid UTF-8 strings, columnWidth should error out.
EXPECT_EQ(-2, columnWidth("\344"));
EXPECT_EQ(-2, columnWidth("\344\270"));
EXPECT_EQ(-2, columnWidth("\344\270\033"));
EXPECT_EQ(-2, columnWidth("\344\270\300"));
EXPECT_EQ(-2, columnWidth("\377\366\355"));
EXPECT_EQ(-2, columnWidth("qwer\344"));
EXPECT_EQ(-2, columnWidth("qwer\344\270"));
EXPECT_EQ(-2, columnWidth("qwer\344\270\033"));
EXPECT_EQ(-2, columnWidth("qwer\344\270\300"));
EXPECT_EQ(-2, columnWidth("qwer\377\366\355"));
// UTF-8 sequences longer than 4 bytes correspond to unallocated Unicode
// characters.
EXPECT_EQ(-2, columnWidth("\370\200\200\200\200")); // U+200000
EXPECT_EQ(-2, columnWidth("\374\200\200\200\200\200")); // U+4000000
#endif // __APPLE__
}
// Checks isPrint() at the boundaries of the control ranges, around SOFT HYPHEN,
// and on reserved, format and noncharacter code points.
TEST(Locale, isPrint) {
EXPECT_FALSE(isPrint(0)); // <control-0000>-<control-001F>
EXPECT_FALSE(isPrint(0x01));
EXPECT_FALSE(isPrint(0x1F));
EXPECT_TRUE(isPrint(' '));
EXPECT_TRUE(isPrint('A'));
EXPECT_TRUE(isPrint('~'));
EXPECT_FALSE(isPrint(0x7F)); // <control-007F>..<control-009F>
EXPECT_FALSE(isPrint(0x90));
EXPECT_FALSE(isPrint(0x9F));
EXPECT_TRUE(isPrint(0xAC));
EXPECT_TRUE(isPrint(0xAD)); // SOFT HYPHEN is displayed on most terminals
// as either a space or a dash.
EXPECT_TRUE(isPrint(0xAE));
// MacOS implementation doesn't think it's printable.
#ifndef __APPLE__
EXPECT_TRUE(isPrint(0x0377)); // GREEK SMALL LETTER PAMPHYLIAN DIGAMMA
#endif // __APPLE__
EXPECT_FALSE(isPrint(0x0378)); // <reserved-0378>..<reserved-0379>
EXPECT_FALSE(isPrint(0x0600)); // ARABIC NUMBER SIGN
EXPECT_FALSE(isPrint(0x1FFFF)); // <reserved-1F774>..<noncharacter-1FFFF>
EXPECT_TRUE(isPrint(0x20000)); // CJK UNIFIED IDEOGRAPH-20000
EXPECT_FALSE(isPrint(0x10FFFF)); // noncharacter
}
#endif // _WIN32
} // namespace
} // namespace locale
} // namespace sys
} // namespace llvm

View File

@ -0,0 +1,93 @@
//===- unittests/Support/UnicodeTest.cpp - Unicode.h tests ----------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
#include "llvm/Support/Unicode.h"
#include "gtest/gtest.h"
namespace llvm {
namespace sys {
namespace unicode {
namespace {
// Checks columnWidthUTF8() on ASCII text, non-printable characters (expected
// -1), combining marks, CJK characters and malformed UTF-8 (expected -2).
TEST(Unicode, columnWidthUTF8) {
EXPECT_EQ(0, columnWidthUTF8(""));
EXPECT_EQ(1, columnWidthUTF8(" "));
EXPECT_EQ(1, columnWidthUTF8("a"));
EXPECT_EQ(1, columnWidthUTF8("~"));
EXPECT_EQ(6, columnWidthUTF8("abcdef"));
EXPECT_EQ(-1, columnWidthUTF8("\x01"));
EXPECT_EQ(-1, columnWidthUTF8("aaaaaaaaaa\x01"));
EXPECT_EQ(-1, columnWidthUTF8("\342\200\213")); // 200B ZERO WIDTH SPACE
// 00AD SOFT HYPHEN is displayed on most terminals as a space or a dash. Some
// text editors display it only when a line is broken at it, some use it as a
// line-break hint, but don't display. We choose terminal-oriented
// interpretation.
EXPECT_EQ(1, columnWidthUTF8("\302\255"));
EXPECT_EQ(0, columnWidthUTF8("\314\200")); // 0300 COMBINING GRAVE ACCENT
EXPECT_EQ(1, columnWidthUTF8("\340\270\201")); // 0E01 THAI CHARACTER KO KAI
EXPECT_EQ(2, columnWidthUTF8("\344\270\200")); // CJK UNIFIED IDEOGRAPH-4E00
EXPECT_EQ(4, columnWidthUTF8("\344\270\200\344\270\200"));
EXPECT_EQ(3, columnWidthUTF8("q\344\270\200"));
EXPECT_EQ(3, columnWidthUTF8("\314\200\340\270\201\344\270\200"));
// Invalid UTF-8 strings, columnWidthUTF8 should error out.
EXPECT_EQ(-2, columnWidthUTF8("\344"));
EXPECT_EQ(-2, columnWidthUTF8("\344\270"));
EXPECT_EQ(-2, columnWidthUTF8("\344\270\033"));
EXPECT_EQ(-2, columnWidthUTF8("\344\270\300"));
EXPECT_EQ(-2, columnWidthUTF8("\377\366\355"));
EXPECT_EQ(-2, columnWidthUTF8("qwer\344"));
EXPECT_EQ(-2, columnWidthUTF8("qwer\344\270"));
EXPECT_EQ(-2, columnWidthUTF8("qwer\344\270\033"));
EXPECT_EQ(-2, columnWidthUTF8("qwer\344\270\300"));
EXPECT_EQ(-2, columnWidthUTF8("qwer\377\366\355"));
// UTF-8 sequences longer than 4 bytes correspond to unallocated Unicode
// characters.
EXPECT_EQ(-2, columnWidthUTF8("\370\200\200\200\200")); // U+200000
EXPECT_EQ(-2, columnWidthUTF8("\374\200\200\200\200\200")); // U+4000000
}
// Checks isPrintable() at the boundaries of the control ranges, around SOFT
// HYPHEN, and on reserved, format and noncharacter code points.
TEST(Unicode, isPrintable) {
EXPECT_FALSE(isPrintable(0)); // <control-0000>-<control-001F>
EXPECT_FALSE(isPrintable(0x01));
EXPECT_FALSE(isPrintable(0x1F));
EXPECT_TRUE(isPrintable(' '));
EXPECT_TRUE(isPrintable('A'));
EXPECT_TRUE(isPrintable('~'));
EXPECT_FALSE(isPrintable(0x7F)); // <control-007F>..<control-009F>
EXPECT_FALSE(isPrintable(0x90));
EXPECT_FALSE(isPrintable(0x9F));
EXPECT_TRUE(isPrintable(0xAC));
EXPECT_TRUE(isPrintable(0xAD)); // SOFT HYPHEN is displayed on most terminals
// as either a space or a dash.
EXPECT_TRUE(isPrintable(0xAE));
EXPECT_TRUE(isPrintable(0x0377)); // GREEK SMALL LETTER PAMPHYLIAN DIGAMMA
EXPECT_FALSE(isPrintable(0x0378)); // <reserved-0378>..<reserved-0379>
EXPECT_FALSE(isPrintable(0x0600)); // ARABIC NUMBER SIGN
EXPECT_FALSE(isPrintable(0x1FFFF)); // <reserved-1F774>..<noncharacter-1FFFF>
EXPECT_TRUE(isPrintable(0x20000)); // CJK UNIFIED IDEOGRAPH-20000
EXPECT_FALSE(isPrintable(0x10FFFF)); // noncharacter
}
} // namespace
} // namespace unicode
} // namespace sys
} // namespace llvm