From 91ccead42a3f84760f2d9794bf8d78c879e9cbb5 Mon Sep 17 00:00:00 2001
From: Reid Kleckner
Date: Mon, 26 Jan 2015 19:51:00 +0000
Subject: [PATCH] Add a UTF8 to UTF16 conversion wrapper for use in the pdb dumper

This can also be used instead of the WindowsSupport.h
ConvertUTF8ToUTF16 helpers, but that will require massaging some
character types. The Windows support routines want wchar_t output, but
wchar_t is often 32 bits on non-Windows OSs.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@227122 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Support/ConvertUTF.h | 8 +++++++
 lib/Support/ConvertUTFWrapper.cpp | 31 ++++++++++++++++++++++++++++
 unittests/Support/ConvertUTFTest.cpp | 14 +++++++++++++
 3 files changed, 53 insertions(+)

diff --git a/include/llvm/Support/ConvertUTF.h b/include/llvm/Support/ConvertUTF.h
index a184d0df213..38952ec99e6 100644
--- a/include/llvm/Support/ConvertUTF.h
+++ b/include/llvm/Support/ConvertUTF.h
@@ -251,6 +251,14 @@ bool hasUTF16ByteOrderMark(ArrayRef<char> SrcBytes);
  */
 bool convertUTF16ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out);
 
+/**
+ * Converts a UTF-8 string into a UTF-16 string with native endianness.
+ *
+ * \returns true on success
+ */
+bool convertUTF8ToUTF16String(StringRef SrcUTF8,
+                              SmallVectorImpl<UTF16> &DstUTF16);
+
 } /* end namespace llvm */
 
 #endif
diff --git a/lib/Support/ConvertUTFWrapper.cpp b/lib/Support/ConvertUTFWrapper.cpp
index e45335ddcb6..8f77bff4668 100644
--- a/lib/Support/ConvertUTFWrapper.cpp
+++ b/lib/Support/ConvertUTFWrapper.cpp
@@ -127,5 +127,36 @@ bool convertUTF16ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out) {
   return true;
 }
 
+bool convertUTF8ToUTF16String(StringRef SrcUTF8,
+                              SmallVectorImpl<UTF16> &DstUTF16) {
+  assert(DstUTF16.empty());
+
+  // Avoid OOB by returning early on empty input.
+  if (SrcUTF8.empty())
+    return true;
+
+  const UTF8 *Src = reinterpret_cast<const UTF8 *>(SrcUTF8.begin());
+  const UTF8 *SrcEnd = reinterpret_cast<const UTF8 *>(SrcUTF8.end());
+
+  // Allocate the same number of UTF-16 code units as UTF-8 code units. Encoding
+  // as UTF-16 should always require the same amount or less code units than the
+  // UTF-8 encoding.
+  DstUTF16.resize(SrcUTF8.size());
+  UTF16 *Dst = &DstUTF16[0];
+  UTF16 *DstEnd = Dst + DstUTF16.size();
+
+  ConversionResult CR =
+      ConvertUTF8toUTF16(&Src, SrcEnd, &Dst, DstEnd, strictConversion);
+  assert(CR != targetExhausted);
+
+  if (CR != conversionOK) {
+    DstUTF16.clear();
+    return false;
+  }
+
+  DstUTF16.resize(Dst - &DstUTF16[0]);
+  return true;
+}
+
 } // end namespace llvm
 
diff --git a/unittests/Support/ConvertUTFTest.cpp b/unittests/Support/ConvertUTFTest.cpp
index 49748db4ae9..a6dbe4c475f 100644
--- a/unittests/Support/ConvertUTFTest.cpp
+++ b/unittests/Support/ConvertUTFTest.cpp
@@ -8,6 +8,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Support/ConvertUTF.h"
+#include "llvm/Support/Format.h"
 #include "gtest/gtest.h"
 #include
 #include
@@ -37,6 +38,19 @@ TEST(ConvertUTFTest, ConvertUTF16BigEndianToUTF8String) {
   EXPECT_EQ(Expected, Result);
 }
 
+TEST(ConvertUTFTest, ConvertUTF8ToUTF16String) {
+  // Src is the look of disapproval.
+  static const char Src[] = "\xe0\xb2\xa0_\xe0\xb2\xa0";
+  StringRef Ref(Src, sizeof(Src) - 1);
+  SmallVector Result;
+  bool Success = convertUTF8ToUTF16String(Ref, Result);
+  EXPECT_TRUE(Success);
+  static const UTF16 Expected[] = {0x0CA0, 0x005f, 0x0CA0, 0};
+  ASSERT_EQ(3, Result.size());
+  for (int I = 0, E = 3; I != E; ++I)
+    EXPECT_EQ(Expected[I], Result[I]);
+}
+
 TEST(ConvertUTFTest, OddLengthInput) {
   std::string Result;
   bool Success = convertUTF16ToUTF8String(makeArrayRef("xxxxx", 5), Result);