diff --git a/include/llvm/Support/ConvertUTF.h b/include/llvm/Support/ConvertUTF.h index a184d0df213..38952ec99e6 100644 --- a/include/llvm/Support/ConvertUTF.h +++ b/include/llvm/Support/ConvertUTF.h @@ -251,6 +251,14 @@ bool hasUTF16ByteOrderMark(ArrayRef SrcBytes); */ bool convertUTF16ToUTF8String(ArrayRef SrcBytes, std::string &Out); +/** + * Converts a UTF-8 string into a UTF-16 string with native endianness. + * + * \returns true on success + */ +bool convertUTF8ToUTF16String(StringRef SrcUTF8, + SmallVectorImpl &DstUTF16); + } /* end namespace llvm */ #endif diff --git a/lib/Support/ConvertUTFWrapper.cpp b/lib/Support/ConvertUTFWrapper.cpp index e45335ddcb6..8f77bff4668 100644 --- a/lib/Support/ConvertUTFWrapper.cpp +++ b/lib/Support/ConvertUTFWrapper.cpp @@ -127,5 +127,36 @@ bool convertUTF16ToUTF8String(ArrayRef SrcBytes, std::string &Out) { return true; } +bool convertUTF8ToUTF16String(StringRef SrcUTF8, + SmallVectorImpl &DstUTF16) { + assert(DstUTF16.empty()); + + // Avoid OOB by returning early on empty input. + if (SrcUTF8.empty()) + return true; + + const UTF8 *Src = reinterpret_cast(SrcUTF8.begin()); + const UTF8 *SrcEnd = reinterpret_cast(SrcUTF8.end()); + + // Allocate the same number of UTF-16 code units as UTF-8 code units. Encoding + // as UTF-16 should always require the same amount or less code units than the + // UTF-8 encoding. + DstUTF16.resize(SrcUTF8.size()); + UTF16 *Dst = &DstUTF16[0]; + UTF16 *DstEnd = Dst + DstUTF16.size(); + + ConversionResult CR = + ConvertUTF8toUTF16(&Src, SrcEnd, &Dst, DstEnd, strictConversion); + assert(CR != targetExhausted); + + if (CR != conversionOK) { + DstUTF16.clear(); + return false; + } + + DstUTF16.resize(Dst - &DstUTF16[0]); + return true; +} + } // end namespace llvm diff --git a/unittests/Support/ConvertUTFTest.cpp b/unittests/Support/ConvertUTFTest.cpp index 49748db4ae9..a6dbe4c475f 100644 --- a/unittests/Support/ConvertUTFTest.cpp +++ b/unittests/Support/ConvertUTFTest.cpp @@ -8,6 +8,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Support/ConvertUTF.h" +#include "llvm/Support/Format.h" #include "gtest/gtest.h" #include #include @@ -37,6 +38,19 @@ TEST(ConvertUTFTest, ConvertUTF16BigEndianToUTF8String) { EXPECT_EQ(Expected, Result); } +TEST(ConvertUTFTest, ConvertUTF8ToUTF16String) { + // Src is the look of disapproval. + static const char Src[] = "\xe0\xb2\xa0_\xe0\xb2\xa0"; + StringRef Ref(Src, sizeof(Src) - 1); + SmallVector Result; + bool Success = convertUTF8ToUTF16String(Ref, Result); + EXPECT_TRUE(Success); + static const UTF16 Expected[] = {0x0CA0, 0x005f, 0x0CA0, 0}; + ASSERT_EQ(3, Result.size()); + for (int I = 0, E = 3; I != E; ++I) + EXPECT_EQ(Expected[I], Result[I]); +} + TEST(ConvertUTFTest, OddLengthInput) { std::string Result; bool Success = convertUTF16ToUTF8String(makeArrayRef("xxxxx", 5), Result);