From 264e92d6db46083f9f46484ec39e99f18d35d370 Mon Sep 17 00:00:00 2001 From: Rui Ueyama Date: Tue, 30 Jul 2013 19:03:20 +0000 Subject: [PATCH] Implement TokenizeWindowsCommandLine. This is a follow-up patch for r187390 to implement the parser for the Windows-style command line. The parser should follow the rules described at http://msdn.microsoft.com/en-us/library/windows/desktop/17w5ykft(v=vs.85).aspx Differential Revision: http://llvm-reviews.chandlerc.com/D1235 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@187430 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Support/CommandLine.cpp | 102 +++++++++++++++++++++++++- unittests/Support/CommandLineTest.cpp | 37 +++++++--- 2 files changed, 129 insertions(+), 10 deletions(-) diff --git a/lib/Support/CommandLine.cpp b/lib/Support/CommandLine.cpp index 1975a013385..a47af2729f3 100644 --- a/lib/Support/CommandLine.cpp +++ b/lib/Support/CommandLine.cpp @@ -498,9 +498,109 @@ void cl::TokenizeGNUCommandLine(StringRef Src, StringSaver &Saver, NewArgv.push_back(Saver.SaveString(Token.c_str())); } +/// Backslashes are interpreted in a rather complicated way in the Windows-style +/// command line, because backslashes are used both to separate path components +/// and to escape double quotes. This function consumes runs of backslashes as +/// well as the following double quote if it's escaped. +/// +/// * If an even number of backslashes is followed by a double quote, one +/// backslash is output for every pair of backslashes, and the last double +/// quote remains unconsumed. The double quote will later be interpreted as +/// the start or end of a quoted string in the main loop outside of this +/// function. +/// +/// * If an odd number of backslashes is followed by a double quote, one +/// backslash is output for every pair of backslashes, and a double quote is +/// output for the final backslash-double-quote pair. The double quote is +/// consumed in this case. +/// +/// * Otherwise, backslashes are interpreted literally. 
+static size_t parseBackslash(StringRef Src, size_t I, SmallString<128> &Token) { + size_t E = Src.size(); + int BackslashCount = 0; + // Skip the backslashes. + do { + ++I; + ++BackslashCount; + } while (I != E && Src[I] == '\\'); + + bool FollowedByDoubleQuote = (I != E && Src[I] == '"'); + if (FollowedByDoubleQuote) { + Token.append(BackslashCount / 2, '\\'); + if (BackslashCount % 2 == 0) + return I - 1; + Token.push_back('"'); + return I; + } + Token.append(BackslashCount, '\\'); + return I - 1; +} + void cl::TokenizeWindowsCommandLine(StringRef Src, StringSaver &Saver, SmallVectorImpl &NewArgv) { - llvm_unreachable("FIXME not implemented"); + SmallString<128> Token; + + // This is a small state machine to consume characters until it reaches the + // end of the source string. + enum { INIT, UNQUOTED, QUOTED } State = INIT; + for (size_t I = 0, E = Src.size(); I != E; ++I) { + // INIT state indicates that the current input index is at the start of + // the string or between tokens. + if (State == INIT) { + if (isWhitespace(Src[I])) + continue; + if (Src[I] == '"') { + State = QUOTED; + continue; + } + if (Src[I] == '\\') { + I = parseBackslash(Src, I, Token); + State = UNQUOTED; + continue; + } + Token.push_back(Src[I]); + State = UNQUOTED; + continue; + } + + // UNQUOTED state means that it's reading a token not quoted by double + // quotes. + if (State == UNQUOTED) { + // Whitespace means the end of the token. + if (isWhitespace(Src[I])) { + NewArgv.push_back(Saver.SaveString(Token.c_str())); + Token.clear(); + State = INIT; + continue; + } + if (Src[I] == '"') { + State = QUOTED; + continue; + } + if (Src[I] == '\\') { + I = parseBackslash(Src, I, Token); + continue; + } + Token.push_back(Src[I]); + continue; + } + + // QUOTED state means that it's reading a token quoted by double quotes. 
+ if (State == QUOTED) { + if (Src[I] == '"') { + State = UNQUOTED; + continue; + } + if (Src[I] == '\\') { + I = parseBackslash(Src, I, Token); + continue; + } + Token.push_back(Src[I]); + } + } + // Append the last token after hitting EOF with no whitespace. + if (!Token.empty()) + NewArgv.push_back(Saver.SaveString(Token.c_str())); } static bool ExpandResponseFile(const char *FName, StringSaver &Saver, diff --git a/unittests/Support/CommandLineTest.cpp b/unittests/Support/CommandLineTest.cpp index 7a1c3821d7d..c54e1b9570f 100644 --- a/unittests/Support/CommandLineTest.cpp +++ b/unittests/Support/CommandLineTest.cpp @@ -125,21 +125,40 @@ class StrDupSaver : public cl::StringSaver { } }; +typedef void ParserFunction(StringRef Source, llvm::cl::StringSaver &Saver, + SmallVectorImpl &NewArgv); + + +void testCommandLineTokenizer(ParserFunction *parse, const char *Input, + const char *const Output[], size_t OutputSize) { + SmallVector Actual; + StrDupSaver Saver; + parse(Input, Saver, Actual); + EXPECT_EQ(OutputSize, Actual.size()); + for (unsigned I = 0, E = Actual.size(); I != E; ++I) { + if (I < OutputSize) + EXPECT_STREQ(Output[I], Actual[I]); + free(const_cast(Actual[I])); + } +} + TEST(CommandLineTest, TokenizeGNUCommandLine) { const char *Input = "foo\\ bar \"foo bar\" \'foo bar\' 'foo\\\\bar' " "foo\"bar\"baz C:\\src\\foo.cpp \"C:\\src\\foo.cpp\""; const char *const Output[] = { "foo bar", "foo bar", "foo bar", "foo\\bar", "foobarbaz", "C:\\src\\foo.cpp", "C:\\src\\foo.cpp" }; - SmallVector Actual; - StrDupSaver Saver; - cl::TokenizeGNUCommandLine(Input, Saver, Actual); - EXPECT_EQ(array_lengthof(Output), Actual.size()); - for (unsigned I = 0, E = Actual.size(); I != E; ++I) { - if (I < array_lengthof(Output)) - EXPECT_STREQ(Output[I], Actual[I]); - free(const_cast(Actual[I])); - } + testCommandLineTokenizer(cl::TokenizeGNUCommandLine, Input, Output, + array_lengthof(Output)); +} + +TEST(CommandLineTest, TokenizeWindowsCommandLine) { + const char *Input = 
"a\\b c\\\\d e\\\\\"f g\" h\\\"i j\\\\\\\"k \"lmn\" o pqr " + "\"st \\\"u\" \\v"; + const char *const Output[] = { "a\\b", "c\\\\d", "e\\f g", "h\"i", "j\\\"k", + "lmn", "o", "pqr", "st \"u", "\\v" }; + testCommandLineTokenizer(cl::TokenizeWindowsCommandLine, Input, Output, + array_lengthof(Output)); } } // anonymous namespace