diff --git a/include/llvm/Support/Regex.h b/include/llvm/Support/Regex.h index ffe09b19b68..82df2c67bd0 100644 --- a/include/llvm/Support/Regex.h +++ b/include/llvm/Support/Regex.h @@ -7,7 +7,10 @@ // //===----------------------------------------------------------------------===// // -// This file implements a POSIX regular expression matcher. +// This file implements a POSIX regular expression matcher. Both Basic and +// Extended POSIX regular expressions (ERE) are supported. EREs were extended +// to support backreferences in matches. +// This implementation also supports matching strings with embedded NUL chars. // //===----------------------------------------------------------------------===// @@ -33,12 +36,14 @@ namespace llvm { /// null string after any newline in the string in addition to its normal /// function, and the $ anchor matches the null string before any /// newline in the string in addition to its normal function. - Newline=2 + Newline=2, + /// By default, the POSIX extended regular expression (ERE) syntax is + /// assumed. Pass this flag to turn on basic regular expressions (BRE) + /// instead. + BasicRegex=4 }; - /// Compiles the given POSIX Extended Regular Expression \p Regex. - /// This implementation supports regexes and matching strings with embedded - /// NUL characters. + /// Compiles the given regular expression \p Regex. Regex(StringRef Regex, unsigned Flags = NoFlags); ~Regex(); diff --git a/lib/Support/Regex.cpp b/lib/Support/Regex.cpp index d293da07d68..0a5479a548f 100644 --- a/lib/Support/Regex.cpp +++ b/lib/Support/Regex.cpp @@ -27,7 +27,9 @@ Regex::Regex(StringRef regex, unsigned Flags) { flags |= REG_ICASE; if (Flags & Newline) flags |= REG_NEWLINE; - error = llvm_regcomp(preg, regex.data(), flags|REG_EXTENDED|REG_PEND); + if (!(Flags & BasicRegex)) + flags |= REG_EXTENDED; + error = llvm_regcomp(preg, regex.data(), flags|REG_PEND); } Regex::~Regex() { diff --git a/lib/Support/regcomp.c b/lib/Support/regcomp.c index 46c91a9c497..74d9186aaaa 100644 --- a/lib/Support/regcomp.c +++ b/lib/Support/regcomp.c @@ -303,6 +303,7 @@ p_ere_exp(struct parse *p) sopno pos; int count; int count2; + int backrefnum; sopno subno; int wascaret = 0; @@ -370,7 +371,34 @@ p_ere_exp(struct parse *p) case '\\': REQUIRE(MORE(), REG_EESCAPE); c = GETNEXT(); - ordinary(p, c); + if (c >= '1' && c <= '9') { + /* \[0-9] is taken to be a back-reference to a previously specified + * matching group. backrefnum will hold the number. The matching + * group must exist (i.e. if \4 is found there must have been at + * least 4 matching groups specified in the pattern previously). + */ + backrefnum = c - '0'; + if (p->pend[backrefnum] == 0) { + SETERROR(REG_ESUBREG); + break; + } + + /* Make sure everything checks out and emit the sequence + * that marks a back-reference to the parse structure. + */ + assert(backrefnum <= p->g->nsub); + EMIT(OBACK_, backrefnum); + assert(p->pbegin[backrefnum] != 0); + assert(OP(p->strip[p->pbegin[backrefnum]]) != OLPAREN); + assert(OP(p->strip[p->pend[backrefnum]]) != ORPAREN); + (void) dupl(p, p->pbegin[backrefnum]+1, p->pend[backrefnum]); + EMIT(O_BACK, backrefnum); + p->g->backrefs = 1; + } else { + /* Other chars are simply themselves when escaped with a backslash. + */ + ordinary(p, c); + } break; case '{': /* okay as ordinary except if digit follows */ REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT); diff --git a/unittests/Support/RegexTest.cpp b/unittests/Support/RegexTest.cpp index 65b66c3eee8..38d95955f89 100644 --- a/unittests/Support/RegexTest.cpp +++ b/unittests/Support/RegexTest.cpp @@ -51,7 +51,6 @@ TEST_F(RegexTest, Basics) { EXPECT_EQ(1u, Matches.size()); EXPECT_EQ(String, Matches[0].str()); - std::string NulPattern="X[0-9]+X([a-f])?:([0-9]+)"; String="YX99a:513b"; NulPattern[7] = '\0'; @@ -62,6 +61,28 @@ TEST_F(RegexTest, Basics) { EXPECT_TRUE(r5.match(String)); } +TEST_F(RegexTest, Backreferences) { + Regex r1("([a-z]+)_\\1"); + SmallVector Matches; + EXPECT_TRUE(r1.match("abc_abc", &Matches)); + EXPECT_EQ(2u, Matches.size()); + EXPECT_FALSE(r1.match("abc_ab", &Matches)); + + Regex r2("a([0-9])b\\1c\\1"); + EXPECT_TRUE(r2.match("a4b4c4", &Matches)); + EXPECT_EQ(2u, Matches.size()); + EXPECT_EQ("4", Matches[1].str()); + EXPECT_FALSE(r2.match("a2b2c3")); + + Regex r3("a([0-9])([a-z])b\\1\\2"); + EXPECT_TRUE(r3.match("a6zb6z", &Matches)); + EXPECT_EQ(3u, Matches.size()); + EXPECT_EQ("6", Matches[1].str()); + EXPECT_EQ("z", Matches[2].str()); + EXPECT_FALSE(r3.match("a6zb6y")); + EXPECT_FALSE(r3.match("a6zb7z")); +} + TEST_F(RegexTest, Substitution) { std::string Error;