mirror of
https://github.com/c64scene-ar/llvm-6502.git
synced 2025-01-14 00:32:55 +00:00
Add regular expression matching support, based on OpenBSD regexec()/regcomp()
implementation. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@80493 91177308-0d34-0410-b5e6-96231b3b80d8
This commit is contained in:
parent
7438106207
commit
ce0c81e7dd
@ -66,3 +66,4 @@ Autoconf llvm/autoconf
|
||||
llvm/projects/sample/autoconf
|
||||
CellSPU backend llvm/lib/Target/CellSPU/README.txt
|
||||
Google Test llvm/utils/unittest/googletest
|
||||
OpenBSD regex llvm/lib/Support/{reg*, COPYRIGHT.regex}
|
||||
|
756
docs/re_format.7
Normal file
756
docs/re_format.7
Normal file
@ -0,0 +1,756 @@
|
||||
.\" $OpenBSD: re_format.7,v 1.14 2007/05/31 19:19:30 jmc Exp $
|
||||
.\"
|
||||
.\" Copyright (c) 1997, Phillip F Knaack. All rights reserved.
|
||||
.\"
|
||||
.\" Copyright (c) 1992, 1993, 1994 Henry Spencer.
|
||||
.\" Copyright (c) 1992, 1993, 1994
|
||||
.\" The Regents of the University of California. All rights reserved.
|
||||
.\"
|
||||
.\" This code is derived from software contributed to Berkeley by
|
||||
.\" Henry Spencer.
|
||||
.\"
|
||||
.\" Redistribution and use in source and binary forms, with or without
|
||||
.\" modification, are permitted provided that the following conditions
|
||||
.\" are met:
|
||||
.\" 1. Redistributions of source code must retain the above copyright
|
||||
.\" notice, this list of conditions and the following disclaimer.
|
||||
.\" 2. Redistributions in binary form must reproduce the above copyright
|
||||
.\" notice, this list of conditions and the following disclaimer in the
|
||||
.\" documentation and/or other materials provided with the distribution.
|
||||
.\" 3. Neither the name of the University nor the names of its contributors
|
||||
.\" may be used to endorse or promote products derived from this software
|
||||
.\" without specific prior written permission.
|
||||
.\"
|
||||
.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
|
||||
.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
|
||||
.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
.\" SUCH DAMAGE.
|
||||
.\"
|
||||
.\" @(#)re_format.7 8.3 (Berkeley) 3/20/94
|
||||
.\"
|
||||
.Dd $Mdocdate: May 31 2007 $
|
||||
.Dt RE_FORMAT 7
|
||||
.Os
|
||||
.Sh NAME
|
||||
.Nm re_format
|
||||
.Nd POSIX regular expressions
|
||||
.Sh DESCRIPTION
|
||||
Regular expressions (REs),
|
||||
as defined in
|
||||
.St -p1003.1-2004 ,
|
||||
come in two forms:
|
||||
basic regular expressions
|
||||
(BREs)
|
||||
and extended regular expressions
|
||||
(EREs).
|
||||
Both forms of regular expressions are supported
|
||||
by the interfaces described in
|
||||
.Xr regex 3 .
|
||||
Applications dealing with regular expressions
|
||||
may use one or the other form
|
||||
(or indeed both).
|
||||
For example,
|
||||
.Xr ed 1
|
||||
uses BREs,
|
||||
whilst
|
||||
.Xr egrep 1
|
||||
talks EREs.
|
||||
Consult the manual page for the specific application to find out which
|
||||
it uses.
|
||||
.Pp
|
||||
POSIX leaves some aspects of RE syntax and semantics open;
|
||||
.Sq **
|
||||
marks decisions on these aspects that
|
||||
may not be fully portable to other POSIX implementations.
|
||||
.Pp
|
||||
This manual page first describes regular expressions in general,
|
||||
specifically extended regular expressions,
|
||||
and then discusses differences between them and basic regular expressions.
|
||||
.Sh EXTENDED REGULAR EXPRESSIONS
|
||||
An ERE is one** or more non-empty**
|
||||
.Em branches ,
|
||||
separated by
|
||||
.Sq \*(Ba .
|
||||
It matches anything that matches one of the branches.
|
||||
.Pp
|
||||
A branch is one** or more
|
||||
.Em pieces ,
|
||||
concatenated.
|
||||
It matches a match for the first, followed by a match for the second, etc.
|
||||
.Pp
|
||||
A piece is an
|
||||
.Em atom
|
||||
possibly followed by a single**
|
||||
.Sq * ,
|
||||
.Sq + ,
|
||||
.Sq ?\& ,
|
||||
or
|
||||
.Em bound .
|
||||
An atom followed by
|
||||
.Sq *
|
||||
matches a sequence of 0 or more matches of the atom.
|
||||
An atom followed by
|
||||
.Sq +
|
||||
matches a sequence of 1 or more matches of the atom.
|
||||
An atom followed by
|
||||
.Sq ?\&
|
||||
matches a sequence of 0 or 1 matches of the atom.
|
||||
.Pp
|
||||
A bound is
|
||||
.Sq {
|
||||
followed by an unsigned decimal integer,
|
||||
possibly followed by
|
||||
.Sq ,\&
|
||||
possibly followed by another unsigned decimal integer,
|
||||
always followed by
|
||||
.Sq } .
|
||||
The integers must lie between 0 and
|
||||
.Dv RE_DUP_MAX
|
||||
(255**) inclusive,
|
||||
and if there are two of them, the first may not exceed the second.
|
||||
An atom followed by a bound containing one integer
|
||||
.Ar i
|
||||
and no comma matches
|
||||
a sequence of exactly
|
||||
.Ar i
|
||||
matches of the atom.
|
||||
An atom followed by a bound
|
||||
containing one integer
|
||||
.Ar i
|
||||
and a comma matches
|
||||
a sequence of
|
||||
.Ar i
|
||||
or more matches of the atom.
|
||||
An atom followed by a bound
|
||||
containing two integers
|
||||
.Ar i
|
||||
and
|
||||
.Ar j
|
||||
matches a sequence of
|
||||
.Ar i
|
||||
through
|
||||
.Ar j
|
||||
(inclusive) matches of the atom.
|
||||
.Pp
|
||||
An atom is a regular expression enclosed in
|
||||
.Sq ()
|
||||
(matching a part of the regular expression),
|
||||
an empty set of
|
||||
.Sq ()
|
||||
(matching the null string)**,
|
||||
a
|
||||
.Em bracket expression
|
||||
(see below),
|
||||
.Sq .\&
|
||||
(matching any single character),
|
||||
.Sq ^
|
||||
(matching the null string at the beginning of a line),
|
||||
.Sq $
|
||||
(matching the null string at the end of a line),
|
||||
a
|
||||
.Sq \e
|
||||
followed by one of the characters
|
||||
.Sq ^.[$()|*+?{\e
|
||||
(matching that character taken as an ordinary character),
|
||||
a
|
||||
.Sq \e
|
||||
followed by any other character**
|
||||
(matching that character taken as an ordinary character,
|
||||
as if the
|
||||
.Sq \e
|
||||
had not been present**),
|
||||
or a single character with no other significance (matching that character).
|
||||
A
|
||||
.Sq {
|
||||
followed by a character other than a digit is an ordinary character,
|
||||
not the beginning of a bound**.
|
||||
It is illegal to end an RE with
|
||||
.Sq \e .
|
||||
.Pp
|
||||
A bracket expression is a list of characters enclosed in
|
||||
.Sq [] .
|
||||
It normally matches any single character from the list (but see below).
|
||||
If the list begins with
|
||||
.Sq ^ ,
|
||||
it matches any single character
|
||||
.Em not
|
||||
from the rest of the list
|
||||
(but see below).
|
||||
If two characters in the list are separated by
|
||||
.Sq - ,
|
||||
this is shorthand for the full
|
||||
.Em range
|
||||
of characters between those two (inclusive) in the
|
||||
collating sequence, e.g.\&
|
||||
.Sq [0-9]
|
||||
in ASCII matches any decimal digit.
|
||||
It is illegal** for two ranges to share an endpoint, e.g.\&
|
||||
.Sq a-c-e .
|
||||
Ranges are very collating-sequence-dependent,
|
||||
and portable programs should avoid relying on them.
|
||||
.Pp
|
||||
To include a literal
|
||||
.Sq ]\&
|
||||
in the list, make it the first character
|
||||
(following a possible
|
||||
.Sq ^ ) .
|
||||
To include a literal
|
||||
.Sq - ,
|
||||
make it the first or last character,
|
||||
or the second endpoint of a range.
|
||||
To use a literal
|
||||
.Sq -
|
||||
as the first endpoint of a range,
|
||||
enclose it in
|
||||
.Sq [.
|
||||
and
|
||||
.Sq .]
|
||||
to make it a collating element (see below).
|
||||
With the exception of these and some combinations using
|
||||
.Sq [
|
||||
(see next paragraphs),
|
||||
all other special characters, including
|
||||
.Sq \e ,
|
||||
lose their special significance within a bracket expression.
|
||||
.Pp
|
||||
Within a bracket expression, a collating element
|
||||
(a character,
|
||||
a multi-character sequence that collates as if it were a single character,
|
||||
or a collating-sequence name for either)
|
||||
enclosed in
|
||||
.Sq [.
|
||||
and
|
||||
.Sq .]
|
||||
stands for the sequence of characters of that collating element.
|
||||
The sequence is a single element of the bracket expression's list.
|
||||
A bracket expression containing a multi-character collating element
|
||||
can thus match more than one character,
|
||||
e.g. if the collating sequence includes a
|
||||
.Sq ch
|
||||
collating element,
|
||||
then the RE
|
||||
.Sq [[.ch.]]*c
|
||||
matches the first five characters of
|
||||
.Sq chchcc .
|
||||
.Pp
|
||||
Within a bracket expression, a collating element enclosed in
|
||||
.Sq [=
|
||||
and
|
||||
.Sq =]
|
||||
is an equivalence class, standing for the sequences of characters
|
||||
of all collating elements equivalent to that one, including itself.
|
||||
(If there are no other equivalent collating elements,
|
||||
the treatment is as if the enclosing delimiters were
|
||||
.Sq [.
|
||||
and
|
||||
.Sq .] . )
|
||||
For example, if
|
||||
.Sq x
|
||||
and
|
||||
.Sq y
|
||||
are the members of an equivalence class,
|
||||
then
|
||||
.Sq [[=x=]] ,
|
||||
.Sq [[=y=]] ,
|
||||
and
|
||||
.Sq [xy]
|
||||
are all synonymous.
|
||||
An equivalence class may not** be an endpoint of a range.
|
||||
.Pp
|
||||
Within a bracket expression, the name of a
|
||||
.Em character class
|
||||
enclosed
|
||||
in
|
||||
.Sq [:
|
||||
and
|
||||
.Sq :]
|
||||
stands for the list of all characters belonging to that class.
|
||||
Standard character class names are:
|
||||
.Bd -literal -offset indent
|
||||
alnum digit punct
|
||||
alpha graph space
|
||||
blank lower upper
|
||||
cntrl print xdigit
|
||||
.Ed
|
||||
.Pp
|
||||
These stand for the character classes defined in
|
||||
.Xr ctype 3 .
|
||||
A locale may provide others.
|
||||
A character class may not be used as an endpoint of a range.
|
||||
.Pp
|
||||
There are two special cases** of bracket expressions:
|
||||
the bracket expressions
|
||||
.Sq [[:<:]]
|
||||
and
|
||||
.Sq [[:>:]]
|
||||
match the null string at the beginning and end of a word, respectively.
|
||||
A word is defined as a sequence of
|
||||
characters starting and ending with a word character
|
||||
which is neither preceded nor followed by
|
||||
word characters.
|
||||
A word character is an
|
||||
.Em alnum
|
||||
character (as defined by
|
||||
.Xr ctype 3 )
|
||||
or an underscore.
|
||||
This is an extension,
|
||||
compatible with but not specified by POSIX,
|
||||
and should be used with
|
||||
caution in software intended to be portable to other systems.
|
||||
.Pp
|
||||
In the event that an RE could match more than one substring of a given
|
||||
string,
|
||||
the RE matches the one starting earliest in the string.
|
||||
If the RE could match more than one substring starting at that point,
|
||||
it matches the longest.
|
||||
Subexpressions also match the longest possible substrings, subject to
|
||||
the constraint that the whole match be as long as possible,
|
||||
with subexpressions starting earlier in the RE taking priority over
|
||||
ones starting later.
|
||||
Note that higher-level subexpressions thus take priority over
|
||||
their lower-level component subexpressions.
|
||||
.Pp
|
||||
Match lengths are measured in characters, not collating elements.
|
||||
A null string is considered longer than no match at all.
|
||||
For example,
|
||||
.Sq bb*
|
||||
matches the three middle characters of
|
||||
.Sq abbbc ;
|
||||
.Sq (wee|week)(knights|nights)
|
||||
matches all ten characters of
|
||||
.Sq weeknights ;
|
||||
when
|
||||
.Sq (.*).*
|
||||
is matched against
|
||||
.Sq abc ,
|
||||
the parenthesized subexpression matches all three characters;
|
||||
and when
|
||||
.Sq (a*)*
|
||||
is matched against
|
||||
.Sq bc ,
|
||||
both the whole RE and the parenthesized subexpression match the null string.
|
||||
.Pp
|
||||
If case-independent matching is specified,
|
||||
the effect is much as if all case distinctions had vanished from the
|
||||
alphabet.
|
||||
When an alphabetic that exists in multiple cases appears as an
|
||||
ordinary character outside a bracket expression, it is effectively
|
||||
transformed into a bracket expression containing both cases,
|
||||
e.g.\&
|
||||
.Sq x
|
||||
becomes
|
||||
.Sq [xX] .
|
||||
When it appears inside a bracket expression,
|
||||
all case counterparts of it are added to the bracket expression,
|
||||
so that, for example,
|
||||
.Sq [x]
|
||||
becomes
|
||||
.Sq [xX]
|
||||
and
|
||||
.Sq [^x]
|
||||
becomes
|
||||
.Sq [^xX] .
|
||||
.Pp
|
||||
No particular limit is imposed on the length of REs**.
|
||||
Programs intended to be portable should not employ REs longer
|
||||
than 256 bytes,
|
||||
as an implementation can refuse to accept such REs and remain
|
||||
POSIX-compliant.
|
||||
.Pp
|
||||
The following is a list of extended regular expressions:
|
||||
.Bl -tag -width Ds
|
||||
.It Ar c
|
||||
Any character
|
||||
.Ar c
|
||||
not listed below matches itself.
|
||||
.It \e Ns Ar c
|
||||
Any backslash-escaped character
|
||||
.Ar c
|
||||
matches itself.
|
||||
.It \&.
|
||||
Matches any single character that is not a newline
|
||||
.Pq Sq \en .
|
||||
.It Bq Ar char-class
|
||||
Matches any single character in
|
||||
.Ar char-class .
|
||||
To include a
|
||||
.Ql \&]
|
||||
in
|
||||
.Ar char-class ,
|
||||
it must be the first character.
|
||||
A range of characters may be specified by separating the end characters
|
||||
of the range with a
|
||||
.Ql - ;
|
||||
e.g.\&
|
||||
.Ar a-z
|
||||
specifies the lower case characters.
|
||||
The following literal expressions can also be used in
|
||||
.Ar char-class
|
||||
to specify sets of characters:
|
||||
.Bd -unfilled -offset indent
|
||||
[:alnum:] [:cntrl:] [:lower:] [:space:]
|
||||
[:alpha:] [:digit:] [:print:] [:upper:]
|
||||
[:blank:] [:graph:] [:punct:] [:xdigit:]
|
||||
.Ed
|
||||
.Pp
|
||||
If
|
||||
.Ql -
|
||||
appears as the first or last character of
|
||||
.Ar char-class ,
|
||||
then it matches itself.
|
||||
All other characters in
|
||||
.Ar char-class
|
||||
match themselves.
|
||||
.Pp
|
||||
Patterns in
|
||||
.Ar char-class
|
||||
of the form
|
||||
.Eo [.
|
||||
.Ar col-elm
|
||||
.Ec .]\&
|
||||
or
|
||||
.Eo [=
|
||||
.Ar col-elm
|
||||
.Ec =]\& ,
|
||||
where
|
||||
.Ar col-elm
|
||||
is a collating element, are interpreted according to
|
||||
.Xr setlocale 3
|
||||
.Pq not currently supported .
|
||||
.It Bq ^ Ns Ar char-class
|
||||
Matches any single character, other than newline, not in
|
||||
.Ar char-class .
|
||||
.Ar char-class
|
||||
is defined as above.
|
||||
.It ^
|
||||
If
|
||||
.Sq ^
|
||||
is the first character of a regular expression, then it
|
||||
anchors the regular expression to the beginning of a line.
|
||||
Otherwise, it matches itself.
|
||||
.It $
|
||||
If
|
||||
.Sq $
|
||||
is the last character of a regular expression,
|
||||
it anchors the regular expression to the end of a line.
|
||||
Otherwise, it matches itself.
|
||||
.It [[:<:]]
|
||||
Anchors the single character regular expression or subexpression
|
||||
immediately following it to the beginning of a word.
|
||||
.It [[:>:]]
|
||||
Anchors the single character regular expression or subexpression
|
||||
immediately following it to the end of a word.
|
||||
.It Pq Ar re
|
||||
Defines a subexpression
|
||||
.Ar re .
|
||||
Any set of characters enclosed in parentheses
|
||||
matches whatever the set of characters without parentheses matches
|
||||
(that is a long-winded way of saying the constructs
|
||||
.Sq (re)
|
||||
and
|
||||
.Sq re
|
||||
match identically).
|
||||
.It *
|
||||
Matches the single character regular expression or subexpression
|
||||
immediately preceding it zero or more times.
|
||||
If
|
||||
.Sq *
|
||||
is the first character of a regular expression or subexpression,
|
||||
then it matches itself.
|
||||
The
|
||||
.Sq *
|
||||
operator sometimes yields unexpected results.
|
||||
For example, the regular expression
|
||||
.Ar b*
|
||||
matches the beginning of the string
|
||||
.Qq abbb
|
||||
(as opposed to the substring
|
||||
.Qq bbb ) ,
|
||||
since a null match is the only leftmost match.
|
||||
.It +
|
||||
Matches the single character regular expression
|
||||
or subexpression immediately preceding it
|
||||
one or more times.
|
||||
.It ?
|
||||
Matches the single character regular expression
|
||||
or subexpression immediately preceding it
|
||||
0 or 1 times.
|
||||
.Sm off
|
||||
.It Xo
|
||||
.Pf { Ar n , m No }\ \&
|
||||
.Pf { Ar n , No }\ \&
|
||||
.Pf { Ar n No }
|
||||
.Xc
|
||||
.Sm on
|
||||
Matches the single character regular expression or subexpression
|
||||
immediately preceding it at least
|
||||
.Ar n
|
||||
and at most
|
||||
.Ar m
|
||||
times.
|
||||
If
|
||||
.Ar m
|
||||
is omitted, then it matches at least
|
||||
.Ar n
|
||||
times.
|
||||
If the comma is also omitted, then it matches exactly
|
||||
.Ar n
|
||||
times.
|
||||
.It \*(Ba
|
||||
Used to separate patterns.
|
||||
For example,
|
||||
the pattern
|
||||
.Sq cat\*(Badog
|
||||
matches either
|
||||
.Sq cat
|
||||
or
|
||||
.Sq dog .
|
||||
.El
|
||||
.Sh BASIC REGULAR EXPRESSIONS
|
||||
Basic regular expressions differ in several respects:
|
||||
.Bl -bullet -offset 3n
|
||||
.It
|
||||
.Sq \*(Ba ,
|
||||
.Sq + ,
|
||||
and
|
||||
.Sq ?\&
|
||||
are ordinary characters and there is no equivalent
|
||||
for their functionality.
|
||||
.It
|
||||
The delimiters for bounds are
|
||||
.Sq \e{
|
||||
and
|
||||
.Sq \e} ,
|
||||
with
|
||||
.Sq {
|
||||
and
|
||||
.Sq }
|
||||
by themselves ordinary characters.
|
||||
.It
|
||||
The parentheses for nested subexpressions are
|
||||
.Sq \e(
|
||||
and
|
||||
.Sq \e) ,
|
||||
with
|
||||
.Sq (
|
||||
and
|
||||
.Sq )\&
|
||||
by themselves ordinary characters.
|
||||
.It
|
||||
.Sq ^
|
||||
is an ordinary character except at the beginning of the
|
||||
RE or** the beginning of a parenthesized subexpression.
|
||||
.It
|
||||
.Sq $
|
||||
is an ordinary character except at the end of the
|
||||
RE or** the end of a parenthesized subexpression.
|
||||
.It
|
||||
.Sq *
|
||||
is an ordinary character if it appears at the beginning of the
|
||||
RE or the beginning of a parenthesized subexpression
|
||||
(after a possible leading
|
||||
.Sq ^ ) .
|
||||
.It
|
||||
Finally, there is one new type of atom, a
|
||||
.Em back-reference :
|
||||
.Sq \e
|
||||
followed by a non-zero decimal digit
|
||||
.Ar d
|
||||
matches the same sequence of characters matched by the
|
||||
.Ar d Ns th
|
||||
parenthesized subexpression
|
||||
(numbering subexpressions by the positions of their opening parentheses,
|
||||
left to right),
|
||||
so that, for example,
|
||||
.Sq \e([bc]\e)\e1
|
||||
matches
|
||||
.Sq bb\&
|
||||
or
|
||||
.Sq cc
|
||||
but not
|
||||
.Sq bc .
|
||||
.El
|
||||
.Pp
|
||||
The following is a list of basic regular expressions:
|
||||
.Bl -tag -width Ds
|
||||
.It Ar c
|
||||
Any character
|
||||
.Ar c
|
||||
not listed below matches itself.
|
||||
.It \e Ns Ar c
|
||||
Any backslash-escaped character
|
||||
.Ar c ,
|
||||
except for
|
||||
.Sq { ,
|
||||
.Sq } ,
|
||||
.Sq \&( ,
|
||||
and
|
||||
.Sq \&) ,
|
||||
matches itself.
|
||||
.It \&.
|
||||
Matches any single character that is not a newline
|
||||
.Pq Sq \en .
|
||||
.It Bq Ar char-class
|
||||
Matches any single character in
|
||||
.Ar char-class .
|
||||
To include a
|
||||
.Ql \&]
|
||||
in
|
||||
.Ar char-class ,
|
||||
it must be the first character.
|
||||
A range of characters may be specified by separating the end characters
|
||||
of the range with a
|
||||
.Ql - ;
|
||||
e.g.\&
|
||||
.Ar a-z
|
||||
specifies the lower case characters.
|
||||
The following literal expressions can also be used in
|
||||
.Ar char-class
|
||||
to specify sets of characters:
|
||||
.Bd -unfilled -offset indent
|
||||
[:alnum:] [:cntrl:] [:lower:] [:space:]
|
||||
[:alpha:] [:digit:] [:print:] [:upper:]
|
||||
[:blank:] [:graph:] [:punct:] [:xdigit:]
|
||||
.Ed
|
||||
.Pp
|
||||
If
|
||||
.Ql -
|
||||
appears as the first or last character of
|
||||
.Ar char-class ,
|
||||
then it matches itself.
|
||||
All other characters in
|
||||
.Ar char-class
|
||||
match themselves.
|
||||
.Pp
|
||||
Patterns in
|
||||
.Ar char-class
|
||||
of the form
|
||||
.Eo [.
|
||||
.Ar col-elm
|
||||
.Ec .]\&
|
||||
or
|
||||
.Eo [=
|
||||
.Ar col-elm
|
||||
.Ec =]\& ,
|
||||
where
|
||||
.Ar col-elm
|
||||
is a collating element, are interpreted according to
|
||||
.Xr setlocale 3
|
||||
.Pq not currently supported .
|
||||
.It Bq ^ Ns Ar char-class
|
||||
Matches any single character, other than newline, not in
|
||||
.Ar char-class .
|
||||
.Ar char-class
|
||||
is defined as above.
|
||||
.It ^
|
||||
If
|
||||
.Sq ^
|
||||
is the first character of a regular expression, then it
|
||||
anchors the regular expression to the beginning of a line.
|
||||
Otherwise, it matches itself.
|
||||
.It $
|
||||
If
|
||||
.Sq $
|
||||
is the last character of a regular expression,
|
||||
it anchors the regular expression to the end of a line.
|
||||
Otherwise, it matches itself.
|
||||
.It [[:<:]]
|
||||
Anchors the single character regular expression or subexpression
|
||||
immediately following it to the beginning of a word.
|
||||
.It [[:>:]]
|
||||
Anchors the single character regular expression or subexpression
|
||||
immediately following it to the end of a word.
|
||||
.It \e( Ns Ar re Ns \e)
|
||||
Defines a subexpression
|
||||
.Ar re .
|
||||
Subexpressions may be nested.
|
||||
A subsequent backreference of the form
|
||||
.Pf \e Ns Ar n ,
|
||||
where
|
||||
.Ar n
|
||||
is a number in the range [1,9], expands to the text matched by the
|
||||
.Ar n Ns th
|
||||
subexpression.
|
||||
For example, the regular expression
|
||||
.Ar \e(.*\e)\e1
|
||||
matches any string consisting of identical adjacent substrings.
|
||||
Subexpressions are ordered relative to their left delimiter.
|
||||
.It *
|
||||
Matches the single character regular expression or subexpression
|
||||
immediately preceding it zero or more times.
|
||||
If
|
||||
.Sq *
|
||||
is the first character of a regular expression or subexpression,
|
||||
then it matches itself.
|
||||
The
|
||||
.Sq *
|
||||
operator sometimes yields unexpected results.
|
||||
For example, the regular expression
|
||||
.Ar b*
|
||||
matches the beginning of the string
|
||||
.Qq abbb
|
||||
(as opposed to the substring
|
||||
.Qq bbb ) ,
|
||||
since a null match is the only leftmost match.
|
||||
.Sm off
|
||||
.It Xo
|
||||
.Pf \e{ Ar n , m No \e}\ \&
|
||||
.Pf \e{ Ar n , No \e}\ \&
|
||||
.Pf \e{ Ar n No \e}
|
||||
.Xc
|
||||
.Sm on
|
||||
Matches the single character regular expression or subexpression
|
||||
immediately preceding it at least
|
||||
.Ar n
|
||||
and at most
|
||||
.Ar m
|
||||
times.
|
||||
If
|
||||
.Ar m
|
||||
is omitted, then it matches at least
|
||||
.Ar n
|
||||
times.
|
||||
If the comma is also omitted, then it matches exactly
|
||||
.Ar n
|
||||
times.
|
||||
.El
|
||||
.Sh SEE ALSO
|
||||
.Xr ctype 3 ,
|
||||
.Xr regex 3
|
||||
.Sh STANDARDS
|
||||
.St -p1003.1-2004 :
|
||||
Base Definitions, Chapter 9 (Regular Expressions).
|
||||
.Sh BUGS
|
||||
Having two kinds of REs is a botch.
|
||||
.Pp
|
||||
The current POSIX spec says that
|
||||
.Sq )\&
|
||||
is an ordinary character in the absence of an unmatched
|
||||
.Sq ( ;
|
||||
this was an unintentional result of a wording error,
|
||||
and change is likely.
|
||||
Avoid relying on it.
|
||||
.Pp
|
||||
Back-references are a dreadful botch,
|
||||
posing major problems for efficient implementations.
|
||||
They are also somewhat vaguely defined
|
||||
(does
|
||||
.Sq a\e(\e(b\e)*\e2\e)*d
|
||||
match
|
||||
.Sq abbbd ? ) .
|
||||
Avoid using them.
|
||||
.Pp
|
||||
POSIX's specification of case-independent matching is vague.
|
||||
The
|
||||
.Dq one case implies all cases
|
||||
definition given above
|
||||
is the current consensus among implementors as to the right interpretation.
|
||||
.Pp
|
||||
The syntax for word boundaries is incredibly ugly.
|
64
include/llvm/Support/Regex.h
Normal file
64
include/llvm/Support/Regex.h
Normal file
@ -0,0 +1,64 @@
|
||||
//===-- Regex.h - Regular Expression matcher implementation -*- C++ -*-----===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file declares the interface of a POSIX regular expression matcher.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "llvm/ADT/SmallVector.h"
|
||||
#include "llvm/ADT/StringRef.h"
|
||||
|
||||
struct llvm_regex;
|
||||
namespace llvm {
|
||||
class Regex {
|
||||
public:
|
||||
enum {
|
||||
/// Compile with support for subgroup matches, this is just to make
|
||||
/// constructs like Regex("...", 0) more readable as Regex("...", Sub).
|
||||
Sub=0,
|
||||
/// Compile for matching that ignores upper/lower case distinctions.
|
||||
IgnoreCase=1,
|
||||
/// Compile for matching that need only report success or failure,
|
||||
/// not what was matched.
|
||||
NoSub=2,
|
||||
/// Compile for newline-sensitive matching. With this flag '[^' bracket
|
||||
/// expressions and '.' never match newline. A ^ anchor matches the
|
||||
/// null string after any newline in the string in addition to its normal
|
||||
/// function, and the $ anchor matches the null string before any
|
||||
/// newline in the string in addition to its normal function.
|
||||
Newline=4
|
||||
};
|
||||
|
||||
/// Compiles the given POSIX Extended Regular Expression \arg Regex.
|
||||
/// This implementation supports regexes and matching strings with embedded
|
||||
/// NUL characters.
|
||||
Regex(const StringRef &Regex, unsigned Flags=NoSub);
|
||||
~Regex();
|
||||
|
||||
/// isValid - returns the error encountered during regex compilation, or
|
||||
/// matching, if any.
|
||||
bool isValid(std::string &Error);
|
||||
|
||||
/// matches - Match the regex against a given \arg String.
|
||||
///
|
||||
/// \param Matches - If given, on a succesful match this will be filled in
|
||||
/// with references to the matched group expressions (inside \arg String),
|
||||
/// the first group is always the entire pattern.
|
||||
/// By default the regex is compiled with NoSub, which disables support for
|
||||
/// Matches.
|
||||
/// For this feature to be enabled you must construct the regex using
|
||||
/// Regex("...", Regex::Sub) constructor.
|
||||
|
||||
bool match(const StringRef &String, SmallVectorImpl<StringRef> *Matches=0);
|
||||
private:
|
||||
struct llvm_regex *preg;
|
||||
int error;
|
||||
bool sub;
|
||||
};
|
||||
}
|
@ -32,6 +32,12 @@ add_llvm_library(LLVMSupport
|
||||
Twine.cpp
|
||||
raw_os_ostream.cpp
|
||||
raw_ostream.cpp
|
||||
Regex.cpp
|
||||
regcomp.c
|
||||
regerror.c
|
||||
regexec.c
|
||||
regfree.c
|
||||
regstrlcpy.c
|
||||
)
|
||||
|
||||
target_link_libraries (LLVMSupport LLVMSystem)
|
||||
|
54
lib/Support/COPYRIGHT.regex
Normal file
54
lib/Support/COPYRIGHT.regex
Normal file
@ -0,0 +1,54 @@
|
||||
$OpenBSD: COPYRIGHT,v 1.3 2003/06/02 20:18:36 millert Exp $
|
||||
|
||||
Copyright 1992, 1993, 1994 Henry Spencer. All rights reserved.
|
||||
This software is not subject to any license of the American Telephone
|
||||
and Telegraph Company or of the Regents of the University of California.
|
||||
|
||||
Permission is granted to anyone to use this software for any purpose on
|
||||
any computer system, and to alter it and redistribute it, subject
|
||||
to the following restrictions:
|
||||
|
||||
1. The author is not responsible for the consequences of use of this
|
||||
software, no matter how awful, even if they arise from flaws in it.
|
||||
|
||||
2. The origin of this software must not be misrepresented, either by
|
||||
explicit claim or by omission. Since few users ever read sources,
|
||||
credits must appear in the documentation.
|
||||
|
||||
3. Altered versions must be plainly marked as such, and must not be
|
||||
misrepresented as being the original software. Since few users
|
||||
ever read sources, credits must appear in the documentation.
|
||||
|
||||
4. This notice may not be removed or altered.
|
||||
|
||||
=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
|
||||
/*-
|
||||
* Copyright (c) 1994
|
||||
* The Regents of the University of California. All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* 3. Neither the name of the University nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*
|
||||
* @(#)COPYRIGHT 8.1 (Berkeley) 3/16/94
|
||||
*/
|
97
lib/Support/Regex.cpp
Normal file
97
lib/Support/Regex.cpp
Normal file
@ -0,0 +1,97 @@
|
||||
//===-- Regex.cpp - Regular Expression matcher implementation -------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file implements a POSIX regular expression matcher.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
#include "llvm/Support/Regex.h"
|
||||
#include "llvm/Support/ErrorHandling.h"
|
||||
#include "llvm/Support/raw_ostream.h"
|
||||
#include "regex_impl.h"
|
||||
#include <string>
|
||||
|
||||
using namespace llvm;
|
||||
/// Compiles \p regex as a POSIX extended regular expression.
///
/// \param regex - The pattern text. Its end is passed to the regex library
/// through re_endp together with REG_PEND rather than relying on a
/// terminating NUL.
/// \param Flags - Bitwise OR of Regex::IgnoreCase, Regex::NoSub and
/// Regex::Newline (Regex::Sub is 0 and merely documents intent).
///
/// The result code of llvm_regcomp is stored in 'error'; the constructor
/// itself does not report failure.
Regex::Regex(const StringRef &regex, unsigned Flags)
{
  unsigned flags = 0;
  // Value-initialize the struct so that every field has a defined (zero)
  // value even if llvm_regcomp bails out before filling it in; the destructor
  // hands this object to llvm_regfree unconditionally.
  preg = new llvm_regex();
  preg->re_endp = regex.end();
  if (Flags & IgnoreCase)
    flags |= REG_ICASE;
  // Remember whether sub-match reporting was compiled in.
  sub = !(Flags & NoSub);
  if (!sub)
    flags |= REG_NOSUB;
  if (Flags & Newline)
    flags |= REG_NEWLINE;
  error = llvm_regcomp(preg, regex.data(), flags|REG_EXTENDED|REG_PEND);
}
|
||||
|
||||
bool Regex::isValid(std::string &Error)
|
||||
{
|
||||
if (!error)
|
||||
return true;
|
||||
|
||||
size_t len = llvm_regerror(error, preg, NULL, 0);
|
||||
char *errbuff = new char[len];
|
||||
llvm_regerror(error, preg, errbuff, len);
|
||||
Error.assign(errbuff);
|
||||
return false;
|
||||
}
|
||||
|
||||
// Tear down in two steps: first let the regex library release whatever it
// attached to the compiled pattern, then free the llvm_regex object that the
// constructor allocated with new.
Regex::~Regex() {
  llvm_regfree(preg);
  delete preg;
}
|
||||
|
||||
bool Regex::match(const StringRef &String, SmallVectorImpl<StringRef> *Matches)
|
||||
{
|
||||
unsigned nmatch = Matches ? preg->re_nsub+1 : 0;
|
||||
|
||||
if (Matches) {
|
||||
assert(sub && "Substring matching requested but pattern compiled without");
|
||||
Matches->clear();
|
||||
}
|
||||
|
||||
// pmatch needs to have at least one element.
|
||||
SmallVector<llvm_regmatch_t, 2> pm;
|
||||
pm.resize(nmatch > 0 ? nmatch : 1);
|
||||
pm[0].rm_so = 0;
|
||||
pm[0].rm_eo = String.size();
|
||||
|
||||
int rc = llvm_regexec(preg, String.data(), nmatch, pm.data(), REG_STARTEND);
|
||||
|
||||
if (rc == REG_NOMATCH)
|
||||
return false;
|
||||
if (rc != 0) {
|
||||
// regexec can fail due to invalid pattern or running out of memory.
|
||||
error = rc;
|
||||
return false;
|
||||
}
|
||||
|
||||
// There was a match.
|
||||
|
||||
if (Matches) { // match position requested
|
||||
for (unsigned i=0;i<nmatch; i++) {
|
||||
if (pm[i].rm_so == -1) {
|
||||
// this group didn't match
|
||||
Matches->push_back(StringRef());
|
||||
continue;
|
||||
}
|
||||
assert(pm[i].rm_eo > pm[i].rm_so);
|
||||
Matches->push_back(StringRef(String.data()+pm[i].rm_so,
|
||||
pm[i].rm_eo-pm[i].rm_so));
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
70
lib/Support/regcclass.h
Normal file
70
lib/Support/regcclass.h
Normal file
@ -0,0 +1,70 @@
|
||||
/*-
|
||||
* This code is derived from OpenBSD's libc/regex, original license follows:
|
||||
*
|
||||
* Copyright (c) 1992, 1993, 1994 Henry Spencer.
|
||||
* Copyright (c) 1992, 1993, 1994
|
||||
* The Regents of the University of California. All rights reserved.
|
||||
*
|
||||
* This code is derived from software contributed to Berkeley by
|
||||
* Henry Spencer.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* 3. Neither the name of the University nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*
|
||||
* @(#)cclass.h 8.3 (Berkeley) 3/20/94
|
||||
*/
|
||||
|
||||
/* character-class table */
|
||||
/*
 * Table of the named character classes usable inside bracket expressions.
 * The list is terminated by an entry whose name is NULL.
 */
static struct cclass {
	const char *name;	/* class name, e.g. "alpha" */
	const char *chars;	/* the single characters belonging to the class */
	const char *multis;	/* empty for every class listed here */
} cclasses[] = {
	{ "alnum",
	  "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
	  "0123456789",
	  "" },
	{ "alpha",
	  "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz",
	  "" },
	{ "blank",
	  " \t",
	  "" },
	{ "cntrl",
	  "\007\b\t\n\v\f\r\1\2\3\4\5\6\16\17\20\21\22\23\24"
	  "\25\26\27\30\31\32\33\34\35\36\37\177",
	  "" },
	{ "digit",
	  "0123456789",
	  "" },
	{ "graph",
	  "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
	  "0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~",
	  "" },
	{ "lower",
	  "abcdefghijklmnopqrstuvwxyz",
	  "" },
	{ "print",
	  "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
	  "0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ ",
	  "" },
	{ "punct",
	  "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~",
	  "" },
	{ "space",
	  "\t\n\v\f\r ",
	  "" },
	{ "upper",
	  "ABCDEFGHIJKLMNOPQRSTUVWXYZ",
	  "" },
	{ "xdigit",
	  "0123456789ABCDEFabcdef",
	  "" },
	{ NULL, 0, "" }		/* end-of-table marker */
};
|
139
lib/Support/regcname.h
Normal file
139
lib/Support/regcname.h
Normal file
@ -0,0 +1,139 @@
|
||||
/*-
|
||||
* This code is derived from OpenBSD's libc/regex, original license follows:
|
||||
*
|
||||
* Copyright (c) 1992, 1993, 1994 Henry Spencer.
|
||||
* Copyright (c) 1992, 1993, 1994
|
||||
* The Regents of the University of California. All rights reserved.
|
||||
*
|
||||
* This code is derived from software contributed to Berkeley by
|
||||
* Henry Spencer.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* 3. Neither the name of the University nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*
|
||||
* @(#)cname.h 8.3 (Berkeley) 3/20/94
|
||||
*/
|
||||
|
||||
/* character-name table */
|
||||
/*
 * Table mapping symbolic character names to character codes.  Several
 * characters are listed under more than one name.  The list is terminated
 * by an entry whose name is NULL.
 */
static struct cname {
	const char *name;	/* symbolic name */
	char code;		/* the character it stands for */
} cnames[] = {
	/* control characters */
	{ "NUL",			'\0' },
	{ "SOH",			'\001' },
	{ "STX",			'\002' },
	{ "ETX",			'\003' },
	{ "EOT",			'\004' },
	{ "ENQ",			'\005' },
	{ "ACK",			'\006' },
	{ "BEL",			'\007' },
	{ "alert",			'\007' },
	{ "BS",				'\010' },
	{ "backspace",			'\b' },
	{ "HT",				'\011' },
	{ "tab",			'\t' },
	{ "LF",				'\012' },
	{ "newline",			'\n' },
	{ "VT",				'\013' },
	{ "vertical-tab",		'\v' },
	{ "FF",				'\014' },
	{ "form-feed",			'\f' },
	{ "CR",				'\015' },
	{ "carriage-return",		'\r' },
	{ "SO",				'\016' },
	{ "SI",				'\017' },
	{ "DLE",			'\020' },
	{ "DC1",			'\021' },
	{ "DC2",			'\022' },
	{ "DC3",			'\023' },
	{ "DC4",			'\024' },
	{ "NAK",			'\025' },
	{ "SYN",			'\026' },
	{ "ETB",			'\027' },
	{ "CAN",			'\030' },
	{ "EM",				'\031' },
	{ "SUB",			'\032' },
	{ "ESC",			'\033' },
	{ "IS4",			'\034' },
	{ "FS",				'\034' },
	{ "IS3",			'\035' },
	{ "GS",				'\035' },
	{ "IS2",			'\036' },
	{ "RS",				'\036' },
	{ "IS1",			'\037' },
	{ "US",				'\037' },
	/* space and punctuation below the digits */
	{ "space",			' ' },
	{ "exclamation-mark",		'!' },
	{ "quotation-mark",		'"' },
	{ "number-sign",		'#' },
	{ "dollar-sign",		'$' },
	{ "percent-sign",		'%' },
	{ "ampersand",			'&' },
	{ "apostrophe",			'\'' },
	{ "left-parenthesis",		'(' },
	{ "right-parenthesis",		')' },
	{ "asterisk",			'*' },
	{ "plus-sign",			'+' },
	{ "comma",			',' },
	{ "hyphen",			'-' },
	{ "hyphen-minus",		'-' },
	{ "period",			'.' },
	{ "full-stop",			'.' },
	{ "slash",			'/' },
	{ "solidus",			'/' },
	/* digits */
	{ "zero",			'0' },
	{ "one",			'1' },
	{ "two",			'2' },
	{ "three",			'3' },
	{ "four",			'4' },
	{ "five",			'5' },
	{ "six",			'6' },
	{ "seven",			'7' },
	{ "eight",			'8' },
	{ "nine",			'9' },
	/* remaining punctuation */
	{ "colon",			':' },
	{ "semicolon",			';' },
	{ "less-than-sign",		'<' },
	{ "equals-sign",		'=' },
	{ "greater-than-sign",		'>' },
	{ "question-mark",		'?' },
	{ "commercial-at",		'@' },
	{ "left-square-bracket",	'[' },
	{ "backslash",			'\\' },
	{ "reverse-solidus",		'\\' },
	{ "right-square-bracket",	']' },
	{ "circumflex",			'^' },
	{ "circumflex-accent",		'^' },
	{ "underscore",			'_' },
	{ "low-line",			'_' },
	{ "grave-accent",		'`' },
	{ "left-brace",			'{' },
	{ "left-curly-bracket",		'{' },
	{ "vertical-line",		'|' },
	{ "right-brace",		'}' },
	{ "right-curly-bracket",	'}' },
	{ "tilde",			'~' },
	{ "DEL",			'\177' },
	{ NULL,				0 }	/* end-of-table marker */
};
|
1524
lib/Support/regcomp.c
Normal file
1524
lib/Support/regcomp.c
Normal file
File diff suppressed because it is too large
Load Diff
1021
lib/Support/regengine.inc
Normal file
1021
lib/Support/regengine.inc
Normal file
File diff suppressed because it is too large
Load Diff
131
lib/Support/regerror.c
Normal file
131
lib/Support/regerror.c
Normal file
@ -0,0 +1,131 @@
|
||||
/*-
|
||||
* This code is derived from OpenBSD's libc/regex, original license follows:
|
||||
*
|
||||
* Copyright (c) 1992, 1993, 1994 Henry Spencer.
|
||||
* Copyright (c) 1992, 1993, 1994
|
||||
* The Regents of the University of California. All rights reserved.
|
||||
*
|
||||
* This code is derived from software contributed to Berkeley by
|
||||
* Henry Spencer.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* 3. Neither the name of the University nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*
|
||||
* @(#)regerror.c 8.4 (Berkeley) 3/20/94
|
||||
*/
|
||||
|
||||
#include <sys/types.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <ctype.h>
|
||||
#include <limits.h>
|
||||
#include <stdlib.h>
|
||||
#include "regex_impl.h"
|
||||
|
||||
#include "regutils.h"
|
||||
|
||||
static const char *regatoi(const llvm_regex_t *, char *, int);
|
||||
|
||||
static struct rerr {
|
||||
int code;
|
||||
const char *name;
|
||||
const char *explain;
|
||||
} rerrs[] = {
|
||||
{ REG_NOMATCH, "REG_NOMATCH", "llvm_regexec() failed to match" },
|
||||
{ REG_BADPAT, "REG_BADPAT", "invalid regular expression" },
|
||||
{ REG_ECOLLATE, "REG_ECOLLATE", "invalid collating element" },
|
||||
{ REG_ECTYPE, "REG_ECTYPE", "invalid character class" },
|
||||
{ REG_EESCAPE, "REG_EESCAPE", "trailing backslash (\\)" },
|
||||
{ REG_ESUBREG, "REG_ESUBREG", "invalid backreference number" },
|
||||
{ REG_EBRACK, "REG_EBRACK", "brackets ([ ]) not balanced" },
|
||||
{ REG_EPAREN, "REG_EPAREN", "parentheses not balanced" },
|
||||
{ REG_EBRACE, "REG_EBRACE", "braces not balanced" },
|
||||
{ REG_BADBR, "REG_BADBR", "invalid repetition count(s)" },
|
||||
{ REG_ERANGE, "REG_ERANGE", "invalid character range" },
|
||||
{ REG_ESPACE, "REG_ESPACE", "out of memory" },
|
||||
{ REG_BADRPT, "REG_BADRPT", "repetition-operator operand invalid" },
|
||||
{ REG_EMPTY, "REG_EMPTY", "empty (sub)expression" },
|
||||
{ REG_ASSERT, "REG_ASSERT", "\"can't happen\" -- you found a bug" },
|
||||
{ REG_INVARG, "REG_INVARG", "invalid argument to regex routine" },
|
||||
{ 0, "", "*** unknown regexp error code ***" }
|
||||
};
|
||||
|
||||
/*
|
||||
- llvm_regerror - the interface to error numbers
|
||||
= extern size_t llvm_regerror(int, const llvm_regex_t *, char *, size_t);
|
||||
*/
|
||||
/* ARGSUSED */
|
||||
/*
 * Translate an error code into text.
 *
 *  errcode      a REG_* code, optionally OR'ed with REG_ITOA to ask for the
 *               code's symbolic name instead of its explanation; or REG_ATOI
 *               to convert the name found in preg->re_endp to a number.
 *  preg         only consulted for REG_ATOI.
 *  errbuf       destination buffer; not touched when errbuf_size is 0.
 *  errbuf_size  size of errbuf in bytes.
 *
 * Returns the buffer size needed for the complete message, including the
 * terminating NUL, regardless of how much was actually copied.
 */
size_t
llvm_regerror(int errcode, const llvm_regex_t *preg, char *errbuf, size_t errbuf_size)
{
	struct rerr *r;
	size_t len;
	int target = errcode &~ REG_ITOA;
	const char *s;
	char convbuf[50];

	if (errcode == REG_ATOI)
		s = regatoi(preg, convbuf, sizeof convbuf);
	else {
		/* Stops on the matching row, or on the code-0 terminator row. */
		for (r = rerrs; r->code != 0; r++)
			if (r->code == target)
				break;

		if (errcode&REG_ITOA) {
			if (r->code != 0) {
				assert(strlen(r->name) < sizeof(convbuf));
				(void) llvm_strlcpy(convbuf, r->name, sizeof convbuf);
			} else
				(void)snprintf(convbuf, sizeof convbuf,
				    "REG_0x%x", target);
			s = convbuf;
		} else
			s = r->explain;
	}

	len = strlen(s) + 1;
	if (errbuf_size > 0) {
		llvm_strlcpy(errbuf, s, errbuf_size);
	}

	return(len);
}
|
||||
|
||||
/*
|
||||
- regatoi - internal routine to implement REG_ATOI
|
||||
*/
|
||||
/*
 * Look up the symbolic error name stored in preg->re_endp and return its
 * numeric code as a decimal string.  The digits are written into localbuf
 * (localbufsize bytes); an unknown name yields the constant string "0".
 */
static const char *
regatoi(const llvm_regex_t *preg, char *localbuf, int localbufsize)
{
	struct rerr *entry = rerrs;

	/* Walk the table until the name matches or the terminator is reached. */
	while (entry->code != 0 && strcmp(entry->name, preg->re_endp) != 0)
		entry++;

	if (entry->code == 0)
		return("0");

	(void)snprintf(localbuf, localbufsize, "%d", entry->code);
	return(localbuf);
}
|
157
lib/Support/regex2.h
Normal file
157
lib/Support/regex2.h
Normal file
@ -0,0 +1,157 @@
|
||||
/*-
|
||||
* This code is derived from OpenBSD's libc/regex, original license follows:
|
||||
*
|
||||
* Copyright (c) 1992, 1993, 1994 Henry Spencer.
|
||||
* Copyright (c) 1992, 1993, 1994
|
||||
* The Regents of the University of California. All rights reserved.
|
||||
*
|
||||
* This code is derived from software contributed to Berkeley by
|
||||
* Henry Spencer.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* 3. Neither the name of the University nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*
|
||||
* @(#)regex2.h 8.4 (Berkeley) 3/20/94
|
||||
*/
|
||||
|
||||
/*
|
||||
* internals of llvm_regex_t
|
||||
*/
|
||||
#define MAGIC1 ((('r'^0200)<<8) | 'e')
|
||||
|
||||
/*
|
||||
* The internal representation is a *strip*, a sequence of
|
||||
* operators ending with an endmarker. (Some terminology etc. is a
|
||||
* historical relic of earlier versions which used multiple strips.)
|
||||
* Certain oddities in the representation are there to permit running
|
||||
* the machinery backwards; in particular, any deviation from sequential
|
||||
* flow must be marked at both its source and its destination. Some
|
||||
* fine points:
|
||||
*
|
||||
* - OPLUS_ and O_PLUS are *inside* the loop they create.
|
||||
* - OQUEST_ and O_QUEST are *outside* the bypass they create.
|
||||
* - OCH_ and O_CH are *outside* the multi-way branch they create, while
|
||||
* OOR1 and OOR2 are respectively the end and the beginning of one of
|
||||
* the branches. Note that there is an implicit OOR2 following OCH_
|
||||
* and an implicit OOR1 preceding O_CH.
|
||||
*
|
||||
* In state representations, an operator's bit is on to signify a state
|
||||
* immediately *preceding* "execution" of that operator.
|
||||
*/
|
||||
typedef unsigned long sop;	/* strip operator */
typedef long sopno;

/*
 * A sop carries its operator code in bits 27-31 (OPRMASK) and its operand
 * in bits 0-26 (OPDMASK).
 */
#define	OPRMASK		0xf8000000LU
#define	OPDMASK		0x07ffffffLU
#define	OPSHIFT		((unsigned)27)
#define	OP(n)		((n)&OPRMASK)		/* extract the operator */
#define	OPND(n)		((n)&OPDMASK)		/* extract the operand */
#define	SOP(op, opnd)	((op)|(opnd))		/* combine the two */

/*
 * Operator codes.  Each comment gives the meaning, then the operand;
 * "back" and "fwd" operands are offsets.
 */
#define	OEND	(1LU<<OPSHIFT)	/* endmarker; no operand */
#define	OCHAR	(2LU<<OPSHIFT)	/* character; operand: unsigned char */
#define	OBOL	(3LU<<OPSHIFT)	/* left anchor; no operand */
#define	OEOL	(4LU<<OPSHIFT)	/* right anchor; no operand */
#define	OANY	(5LU<<OPSHIFT)	/* .; no operand */
#define	OANYOF	(6LU<<OPSHIFT)	/* [...]; operand: set number */
#define	OBACK_	(7LU<<OPSHIFT)	/* begin \d; operand: paren number */
#define	O_BACK	(8LU<<OPSHIFT)	/* end \d; operand: paren number */
#define	OPLUS_	(9LU<<OPSHIFT)	/* + prefix; operand: fwd to suffix */
#define	O_PLUS	(10LU<<OPSHIFT)	/* + suffix; operand: back to prefix */
#define	OQUEST_	(11LU<<OPSHIFT)	/* ? prefix; operand: fwd to suffix */
#define	O_QUEST	(12LU<<OPSHIFT)	/* ? suffix; operand: back to prefix */
#define	OLPAREN	(13LU<<OPSHIFT)	/* (; operand: fwd to ) */
#define	ORPAREN	(14LU<<OPSHIFT)	/* ); operand: back to ( */
#define	OCH_	(15LU<<OPSHIFT)	/* begin choice; operand: fwd to OOR2 */
#define	OOR1	(16LU<<OPSHIFT)	/* | pt. 1; operand: back to OOR1 or OCH_ */
#define	OOR2	(17LU<<OPSHIFT)	/* | pt. 2; operand: fwd to OOR2 or O_CH */
#define	O_CH	(18LU<<OPSHIFT)	/* end choice; operand: back to OOR1 */
#define	OBOW	(19LU<<OPSHIFT)	/* begin word; no operand */
#define	OEOW	(20LU<<OPSHIFT)	/* end word; no operand */
|
||||
|
||||
/*
|
||||
* Structure for [] character-set representation. Character sets are
|
||||
* done as bit vectors, grouped 8 to a byte vector for compactness.
|
||||
* The individual set therefore has both a pointer to the byte vector
|
||||
* and a mask to pick out the relevant bit of each byte. A hash code
|
||||
* simplifies testing whether two sets could be identical.
|
||||
*
|
||||
* This will get trickier for multicharacter collating elements. As
|
||||
* preliminary hooks for dealing with such things, we also carry along
|
||||
* a string of multi-character elements, and decide the size of the
|
||||
* vectors at run time.
|
||||
*/
|
||||
/* One character set; see the comment above for how the bit vectors work. */
typedef struct {
	uch *ptr;		/* -> uch [csetsize] */
	uch mask;		/* bit within array */
	uch hash;		/* hash code */
	size_t smultis;
	char *multis;		/* -> char[smulti]  ab\0cd\0ef\0\0 */
} cset;

/*
 * Single-character membership operations.  CHadd and CHsub are unsafe
 * macros, and CHIN yields zero/non-zero rather than 0/1.
 */
#define	CHadd(cs, c)	((cs)->ptr[(uch)(c)] |= (cs)->mask, (cs)->hash += (c))
#define	CHsub(cs, c)	((cs)->ptr[(uch)(c)] &= ~(cs)->mask, (cs)->hash -= (c))
#define	CHIN(cs, c)	((cs)->ptr[(uch)(c)] & (cs)->mask)

/* Multi-character element operations; these are llvm_regcomp() internal fns. */
#define	MCadd(p, cs, cp)	mcadd(p, cs, cp)
#define	MCsub(p, cs, cp)	mcsub(p, cs, cp)
#define	MCin(p, cs, cp)	mcin(p, cs, cp)
|
||||
|
||||
/* type used for character categories */
typedef unsigned char cat_t;

/* value stored in re_guts.magic */
#define	MAGIC2	((('R'^0200)<<8)|'E')

/* bits of re_guts.iflags */
#define	USEBOL		01	/* used ^ */
#define	USEEOL		02	/* used $ */
#define	REGEX_BAD	04	/* something wrong */

/*
 * The main compiled-expression structure.
 */
struct re_guts {
	int magic;		/* MAGIC2 */
	sop *strip;		/* malloced area holding the strip */
	int csetsize;		/* number of bits in a cset vector */
	int ncsets;		/* number of csets in use */
	cset *sets;		/* -> cset [ncsets] */
	uch *setbits;		/* -> uch[csetsize][ncsets/CHAR_BIT] */
	int cflags;		/* copy of the llvm_regcomp() cflags argument */
	sopno nstates;		/* = number of sops */
	sopno firststate;	/* the initial OEND (normally 0) */
	sopno laststate;	/* the final OEND */
	int iflags;		/* internal flags: USEBOL, USEEOL, REGEX_BAD */
	int nbol;		/* number of ^ used */
	int neol;		/* number of $ used */
	int ncategories;	/* how many character categories */
	cat_t *categories;	/* ->catspace[-CHAR_MIN] */
	char *must;		/* match must contain this string */
	int mlen;		/* length of must */
	size_t nsub;		/* copy of re_nsub */
	int backrefs;		/* does it use back references? */
	sopno nplus;		/* how deep does it nest +s? */
	/* catspace must remain the last member */
	cat_t catspace[1];	/* actually [NC] */
};
|
||||
|
||||
/* misc utilities */
|
||||
#define	OUT	(CHAR_MAX+1)	/* a non-character value */
/*
 * True when c, reduced to its low 8 bits, is alphanumeric, or when c is an
 * underscore.  The argument is parenthesized before masking so that an
 * expression argument is masked as a whole.
 */
#define	ISWORD(c)	(isalnum((c)&0xff) || (c) == '_')
|
108
lib/Support/regex_impl.h
Normal file
108
lib/Support/regex_impl.h
Normal file
@ -0,0 +1,108 @@
|
||||
/*-
|
||||
* This code is derived from OpenBSD's libc/regex, original license follows:
|
||||
*
|
||||
* Copyright (c) 1992 Henry Spencer.
|
||||
* Copyright (c) 1992, 1993
|
||||
* The Regents of the University of California. All rights reserved.
|
||||
*
|
||||
* This code is derived from software contributed to Berkeley by
|
||||
* Henry Spencer of the University of Toronto.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* 3. Neither the name of the University nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*
|
||||
* @(#)regex.h 8.1 (Berkeley) 6/2/93
|
||||
*/
|
||||
|
||||
#ifndef _REGEX_H_
|
||||
#define _REGEX_H_
|
||||
|
||||
#include <sys/types.h>
|
||||
/* Offset type used for match positions. */
typedef off_t llvm_regoff_t;

/* Location of one match, as a pair of offsets. */
typedef struct {
	llvm_regoff_t rm_so;	/* start of match */
	llvm_regoff_t rm_eo;	/* end of match */
} llvm_regmatch_t;

/* Handle for a compiled regular expression. */
typedef struct llvm_regex {
	int re_magic;
	size_t re_nsub;		/* number of parenthesized subexpressions */
	const char *re_endp;	/* end pointer for REG_PEND */
	struct re_guts *re_g;	/* none of your business :-) */
} llvm_regex_t;
|
||||
|
||||
/* flags accepted by llvm_regcomp() (octal bit values) */
#define	REG_BASIC	0000
#define	REG_EXTENDED	0001
#define	REG_ICASE	0002
#define	REG_NOSUB	0004
#define	REG_NEWLINE	0010
#define	REG_NOSPEC	0020
#define	REG_PEND	0040
#define	REG_DUMP	0200

/* error codes understood by llvm_regerror() */
#define	REG_NOMATCH	 1
#define	REG_BADPAT	 2
#define	REG_ECOLLATE	 3
#define	REG_ECTYPE	 4
#define	REG_EESCAPE	 5
#define	REG_ESUBREG	 6
#define	REG_EBRACK	 7
#define	REG_EPAREN	 8
#define	REG_EBRACE	 9
#define	REG_BADBR	10
#define	REG_ERANGE	11
#define	REG_ESPACE	12
#define	REG_BADRPT	13
#define	REG_EMPTY	14
#define	REG_ASSERT	15
#define	REG_INVARG	16
#define	REG_ATOI	255	/* convert name to number (!) */
#define	REG_ITOA	0400	/* convert number to name (!) */

/* flags accepted by llvm_regexec() (octal bit values) */
#define	REG_NOTBOL	00001
#define	REG_NOTEOL	00002
#define	REG_STARTEND	00004
#define	REG_TRACE	00400	/* tracing of execution */
#define	REG_LARGE	01000	/* force large representation */
#define	REG_BACKR	02000	/* force use of backref code */
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
int llvm_regcomp(llvm_regex_t *, const char *, int);
|
||||
size_t llvm_regerror(int, const llvm_regex_t *, char *, size_t);
|
||||
int llvm_regexec(const llvm_regex_t *, const char *, size_t,
|
||||
llvm_regmatch_t [], int);
|
||||
void llvm_regfree(llvm_regex_t *);
|
||||
size_t llvm_strlcpy(char *dst, const char *src, size_t siz);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* !_REGEX_H_ */
|
161
lib/Support/regexec.c
Normal file
161
lib/Support/regexec.c
Normal file
@ -0,0 +1,161 @@
|
||||
/*-
|
||||
* This code is derived from OpenBSD's libc/regex, original license follows:
|
||||
*
|
||||
* Copyright (c) 1992, 1993, 1994 Henry Spencer.
|
||||
* Copyright (c) 1992, 1993, 1994
|
||||
* The Regents of the University of California. All rights reserved.
|
||||
*
|
||||
* This code is derived from software contributed to Berkeley by
|
||||
* Henry Spencer.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* 3. Neither the name of the University nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*
|
||||
* @(#)regexec.c 8.3 (Berkeley) 3/20/94
|
||||
*/
|
||||
|
||||
/*
|
||||
* the outer shell of llvm_regexec()
|
||||
*
|
||||
* This file includes regengine.inc *twice*, after muchos fiddling with the
|
||||
* macros that code uses. This lets the same code operate on two different
|
||||
* representations for state sets.
|
||||
*/
|
||||
#include <sys/types.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <limits.h>
|
||||
#include <ctype.h>
|
||||
#include "regex_impl.h"
|
||||
|
||||
#include "regutils.h"
|
||||
#include "regex2.h"
|
||||
|
||||
/*
 * State-set macros, small version: a whole set of states is kept as the
 * bits of a single long.
 */
#define	states		long
#define	states1		states	/* for later use in llvm_regexec() decision */
#define	CLEAR(v)	((v) = 0)
#define	SET0(v, n)	((v) &= ~(1UL << (n)))
#define	SET1(v, n)	((v) |= 1UL << (n))
#define	ISSET(v, n)	(((v) & (1UL << (n))) != 0)
#define	ASSIGN(d, s)	((d) = (s))
#define	EQ(a, b)	((a) == (b))
#define	STATEVARS	long dummy	/* dummy version */
#define	STATESETUP(m, n)	/* nothing */
#define	STATETEARDOWN(m)	/* nothing */
#define	SETUP(v)	((v) = 0)

/* a single state is a long with exactly one bit set */
#define	onestate	long
#define	INIT(o, n)	((o) = 1UL << (n))
#define	INC(o)		((o) <<= 1)
#define	ISSTATEIN(v, o)	(((v) & (o)) != 0)

/*
 * Abbreviations for "if I'm here, I can also be there" etc., done without
 * branches.  Beware: these refer to the caller's variable `here` by name.
 */
#define	FWD(dst, src, n)	((dst) |= ((unsigned long)(src)&(here)) << (n))
#define	BACK(dst, src, n)	((dst) |= ((unsigned long)(src)&(here)) >> (n))
#define	ISSETBACK(v, n)	(((v) & ((unsigned long)here >> (n))) != 0)

/* function names */
#define SNAMES			/* regengine.inc looks after details */
|
||||
|
||||
#include "regengine.inc"
|
||||
|
||||
/*
 * now undo things: remove every small-version definition so that the same
 * macro names can be defined again; states1 is not in this list and
 * therefore stays defined
 */
|
||||
#undef states
|
||||
#undef CLEAR
|
||||
#undef SET0
|
||||
#undef SET1
|
||||
#undef ISSET
|
||||
#undef ASSIGN
|
||||
#undef EQ
|
||||
#undef STATEVARS
|
||||
#undef STATESETUP
|
||||
#undef STATETEARDOWN
|
||||
#undef SETUP
|
||||
#undef onestate
|
||||
#undef INIT
|
||||
#undef INC
|
||||
#undef ISSTATEIN
|
||||
#undef FWD
|
||||
#undef BACK
|
||||
#undef ISSETBACK
|
||||
#undef SNAMES
|
||||
|
||||
/*
 * macros for manipulating states, large version: a state set is a char
 * array indexed by state number, one byte per state; whole-set operations
 * work on m->g->nstates bytes
 */
|
||||
#define states char *
|
||||
/* CLEAR, ASSIGN, EQ and SETUP use a variable named `m` from the scope where they are expanded */
#define CLEAR(v) memset(v, 0, m->g->nstates)
|
||||
#define SET0(v, n) ((v)[n] = 0)
|
||||
#define SET1(v, n) ((v)[n] = 1)
|
||||
#define ISSET(v, n) ((v)[n])
|
||||
#define ASSIGN(d, s) memmove(d, s, m->g->nstates)
|
||||
#define EQ(a, b) (memcmp(a, b, m->g->nstates) == 0)
|
||||
/* vn counts the sets handed out by SETUP; space is the buffer they are carved from */
#define STATEVARS long vn; char *space
|
||||
/*
 * STATESETUP allocates room for nv sets of nstates bytes each and resets vn;
 * if malloc fails it makes the *enclosing function* return REG_ESPACE
 */
#define STATESETUP(m, nv) { (m)->space = malloc((nv)*(m)->g->nstates); \
|
||||
if ((m)->space == NULL) return(REG_ESPACE); \
|
||||
(m)->vn = 0; }
|
||||
#define STATETEARDOWN(m) { free((m)->space); }
|
||||
/* SETUP points v at slice number vn of space and then increments vn */
#define SETUP(v) ((v) = &m->space[m->vn++ * m->g->nstates])
|
||||
/* a onestate is a plain index into the set */
#define onestate long
|
||||
#define INIT(o, n) ((o) = (n))
|
||||
#define INC(o) ((o)++)
|
||||
#define ISSTATEIN(v, o) ((v)[o])
|
||||
/* some abbreviations; note that some of these know variable names! */
|
||||
/* do "if I'm here, I can also be there" etc without branches */
|
||||
/* in this version the caller-scope variable `here` is used as an array index */
#define FWD(dst, src, n) ((dst)[here+(n)] |= (src)[here])
|
||||
#define BACK(dst, src, n) ((dst)[here-(n)] |= (src)[here])
|
||||
#define ISSETBACK(v, n) ((v)[here - (n)])
|
||||
/* function names */
|
||||
#define LNAMES /* flag */
|
||||
|
||||
#include "regengine.inc"
|
||||
|
||||
/*
|
||||
- llvm_regexec - interface for matching
|
||||
*
|
||||
* We put this here so we can exploit knowledge of the state representation
|
||||
* when choosing which matcher to call. Also, by this point the matchers
|
||||
* have been prototyped.
|
||||
*/
|
||||
int /* 0 success, REG_NOMATCH failure */
|
||||
llvm_regexec(const llvm_regex_t *preg, const char *string, size_t nmatch,
|
||||
llvm_regmatch_t pmatch[], int eflags)
|
||||
{
|
||||
struct re_guts *g = preg->re_g;
|
||||
#ifdef REDEBUG
|
||||
# define GOODFLAGS(f) (f)
|
||||
#else
|
||||
# define GOODFLAGS(f) ((f)&(REG_NOTBOL|REG_NOTEOL|REG_STARTEND))
|
||||
#endif
|
||||
|
||||
if (preg->re_magic != MAGIC1 || g->magic != MAGIC2)
|
||||
return(REG_BADPAT);
|
||||
assert(!(g->iflags®EX_BAD));
|
||||
if (g->iflags®EX_BAD) /* backstop for no-debug case */
|
||||
return(REG_BADPAT);
|
||||
eflags = GOODFLAGS(eflags);
|
||||
|
||||
if (g->nstates <= (long)(CHAR_BIT*sizeof(states1)) && !(eflags®_LARGE))
|
||||
return(smatcher(g, (char *)string, nmatch, pmatch, eflags));
|
||||
else
|
||||
return(lmatcher(g, (char *)string, nmatch, pmatch, eflags));
|
||||
}
|
72
lib/Support/regfree.c
Normal file
72
lib/Support/regfree.c
Normal file
@ -0,0 +1,72 @@
|
||||
/*-
|
||||
* This code is derived from OpenBSD's libc/regex, original license follows:
|
||||
*
|
||||
* Copyright (c) 1992, 1993, 1994 Henry Spencer.
|
||||
* Copyright (c) 1992, 1993, 1994
|
||||
* The Regents of the University of California. All rights reserved.
|
||||
*
|
||||
* This code is derived from software contributed to Berkeley by
|
||||
* Henry Spencer.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* 3. Neither the name of the University nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*
|
||||
* @(#)regfree.c 8.3 (Berkeley) 3/20/94
|
||||
*/
|
||||
|
||||
#include <sys/types.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "regex_impl.h"
|
||||
|
||||
#include "regutils.h"
|
||||
#include "regex2.h"
|
||||
|
||||
/*
|
||||
- llvm_regfree - free everything
|
||||
*/
|
||||
void
|
||||
llvm_regfree(llvm_regex_t *preg)
|
||||
{
|
||||
struct re_guts *g;
|
||||
|
||||
if (preg->re_magic != MAGIC1) /* oops */
|
||||
return; /* nice to complain, but hard */
|
||||
|
||||
g = preg->re_g;
|
||||
if (g == NULL || g->magic != MAGIC2) /* oops again */
|
||||
return;
|
||||
preg->re_magic = 0; /* mark it invalid */
|
||||
g->magic = 0; /* mark it invalid */
|
||||
|
||||
if (g->strip != NULL)
|
||||
free((char *)g->strip);
|
||||
if (g->sets != NULL)
|
||||
free((char *)g->sets);
|
||||
if (g->setbits != NULL)
|
||||
free((char *)g->setbits);
|
||||
if (g->must != NULL)
|
||||
free(g->must);
|
||||
free((char *)g);
|
||||
}
|
52
lib/Support/regstrlcpy.c
Normal file
52
lib/Support/regstrlcpy.c
Normal file
@ -0,0 +1,52 @@
|
||||
/*
|
||||
* This code is derived from OpenBSD's libc, original license follows:
|
||||
*
|
||||
* Copyright (c) 1998 Todd C. Miller <Todd.Miller@courtesan.com>
|
||||
*
|
||||
* Permission to use, copy, modify, and distribute this software for any
|
||||
* purpose with or without fee is hereby granted, provided that the above
|
||||
* copyright notice and this permission notice appear in all copies.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
||||
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
||||
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
||||
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
||||
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
||||
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <sys/types.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "regex_impl.h"
|
||||
/*
 * Copy src to string dst of size siz.  At most siz-1 characters
 * will be copied.  Always NUL terminates (unless siz == 0).
 * Returns strlen(src); if retval >= siz, truncation occurred.
 *
 * dst is not touched at all when siz == 0.  src must be NUL-terminated,
 * because its full length is always measured for the return value.
 */
size_t
llvm_strlcpy(char *dst, const char *src, size_t siz)
{
	const size_t srclen = strlen(src);

	if (siz != 0) {
		/* leave one byte of dst for the terminator */
		const size_t ncopy = (srclen < siz) ? srclen : siz - 1;

		memcpy(dst, src, ncopy);
		dst[ncopy] = '\0';
	}

	return(srclen);		/* count does not include NUL */
}
|
55
lib/Support/regutils.h
Normal file
55
lib/Support/regutils.h
Normal file
@ -0,0 +1,55 @@
|
||||
/*-
|
||||
* This code is derived from OpenBSD's libc/regex, original license follows:
|
||||
*
|
||||
* Copyright (c) 1992, 1993, 1994 Henry Spencer.
|
||||
* Copyright (c) 1992, 1993, 1994
|
||||
* The Regents of the University of California. All rights reserved.
|
||||
*
|
||||
* This code is derived from software contributed to Berkeley by
|
||||
* Henry Spencer.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions
|
||||
* are met:
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in the
|
||||
* documentation and/or other materials provided with the distribution.
|
||||
* 3. Neither the name of the University nor the names of its contributors
|
||||
* may be used to endorse or promote products derived from this software
|
||||
* without specific prior written permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
|
||||
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
|
||||
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
* SUCH DAMAGE.
|
||||
*
|
||||
* @(#)utils.h 8.3 (Berkeley) 3/20/94
|
||||
*/
|
||||
|
||||
/* utility definitions */
|
||||
#define DUPMAX _POSIX2_RE_DUP_MAX /* xxx is this right? */
|
||||
#define INFINITY (DUPMAX + 1)
|
||||
#define NC (CHAR_MAX - CHAR_MIN + 1)
|
||||
typedef unsigned char uch;
|
||||
|
||||
/* switch off assertions (if not already off) if no REDEBUG */
|
||||
#ifndef REDEBUG
|
||||
#ifndef NDEBUG
|
||||
#define NDEBUG /* no assertions please */
|
||||
#endif
|
||||
#endif
|
||||
#include <assert.h>
|
||||
|
||||
/* for old systems with bcopy() but no memmove() */
|
||||
#ifdef USEBCOPY
|
||||
#define memmove(d, s, c) bcopy(s, d, c)
|
||||
#endif
|
64
unittests/Support/RegexTest.cpp
Normal file
64
unittests/Support/RegexTest.cpp
Normal file
@ -0,0 +1,64 @@
|
||||
//===- llvm/unittest/Support/RegexTest.cpp - Regex tests --===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
#include "llvm/Support/Regex.h"
|
||||
#include <cstring>
|
||||
|
||||
using namespace llvm;
|
||||
namespace {
|
||||
|
||||
class RegexTest : public ::testing::Test {
|
||||
};
|
||||
|
||||
TEST_F(RegexTest, Basics) {
|
||||
Regex r1("^[0-9]+$");
|
||||
EXPECT_TRUE(r1.match("916"));
|
||||
EXPECT_TRUE(r1.match("9"));
|
||||
EXPECT_FALSE(r1.match("9a"));
|
||||
|
||||
SmallVector<StringRef, 1> Matches;
|
||||
Regex r2("[0-9]+", Regex::Sub);
|
||||
EXPECT_TRUE(r2.match("aa216b", &Matches));
|
||||
EXPECT_EQ(1u, Matches.size());
|
||||
EXPECT_EQ("216", Matches[0].str());
|
||||
|
||||
Regex r3("[0-9]+([a-f])?:([0-9]+)", Regex::Sub);
|
||||
EXPECT_TRUE(r3.match("9a:513b", &Matches));
|
||||
EXPECT_EQ(3u, Matches.size());
|
||||
EXPECT_EQ("9a:513", Matches[0].str());
|
||||
EXPECT_EQ("a", Matches[1].str());
|
||||
EXPECT_EQ("513", Matches[2].str());
|
||||
|
||||
EXPECT_TRUE(r3.match("9:513b", &Matches));
|
||||
EXPECT_EQ(3u, Matches.size());
|
||||
EXPECT_EQ("9:513", Matches[0].str());
|
||||
EXPECT_EQ("", Matches[1].str());
|
||||
EXPECT_EQ("513", Matches[2].str());
|
||||
|
||||
Regex r4("a[^b]+b", Regex::Sub);
|
||||
std::string String="axxb";
|
||||
String[2] = '\0';
|
||||
EXPECT_FALSE(r4.match("abb"));
|
||||
EXPECT_TRUE(r4.match(String, &Matches));
|
||||
EXPECT_EQ(1u, Matches.size());
|
||||
EXPECT_EQ(String, Matches[0].str());
|
||||
|
||||
|
||||
std::string NulPattern="X[0-9]+X([a-f])?:([0-9]+)";
|
||||
String="YX99a:513b";
|
||||
NulPattern[7] = '\0';
|
||||
Regex r5(NulPattern, Regex::Sub);
|
||||
EXPECT_FALSE(r5.match(String));
|
||||
EXPECT_FALSE(r5.match("X9"));
|
||||
String[3]='\0';
|
||||
EXPECT_TRUE(r5.match(String));
|
||||
}
|
||||
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user