added regex man pages (chap 3 and 7)

2024-06-24 23:30:02 +00:00 · 1997-10-08 07:09:19 +00:00 · 1997-10-08 07:09:19 +00:00 · 86fc467fac
commit 86fc467fac
parent 280300e2f8
3 changed files with 804 additions and 1 deletions
--- a/usr.man/man3/regex.3
+++ b/usr.man/man3/regex.3
@ -0,0 +1,530 @@
+.\" Copyright (c) 1992, 1993, 1994 Henry Spencer.
+.\" Copyright (c) 1992, 1993, 1994
+.\"	The Regents of the University of California.  All rights reserved.
+.\"
+.\" This code is derived from software contributed to Berkeley by
+.\" Henry Spencer.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\"    notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\"    notice, this list of conditions and the following disclaimer in the
+.\"    documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\"    must display the following acknowledgement:
+.\"	This product includes software developed by the University of
+.\"	California, Berkeley and its contributors.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\"    may be used to endorse or promote products derived from this software
+.\"    without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\"	@(#)regex.3	8.4 (Berkeley) 3/20/94
+.\"
+.TH REGEX 3 "7 October 1997" GNO "Library Routines"
+.SH NAME
+regcomp, regexec, regerror, regfree \- regular-expression library
+.SH SYNOPSIS
+#include <sys/types.h>
+.br
+#include <regex.h>
+.HP
+int \fBregcomp\fR(regex_t *\fIpreg\fR, const char *\fIpattern\fR,
+int \fIcflags\fR);
+.HP
+int \fBregexec\fR(const regex_t *\fIpreg\fR, const char *\fIstring\fR,
+size_t \fInmatch\fR, regmatch_t \fIpmatch\fR[], int \fIeflags\fR);
+.HP
+size_t \fBregerror\fR(int \fIerrcode\fR, const regex_t *\fIpreg\fR,
+char *\fIerrbuf\fR, size_t \fIerrbuf_size\fR);
+.HP
+void \fBregfree\fR(regex_t *\fIpreg\fR);
+.SH DESCRIPTION
+These routines implement POSIX 1003.2 regular expressions (``RE''s);
+see
+.BR re_format (7).
+.B regcomp
+compiles an RE written as a string into an internal form,
+.B regexec
+matches that internal form against a string and reports results,
+.B regerror
+transforms error codes from either into human-readable messages,
+and
+.B regfree
+frees any dynamically-allocated storage used by the internal form
+of an RE.
+.PP
+The header
+.B <regex.h>
+declares two structure types,
+.B regex_t
+and
+.BR regmatch_t ,
+the former for compiled internal forms and the latter for match reporting.
+It also declares the four functions,
+a type
+.BR regoff_t ,
+and a number of constants with names starting with ``REG_''.
+.PP
+.B regcomp
+compiles the regular expression contained in the
+.I pattern
+string,
+subject to the flags in
+.IR cflags ,
+and places the results in the
+.B regex_t
+structure pointed to by
+.IR preg .
+.I Cflags
+is the bitwise OR of zero or more of the following flags:
+.IP REG_EXTENDED
+Compile modern (``extended'') REs,
+rather than the obsolete (``basic'') REs that
+are the default.
+.IP REG_BASIC
+This is a synonym for 0,
+provided as a counterpart to REG_EXTENDED to improve readability.
+.IP REG_NOSPEC
+Compile with recognition of all special characters turned off.
+All characters are thus considered ordinary,
+so the ``RE'' is a literal string.
+This is an extension,
+compatible with but not specified by POSIX 1003.2,
+and should be used with
+caution in software intended to be portable to other systems.
+REG_EXTENDED and REG_NOSPEC may not be used
+in the same call to
+.BR regcomp .
+.IP REG_ICASE
+Compile for matching that ignores upper/lower case distinctions.
+See
+.BR re_format (7).
+.IP REG_NOSUB
+Compile for matching that need only report success or failure,
+not what was matched.
+.IP REG_NEWLINE
+Compile for newline-sensitive matching.
+By default, newline is a completely ordinary character with no special
+meaning in either REs or strings.
+With this flag,
+`[^' bracket expressions and `.' never match newline,
+a `^' anchor matches the null string after any newline in the string
+in addition to its normal function,
+and the `$' anchor matches the null string before any newline in the
+string in addition to its normal function.
+.IP REG_PEND
+The regular expression ends,
+not at the first NUL,
+but just before the character pointed to by the
+.B re_endp
+member of the structure pointed to by
+.IR preg .
+The
+.B re_endp
+member is of type
+.BR "const char *" .
+This flag permits inclusion of NULs in the RE;
+they are considered ordinary characters.
+This is an extension,
+compatible with but not specified by POSIX 1003.2,
+and should be used with
+caution in software intended to be portable to other systems.
+.PP
+When successful,
+.B regcomp
+returns 0 and fills in the structure pointed to by
+.IR preg .
+One member of that structure
+(other than
+.BR re_endp )
+is publicized:
+.BR re_nsub ,
+of type
+.BR size_t ,
+contains the number of parenthesized subexpressions within the RE
+(except that the value of this member is undefined if the
+REG_NOSUB flag was used).
+If
+.B regcomp
+fails, it returns a non-zero error code;
+see DIAGNOSTICS.
+.PP
+.B Regexec
+matches the compiled RE pointed to by
+.B preg
+against the
+.IR string ,
+subject to the flags in
+.IR eflags ,
+and reports results using
+.IR nmatch ,
+.IR pmatch ,
+and the returned value.
+The RE must have been compiled by a previous invocation of
+.BR regcomp .
+The compiled form is not altered during execution of
+.BR regexec ,
+so a single compiled RE can be used simultaneously by multiple threads.
+.PP
+By default,
+the NUL-terminated string pointed to by
+.I string
+is considered to be the text of an entire line, minus any terminating
+newline.
+The
+.I eflags
+argument is the bitwise OR of zero or more of the following flags:
+.IP REG_NOTBOL \w'REG_STARTEND'u+2n
+The first character of
+the string
+is not the beginning of a line, so the `^' anchor should not match before it.
+This does not affect the behavior of newlines under REG_NEWLINE.
+.IP REG_NOTEOL
+The NUL terminating
+the string
+does not end a line, so the `$' anchor should not match before it.
+This does not affect the behavior of newlines under REG_NEWLINE.
+.IP REG_STARTEND
+The string is considered to start at
+\fIstring\fR\ + \fIpmatch\fR[0].\fIrm_so\fR
+and to have a terminating NUL located at
+\fIstring\fR\ + \fIpmatch\fR[0].\fIrm_eo\fR
+(there need not actually be a NUL at that location),
+regardless of the value of
+.IR nmatch .
+See below for the definition of
+.IR pmatch
+and
+.IR nmatch .
+This is an extension,
+compatible with but not specified by POSIX 1003.2,
+and should be used with
+caution in software intended to be portable to other systems.
+Note that a non-zero \fIrm_so\fR does not imply REG_NOTBOL;
+REG_STARTEND affects only the location of the string,
+not how it is matched.
+.PP
+See
+.BR re_format (7)
+for a discussion of what is matched in situations where an RE or a
+portion thereof could match any of several substrings of
+.IR string .
+.PP
+Normally,
+.B regexec
+returns 0 for success and the non-zero code REG_NOMATCH for failure.
+Other non-zero error codes may be returned in exceptional situations;
+see DIAGNOSTICS.
+.PP
+If REG_NOSUB was specified in the compilation of the RE,
+or if
+.I nmatch
+is 0,
+.B regexec
+ignores the
+.I pmatch
+argument (but see below for the case where REG_STARTEND is specified).
+Otherwise,
+.I pmatch
+points to an array of
+.I nmatch
+structures of type
+.BR regmatch_t .
+Such a structure has at least the members
+.B rm_so
+and
+.BR rm_eo ,
+both of type
+.B regoff_t
+(a signed arithmetic type at least as large as an
+.B off_t
+and a
+.BR ssize_t ),
+containing respectively the offset of the first character of a substring
+and the offset of the first character after the end of the substring.
+Offsets are measured from the beginning of the
+.I string
+argument given to
+.BR regexec .
+An empty substring is denoted by equal offsets,
+both indicating the character following the empty substring.
+.PP
+The 0th member of the
+.I pmatch
+array is filled in to indicate what substring of
+.I string
+was matched by the entire RE.
+Remaining members report what substring was matched by parenthesized
+subexpressions within the RE;
+member
+.I i
+reports subexpression
+.IR i ,
+with subexpressions counted (starting at 1) by the order of their opening
+parentheses in the RE, left to right.
+Unused entries in the array\(emcorresponding either to subexpressions that
+did not participate in the match at all, or to subexpressions that do not
+exist in the RE (that is, \fIi\fR\ > \fIpreg\fR\->\fIre_nsub\fR)\(emhave both
+.B rm_so
+and
+.B rm_eo
+set to \-1.
+If a subexpression participated in the match several times,
+the reported substring is the last one it matched.
+(Note, as an example in particular, that when the RE `(b*)+' matches `bbb',
+the parenthesized subexpression matches each of the three `b's and then
+an infinite number of empty strings following the last `b',
+so the reported substring is one of the empties.)
+.PP
+If REG_STARTEND is specified,
+.I pmatch
+must point to at least one
+.B regmatch_t
+(even if
+.I nmatch
+is 0 or REG_NOSUB was specified),
+to hold the input offsets for REG_STARTEND.
+Use for output is still entirely controlled by
+.IR nmatch ;
+if
+.I nmatch
+is 0 or REG_NOSUB was specified,
+the value of
+.IR pmatch [0]
+will not be changed by a successful
+.IR regexec .
+.PP
+.B regerror
+maps a non-zero
+.I errcode
+from either
+.B regcomp
+or
+.B regexec
+to a human-readable, printable message.
+If
+.I preg
+is non-NULL,
+the error code should have arisen from use of
+the
+.B regex_t
+pointed to by
+.IR preg ,
+and if the error code came from
+.BR regcomp ,
+it should have been the result from the most recent
+.B regcomp
+using that
+.IR regex_t .
+.RB ( regerror
+may be able to supply a more detailed message using information
+from the
+.BR regex_t .)
+.B regerror
+places the NUL-terminated message into the buffer pointed to by
+.IR errbuf ,
+limiting the length (including the NUL) to at most
+.I errbuf_size
+bytes.
+If the whole message won't fit,
+as much of it as will fit before the terminating NUL is supplied.
+In any case,
+the returned value is the size of buffer needed to hold the whole
+message (including terminating NUL).
+If
+.I errbuf_size
+is 0,
+.I errbuf
+is ignored but the return value is still correct.
+.PP
+If the
+.I errcode
+given to
+.B regerror
+is first ORed with REG_ITOA,
+the ``message'' that results is the printable name of the error code,
+e.g. ``REG_NOMATCH'',
+rather than an explanation thereof.
+If
+.I errcode
+is REG_ATOI,
+then
+.I preg
+shall be non-NULL and the
+.I re_endp
+member of the structure it points to
+must point to the printable name of an error code;
+in this case, the result in
+.I errbuf
+is the decimal digits of
+the numeric value of the error code
+(0 if the name is not recognized).
+REG_ITOA and REG_ATOI are intended primarily as debugging facilities;
+they are extensions,
+compatible with but not specified by POSIX 1003.2,
+and should be used with
+caution in software intended to be portable to other systems.
+Be warned also that they are considered experimental and changes are possible.
+.PP
+.I Regfree
+frees any dynamically-allocated storage associated with the compiled RE
+pointed to by
+.IR preg .
+The remaining
+.B regex_t
+is no longer a valid compiled RE
+and the effect of supplying it to
+.B regexec
+or
+.B regerror
+is undefined.
+.PP
+None of these functions references global variables except for tables
+of constants;
+all are safe for use from multiple threads if the arguments are safe.
+.SH IMPLEMENTATION CHOICES
+There are a number of decisions that 1003.2 leaves up to the implementor,
+either by explicitly saying ``undefined'' or by virtue of them being
+forbidden by the RE grammar.
+This implementation treats them as follows.
+.PP
+See
+.BR re_format (7)
+for a discussion of the definition of case-independent matching.
+.PP
+There is no particular limit on the length of REs,
+except insofar as memory is limited.
+Memory usage is approximately linear in RE size, and largely insensitive
+to RE complexity, except for bounded repetitions.
+See BUGS for one short RE using them
+that will run almost any system out of memory.
+.PP
+A backslashed character other than one specifically given a magic meaning
+by 1003.2 (such magic meanings occur only in obsolete [``basic''] REs)
+is taken as an ordinary character.
+.PP
+Any unmatched [ is a REG_EBRACK error.
+.PP
+Equivalence classes cannot begin or end bracket-expression ranges.
+The endpoint of one range cannot begin another.
+.PP
+RE_DUP_MAX, the limit on repetition counts in bounded repetitions, is 255.
+.PP
+A repetition operator (?, *, +, or bounds) cannot follow another
+repetition operator.
+A repetition operator cannot begin an expression or subexpression
+or follow `^' or `|'.
+.PP
+`|' cannot appear first or last in a (sub)expression or after another `|',
+i.e. an operand of `|' cannot be an empty subexpression.
+An empty parenthesized subexpression, `()', is legal and matches an
+empty (sub)string.
+An empty string is not a legal RE.
+.PP
+A `{' followed by a digit is considered the beginning of bounds for a
+bounded repetition, which must then follow the syntax for bounds.
+A `{' \fInot\fR followed by a digit is considered an ordinary character.
+.PP
+`^' and `$' beginning and ending subexpressions in obsolete (``basic'')
+REs are anchors, not ordinary characters.
+.SH SEE ALSO
+grep(1), re_format(7)
+.PP
+POSIX 1003.2, sections 2.8 (Regular Expression Notation)
+and
+B.5 (C Binding for Regular Expression Matching).
+.SH DIAGNOSTICS
+Non-zero error codes from
+.B regcomp
+and
+.B regexec
+include the following:
+.nf
+
+REG_NOMATCH	regexec() failed to match
+REG_BADPAT	invalid regular expression
+REG_ECOLLATE	invalid collating element
+REG_ECTYPE	invalid character class
+REG_EESCAPE	\e applied to unescapable character
+REG_ESUBREG	invalid backreference number
+REG_EBRACK	brackets [ ] not balanced
+REG_EPAREN	parentheses ( ) not balanced
+REG_EBRACE	braces { } not balanced
+REG_BADBR	invalid repetition count(s) in { }
+REG_ERANGE	invalid character range in [ ]
+REG_ESPACE	ran out of memory
+REG_BADRPT	?, *, or + operand invalid
+REG_EMPTY	empty (sub)expression
+REG_ASSERT	``can't happen''\(emyou found a bug
+REG_INVARG	invalid argument, e.g. negative-length string
+.fi
+.SH HISTORY
+Originally written by Henry Spencer.
+Altered for inclusion in the 4.4BSD distribution.
+.SH BUGS
+This is an alpha release with known defects.
+Please report problems.
+.PP
+There is one known functionality bug.
+The implementation of internationalization is incomplete:
+the locale is always assumed to be the default one of 1003.2,
+and only the collating elements etc. of that locale are available.
+.PP
+The back-reference code is subtle and doubts linger about its correctness
+in complex cases.
+.PP
+.B regexec
+performance is poor.
+This will improve with later releases.
+.I nmatch
+exceeding 0 is expensive;
+.I nmatch
+exceeding 1 is worse.
+.B regexec
+is largely insensitive to RE complexity \fIexcept\fR that back
+references are massively expensive.
+RE length does matter; in particular, there is a strong speed bonus
+for keeping RE length under about 30 characters,
+with most special characters counting roughly double.
+.PP
+.B regcomp
+implements bounded repetitions by macro expansion,
+which is costly in time and space if counts are large
+or bounded repetitions are nested.
+An RE like, say,
+`((((a{1,100}){1,100}){1,100}){1,100}){1,100}'
+will (eventually) run almost any existing machine out of swap space.
+.PP
+There are suspected problems with response to obscure error conditions.
+Notably,
+certain kinds of internal overflow,
+produced only by truly enormous REs or by multiply nested bounded repetitions,
+are probably not handled well.
+.PP
+Due to a mistake in 1003.2, things like `a)b' are legal REs because `)' is
+a special character only in the presence of a previous unmatched `('.
+This can't be fixed until the spec is fixed.
+.PP
+The standard's definition of back references is vague.
+For example, does
+`a\e(\e(b\e)*\e2\e)*d' match `abbbd'?
+Until the standard is clarified,
+behavior in such cases should not be relied on.
+.PP
+The implementation of word-boundary matching is a bit of a kludge,
+and bugs may lurk in combinations of word-boundary matching and anchoring.
--- a/usr.man/man7/re.format.7
+++ b/usr.man/man7/re.format.7
@ -0,0 +1,268 @@
+.\" Copyright (c) 1992, 1993, 1994 Henry Spencer.
+.\" Copyright (c) 1992, 1993, 1994
+.\"	The Regents of the University of California.  All rights reserved.
+.\"
+.\" This code is derived from software contributed to Berkeley by
+.\" Henry Spencer.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\"    notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\"    notice, this list of conditions and the following disclaimer in the
+.\"    documentation and/or other materials provided with the distribution.
+.\" 3. All advertising materials mentioning features or use of this software
+.\"    must display the following acknowledgement:
+.\"	This product includes software developed by the University of
+.\"	California, Berkeley and its contributors.
+.\" 4. Neither the name of the University nor the names of its contributors
+.\"    may be used to endorse or promote products derived from this software
+.\"    without specific prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+.\" ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.\"	@(#)re_format.7	8.3 (Berkeley) 3/20/94
+.\"
+.TH RE_FORMAT 7 "7 October 1997"
+.SH NAME
+re_format \- POSIX 1003.2 regular expressions
+.SH DESCRIPTION
+Regular expressions (``RE''s),
+as defined in POSIX 1003.2, come in two forms:
+modern REs (roughly those of
+.BR egrep ;
+1003.2 calls these ``extended'' REs)
+and obsolete REs (roughly those of
+.BR ed ;
+1003.2 ``basic'' REs).
+Obsolete REs mostly exist for backward compatibility in some old programs;
+they will be discussed at the end.
+1003.2 leaves some aspects of RE syntax and semantics open;
+`\(dg' marks decisions on these aspects that
+may not be fully portable to other 1003.2 implementations.
+.PP
+A (modern) RE is one\(dg or more non-empty\(dg \fIbranches\fR,
+separated by `|'.
+It matches anything that matches one of the branches.
+.PP
+A branch is one\(dg or more \fIpieces\fR, concatenated.
+It matches a match for the first, followed by a match for the second, etc.
+.PP
+A piece is an \fIatom\fR possibly followed
+by a single\(dg `*', `+', `?', or \fIbound\fR.
+An atom followed by `*' matches a sequence of 0 or more matches of the atom.
+An atom followed by `+' matches a sequence of 1 or more matches of the atom.
+An atom followed by `?' matches a sequence of 0 or 1 matches of the atom.
+.PP
+A \fIbound\fR is `{' followed by an unsigned decimal integer,
+possibly followed by `,'
+possibly followed by another unsigned decimal integer,
+always followed by `}'.
+The integers must lie between 0 and RE_DUP_MAX (255\(dg) inclusive,
+and if there are two of them, the first may not exceed the second.
+An atom followed by a bound containing one integer \fIi\fR
+and no comma matches
+a sequence of exactly \fIi\fR matches of the atom.
+An atom followed by a bound
+containing one integer \fIi\fR and a comma matches
+a sequence of \fIi\fR or more matches of the atom.
+An atom followed by a bound
+containing two integers \fIi\fR and \fIj\fR matches
+a sequence of \fIi\fR through \fIj\fR (inclusive) matches of the atom.
+.PP
+An atom is a regular expression enclosed in `()' (matching a match for the
+regular expression),
+an empty set of `()' (matching the null string)\(dg,
+a \fIbracket expression\fR (see below), `.'
+(matching any single character), `^' (matching the null string at the
+beginning of a line), `$' (matching the null string at the
+end of a line), a `\e' followed by one of the characters
+`^.[$()|*+?{\e'
+(matching that character taken as an ordinary character),
+a `\e' followed by any other character\(dg
+(matching that character taken as an ordinary character,
+as if the `\e' had not been present\(dg),
+or a single character with no other significance (matching that character).
+A `{' followed by a character other than a digit is an ordinary
+character, not the beginning of a bound\(dg.
+It is illegal to end an RE with `\e'.
+.PP
+A \fIbracket expression\fR is a list of characters enclosed in `[]'.
+It normally matches any single character from the list (but see below).
+If the list begins with `^',
+it matches any single character
+(but see below) \fInot\fR from the rest of the list.
+If two characters in the list are separated by `\-', this is shorthand
+for the full \fIrange\fR of characters between those two (inclusive) in the
+collating sequence,
+e.g. `[0-9]' in ASCII matches any decimal digit.
+It is illegal\(dg for two ranges to share an
+endpoint, e.g. `a-c-e'.
+Ranges are very collating-sequence-dependent,
+and portable programs should avoid relying on them.
+.PP
+To include a literal `]' in the list, make it the first character
+(following a possible `^').
+To include a literal `\-', make it the first or last character,
+or the second endpoint of a range.
+To use a literal `\-' as the first endpoint of a range,
+enclose it in `[.' and `.]' to make it a collating element (see below).
+With the exception of these and some combinations using `[' (see next
+paragraphs), all other special characters, including `\e', lose their
+special significance within a bracket expression.
+.PP
+Within a bracket expression, a collating element (a character,
+a multi-character sequence that collates as if it were a single character,
+or a collating-sequence name for either)
+enclosed in `[.' and `.]' stands for the
+sequence of characters of that collating element.
+The sequence is a single element of the bracket expression's list.
+A bracket expression containing a multi-character collating element 
+can thus match more than one character,
+e.g. if the collating sequence includes a `ch' collating element,
+then the RE `[[.ch.]]*c' matches the first five characters
+of `chchcc'.
+.PP
+Within a bracket expression, a collating element enclosed in `[=' and
+`=]' is an equivalence class, standing for the sequences of characters
+of all collating elements equivalent to that one, including itself.
+(If there are no other equivalent collating elements,
+the treatment is as if the enclosing delimiters were `[.' and `.]'.)
+For example, if o and \o'o^' are the members of an equivalence class,
+then `[[=o=]]', `[[=\o'o^'=]]', and `[o\o'o^']' are all synonymous.
+An equivalence class may not\(dg be an endpoint
+of a range.
+.PP
+Within a bracket expression, the name of a \fIcharacter class\fR enclosed
+in `[:' and `:]' stands for the list of all characters belonging to that
+class.
+Standard character class names are:
+.PP
+.RS
+.nf
+alnum	digit	punct
+alpha	graph	space
+blank	lower	upper
+cntrl	print	xdigit
+.fi
+.RE
+.PP
+These stand for the character classes defined in
+.IR ctype (3).
+A locale may provide others.
+A character class may not be used as an endpoint of a range.
+.PP
+There are two special cases\(dg of bracket expressions:
+the bracket expressions `[[:<:]]' and `[[:>:]]' match the null string at
+the beginning and end of a word respectively.
+A word is defined as a sequence of
+word characters
+which is neither preceded nor followed by
+word characters.
+A word character is an
+.I alnum
+character (as defined by
+.IR ctype (3))
+or an underscore.
+This is an extension,
+compatible with but not specified by POSIX 1003.2,
+and should be used with
+caution in software intended to be portable to other systems.
+.PP
+In the event that an RE could match more than one substring of a given
+string,
+the RE matches the one starting earliest in the string.
+If the RE could match more than one substring starting at that point,
+it matches the longest.
+Subexpressions also match the longest possible substrings, subject to
+the constraint that the whole match be as long as possible,
+with subexpressions starting earlier in the RE taking priority over
+ones starting later.
+Note that higher-level subexpressions thus take priority over
+their lower-level component subexpressions.
+.PP
+Match lengths are measured in characters, not collating elements.
+A null string is considered longer than no match at all.
+For example,
+`bb*' matches the three middle characters of `abbbc',
+`(wee|week)(knights|nights)' matches all ten characters of `weeknights',
+when `(.*).*' is matched against `abc' the parenthesized subexpression
+matches all three characters, and
+when `(a*)*' is matched against `bc' both the whole RE and the parenthesized
+subexpression match the null string.
+.PP
+If case-independent matching is specified,
+the effect is much as if all case distinctions had vanished from the
+alphabet.
+When an alphabetic that exists in multiple cases appears as an
+ordinary character outside a bracket expression, it is effectively
+transformed into a bracket expression containing both cases,
+e.g. `x' becomes `[xX]'.
+When it appears inside a bracket expression, all case counterparts
+of it are added to the bracket expression, so that (e.g.) `[x]'
+becomes `[xX]' and `[^x]' becomes `[^xX]'.
+.PP
+No particular limit is imposed on the length of REs\(dg.
+Programs intended to be portable should not employ REs longer
+than 256 bytes,
+as an implementation can refuse to accept such REs and remain
+POSIX-compliant.
+.PP
+Obsolete (``basic'') regular expressions differ in several respects.
+`|', `+', and `?' are ordinary characters and there is no equivalent
+for their functionality.
+The delimiters for bounds are `\e{' and `\e}',
+with `{' and `}' by themselves ordinary characters.
+The parentheses for nested subexpressions are `\e(' and `\e)',
+with `(' and `)' by themselves ordinary characters.
+`^' is an ordinary character except at the beginning of the
+RE or\(dg the beginning of a parenthesized subexpression,
+`$' is an ordinary character except at the end of the
+RE or\(dg the end of a parenthesized subexpression,
+and `*' is an ordinary character if it appears at the beginning of the
+RE or the beginning of a parenthesized subexpression
+(after a possible leading `^').
+Finally, there is one new type of atom, a \fIback reference\fR:
+`\e' followed by a non-zero decimal digit \fId\fR
+matches the same sequence of characters
+matched by the \fId\fRth parenthesized subexpression
+(numbering subexpressions by the positions of their opening parentheses,
+left to right),
+so that (e.g.) `\e([bc]\e)\e1' matches `bb' or `cc' but not `bc'.
+.SH SEE ALSO
+.BR regex (3)
+.PP
+POSIX 1003.2, section 2.8 (Regular Expression Notation).
+.SH BUGS
+Having two kinds of REs is a botch.
+.PP
+The current 1003.2 spec says that `)' is an ordinary character in
+the absence of an unmatched `(';
+this was an unintentional result of a wording error,
+and change is likely.
+Avoid relying on it.
+.PP
+Back references are a dreadful botch,
+posing major problems for efficient implementations.
+They are also somewhat vaguely defined
+(does
+`a\e(\e(b\e)*\e2\e)*d' match `abbbd'?).
+Avoid using them.
+.PP
+1003.2's specification of case-independent matching is vague.
+The ``one case implies all cases'' definition given above
+is current consensus among implementors as to the right interpretation.
+.PP
+The syntax for word boundaries is incredibly ugly.
--- a/usr.man/mkso.data
+++ b/usr.man/mkso.data
@ -1,7 +1,7 @@
 # 0
 # 1  The first column is the "real" man page; the second is the .so link.
 # 2
-# 3  $Id: mkso.data,v 1.5 1997/10/03 05:46:31 gdr Exp $
+# 3  $Id: mkso.data,v 1.6 1997/10/08 07:09:08 gdr Exp $
 # 4
 man2/alarm.2		man2/alarm10.2
 man2/chdir.2		man2/fchdir.2
@ -201,6 +201,10 @@ man3/queue.3		man3/TAILQ_INSERT_BEFORE.3
 man3/queue.3		man3/TAILQ_INSERT_HEAD.3
 man3/queue.3		man3/TAILQ_INSERT_TAIL.3
 man3/queue.3		man3/TAILQ_REMOVE.3
+man3/regex.3		man3/regcomp.3
+man3/regex.3		man3/regexec.3
+man3/regex.3		man3/regerror.3
+man3/regex.3		man3/regfree.3
 man3/scanf.3		man3/fscanf.3
 man3/scanf.3		man3/sscanf.3
 man3/scanf.3		man3/vfscanf.3
@ -241,3 +245,4 @@ man3/tmpnam.3		man3/tmpfile.3
 man3/ttyname.3		man3/isatty.3
 man3/ttyname.3		man3/ttyslot.3
 man5/utmp.5		man5/wtmp.5
+man7/re.format.7	man7/re_format.7