Retro68/gcc/libgo/go/regexp/syntax/regexp.go

// Copyright 2011 The Go Authors.  All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package syntax

// Note to implementers:
// In this package, re is always a *Regexp and r is always a rune.

import (
	"bytes"
	"strconv"
	"strings"
	"unicode"
)

// A Regexp is a node in a regular expression syntax tree.
type Regexp struct {
	Op       Op // operator
	Flags    Flags
	Sub      []*Regexp  // subexpressions, if any
	Sub0     [1]*Regexp // storage for short Sub
	Rune     []rune     // matched runes, for OpLiteral, OpCharClass
	Rune0    [2]rune    // storage for short Rune
	Min, Max int        // min, max for OpRepeat
	Cap      int        // capturing index, for OpCapture
	Name     string     // capturing name, for OpCapture
}

// An Op is a single regular expression operator.
type Op uint8

// Operators are listed in precedence order, tightest binding to weakest.
// Character class operators are listed simplest to most complex
// (OpLiteral, OpCharClass, OpAnyCharNotNL, OpAnyChar).

const (
	OpNoMatch        Op = 1 + iota // matches no strings
	OpEmptyMatch                   // matches empty string
	OpLiteral                      // matches Runes sequence
	OpCharClass                    // matches Runes interpreted as range pair list
	OpAnyCharNotNL                 // matches any character except newline
	OpAnyChar                      // matches any character
	OpBeginLine                    // matches empty string at beginning of line
	OpEndLine                      // matches empty string at end of line
	OpBeginText                    // matches empty string at beginning of text
	OpEndText                      // matches empty string at end of text
	OpWordBoundary                 // matches word boundary `\b`
	OpNoWordBoundary               // matches word non-boundary `\B`
	OpCapture                      // capturing subexpression with index Cap, optional name Name
	OpStar                         // matches Sub[0] zero or more times
	OpPlus                         // matches Sub[0] one or more times
	OpQuest                        // matches Sub[0] zero or one times
	OpRepeat                       // matches Sub[0] at least Min times, at most Max (Max == -1 is no limit)
	OpConcat                       // matches concatenation of Subs
	OpAlternate                    // matches alternation of Subs
)

const opPseudo Op = 128 // where pseudo-ops start

// Equal returns true if x and y have identical structure.
func (x *Regexp) Equal(y *Regexp) bool {
	if x == nil || y == nil {
		return x == y
	}
	if x.Op != y.Op {
		return false
	}
	switch x.Op {
	case OpEndText:
		// The parse flags remember whether this is \z or \Z.
		if x.Flags&WasDollar != y.Flags&WasDollar {
			return false
		}

	case OpLiteral, OpCharClass:
		if len(x.Rune) != len(y.Rune) {
			return false
		}
		for i, r := range x.Rune {
			if r != y.Rune[i] {
				return false
			}
		}

	case OpAlternate, OpConcat:
		if len(x.Sub) != len(y.Sub) {
			return false
		}
		for i, sub := range x.Sub {
			if !sub.Equal(y.Sub[i]) {
				return false
			}
		}

	case OpStar, OpPlus, OpQuest:
		if x.Flags&NonGreedy != y.Flags&NonGreedy || !x.Sub[0].Equal(y.Sub[0]) {
			return false
		}

	case OpRepeat:
		if x.Flags&NonGreedy != y.Flags&NonGreedy || x.Min != y.Min || x.Max != y.Max || !x.Sub[0].Equal(y.Sub[0]) {
			return false
		}

	case OpCapture:
		if x.Cap != y.Cap || x.Name != y.Name || !x.Sub[0].Equal(y.Sub[0]) {
			return false
		}
	}
	return true
}

// writeRegexp writes the Perl syntax for the regular expression re to b.
func writeRegexp(b *bytes.Buffer, re *Regexp) {
	switch re.Op {
	default:
		b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">")
	case OpNoMatch:
		b.WriteString(`[^\x00-\x{10FFFF}]`)
	case OpEmptyMatch:
		b.WriteString(`(?:)`)
	case OpLiteral:
		if re.Flags&FoldCase != 0 {
			b.WriteString(`(?i:`)
		}
		for _, r := range re.Rune {
			escape(b, r, false)
		}
		if re.Flags&FoldCase != 0 {
			b.WriteString(`)`)
		}
	case OpCharClass:
		if len(re.Rune)%2 != 0 {
			b.WriteString(`[invalid char class]`)
			break
		}
		b.WriteRune('[')
		if len(re.Rune) == 0 {
			b.WriteString(`^\x00-\x{10FFFF}`)
		} else if re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune {
			// Contains 0 and MaxRune.  Probably a negated class.
			// Print the gaps.
			b.WriteRune('^')
			for i := 1; i < len(re.Rune)-1; i += 2 {
				lo, hi := re.Rune[i]+1, re.Rune[i+1]-1
				escape(b, lo, lo == '-')
				if lo != hi {
					b.WriteRune('-')
					escape(b, hi, hi == '-')
				}
			}
		} else {
			for i := 0; i < len(re.Rune); i += 2 {
				lo, hi := re.Rune[i], re.Rune[i+1]
				escape(b, lo, lo == '-')
				if lo != hi {
					b.WriteRune('-')
					escape(b, hi, hi == '-')
				}
			}
		}
		b.WriteRune(']')
	case OpAnyCharNotNL:
		b.WriteString(`(?-s:.)`)
	case OpAnyChar:
		b.WriteString(`(?s:.)`)
	case OpBeginLine:
		b.WriteString(`(?m:^)`)
	case OpEndLine:
		b.WriteString(`(?m:$)`)
	case OpBeginText:
		b.WriteString(`\A`)
	case OpEndText:
		if re.Flags&WasDollar != 0 {
			b.WriteString(`(?-m:$)`)
		} else {
			b.WriteString(`\z`)
		}
	case OpWordBoundary:
		b.WriteString(`\b`)
	case OpNoWordBoundary:
		b.WriteString(`\B`)
	case OpCapture:
		if re.Name != "" {
			b.WriteString(`(?P<`)
			b.WriteString(re.Name)
			b.WriteRune('>')
		} else {
			b.WriteRune('(')
		}
		if re.Sub[0].Op != OpEmptyMatch {
			writeRegexp(b, re.Sub[0])
		}
		b.WriteRune(')')
	case OpStar, OpPlus, OpQuest, OpRepeat:
		if sub := re.Sub[0]; sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 {
			b.WriteString(`(?:`)
			writeRegexp(b, sub)
			b.WriteString(`)`)
		} else {
			writeRegexp(b, sub)
		}
		switch re.Op {
		case OpStar:
			b.WriteRune('*')
		case OpPlus:
			b.WriteRune('+')
		case OpQuest:
			b.WriteRune('?')
		case OpRepeat:
			b.WriteRune('{')
			b.WriteString(strconv.Itoa(re.Min))
			if re.Max != re.Min {
				b.WriteRune(',')
				if re.Max >= 0 {
					b.WriteString(strconv.Itoa(re.Max))
				}
			}
			b.WriteRune('}')
		}
		if re.Flags&NonGreedy != 0 {
			b.WriteRune('?')
		}
	case OpConcat:
		for _, sub := range re.Sub {
			if sub.Op == OpAlternate {
				b.WriteString(`(?:`)
				writeRegexp(b, sub)
				b.WriteString(`)`)
			} else {
				writeRegexp(b, sub)
			}
		}
	case OpAlternate:
		for i, sub := range re.Sub {
			if i > 0 {
				b.WriteRune('|')
			}
			writeRegexp(b, sub)
		}
	}
}

func (re *Regexp) String() string {
	var b bytes.Buffer
	writeRegexp(&b, re)
	return b.String()
}

const meta = `\.+*?()|[]{}^$`

func escape(b *bytes.Buffer, r rune, force bool) {
	if unicode.IsPrint(r) {
		if strings.IndexRune(meta, r) >= 0 || force {
			b.WriteRune('\\')
		}
		b.WriteRune(r)
		return
	}

	switch r {
	case '\a':
		b.WriteString(`\a`)
	case '\f':
		b.WriteString(`\f`)
	case '\n':
		b.WriteString(`\n`)
	case '\r':
		b.WriteString(`\r`)
	case '\t':
		b.WriteString(`\t`)
	case '\v':
		b.WriteString(`\v`)
	default:
		if r < 0x100 {
			b.WriteString(`\x`)
			s := strconv.FormatInt(int64(r), 16)
			if len(s) == 1 {
				b.WriteRune('0')
			}
			b.WriteString(s)
			break
		}
		b.WriteString(`\x{`)
		b.WriteString(strconv.FormatInt(int64(r), 16))
		b.WriteString(`}`)
	}
}

// MaxCap walks the regexp to find the maximum capture index.
func (re *Regexp) MaxCap() int {
	m := 0
	if re.Op == OpCapture {
		m = re.Cap
	}
	for _, sub := range re.Sub {
		if n := sub.MaxCap(); m < n {
			m = n
		}
	}
	return m
}

// CapNames walks the regexp to find the names of capturing groups.
func (re *Regexp) CapNames() []string {
	names := make([]string, re.MaxCap()+1)
	re.capNames(names)
	return names
}

func (re *Regexp) capNames(names []string) {
	if re.Op == OpCapture {
		names[re.Cap] = re.Name
	}
	for _, sub := range re.Sub {
		sub.capNames(names)
	}
}
add gcc 4.70 2012-03-27 23:13:14 +00:00			`// Copyright 2011 The Go Authors. All rights reserved.`
			`// Use of this source code is governed by a BSD-style`
			`// license that can be found in the LICENSE file.`

			`package syntax`

			`// Note to implementers:`
			`// In this package, re is always a *Regexp and r is always a rune.`

			`import (`
			`"bytes"`
			`"strconv"`
			`"strings"`
			`"unicode"`
			`)`

			`// A Regexp is a node in a regular expression syntax tree.`
			`type Regexp struct {`
			`Op Op // operator`
			`Flags Flags`
			`Sub []*Regexp // subexpressions, if any`
			`Sub0 [1]*Regexp // storage for short Sub`
			`Rune []rune // matched runes, for OpLiteral, OpCharClass`
			`Rune0 [2]rune // storage for short Rune`
			`Min, Max int // min, max for OpRepeat`
			`Cap int // capturing index, for OpCapture`
			`Name string // capturing name, for OpCapture`
			`}`

			`// An Op is a single regular expression operator.`
			`type Op uint8`

			`// Operators are listed in precedence order, tightest binding to weakest.`
			`// Character class operators are listed simplest to most complex`
			`// (OpLiteral, OpCharClass, OpAnyCharNotNL, OpAnyChar).`

			`const (`
			`OpNoMatch Op = 1 + iota // matches no strings`
			`OpEmptyMatch // matches empty string`
			`OpLiteral // matches Runes sequence`
			`OpCharClass // matches Runes interpreted as range pair list`
Update gcc to 5.2.0 2015-08-28 15:33:40 +00:00			`OpAnyCharNotNL // matches any character except newline`
add gcc 4.70 2012-03-27 23:13:14 +00:00			`OpAnyChar // matches any character`
			`OpBeginLine // matches empty string at beginning of line`
			`OpEndLine // matches empty string at end of line`
			`OpBeginText // matches empty string at beginning of text`
			`OpEndText // matches empty string at end of text`
			OpWordBoundary // matches word boundary `\b`
			OpNoWordBoundary // matches word non-boundary `\B`
			`OpCapture // capturing subexpression with index Cap, optional name Name`
			`OpStar // matches Sub[0] zero or more times`
			`OpPlus // matches Sub[0] one or more times`
			`OpQuest // matches Sub[0] zero or one times`
			`OpRepeat // matches Sub[0] at least Min times, at most Max (Max == -1 is no limit)`
			`OpConcat // matches concatenation of Subs`
			`OpAlternate // matches alternation of Subs`
			`)`

			`const opPseudo Op = 128 // where pseudo-ops start`

			`// Equal returns true if x and y have identical structure.`
			`func (x Regexp) Equal(y Regexp) bool {`
			`if x == nil \|\| y == nil {`
			`return x == y`
			`}`
			`if x.Op != y.Op {`
			`return false`
			`}`
			`switch x.Op {`
			`case OpEndText:`
			`// The parse flags remember whether this is \z or \Z.`
			`if x.Flags&WasDollar != y.Flags&WasDollar {`
			`return false`
			`}`

			`case OpLiteral, OpCharClass:`
			`if len(x.Rune) != len(y.Rune) {`
			`return false`
			`}`
			`for i, r := range x.Rune {`
			`if r != y.Rune[i] {`
			`return false`
			`}`
			`}`

			`case OpAlternate, OpConcat:`
			`if len(x.Sub) != len(y.Sub) {`
			`return false`
			`}`
			`for i, sub := range x.Sub {`
			`if !sub.Equal(y.Sub[i]) {`
			`return false`
			`}`
			`}`

			`case OpStar, OpPlus, OpQuest:`
			`if x.Flags&NonGreedy != y.Flags&NonGreedy \|\| !x.Sub[0].Equal(y.Sub[0]) {`
			`return false`
			`}`

			`case OpRepeat:`
			`if x.Flags&NonGreedy != y.Flags&NonGreedy \|\| x.Min != y.Min \|\| x.Max != y.Max \|\| !x.Sub[0].Equal(y.Sub[0]) {`
			`return false`
			`}`

			`case OpCapture:`
			`if x.Cap != y.Cap \|\| x.Name != y.Name \|\| !x.Sub[0].Equal(y.Sub[0]) {`
			`return false`
			`}`
			`}`
			`return true`
			`}`

			`// writeRegexp writes the Perl syntax for the regular expression re to b.`
			`func writeRegexp(b bytes.Buffer, re Regexp) {`
			`switch re.Op {`
			`default:`
			`b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">")`
			`case OpNoMatch:`
			b.WriteString(`[^\x00-\x{10FFFF}]`)
			`case OpEmptyMatch:`
			b.WriteString(`(?:)`)
			`case OpLiteral:`
			`if re.Flags&FoldCase != 0 {`
			b.WriteString(`(?i:`)
			`}`
			`for _, r := range re.Rune {`
			`escape(b, r, false)`
			`}`
			`if re.Flags&FoldCase != 0 {`
			b.WriteString(`)`)
			`}`
			`case OpCharClass:`
			`if len(re.Rune)%2 != 0 {`
			b.WriteString(`[invalid char class]`)
			`break`
			`}`
			`b.WriteRune('[')`
			`if len(re.Rune) == 0 {`
			b.WriteString(`^\x00-\x{10FFFF}`)
			`} else if re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune {`
			`// Contains 0 and MaxRune. Probably a negated class.`
			`// Print the gaps.`
			`b.WriteRune('^')`
			`for i := 1; i < len(re.Rune)-1; i += 2 {`
			`lo, hi := re.Rune[i]+1, re.Rune[i+1]-1`
			`escape(b, lo, lo == '-')`
			`if lo != hi {`
			`b.WriteRune('-')`
			`escape(b, hi, hi == '-')`
			`}`
			`}`
			`} else {`
			`for i := 0; i < len(re.Rune); i += 2 {`
			`lo, hi := re.Rune[i], re.Rune[i+1]`
			`escape(b, lo, lo == '-')`
			`if lo != hi {`
			`b.WriteRune('-')`
			`escape(b, hi, hi == '-')`
			`}`
			`}`
			`}`
			`b.WriteRune(']')`
			`case OpAnyCharNotNL:`
			b.WriteString(`(?-s:.)`)
			`case OpAnyChar:`
			b.WriteString(`(?s:.)`)
			`case OpBeginLine:`
upgrade gcc to 6.3.0, binutils to 2.28 2017-04-10 11:32:00 +00:00			b.WriteString(`(?m:^)`)
add gcc 4.70 2012-03-27 23:13:14 +00:00			`case OpEndLine:`
upgrade gcc to 6.3.0, binutils to 2.28 2017-04-10 11:32:00 +00:00			b.WriteString(`(?m:$)`)
add gcc 4.70 2012-03-27 23:13:14 +00:00			`case OpBeginText:`
			b.WriteString(`\A`)
			`case OpEndText:`
			`if re.Flags&WasDollar != 0 {`
			b.WriteString(`(?-m:$)`)
			`} else {`
			b.WriteString(`\z`)
			`}`
			`case OpWordBoundary:`
			b.WriteString(`\b`)
			`case OpNoWordBoundary:`
			b.WriteString(`\B`)
			`case OpCapture:`
			`if re.Name != "" {`
			b.WriteString(`(?P<`)
			`b.WriteString(re.Name)`
			`b.WriteRune('>')`
			`} else {`
			`b.WriteRune('(')`
			`}`
			`if re.Sub[0].Op != OpEmptyMatch {`
			`writeRegexp(b, re.Sub[0])`
			`}`
			`b.WriteRune(')')`
			`case OpStar, OpPlus, OpQuest, OpRepeat:`
			`if sub := re.Sub[0]; sub.Op > OpCapture \|\| sub.Op == OpLiteral && len(sub.Rune) > 1 {`
			b.WriteString(`(?:`)
			`writeRegexp(b, sub)`
			b.WriteString(`)`)
			`} else {`
			`writeRegexp(b, sub)`
			`}`
			`switch re.Op {`
			`case OpStar:`
			`b.WriteRune('*')`
			`case OpPlus:`
			`b.WriteRune('+')`
			`case OpQuest:`
			`b.WriteRune('?')`
			`case OpRepeat:`
			`b.WriteRune('{')`
			`b.WriteString(strconv.Itoa(re.Min))`
			`if re.Max != re.Min {`
			`b.WriteRune(',')`
			`if re.Max >= 0 {`
			`b.WriteString(strconv.Itoa(re.Max))`
			`}`
			`}`
			`b.WriteRune('}')`
			`}`
			`if re.Flags&NonGreedy != 0 {`
			`b.WriteRune('?')`
			`}`
			`case OpConcat:`
			`for _, sub := range re.Sub {`
			`if sub.Op == OpAlternate {`
			b.WriteString(`(?:`)
			`writeRegexp(b, sub)`
			b.WriteString(`)`)
			`} else {`
			`writeRegexp(b, sub)`
			`}`
			`}`
			`case OpAlternate:`
			`for i, sub := range re.Sub {`
			`if i > 0 {`
			`b.WriteRune('\|')`
			`}`
			`writeRegexp(b, sub)`
			`}`
			`}`
			`}`

			`func (re *Regexp) String() string {`
			`var b bytes.Buffer`
			`writeRegexp(&b, re)`
			`return b.String()`
			`}`

			const meta = `\.+*?()\|[]{}^$`

			`func escape(b *bytes.Buffer, r rune, force bool) {`
			`if unicode.IsPrint(r) {`
			`if strings.IndexRune(meta, r) >= 0 \|\| force {`
			`b.WriteRune('\\')`
			`}`
			`b.WriteRune(r)`
			`return`
			`}`

			`switch r {`
			`case '\a':`
			b.WriteString(`\a`)
			`case '\f':`
			b.WriteString(`\f`)
			`case '\n':`
			b.WriteString(`\n`)
			`case '\r':`
			b.WriteString(`\r`)
			`case '\t':`
			b.WriteString(`\t`)
			`case '\v':`
			b.WriteString(`\v`)
			`default:`
			`if r < 0x100 {`
			b.WriteString(`\x`)
			`s := strconv.FormatInt(int64(r), 16)`
			`if len(s) == 1 {`
			`b.WriteRune('0')`
			`}`
			`b.WriteString(s)`
			`break`
			`}`
			b.WriteString(`\x{`)
			`b.WriteString(strconv.FormatInt(int64(r), 16))`
			b.WriteString(`}`)
			`}`
			`}`

			`// MaxCap walks the regexp to find the maximum capture index.`
			`func (re *Regexp) MaxCap() int {`
			`m := 0`
			`if re.Op == OpCapture {`
			`m = re.Cap`
			`}`
			`for _, sub := range re.Sub {`
			`if n := sub.MaxCap(); m < n {`
			`m = n`
			`}`
			`}`
			`return m`
			`}`

			`// CapNames walks the regexp to find the names of capturing groups.`
			`func (re *Regexp) CapNames() []string {`
			`names := make([]string, re.MaxCap()+1)`
			`re.capNames(names)`
			`return names`
			`}`

			`func (re *Regexp) capNames(names []string) {`
			`if re.Op == OpCapture {`
			`names[re.Cap] = re.Name`
			`}`
			`for _, sub := range re.Sub {`
			`sub.capNames(names)`
			`}`
			`}`