diff --git a/src/regexp/syntax/parse.go b/src/regexp/syntax/parse.go index a4ccfe3bdb..6b360b8700 100644 --- a/src/regexp/syntax/parse.go +++ b/src/regexp/syntax/parse.go @@ -1863,6 +1863,22 @@ func cleanClass(rp *[]rune) []rune { return r[:w] } +// inCharClass reports whether r is in the class. +// It assumes the class has been cleaned by cleanClass. +func inCharClass(r rune, class []rune) bool { + _, ok := sort.Find(len(class)/2, func(i int) int { + lo, hi := class[2*i], class[2*i+1] + if r > hi { + return +1 + } + if r < lo { + return -1 + } + return 0 + }) + return ok +} + // appendLiteral returns the result of appending the literal x to the class r. func appendLiteral(r []rune, x rune, flags Flags) []rune { if flags&FoldCase != 0 { diff --git a/src/regexp/syntax/parse_test.go b/src/regexp/syntax/parse_test.go index d7999046e0..0f885bd5c8 100644 --- a/src/regexp/syntax/parse_test.go +++ b/src/regexp/syntax/parse_test.go @@ -590,3 +590,39 @@ func TestToStringEquivalentParse(t *testing.T) { } } } + +var stringTests = []struct { + re string + out string +}{ + {`x(?i:ab*c|d?e)1`, `x(?i:AB*C|D?E)1`}, + {`x(?i:ab*cd?e)1`, `x(?i:AB*CD?E)1`}, + {`0(?i:ab*c|d?e)1`, `(?i:0(?:AB*C|D?E)1)`}, + {`0(?i:ab*cd?e)1`, `(?i:0AB*CD?E1)`}, + {`x(?i:ab*c|d?e)`, `x(?i:AB*C|D?E)`}, + {`x(?i:ab*cd?e)`, `x(?i:AB*CD?E)`}, + {`0(?i:ab*c|d?e)`, `(?i:0(?:AB*C|D?E))`}, + {`0(?i:ab*cd?e)`, `(?i:0AB*CD?E)`}, + {`(?i:ab*c|d?e)1`, `(?i:(?:AB*C|D?E)1)`}, + {`(?i:ab*cd?e)1`, `(?i:AB*CD?E1)`}, + {`(?i:ab)[123](?i:cd)`, `(?i:AB[1-3]CD)`}, + {`(?i:ab*c|d?e)`, `(?i:AB*C|D?E)`}, + {`[Aa][Bb]`, `(?i:AB)`}, + {`[Aa][Bb]*[Cc]`, `(?i:AB*C)`}, + {`A(?:[Bb][Cc]|[Dd])[Zz]`, `A(?i:(?:BC|D)Z)`}, + {`[Aa](?:[Bb][Cc]|[Dd])Z`, `(?i:A(?:BC|D))Z`}, +} + +func TestString(t *testing.T) { + for _, tt := range stringTests { + re, err := Parse(tt.re, Perl) + if err != nil { + t.Errorf("Parse(%#q): %v", tt.re, err) + continue + } + out := re.String() + if out != tt.out { + t.Errorf("Parse(%#q).String() = %#q, want %#q", tt.re, out, tt.out) + } + } +} diff --git a/src/regexp/syntax/regexp.go b/src/regexp/syntax/regexp.go index 3a4d2d201c..4fa7d0e2f8 100644 --- a/src/regexp/syntax/regexp.go +++ b/src/regexp/syntax/regexp.go @@ -112,8 +112,165 @@ func (x *Regexp) Equal(y *Regexp) bool { return true } +// printFlags is a bit set indicating which flags (including non-capturing parens) to print around a regexp. +type printFlags uint8 + +const ( + flagI printFlags = 1 << iota // (?i: + flagM // (?m: + flagS // (?s: + flagOff // ) + flagPrec // (?: ) + negShift = 5 // flagI<") @@ -122,15 +279,9 @@ func writeRegexp(b *strings.Builder, re *Regexp) { case OpEmptyMatch: b.WriteString(`(?:)`) case OpLiteral: - if re.Flags&FoldCase != 0 { - b.WriteString(`(?i:`) - } for _, r := range re.Rune { escape(b, r, false) } - if re.Flags&FoldCase != 0 { - b.WriteString(`)`) - } case OpCharClass: if len(re.Rune)%2 != 0 { b.WriteString(`[invalid char class]`) @@ -147,7 +298,9 @@ func writeRegexp(b *strings.Builder, re *Regexp) { lo, hi := re.Rune[i]+1, re.Rune[i+1]-1 escape(b, lo, lo == '-') if lo != hi { - b.WriteRune('-') + if hi != lo+1 { + b.WriteRune('-') + } escape(b, hi, hi == '-') } } @@ -156,25 +309,25 @@ func writeRegexp(b *strings.Builder, re *Regexp) { lo, hi := re.Rune[i], re.Rune[i+1] escape(b, lo, lo == '-') if lo != hi { - b.WriteRune('-') + if hi != lo+1 { + b.WriteRune('-') + } escape(b, hi, hi == '-') } } } b.WriteRune(']') - case OpAnyCharNotNL: - b.WriteString(`(?-s:.)`) - case OpAnyChar: - b.WriteString(`(?s:.)`) + case OpAnyCharNotNL, OpAnyChar: + b.WriteString(`.`) case OpBeginLine: - b.WriteString(`(?m:^)`) + b.WriteString(`^`) case OpEndLine: - b.WriteString(`(?m:$)`) + b.WriteString(`$`) case OpBeginText: b.WriteString(`\A`) case OpEndText: if re.Flags&WasDollar != 0 { - b.WriteString(`(?-m:$)`) + b.WriteString(`$`) } else { b.WriteString(`\z`) } @@ -191,17 +344,17 @@ func writeRegexp(b *strings.Builder, re *Regexp) { b.WriteRune('(') } if re.Sub[0].Op != OpEmptyMatch { - writeRegexp(b, re.Sub[0]) + writeRegexp(b, re.Sub[0], flags[re.Sub[0]], flags) } b.WriteRune(')') case OpStar, OpPlus, OpQuest, OpRepeat: - if sub := re.Sub[0]; sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 { - b.WriteString(`(?:`) - writeRegexp(b, sub) - b.WriteString(`)`) - } else { - writeRegexp(b, sub) + p := printFlags(0) + sub := re.Sub[0] + if sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 { + p = flagPrec } + writeRegexp(b, sub, p, flags) + switch re.Op { case OpStar: b.WriteRune('*') @@ -225,27 +378,31 @@ func writeRegexp(b *strings.Builder, re *Regexp) { } case OpConcat: for _, sub := range re.Sub { + p := printFlags(0) if sub.Op == OpAlternate { - b.WriteString(`(?:`) - writeRegexp(b, sub) - b.WriteString(`)`) - } else { - writeRegexp(b, sub) + p = flagPrec } + writeRegexp(b, sub, p, flags) } case OpAlternate: for i, sub := range re.Sub { if i > 0 { b.WriteRune('|') } - writeRegexp(b, sub) + writeRegexp(b, sub, 0, flags) } } } func (re *Regexp) String() string { var b strings.Builder - writeRegexp(&b, re) + var flags map[*Regexp]printFlags + must, cant := calcFlags(re, &flags) + must |= (cant &^ flagI) << negShift + if must != 0 { + must |= flagOff + } + writeRegexp(&b, re, must, flags) return b.String() } diff --git a/src/regexp/syntax/simplify_test.go b/src/regexp/syntax/simplify_test.go index 9877db3d0a..6d06f99c1b 100644 --- a/src/regexp/syntax/simplify_test.go +++ b/src/regexp/syntax/simplify_test.go @@ -13,7 +13,7 @@ var simplifyTests = []struct { // Already-simple constructs {`a`, `a`}, {`ab`, `ab`}, - {`a|b`, `[a-b]`}, + {`a|b`, `[ab]`}, {`ab|cd`, `ab|cd`}, {`(ab)*`, `(ab)*`}, {`(ab)+`, `(ab)+`}, @@ -40,16 +40,16 @@ var simplifyTests = []struct { // Perl character classes {`\d`, `[0-9]`}, - {`\s`, `[\t-\n\f-\r ]`}, + {`\s`, `[\t\n\f\r ]`}, {`\w`, `[0-9A-Z_a-z]`}, {`\D`, `[^0-9]`}, - {`\S`, `[^\t-\n\f-\r ]`}, + {`\S`, `[^\t\n\f\r ]`}, {`\W`, `[^0-9A-Z_a-z]`}, {`[\d]`, `[0-9]`}, - {`[\s]`, `[\t-\n\f-\r ]`}, + {`[\s]`, `[\t\n\f\r ]`}, {`[\w]`, `[0-9A-Z_a-z]`}, {`[\D]`, `[^0-9]`}, - {`[\S]`, `[^\t-\n\f-\r ]`}, + {`[\S]`, `[^\t\n\f\r ]`}, {`[\W]`, `[^0-9A-Z_a-z]`}, // Posix repetitions @@ -82,7 +82,8 @@ var simplifyTests = []struct { {`a{0}`, `(?:)`}, // Character class simplification - {`[ab]`, `[a-b]`}, + {`[ab]`, `[ab]`}, + {`[abc]`, `[a-c]`}, {`[a-za-za-z]`, `[a-z]`}, {`[A-Za-zA-Za-z]`, `[A-Za-z]`}, {`[ABCDEFGH]`, `[A-H]`}, @@ -120,7 +121,8 @@ var simplifyTests = []struct { // interesting than they might otherwise be. String inserts // explicit (?:) in place of non-parenthesized empty strings, // to make them easier to spot for other parsers. - {`(a|b|)`, `([a-b]|(?:))`}, + {`(a|b|c|)`, `([a-c]|(?:))`}, + {`(a|b|)`, `([ab]|(?:))`}, {`(|)`, `()`}, {`a()`, `a()`}, {`(()|())`, `(()|())`},