RegExpTree.java
/*
* Copyright 2011 The Closure Compiler Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.javascript.jscomp.regex;
import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkState;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Iterables;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
/**
* An AST for JavaScript regular expressions.
*
* @author mikesamuel@gmail.com (Mike Samuel)
*/
public abstract class RegExpTree {
/**
* Returns a simpler regular expression that is semantically the same assuming
* the given flags.
* @param flags Regular expression flags, e.g. {@code "igm"}.
*/
public abstract RegExpTree simplify(String flags);
/**
* True if the presence or absence of an {@code "i"} flag would change the
* meaning of this regular expression.
*/
public abstract boolean isCaseSensitive();
/**
* True if the regular expression contains an anchor : {@code ^} or {@code $}.
*/
public abstract boolean containsAnchor();
/**
* True if the regular expression contains capturing groups.
*/
public final boolean hasCapturingGroup() {
return numCapturingGroups() != 0;
}
/**
* The number of capturing groups.
*/
public abstract int numCapturingGroups();
/**
* The children of this node.
*/
public abstract List<? extends RegExpTree> children();
/**
* Appends this regular expression source to the given buffer.
*/
protected abstract void appendSourceCode(StringBuilder sb);
protected abstract void appendDebugInfo(StringBuilder sb);
@Override
public final String toString() {
StringBuilder sb = new StringBuilder();
sb.append('/');
appendSourceCode(sb);
// Don't emit a regular expression that looks like a line comment start.
if (sb.length() == 1) {
sb.append("(?:)");
}
sb.append('/');
return sb.toString();
}
private void appendDebugString(StringBuilder sb) {
sb.append('(').append(getClass().getSimpleName());
int len = sb.length();
sb.append(' ');
appendDebugInfo(sb);
if (sb.length() == len + 1) { sb.setLength(len); }
for (RegExpTree child : children()) {
sb.append(' ');
child.appendDebugString(sb);
}
sb.append(')');
}
@Override
public abstract boolean equals(Object o);
@Override
public abstract int hashCode();
/**
* Parses a regular expression to an AST.
*
* @param pattern The {@code foo} From {@code /foo/i}.
* @param flags The {@code i} From {@code /foo/i}.
*/
public static RegExpTree parseRegExp(
final String pattern, final String flags) {
/** A recursive descent parser that closes over pattern and flags above. */
class Parser {
/** The number of characters in pattern consumed. */
int pos;
/** The number of capturing groups seen so far. */
int numCapturingGroups = 0;
/** The length of pattern. */
final int limit = pattern.length();
RegExpTree parseTopLevel() {
this.pos = 0;
RegExpTree out = parse();
if (pos < limit) { // Unmatched closed paren maybe.
throw new IllegalArgumentException(pattern.substring(pos));
}
return out;
}
RegExpTree parse() {
// Collects ["foo", "bar", "baz"] for /foo|bar|baz/.
ImmutableList.Builder<RegExpTree> alternatives = null;
// The last item parsed within an alternation.
RegExpTree preceder = null;
topLoop:
while (pos < limit) {
char ch = pattern.charAt(pos);
RegExpTree atom;
switch (ch) {
case '[':
atom = parseCharset();
break;
case '(':
atom = parseParenthetical();
break;
case ')':
break topLoop;
case '\\':
atom = parseEscape();
break;
case '^':
case '$':
atom = new Anchor(ch);
++pos;
break;
case '.':
// We represent . as a character set to make it easy to simplify
// things like /.|[\r\n]/.
atom = DOT_CHARSET;
++pos;
break;
case '|':
// An alternative may be empty as in /foo||bar/.
// The '|' is consumed below.
atom = Empty.INSTANCE;
break;
default:
// Find a run of concatenated characters to avoid building a
// tree node per literal character.
int start = pos;
int end = pos + 1;
charsLoop:
while (end < limit) {
switch (pattern.charAt(end)) {
case '[':
case '(':
case ')':
case '\\':
case '^':
case '$':
case '|':
case '.':
case '*':
case '+':
case '?':
case '{':
break charsLoop;
default:
// Repetition binds more tightly than concatenation.
// Only consume up to "foo" in /foob*/ so that the suffix
// operator parser below has the right precedence.
if (end + 1 >= limit
|| !isRepetitionStart(pattern.charAt(end + 1))) {
++end;
} else {
break charsLoop;
}
}
}
atom = new Text(pattern.substring(start, end));
pos = end;
break;
}
if (pos < limit && isRepetitionStart(pattern.charAt(pos))) {
atom = parseRepetition(atom);
}
if (preceder == null) {
preceder = atom;
} else {
preceder = new Concatenation(preceder, atom);
}
// If this is an alternative in a alternation, then add it to the
// list of complete alternatives, and reset the parser state for the
// next alternative.
if (pos < limit && pattern.charAt(pos) == '|') {
if (alternatives == null) {
alternatives = ImmutableList.builder();
}
alternatives.add(preceder);
preceder = null;
++pos;
}
}
// An alternative may have no parsed content blank as in /foo|/.
if (preceder == null) { preceder = Empty.INSTANCE; }
if (alternatives != null) {
alternatives.add(preceder);
return new Alternation(alternatives.build());
} else {
return preceder;
}
}
/**
* Handles capturing groups {@code (...)},
* non-capturing groups {@code (?:...)}, and lookahead assertions
* {@code (?=...)}.
*/
private RegExpTree parseParenthetical() {
checkState(pattern.charAt(pos) == '(');
int start = pos;
++pos;
boolean capturing = true;
int type = 0;
if (pos < limit && pattern.charAt(pos) == '?') {
if (pos + 1 < limit) {
capturing = false;
char ch = pattern.charAt(pos + 1);
switch (ch) {
case ':': // A (?:...) style non-capturing group.
pos += 2;
break;
case '!': // A lookahead assertion
case '=':
pos += 2;
type = ch;
break;
default:
throw new IllegalArgumentException(
"Malformed parenthetical: " + pattern.substring(start));
}
}
}
RegExpTree body = parse();
if (pos < limit && pattern.charAt(pos) == ')') {
++pos;
} else {
throw new IllegalArgumentException(
"Unclosed parenthetical group: " + pattern.substring(start));
}
if (capturing) {
++numCapturingGroups;
return new CapturingGroup(body);
} else if (type != 0) {
return new LookaheadAssertion(body, type == '=');
} else {
return body;
}
}
/**
* Parses a square bracketed character set.
* Standalone character groups (@code /\d/} are handled by
* {@link #parseEscape}.
*/
private RegExpTree parseCharset() {
checkState(pattern.charAt(pos) == '[');
++pos;
boolean isCaseInsensitive = flags.indexOf('i') >= 0;
boolean inverse = pos < limit && pattern.charAt(pos) == '^';
if (inverse) { ++pos; }
CharRanges ranges = CharRanges.EMPTY;
CharRanges ieExplicits = CharRanges.EMPTY;
while (pos < limit && pattern.charAt(pos) != ']') {
char ch = pattern.charAt(pos);
char start;
if (ch == '\\') {
++pos;
char possibleGroupName = pattern.charAt(pos);
CharRanges group = NAMED_CHAR_GROUPS.get(possibleGroupName);
if (group != null) {
++pos;
ranges = ranges.union(group);
continue;
}
start = parseEscapeChar();
} else {
start = ch;
++pos;
}
char end = start;
if (pos + 1 < limit && pattern.charAt(pos) == '-'
&& pattern.charAt(pos + 1) != ']') {
++pos;
ch = pattern.charAt(pos);
if (ch == '\\') {
++pos;
end = parseEscapeChar();
} else {
end = ch;
++pos;
}
}
CharRanges range = CharRanges.inclusive(start, end);
ranges = ranges.union(range);
if (IE_SPEC_ERRORS.contains(start) && IE_SPEC_ERRORS.contains(end)) {
ieExplicits = ieExplicits.union(range.intersection(IE_SPEC_ERRORS));
}
if (isCaseInsensitive) {
// If the flags contain the 'i' flag, then it is not correct to
// say that [^a-z] contains the letter 'A', or that [a-z] does not
// contain the letter 'A'.
// We expand out letter groups here so that parse returns something
// that is valid independent of flags.
// Calls to simplify(flags) may later reintroduce flag assumptions.
// but without this step, later steps might conflate
// /[a-z]/i
// and
// /[^\0-`{-\uffff]/i
// which matches nothing because the information about whether the
// ^ is present has been lost during optimizations and charset
// unionizing as in /[...]|[^...]/.
ranges = CaseCanonicalize.expandToAllMatched(ranges);
}
}
++pos; // Consume ']'
if (inverse) {
ranges = CharRanges.ALL_CODE_UNITS.difference(ranges);
}
return new Charset(ranges, ieExplicits);
}
/**
* Parses an escape to a code point.
* Some of the characters parsed here have special meanings in various
* contexts, so contexts must filter those instead.
* E.g. '\b' means a different thing inside a charset than without.
*/
private char parseEscapeChar() {
char ch = pattern.charAt(pos++);
switch (ch) {
case 'b': return '\b';
case 'f': return '\f';
case 'n': return '\n';
case 'r': return '\r';
case 't': return '\t';
case 'u': return parseHex(4);
case 'v': return '\u000b';
case 'x': return parseHex(2);
default:
if ('0' <= ch && ch <= '7') {
char codeUnit = (char) (ch - '0');
// Allow octal literals in the range \0-\377.
// \41 might be a group, but \041 is not a group.
// We read, but do not emit octal literals since they
// are deprecated in ES5.
int octLimit = Math.min(
limit, pos + (ch <= '3' ? 2 : 1) + (ch == '0' ? 1 : 0));
while (pos < octLimit) {
ch = pattern.charAt(pos);
if ('0' <= ch && ch <= '7') {
codeUnit = (char) ((codeUnit << 3) + (ch - '0'));
++pos;
} else {
break;
}
}
return codeUnit;
}
return ch;
}
}
/**
* Parses an escape that appears outside a charset.
*/
private RegExpTree parseEscape() {
checkState(pattern.charAt(pos) == '\\');
++pos;
char ch = pattern.charAt(pos);
if (ch == 'b' || ch == 'B') {
++pos;
return new WordBoundary(ch);
} else if ('1' <= ch && ch <= '9') {
++pos;
int possibleGroupIndex = ch - '0';
if (numCapturingGroups >= possibleGroupIndex) {
if (pos < limit) {
char next = pattern.charAt(pos);
if ('0' <= next && next <= '9') {
int twoDigitGroupIndex = possibleGroupIndex * 10 + (next - '0');
if (numCapturingGroups >= twoDigitGroupIndex) {
++pos;
possibleGroupIndex = twoDigitGroupIndex;
}
}
}
return new BackReference(possibleGroupIndex);
} else {
// \1 - \7 are octal escapes if there is no such group.
// \8 and \9 are the literal characters '8' and '9' if there
// is no such group.
return new Text(Character.toString(
possibleGroupIndex <= 7 ? (char) possibleGroupIndex : ch));
}
} else {
CharRanges charGroup = NAMED_CHAR_GROUPS.get(ch);
if (charGroup != null) { // Handle \d, etc.
++pos;
return new Charset(charGroup, CharRanges.EMPTY);
}
return new Text("" + parseEscapeChar());
}
}
/**
* Parses n hex digits to a code-unit.
*/
private char parseHex(int n) {
if (pos + n > limit) {
throw new IllegalArgumentException(
"Abbreviated hex escape " + pattern.substring(pos));
}
int result = 0;
while (--n >= 0) {
char ch = pattern.charAt(pos);
int digit;
if ('0' <= ch && ch <= '9') {
digit = ch - '0';
} else if ('a' <= ch && ch <= 'f') {
digit = ch + (10 - 'a');
} else if ('A' <= ch && ch <= 'F') {
digit = ch + (10 - 'A');
} else {
throw new IllegalArgumentException(pattern.substring(pos));
}
++pos;
result = (result << 4) | digit;
}
return (char) result;
}
private boolean isRepetitionStart(char ch) {
switch (ch) {
case '?':
case '*':
case '+':
case '{':
return true;
default:
return false;
}
}
/**
* Parse a repetition. {@code x?} is treated as a repetition --
* an optional production can be matched 0 or 1 time.
*/
private RegExpTree parseRepetition(RegExpTree body) {
if (pos == limit) { return body; }
int min, max;
switch (pattern.charAt(pos)) {
case '+':
++pos;
min = 1;
max = Integer.MAX_VALUE;
break;
case '*':
++pos;
min = 0;
max = Integer.MAX_VALUE;
break;
case '?':
++pos;
min = 0;
max = 1;
break;
case '{':
++pos;
int start = pos;
int end = pattern.indexOf('}', start);
if (end < 0) {
pos = start - 1;
return body;
}
String counts = pattern.substring(start, end);
pos = end + 1;
int comma = counts.indexOf(',');
try {
min = Integer.parseInt(
comma >= 0 ? counts.substring(0, comma) : counts);
max = comma >= 0
? comma + 1 != counts.length()
? Integer.parseInt(counts.substring(comma + 1))
: Integer.MAX_VALUE
: min;
} catch (NumberFormatException ex) {
min = max = -1;
}
if (min < 0 || min > max) {
// Treat the open curly bracket literally.
pos = start - 1;
return body;
}
break;
default:
return body;
}
boolean greedy = true;
if (pos < limit && pattern.charAt(pos) == '?') {
greedy = false;
++pos;
}
return new Repetition(body, min, max, greedy);
}
}
return new Parser().parseTopLevel();
}
/**
* True if, but not necessarily always when the, given regular expression
* must match the whole input or none of it.
*/
public static boolean matchesWholeInput(RegExpTree t, String flags) {
if (flags.indexOf('m') >= 0) { return false; }
if (!(t instanceof Concatenation)) {
return false;
}
Concatenation c = (Concatenation) t;
if (c.elements.isEmpty()) { return false; }
RegExpTree first = c.elements.get(0), last = Iterables.getLast(c.elements);
if (!(first instanceof Anchor && last instanceof Anchor)) { return false; }
return ((Anchor) first).type == '^' && ((Anchor) last).type == '$';
}
static abstract class RegExpTreeAtom extends RegExpTree {
@Override
public boolean isCaseSensitive() {
return false;
}
@Override
public boolean containsAnchor() {
return false;
}
@Override
public final int numCapturingGroups() {
return 0;
}
@Override
public final ImmutableList<? extends RegExpTree> children() {
return ImmutableList.of();
}
}
static final class Empty extends RegExpTreeAtom {
static final Empty INSTANCE = new Empty();
@Override
public RegExpTree simplify(String flags) {
return this;
}
@Override
protected void appendSourceCode(StringBuilder sb) {
// No output
}
@Override
protected void appendDebugInfo(StringBuilder sb) {
// No output
}
@Override
public boolean equals(Object o) {
return o instanceof Empty;
}
@Override
public int hashCode() {
return 0x7ee06141;
}
}
static final class Anchor extends RegExpTreeAtom {
final char type;
Anchor(char type) { this.type = type; }
@Override
public RegExpTree simplify(String flags) {
return this;
}
@Override
public boolean containsAnchor() {
return true;
}
@Override
protected void appendSourceCode(StringBuilder sb) {
sb.append(type);
}
@Override
protected void appendDebugInfo(StringBuilder sb) {
sb.append(type);
}
@Override
public boolean equals(Object o) {
return o instanceof Anchor && type == ((Anchor) o).type;
}
@Override
public int hashCode() {
return type ^ 0xe85317ff;
}
}
static final class WordBoundary extends RegExpTreeAtom {
final char type;
WordBoundary(char type) {
this.type = type;
}
@Override
public RegExpTree simplify(String flags) {
return this;
}
@Override
protected void appendSourceCode(StringBuilder sb) {
sb.append('\\').append(type);
}
@Override
protected void appendDebugInfo(StringBuilder sb) {
sb.append(type);
}
@Override
public boolean equals(Object o) {
return o instanceof WordBoundary && type == ((WordBoundary) o).type;
}
@Override
public int hashCode() {
return 0x5673aa29 ^ type;
}
}
static final class BackReference extends RegExpTreeAtom {
final int groupIndex;
BackReference(int groupIndex) {
checkArgument(groupIndex >= 0 && groupIndex <= 99);
this.groupIndex = groupIndex;
}
@Override
public RegExpTree simplify(String flags) {
return this;
}
@Override
protected void appendSourceCode(StringBuilder sb) {
sb.append('\\').append(groupIndex);
}
@Override
protected void appendDebugInfo(StringBuilder sb) {
sb.append(groupIndex);
}
@Override
public boolean equals(Object o) {
return o instanceof BackReference
&& groupIndex == ((BackReference) o).groupIndex;
}
@Override
public int hashCode() {
return 0xff072663 ^ groupIndex;
}
}
static final class Text extends RegExpTreeAtom {
final String text;
Text(String text) {
this.text = text;
}
/**
* @param ch The code-unit to escape.
* @param next The next code-unit or -1 if indeterminable.
*/
private static void escapeRegularCharOnto(
char ch, int next, StringBuilder sb) {
switch (ch) {
case '$':
case '^':
case '*':
case '(':
case ')':
case '+':
case '[':
case '|':
case '.':
case '/':
case '?':
sb.append('\\').append(ch);
break;
case '{':
// If possibly part of a repetition, then escape.
// Concatenation is handled by the digitsMightBleed check.
if ('0' <= next && next <= '9') {
sb.append('\\');
}
sb.append(ch);
break;
default:
escapeCharOnto(ch, sb);
}
}
@Override
public RegExpTree simplify(String flags) {
int n = text.length();
if (n == 0) {
return Empty.INSTANCE;
}
if (flags.indexOf('i') >= 0) {
String canonicalized = CaseCanonicalize.caseCanonicalize(text);
if (!text.equals(canonicalized)) {
return new Text(canonicalized);
}
}
return this;
}
@Override
public boolean isCaseSensitive() {
for (int i = 0, n = text.length(); i < n; ++i) {
if (CaseCanonicalize.CASE_SENSITIVE.contains(text.charAt(i))) {
return true;
}
}
return false;
}
@Override
protected void appendSourceCode(StringBuilder sb) {
for (int i = 0, n = text.length(); i < n; ++i) {
escapeRegularCharOnto(text.charAt(i), i + 1 < n ? text.charAt(i + 1) : -1, sb);
}
}
@Override
protected void appendDebugInfo(StringBuilder sb) {
sb.append('`').append(text).append('`');
}
@Override
public boolean equals(Object o) {
return o instanceof Text && text.equals(((Text) o).text);
}
@Override
public int hashCode() {
return text.hashCode() ^ 0x617e310;
}
}
static final class Repetition extends RegExpTree {
final RegExpTree body;
final int min, max;
final boolean greedy;
Repetition(RegExpTree body, int min, int max, boolean greedy) {
this.body = body;
this.min = min;
this.max = max;
this.greedy = greedy;
}
@Override
public RegExpTree simplify(String flags) {
RegExpTree body = this.body.simplify(flags);
if (max == 0 && !body.hasCapturingGroup()) { return Empty.INSTANCE; }
if (body instanceof Empty || NEVER_MATCHES.equals(body)) { return body; }
int min = this.min;
int max = this.max;
if (body instanceof Repetition) {
Repetition rbody = (Repetition) body;
if (rbody.greedy == greedy) {
long lmin = ((long) min) * rbody.min;
long lmax = ((long) max) * rbody.max;
if (lmin < Integer.MAX_VALUE) {
body = rbody.body;
min = (int) lmin;
max = lmax >= Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) lmax;
}
}
}
if (min == 1 && max == 1) { return body; }
boolean greedy = this.greedy || min == max;
return body.equals(this.body) && min == this.min && max == this.max
&& greedy == this.greedy
? this
: new Repetition(body, min, max, greedy).simplify(flags);
}
@Override
public boolean isCaseSensitive() {
return body.isCaseSensitive();
}
@Override
public boolean containsAnchor() {
return body.containsAnchor();
}
@Override
public int numCapturingGroups() {
return body.numCapturingGroups();
}
@Override
public ImmutableList<? extends RegExpTree> children() {
return ImmutableList.of(body);
}
private void appendBodySourceCode(StringBuilder sb) {
if (body instanceof Alternation || body instanceof Concatenation
|| body instanceof Repetition
|| (body instanceof Text && ((Text) body).text.length() > 1)) {
sb.append("(?:");
body.appendSourceCode(sb);
sb.append(')');
} else {
body.appendSourceCode(sb);
}
}
private static int suffixLen(int min, int max) {
// This mirrors the branches that renders a suffix in appendSourceCode below.
if (max == Integer.MAX_VALUE) {
switch (min) {
case 0: // *
case 1: // +
return 1;
default:
return 3 + numDecimalDigits(min); // {3,}
}
}
if (min == 0 && max == 1) {
return 1; // ?
}
if (min == max) {
if (min == 1) {
return 0; // No suffix needed for {1}.
}
return 2 + numDecimalDigits(min); // {4}
}
return 3 + numDecimalDigits(min) + numDecimalDigits(max); // {2,7}
}
private static int numDecimalDigits(int n) {
if (n < 0) {
// Negative values should not be passed in.
throw new AssertionError();
// If changing this code to support negative values,
// Integer.MIN_VALUE is a corner-case..
}
int nDigits = 1;
while (n >= 10) {
++nDigits;
n /= 10;
}
return nDigits;
}
@Override
protected void appendSourceCode(StringBuilder sb) {
int bodyStart = sb.length();
appendBodySourceCode(sb);
int bodyEnd = sb.length();
int bodyLen = bodyEnd - bodyStart;
int min = this.min;
int max = this.max;
if (min >= 2 && max == Integer.MAX_VALUE || max - min <= 1) {
int expanded =
// If min == max then we want to try expanding to the limit and
// attach the empty suffix, which is equivalent to min = max = 1,
// i.e. /a/ vs /a{1}/.
min == max
// Give aa+ preference over aaa*.
|| max == Integer.MAX_VALUE
? min - 1
: min;
int expandedMin = min - expanded;
int expandedMax = max == Integer.MAX_VALUE ? max : max - expanded;
int suffixLen = suffixLen(min, max);
int expandedSuffixLen = suffixLen(expandedMin, expandedMax);
if (bodyLen * expanded + expandedSuffixLen < suffixLen
&& !body.hasCapturingGroup()) {
// a{2} -> aa
// a{2,} -> aa+
// a{2,3} -> aaa?
while (--expanded >= 0) {
sb.append(sb, bodyStart, bodyEnd);
}
min = expandedMin;
max = expandedMax;
}
}
if (max == Integer.MAX_VALUE) {
switch (min) {
case 0: sb.append('*'); break;
case 1: sb.append('+'); break;
default:
sb.append('{').append(min).append(",}");
}
} else if (min == 0 && max == 1) {
sb.append('?');
} else if (min == max) {
if (min != 1) {
sb.append('{').append(min).append('}');
}
} else {
sb.append('{').append(min).append(',').append(max).append('}');
}
if (!greedy) {
sb.append('?');
}
}
@Override
protected void appendDebugInfo(StringBuilder sb) {
sb.append(" min=").append(min).append(", max=").append(max);
if (!greedy) { sb.append(" not_greedy"); }
}
@Override
public boolean equals(Object o) {
if (!(o instanceof Repetition)) { return false; }
Repetition that = (Repetition) o;
return this.body.equals(that.body)
&& this.min == that.min
&& this.max == that.max
&& this.greedy == that.greedy;
}
@Override
public int hashCode() {
return min + 31 * (max + 31 * ((greedy ? 1 : 0) + 31 * body.hashCode()));
}
}
static final class Alternation extends RegExpTree {
final ImmutableList<RegExpTree> alternatives;
Alternation(List<? extends RegExpTree> alternatives) {
this.alternatives = ImmutableList.copyOf(alternatives);
}
@Override
public RegExpTree simplify(String flags) {
List<RegExpTree> alternatives = new ArrayList<>();
for (RegExpTree alternative : this.alternatives) {
alternative = alternative.simplify(flags);
if (alternative instanceof Alternation) {
alternatives.addAll(((Alternation) alternative).alternatives);
} else {
alternatives.add(alternative);
}
}
// Remove duplicates
RegExpTree last = null;
for (Iterator<RegExpTree> it = alternatives.iterator(); it.hasNext();) {
RegExpTree alternative = it.next();
if (alternative.equals(NEVER_MATCHES)) { continue; }
if (alternative.equals(last) && !alternative.hasCapturingGroup()) {
it.remove();
} else {
last = alternative;
}
}
// Collapse character alternatives into character sets.
for (int i = 0, n = alternatives.size(); i < n; ++i) {
RegExpTree alternative = alternatives.get(i);
if ((alternative instanceof Text
&& ((Text) alternative).text.length() == 1)
|| alternative instanceof Charset) {
int end = i;
int nCharsets = 0;
while (end < n) {
RegExpTree follower = alternatives.get(end);
if (follower instanceof Charset) {
++nCharsets;
} else if (!(follower instanceof Text
&& ((Text) follower).text.length() == 1)) {
break;
}
++end;
}
if (end - i >= 3 || (nCharsets != 0 && end - i >= 2)) {
int[] members = new int[end - i - nCharsets];
int memberIdx = 0;
CharRanges chars = CharRanges.EMPTY;
CharRanges ieExplicits = CharRanges.EMPTY;
List<RegExpTree> charAlternatives = alternatives.subList(i, end);
for (RegExpTree charAlternative : charAlternatives) {
if (charAlternative instanceof Text) {
char ch = ((Text) charAlternative).text.charAt(0);
members[memberIdx++] = ch;
if (IE_SPEC_ERRORS.contains(ch)) {
ieExplicits = ieExplicits.union(CharRanges.inclusive(ch, ch));
}
} else if (charAlternative instanceof Charset) {
Charset cs = (Charset) charAlternative;
chars = chars.union(cs.ranges);
ieExplicits = ieExplicits.union(cs.ieExplicits);
}
}
chars = chars.union(CharRanges.withMembers(members));
charAlternatives.clear();
charAlternatives.add(
new Charset(chars, ieExplicits).simplify(flags));
n = alternatives.size();
}
}
}
switch (alternatives.size()) {
case 0: return Empty.INSTANCE;
case 1: return alternatives.get(0);
case 2:
if (alternatives.get(1) instanceof Empty) { // (?:a|) -> a?
return new Repetition(alternatives.get(0), 0, 1, true);
} else if (alternatives.get(0) instanceof Empty) {
return new Repetition(alternatives.get(1), 0, 1, false);
}
break;
}
// TODO: maybe pull out common prefix or suffix
return alternatives.equals(this.alternatives)
? this : new Alternation(alternatives);
}
@Override
public boolean isCaseSensitive() {
for (RegExpTree alternative : alternatives) {
if (alternative.isCaseSensitive()) { return true; }
}
return false;
}
@Override
public boolean containsAnchor() {
for (RegExpTree alternative : alternatives) {
if (alternative.containsAnchor()) { return true; }
}
return false;
}
@Override
public int numCapturingGroups() {
int n = 0;
for (RegExpTree alternative : alternatives) {
n += alternative.numCapturingGroups();
}
return n;
}
@Override
public ImmutableList<? extends RegExpTree> children() {
return alternatives;
}
@Override
protected void appendSourceCode(StringBuilder sb) {
for (int i = 0, n = alternatives.size(); i < n; ++i) {
if (i != 0) {
sb.append('|');
}
alternatives.get(i).appendSourceCode(sb);
}
}
@Override
protected void appendDebugInfo(StringBuilder sb) {
// Nothing besides children.
}
@Override
public boolean equals(Object o) {
return this == o || (
(o instanceof Alternation)
&& alternatives.equals(((Alternation) o).alternatives));
}
@Override
public int hashCode() {
return 0x51b57cd1 ^ alternatives.hashCode();
}
}
private static final RegExpTree NEVER_MATCHES = new LookaheadAssertion(
Empty.INSTANCE, false);
static final class LookaheadAssertion extends RegExpTree {
final RegExpTree body;
final boolean positive;
LookaheadAssertion(RegExpTree body, boolean positive) {
this.body = body;
this.positive = positive;
}
@Override
public RegExpTree simplify(String flags) {
RegExpTree simpleBody = body.simplify(flags);
if (simpleBody instanceof Empty) {
if (positive) { // Always true
return simpleBody;
}
}
return new LookaheadAssertion(simpleBody, positive);
}
@Override
public boolean isCaseSensitive() {
return body.isCaseSensitive();
}
@Override
public boolean containsAnchor() {
return body.containsAnchor();
}
@Override
public int numCapturingGroups() {
return body.numCapturingGroups();
}
@Override
public ImmutableList<? extends RegExpTree> children() {
return ImmutableList.of(body);
}
@Override
protected void appendSourceCode(StringBuilder sb) {
sb.append(positive ? "(?=" : "(?!");
body.appendSourceCode(sb);
sb.append(')');
}
@Override
protected void appendDebugInfo(StringBuilder sb) {
sb.append(positive ? "positive" : "negative");
}
@Override
public boolean equals(Object o) {
if (!(o instanceof LookaheadAssertion)) { return false; }
LookaheadAssertion that = (LookaheadAssertion) o;
return this.positive == that.positive && this.body.equals(that.body);
}
@Override
public int hashCode() {
return 0x723aba9 ^ body.hashCode();
}
}
static final class CapturingGroup extends RegExpTree {
final RegExpTree body;
CapturingGroup(RegExpTree body) {
this.body = body;
}
@Override
public RegExpTree simplify(String flags) {
return new CapturingGroup(body.simplify(flags));
}
@Override
public boolean isCaseSensitive() {
return body.isCaseSensitive();
}
@Override
public boolean containsAnchor() {
return body.containsAnchor();
}
@Override
public int numCapturingGroups() {
return 1;
}
@Override
public ImmutableList<? extends RegExpTree> children() {
return ImmutableList.of(body);
}
@Override
protected void appendSourceCode(StringBuilder sb) {
sb.append('(');
body.appendSourceCode(sb);
sb.append(')');
}
@Override
protected void appendDebugInfo(StringBuilder sb) {
// Nothing besides children.
}
@Override
public boolean equals(Object o) {
return o instanceof CapturingGroup
&& body.equals(((CapturingGroup) o).body);
}
@Override
public int hashCode() {
return 0x55781738 ^ body.hashCode();
}
}
private static final CharRanges DIGITS = CharRanges.inclusive('0', '9');
private static final CharRanges UCASE_LETTERS
= CharRanges.inclusive('A', 'Z');
private static final CharRanges LCASE_LETTERS
= CharRanges.inclusive('a', 'z');
private static final CharRanges LETTERS = UCASE_LETTERS.union(LCASE_LETTERS);
private static final CharRanges WORD_CHARS = DIGITS
.union(LETTERS).union(CharRanges.withMembers('_'));
private static final CharRanges INVERSE_WORD_CHARS
= CharRanges.ALL_CODE_UNITS.difference(WORD_CHARS);
private static final CharRanges SPACE_CHARS = CharRanges.withMembers(
'\t', '\n', '\u000b', '\u000c', '\r', ' ', '\u00a0',
// Unicode 3.0 Zs
'\u1680', '\u180e', '\u2000', '\u2001',
'\u2002', '\u2003', '\u2004', '\u2005',
'\u2006', '\u2007', '\u2008', '\u2009',
'\u200a',
// Line terminator chars
'\u2028', '\u2029',
// Unicode 3.0 Zs
'\u202f', '\u205f', '\u3000',
// Byte order marker is a space character in ES5 but not ES3.
'\ufeff'
);
/** IE is broken around \s. IE (6, 7, 8 at least), only recognize these. */
private static final CharRanges IE_SPACE_CHARS = CharRanges.withMembers(
'\t', '\n', '\u000b', '\u000c', '\r', ' '
);
/** IE is broken around \s. IE (6, 7, 8 at least), only recognize these. */
private static final CharRanges IE_SPEC_ERRORS = SPACE_CHARS.difference(
IE_SPACE_CHARS);
private static final ImmutableMap<Character, CharRanges> NAMED_CHAR_GROUPS
= ImmutableMap.<Character, CharRanges>builder()
.put('d', DIGITS)
.put('D', CharRanges.ALL_CODE_UNITS.difference(DIGITS))
.put('s', SPACE_CHARS)
.put('S', CharRanges.ALL_CODE_UNITS.difference(SPACE_CHARS))
.put('w', WORD_CHARS)
.put('W', INVERSE_WORD_CHARS)
.build();
private static final Charset DOT_CHARSET = new Charset(
CharRanges.ALL_CODE_UNITS.difference(
CharRanges.withMembers('\n', '\r', '\u2028', '\u2029')),
CharRanges.EMPTY);
static final class Charset extends RegExpTreeAtom {
final CharRanges ranges;
/**
* Code units that were mentioned explicitly and that might be matched by
* a group according to ECMAScript 5 but would not because of specification
* violations in IE.
*/
final CharRanges ieExplicits;
Charset(CharRanges ranges, CharRanges ieExplicits) {
this.ranges = ranges;
this.ieExplicits = ieExplicits;
}
private static int complexityWordFolded(CharRanges ranges) {
return Math.min(
complexityWordFoldedHelper(ranges),
1 + complexityWordFoldedHelper(
CharRanges.ALL_CODE_UNITS.difference(ranges)));
}
private static int complexityWordFoldedHelper(CharRanges ranges) {
int complexity = DecomposedCharset.complexity(ranges);
if (ranges.containsAll(WORD_CHARS)) {
complexity = Math.min(
complexity,
1 + DecomposedCharset.complexity(ranges.difference(WORD_CHARS)));
}
if (ranges.containsAll(INVERSE_WORD_CHARS)) {
complexity = Math.min(
complexity,
1 + DecomposedCharset.complexity(
ranges.difference(INVERSE_WORD_CHARS)));
}
return complexity;
}
@Override
public RegExpTree simplify(String flags) {
if (ranges.isEmpty()) {
return NEVER_MATCHES;
}
CharRanges best = ranges;
if (flags.indexOf('i') >= 0) {
Set<CharRanges> options = new LinkedHashSet<>();
options.add(CaseCanonicalize.expandToAllMatched(ranges));
options.add(CaseCanonicalize.reduceToMinimum(ranges));
CharRanges lcaseLetters = ranges.intersection(LCASE_LETTERS);
CharRanges ucaseLetters = ranges.intersection(UCASE_LETTERS);
CharRanges lcaseLettersToUpper = lcaseLetters.shift(-32);
CharRanges ucaseLettersToLower = ucaseLetters.shift(32);
options.add(ranges.union(ucaseLettersToLower));
options.add(ranges.union(lcaseLettersToUpper));
options.add(ranges.union(lcaseLettersToUpper)
.union(ucaseLettersToLower));
options.add(ranges.union(ucaseLettersToLower).difference(ucaseLetters));
options.add(ranges.union(lcaseLettersToUpper).difference(lcaseLetters));
int bestComplexity = complexityWordFolded(ranges);
for (CharRanges option : options) {
int complexity = complexityWordFolded(option);
if (complexity < bestComplexity) {
bestComplexity = complexity;
best = option;
}
}
}
if (best.getNumRanges() == 1
&& best.end(0) - best.start(0) == 1) {
return new Text(Character.toString((char) best.start(0)));
}
if (!best.equals(ranges)) {
return new Charset(best, ieExplicits);
}
return this;
}
@Override
public boolean isCaseSensitive() {
// We could test
// !ranges.equals(CaseCanonicalize.expandToAllMatched(ranges))
// but we get better optimizations by leaving the 'i' flag on in most
// cases.
// Check whether skipping all the character groups that are known
// case-insensitive leaves us with something that matches the above
// definition.
CharRanges withoutNamedGroups = decompose().ranges;
return !withoutNamedGroups.equals(
CaseCanonicalize.expandToAllMatched(withoutNamedGroups));
}
private DecomposedCharset decompose(CharRanges ranges, boolean inverted) {
StringBuilder namedGroups = new StringBuilder();
CharRanges rangesInterIeExplicits = ranges.intersection(ieExplicits);
while (true) {
char groupName = 0;
CharRanges simplest = null;
int minComplexity = DecomposedCharset.complexity(ranges);
for (Map.Entry<Character, CharRanges> namedGroup
: NAMED_CHAR_GROUPS.entrySet()) {
CharRanges group = namedGroup.getValue();
if (ranges.containsAll(group)) {
CharRanges withoutGroup = ranges.difference(group).union(
rangesInterIeExplicits);
int complexity = DecomposedCharset.complexity(withoutGroup);
if (complexity < minComplexity) {
simplest = withoutGroup;
groupName = namedGroup.getKey().charValue();
minComplexity = complexity;
}
}
}
if (simplest != null) {
namedGroups.append('\\').append(groupName);
ranges = simplest;
} else {
break;
}
}
return new DecomposedCharset(inverted, ranges, namedGroups.toString());
}
DecomposedCharset decompose() {
CharRanges negRanges = CharRanges.ALL_CODE_UNITS.difference(ranges);
if (!ieExplicits.isEmpty()) {
if (negRanges.intersection(ieExplicits).isEmpty()) {
return decompose(ranges, false);
} else if (ranges.intersection(ieExplicits).isEmpty()) {
return decompose(negRanges, true);
}
}
DecomposedCharset positive = decompose(ranges, false);
DecomposedCharset negative = decompose(negRanges, true);
return positive.complexity() <= negative.complexity() ? positive : negative;
}
@Override
protected void appendSourceCode(StringBuilder sb) {
if (DOT_CHARSET.ranges.equals(ranges)) {
sb.append('.');
return;
}
decompose().appendSourceCode(sb);
}
@Override
protected void appendDebugInfo(StringBuilder sb) {
sb.append(ranges);
}
@Override
public boolean equals(Object o) {
return o instanceof Charset && ranges.equals(((Charset) o).ranges);
}
@Override
public int hashCode() {
return ranges.hashCode() ^ 0xdede2246;
}
}
static final class DecomposedCharset {
boolean inverted;
final CharRanges ranges;
final String namedGroups;
DecomposedCharset(
boolean inverted, CharRanges ranges, String namedGroups) {
this.inverted = inverted;
this.ranges = ranges;
this.namedGroups = namedGroups;
}
int complexity() {
return (inverted ? 1 : 0) + namedGroups.length() + complexity(ranges);
}
void appendSourceCode(StringBuilder sb) {
if (ranges.isEmpty()) {
if (!inverted && namedGroups.length() == 2) {
sb.append(namedGroups);
return;
} else if (ranges.isEmpty() && namedGroups.isEmpty()) {
sb.append(inverted ? "[\\S\\s]" : "(?!)");
return;
}
}
sb.append('[');
if (inverted) { sb.append('^'); }
sb.append(namedGroups);
boolean rangesStartCharset = !inverted && namedGroups.isEmpty();
boolean emitDashAtEnd = false;
for (int i = 0, n = ranges.getNumRanges(); i < n; ++i) {
char start = (char) ranges.start(i);
char end = (char) (ranges.end(i) - 1);
switch (end - start) {
case 0:
if (start == '-') {
// Put it at the end where it doesn't need escaping.
emitDashAtEnd = true;
} else {
escapeRangeCharOnto(
start, rangesStartCharset, i == 0, i + 1 == n, sb);
}
break;
case 1:
escapeRangeCharOnto(start, rangesStartCharset, i == 0, false, sb);
escapeRangeCharOnto(
end, rangesStartCharset, false, i + 1 == n, sb);
break;
default:
escapeRangeCharOnto(start, rangesStartCharset, i == 0, false, sb);
sb.append('-');
escapeRangeCharOnto(end, rangesStartCharset, false, true, sb);
break;
}
}
if (emitDashAtEnd) { sb.append('-'); }
sb.append(']');
}
static void escapeRangeCharOnto(
char ch, boolean startIsFlush, boolean atStart, boolean atEnd,
StringBuilder sb) {
switch (ch) {
case '\b':
sb.append("\\b");
break;
case '^':
sb.append(atStart && startIsFlush ? "\\^" : "^");
break;
case '-':
sb.append(atStart || atEnd ? "-" : "\\-");
break;
case '\\':
case ']':
sb.append('\\').append(ch);
break;
default:
escapeCharOnto(ch, sb);
}
}
static int complexity(CharRanges ranges) {
int complexity = 0;
for (int i = 0, n = ranges.getNumRanges(); i < n; ++i) {
int start = ranges.start(i);
int end = ranges.end(i) - 1;
if (start < 0x20 || start >= 0x7f) {
complexity += start >= 0x100 ? 6 : 4;
} else {
++complexity;
}
switch (end - start) {
case 0: continue;
case 1: break;
default: complexity += 1;
}
if (end < 0x20 || end >= 0x7f) {
complexity += end >= 0x100 ? 6 : 4;
} else {
++complexity;
}
}
return complexity;
}
@Override
public boolean equals(Object o) {
if (!(o instanceof DecomposedCharset)) {
return false;
}
DecomposedCharset that = (DecomposedCharset) o;
return this.inverted = that.inverted
&& this.ranges.equals(that.ranges)
&& this.namedGroups.equals(that.namedGroups);
}
@Override
public int hashCode() {
return ranges.hashCode()
+ 31 * (namedGroups.hashCode() + (inverted ? 1 : 0));
}
}
static final class Concatenation extends RegExpTree {
final ImmutableList<RegExpTree> elements;
Concatenation(RegExpTree a, RegExpTree b) {
elements = ImmutableList.of(a, b);
}
Concatenation(List<? extends RegExpTree> elements) {
this.elements = ImmutableList.copyOf(elements);
}
@Override
public RegExpTree simplify(final String flags) {
class Simplifier {
final List<RegExpTree> simplified = new ArrayList<>();
void simplify(RegExpTree t) {
if (t instanceof Concatenation) {
for (RegExpTree child : ((Concatenation) t).elements) {
simplify(child);
}
} else if (t instanceof Empty) {
// Do nothing
} else {
int lastIndex = simplified.size() - 1;
if (lastIndex >= 0) {
RegExpTree pairwise = simplifyPairwise(
simplified.get(lastIndex), t);
if (pairwise != null) {
simplified.set(lastIndex, pairwise);
return;
}
}
simplified.add(t);
}
}
RegExpTree simplifyPairwise(RegExpTree before, RegExpTree after) {
if (before instanceof Text && after instanceof Text) {
return new Text(
((Text) before).text + ((Text) after).text).simplify(flags);
}
// Fold adjacent repetitions.
int beforeMin = 1, beforeMax = 1;
RegExpTree beforeBody = before;
boolean beforeGreedy = false;
if (before instanceof Repetition) {
Repetition r = (Repetition) before;
beforeMin = r.min;
beforeMax = r.max;
beforeBody = r.body;
beforeGreedy = r.greedy;
}
int afterMin = 1, afterMax = 1;
RegExpTree afterBody = after;
boolean afterGreedy = false;
if (after instanceof Repetition) {
Repetition r = (Repetition) after;
afterMin = r.min;
afterMax = r.max;
afterBody = r.body;
afterGreedy = r.greedy;
}
if (beforeBody.equals(afterBody)
&& !beforeBody.hasCapturingGroup()) {
long lmin = ((long) beforeMin) + afterMin;
long lmax = ((long) beforeMax) + afterMax;
if (lmin < Integer.MAX_VALUE) {
int min = (int) lmin;
int max = lmax >= Integer.MAX_VALUE
? Integer.MAX_VALUE : (int) lmax;
return new Repetition(
beforeBody, min, max,
beforeGreedy || afterGreedy || min == max);
}
}
return null;
}
}
Simplifier s = new Simplifier();
for (RegExpTree element : elements) {
s.simplify(element.simplify(flags));
}
switch (s.simplified.size()) {
case 0: return Empty.INSTANCE;
case 1: return s.simplified.get(0);
default: return new Concatenation(s.simplified);
}
}
@Override
public boolean isCaseSensitive() {
for (RegExpTree element : elements) {
if (element.isCaseSensitive()) {
return true;
}
}
return false;
}
@Override
public boolean containsAnchor() {
for (RegExpTree element : elements) {
if (element.containsAnchor()) {
return true;
}
}
return false;
}
@Override
public int numCapturingGroups() {
int n = 0;
for (RegExpTree element : elements) {
n += element.numCapturingGroups();
}
return n;
}
@Override
public ImmutableList<? extends RegExpTree> children() {
return elements;
}
@Override
protected void appendSourceCode(StringBuilder sb) {
// True if the last content written might consume
// decimal digits written subsequently.
boolean digitsMightBleed = false;
for (RegExpTree element : elements) {
boolean parenthesize = false;
if (element instanceof Alternation
|| element instanceof Concatenation) {
parenthesize = true;
}
if (parenthesize) {
sb.append("(?:");
element.appendSourceCode(sb);
sb.append(')');
} else {
int start = sb.length();
element.appendSourceCode(sb);
if (digitsMightBleed && sb.length() > start) {
char firstChar = sb.charAt(start);
if ('0' <= firstChar && firstChar <= '9') {
// Bleeding happened.
// If the last character would be ambiguous
// with a repetition, escape it.
if (sb.charAt(start - 1) == '{') {
// Concatenation from optimization of
// /{(?:0,}/ -> /\{0,}/
sb.insert(start - 1, '\\');
} else {
// Or parenthesize otherwise.
// Concatenation from optimization of
// /(.)\1(?:0)/ -> /(.)\1(?:0)/.
sb.insert(start, "(?:").append(')');
}
}
}
}
digitsMightBleed = (
// \1(?:0) bleeds if there are 10 or more
// capturing groups preceding.
(element instanceof BackReference
&& ((BackReference) element).groupIndex < 10)
// foo{(?:10}) bleeds.
|| (element instanceof Text
&& ((Text) element).text.endsWith("{")));
}
}
@Override
protected void appendDebugInfo(StringBuilder sb) {
// Nothing besides children.
}
@Override
public boolean equals(Object o) {
return o instanceof Concatenation
&& elements.equals(((Concatenation) o).elements);
}
@Override
public int hashCode() {
return 0x20997e3e ^ elements.hashCode();
}
}
static void escapeCharOnto(char ch, StringBuilder sb) {
switch (ch) {
case '\u0000':
sb.append("\\0");
break;
case '\f':
sb.append("\\f");
break;
case '\t':
sb.append("\\t");
break;
case '\n':
sb.append("\\n");
break;
case '\r':
sb.append("\\r");
break;
case '\\':
sb.append("\\\\");
break;
default:
if (ch < 0x20 || ch >= 0x7f) {
if (ch >= 0x100) {
sb.append("\\u");
sb.append("0123456789abcdef".charAt((ch >> 12) & 0xf));
sb.append("0123456789abcdef".charAt((ch >> 8) & 0xf));
sb.append("0123456789abcdef".charAt((ch >> 4) & 0xf));
sb.append("0123456789abcdef".charAt((ch) & 0xf));
} else {
sb.append("\\x");
sb.append("0123456789abcdef".charAt((ch >> 4) & 0xf));
sb.append("0123456789abcdef".charAt((ch) & 0xf));
}
} else {
sb.append(ch);
}
}
}
}