// simplify1 is the shared simplifier for the unary operators STAR, PLUS
// and QUEST. Given an already-simple operand sub, it produces the simple
// regexp equivalent to
//
//      Regexp{Op: op, Flags: flags, Sub: {sub}}
//
// and only allocates a new node when none of the shortcuts below apply.
// When the caller supplies the original node re and the result would be
// structurally the same as re, re itself is handed back.
//
// It lives outside simplify() because the rewrites of other operators
// (counted repetition in particular) synthesise these unary expressions
// and route them through here so that what they build is simple too.
private static Regexp simplify1(Regexp.Op op, int flags, Regexp sub, Regexp re)
{
    int wantedGreediness = flags & RE2.NON_GREEDY;

    // Shortcut 1: the empty string repeated any number of times is still
    // the empty string.
    if (sub.op == Regexp.Op.EMPTY_MATCH)
    {
        return sub;
    }

    // Shortcut 2: applying the same operator twice with the same
    // greediness is idempotent, so the operand already is the answer.
    if (op == sub.op && wantedGreediness == (sub.flags & RE2.NON_GREEDY))
    {
        return sub;
    }

    // Shortcut 3: the caller's node already has exactly this shape
    // (same operator, same greediness, same operand instance).
    if (re != null &&
        re.op == op &&
        (re.flags & RE2.NON_GREEDY) == wantedGreediness &&
        sub == re.subs[0])
    {
        return re;
    }

    // No shortcut applies: build the node.
    Regexp fresh = new Regexp(op);
    fresh.flags = flags;
    fresh.subs = new Regexp[] { sub };
    return fresh;
}
// compileImpl runs the whole pipeline for one pattern: parse expr under
// the given mode flags, simplify the parse tree, compile it to a Prog and
// wrap the result in an RE2. It then fills in the prefix-related fields
// of the RE2 (prefixComplete, prefix, prefixUTF8 and, for a non-empty
// prefix, prefixRune).
//
// Exposed to ExecTests.
public static RE2 compileImpl(String expr, int mode, bool longest)
{
    Regexp tree = Parser.parse(expr, mode);

    // Read the capture count from the parsed tree before simplifying;
    // the count may shrink during simplify.
    int maxCap = tree.maxCap();

    tree = Simplify.simplify(tree);
    Prog prog = Compiler.compileRegexp(tree);
    RE2 compiled = new RE2(expr, prog, maxCap, longest);

    // prog.prefix() is given a builder to write into; its return value is
    // stored as prefixComplete and the builder's text as the prefix.
    StringBuilder prefixText = new StringBuilder();
    compiled.prefixComplete = prog.prefix(prefixText);
    compiled.prefix = prefixText.ToString();

    try
    {
        compiled.prefixUTF8 = Encoding.UTF8.GetBytes(compiled.prefix);
    }
    catch (Exception)
    {
        throw new IllegalStateException("can't happen");
    }

    // The first code point of the prefix is stored separately.
    if (compiled.prefix.Length > 0)
    {
        compiled.prefixRune = Char.ConvertToUtf32(compiled.prefix, 0);
    }

    return compiled;
}
// compileRegexp compiles the regexp tree re with a fresh Compiler and
// returns the resulting program. The fragment's outgoing list is patched
// to a newly created MATCH instruction, and the fragment's entry index
// becomes the program's start.
public static Prog compileRegexp(Regexp re)
{
    Compiler compiler = new Compiler();
    Frag whole = compiler.compile(re);

    // Kept as one expression so operands are evaluated in the same order
    // as before (out list first, then allocation of the MATCH instruction).
    compiler.prog.patch(whole.@out, compiler.newInst(Inst.InstOp.MATCH).i);

    compiler.prog.start = whole.i;
    return compiler.prog;
}
// Shallow copy constructor: every field is copied by plain assignment, so
// the new node refers to the same subs and runes arrays as the source
// node rather than to copies of them.
public Regexp(Regexp that)
{
    // Operator and parse flags.
    op = that.op;
    flags = that.flags;

    // Operands (array references are shared, not cloned).
    subs = that.subs;
    runes = that.runes;

    // Repetition bounds.
    min = that.min;
    max = that.max;

    // Capture index and name.
    cap = that.cap;
    name = that.name;
}
// compile recursively translates the regexp tree rooted at re into a
// program fragment, dispatching on the node's operator. Leaf operators
// map to a single helper call; LITERAL, CONCAT and ALTERNATE fold their
// children left to right with cat()/alt(); an operator with no matching
// case raises IllegalStateException.
private Frag compile(Regexp re)
{
    switch (re.op)
    {
        case Regexp.Op.NO_MATCH:
            return fail();

        case Regexp.Op.EMPTY_MATCH:
            return nop();

        case Regexp.Op.LITERAL:
        {
            if (re.runes.Length == 0)
            {
                return nop();
            }
            // One fragment per rune, concatenated in order.
            Frag chain = null;
            foreach (int r in re.runes)
            {
                Frag step = rune(r, re.flags);
                if (chain == null)
                {
                    chain = step;
                }
                else
                {
                    chain = cat(chain, step);
                }
            }
            return chain;
        }

        case Regexp.Op.CHAR_CLASS:
            return rune(re.runes, re.flags);

        case Regexp.Op.ANY_CHAR_NOT_NL:
            return rune(ANY_RUNE_NOT_NL, 0);

        case Regexp.Op.ANY_CHAR:
            return rune(ANY_RUNE, 0);

        // Zero-width assertions.
        case Regexp.Op.BEGIN_LINE:
            return empty(Utils.EMPTY_BEGIN_LINE);

        case Regexp.Op.END_LINE:
            return empty(Utils.EMPTY_END_LINE);

        case Regexp.Op.BEGIN_TEXT:
            return empty(Utils.EMPTY_BEGIN_TEXT);

        case Regexp.Op.END_TEXT:
            return empty(Utils.EMPTY_END_TEXT);

        case Regexp.Op.WORD_BOUNDARY:
            return empty(Utils.EMPTY_WORD_BOUNDARY);

        case Regexp.Op.NO_WORD_BOUNDARY:
            return empty(Utils.EMPTY_NO_WORD_BOUNDARY);

        case Regexp.Op.CAPTURE:
        {
            // Slot 2*cap opens the group and slot 2*cap+1 closes it. The
            // three fragments are created in this order on purpose.
            Frag open = cap(re.cap << 1);
            Frag inner = compile(re.subs[0]);
            Frag close = cap(re.cap << 1 | 1);
            return cat(cat(open, inner), close);
        }

        case Regexp.Op.STAR:
        {
            Frag body = compile(re.subs[0]);
            bool nonGreedy = (re.flags & RE2.NON_GREEDY) != 0;
            return star(body, nonGreedy);
        }

        case Regexp.Op.PLUS:
        {
            Frag body = compile(re.subs[0]);
            bool nonGreedy = (re.flags & RE2.NON_GREEDY) != 0;
            return plus(body, nonGreedy);
        }

        case Regexp.Op.QUEST:
        {
            Frag body = compile(re.subs[0]);
            bool nonGreedy = (re.flags & RE2.NON_GREEDY) != 0;
            return quest(body, nonGreedy);
        }

        case Regexp.Op.CONCAT:
        {
            if (re.subs.Length == 0)
            {
                return nop();
            }
            Frag sequence = null;
            foreach (Regexp child in re.subs)
            {
                Frag piece = compile(child);
                if (sequence == null)
                {
                    sequence = piece;
                }
                else
                {
                    sequence = cat(sequence, piece);
                }
            }
            return sequence;
        }

        case Regexp.Op.ALTERNATE:
        {
            if (re.subs.Length == 0)
            {
                return nop();
            }
            Frag choice = null;
            foreach (Regexp child in re.subs)
            {
                Frag branch = compile(child);
                if (choice == null)
                {
                    choice = branch;
                }
                else
                {
                    choice = alt(choice, branch);
                }
            }
            return choice;
        }

        default:
            throw new IllegalStateException("regexp: unhandled case in compile");
    }
}
// simplify returns a regexp equivalent to re in which counted repetitions
// have been expanded and a few other rewrites applied, such as turning
// /(?:a+)+/ into /a+/. The result executes correctly, but printing it and
// re-parsing will not reproduce the same parse tree, because capturing
// parentheses may have been duplicated or removed: /(x){1,2}/ becomes
// /(x)(x)?/ with both groups capturing as $1. The returned regexp may be
// re itself or may share structure with it. A null input yields null.
public static Regexp simplify(Regexp re)
{
    if (re == null)
    {
        return null;
    }

    switch (re.op)
    {
        case Regexp.Op.CAPTURE:
        case Regexp.Op.CONCAT:
        case Regexp.Op.ALTERNATE:
        {
            // Simplify each child. The node is copied lazily, the first
            // time a child actually changes; until then re is reused.
            Regexp result = re;
            for (int idx = 0; idx < re.subs.Length; ++idx)
            {
                Regexp before = re.subs[idx];
                Regexp after = simplify(before);
                if (result == re && after != before)
                {
                    result = new Regexp(re);  // shallow copy
                    result.runes = null;
                    result.subs = Parser.subarray(re.subs, 0, re.subs.Length);  // clone
                }
                if (result != re)
                {
                    result.subs[idx] = after;
                }
            }
            return result;
        }

        case Regexp.Op.STAR:
        case Regexp.Op.PLUS:
        case Regexp.Op.QUEST:
        {
            Regexp operand = simplify(re.subs[0]);
            return simplify1(re.op, re.flags, operand, re);
        }

        case Regexp.Op.REPEAT:
        {
            // x{0} matches only the empty string; x need not be looked at.
            if (re.min == 0 && re.max == 0)
            {
                return new Regexp(Regexp.Op.EMPTY_MATCH);
            }

            Regexp sub = simplify(re.subs[0]);

            // Open-ended repetition x{n,}: at least n copies of x.
            if (re.max == -1)
            {
                switch (re.min)
                {
                    case 0:
                        // x{0,} is x*.
                        return simplify1(Regexp.Op.STAR, re.flags, sub, null);

                    case 1:
                        // x{1,} is x+.
                        return simplify1(Regexp.Op.PLUS, re.flags, sub, null);
                }

                // General case: x{4,} is xxxx+ (n-1 plain copies, then x+).
                List<Regexp> pieces = new List<Regexp>();
                for (int copy = 0; copy < re.min - 1; copy++)
                {
                    pieces.Add(sub);
                }
                pieces.Add(simplify1(Regexp.Op.PLUS, re.flags, sub, null));

                Regexp atLeast = new Regexp(Regexp.Op.CONCAT);
                atLeast.subs = pieces.ToArray();
                return atLeast;
            }

            // x{0} was handled above; x{1} is just x.
            if (re.min == 1 && re.max == 1)
            {
                return sub;
            }

            // Bounded repetition x{n,m}: n mandatory copies followed by
            // optional ones. Nesting the optional copies makes the machine
            // do less work, so x{2,5} becomes xx(x(x(x)?)?)?.

            // Mandatory prefix: xx.
            List<Regexp> mandatory = null;
            if (re.min > 0)
            {
                mandatory = new List<Regexp>();
                for (int copy = 0; copy < re.min; copy++)
                {
                    mandatory.Add(sub);
                }
            }

            // Optional suffix, built from the inside out: (x(x(x)?)?)?
            if (re.max > re.min)
            {
                Regexp optional = simplify1(Regexp.Op.QUEST, re.flags, sub, null);
                for (int depth = re.min + 1; depth < re.max; depth++)
                {
                    Regexp pair = new Regexp(Regexp.Op.CONCAT);
                    pair.subs = new Regexp[] { sub, optional };
                    optional = simplify1(Regexp.Op.QUEST, re.flags, pair, null);
                }
                if (mandatory == null)
                {
                    return optional;
                }
                mandatory.Add(optional);
            }

            if (mandatory != null)
            {
                Regexp sequence = new Regexp(Regexp.Op.CONCAT);
                sequence.subs = mandatory.ToArray();
                return sequence;
            }

            // Neither a prefix nor a suffix was built (a degenerate bound
            // combination): treat it as an impossible match.
            return new Regexp(Regexp.Op.NO_MATCH);
        }
    }

    // Every other operator is already simple.
    return re;
}
// Equals returns true if this and that have identical structure.
//
// Two nodes are structurally identical when they have the same operator
// and, depending on that operator, the same distinguishing data:
//   END_TEXT             - same WAS_DOLLAR flag
//   LITERAL, CHAR_CLASS  - same rune sequence, compared element by element
//   ALTERNATE, CONCAT    - same number of operands, pairwise equal
//   STAR, PLUS, QUEST    - same greediness and equal operand
//   REPEAT               - same greediness, same min/max, equal operand
//   CAPTURE              - same capture index, same name, equal operand
// Any other operator compares equal on the operator alone. Anything that
// is not a Regexp (including null) compares unequal.
public override bool Equals(Object that)
{
    if (that as Regexp == null)
    {
        return false;
    }
    Regexp x = this;
    Regexp y = (Regexp)that;
    if (x.op != y.op)
    {
        return false;
    }

    switch (x.op)
    {
        case Op.END_TEXT:
            // The parse flags remember whether this is \z or \Z.
            if ((x.flags & RE2.WAS_DOLLAR) != (y.flags & RE2.WAS_DOLLAR))
            {
                return false;
            }
            break;

        case Op.LITERAL:
        case Op.CHAR_CLASS:
        {
            // Compare rune contents, not array identity. Array.Equals(a, b)
            // resolves to the static Object.Equals(object, object), which
            // for arrays is reference equality, so two separately built
            // but identical nodes would wrongly compare unequal.
            int[] xr = x.runes;
            int[] yr = y.runes;
            if (xr != yr)
            {
                if (xr == null || yr == null || xr.Length != yr.Length)
                {
                    return false;
                }
                for (int k = 0; k < xr.Length; ++k)
                {
                    if (xr[k] != yr[k])
                    {
                        return false;
                    }
                }
            }
            break;
        }

        case Op.ALTERNATE:
        case Op.CONCAT:
            if (x.subs.Length != y.subs.Length)
            {
                return false;
            }
            for (int i = 0; i < x.subs.Length; ++i)
            {
                if (!x.subs[i].Equals(y.subs[i]))
                {
                    return false;
                }
            }
            break;

        case Op.STAR:
        case Op.PLUS:
        case Op.QUEST:
            if ((x.flags & RE2.NON_GREEDY) != (y.flags & RE2.NON_GREEDY) ||
                !x.subs[0].Equals(y.subs[0]))
            {
                return false;
            }
            break;

        case Op.REPEAT:
            if ((x.flags & RE2.NON_GREEDY) != (y.flags & RE2.NON_GREEDY) ||
                x.min != y.min ||
                x.max != y.max ||
                !x.subs[0].Equals(y.subs[0]))
            {
                return false;
            }
            break;

        case Op.CAPTURE:
            // Names are equal when both are null or both hold the same text.
            if (x.cap != y.cap ||
                (x.name == null ? y.name != null : !x.name.Equals(y.name)) ||
                !x.subs[0].Equals(y.subs[0]))
            {
                return false;
            }
            break;
    }
    return true;
}
// appendTo() appends the Perl syntax for |this| regular expression to |out|.
// Operands are wrapped in a non-capturing group (?:...) where needed to
// keep the printed form unambiguous: under a repetition operator when the
// operand's operator sorts after CAPTURE or is a multi-rune literal, and
// inside a concatenation when the operand is an alternation.
private void appendTo(StringBuilder @out)
{
    switch (op)
    {
        case Op.NO_MATCH:
            @out.Append("[^\\x00-\\x{10FFFF}]");
            break;

        case Op.EMPTY_MATCH:
            @out.Append("(?:)");
            break;

        case Op.STAR:
        case Op.PLUS:
        case Op.QUEST:
        case Op.REPEAT:
        {
            // Operand first, grouped if necessary.
            Regexp operand = subs[0];
            bool needsGroup = operand.op > Op.CAPTURE ||
                              (operand.op == Op.LITERAL && operand.runes.Length > 1);
            if (needsGroup)
            {
                @out.Append("(?:");
            }
            operand.appendTo(@out);
            if (needsGroup)
            {
                @out.Append(')');
            }

            // Then the operator itself.
            if (op == Op.STAR)
            {
                @out.Append('*');
            }
            else if (op == Op.PLUS)
            {
                @out.Append('+');
            }
            else if (op == Op.QUEST)
            {
                @out.Append('?');
            }
            else
            {
                // REPEAT: {min}, {min,} or {min,max}.
                @out.Append('{').Append(min);
                if (min != max)
                {
                    @out.Append(',');
                    if (max >= 0)
                    {
                        @out.Append(max);
                    }
                }
                @out.Append('}');
            }

            // A trailing '?' marks the non-greedy variant.
            if ((flags & RE2.NON_GREEDY) != 0)
            {
                @out.Append('?');
            }
            break;
        }

        case Op.CONCAT:
            foreach (Regexp part in subs)
            {
                bool isAlternation = part.op == Op.ALTERNATE;
                if (isAlternation)
                {
                    @out.Append("(?:");
                }
                part.appendTo(@out);
                if (isAlternation)
                {
                    @out.Append(')');
                }
            }
            break;

        case Op.ALTERNATE:
        {
            // Branches separated by '|', with no leading separator.
            for (int idx = 0; idx < subs.Length; ++idx)
            {
                if (idx > 0)
                {
                    @out.Append("|");
                }
                subs[idx].appendTo(@out);
            }
            break;
        }

        case Op.LITERAL:
        {
            bool foldCase = (flags & RE2.FOLD_CASE) != 0;
            if (foldCase)
            {
                @out.Append("(?i:");
            }
            foreach (int r in runes)
            {
                Utils.escapeRune(@out, r);
            }
            if (foldCase)
            {
                @out.Append(')');
            }
            break;
        }

        case Op.ANY_CHAR_NOT_NL:
            @out.Append("(?-s:.)");
            break;

        case Op.ANY_CHAR:
            @out.Append("(?s:.)");
            break;

        case Op.CAPTURE:
        {
            bool unnamed = name == null || name.Length == 0;
            if (unnamed)
            {
                @out.Append('(');
            }
            else
            {
                @out.Append("(?P<");
                @out.Append(name);
                @out.Append(">");
            }
            // An empty group prints as "()" rather than "((?:))".
            if (subs[0].op != Op.EMPTY_MATCH)
            {
                subs[0].appendTo(@out);
            }
            @out.Append(')');
            break;
        }

        case Op.BEGIN_TEXT:
            @out.Append("\\A");
            break;

        case Op.END_TEXT:
            if ((flags & RE2.WAS_DOLLAR) != 0)
            {
                @out.Append("(?-m:$)");
            }
            else
            {
                @out.Append("\\z");
            }
            break;

        case Op.BEGIN_LINE:
            @out.Append('^');
            break;

        case Op.END_LINE:
            @out.Append('$');
            break;

        case Op.WORD_BOUNDARY:
            @out.Append("\\b");
            break;

        case Op.NO_WORD_BOUNDARY:
            @out.Append("\\B");
            break;

        case Op.CHAR_CLASS:
            // runes holds (lo, hi) pairs, so an odd length is malformed.
            if (runes.Length % 2 != 0)
            {
                @out.Append("[invalid char class]");
            }
            else
            {
                @out.Append('[');
                if (runes.Length == 0)
                {
                    @out.Append("^\\x00-\\x{10FFFF}");
                }
                else if (runes[0] == 0 && runes[runes.Length - 1] == Unicode.MAX_RUNE)
                {
                    // Contains 0 and MAX_RUNE. Probably a negated class.
                    // Print the gaps between consecutive ranges instead.
                    @out.Append('^');
                    for (int pos = 1; pos < runes.Length - 1; pos += 2)
                    {
                        int gapLo = runes[pos] + 1;
                        int gapHi = runes[pos + 1] - 1;
                        quoteIfHyphen(@out, gapLo);
                        Utils.escapeRune(@out, gapLo);
                        if (gapLo != gapHi)
                        {
                            @out.Append('-');
                            quoteIfHyphen(@out, gapHi);
                            Utils.escapeRune(@out, gapHi);
                        }
                    }
                }
                else
                {
                    // Print each range as lo or lo-hi.
                    for (int pos = 0; pos < runes.Length; pos += 2)
                    {
                        int rangeLo = runes[pos];
                        int rangeHi = runes[pos + 1];
                        quoteIfHyphen(@out, rangeLo);
                        Utils.escapeRune(@out, rangeLo);
                        if (rangeLo != rangeHi)
                        {
                            @out.Append('-');
                            quoteIfHyphen(@out, rangeHi);
                            Utils.escapeRune(@out, rangeHi);
                        }
                    }
                }
                @out.Append(']');
            }
            break;

        default:  // incl. pseudos
            @out.Append(op);
            break;
    }
}