Exemplo n.º 1
0
        // simplify1 produces the simplified form of a unary STAR, PLUS or
        // QUEST expression, i.e. the simple regexp equivalent to
        //
        //      Regexp{Op: op, Flags: flags, Sub: {sub}}
        //
        // sub is assumed to be simple already.  A node is only allocated
        // when none of the shortcuts below applies:
        //   - sub is the empty match: repeating the empty string still
        //     gives the empty string, so sub is returned;
        //   - sub already has operator op with the same greediness: the
        //     operators are idempotent, so sub is returned;
        //   - re is non-null and already is op applied to this very sub
        //     with the same greediness: re is returned.
        //
        // The method is separate from simplify because the rewrites of
        // other operators create these unary expressions as well; sending
        // them through simplify1 keeps what they generate simple.
        private static Regexp simplify1(Regexp.Op op, int flags, Regexp sub, Regexp re)
        {
            int greediness = flags & RE2.NON_GREEDY;

            // Either shortcut hands back sub untouched.
            if (sub.op == Regexp.Op.EMPTY_MATCH ||
                (sub.op == op && (sub.flags & RE2.NON_GREEDY) == greediness))
            {
                return sub;
            }

            // The caller's node can be reused when it already describes
            // exactly the expression being asked for.
            bool reusable = re != null &&
                            re.op == op &&
                            (re.flags & RE2.NON_GREEDY) == greediness &&
                            re.subs[0] == sub;
            if (reusable)
            {
                return re;
            }

            return new Regexp(op)
            {
                flags = flags,
                subs  = new Regexp[] { sub }
            };
        }
Exemplo n.º 2
0
Arquivo: RE2.cs Projeto: kaby76/re2cs
        // Builds an RE2 from expr: the expression is parsed with the given
        // mode, simplified, and compiled to a program.  The prefix string
        // reported by the program, its UTF-8 bytes and (when the prefix is
        // non-empty) the code point at its first position are stored on the
        // returned object.
        // Exposed to ExecTests.
        public static RE2 compileImpl(String expr, int mode, bool longest)
        {
            Regexp parsed = Parser.parse(expr, mode);

            // Taken before simplification on purpose: the value may shrink
            // during simplify.
            int captureCount = parsed.maxCap();

            Regexp simplified = Simplify.simplify(parsed);
            Prog   program    = Compiler.compileRegexp(simplified);
            RE2    result     = new RE2(expr, program, captureCount, longest);

            StringBuilder prefixText = new StringBuilder();
            result.prefixComplete = program.prefix(prefixText);
            result.prefix         = prefixText.ToString();

            try
            {
                result.prefixUTF8 = Encoding.UTF8.GetBytes(result.prefix);
            }
            catch (Exception)
            {
                // Encoding the prefix is not expected to fail.
                throw new IllegalStateException("can't happen");
            }

            if (result.prefix.Length > 0)
            {
                result.prefixRune = Char.ConvertToUtf32(result.prefix, 0);
            }

            return result;
        }
Exemplo n.º 3
0
        // Compiles re with a fresh Compiler and returns the resulting
        // program.  A MATCH instruction is added after the fragment for re
        // and the fragment's @out is patched to it; the program starts at
        // the fragment's entry (f.i).
        public static Prog compileRegexp(Regexp re)
        {
            Compiler compiler = new Compiler();
            Frag     fragment = compiler.compile(re);
            Prog     program  = compiler.prog;

            program.patch(fragment.@out, compiler.newInst(Inst.InstOp.MATCH).i);
            program.start = fragment.i;
            return program;
        }
Exemplo n.º 4
0
 // Shallow copy constructor: each field of that is copied by plain
 // assignment, so the subs and runes arrays are shared with that
 // rather than duplicated.
 public Regexp(Regexp that)
 {
     op    = that.op;
     flags = that.flags;
     subs  = that.subs;
     runes = that.runes;
     min   = that.min;
     max   = that.max;
     cap   = that.cap;
     name  = that.name;
 }
Exemplo n.º 5
0
        // compile translates the regexp re into a program fragment,
        // recursing into its sub-expressions from left to right.  An
        // operator without a case below results in an
        // IllegalStateException.
        private Frag compile(Regexp re)
        {
            switch (re.op)
            {
            case Regexp.Op.NO_MATCH:
                return fail();

            case Regexp.Op.EMPTY_MATCH:
                return nop();

            case Regexp.Op.LITERAL:
            {
                // A literal without runes compiles to a no-op.
                if (re.runes.Length == 0)
                {
                    return nop();
                }

                // Otherwise concatenate one fragment per rune.
                Frag chain = null;
                foreach (int r in re.runes)
                {
                    Frag piece = rune(r, re.flags);
                    chain = (chain != null) ? cat(chain, piece) : piece;
                }

                return chain;
            }

            case Regexp.Op.CHAR_CLASS:
                return rune(re.runes, re.flags);

            case Regexp.Op.ANY_CHAR_NOT_NL:
                return rune(ANY_RUNE_NOT_NL, 0);

            case Regexp.Op.ANY_CHAR:
                return rune(ANY_RUNE, 0);

            // Empty-width assertions.
            case Regexp.Op.BEGIN_LINE:
                return empty(Utils.EMPTY_BEGIN_LINE);

            case Regexp.Op.END_LINE:
                return empty(Utils.EMPTY_END_LINE);

            case Regexp.Op.BEGIN_TEXT:
                return empty(Utils.EMPTY_BEGIN_TEXT);

            case Regexp.Op.END_TEXT:
                return empty(Utils.EMPTY_END_TEXT);

            case Regexp.Op.WORD_BOUNDARY:
                return empty(Utils.EMPTY_WORD_BOUNDARY);

            case Regexp.Op.NO_WORD_BOUNDARY:
                return empty(Utils.EMPTY_NO_WORD_BOUNDARY);

            case Regexp.Op.CAPTURE:
            {
                // The opening cap, the body and the closing cap are created
                // in exactly this order.
                int  slot  = re.cap << 1;
                Frag open  = cap(slot);
                Frag inner = compile(re.subs[0]);
                Frag close = cap(slot | 1);
                return cat(cat(open, inner), close);
            }

            case Regexp.Op.STAR:
            case Regexp.Op.PLUS:
            case Regexp.Op.QUEST:
            {
                Frag body      = compile(re.subs[0]);
                bool nongreedy = (re.flags & RE2.NON_GREEDY) != 0;

                if (re.op == Regexp.Op.STAR)
                {
                    return star(body, nongreedy);
                }

                if (re.op == Regexp.Op.PLUS)
                {
                    return plus(body, nongreedy);
                }

                return quest(body, nongreedy);
            }

            case Regexp.Op.CONCAT:
            case Regexp.Op.ALTERNATE:
            {
                // No sub-expressions at all: compile to a no-op.
                if (re.subs.Length == 0)
                {
                    return nop();
                }

                // Fold the compiled children together with cat or alt,
                // depending on the operator.
                bool alternation = re.op == Regexp.Op.ALTERNATE;
                Frag combined    = null;
                foreach (Regexp child in re.subs)
                {
                    Frag piece = compile(child);
                    if (combined == null)
                    {
                        combined = piece;
                    }
                    else if (alternation)
                    {
                        combined = alt(combined, piece);
                    }
                    else
                    {
                        combined = cat(combined, piece);
                    }
                }

                return combined;
            }

            default:
                throw new IllegalStateException("regexp: unhandled case in compile");
            }
        }
Exemplo n.º 6
0
        // simplify returns a regexp equivalent to re in which counted
        // repetitions have been expanded and a few other simplifications
        // applied, e.g. /(?:a+)+/ becomes /a+/.  The result executes
        // correctly, but printing it does not give back the same parse
        // tree because capturing parentheses may have been duplicated or
        // dropped: /(x){1,2}/ simplifies to /(x)(x)?/ where both groups
        // capture as $1.  The returned regexp may share structure with re
        // or be re itself.  A null argument yields null.
        public static Regexp simplify(Regexp re)
        {
            if (re == null)
            {
                return null;
            }

            switch (re.op)
            {
            case Regexp.Op.CAPTURE:
            case Regexp.Op.CONCAT:
            case Regexp.Op.ALTERNATE:
            {
                // Simplify every child; re is only copied once the first
                // child that actually changes is found.
                Regexp result = re;
                for (int idx = 0; idx < re.subs.Length; ++idx)
                {
                    Regexp child      = re.subs[idx];
                    Regexp simplified = simplify(child);

                    if (result == re)
                    {
                        if (simplified == child)
                        {
                            continue;   // nothing changed so far
                        }

                        result       = new Regexp(re);                              // shallow copy
                        result.runes = null;
                        result.subs  = Parser.subarray(re.subs, 0, re.subs.Length); // clone
                    }

                    result.subs[idx] = simplified;
                }

                return result;
            }

            case Regexp.Op.STAR:
            case Regexp.Op.PLUS:
            case Regexp.Op.QUEST:
                return simplify1(re.op, re.flags, simplify(re.subs[0]), re);

            case Regexp.Op.REPEAT:
            {
                int lo = re.min;
                int hi = re.max;

                // x{0} matches the empty string; x need not be looked at.
                if (lo == 0 && hi == 0)
                {
                    return new Regexp(Regexp.Op.EMPTY_MATCH);
                }

                Regexp body = simplify(re.subs[0]);

                // x{n,}: at least n matches of x.
                if (hi == -1)
                {
                    switch (lo)
                    {
                    case 0:     // x{0,} is x*
                        return simplify1(Regexp.Op.STAR, re.flags, body, null);

                    case 1:     // x{1,} is x+
                        return simplify1(Regexp.Op.PLUS, re.flags, body, null);
                    }

                    // General case: x{4,} is xxxx+.
                    List <Regexp> parts = new List <Regexp>();
                    for (int left = lo - 1; left > 0; left--)
                    {
                        parts.Add(body);
                    }

                    parts.Add(simplify1(Regexp.Op.PLUS, re.flags, body, null));

                    Regexp sequence = new Regexp(Regexp.Op.CONCAT);
                    sequence.subs = parts.ToArray();
                    return sequence;
                }

                // x{1} is just x (x{0} was handled above).
                if (lo == 1 && hi == 1)
                {
                    return body;
                }

                // x{n,m}: n copies of x followed by optional copies.  The
                // optional copies are nested so the machine does less work:
                // x{2,5} = xx(x(x(x)?)?)?

                // Mandatory part: xx.
                List <Regexp> head = null;
                if (lo > 0)
                {
                    head = new List <Regexp>();
                    for (int left = lo; left > 0; left--)
                    {
                        head.Add(body);
                    }
                }

                // Optional part: (x(x(x)?)?)?
                if (hi > lo)
                {
                    Regexp tail = simplify1(Regexp.Op.QUEST, re.flags, body, null);
                    for (int depth = lo + 1; depth < hi; depth++)
                    {
                        Regexp pair = new Regexp(Regexp.Op.CONCAT);
                        pair.subs = new Regexp[] { body, tail };
                        tail      = simplify1(Regexp.Op.QUEST, re.flags, pair, null);
                    }

                    if (head == null)
                    {
                        return tail;
                    }

                    head.Add(tail);
                }

                if (head == null)
                {
                    // Some degenerate case like min > max or min < max < 0.
                    // Handle as impossible match.
                    return new Regexp(Regexp.Op.NO_MATCH);
                }

                Regexp concat = new Regexp(Regexp.Op.CONCAT);
                concat.subs = head.ToArray();
                return concat;
            }
            }

            // Every other operator is already simple.
            return re;
        }
Exemplo n.º 7
0
// Equals() returns true if this and that have identical structure.
        //
        // that must be a Regexp with the same operator; beyond that, the
        // fields compared depend on the operator:
        //   END_TEXT             the WAS_DOLLAR flag
        //   LITERAL, CHAR_CLASS  the rune arrays, element by element
        //   ALTERNATE, CONCAT    all sub-expressions, pairwise
        //   STAR, PLUS, QUEST    the NON_GREEDY flag and the sub-expression
        //   REPEAT               as above, plus min and max
        //   CAPTURE              cap, name and the sub-expression
        // Operators not listed compare equal whenever the operators match.
        //
        // NOTE(review): confirm that GetHashCode is overridden consistently
        // with this method elsewhere in the class.
        public override bool Equals(Object that)
        {
            Regexp x = this;
            Regexp y = that as Regexp;

            // Null or not a Regexp at all.
            if (y == null)
            {
                return(false);
            }

            if (x.op != y.op)
            {
                return(false);
            }

            switch (x.op)
            {
            case Op.END_TEXT:
                // The parse flags remember whether this is \z or \Z.
                if ((x.flags & RE2.WAS_DOLLAR) != (y.flags & RE2.WAS_DOLLAR))
                {
                    return(false);
                }

                break;

            case Op.LITERAL:
            case Op.CHAR_CLASS:
            {
                // Compare the rune arrays by content.  A reference
                // comparison (which is what Array.Equals(a, b) performs,
                // being the inherited static Object.Equals) would report
                // two separately allocated arrays with the same runes as
                // different.
                var xr = x.runes;
                var yr = y.runes;
                if (xr != yr)
                {
                    // Not the same array (and not both null).
                    if (xr == null || yr == null || xr.Length != yr.Length)
                    {
                        return(false);
                    }

                    for (int k = 0; k < xr.Length; ++k)
                    {
                        if (xr[k] != yr[k])
                        {
                            return(false);
                        }
                    }
                }

                break;
            }

            case Op.ALTERNATE:
            case Op.CONCAT:
                if (x.subs.Length != y.subs.Length)
                {
                    return(false);
                }

                for (int i = 0; i < x.subs.Length; ++i)
                {
                    if (!x.subs[i].Equals(y.subs[i]))
                    {
                        return(false);
                    }
                }

                break;

            case Op.STAR:
            case Op.PLUS:
            case Op.QUEST:
                if ((x.flags & RE2.NON_GREEDY) != (y.flags & RE2.NON_GREEDY) ||
                    !x.subs[0].Equals(y.subs[0]))
                {
                    return(false);
                }

                break;

            case Op.REPEAT:
                if ((x.flags & RE2.NON_GREEDY) != (y.flags & RE2.NON_GREEDY) ||
                    x.min != y.min ||
                    x.max != y.max ||
                    !x.subs[0].Equals(y.subs[0]))
                {
                    return(false);
                }

                break;

            case Op.CAPTURE:
                if (x.cap != y.cap ||
                    (x.name == null ? y.name != null : !x.name.Equals(y.name)) ||
                    !x.subs[0].Equals(y.subs[0]))
                {
                    return(false);
                }

                break;
            }

            return(true);
        }
Exemplo n.º 8
0
        // appendTo() writes the Perl syntax for |this| regular expression to
        // |out|, recursing into sub-expressions.  Operands are wrapped in a
        // non-capturing group "(?:...)" where the code below decides it is
        // needed; operators without a dedicated case are written via
        // Append(op).
        private void appendTo(StringBuilder @out)
        {
            switch (op)
            {
            case Op.NO_MATCH:
                @out.Append("[^\\x00-\\x{10FFFF}]");
                return;

            case Op.EMPTY_MATCH:
                @out.Append("(?:)");
                return;

            case Op.STAR:
            case Op.PLUS:
            case Op.QUEST:
            case Op.REPEAT:
            {
                // Operand first, grouped when its operator sorts after
                // CAPTURE or when it is a literal of more than one rune.
                Regexp operand    = subs[0];
                bool   needsGroup = operand.op > Op.CAPTURE ||
                                    (operand.op == Op.LITERAL && operand.runes.Length > 1);
                if (needsGroup)
                {
                    @out.Append("(?:");
                }

                operand.appendTo(@out);
                if (needsGroup)
                {
                    @out.Append(')');
                }

                // Then the quantifier itself.
                if (op == Op.STAR)
                {
                    @out.Append('*');
                }
                else if (op == Op.PLUS)
                {
                    @out.Append('+');
                }
                else if (op == Op.QUEST)
                {
                    @out.Append('?');
                }
                else
                {
                    // REPEAT: {min}, {min,} or {min,max}.
                    @out.Append('{').Append(min);
                    if (min != max)
                    {
                        @out.Append(',');
                        if (max >= 0)
                        {
                            @out.Append(max);
                        }
                    }

                    @out.Append('}');
                }

                // A trailing '?' marks the non-greedy variant.
                if ((flags & RE2.NON_GREEDY) != 0)
                {
                    @out.Append('?');
                }

                return;
            }

            case Op.CONCAT:
                foreach (Regexp part in subs)
                {
                    // Alternations inside a concatenation get grouped.
                    bool wrap = part.op == Op.ALTERNATE;
                    if (wrap)
                    {
                        @out.Append("(?:");
                    }

                    part.appendTo(@out);
                    if (wrap)
                    {
                        @out.Append(')');
                    }
                }

                return;

            case Op.ALTERNATE:
                for (int k = 0; k < subs.Length; k++)
                {
                    if (k > 0)
                    {
                        @out.Append("|");
                    }

                    subs[k].appendTo(@out);
                }

                return;

            case Op.LITERAL:
            {
                bool foldCase = (flags & RE2.FOLD_CASE) != 0;
                if (foldCase)
                {
                    @out.Append("(?i:");
                }

                foreach (int r in runes)
                {
                    Utils.escapeRune(@out, r);
                }

                if (foldCase)
                {
                    @out.Append(')');
                }

                return;
            }

            case Op.ANY_CHAR_NOT_NL:
                @out.Append("(?-s:.)");
                return;

            case Op.ANY_CHAR:
                @out.Append("(?s:.)");
                return;

            case Op.CAPTURE:
                if (String.IsNullOrEmpty(name))
                {
                    @out.Append('(');
                }
                else
                {
                    @out.Append("(?P<");
                    @out.Append(name);
                    @out.Append(">");
                }

                // An empty-match body is written as nothing at all.
                if (subs[0].op != Op.EMPTY_MATCH)
                {
                    subs[0].appendTo(@out);
                }

                @out.Append(')');
                return;

            case Op.BEGIN_TEXT:
                @out.Append("\\A");
                return;

            case Op.END_TEXT:
                @out.Append((flags & RE2.WAS_DOLLAR) != 0 ? "(?-m:$)" : "\\z");
                return;

            case Op.BEGIN_LINE:
                @out.Append('^');
                return;

            case Op.END_LINE:
                @out.Append('$');
                return;

            case Op.WORD_BOUNDARY:
                @out.Append("\\b");
                return;

            case Op.NO_WORD_BOUNDARY:
                @out.Append("\\B");
                return;

            case Op.CHAR_CLASS:
            {
                // runes holds lo/hi pairs, so an odd length is malformed.
                if (runes.Length % 2 != 0)
                {
                    @out.Append("[invalid char class]");
                    return;
                }

                @out.Append('[');
                if (runes.Length == 0)
                {
                    @out.Append("^\\x00-\\x{10FFFF}");
                }
                else
                {
                    // Contains 0 and MAX_RUNE: probably a negated class, so
                    // print the gaps between the ranges instead of the
                    // ranges themselves.
                    bool negated = runes[0] == 0 && runes[runes.Length - 1] == Unicode.MAX_RUNE;
                    int  first   = negated ? 1 : 0;
                    int  limit   = negated ? runes.Length - 1 : runes.Length;
                    int  shift   = negated ? 1 : 0;

                    if (negated)
                    {
                        @out.Append('^');
                    }

                    for (int k = first; k < limit; k += 2)
                    {
                        int lo = runes[k] + shift;
                        int hi = runes[k + 1] - shift;

                        quoteIfHyphen(@out, lo);
                        Utils.escapeRune(@out, lo);
                        if (lo != hi)
                        {
                            @out.Append('-');
                            quoteIfHyphen(@out, hi);
                            Utils.escapeRune(@out, hi);
                        }
                    }
                }

                @out.Append(']');
                return;
            }

            default:     // incl. pseudos
                @out.Append(op);
                return;
            }
        }