Esempio n. 1
0
        /// <summary>
        /// Folds a chain of directly nested repeaters into the innermost one by
        /// multiplying the repeat bounds together, stopping at the first child
        /// that is of an incompatible kind or too lumpy to merge.
        /// </summary>
        /// <returns>
        /// The deepest repeater reached, carrying the combined bounds, or a new
        /// Nothing node when the combined minimum saturates at int.MaxValue.
        /// </returns>
        private RegexNode ReduceRep()
        {
            RegexNode current   = this;
            int       outerType = Type();
            int       lower     = M;
            int       upper     = N;

            while (current.ChildCount() != 0)
            {
                RegexNode inner     = current.Child(0);
                int       innerType = inner.Type();

                // Bounds are only multiplied across repeaters of the same kind: either the
                // very same type, or a child in the Oneloop..Setloop range under a Loop,
                // or a child in the Onelazy..Setlazy range under a Lazyloop.
                if (innerType != outerType)
                {
                    bool greedyPair = outerType == Loop && innerType >= Oneloop && innerType <= Setloop;
                    bool lazyPair   = outerType == Lazyloop && innerType >= Onelazy && innerType <= Setlazy;

                    if (!greedyPair && !lazyPair)
                    {
                        break;
                    }
                }

                // The child can be too lumpy to blur, e.g., (a {100,105}) {3} or (a {2,})?
                // [but things like (a {2,})+ are not too lumpy...]
                bool optionalOverMultiple = current.M == 0 && inner.M > 1;
                bool rangeTooNarrow       = inner.N < inner.M * 2;

                if (optionalOverMultiple || rangeTooNarrow)
                {
                    break;
                }

                current = inner;

                // Multiply each bound into the child, saturating at int.MaxValue rather
                // than letting the product overflow.
                if (current.M > 0)
                {
                    lower     = (int.MaxValue - 1) / current.M < lower ? int.MaxValue : current.M * lower;
                    current.M = lower;
                }

                if (current.N > 0)
                {
                    upper     = (int.MaxValue - 1) / current.N < upper ? int.MaxValue : current.N * upper;
                    current.N = upper;
                }
            }

            if (lower == int.MaxValue)
            {
                return new RegexNode(Nothing, Options);
            }

            return current;
        }
Esempio n. 2
0
        /// <summary>
        /// Looks for a literal string that the pattern must begin with. The search
        /// starts at the root, descends through Greedy and Capture nodes, steps
        /// through the children of a Concatenate node past the anchor, boundary,
        /// Empty, Require and Prevent nodes, and stops at the first One, Multi,
        /// Oneloop or Onelazy node. Any other node type makes it give up.
        /// </summary>
        /// <param name="tree">The tree whose root is examined.</param>
        /// <returns>
        /// The prefix together with its case-insensitivity flag, or
        /// RegexPrefix.Empty when no prefix was found.
        /// </returns>
        public static RegexPrefix Prefix(RegexTree tree)
        {
            // Exclusive upper bound on the repeat count that gets expanded into a string.
            // In release, cut off at a length to which we can still reasonably construct
            // a string; in debug, use a smaller cutoff to exercise the cutoff path in tests.
            #if DEBUG
            const int MaxExpandedRepeat = 50;
            #else
            const int MaxExpandedRepeat = 1_000_000;
            #endif

            RegexNode node     = tree.Root;
            RegexNode sequence = null;   // concatenation currently being stepped through, if any
            int       position = 0;      // index of the next child of 'sequence' to visit

            while (true)
            {
                switch (node.NType)
                {
                    case RegexNode.Concatenate:
                        if (node.ChildCount() > 0)
                        {
                            sequence = node;
                            position = 0;
                        }
                        break;

                    case RegexNode.Greedy:
                    case RegexNode.Capture:
                        // Look inside the group; any concatenation being stepped through is dropped.
                        node     = node.Child(0);
                        sequence = null;
                        continue;

                    case RegexNode.Oneloop:
                    case RegexNode.Onelazy:
                        if (node.M <= 0 || node.M >= MaxExpandedRepeat)
                        {
                            return RegexPrefix.Empty;
                        }

                        // Only the mandatory M repetitions are guaranteed to be present.
                        return new RegexPrefix(new string(node.Ch, node.M),
                                               (node.Options & RegexOptions.IgnoreCase) != 0);

                    case RegexNode.One:
                        return new RegexPrefix(node.Ch.ToString(),
                                               (node.Options & RegexOptions.IgnoreCase) != 0);

                    case RegexNode.Multi:
                        return new RegexPrefix(node.Str,
                                               (node.Options & RegexOptions.IgnoreCase) != 0);

                    case RegexNode.Bol:
                    case RegexNode.Eol:
                    case RegexNode.Boundary:
                    case RegexNode.ECMABoundary:
                    case RegexNode.Beginning:
                    case RegexNode.Start:
                    case RegexNode.EndZ:
                    case RegexNode.End:
                    case RegexNode.Empty:
                    case RegexNode.Require:
                    case RegexNode.Prevent:
                        // Stepped over: the search continues with the next sibling.
                        break;

                    default:
                        return RegexPrefix.Empty;
                }

                // Reached only from the cases that 'break': move on to the next child of
                // the current concatenation, or give up if there is none left.
                if (sequence != null && position < sequence.ChildCount())
                {
                    node = sequence.Child(position);
                    position++;
                    continue;
                }

                return RegexPrefix.Empty;
            }
        }
Esempio n. 3
0
        /// <summary>
        /// Entry point of the RegexCode generator. Walks the tree depth-first,
        /// calling EmitFragment before and after each child of an interior node
        /// and once at each leaf, then bundles the emitted code with the prefix
        /// and anchor information computed by RegexFCD.
        /// </summary>
        /// <param name="tree">The parsed tree to generate code for.</param>
        /// <returns>The RegexCode built from the emitted instructions.</returns>
        public RegexCode RegexCodeFromRegexTree(RegexTree tree)
        {
            // A sparse capnum mapping is only needed when some capture numbers are unused.
            int capsize;
            var capNumbers = tree.CapNumList;

            if (capNumbers != null && tree.CapTop != capNumbers.Length)
            {
                _caps   = tree.Caps;
                capsize = capNumbers.Length;

                for (int slot = 0; slot < capNumbers.Length; slot++)
                {
                    _caps[capNumbers[slot]] = slot;
                }
            }
            else
            {
                _caps   = null;
                capsize = tree.CapTop;
            }

            RegexNode node  = tree.Root;
            int       index = 0;

            // Leading branch; its target is patched once the whole tree has been emitted.
            Emit(RegexCode.Lazybranch, 0);

            while (true)
            {
                var children = node.Children;

                if (children != null && index < children.Count)
                {
                    // Interior node with a child still to visit: emit the "before" fragment,
                    // remember which child we are in, and descend into it.
                    EmitFragment(node.NType | BeforeChild, node, index);

                    _intStack.Push(index);
                    node  = children[index];
                    index = 0;
                    continue;
                }

                if (children == null)
                {
                    EmitFragment(node.NType, node, 0);
                }

                // Nothing left to return to: the walk is complete.
                if (_intStack.Count == 0)
                {
                    break;
                }

                // Return to the parent, emit its "after" fragment for the child just
                // finished, and move on to the next child.
                index = _intStack.Pop();
                node  = node.Next;

                EmitFragment(node.NType | AfterChild, node, index);
                index++;
            }

            PatchJump(0, _emitted.Count);
            Emit(RegexCode.Stop);

            RegexPrefix? fcPrefix    = RegexFCD.FirstChars(tree);
            RegexPrefix  prefix      = RegexFCD.Prefix(tree);
            bool         rightToLeft = (tree.Options & RegexOptions.RightToLeft) != 0;

            CultureInfo culture;
            if ((tree.Options & RegexOptions.CultureInvariant) != 0)
            {
                culture = CultureInfo.InvariantCulture;
            }
            else
            {
                culture = CultureInfo.CurrentCulture;
            }

            // A Boyer-Moore matcher is only built when there is a non-empty literal prefix.
            RegexBoyerMoore bmPrefix = prefix.Prefix.Length > 0
                ? new RegexBoyerMoore(prefix.Prefix, prefix.CaseInsensitive, rightToLeft, culture)
                : null;

            int   anchors = RegexFCD.Anchors(tree);
            int[] code    = _emitted.ToArray();

            return new RegexCode(code, _stringTable, _trackCount, _caps, capsize, bmPrefix, fcPrefix, anchors, rightToLeft);
        }
Esempio n. 4
0
        /// <summary>
        /// Performs the first-character (FC) computation for one step of a tree
        /// walk: leaves push a new RegexFC, and the steps after a child of a
        /// Concatenate, Alternate, Testref or Testgroup node fold that child's
        /// RegexFC into the one below it on the stack.
        /// </summary>
        /// <param name="NodeType">The node's type, optionally combined with BeforeChild or AfterChild.</param>
        /// <param name="node">The node being visited.</param>
        /// <param name="CurIndex">Index of the child concerned in the BeforeChild/AfterChild steps.</param>
        /// <exception cref="ArgumentException">NodeType is not handled by the switch.</exception>
        private void CalculateFC(int NodeType, RegexNode node, int CurIndex)
        {
            // Options are only consulted for node types up to Ref; presumably the
            // BeforeChild/AfterChild-tagged values compare greater and so skip this.
            bool readsOptions = NodeType <= RegexNode.Ref;
            bool ci           = readsOptions && (node.Options & RegexOptions.IgnoreCase) != 0;
            bool rtl          = readsOptions && (node.Options & RegexOptions.RightToLeft) != 0;

            switch (NodeType)
            {
                // Steps that leave the FC stack untouched.
                case RegexNode.Concatenate | BeforeChild:
                case RegexNode.Alternate | BeforeChild:
                case RegexNode.Testref | BeforeChild:
                case RegexNode.Loop | BeforeChild:
                case RegexNode.Lazyloop | BeforeChild:
                case RegexNode.Group | BeforeChild:
                case RegexNode.Group | AfterChild:
                case RegexNode.Capture | BeforeChild:
                case RegexNode.Capture | AfterChild:
                case RegexNode.Greedy | BeforeChild:
                case RegexNode.Greedy | AfterChild:
                case RegexNode.Require | AfterChild:
                case RegexNode.Prevent | AfterChild:
                    break;

                case RegexNode.Testgroup | BeforeChild:
                    if (CurIndex == 0)
                    {
                        SkipChild();
                    }
                    break;

                case RegexNode.Require | BeforeChild:
                case RegexNode.Prevent | BeforeChild:
                    SkipChild();
                    PushFC(new RegexFC(true));
                    break;

                case RegexNode.Concatenate | AfterChild:
                case RegexNode.Alternate | AfterChild:
                case RegexNode.Testref | AfterChild:
                case RegexNode.Testgroup | AfterChild:
                {
                    bool isConcatenation = NodeType == (RegexNode.Concatenate | AfterChild);
                    bool isTestgroup     = NodeType == (RegexNode.Testgroup | AfterChild);

                    // The first contributing child's FC stays on the stack as the running
                    // total and later children are folded into it. For Testgroup, SkipChild()
                    // was called for child 0, so folding starts after child 1.
                    bool fold = isTestgroup ? CurIndex > 1 : CurIndex != 0;

                    if (fold)
                    {
                        RegexFC latest = PopFC();
                        RegexFC total  = TopFC();

                        _failed = !total.AddFC(latest, isConcatenation);
                    }

                    // Once a concatenation's running total is no longer nullable, the
                    // remaining children are flagged to be skipped.
                    if (isConcatenation && !TopFC()._nullable)
                    {
                        _skipAllChildren = true;
                    }
                    break;
                }

                case RegexNode.Loop | AfterChild:
                case RegexNode.Lazyloop | AfterChild:
                    // A loop with a minimum of zero makes its body's FC nullable.
                    if (node.M == 0)
                    {
                        TopFC()._nullable = true;
                    }
                    break;

                case RegexNode.One:
                case RegexNode.Notone:
                    PushFC(new RegexFC(node.Ch, NodeType == RegexNode.Notone, false, ci));
                    break;

                case RegexNode.Oneloop:
                case RegexNode.Onelazy:
                case RegexNode.Notoneloop:
                case RegexNode.Notonelazy:
                {
                    bool negated = NodeType == RegexNode.Notoneloop || NodeType == RegexNode.Notonelazy;

                    PushFC(new RegexFC(node.Ch, negated, node.M == 0, ci));
                    break;
                }

                case RegexNode.Multi:
                {
                    string text = node.Str;

                    if (text.Length == 0)
                    {
                        PushFC(new RegexFC(true));
                    }
                    else
                    {
                        // Right-to-left nodes contribute their last character instead of the first.
                        char lead = rtl ? text[text.Length - 1] : text[0];

                        PushFC(new RegexFC(lead, false, false, ci));
                    }
                    break;
                }

                case RegexNode.Set:
                case RegexNode.Setloop:
                case RegexNode.Setlazy:
                {
                    // A plain Set always passes false; the set loops pass whether their minimum is zero.
                    bool nullable = NodeType != RegexNode.Set && node.M == 0;

                    PushFC(new RegexFC(node.Str, nullable, ci));
                    break;
                }

                case RegexNode.Ref:
                    PushFC(new RegexFC(RegexCharClass.AnyClass, true, false));
                    break;

                case RegexNode.Empty:
                case RegexNode.Nothing:
                case RegexNode.Bol:
                case RegexNode.Eol:
                case RegexNode.Boundary:
                case RegexNode.Nonboundary:
                case RegexNode.ECMABoundary:
                case RegexNode.NonECMABoundary:
                case RegexNode.Beginning:
                case RegexNode.Start:
                case RegexNode.EndZ:
                case RegexNode.End:
                    PushFC(new RegexFC(true));
                    break;

                default:
                    throw new ArgumentException($"Unexpected opcode in regular expression generation: {NodeType.ToString(CultureInfo.CurrentCulture)}.");
            }
        }
Esempio n. 5
0
        /// <summary>
        /// Emits the code for a single step of the tree walk: a leaf node, or the
        /// point just before or just after one child of an interior node. Jump
        /// targets that are not yet known are emitted as 0, their positions are
        /// saved on _intStack, and a later step patches them with PatchJump.
        /// </summary>
        /// <param name="nodetype">
        /// The node's type, combined with BeforeChild or AfterChild for the steps
        /// around a child of an interior node.
        /// </param>
        /// <param name="node">The node code is being emitted for.</param>
        /// <param name="curIndex">Index of the child concerned; the walker passes 0 for leaves.</param>
        /// <exception cref="ArgumentException">nodetype is not handled by the switch.</exception>
        private void EmitFragment(int nodetype, RegexNode node, int curIndex)
        {
            // Modifier bits OR-ed into the opcodes of the leaf cases below.
            int bits = 0;

            if (nodetype <= RegexNode.Ref)
            {
                if (node.UseOptionR())
                {
                    bits |= RegexCode.Rtl;
                }
                if ((node.Options & RegexOptions.IgnoreCase) != 0)
                {
                    bits |= RegexCode.Ci;
                }
            }

            switch (nodetype)
            {
            case RegexNode.Concatenate | BeforeChild:
            case RegexNode.Concatenate | AfterChild:
            case RegexNode.Empty:
                break;

            case RegexNode.Alternate | BeforeChild:
                // Every alternative except the last starts with a Lazybranch whose
                // target is filled in after that alternative has been emitted.
                if (curIndex < node.Children.Count - 1)
                {
                    _intStack.Push(_emitted.Count);
                    Emit(RegexCode.Lazybranch, 0);
                }
                break;

            case RegexNode.Alternate | AfterChild:
            {
                if (curIndex < node.Children.Count - 1)
                {
                    // End this alternative with a Goto (target patched after the last
                    // alternative) and point its Lazybranch just past that Goto.
                    int LBPos = _intStack.Pop();
                    _intStack.Push(_emitted.Count);
                    Emit(RegexCode.Goto, 0);
                    PatchJump(LBPos, _emitted.Count);
                }
                else
                {
                    // Last alternative: patch the Goto saved for each earlier
                    // alternative to the current end of the code.
                    int I;
                    for (I = 0; I < curIndex; I++)
                    {
                        PatchJump(_intStack.Pop(), _emitted.Count);
                    }
                }
                break;
            }

            case RegexNode.Testref | BeforeChild:
                switch (curIndex)
                {
                case 0:
                    Emit(RegexCode.Setjump);
                    _intStack.Push(_emitted.Count);
                    Emit(RegexCode.Lazybranch, 0);
                    Emit(RegexCode.Testref, MapCapnum(node.M));
                    Emit(RegexCode.Forejump);
                    break;
                }
                break;

            case RegexNode.Testref | AfterChild:
                switch (curIndex)
                {
                case 0:
                {
                    int Branchpos = _intStack.Pop();
                    _intStack.Push(_emitted.Count);
                    Emit(RegexCode.Goto, 0);
                    PatchJump(Branchpos, _emitted.Count);
                    Emit(RegexCode.Forejump);
                    // With a second child present, the Goto is patched after that child.
                    if (node.Children.Count > 1)
                    {
                        break;
                    }
                    // else fallthrough
                    goto case 1;
                }

                case 1:
                    PatchJump(_intStack.Pop(), _emitted.Count);
                    break;
                }
                break;

            case RegexNode.Testgroup | BeforeChild:
                switch (curIndex)
                {
                case 0:
                    Emit(RegexCode.Setjump);
                    Emit(RegexCode.Setmark);
                    _intStack.Push(_emitted.Count);
                    Emit(RegexCode.Lazybranch, 0);
                    break;
                }
                break;

            case RegexNode.Testgroup | AfterChild:
                switch (curIndex)
                {
                case 0:
                    Emit(RegexCode.Getmark);
                    Emit(RegexCode.Forejump);
                    break;

                case 1:
                    int Branchpos = _intStack.Pop();
                    _intStack.Push(_emitted.Count);
                    Emit(RegexCode.Goto, 0);
                    PatchJump(Branchpos, _emitted.Count);
                    Emit(RegexCode.Getmark);
                    Emit(RegexCode.Forejump);

                    // With a third child present, the Goto is patched after that child.
                    if (node.Children.Count > 2)
                    {
                        break;
                    }
                    // else fallthrough
                    goto case 2;

                case 2:
                    PatchJump(_intStack.Pop(), _emitted.Count);
                    break;
                }
                break;

            case RegexNode.Loop | BeforeChild:
            case RegexNode.Lazyloop | BeforeChild:

                // The counting opcodes are used when the loop has a finite maximum or a
                // minimum above 1; otherwise the mark opcodes are used.
                if (node.N < int.MaxValue || node.M > 1)
                {
                    Emit(node.M == 0 ? RegexCode.Nullcount : RegexCode.Setcount, node.M == 0 ? 0 : 1 - node.M);
                }
                else
                {
                    Emit(node.M == 0 ? RegexCode.Nullmark : RegexCode.Setmark);
                }

                // A minimum of zero gets a Goto ahead of the body; the AfterChild step
                // patches it to the loop's branch instruction.
                if (node.M == 0)
                {
                    _intStack.Push(_emitted.Count);
                    Emit(RegexCode.Goto, 0);
                }
                // Position of the start of the body, used as the branch target below.
                _intStack.Push(_emitted.Count);
                break;

            case RegexNode.Loop | AfterChild:
            case RegexNode.Lazyloop | AfterChild:
            {
                int StartJumpPos = _emitted.Count;
                // 0 for Loop, and the distance from Loop to Lazyloop otherwise; added to
                // the branch opcode (presumably selecting its lazy variant - confirm in RegexCode).
                int Lazy         = (nodetype - (RegexNode.Loop | AfterChild));

                if (node.N < int.MaxValue || node.M > 1)
                {
                    Emit(RegexCode.Branchcount + Lazy, _intStack.Pop(), node.N == int.MaxValue ? int.MaxValue : node.N - node.M);
                }
                else
                {
                    Emit(RegexCode.Branchmark + Lazy, _intStack.Pop());
                }

                if (node.M == 0)
                {
                    PatchJump(_intStack.Pop(), StartJumpPos);
                }
            }
            break;

            case RegexNode.Group | BeforeChild:
            case RegexNode.Group | AfterChild:
                break;

            case RegexNode.Capture | BeforeChild:
                Emit(RegexCode.Setmark);
                break;

            case RegexNode.Capture | AfterChild:
                Emit(RegexCode.Capturemark, MapCapnum(node.M), MapCapnum(node.N));
                break;

            case RegexNode.Require | BeforeChild:
                // NOTE: the following line causes lookahead/lookbehind to be
                // NON-BACKTRACKING. It can be commented out with (*)
                Emit(RegexCode.Setjump);


                Emit(RegexCode.Setmark);
                break;

            case RegexNode.Require | AfterChild:
                Emit(RegexCode.Getmark);

                // NOTE: the following line causes lookahead/lookbehind to be
                // NON-BACKTRACKING. It can be commented out with (*)
                Emit(RegexCode.Forejump);

                break;

            case RegexNode.Prevent | BeforeChild:
                Emit(RegexCode.Setjump);
                _intStack.Push(_emitted.Count);
                Emit(RegexCode.Lazybranch, 0);
                break;

            case RegexNode.Prevent | AfterChild:
                Emit(RegexCode.Backjump);
                PatchJump(_intStack.Pop(), _emitted.Count);
                Emit(RegexCode.Forejump);
                break;

            case RegexNode.Greedy | BeforeChild:
                Emit(RegexCode.Setjump);
                break;

            case RegexNode.Greedy | AfterChild:
                Emit(RegexCode.Forejump);
                break;

            case RegexNode.One:
            case RegexNode.Notone:
                Emit(node.NType | bits, node.Ch);
                break;

            case RegexNode.Notoneloop:
            case RegexNode.Notonelazy:
            case RegexNode.Oneloop:
            case RegexNode.Onelazy:
                // The mandatory M repetitions are emitted as a fixed rep, followed by a
                // loop instruction for the remaining N - M when there are any.
                if (node.M > 0)
                {
                    Emit(((node.NType == RegexNode.Oneloop || node.NType == RegexNode.Onelazy) ?
                          RegexCode.Onerep : RegexCode.Notonerep) | bits, node.Ch, node.M);
                }
                if (node.N > node.M)
                {
                    Emit(node.NType | bits, node.Ch, node.N == int.MaxValue ?
                         int.MaxValue : node.N - node.M);
                }
                break;

            case RegexNode.Setloop:
            case RegexNode.Setlazy:
                // Same split as above: a fixed Setrep for the first M, then the loop
                // instruction for the remaining N - M.
                if (node.M > 0)
                {
                    Emit(RegexCode.Setrep | bits, StringCode(node.Str), node.M);
                }
                if (node.N > node.M)
                {
                    Emit(node.NType | bits, StringCode(node.Str),
                         (node.N == int.MaxValue) ? int.MaxValue : node.N - node.M);
                }
                break;

            case RegexNode.Multi:
                Emit(node.NType | bits, StringCode(node.Str));
                break;

            case RegexNode.Set:
                Emit(node.NType | bits, StringCode(node.Str));
                break;

            case RegexNode.Ref:
                Emit(node.NType | bits, MapCapnum(node.M));
                break;

            // These node types are emitted as-is, with no operands and no modifier bits.
            case RegexNode.Nothing:
            case RegexNode.Bol:
            case RegexNode.Eol:
            case RegexNode.Boundary:
            case RegexNode.Nonboundary:
            case RegexNode.ECMABoundary:
            case RegexNode.NonECMABoundary:
            case RegexNode.Beginning:
            case RegexNode.Start:
            case RegexNode.EndZ:
            case RegexNode.End:
                Emit(node.NType);
                break;

            default:
                throw new ArgumentException($"Unexpected opcode in regular expression generation: {nodetype.ToString()}.");
            }
        }