/// <summary>
/// Analyzes the pattern <c>a*(b*)c*</c> and checks, node by node, the kind and the
/// atomic / backtracking / capture flags passed to <c>AssertNode</c>.
/// </summary>
public void LoopsReducedWithAutoAtomic()
{
    (RegexTree tree, AnalysisResults analysis) = Analyze("a*(b*)c*");

    // Implicit root capture and the concatenation directly beneath it.
    RegexNode root = AssertNode(
        analysis, tree.Root, RegexNodeKind.Capture,
        atomicByAncestor: true, backtracks: false, captures: true);
    RegexNode sequence = AssertNode(
        analysis, root.Child(0), RegexNodeKind.Concatenate,
        atomicByAncestor: true, backtracks: false, captures: true);

    // Children of the concatenation, in order: a*, the bumpalong update, (b*), c*.
    AssertNode(
        analysis, sequence.Child(0), RegexNodeKind.Oneloopatomic,
        atomicByAncestor: false, backtracks: false, captures: false);
    AssertNode(
        analysis, sequence.Child(1), RegexNodeKind.UpdateBumpalong,
        atomicByAncestor: false, backtracks: false, captures: false);
    RegexNode innerCapture = AssertNode(
        analysis, sequence.Child(2), RegexNodeKind.Capture,
        atomicByAncestor: false, backtracks: false, captures: true);
    AssertNode(
        analysis, sequence.Child(3), RegexNodeKind.Oneloopatomic,
        atomicByAncestor: true, backtracks: false, captures: false);

    // b* inside the explicit capture group.
    AssertNode(
        analysis, innerCapture.Child(0), RegexNodeKind.Oneloopatomic,
        atomicByAncestor: false, backtracks: false, captures: false);
}
/// <summary>
/// Analyzes the literal pattern <c>abc</c> and checks the root capture and the single
/// <c>Multi</c> node beneath it.
/// </summary>
public void SimpleString()
{
    (RegexTree tree, AnalysisResults analysis) = Analyze("abc");

    // Implicit root capture.
    RegexNode root = AssertNode(
        analysis, tree.Root, RegexNodeKind.Capture,
        atomicByAncestor: true, backtracks: false, captures: true);

    // The whole literal is expected as one Multi node that captures nothing.
    AssertNode(
        analysis, root.Child(0), RegexNodeKind.Multi,
        atomicByAncestor: true, backtracks: false, captures: false);
}
/// <summary>
/// Converts <paramref name="node"/> into a <see cref="SymbolicFiniteAutomaton{TConstraint}"/> by
/// dispatching on the node's <c>_type</c> to the matching <c>ConvertNode*</c> helper.
/// </summary>
/// <param name="node">The regex tree node to convert.</param>
/// <param name="minStateId">Forwarded unchanged to the selected converter.</param>
/// <param name="isStart">Forwarded unchanged to the selected converter.</param>
/// <param name="isEnd">Forwarded unchanged to the selected converter.</param>
/// <returns>The automaton produced by the converter selected for the node type.</returns>
/// <exception cref="RexException">
/// Thrown with <c>RexException.NotSupported</c> for the constructs listed in the unsupported arm,
/// and with <c>RexException.UnrecognizedRegex</c> for any other node type.
/// </exception>
private SymbolicFiniteAutomaton<TConstraint> ConvertNode(RegexNode node, int minStateId, bool isStart, bool isEnd)
{
    return node._type switch
    {
        // Single characters, sets and literal strings.
        RegexNode.One => ConvertNodeOne(node, minStateId, isStart, isEnd),
        RegexNode.Notone => ConvertNodeNotone(node, minStateId, isStart, isEnd),
        RegexNode.Set => ConvertNodeSet(node, minStateId, isStart, isEnd),
        RegexNode.Multi => ConvertNodeMulti(node, minStateId, isStart, isEnd),

        // Greedy loops.
        RegexNode.Oneloop => ConvertNodeOneloop(node, minStateId, isStart, isEnd),
        RegexNode.Notoneloop => ConvertNodeNotoneloop(node, minStateId, isStart, isEnd),
        RegexNode.Setloop => ConvertNodeSetloop(node, minStateId, isStart, isEnd),
        RegexNode.Loop => ConvertNodeLoop(node, minStateId, isStart, isEnd),

        // Anchors and the empty node: these converters do not take the node itself.
        RegexNode.Bol => ConvertNodeBol(minStateId, isStart, isEnd),
        RegexNode.Eol => ConvertNodeEol(minStateId, isStart, isEnd),
        RegexNode.Beginning => ConvertNodeBeginning(minStateId, isStart, isEnd),
        RegexNode.EndZ => ConvertNodeEndZ(minStateId, isStart, isEnd),
        RegexNode.End => ConvertNodeEnd(minStateId, isStart, isEnd),
        RegexNode.Empty => ConvertNodeEmpty(minStateId, isStart, isEnd),

        // Composite nodes.
        RegexNode.Alternate => ConvertNodeAlternate(node, minStateId, isStart, isEnd),
        RegexNode.Concatenate => ConvertNodeConcatenate(node, minStateId, isStart, isEnd),

        // A capture is converted as its first child; the capture itself is not represented.
        RegexNode.Capture => ConvertNode(node.Child(0), minStateId, isStart, isEnd),

        // Constructs this converter explicitly rejects.
        RegexNode.Onelazy or RegexNode.Notonelazy or RegexNode.Setlazy or RegexNode.Lazyloop
            or RegexNode.Ref
            or RegexNode.Boundary or RegexNode.Nonboundary
            or RegexNode.ECMABoundary or RegexNode.NonECMABoundary
            or RegexNode.Start
            or RegexNode.Nothing
            or RegexNode.Group or RegexNode.Require or RegexNode.Prevent or RegexNode.Greedy
            or RegexNode.Testref or RegexNode.Testgroup
            => throw new RexException(RexException.NotSupported),

        // Anything else is not a node type this converter knows about.
        _ => throw new RexException(RexException.UnrecognizedRegex),
    };
}
/// <summary>
/// Dispatches <paramref name="node"/> to the <c>ConvertNode*</c> helper that matches its
/// <c>_type</c>.
/// </summary>
/// <param name="node">The regex tree node to convert.</param>
/// <exception cref="AutomataException">
/// Thrown with <c>AutomataExceptionKind.RegexConstructNotSupported</c> for every node type that
/// has no case below.
/// </exception>
private void ConvertNode(RegexNode node)
{
    switch (node._type)
    {
        // Single characters, sets and literal strings.
        case RegexNode.One:
            ConvertNodeOne(node);
            break;
        case RegexNode.Notone:
            ConvertNodeNotone(node);
            break;
        case RegexNode.Set:
            ConvertNodeSet(node);
            break;
        case RegexNode.Multi:
            ConvertNodeMulti(node);
            break;

        // Loops.
        case RegexNode.Oneloop:
            ConvertNodeOneloop(node);
            break;
        case RegexNode.Notoneloop:
            ConvertNodeNotoneloop(node);
            break;
        case RegexNode.Setloop:
            ConvertNodeSetloop(node);
            break;
        case RegexNode.Loop:
            ConvertNodeLoop(node);
            break;

        // Anchors and the empty node.
        case RegexNode.Beginning:
            ConvertNodeBeginning(node);
            break;
        case RegexNode.Bol:
            ConvertNodeBol(node);
            break;
        case RegexNode.Eol:
            ConvertNodeEol(node);
            break;
        case RegexNode.End:
            ConvertNodeEnd(node);
            break;
        case RegexNode.EndZ:
            ConvertNodeEndZ(node);
            break;
        case RegexNode.Empty:
            ConvertNodeEmpty(node);
            break;

        // Composite nodes.
        case RegexNode.Alternate:
            ConvertNodeAlternate(node);
            break;
        case RegexNode.Concatenate:
            ConvertNodeConcatenate(node);
            break;
        case RegexNode.Capture:
            // A capture "(...)" is converted as its first child.
            ConvertNode(node.Child(0));
            break;

        default:
            throw new AutomataException(AutomataExceptionKind.RegexConstructNotSupported);
    }
}
/// <summary>
/// Analyzes <c>[ab]*(?&gt;[bc]*[cd])[ef]</c> and checks, node by node, the kind and the
/// atomic / backtracking / capture / in-loop flags passed to <c>AssertNode</c>.
/// </summary>
public void AtomicGroupAroundBacktracking()
{
    (RegexTree tree, AnalysisResults analysis) = Analyze("[ab]*(?>[bc]*[cd])[ef]");

    // Implicit root capture and the top-level concatenation.
    RegexNode root = AssertNode(
        analysis, tree.Root, RegexNodeKind.Capture,
        atomicByAncestor: true, backtracks: true, captures: true, inLoop: false);
    RegexNode topSequence = AssertNode(
        analysis, root.Child(0), RegexNodeKind.Concatenate,
        atomicByAncestor: true, backtracks: true, captures: false, inLoop: false);

    // Top-level children, in order: [ab]*, the bumpalong update, the atomic group, [ef].
    AssertNode(
        analysis, topSequence.Child(0), RegexNodeKind.Setloop,
        atomicByAncestor: false, backtracks: true, captures: false, inLoop: false);
    AssertNode(
        analysis, topSequence.Child(1), RegexNodeKind.UpdateBumpalong,
        atomicByAncestor: false, backtracks: false, captures: false, inLoop: false);
    RegexNode atomicGroup = AssertNode(
        analysis, topSequence.Child(2), RegexNodeKind.Atomic,
        atomicByAncestor: false, backtracks: false, captures: false, inLoop: false);
    AssertNode(
        analysis, topSequence.Child(3), RegexNodeKind.Set,
        atomicByAncestor: true, backtracks: false, captures: false, inLoop: false);

    // Contents of the atomic group: [bc]* followed by [cd].
    RegexNode innerSequence = AssertNode(
        analysis, atomicGroup.Child(0), RegexNodeKind.Concatenate,
        atomicByAncestor: true, backtracks: true, captures: false, inLoop: false);
    AssertNode(
        analysis, innerSequence.Child(0), RegexNodeKind.Setloop,
        atomicByAncestor: false, backtracks: true, captures: false, inLoop: false);
    AssertNode(
        analysis, innerSequence.Child(1), RegexNodeKind.Set,
        atomicByAncestor: true, backtracks: false, captures: false, inLoop: false);
}
/// <summary>
/// Analyzes <c>abc|d(e)f|(ghi)</c> and checks, node by node, the kind and the
/// atomic / backtracking / capture flags passed to <c>AssertNode</c>.
/// </summary>
public void AlternationWithCaptures()
{
    (RegexTree tree, AnalysisResults analysis) = Analyze("abc|d(e)f|(ghi)");

    // Implicit root capture, the atomic wrapper beneath it, and the alternation inside that.
    RegexNode root = AssertNode(
        analysis, tree.Root, RegexNodeKind.Capture,
        atomicByAncestor: true, backtracks: false, captures: true);
    RegexNode atomicWrapper = AssertNode(
        analysis, root.Child(0), RegexNodeKind.Atomic,
        atomicByAncestor: true, backtracks: false, captures: true);
    RegexNode branches = AssertNode(
        analysis, atomicWrapper.Child(0), RegexNodeKind.Alternate,
        atomicByAncestor: true, backtracks: false, captures: true);

    // The three branches: "abc", d(e)f and (ghi).
    AssertNode(
        analysis, branches.Child(0), RegexNodeKind.Multi,
        atomicByAncestor: true, backtracks: false, captures: false);
    RegexNode secondBranch = AssertNode(
        analysis, branches.Child(1), RegexNodeKind.Concatenate,
        atomicByAncestor: true, backtracks: false, captures: true);
    RegexNode thirdBranch = AssertNode(
        analysis, branches.Child(2), RegexNodeKind.Capture,
        atomicByAncestor: true, backtracks: false, captures: true);

    // Pieces of the second branch: d, (e), f.
    AssertNode(
        analysis, secondBranch.Child(0), RegexNodeKind.One,
        atomicByAncestor: false, backtracks: false, captures: false);
    RegexNode captureOfE = AssertNode(
        analysis, secondBranch.Child(1), RegexNodeKind.Capture,
        atomicByAncestor: false, backtracks: false, captures: true);
    AssertNode(
        analysis, secondBranch.Child(2), RegexNodeKind.One,
        atomicByAncestor: true, backtracks: false, captures: false);

    // Contents of the two explicit capture groups.
    AssertNode(
        analysis, captureOfE.Child(0), RegexNodeKind.One,
        atomicByAncestor: false, backtracks: false, captures: false);
    AssertNode(
        analysis, thirdBranch.Child(0), RegexNodeKind.Multi,
        atomicByAncestor: true, backtracks: false, captures: false);
}
/// <summary>
/// Analyzes <c>(abc|def)*(?:[ab]*[cd])+?d</c> and checks, node by node, the kind and the
/// atomic / backtracking / capture / in-loop flags passed to <c>AssertNode</c>.
/// </summary>
public void LoopsAroundVariousConstructs()
{
    (RegexTree tree, AnalysisResults analysis) = Analyze("(abc|def)*(?:[ab]*[cd])+?d");

    // Implicit root capture and the top-level concatenation.
    RegexNode root = AssertNode(
        analysis, tree.Root, RegexNodeKind.Capture,
        atomicByAncestor: true, backtracks: true, captures: true, inLoop: false);
    RegexNode topSequence = AssertNode(
        analysis, root.Child(0), RegexNodeKind.Concatenate,
        atomicByAncestor: true, backtracks: true, captures: true, inLoop: false);

    // (abc|def)* : the loop, the capture it repeats, and the alternation with its two literals.
    RegexNode greedyLoop = AssertNode(
        analysis, topSequence.Child(0), RegexNodeKind.Loop,
        atomicByAncestor: false, backtracks: true, captures: true, inLoop: false);
    RegexNode repeatedCapture = AssertNode(
        analysis, greedyLoop.Child(0), RegexNodeKind.Capture,
        atomicByAncestor: false, backtracks: true, captures: true, inLoop: true);
    RegexNode branches = AssertNode(
        analysis, repeatedCapture.Child(0), RegexNodeKind.Alternate,
        atomicByAncestor: false, backtracks: true, captures: false, inLoop: true);
    AssertNode(
        analysis, branches.Child(0), RegexNodeKind.Multi,
        atomicByAncestor: false, backtracks: false, captures: false, inLoop: true);
    AssertNode(
        analysis, branches.Child(1), RegexNodeKind.Multi,
        atomicByAncestor: false, backtracks: false, captures: false, inLoop: true);

    // (?:[ab]*[cd])+? : the lazy loop and the concatenation it repeats.
    RegexNode lazyRepeat = AssertNode(
        analysis, topSequence.Child(1), RegexNodeKind.Lazyloop,
        atomicByAncestor: false, backtracks: true, captures: false, inLoop: false);
    RegexNode repeatedSequence = AssertNode(
        analysis, lazyRepeat.Child(0), RegexNodeKind.Concatenate,
        atomicByAncestor: false, backtracks: false, captures: false, inLoop: true);
    AssertNode(
        analysis, repeatedSequence.Child(0), RegexNodeKind.Setloopatomic,
        atomicByAncestor: false, backtracks: false, captures: false, inLoop: true);
    AssertNode(
        analysis, repeatedSequence.Child(1), RegexNodeKind.Set,
        atomicByAncestor: false, backtracks: false, captures: false, inLoop: true);

    // Trailing literal d.
    AssertNode(
        analysis, topSequence.Child(2), RegexNodeKind.One,
        atomicByAncestor: true, backtracks: false, captures: false, inLoop: false);
}
/// <summary>
/// Recursively converts <paramref name="node"/> into a <see cref="SymbolicRegexNode{S}"/>,
/// selecting the conversion by the node's <c>_type</c>.
/// </summary>
/// <param name="node">The regex tree node to convert.</param>
/// <param name="topLevel">
/// Passed through to alternation branches, captures, concatenations and multi nodes; children of
/// concatenations, loops and conditionals are always converted with <c>false</c>.
/// </param>
/// <returns>The symbolic regex node built for <paramref name="node"/>.</returns>
/// <exception cref="AutomataException">
/// Thrown for the constructs listed at the end of the switch and for any unknown node type.
/// </exception>
internal SymbolicRegexNode<S> ConvertNodeToSymbolicRegex(RegexNode node, bool topLevel)
{
    switch (node._type)
    {
        // ----- Composite nodes -----
        case RegexNode.Alternate:
        {
            var branches = Array.ConvertAll(node._children.ToArray(), x => ConvertNodeToSymbolicRegex(x, topLevel));
            return srBuilder.MkOr(branches);
        }
        case RegexNode.Concatenate:
        {
            var parts = Array.ConvertAll(node._children.ToArray(), x => ConvertNodeToSymbolicRegex(x, false));
            return srBuilder.MkConcat(parts, topLevel);
        }
        case RegexNode.Capture:
            // Parenthesis (...): converted as its first child.
            return ConvertNodeToSymbolicRegex(node.Child(0), topLevel);
        case RegexNode.Testgroup:
        {
            // Children are converted in order 0, 1, 2 before being combined.
            var first = ConvertNodeToSymbolicRegex(node._children[0], false);
            var second = ConvertNodeToSymbolicRegex(node._children[1], false);
            var third = ConvertNodeToSymbolicRegex(node._children[2], false);
            return MkIfThenElse(first, second, third);
        }

        // ----- Anchors and the empty node -----
        case RegexNode.Beginning:
            return srBuilder.startAnchor;
        case RegexNode.Bol:
            return srBuilder.bolAnchor;
        case RegexNode.Eol:
            return srBuilder.eolAnchor;
        case RegexNode.End:
        case RegexNode.EndZ:
            return srBuilder.endAnchor;
        case RegexNode.Empty:
            return srBuilder.epsilon;

        // ----- Loops: the greedy and lazy variants differ only in the flag passed on -----
        case RegexNode.Loop:
        case RegexNode.Lazyloop:
        {
            bool lazy = node._type == RegexNode.Lazyloop;
            var body = ConvertNodeToSymbolicRegex(node._children[0], false);
            return srBuilder.MkLoop(body, lazy, node._m, node._n);
        }
        case RegexNode.Oneloop:
        case RegexNode.Onelazy:
            return ConvertNodeOneloopToSymbolicRegex(node, node._type == RegexNode.Onelazy);
        case RegexNode.Notoneloop:
        case RegexNode.Notonelazy:
            return ConvertNodeNotoneloopToSymbolicRegex(node, node._type == RegexNode.Notonelazy);
        case RegexNode.Setloop:
        case RegexNode.Setlazy:
            return ConvertNodeSetloopToSymbolicRegex(node, node._type == RegexNode.Setlazy);

        // ----- Single characters, sets and literal strings -----
        case RegexNode.One:
            return ConvertNodeOneToSymbolicRegex(node);
        case RegexNode.Notone:
            return ConvertNodeNotoneToSymbolicRegex(node);
        case RegexNode.Set:
            return ConvertNodeSetToSymbolicRegex(node);
        case RegexNode.Multi:
            return ConvertNodeMultiToSymbolicRegex(node, topLevel);

        // ----- Constructs reported as not implemented -----
        case RegexNode.Boundary:
        case RegexNode.ECMABoundary:
            throw new AutomataException(@"Not implemented: word-boundary \b");
        case RegexNode.Nonboundary:
        case RegexNode.NonECMABoundary:
            throw new AutomataException(@"Not implemented: non-word-boundary \B");
        case RegexNode.Nothing:
            throw new AutomataException(@"Not implemented: Nothing");
        case RegexNode.Greedy:
            throw new AutomataException("Not implemented: greedy constructs (?>) (?<)");
        case RegexNode.Start:
            throw new AutomataException(@"Not implemented: \G");

        // ----- Constructs reported as not supported -----
        case RegexNode.Group:
            throw new AutomataException("Not supported: grouping (?:)");
        case RegexNode.Prevent:
            throw new AutomataException("Not supported: prevent constructs (?!) (?<!)");
        case RegexNode.Require:
            throw new AutomataException("Not supported: require constructs (?=) (?<=)");
        case RegexNode.Testref:
            throw new AutomataException("Not supported: test construct (?(n) | )");
        case RegexNode.Ref:
            throw new AutomataException(@"Not supported: references \1");

        default:
            throw new AutomataException(@"Unexpected regex construct");
    }
}
/// <summary>
/// Converts <paramref name="node"/> into an <see cref="Automaton{S}"/>, selecting the conversion
/// by the node's <c>_type</c>.
/// </summary>
/// <remarks>
/// Lazy loops are converted exactly like their greedy counterparts. The lazy single-character
/// loops (<c>Onelazy</c>, <c>Notonelazy</c>, <c>Setlazy</c>) go to the loop converters, matching
/// how <c>Lazyloop</c> shares the <c>Loop</c> case; sending them to the single-character
/// converters would discard the repetition.
/// </remarks>
/// <param name="node">The regex tree node to convert.</param>
/// <returns>The automaton built for <paramref name="node"/>.</returns>
/// <exception cref="AutomataException">
/// Thrown for the unsupported constructs listed at the end of the switch, and with
/// <c>AutomataExceptionKind.UnrecognizedRegex</c> for any unknown node type.
/// </exception>
internal Automaton<S> ConvertNode(RegexNode node)
{
    switch (node._type)
    {
        // ----- Composite nodes -----
        case RegexNode.Alternate:
            return this.automBuilder.MkUnion(node._children.ToArray());
        case RegexNode.Concatenate:
            return this.automBuilder.MkConcatenate(node._children.ToArray());
        case RegexNode.Capture:
            // Parenthesis (...): converted as its first child.
            return ConvertNode(node.Child(0));

        // ----- Anchors, the empty word and the empty language -----
        case RegexNode.Beginning:
            return this.automBuilder.MkBeginning();
        case RegexNode.Bol:
            return this.automBuilder.MkBol(solver.MkCharConstraint('\n'));
        case RegexNode.Eol:
            return this.automBuilder.MkEol(solver.MkCharConstraint('\n'));
        case RegexNode.End:
        case RegexNode.EndZ:
            return this.automBuilder.MkEnd();
        case RegexNode.Empty:
            return this.automBuilder.MkEmptyWord();
        case RegexNode.Nothing:
            return automBuilder.MkEmptyAutomaton();
        case RegexNode.ECMABoundary:
        case RegexNode.Boundary:
            return automBuilder.MkWordBoundary();

        // ----- Loops (lazy variants share the greedy conversion) -----
        case RegexNode.Lazyloop:
        case RegexNode.Loop:
            return automBuilder.MkLoop(node._children[0], node._m, node._n);
        // NOTE(review): assumes the *loop converters below depend only on the node's character/set
        // and its bounds, not on the exact loop node type - confirm against their implementations.
        case RegexNode.Onelazy:
        case RegexNode.Oneloop:
            return ConvertNodeOneloop(node);
        case RegexNode.Notonelazy:
        case RegexNode.Notoneloop:
            return ConvertNodeNotoneloop(node);
        case RegexNode.Setlazy:
        case RegexNode.Setloop:
            return ConvertNodeSetloop(node);

        // ----- Single characters, sets and literal strings -----
        case RegexNode.One:
            return ConvertNodeOne(node);
        case RegexNode.Notone:
            return ConvertNodeNotone(node);
        case RegexNode.Set:
            return ConvertNodeSet(node);
        case RegexNode.Multi:
            return ConvertNodeMulti(node);

        // ----- Unsupported constructs -----
        case RegexNode.Nonboundary:
        case RegexNode.NonECMABoundary:
            throw new AutomataException(@"Regex construct not supported: \B");
        case RegexNode.Greedy:
            throw new AutomataException("Regex construct not supported: greedy constructs (?>) (?<)");
        case RegexNode.Group:
            throw new AutomataException("Regex construct not supported: grouping (?:)");
        case RegexNode.Prevent:
            throw new AutomataException("Regex construct not supported: prevent constructs (?!) (?<!)");
        case RegexNode.Require:
            throw new AutomataException("Regex construct not supported: require constructs (?=) (?<=)");
        case RegexNode.Testgroup:
            throw new AutomataException("Regex construct not supported: test construct (?(...) | )");
        case RegexNode.Testref:
            throw new AutomataException("Regex construct not supported: test cosntruct (?(n) | )");
        case RegexNode.Ref:
            throw new AutomataException(@"Regex construct not supported: references \1");
        case RegexNode.Start:
            throw new AutomataException(@"Regex construct not supported: \G");

        default:
            throw new AutomataException(AutomataExceptionKind.UnrecognizedRegex);
    }
}
/// <summary>Converts the root <see cref="RegexNode"/> into its corresponding <see cref="SymbolicRegexNode{S}"/>.</summary> /// <param name="root">The root node to convert.</param> /// <returns>The generated <see cref="SymbolicRegexNode{S}"/> that corresponds to the supplied <paramref name="root"/>.</returns> internal SymbolicRegexNode <BDD> ConvertToSymbolicRegexNode(RegexNode root) { Debug.Assert(_builder is not null); // Create the root list that will store the built-up result. DoublyLinkedList <SymbolicRegexNode <BDD> > rootResult = new(); // Create a stack to be processed in order to process iteratively rather than recursively, and push the root on. Stack <(RegexNode Node, bool TryToMarkFixedLength, DoublyLinkedList <SymbolicRegexNode <BDD> > Result, DoublyLinkedList <SymbolicRegexNode <BDD> >[]? ChildResults)> stack = new(); stack.Push((root, true, rootResult, CreateChildResultArray(root.ChildCount()))); // Continue to iterate until the stack is empty, popping the next item on each iteration. // Some popped items may be pushed back on as part of processing. while (stack.TryPop(out (RegexNode Node, bool TryToMarkFixedLength, DoublyLinkedList <SymbolicRegexNode <BDD> > Result, DoublyLinkedList <SymbolicRegexNode <BDD> >[]? ChildResults)popped)) { RegexNode node = popped.Node; DoublyLinkedList <SymbolicRegexNode <BDD> > result = popped.Result; DoublyLinkedList <SymbolicRegexNode <BDD> >[]? childResults = popped.ChildResults; Debug.Assert(childResults is null || childResults.Length != 0); if (childResults is null || childResults[0] is null) { // Child nodes have not been converted yet // Handle each node kind as-is appropriate. 
switch (node.Kind) { // Singletons and multis case RegexNodeKind.One: result.AddLast(_builder.CreateSingleton(_builder._solver.CreateFromChar(node.Ch))); break; case RegexNodeKind.Notone: result.AddLast(_builder.CreateSingleton(_builder._solver.Not(_builder._solver.CreateFromChar(node.Ch)))); break; case RegexNodeKind.Set: result.AddLast(ConvertSet(node)); break; case RegexNodeKind.Multi: { // Create a BDD for each character in the string and concatenate them. string?str = node.Str; Debug.Assert(str is not null); foreach (char c in str) { result.AddLast(_builder.CreateSingleton(_builder._solver.CreateFromChar(c))); } break; } // The following five cases are the only node kinds that are pushed twice: // Joins, general loops, and supported captures case RegexNodeKind.Concatenate: case RegexNodeKind.Alternate: case RegexNodeKind.Loop: case RegexNodeKind.Lazyloop: case RegexNodeKind.Capture when node.N == -1: // N == -1 because balancing groups (which have N >= 0) aren't supported { Debug.Assert(childResults is not null && childResults.Length == node.ChildCount()); // Push back the temporarily popped item. Next time this work item is seen, its ChildResults list will be ready. // Propagate the length mark check only in case of alternation. stack.Push(popped); bool mark = node.Kind == RegexNodeKind.Alternate && popped.TryToMarkFixedLength; // Push all the children to be converted for (int i = 0; i < node.ChildCount(); ++i) { childResults[i] = new DoublyLinkedList <SymbolicRegexNode <BDD> >(); stack.Push((node.Child(i), mark, childResults[i], CreateChildResultArray(node.Child(i).ChildCount()))); } break; } // Specialized loops case RegexNodeKind.Oneloop: case RegexNodeKind.Onelazy: case RegexNodeKind.Notoneloop: case RegexNodeKind.Notonelazy: { // Create a BDD that represents the character, then create a loop around it. 
BDD bdd = _builder._solver.CreateFromChar(node.Ch); if (node.IsNotoneFamily) { bdd = _builder._solver.Not(bdd); } result.AddLast(_builder.CreateLoop(_builder.CreateSingleton(bdd), node.Kind is RegexNodeKind.Onelazy or RegexNodeKind.Notonelazy, node.M, node.N)); break; } case RegexNodeKind.Setloop: case RegexNodeKind.Setlazy: { // Create a BDD that represents the set string, then create a loop around it. string?set = node.Str; Debug.Assert(set is not null); BDD setBdd = CreateBDDFromSetString(set); result.AddLast(_builder.CreateLoop(_builder.CreateSingleton(setBdd), node.Kind == RegexNodeKind.Setlazy, node.M, node.N)); break; } case RegexNodeKind.Empty: case RegexNodeKind.UpdateBumpalong: // UpdateBumpalong is a directive relevant only to backtracking and can be ignored just like Empty break; case RegexNodeKind.Nothing: result.AddLast(_builder._nothing); break; // Anchors case RegexNodeKind.Beginning: result.AddLast(_builder.BeginningAnchor); break; case RegexNodeKind.Bol: EnsureNewlinePredicateInitialized(); result.AddLast(_builder.BolAnchor); break; case RegexNodeKind.End: // \z anchor result.AddLast(_builder.EndAnchor); break; case RegexNodeKind.EndZ: // \Z anchor EnsureNewlinePredicateInitialized(); result.AddLast(_builder.EndAnchorZ); break; case RegexNodeKind.Eol: EnsureNewlinePredicateInitialized(); result.AddLast(_builder.EolAnchor); break; case RegexNodeKind.Boundary: EnsureWordLetterPredicateInitialized(); result.AddLast(_builder.BoundaryAnchor); break; case RegexNodeKind.NonBoundary: EnsureWordLetterPredicateInitialized(); result.AddLast(_builder.NonBoundaryAnchor); break; // Unsupported default: throw new NotSupportedException(SR.Format(SR.NotSupported_NonBacktrackingConflictingExpression, node.Kind switch { RegexNodeKind.Atomic or RegexNodeKind.Setloopatomic or RegexNodeKind.Oneloopatomic or RegexNodeKind.Notoneloopatomic => SR.ExpressionDescription_AtomicSubexpressions, RegexNodeKind.Backreference => SR.ExpressionDescription_Backreference, 
RegexNodeKind.BackreferenceConditional => SR.ExpressionDescription_Conditional, RegexNodeKind.Capture => SR.ExpressionDescription_BalancingGroup, RegexNodeKind.ExpressionConditional => SR.ExpressionDescription_IfThenElse, RegexNodeKind.NegativeLookaround => SR.ExpressionDescription_NegativeLookaround, RegexNodeKind.PositiveLookaround => SR.ExpressionDescription_PositiveLookaround, RegexNodeKind.Start => SR.ExpressionDescription_ContiguousMatches, _ => UnexpectedNodeType(node) }));