/// <summary>
/// Scratch routine: builds a few state machines, renders the NFA and DFA of a test
/// pattern to image files and prints the expression recovered from the machine.
/// </summary>
static void _RunFromFA()
{
    // parse a sample expression, convert it and flag the returned state as accepting
    var parsed = RegexExpression.Parse("(0*11*0[01]*)");
    var sample = parsed.ToFA<string>();
    sample.IsAccepting = true;

    // hand-wire a tiny machine: first --a--> middle, middle --b--> middle, middle --c--> last
    // ("last" is constructed with `true` - presumably that marks it accepting; confirm against CharFA)
    var first = new CharFA<string>();
    var middle = new CharFA<string>();
    var last = new CharFA<string>(true);
    first.InputTransitions.Add('a', middle);
    middle.InputTransitions.Add('b', middle);
    middle.InputTransitions.Add('c', last);

    // the pattern actually rendered below; alternatives that were tried:
    //   "[A-Z_a-z][0-9A-Z_a-z]*"
    //   "ab*c"
    var pattern = "(foo)*bar";
    var nfa = RegexExpression.Parse(pattern).ToFA<string>();
    nfa.RenderToFile(@"..\..\..\test_expr_nfa.jpg");
    nfa.ToDfa().RenderToFile(@"..\..\..\test_expr.jpg");

    // go back from the machine to an expression and show it
    parsed = RegexExpression.FromFA(nfa);
    Console.WriteLine(parsed);

    nfa.RenderToFile(@"..\..\..\test_nfa.jpg");
    nfa.ToDfa().RenderToFile(@"..\..\..\test.jpg");
}
/// <summary>
/// Stress test: ORs together one literal per integer in a fixed range, prints the
/// state count before and after <c>Reduce</c>, then renders the reduced machine.
/// </summary>
static void _RunStress2()
{
    const int first = 255;
    const int last = 511;
    CharFA<string> machine = null;

    Console.Write("Building NFA matching integer values {0}-{1} ", first, last);
    for (var value = first; value <= last; ++value)
    {
        CharFA<string> literal = CharFA<string>.Literal(value.ToString());
        machine = (null == machine)
            ? literal
            : CharFA<string>.Or(new CharFA<string>[] { machine, literal });

        // progress marker every 12th value.
        // Swapping the Console.Write below for
        //     machine = machine.Reduce(new _ConsoleProgress());
        // (i.e. reducing every 12 iterations) is MUCH faster.
        if (0 == (value % 12))
        {
            Console.Write('.');
        }
    }
    Console.WriteLine();

    Console.WriteLine("C# integer NFA has {0} states.", machine.FillClosure().Count);

    machine = machine.Reduce(new _ConsoleProgress());
    Console.WriteLine();
    Console.WriteLine("C# integer DFA has {0} states.", machine.FillClosure().Count);

    Console.WriteLine("Rendering stress2.jpg");
    machine.RenderToFile(@"..\..\..\stress2.jpg");
}
/// <summary>
/// Builds the state machine for this two-operand expression by concatenating the
/// machines of <c>Left</c> and <c>Right</c>.
/// </summary>
/// <param name="parent">The owning document, used to look up the accepting symbol; may be null.</param>
/// <param name="cfg">Passed through unchanged to the operands' <c>ToFA</c> calls.</param>
/// <returns>
/// Null when both operands are missing; the single operand's machine (with its first
/// accepting state's symbol set) when only one is present; otherwise the concatenation.
/// </returns>
public override CharFA ToFA(EbnfDocument parent, Cfg cfg)
{
    // the accepting symbol is the empty string when there is no parent document
    string sym = (null != parent) ? parent.GetContainingIdForExpression(this) : "";

    if (null == Left || null == Right)
    {
        // at most one operand is present - pick whichever it is
        var only = (null == Right) ? Left : Right;
        if (null == only)
        {
            return null;
        }
        var single = only.ToFA(parent, cfg);
        single.FirstAcceptingState.AcceptingSymbol = sym;
        return single;
    }

    var parts = new CharFA[] { Left.ToFA(parent, cfg), Right.ToFA(parent, cfg) };
    return CharFA.Concat(parts, sym);
}
/// <summary>
/// Creates a tokenizer from a grammar, a lexer state machine and the characters to tokenize.
/// </summary>
/// <param name="cfg">Stored in <c>_cfg</c>.</param>
/// <param name="lexer">Stored in <c>_lexer</c>.</param>
/// <param name="input">Stored in <c>_input</c>.</param>
public DebugTokenizer(Cfg cfg, CharFA lexer, IEnumerable<char> input)
{
    this._input = input;
    this._lexer = lexer;
    this._cfg = cfg;

    // the blockEnd attribute is what lets the lexer handle things like
    // block comments and XML CDATA sections, so gather the attributes now
    this._PopulateAttrs();
}
/// <summary>
/// Builds the state machine for this expression by wrapping the inner expression's
/// machine with <c>CharFA.Optional</c>.
/// </summary>
/// <param name="parent">The owning document, used to look up the accepting symbol; may be null.</param>
/// <param name="cfg">Passed through unchanged to the inner expression's <c>ToFA</c>.</param>
/// <returns>Null when there is no inner expression; otherwise the optional machine.</returns>
public override CharFA ToFA(EbnfDocument parent, Cfg cfg)
{
    if (null != Expression)
    {
        var inner = Expression.ToFA(parent, cfg);
        // without a parent document the accepting symbol is the empty string
        var sym = (null == parent) ? "" : parent.GetContainingIdForExpression(this);
        return CharFA.Optional(inner, sym);
    }
    return null;
}
/// <summary>
/// Demonstrates code generation: builds a three-symbol lexer, converts it to a DFA
/// state table and writes a generated C# class (symbol table, lex method and DFA
/// table) to the console.
/// </summary>
static void _RunCompiledLexCodeGen()
{
    // index of "#ERROR" in the symbol table below; keep the two in sync
    const int errorSymbolId = 3;

    // create our expressions
    var digits = CharFA<string>.Repeat(
        CharFA<string>.Set("0123456789"),
        1, -1,
        "Digits");
    var word = CharFA<string>.Repeat(
        CharFA<string>.Set(new CharRange[] { new CharRange('A', 'Z'), new CharRange('a', 'z') }),
        1, -1,
        "Word");
    var whitespace = CharFA<string>.Repeat(
        CharFA<string>.Set(" \t\r\n\v\f"),
        1, -1,
        "Whitespace");

    // initialize our lexer
    var lexer = CharFA<string>.ToLexer(digits, word, whitespace);

    // create the symbol table (include the error symbol at index/id 3)
    var symbolTable = new string[] { "Digits", "Word", "Whitespace", "#ERROR" };

    // create the DFA table we'll use to generate code
    var dfaTable = lexer.ToDfaStateTable(symbolTable);

    // create our new class
    var compClass = new CodeTypeDeclaration("RegexGenerated");
    compClass.TypeAttributes = System.Reflection.TypeAttributes.Class;
    compClass.Attributes = MemberAttributes.Final | MemberAttributes.Static;

    // add the symbol table field - in production we'll set the name
    // to something more appropriate
    var symtblField = new CodeMemberField(typeof(string[]), "LexSymbols");
    symtblField.Attributes = MemberAttributes.Static | MemberAttributes.Public;
    // generate the symbol table init code
    symtblField.InitExpression = CharFA<string>.GenerateSymbolTableInitializer(symbolTable);
    compClass.Members.Add(symtblField);

    // Generate and add the compiled lex method code
    // (in production we'd change the name of the returned method)
    compClass.Members.Add(CharFA<string>.GenerateLexMethod(dfaTable, errorSymbolId));

    // add the DFA table field - in production we'd change the name
    var dfatblField = new CodeMemberField(typeof(CharDfaEntry[]), "LexDfaTable");
    dfatblField.Attributes = MemberAttributes.Static | MemberAttributes.Public;
    // generate the DFA state table init code
    dfatblField.InitExpression = CharFA<string>.GenerateDfaStateTableInitializer(dfaTable);
    compClass.Members.Add(dfatblField);

    // create the C# provider and generate the code.
    // we'll usually want to put this in a namespace but we haven't here.
    // CodeDomProvider is disposable, so release it once the code has been written.
    using (var prov = CodeDomProvider.CreateProvider("cs"))
    {
        prov.GenerateCodeFromType(compClass, Console.Out, new CodeGeneratorOptions());
    }
}
/// <summary>
/// Parses a sample pattern, converts it to a DFA, clones the path between two states
/// of its closure, renders that subset and prints <c>IsLiteral</c> for both machines.
/// </summary>
/// <param name="args">Unused.</param>
static void Main(string[] args)
{
    var dfa = CharFA.Parse("fu(ba+r|baz)", "woo!").ToDfa();
    var states = dfa.FillClosure();

    // clone the path from closure entry 0 to closure entry 5
    var from = states[0];
    var to = states[5];
    var subset = from.ClonePathTo(to);
    subset.RenderToFile(@"..\..\..\fa.jpg");

    Console.WriteLine(dfa.IsLiteral);
    Console.WriteLine(subset.IsLiteral);
}
/// <summary>
/// Creates a token enumerator over <paramref name="string"/> and resets its cursor state.
/// </summary>
/// <param name="lexer">The lexer state machine; its epsilon closure becomes the initial state set.</param>
/// <param name="symbolIds">Stored in <c>_symbolIds</c>.</param>
/// <param name="blockEnds">Stored in <c>_blockEnds</c>.</param>
/// <param name="string">The characters to read; an enumerator is obtained from it immediately.</param>
public DebugTokenEnumerator(
    CharFA lexer,
    IDictionary<string, int> symbolIds,
    IDictionary<string, string> blockEnds,
    IEnumerable<char> @string)
{
    // collaborators
    this._lexer = lexer;
    this._symbolIds = symbolIds;
    this._blockEnds = blockEnds;

    // input and capture buffer
    this._input = @string.GetEnumerator();
    this._buffer = new StringBuilder();

    // machine state: start from the lexer's epsilon closure, state -1
    this._initialStates = this._lexer.FillEpsilonClosure();
    this._state = -1;

    // cursor: line and column start at 1, position at 0
    this._line = 1;
    this._column = 1;
    this._position = 0;
}
/// <summary>
/// Builds the state machine for this expression by wrapping the inner expression's
/// machine with <c>CharFA.Repeat</c>, and additionally with <c>CharFA.Optional</c>
/// when <c>IsOptional</c> is set.
/// </summary>
/// <param name="parent">The owning document, used to look up the accepting symbol; may be null.</param>
/// <param name="cfg">Passed through unchanged to the inner expression's <c>ToFA</c>.</param>
/// <returns>Null when there is no inner expression; otherwise the repeating machine.</returns>
public override CharFA ToFA(EbnfDocument parent, Cfg cfg)
{
    if (null == Expression)
    {
        return null;
    }

    var inner = Expression.ToFA(parent, cfg);
    // without a parent document the accepting symbol is the empty string
    var sym = (null == parent) ? "" : parent.GetContainingIdForExpression(this);
    var repeated = CharFA.Repeat(inner, sym);

    return IsOptional ? CharFA.Optional(repeated) : repeated;
}
/// <summary>
/// Builds a simple lexer in which every terminal is matched by its own literal text.
/// The "#ERROR" and "#EOS" terminals are left out.
/// </summary>
/// <returns>The result of calling <c>ToDfa</c> on the combined machine.</returns>
public CharFA ToSimpleLexer()
{
    var lexer = new CharFA();
    foreach (var terminal in _EnumTerminals())
    {
        // the two reserved symbols never get a literal of their own
        if ("#ERROR" == terminal || "#EOS" == terminal)
        {
            continue;
        }
        // the terminal's text doubles as its accepting symbol
        lexer.EpsilonTransitions.Add(CharFA.Literal(terminal, terminal));
    }
    return lexer.ToDfa();
}
/// <summary>
/// Builds a lexer for the document: a fresh start state with one epsilon transition
/// to the state machine of every terminal production.
/// </summary>
/// <param name="cfg">Passed through unchanged to each expression's <c>ToFA</c>.</param>
/// <returns>The combined (unreduced) lexer machine.</returns>
public CharFA ToLexer(Cfg cfg)
{
    var result = new CharFA();
    foreach (var prod in Productions)
    {
        if (!prod.Value.IsTerminal)
        {
            continue;
        }

        // A terminal may have no expression, and ToFA itself returns null for nil
        // expressions. Neither case contributes anything to the lexer, so skip it
        // rather than dereferencing null or adding a null transition target.
        var expression = prod.Value.Expression;
        if (null == expression)
        {
            continue;
        }
        CharFA fa = expression.ToFA(this, cfg);
        if (null == fa)
        {
            continue;
        }

        result.EpsilonTransitions.Add(fa);
    }
    return result;
}
/// <summary>
/// Creates a step description; every argument is stored in the member of the same name.
/// </summary>
/// <param name="classFullName">Stored in <c>ClassFullName</c>.</param>
/// <param name="methodName">Stored in <c>MethodName</c>.</param>
/// <param name="stepKind">Stored in <c>StepKind</c>.</param>
/// <param name="pattern">Stored in <c>Pattern</c>.</param>
/// <param name="regex">Stored in <c>Regex</c>; may be null.</param>
/// <param name="regexForPartialMatch">Stored in <c>RegexForPartialMatch</c>; may be null.</param>
/// <param name="regexesPerCapture">Stored in <c>RegexesPerCapture</c>.</param>
public SpecflowStepInfo(
    string classFullName,
    string methodName,
    GherkinStepKind stepKind,
    string pattern,
    [CanBeNull] Regex regex,
    [CanBeNull] CharFA<string> regexForPartialMatch,
    List<Regex> regexesPerCapture)
{
    // where the step is declared
    this.ClassFullName = classFullName;
    this.MethodName = methodName;

    // what the step matches
    this.StepKind = stepKind;
    this.Pattern = pattern;

    // matchers for the pattern
    this.Regex = regex;
    this.RegexForPartialMatch = regexForPartialMatch;
    this.RegexesPerCapture = regexesPerCapture;
}
/// <summary>
/// Stress test: builds a machine matching the C# keywords, prints its state count
/// before and after <c>Reduce</c>, then renders the reduced machine.
/// </summary>
static void _RunStress()
{
    // C# keywords
    const string cskw = "abstract|add|as|ascending|async|await|base|bool|break|byte|case|catch|char|checked|class|const|continue|decimal|default|delegate|descending|do|double|dynamic|else|enum|equals|explicit|extern|false|finally|fixed|float|for|foreach|get|global|goto|if|implicit|int|interface|internal|is|lock|long|namespace|new|null|object|operator|out|override|params|partial|private|protected|public|readonly|ref|remove|return|sbyte|sealed|set|short|sizeof|stackalloc|static|string|struct|switch|this|throw|true|try|typeof|uint|ulong|unchecked|unsafe|ushort|using|var|virtual|void|volatile|while|yield";

    var keywords = RegexExpression.Parse(cskw).ToFA("");
    Console.WriteLine("C# keyword NFA has {0} states.", keywords.FillClosure().Count);

    // very expensive in this case
    Console.WriteLine("Reducing C# keywords");
    keywords = keywords.Reduce(new _ConsoleProgress());
    Console.WriteLine();
    Console.WriteLine("C# keyword DFA has {0} states.", keywords.FillClosure().Count);

    // make the image smaller
    var renderOptions = new CharFA<string>.DotGraphOptions { Dpi = 150 };
    Console.WriteLine("Rendering stress.jpg");
    keywords.RenderToFile(@"..\..\..\stress.jpg", renderOptions);
}
/// <summary>
/// Finds every "word" (run of ASCII letters) in a test string four ways - with the
/// NFA, the DFA, the DFA state table and the compiled <c>Match</c> method - printing
/// the position and value of each match.
/// </summary>
static void _RunMatch()
{
    const string found = "Found match at {0}: {1}";
    var test = "foo123_ _bar";

    var letters = new CharRange[] { new CharRange('A', 'Z'), new CharRange('a', 'z') };
    var word = CharFA<string>.Repeat(CharFA<string>.Set(letters), 1, -1, "Word");
    var dfaWord = word.ToDfa();
    var dfaTableWord = word.ToDfaStateTable();

    // each pass needs its own context because the previous one has been consumed
    var pc = ParseContext.Create(test);
    Console.WriteLine("Matching words with an NFA:");
    for (CharFAMatch m = word.Match(pc); null != m; m = word.Match(pc))
    {
        Console.WriteLine(found, m.Position, m.Value);
    }
    Console.WriteLine();

    pc = ParseContext.Create(test);
    Console.WriteLine("Matching words with a DFA:");
    for (CharFAMatch m = dfaWord.MatchDfa(pc); null != m; m = dfaWord.MatchDfa(pc))
    {
        Console.WriteLine(found, m.Position, m.Value);
    }
    Console.WriteLine();

    pc = ParseContext.Create(test);
    Console.WriteLine("Matching words with a DFA state table:");
    for (CharFAMatch m = CharFA<string>.MatchDfa(dfaTableWord, pc); null != m; m = CharFA<string>.MatchDfa(dfaTableWord, pc))
    {
        Console.WriteLine(found, m.Position, m.Value);
    }
    Console.WriteLine();

    pc = ParseContext.Create(test);
    Console.WriteLine("Matching words with a compiled DFA:");
    for (CharFAMatch m = Match(pc); null != m; m = Match(pc))
    {
        Console.WriteLine(found, m.Position, m.Value);
    }
    Console.WriteLine();
}
/// <summary>
/// Builds the state machine for this two-operand expression as an alternation of the
/// machines of <c>Left</c> and <c>Right</c>.
/// </summary>
/// <param name="parent">The owning document, used to look up the accepting symbol; may be null.</param>
/// <param name="cfg">Passed through unchanged to the operands' <c>ToFA</c> calls.</param>
/// <returns>
/// Null when both operands are missing; <c>CharFA.Optional</c> of the single operand
/// when only one is present; otherwise <c>CharFA.Or</c> of both.
/// </returns>
public override CharFA ToFA(EbnfDocument parent, Cfg cfg)
{
    // the accepting symbol is the empty string when there is no parent document
    string sym = (null != parent) ? parent.GetContainingIdForExpression(this) : "";

    if (null == Left || null == Right)
    {
        // at most one operand is present - pick whichever it is
        var only = (null == Right) ? Left : Right;
        if (null == only)
        {
            return null;
        }
        // an alternation with a missing side becomes an optional match of the other side
        return CharFA.Optional(only.ToFA(parent, cfg), sym);
    }

    var alternatives = new CharFA[] { Left.ToFA(parent, cfg), Right.ToFA(parent, cfg) };
    return CharFA.Or(alternatives, sym);
}
/// <summary>
/// Stress test: ORs together one literal per integer in a fixed range, trims and
/// renders the unreduced machine, then prints the state count after <c>Reduce</c>
/// and renders the reduced machine.
/// </summary>
static void _RunStress2()
{
    const int first = 599;
    const int last = 639;
    CharFA<string> machine = null;

    Console.Write("Building NFA matching integer values {0}-{1} ", first, last);
    for (var value = first; value <= last; ++value)
    {
        CharFA<string> literal = CharFA<string>.Literal(value.ToString());
        machine = (null == machine)
            ? literal
            : CharFA<string>.Or(new CharFA<string>[] { machine, literal });

        // progress marker every 12th value.
        // For perf reasons we can reduce every 12 times: swapping the
        // Console.Write below for
        //     machine = machine.Reduce(new _ConsoleProgress());
        // is MUCH faster.
        if (0 == (value % 12))
        {
            Console.Write('.');
        }
    }
    Console.WriteLine();

    // (TrimDuplicates() could also be called here; it is currently left out)
    machine.TrimNeutrals();
    Console.WriteLine("C# integer NFA has {0} states.", machine.FillClosure().Count);
    machine.RenderToFile(@"..\..\..\stress2_nfa.jpg");

    machine = machine.Reduce(new _ConsoleProgress());
    Console.WriteLine();
    Console.WriteLine("C# integer DFA has {0} states.", machine.FillClosure().Count);

    // (RegexExpression.FromFA(machine) could be printed here as the "Final Expression")
    Console.WriteLine("Rendering stress2.jpg");
    machine.RenderToFile(@"..\..\..\stress2.jpg");
}
/// <summary>
/// This is where the work happens: reads characters from <c>_input</c>, runs them through the
/// lexer state machine starting at <c>_initialStates</c>, and collects the consumed text in <c>_buffer</c>.
/// </summary>
/// <returns>
/// The symbol that was matched, "#ERROR" when nothing was accepted, or "#EOS" once the end of the
/// stream has been recorded in <c>_state</c>. Members _state and _buffer are modified directly;
/// _input is advanced through <c>_MoveNextInput()</c> (the original note says _line, _column and
/// _position are modified as well - presumably by that helper, which is not shown here).
/// </returns>
string _Lex()
{
    string acc;
    var states = _initialStates;
    // every call starts with an empty capture buffer
    _buffer.Clear();
    switch (_state)
    {
        case -1: // initial
            // prime the input; an empty stream is reported straight away
            if (!_MoveNextInput())
            {
                _state = -2;
                acc = _GetAcceptingSymbol(states);
                if (null != acc)
                {
                    return(acc);
                }
                else
                {
                    return("#ERROR");
                }
            }
            _state = 0; // running
            break;
        case -2: // end of stream
            return("#EOS");
    }
    // Here's where we run most of the match. FillMove runs one iteration of the NFA state machine.
    // We match until we can't match anymore (greedy matching) and then report the symbol of the last
    // match we found, or an error ("#ERROR") if we couldn't find one.
    while (true)
    {
        var next = CharFA.FillMove(states, _input.Current);
        if (0 == next.Count) // couldn't find any states
        {
            break;
        }
        // the current character is part of the token: capture it and advance the state set
        _buffer.Append(_input.Current);
        states = next;
        if (!_MoveNextInput())
        {
            // end of stream
            _state = -2;
            acc = _GetAcceptingSymbol(states);
            if (null != acc) // do we accept?
            {
                return(acc);
            }
            else
            {
                return("#ERROR");
            }
        }
    }
    acc = _GetAcceptingSymbol(states);
    if (null != acc) // do we accept?
    {
        string be;
        if (_blockEnds.TryGetValue(acc, out be) && !string.IsNullOrEmpty(be as string))
        {
            // we have to resolve our blockends. This is tricky. We break out of the FA
            // processing and instead we loop until we match the block end. We have to
            // be very careful when we match only partial block ends and we have to
            // handle the case where there's no terminating block end.
            // NOTE(review): the end-of-stream exits inside this section do not set _state = -2
            // the way the paths above do - confirm whether _MoveNextInput() takes care of that.
            var more = true;
            while (more)
            {
                while (more)
                {
                    if (_input.Current != be[0])
                    {
                        // not the start of the block end: capture the character and keep scanning
                        _buffer.Append(_input.Current);
                        more = _MoveNextInput();
                        if (!more)
                        {
                            // input ran out before any block end was seen
                            return("#ERROR");
                        }
                        break;
                    }
                    else
                    {
                        // possible block end: i counts how many of its characters have matched so far
                        var i = 0;
                        var found = true;
                        while (i < be.Length && _input.Current == be[i])
                        {
                            if (!(more = _MoveNextInput()))
                            {
                                // input ended right after a matching character; count that character
                                ++i;
                                found = false;
                                if (i < be.Length)
                                {
                                    // only part of the block end was present
                                    acc = "#ERROR";
                                }
                                break;
                            }
                            ++i;
                        }
                        if (be.Length != i)
                        {
                            found = false;
                        }
                        if (!found)
                        {
                            // keep the characters that were consumed while trying to match
                            _buffer.Append(be.Substring(0, i));
                        }
                        else
                        {
                            // complete block end: capture it and stop scanning
                            more = false;
                            _buffer.Append(be);
                            break;
                        }
                        // NOTE(review): found is always false here, because the else branch above
                        // breaks out of the loop whenever found is true - this block looks unreachable.
                        if (found)
                        {
                            more = _MoveNextInput();
                            if (!more)
                            {
                                break;
                            }
                        }
                    }
                }
            }
        }
        return(acc);
    }
    else
    {
        // handle the error condition: capture the offending character and step past it
        _buffer.Append(_input.Current);
        if (!_MoveNextInput())
        {
            _state = -2;
        }
        return("#ERROR");
    }
}
/// <summary>
/// Builds the state machine for this expression as a literal match of <c>Value</c>.
/// </summary>
/// <param name="parent">The owning document, used to look up the accepting symbol; may be null.</param>
/// <param name="cfg">Not used by this override.</param>
/// <returns>The literal machine produced by <c>CharFA.Literal</c>.</returns>
public override CharFA ToFA(EbnfDocument parent, Cfg cfg)
{
    var text = Value;
    // without a parent document the accepting symbol is the empty string
    var id = (null == parent) ? "" : parent.GetIdForExpression(this);
    return CharFA.Literal(text, id);
}
/// <summary>
/// Recursively validates an expression tree, updating reference counts and
/// reporting problems.
/// </summary>
/// <param name="expr">The expression to validate; a null expression is ignored.</param>
/// <param name="refCounts">Reference count per symbol id; incremented for each reference found.</param>
/// <param name="messages">Receives the errors and warnings that are found.</param>
void _ValidateExpression(EbnfExpression expr, IDictionary<string, int> refCounts, IList<EbnfMessage> messages)
{
    // binary expressions may have one missing side; there is nothing to check for it
    if (null == expr)
    {
        return;
    }

    var l = expr as EbnfLiteralExpression;
    if (null != l)
    {
        var i = GetIdForExpression(l);
        // don't count itself. only things just like itself
        if (!string.IsNullOrEmpty(i) && !ReferenceEquals(Productions[i].Expression, l))
        {
            refCounts[i] += 1;
        }
    }

    var rx = expr as EbnfRegexExpression;
    if (null != rx)
    {
        try
        {
            // parsed only to find out whether the pattern is valid
            CharFA.Parse(rx.Value);
        }
        catch (ExpectingException)
        {
            messages.Add(
                new EbnfMessage(
                    EbnfErrorLevel.Error, 12,
                    "Invalid regular expression",
                    expr.Line, expr.Column, expr.Position));
        }
        var i = GetIdForExpression(rx);
        // don't count itself: compare against rx (this used to compare against the
        // literal cast, which is null here, so a regex production always counted itself)
        if (!string.IsNullOrEmpty(i) && !ReferenceEquals(Productions[i].Expression, rx))
        {
            refCounts[i] += 1;
        }
    }

    var r = expr as EbnfRefExpression;
    if (null != r)
    {
        int rc;
        if (null == r.Symbol)
        {
            messages.Add(
                new EbnfMessage(
                    EbnfErrorLevel.Error, 4,
                    "Null reference expression",
                    expr.Line, expr.Column, expr.Position));
            return;
        }
        if (!refCounts.TryGetValue(r.Symbol, out rc))
        {
            messages.Add(
                new EbnfMessage(
                    EbnfErrorLevel.Error, 1,
                    string.Concat(
                        "Reference to undefined symbol \"",
                        r.Symbol,
                        "\""),
                    expr.Line, expr.Column, expr.Position));
            return;
        }
        refCounts[r.Symbol] = rc + 1;
        return;
    }

    var b = expr as EbnfBinaryExpression;
    if (null != b)
    {
        if (null == b.Left && null == b.Right)
        {
            messages.Add(
                new EbnfMessage(
                    EbnfErrorLevel.Warning, 3,
                    "Nil expression",
                    expr.Line, expr.Column, expr.Position));
            return;
        }
        _ValidateExpression(b.Left, refCounts, messages);
        _ValidateExpression(b.Right, refCounts, messages);
        return;
    }

    var u = expr as EbnfUnaryExpression;
    if (null != u)
    {
        if (null == u.Expression)
        {
            messages.Add(
                new EbnfMessage(
                    EbnfErrorLevel.Warning, 3,
                    "Nil expression",
                    expr.Line, expr.Column, expr.Position));
            return;
        }
        _ValidateExpression(u.Expression, refCounts, messages);
    }
}
/// <summary>
/// Generates the figures used in the Code Project article at
/// https://www.codeproject.com/Articles/5251476/How-to-Build-a-Regex-Engine-in-Csharp
/// by building a series of sample machines and rendering each one to an image file.
/// </summary>
static void _BuildArticleImages()
{
    const string accept = "Accept";

    // --- basic building blocks -------------------------------------------
    var abc = CharFA<string>.Literal("ABC", accept);
    abc.RenderToFile(@"..\..\..\literal.jpg");

    CharFA<string>.CaseInsensitive(abc, accept).RenderToFile(@"..\..\..\literal_ci.jpg");

    CharFA<string>.Optional(abc, accept).RenderToFile(@"..\..\..\optional.jpg");

    var def = CharFA<string>.Literal("DEF");
    CharFA<string>.Or(new CharFA<string>[] { abc, def }, accept).RenderToFile(@"..\..\..\or.jpg");

    CharFA<string>.Set("ABC", accept).RenderToFile(@"..\..\..\set.jpg");

    CharFA<string>.Repeat(abc, 1, -1, accept).RenderToFile(@"..\..\..\repeat.jpg");

    CharFA<string>.Concat(new CharFA<string>[] { abc, def }, accept).RenderToFile(@"..\..\..\concat.jpg");

    // --- foo|bar, plain and looped, each before and after Reduce() --------
    var fooOrBar = CharFA<string>.Or(
        new CharFA<string>[]
        {
            CharFA<string>.Literal("foo"),
            CharFA<string>.Literal("bar")
        },
        accept);
    fooOrBar.RenderToFile(@"..\..\..\foobar_nfa.jpg");
    fooOrBar.Reduce().RenderToFile(@"..\..\..\foobar.jpg");

    var fooOrBarLoop = CharFA<string>.Repeat(fooOrBar, 1, -1, accept);
    fooOrBarLoop.RenderToFile(@"..\..\..\foobar_loop_nfa.jpg");
    fooOrBarLoop.Reduce().RenderToFile(@"..\..\..\foobar_loop.jpg");

    // --- a three-symbol lexer ---------------------------------------------
    var digits = CharFA<string>.Repeat(CharFA<string>.Set("0123456789"), 1, -1, "Digits");
    var letters = new CharRange[] { new CharRange('A', 'Z'), new CharRange('a', 'z') };
    var word = CharFA<string>.Repeat(CharFA<string>.Set(letters), 1, -1, "Word");
    var whitespace = CharFA<string>.Repeat(CharFA<string>.Set(" \t\r\n\v\f"), 1, -1, "Whitespace");

    var lexer = CharFA<string>.ToLexer(digits, word, whitespace);
    lexer.RenderToFile(@"..\..\..\lexer.jpg");

    // render the DFA twice: once with the source NFA supplied in the options, once without
    var debugOptions = new CharFA<string>.DotGraphOptions { DebugSourceNfa = lexer };
    var lexerDfa = lexer.ToDfa();
    lexerDfa.RenderToFile(@"..\..\..\dlexer.jpg", debugOptions);
    lexerDfa.RenderToFile(@"..\..\..\dlexer2.jpg");

    // --- a machine built from a parsed expression --------------------------
    RegexExpression.Parse("(ABC|DEF)+").ToFA(accept).RenderToFile(@"..\..\..\ABCorDEFloop.jpg");
}
/// <summary>
/// Lexes a test string four ways - with the NFA, the DFA, the DFA state table and the
/// compiled <c>Lex</c> method - printing the symbol and captured text of every token.
/// </summary>
static void _RunLexer()
{
    const string errorSymbol = "#ERROR";
    // index of the error symbol in the symbol table below
    const int errorSymbolId = 3;

    var digits = CharFA<string>.Repeat(CharFA<string>.Set("0123456789"), 1, -1, "Digits");
    var letters = new CharRange[] { new CharRange('A', 'Z'), new CharRange('a', 'z') };
    var word = CharFA<string>.Repeat(CharFA<string>.Set(letters), 1, -1, "Word");
    var whitespace = CharFA<string>.Repeat(CharFA<string>.Set(" \t\r\n\v\f"), 1, -1, "Whitespace");

    var lexer = CharFA<string>.ToLexer(digits, word, whitespace);
    var lexerDfa = lexer.ToDfa();
    lexerDfa.TrimDuplicates();

    // we use a symbol table with the DFA state table to map ids back to strings
    var symbolTable = new string[] { "Digits", "Word", "Whitespace", errorSymbol };
    // make sure to pass the symbol table if you're using one
    var dfaTable = lexer.ToDfaStateTable(symbolTable);

    var test = "foo123_ _bar";

    Console.WriteLine("Lex using the NFA");
    // create a parse context over our test string
    var nfaContext = ParseContext.Create(test);
    // while not end of input
    while (-1 != nfaContext.Current)
    {
        // clear the capture so that we don't keep appending the token data
        nfaContext.ClearCapture();
        // lex the next token and write the result
        var symbol = lexer.Lex(nfaContext, errorSymbol);
        Console.WriteLine("{0}: {1}", symbol, nfaContext.GetCapture());
    }
    Console.WriteLine();

    Console.WriteLine("Lex using the DFA");
    // a new parse context is needed because the old one is now past the end
    var dfaContext = ParseContext.Create(test);
    while (-1 != dfaContext.Current)
    {
        dfaContext.ClearCapture();
        // lex using the DFA. This works exactly like the previous Lex method
        // except that it's optimized for DFA traversal.
        // DO NOT use this with an NFA. It won't work but won't error
        // (can't check for perf reasons)
        var symbol = lexerDfa.LexDfa(dfaContext, errorSymbol);
        Console.WriteLine("{0}: {1}", symbol, dfaContext.GetCapture());
    }
    Console.WriteLine();

    Console.WriteLine("Lex using the DFA state table");
    var tableContext = ParseContext.Create(test);
    while (-1 != tableContext.Current)
    {
        tableContext.ClearCapture();
        // Lex using our DFA table. This is a little different because it's a
        // static method that takes CharDfaEntry[] as its first parameter. It
        // also uses symbol ids instead of the actual symbol, so the id is
        // mapped back through the symbol table when it is written.
        var symbolId = CharFA<string>.LexDfa(dfaTable, tableContext, errorSymbolId);
        Console.WriteLine("{0}: {1}", symbolTable[symbolId], tableContext.GetCapture());
    }
    Console.WriteLine();

    Console.WriteLine("Lex using our compiled lex method");
    var compiledContext = ParseContext.Create(test);
    while (-1 != compiledContext.Current)
    {
        compiledContext.ClearCapture();
        // Lex using our compiled DFA. Like the table driven lex this also uses
        // symbol ids, which are mapped back through the symbol table.
        var symbolId = Lex(compiledContext);
        Console.WriteLine("{0}: {1}", symbolTable[symbolId], compiledContext.GetCapture());
    }
    Console.WriteLine();
}