/// <summary>
/// Builds, compiles and runs the regex lexer state machine, writing the time spent in
/// each phase and a readable dump of the state machine to the test output.
/// </summary>
public void RegexStateMachine()
{
    // Touch the Unicode-related classes first so that their initialization cost is
    // reported on its own and is not included in the phase timings below.
    var warmupTimer = Stopwatch.StartNew();
    var charSets = new UnicodeCharSetProvider();
    foreach (var charClass in new[] { CharSetClass.Digit, CharSetClass.Dot, CharSetClass.Space, CharSetClass.Word })
    {
        charSets.GetClassSet(charClass);
    }

    UnicodeRanges.FromUnicodeName("InCombining_Diacritical_Marks");
    warmupTimer.Stop();
    this.output.WriteLine("Unicode init ms: " + warmupTimer.ElapsedMilliseconds);

    // A single stopwatch runs across all phases; each phase is the difference
    // between two consecutive readings.
    var phaseTimer = Stopwatch.StartNew();
    RegexLexer.CreateStateMachine(out var stateMachine, out var startStateId);
    var ticksAfterBuild = phaseTimer.ElapsedTicks;

    var step = stateMachine.Compile();
    var ticksAfterCompile = phaseTimer.ElapsedTicks;

    var state = new Id<DfaState<LetterId>>(startStateId);

    // The first invocation is reported as "JIT ticks", the second as "Exec ticks".
    step(ref state, ' ');
    var ticksAfterJit = phaseTimer.ElapsedTicks;

    step(ref state, ' ');
    phaseTimer.Stop();
    var ticksAtStop = phaseTimer.ElapsedTicks;

    this.output.WriteLine("Build ticks: " + ticksAfterBuild);
    this.output.WriteLine("Compile ticks: " + (ticksAfterCompile - ticksAfterBuild));
    this.output.WriteLine("JIT ticks: " + (ticksAfterJit - ticksAfterCompile));
    this.output.WriteLine("Exec ticks: " + (ticksAtStop - ticksAfterJit));
    this.output.WriteLine("Total ms: " + phaseTimer.ElapsedMilliseconds);
    this.output.WriteLine(stateMachine.ToReadableString());
}
/// <summary>
/// Checks whether a character is (or is not) a member of the named Unicode range set.
/// </summary>
/// <param name="ch">
/// Either a string holding exactly one character, or an escape sequence starting with a
/// backslash that <c>RegexMatchSet.ParseEscape</c> resolves to a single codepoint.
/// </param>
/// <param name="inSet">The expected result of the membership check.</param>
/// <param name="unicodeName">The name passed to <c>UnicodeRanges.FromUnicodeName</c>.</param>
public void CharacterInSet(string ch, bool inSet, string unicodeName)
{
    Codepoint parsedChar;

    // Ordinal comparison: the backslash is a syntactic marker, so the check must not
    // depend on the current culture's collation rules.
    if (ch.StartsWith("\\", System.StringComparison.Ordinal))
    {
        // The escape is expected to produce a static set containing exactly one codepoint.
        var parsedSet = (RangeSetHandle.Static)RegexMatchSet.ParseEscape(ch);
        Assert.True(parsedSet.TryGetSingle(out parsedChar));
    }
    else
    {
        // Single() throws unless the string holds exactly one char.
        parsedChar = ch.Single();
    }

    Assert.Equal(inSet, UnicodeRanges.FromUnicodeName(unicodeName).Contains(parsedChar));
}
/// <summary>
/// Parses a single regex escape sequence into the range set handle it denotes.
/// </summary>
/// <param name="escape">The escape text; it must be matched by <c>rxEscape</c>.</param>
/// <returns>
/// A <see cref="RangeSetHandle.Static"/> for named Unicode sets, hex codepoints, control
/// character escapes and escaped literal characters, or a <see cref="RangeSetHandle.Class"/>
/// for the <c>d</c>/<c>D</c>, <c>w</c>/<c>W</c> and <c>s</c>/<c>S</c> character classes.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="escape"/> is <c>null</c>.</exception>
/// <exception cref="ArgumentException"><paramref name="escape"/> is not matched by <c>rxEscape</c>.</exception>
/// <exception cref="ArgumentOutOfRangeException">
/// The escaped character is a letter or digit that has no defined meaning.
/// </exception>
public static RangeSetHandle ParseEscape(string escape)
{
    // Fail with the caller's parameter name instead of letting Regex.Match report "input".
    if (escape == null)
    {
        throw new ArgumentNullException(nameof(escape));
    }

    var match = rxEscape.Match(escape);
    if (!match.Success)
    {
        throw new ArgumentException("Escape is invalid", nameof(escape));
    }

    var nameGroup = match.Groups["name"];
    if (nameGroup.Success)
    {
        // Named Unicode set; an upper-case 'P' sets the second constructor argument
        // (used as the negate flag by FromUnicode elsewhere in this code).
        return new RangeSetHandle.Static(UnicodeRanges.FromUnicodeName(nameGroup.Value), match.Groups["c"].Value == "P");
    }

    var hexGroup = match.Groups["hex"];
    if (hexGroup.Success)
    {
        return new RangeSetHandle.Static(Codepoint.Parse(hexGroup.Value));
    }

    var c = match.Groups["c"].Value[0];
    switch (c)
    {
        // Control character escapes.
        case '0':
            return new RangeSetHandle.Static('\0');
        case 'r':
            return new RangeSetHandle.Static('\r');
        case 'n':
            return new RangeSetHandle.Static('\n');
        case 't':
            return new RangeSetHandle.Static('\t');
        case 'a':
            return new RangeSetHandle.Static('\x07'); // bell
        case 'e':
            return new RangeSetHandle.Static('\x1B'); // escape
        case 'f':
            return new RangeSetHandle.Static('\x0C'); // form feed
        case 'v':
            return new RangeSetHandle.Static('\x0B'); // vertical tab

        // Character classes; the upper-case letter passes true as the second argument.
        case 'd':
            return new RangeSetHandle.Class(CharSetClass.Digit, false);
        case 'D':
            return new RangeSetHandle.Class(CharSetClass.Digit, true);
        case 'w':
            return new RangeSetHandle.Class(CharSetClass.Word, false);
        case 'W':
            return new RangeSetHandle.Class(CharSetClass.Word, true);
        case 's':
            return new RangeSetHandle.Class(CharSetClass.Space, false);
        case 'S':
            return new RangeSetHandle.Class(CharSetClass.Space, true);

        default:
            // Letters and digits not handled above are rejected; any other escaped
            // character stands for itself.
            if (char.IsLetterOrDigit(c))
            {
                throw new ArgumentOutOfRangeException(nameof(escape), "Invalid escape character " + c);
            }

            return new RangeSetHandle.Static(c);
    }
}
/// <summary>
/// Creates a match set for a named Unicode range.
/// </summary>
/// <param name="name">The name passed to <c>UnicodeRanges.FromUnicodeName</c>.</param>
/// <param name="negate">
/// When <c>true</c>, the textual form uses <c>\P{...}</c> instead of <c>\p{...}</c> and the
/// flag is forwarded to the <see cref="RangeSetHandle.Static"/> constructor.
/// </param>
/// <returns>A <see cref="RegexMatchSet"/> built from the textual form and the range set handle.</returns>
public static RegexMatchSet FromUnicode(string name, bool negate = false)
{
    // Textual form of the set, e.g. \p{Name} or \P{Name}.
    var text = $@"\{(negate ? 'P' : 'p')}{{{name}}}";
    var ranges = UnicodeRanges.FromUnicodeName(name);
    var handle = new RangeSetHandle.Static(ranges, negate);
    return new RegexMatchSet(text, handle);
}