/// <summary>
/// Tries every pattern in <c>_Patterns</c> against <paramref name="s"/> at position
/// <paramref name="from"/> and returns the token created from the best match.
/// </summary>
/// <remarks>
/// A pattern only takes part if its match starts exactly at <paramref name="from"/>, has a
/// non-zero length, passes <c>VerifyContextConstraints</c> and <c>CreateToken</c> returns a
/// non-null token. The longest match wins. For matches of equal length:
/// without bundles the higher priority wins (the earlier pattern is kept on a full tie);
/// with bundles all equally long candidates are collected in a <c>TokenBundle</c>.
/// </remarks>
/// <param name="s">The text to match against.</param>
/// <param name="from">The index in <paramref name="s"/> at which a match has to start.</param>
/// <param name="allowTokenBundles">Whether equally long matches are combined into a bundle.</param>
/// <param name="consumedLength">
/// Receives the length of the winning match when a token is returned; it is not modified otherwise.
/// </param>
/// <returns>The winning token (or bundle), or <c>null</c> if no pattern produced a token.</returns>
public override Core.Tokenization.Token Recognize(string s, int from, bool allowTokenBundles, ref int consumedLength)
{
    // TODO: this could test whether the match is followed by whitespace or non-word
    // characters (punctuation etc.) - that would be simpler than including the
    // respective constraints in the regular expression.
    Token best = null;
    int bestLength = 0;
    int bestPriority = 0;

    for (int i = 0; i < _Patterns.Count; ++i)
    {
        System.Text.RegularExpressions.Regex regex = _Patterns[i].Regex;
        CharacterSet firstChars = _Patterns[i].First;

        // NOTE: the FIRST set can only be used as a pre-filter because matches are
        // required to start exactly at 'from' (see the Index check below).
        bool canStartHere = firstChars == null || from >= s.Length || firstChars.Contains(s[from]);
        if (!canStartHere)
        {
            continue;
        }

        System.Text.RegularExpressions.Match match = regex.Match(s, from);
        if (match == null || !match.Success || match.Index != from)
        {
            continue;
        }

        int matchEnd = match.Index + match.Value.Length;
        if (!VerifyContextConstraints(s, matchEnd))
        {
            continue;
        }

        // TODO: set other token values?
        Token candidate = CreateToken(match.Value, match.Groups);
        if (candidate == null || match.Length <= 0)
        {
            continue;
        }

        int length = match.Length;
        int priority = _Patterns[i].Priority;
        bool sameLength = length == bestLength;

        // Longest wins; on equal length the higher priority wins, and on equal
        // priority the first match is kept (priority only decides when not bundling).
        if (best == null
            || length > bestLength
            || (sameLength && !allowTokenBundles && priority > bestPriority))
        {
            best = candidate;
            bestLength = length;
            bestPriority = priority;
        }
        else if (allowTokenBundles && sameLength)
        {
            // Promote the current winner to a bundle the first time a tie occurs.
            TokenBundle bundle = best as TokenBundle;
            if (bundle == null)
            {
                bundle = new TokenBundle(best, bestPriority);
                best = bundle;
            }

            bundle.Add(candidate, priority);
            bestPriority = Math.Max(bestPriority, priority);
        }
    }

    if (best == null)
    {
        return null;
    }

    consumedLength = bestLength;
    return best;
}
/// <summary>
/// Splits <paramref name="s"/> into tokens by repeatedly skipping whitespace and then asking
/// every recognizer in <c>_Parameters</c> to claim a prefix of the remaining text.
/// </summary>
/// <remarks>
/// If no recognizer claims any input, the text is grouped by Unicode category and emitted as a
/// <c>TokenType.Word</c> token. Every emitted token gets a <c>Span</c> built from
/// <paramref name="currentRun"/>, the start index and the index of the last consumed character.
/// </remarks>
/// <param name="s">The text to tokenize.</param>
/// <param name="currentRun">Passed through as the first argument of each token's <c>SegmentRange</c>.</param>
/// <param name="createWhitespaceTokens">
/// If <c>true</c>, runs of whitespace are emitted as <c>TokenType.Whitespace</c> tokens;
/// otherwise they are skipped silently.
/// </param>
/// <param name="allowTokenBundles">Forwarded to each recognizer's <c>Recognize</c> call.</param>
/// <returns>The tokens in the order in which they occur in <paramref name="s"/>.</returns>
private List<Core.Tokenization.Token> TokenizeInternal(string s, int currentRun, bool createWhitespaceTokens, bool allowTokenBundles)
{
    List<Token> result = new List<Token>();

    int p = 0;
    int sLen = s.Length;

    while (p < sLen)
    {
        // Consume (and optionally emit) a run of leading whitespace.
        int start = p;
        while (p < sLen && System.Char.IsWhiteSpace(s, p))
        {
            ++p;
        }

        if (p > start)
        {
            if (createWhitespaceTokens)
            {
                Token t = new SimpleToken(s.Substring(start, p - start), TokenType.Whitespace);
                t.Span = new SegmentRange(currentRun, start, p - 1);
                result.Add(t);
            }
            start = p;
        }

        if (p >= sLen)
        {
            break;
        }

        // Test which recognizer claims the longest prefix.
        Recognizer winningRecognizer = null;
        int winningLength = 0;
        Token winningToken = null;

        // Bundling tokens that come from different recognizers is currently switched off.
        const bool allowBundlesOfDifferentType = false;

        for (int r = 0; r < _Parameters.Count; ++r)
        {
            Recognizer rec = _Parameters[r];
            int consumedLength = 0;
            Token t = rec.Recognize(s, start, allowTokenBundles, ref consumedLength);

            if (t != null)
            {
                if (winningRecognizer == null
                    || (winningLength < consumedLength && !(winningRecognizer.OverrideFallbackRecognizer && rec.IsFallbackRecognizer)))
                {
                    // First claim, or a strictly longer claim that is not a fallback
                    // recognizer being suppressed by the current winner.
                    winningToken = t;
                    winningRecognizer = rec;
                    winningLength = consumedLength;
                    p = start + consumedLength;
                }
                else if (allowTokenBundles && allowBundlesOfDifferentType)
                {
                    Core.Tokenization.TokenBundle winningBundle = winningToken as Core.Tokenization.TokenBundle;
                    if (winningBundle == null)
                    {
                        winningBundle = new TokenBundle(winningToken, winningRecognizer.Priority);
                        winningToken = winningBundle;
                    }

                    // Always add the new candidate, using the priority of the recognizer
                    // that produced it (previously the candidate was dropped when the
                    // bundle was first created, and the winner's priority was used).
                    winningBundle.Add(t, rec.Priority);

                    System.Diagnostics.Debug.Assert(winningLength == consumedLength);
                    System.Diagnostics.Debug.Assert(p == start + consumedLength);
                }
                else if (winningRecognizer.Priority < rec.Priority)
                {
                    // The candidate is not longer than the current winner (or is a suppressed
                    // fallback) but has a higher priority - highest priority wins.
                    // NOTE(review): this branch is also taken when the candidate is shorter
                    // than the current winner; confirm whether that is intended.
                    winningToken = t;
                    winningRecognizer = rec;
                    winningLength = consumedLength;
                    p = start + consumedLength;
                }
            }
        }

        if (winningToken != null && winningLength <= 0)
        {
            // A recognizer returned a token without consuming any input. Accepting it would
            // keep the outer loop from ever advancing, so discard it and use the fallback.
            System.Diagnostics.Debug.Assert(false, "Recognizer returned a token without consuming input");
            winningToken = null;
            winningRecognizer = null;
            p = start;
        }

        if (winningToken == null)
        {
            // None of the recognizers claimed any input, or there were no recognizers set up.
            // Ultimate fallback required: group by same Unicode category.
            // TODO: scanning on just the category is too fine - we may want to group coarser
            // categories together.
            System.Globalization.UnicodeCategory cat = System.Char.GetUnicodeCategory(s, start);
            while (p < sLen && System.Char.GetUnicodeCategory(s, p) == cat)
            {
                // GetUnicodeCategory(s, p) reports the category of the whole code point when
                // p is at the start of a surrogate pair, so step over both UTF-16 units;
                // otherwise the pair would be split across two tokens.
                p += System.Char.IsSurrogatePair(s, p) ? 2 : 1;
            }

            winningLength = p - start;

            // TODO: distinguish result token type depending on the category.
            winningToken = new SimpleToken(s.Substring(start, p - start), TokenType.Word);
            winningRecognizer = null;
        }
        else if (winningToken is TokenBundle)
        {
            // Convert single-element token bundles to single tokens.
            TokenBundle tb = winningToken as TokenBundle;
            if (tb.Count == 1)
            {
                winningToken = tb[0].Token;
            }
        }

        System.Diagnostics.Debug.Assert(winningLength > 0);
        System.Diagnostics.Debug.Assert(winningToken != null);

        winningToken.Span = new SegmentRange(currentRun, start, p - 1);
        result.Add(winningToken);
    }

    return result;
}