/// <summary>
/// Breaks the stretch of <paramref name="input"/> covered by <paramref name="span"/>
/// into smaller spans by splitting its text with <c>TokenizationRegex</c>.
/// </summary>
/// <param name="input">The full text that <paramref name="span"/> indexes into.</param>
/// <param name="span">The region of <paramref name="input"/> to split.</param>
/// <returns>
/// An empty sequence when the covered text is empty; the original span alone when it is
/// at most one character long or matches <c>LettersOnlyRegex</c>; otherwise one span per
/// non-empty piece produced by the split, in order.
/// </returns>
private IEnumerable<Span> SplitToken(string input, Span span)
{
    var result = new List<Span>();
    var text = input.Substring(span.Start, span.Length());

    if (string.IsNullOrEmpty(text))
    {
        return result;
    }

    // Shortcut: a token of one character, or one matching the letters-only regex,
    // is returned untouched without running the tokenization regex.
    if (span.Length() <= 1 || LettersOnlyRegex.IsMatch(text))
    {
        result.Add(span);
        return result;
    }

    // Offsets are rebuilt by accumulating piece lengths from span.Start.
    // NOTE(review): this presumes the pieces returned by Split add up to the whole
    // token (zero-width or captured separators) - confirm against TokenizationRegex.
    var offset = span.Start;
    foreach (var piece in TokenizationRegex.Split(text))
    {
        var next = offset + piece.Length;
        if (piece.Length > 0)
        {
            result.Add(new Span(offset, next));
        }

        offset = next;
    }

    return result;
}
/// <summary>
/// Breaks the stretch of <paramref name="input"/> covered by <paramref name="span"/>
/// into smaller spans by applying every regex in <c>TokenizationRegexes</c> in turn,
/// each one re-splitting the pieces left by the previous one.
/// </summary>
/// <param name="input">The full text that <paramref name="span"/> indexes into.</param>
/// <param name="span">The region of <paramref name="input"/> to split.</param>
/// <returns>
/// An empty list when the covered text is empty; a list holding only the original span
/// when it is at most one character long or matches <c>LettersOnlyRegex</c>; otherwise
/// one span per non-empty piece left after all regexes have been applied, in order.
/// </returns>
private List<Span> SplitToken(string input, Span span)
{
    var result = new List<Span>();
    var text = input.Substring(span.Start, span.Length());

    if (string.IsNullOrEmpty(text))
    {
        return result;
    }

    // Shortcut: a token of one character, or one matching the letters-only regex,
    // is returned untouched without running any tokenization regex.
    if (span.Length() <= 1 || LettersOnlyRegex.IsMatch(text))
    {
        result.Add(span);
        return result;
    }

    // Start from the whole token and refine the piece list once per regex,
    // discarding the empty strings a split may produce.
    var pieces = new List<string> { text };
    foreach (var regex in TokenizationRegexes)
    {
        var refined = new List<string>();
        foreach (var piece in pieces)
        {
            foreach (var fragment in regex.Split(piece))
            {
                if (!string.IsNullOrEmpty(fragment))
                {
                    refined.Add(fragment);
                }
            }
        }

        pieces = refined;
    }

    // Offsets are rebuilt by accumulating piece lengths from span.Start.
    // NOTE(review): this presumes the surviving pieces add up to the whole token
    // (zero-width or captured separators) - confirm against TokenizationRegexes.
    var offset = span.Start;
    for (var i = 0; i < pieces.Count; i++)
    {
        var next = offset + pieces[i].Length;
        result.Add(new Span(offset, next));
        offset = next;
    }

    return result;
}