예제 #1
0
        /// <summary>
        /// Breaks the piece of <paramref name="input"/> covered by <paramref name="span"/>
        /// into smaller spans by splitting it with <c>TokenizationRegex</c>.
        /// </summary>
        /// <param name="input">The text that <paramref name="span"/> indexes into.</param>
        /// <param name="span">The span of the token to split.</param>
        /// <returns>
        /// An empty list when the token is empty; a list holding only <paramref name="span"/>
        /// when the token is at most one character long or matches <c>LettersOnlyRegex</c>;
        /// otherwise one span per non-empty piece returned by the regex split, in order.
        /// </returns>
        private IEnumerable<Span> SplitToken(string input, Span span)
        {
            var token = input.Substring(span.Start, span.Length());
            var result = new List<Span>();

            if (string.IsNullOrEmpty(token))
            {
                return result;
            }

            // Shortcut: a token of one character, or one matching LettersOnlyRegex, is returned unsplit.
            // token.Length is the length that was passed to Substring above.
            if (token.Length <= 1 || LettersOnlyRegex.IsMatch(token))
            {
                result.Add(span);
                return result;
            }

            // Offsets are rebuilt by summing piece lengths, starting at the span's start.
            // NOTE(review): this assumes the split keeps every character of the token
            // (zero-width or captured delimiters) - confirm against TokenizationRegex.
            var pieceStart = span.Start;
            foreach (var piece in TokenizationRegex.Split(token))
            {
                if (piece.Length == 0)
                {
                    // Empty pieces produce no span and do not move the offset.
                    continue;
                }

                var pieceEnd = pieceStart + piece.Length;
                result.Add(new Span(pieceStart, pieceEnd));
                pieceStart = pieceEnd;
            }

            return result;
        }
예제 #2
0
        /// <summary>
        /// Breaks the piece of <paramref name="input"/> covered by <paramref name="span"/>
        /// into smaller spans by applying every regex in <c>TokenizationRegexes</c> in turn.
        /// </summary>
        /// <param name="input">The text that <paramref name="span"/> indexes into.</param>
        /// <param name="span">The span of the token to split.</param>
        /// <returns>
        /// An empty list when the token is empty; a list holding only <paramref name="span"/>
        /// when the token is at most one character long or matches <c>LettersOnlyRegex</c>;
        /// otherwise one span per non-empty piece left after all splits, in order.
        /// </returns>
        private List<Span> SplitToken(string input, Span span)
        {
            var token = input.Substring(span.Start, span.Length());
            var result = new List<Span>();

            if (string.IsNullOrEmpty(token))
            {
                return result;
            }

            // Shortcut: a token of one character, or one matching LettersOnlyRegex, is returned unsplit.
            // token.Length is the length that was passed to Substring above.
            if (token.Length <= 1 || LettersOnlyRegex.IsMatch(token))
            {
                result.Add(span);
                return result;
            }

            // Each regex splits the pieces left by the previous one; empty pieces are discarded.
            var pieces = new List<string>() { token };
            foreach (var tokenizationRegex in TokenizationRegexes)
            {
                var refined = new List<string>();
                foreach (var piece in pieces)
                {
                    foreach (var part in tokenizationRegex.Split(piece))
                    {
                        if (!string.IsNullOrEmpty(part))
                        {
                            refined.Add(part);
                        }
                    }
                }
                pieces = refined;
            }

            // Offsets are rebuilt by summing piece lengths, starting at the span's start.
            // NOTE(review): this assumes no split drops characters from the token
            // (zero-width or captured delimiters) - confirm against TokenizationRegexes.
            var pieceStart = span.Start;
            foreach (var piece in pieces)
            {
                var pieceEnd = pieceStart + piece.Length;
                result.Add(new Span(pieceStart, pieceEnd));
                pieceStart = pieceEnd;
            }

            return result;
        }