Beispiel #1
0
        // Buckets of tokens keyed by the integer hash value of their text.
        // Several distinct words can share one hash value, hence a list per key.
        private readonly Dictionary <int, List <Token> > materializedWords = new Dictionary <int, List <Token> >(); // TODO: consider pooling, or a configurable initial capacity for the expected number of unique words per document.

        /// <summary>
        /// Records that <paramref name="word"/> occurred at <paramref name="location"/>.
        /// When a token whose value compares equal to the word (via <c>SequenceEqual</c>) is
        /// already stored under the same hash value, the location is added to that token;
        /// otherwise a new token is created for the word.
        /// </summary>
        /// <param name="hash">Hash whose <c>HashValue</c> selects the bucket to search.</param>
        /// <param name="word">Buffer holding the word's characters; only materialized to a string when a new token is needed.</param>
        /// <param name="location">The location to associate with the word.</param>
        public void MergeOrAdd(TokenHash hash, StringBuilder word, WordLocation location)
        {
            // No bucket for this hash value yet: start one containing just this word.
            if (!this.materializedWords.TryGetValue(hash.HashValue, out var bucket))
            {
                this.materializedWords.Add(
                    hash.HashValue,
                    new List<Token> { new Token(word.ToString(), location) });
                return;
            }

            // A shared hash value does not guarantee the same text, so compare the characters.
            for (var index = 0; index < bucket.Count; index++)
            {
                var candidate = bucket[index];
                if (word.SequenceEqual(candidate.Value))
                {
                    candidate.AddLocation(location);
                    return;
                }
            }

            // Hash collision with different text: add the word as a separate token in the bucket.
            bucket.Add(new Token(word.ToString(), location));
        }
Beispiel #2
0
        /// <summary>
        /// Captures a token at a location. If a token with the same text has already been
        /// captured, the new location is merged into it; otherwise a new token is stored.
        /// </summary>
        /// <param name="token">Buffer holding the token's characters; its hash is computed here.</param>
        /// <param name="location">The location at which the token was matched.</param>
        public void MergeOrAdd(StringBuilder token, TokenLocation location)
        {
            var hash = new TokenHash(token);

            if (this.materializedTokens.TryGetValue(hash.HashValue, out var candidates))
            {
                // Walk the bucket until an entry with matching text is found (or the bucket is exhausted).
                var position = 0;
                while (position < candidates.Count && !token.SequenceEqual(candidates[position].Value))
                {
                    position++;
                }

                if (position < candidates.Count)
                {
                    candidates[position].AddLocation(location);
                }
                else
                {
                    // Same hash value but different text: keep it as an additional entry.
                    candidates.Add(new Token(token.ToString(), location));
                }

                return;
            }

            // First token seen for this hash value.
            this.materializedTokens.Add(
                hash.HashValue,
                new List<Token> { new Token(token.ToString(), location) });
        }
Beispiel #3
0
        /// <summary>
        /// Splits <paramref name="input"/> into words at split characters, passing every
        /// other character through the input preprocessor pipeline, and returns the
        /// resulting tokens.
        /// </summary>
        /// <param name="input">The text to tokenize.</param>
        /// <returns>The contents of the token store after all words have been captured.</returns>
        public IEnumerable<Token> Process(ReadOnlySpan<char> input)
        {
            var store = new TokenStore(); // TODO Pool?

            var tokenOrdinal = 0;   // zero-based index of the word currently being built
            var tokenStart   = 0;   // offset in input just past the most recent split character
            var buffer       = new StringBuilder();
            var runningHash  = new TokenHash();

            for (var position = 0; position < input.Length; position++)
            {
                var character = input[position];

                if (!this.IsWordSplitCharacter(character))
                {
                    // The pipeline may emit zero or more characters per input character;
                    // the buffer and the hash are kept in step for each one emitted.
                    foreach (var emitted in this.inputPreprocessorPipeline.Process(character))
                    {
                        buffer.Append(emitted);
                        runningHash = runningHash.Combine(emitted);
                    }

                    continue;
                }

                // Split character: flush the pending word, if any, and reset the per-word state.
                if (buffer.Length > 0)
                {
                    CaptureWord(store, runningHash, tokenOrdinal, tokenStart, position, buffer);
                    tokenOrdinal++;
                    buffer.Clear();
                    runningHash = new TokenHash();
                }

                tokenStart = position + 1;
            }

            // Flush a trailing word that was not followed by a split character.
            if (buffer.Length > 0)
            {
                CaptureWord(store, runningHash, tokenOrdinal, tokenStart, input.Length, buffer);
            }

            return store.ToList();
        }
Beispiel #4
0
        /// <summary>
        /// Stores the word held in <paramref name="wordBuilder"/> in <paramref name="processedWords"/>,
        /// with a location built from the word index, the start offset and the span length.
        /// </summary>
        /// <param name="processedWords">The store receiving the word.</param>
        /// <param name="hash">The hash passed to the store along with the word.</param>
        /// <param name="wordIndex">Index of the word, forwarded to the location.</param>
        /// <param name="start">Start offset of the word.</param>
        /// <param name="end">End offset (exclusive) of the word; the recorded length is <c>end - start</c>.</param>
        /// <param name="wordBuilder">Buffer containing the word's characters.</param>
        private static void CaptureWord(TokenStore processedWords, TokenHash hash, int wordIndex, int start, int end, StringBuilder wordBuilder)
        {
            var location = new WordLocation(wordIndex, start, end - start);

            processedWords.MergeOrAdd(hash, wordBuilder, location);
        }