// Pooling? Configuration for expected unique words per document?
private readonly Dictionary<int, List<Token>> materializedWords = new Dictionary<int, List<Token>>();

/// <summary>
/// Records an occurrence of a word. Tokens are bucketed by the hash value of
/// <paramref name="hash"/>; within a bucket the word text is compared against
/// each stored token so that distinct words sharing a hash value are kept apart.
/// When a matching token is found the location is added to it, otherwise a
/// new token is created for the word.
/// </summary>
/// <param name="hash">The hash whose <c>HashValue</c> selects the bucket.</param>
/// <param name="word">The builder holding the word text; only materialized to a string when a new token is needed.</param>
/// <param name="location">The location at which the word was encountered.</param>
public void MergeOrAdd(TokenHash hash, StringBuilder word, WordLocation location)
{
    if (!this.materializedWords.TryGetValue(hash.HashValue, out var bucket))
    {
        // Nothing stored under this hash value yet: start a bucket with this word.
        this.materializedWords.Add(
            hash.HashValue,
            new List<Token> { new Token(word.ToString(), location) });
        return;
    }

    foreach (var candidate in bucket)
    {
        if (word.SequenceEqual(candidate.Value))
        {
            // Same word seen before: just record the additional location.
            candidate.AddLocation(location);
            return;
        }
    }

    // Same hash value but different text: keep it alongside the others in the bucket.
    bucket.Add(new Token(word.ToString(), location));
}
/// <summary>
/// Records that a token was found at a location. If an equal token has
/// already been stored, the location is merged into it; otherwise a new
/// token is stored with this location as its first.
/// </summary>
/// <param name="token">The builder holding the token text; it is hashed on entry and only materialized to a string when a new token is needed.</param>
/// <param name="location">The location at which the token was encountered.</param>
public void MergeOrAdd(StringBuilder token, TokenLocation location)
{
    var tokenHash = new TokenHash(token);

    if (!this.materializedTokens.TryGetValue(tokenHash.HashValue, out var bucket))
    {
        // Nothing stored under this hash value yet: start a bucket with this token.
        this.materializedTokens.Add(
            tokenHash.HashValue,
            new List<Token> { new Token(token.ToString(), location) });
        return;
    }

    foreach (var candidate in bucket)
    {
        if (token.SequenceEqual(candidate.Value))
        {
            // Token already known: merge the new location into it.
            candidate.AddLocation(location);
            return;
        }
    }

    // Same hash value but different text: keep it alongside the others in the bucket.
    bucket.Add(new Token(token.ToString(), location));
}
/// <summary>
/// Splits <paramref name="input"/> into words and returns the resulting tokens.
/// Characters for which <c>IsWordSplitCharacter</c> returns true end the current
/// word; every other character is passed through the input preprocessor pipeline
/// and each character it yields is appended to the current word and folded into
/// its hash.
/// </summary>
/// <param name="input">The text to tokenize.</param>
/// <returns>The contents of the token store after all words have been captured.</returns>
public IEnumerable<Token> Process(ReadOnlySpan<char> input)
{
    var store = new TokenStore(); // TODO Pool?
    var builder = new StringBuilder();
    var runningHash = new TokenHash();
    var tokenIndex = 0;
    var tokenStart = 0;

    for (var position = 0; position < input.Length; position++)
    {
        var character = input[position];

        if (!this.IsWordSplitCharacter(character))
        {
            // Part of a word: accumulate whatever the preprocessor emits for it.
            foreach (var processed in this.inputPreprocessorPipeline.Process(character))
            {
                builder.Append(processed);
                runningHash = runningHash.Combine(processed);
            }

            continue;
        }

        // Split character: flush the pending word, if any, and reset the accumulators.
        if (builder.Length > 0)
        {
            CaptureWord(store, runningHash, tokenIndex, tokenStart, position, builder);
            tokenIndex++;
            builder.Clear();
            runningHash = new TokenHash();
        }

        // The next word can begin no earlier than the character after this one.
        tokenStart = position + 1;
    }

    // Flush a trailing word that was not followed by a split character.
    if (builder.Length > 0)
    {
        CaptureWord(store, runningHash, tokenIndex, tokenStart, input.Length, builder);
    }

    return store.ToList();
}
/// <summary>
/// Stores the word currently held in <paramref name="wordBuilder"/> into
/// <paramref name="processedWords"/>, at a location described by the word's
/// index, its start offset and its length (<paramref name="end"/> minus
/// <paramref name="start"/>).
/// </summary>
/// <param name="processedWords">The store that receives the word.</param>
/// <param name="hash">The hash accumulated for the word.</param>
/// <param name="wordIndex">The index of the word, passed through to the location.</param>
/// <param name="start">The offset at which the word starts.</param>
/// <param name="end">The offset just past the end of the word.</param>
/// <param name="wordBuilder">The builder holding the word text.</param>
private static void CaptureWord(TokenStore processedWords, TokenHash hash, int wordIndex, int start, int end, StringBuilder wordBuilder)
{
    var location = new WordLocation(wordIndex, start, end - start);
    processedWords.MergeOrAdd(hash, wordBuilder, location);
}