/// <summary>
/// Splits <paramref name="input"/> on split characters, running non-split characters
/// through the preprocessing pipeline and capturing each completed token into
/// <paramref name="processedTokens"/>. Token locations are reported relative to
/// <paramref name="startOffset"/>.
/// </summary>
private void Process(
    ReadOnlySpan<char> input,
    ref int tokenIndex,
    int startOffset,
    TokenStore processedTokens,
    StringBuilder tokenBuilder)
{
    var tokenStart = startOffset;
    for (var position = 0; position < input.Length; position++)
    {
        var character = input[position];
        if (!this.IsSplitCharacter(character))
        {
            // Accumulate the preprocessed form(s) of the character into the current token.
            foreach (var preprocessed in this.inputPreprocessorPipeline.Process(character))
            {
                tokenBuilder.Append(preprocessed);
            }

            continue;
        }

        // Split character reached - emit the pending token, if one has been accumulated.
        if (tokenBuilder.Length > 0)
        {
            this.CaptureToken(processedTokens, ref tokenIndex, tokenStart, position + startOffset, tokenBuilder);
        }

        tokenStart = position + startOffset + 1;
    }

    // Capture any token still pending once the input is exhausted.
    if (tokenBuilder.Length > 0)
    {
        this.CaptureToken(processedTokens, ref tokenIndex, tokenStart, input.Length + startOffset, tokenBuilder);
    }
}
/// <inheritdoc />
public IReadOnlyList<Token> Process(ReadOnlySpan<char> text)
{
    var store = new TokenStore();
    var index = 0;
    var builder = new StringBuilder();

    // The whole text is a single fragment starting at offset zero.
    this.Process(text, ref index, 0, store, builder);

    return store.ToList();
}
/// <summary>
/// Tokenizes the given input, returning the captured tokens.
/// </summary>
public IEnumerable<Token> Process(ReadOnlySpan<char> input)
{
    var words = new TokenStore();
    var index = 0;
    var offset = 0;
    var builder = new StringBuilder();

    this.Process(words, ref index, ref offset, 0, builder, input);

    return words.ToList();
}
/// <summary>
/// Records the word accumulated in <paramref name="wordBuilder"/> into the store,
/// stemming it first when a stemmer is configured.
/// </summary>
/// <exception cref="LiftiException">
/// Thrown when the word is longer than <see cref="ushort.MaxValue"/> characters.
/// </exception>
private void CaptureWord(TokenStore processedWords, int wordIndex, int start, int end, StringBuilder wordBuilder)
{
    var length = end - start;
    if (length > ushort.MaxValue)
    {
        throw new LiftiException($"Only words up to {ushort.MaxValue} characters long can be indexed");
    }

    this.stemmer?.Stem(wordBuilder);

    processedWords.MergeOrAdd(
        new TokenHash(wordBuilder),
        wordBuilder,
        new WordLocation(wordIndex, start, (ushort)length));
}
/// <summary>
/// Records the token accumulated in <paramref name="tokenBuilder"/> into the store,
/// stemming it first when a stemmer is configured, then advances the token index and
/// resets the builder ready for the next token.
/// </summary>
/// <exception cref="LiftiException">
/// Thrown when the token is longer than <see cref="ushort.MaxValue"/> characters.
/// </exception>
private void CaptureToken(TokenStore processedTokens, ref int tokenIndex, int start, int end, StringBuilder tokenBuilder)
{
    var length = end - start;
    if (length > ushort.MaxValue)
    {
        throw new LiftiException(
            string.Format(CultureInfo.InvariantCulture, ExceptionMessages.MaxTokenLengthExceeded, ushort.MaxValue));
    }

    this.stemmer?.Stem(tokenBuilder);

    processedTokens.MergeOrAdd(tokenBuilder, new TokenLocation(tokenIndex, start, (ushort)length));

    tokenIndex++;
    tokenBuilder.Length = 0;
}
/// <summary>
/// Processes one input fragment, capturing completed words into the store.
/// <paramref name="start"/> is advanced past this fragment on exit so subsequent
/// fragments continue from the correct document offset.
/// </summary>
private void Process(
    TokenStore processedWords,
    ref int wordIndex,
    ref int start,
    int endOffset,
    StringBuilder wordBuilder,
    ReadOnlySpan<char> input)
{
    for (var position = 0; position < input.Length; position++)
    {
        var character = input[position];
        if (!this.IsWordSplitCharacter(character))
        {
            // Accumulate the preprocessed form(s) of the character into the current word.
            foreach (var preprocessed in this.inputPreprocessorPipeline.Process(character))
            {
                wordBuilder.Append(preprocessed);
            }

            continue;
        }

        // Split character reached - emit the pending word, if one has been accumulated.
        if (wordBuilder.Length > 0)
        {
            this.CaptureWord(processedWords, wordIndex, start, position + endOffset, wordBuilder);
            wordIndex++;
            wordBuilder.Length = 0;
        }

        start = position + endOffset + 1;
    }

    // Capture any word still pending once the fragment is exhausted.
    if (wordBuilder.Length > 0)
    {
        this.CaptureWord(processedWords, wordIndex, start, input.Length + endOffset, wordBuilder);
        wordIndex++;
        wordBuilder.Length = 0;
    }

    // Position the start marker immediately after this fragment.
    endOffset += input.Length;
    start = endOffset;
}
/// <summary>
/// Tokenizes the input, maintaining a rolling <see cref="TokenHash"/> for each word as
/// its preprocessed characters are accumulated.
/// </summary>
public IEnumerable<Token> Process(ReadOnlySpan<char> input)
{
    var words = new TokenStore(); // TODO Pool?
    var index = 0;
    var wordStart = 0;
    var builder = new StringBuilder();
    var runningHash = new TokenHash();

    for (var position = 0; position < input.Length; position++)
    {
        var character = input[position];
        if (!this.IsWordSplitCharacter(character))
        {
            // Fold each preprocessed character into both the word text and its hash.
            foreach (var preprocessed in this.inputPreprocessorPipeline.Process(character))
            {
                builder.Append(preprocessed);
                runningHash = runningHash.Combine(preprocessed);
            }

            continue;
        }

        // Split character reached - emit the pending word and reset for the next one.
        if (builder.Length > 0)
        {
            CaptureWord(words, runningHash, index, wordStart, position, builder);
            index++;
            builder.Length = 0;
            runningHash = new TokenHash();
        }

        wordStart = position + 1;
    }

    // Capture any word still pending once the input is exhausted.
    if (builder.Length > 0)
    {
        CaptureWord(words, runningHash, index, wordStart, input.Length, builder);
    }

    return words.ToList();
}
/// <summary>
/// Tokenizes a sequence of input strings as one continuous piece of text; word
/// offsets accumulate across the inputs. A null sequence yields no tokens.
/// </summary>
public IEnumerable<Token> Process(IEnumerable<string> inputs)
{
    if (inputs is null)
    {
        return Enumerable.Empty<Token>();
    }

    var words = new TokenStore();
    var index = 0;
    var offset = 0;
    var builder = new StringBuilder();
    var accumulatedLength = 0;

    foreach (var text in inputs)
    {
        this.Process(words, ref index, ref offset, accumulatedLength, builder, text.AsSpan());
        accumulatedLength += text.Length;
    }

    return words.ToList();
}
/// <inheritdoc />
public IReadOnlyList<Token> Process(IEnumerable<DocumentTextFragment> document)
{
    if (document is null)
    {
        return Array.Empty<Token>();
    }

    var store = new TokenStore();
    var index = 0;
    var builder = new StringBuilder();

    // Each fragment carries its own offset into the source document, so token
    // locations are reported relative to the full document text.
    foreach (var fragment in document)
    {
        this.Process(fragment.Text.Span, ref index, fragment.Offset, store, builder);
    }

    return store.ToList();
}
/// <summary>
/// Records the word accumulated in <paramref name="wordBuilder"/> into the store,
/// using the precomputed <paramref name="hash"/>.
/// </summary>
/// <exception cref="LiftiException">
/// Thrown when the word is longer than <see cref="ushort.MaxValue"/> characters.
/// </exception>
private static void CaptureWord(TokenStore processedWords, TokenHash hash, int wordIndex, int start, int end, StringBuilder wordBuilder)
{
    var length = end - start;

    // Guard against lengths that cannot be represented in a word location,
    // consistent with the other CaptureWord/CaptureToken implementations which
    // throw rather than silently storing an invalid length.
    if (length > ushort.MaxValue)
    {
        throw new LiftiException($"Only words up to {ushort.MaxValue} characters long can be indexed");
    }

    processedWords.MergeOrAdd(hash, wordBuilder, new WordLocation(wordIndex, start, length));
}