Code Example #1
        // Tokenizes the given span, appending any captured tokens to processedTokens.
        private void Process(
            ReadOnlySpan<char> input,
            ref int tokenIndex,
            int startOffset,
            TokenStore processedTokens,
            StringBuilder tokenBuilder)
        {
            var start = startOffset;

            for (var i = 0; i < input.Length; i++)
            {
                var current = input[i];
                if (this.IsSplitCharacter(current))
                {
                    if (tokenBuilder.Length > 0)
                    {
                        this.CaptureToken(processedTokens, ref tokenIndex, start, i + startOffset, tokenBuilder);
                    }

                    start = i + startOffset + 1; // The next token starts just past the split character.
                }
                else
                {
                    foreach (var processed in this.inputPreprocessorPipeline.Process(current))
                    {
                        tokenBuilder.Append(processed);
                    }
                }
            }

            // Capture any token still in the builder when the input ends.
            if (tokenBuilder.Length > 0)
            {
                this.CaptureToken(processedTokens, ref tokenIndex, start, input.Length + startOffset, tokenBuilder);
            }
        }
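Note how startOffset keeps token locations relative to the whole document rather than to this span: token boundaries and the next start position are both computed as i + startOffset, and any token still in the builder when the loop ends is captured against the end of the input.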
Code Example #2
        /// <inheritdoc />
        public IReadOnlyList<Token> Process(ReadOnlySpan<char> text)
        {
            var processedTokens = new TokenStore();
            var tokenIndex      = 0;
            var tokenBuilder    = new StringBuilder();

            this.Process(text, ref tokenIndex, 0, processedTokens, tokenBuilder);

            return processedTokens.ToList();
        }
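A minimal usage sketch for this overload (hypothetical: tokenizer stands in for a configured instance, and Token is assumed to expose its normalized text as a Value property):

        IReadOnlyList<Token> tokens = tokenizer.Process("The quick brown fox".AsSpan());
        foreach (var token in tokens)
        {
            // Print each captured token.
            Console.WriteLine(token.Value);
        }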
Code Example #3
File: BasicTokenizer.cs Project: jangocheng/lifti
        public IEnumerable<Token> Process(ReadOnlySpan<char> input)
        {
            var processedWords = new TokenStore();
            var wordIndex      = 0;
            var start          = 0;
            var wordBuilder    = new StringBuilder();

            Process(processedWords, ref wordIndex, ref start, 0, wordBuilder, input);

            return processedWords.ToList();
        }
Code Example #4
File: BasicTokenizer.cs Project: jangocheng/lifti
        private void CaptureWord(TokenStore processedWords, int wordIndex, int start, int end, StringBuilder wordBuilder)
        {
            var length = end - start;

            // WordLocation stores the length as a ushort, hence the cap.
            if (length > ushort.MaxValue)
            {
                throw new LiftiException($"Only words up to {ushort.MaxValue} characters long can be indexed");
            }

            if (this.stemmer != null)
            {
                this.stemmer.Stem(wordBuilder);
            }

            processedWords.MergeOrAdd(new TokenHash(wordBuilder), wordBuilder, new WordLocation(wordIndex, start, (ushort)length));
        }
Code Example #5
        private void CaptureToken(TokenStore processedTokens, ref int tokenIndex, int start, int end, StringBuilder tokenBuilder)
        {
            var length = end - start;

            if (length > ushort.MaxValue)
            {
                throw new LiftiException(string.Format(CultureInfo.InvariantCulture, ExceptionMessages.MaxTokenLengthExceeded, ushort.MaxValue));
            }

            if (this.stemmer != null)
            {
                this.stemmer.Stem(tokenBuilder);
            }

            processedTokens.MergeOrAdd(tokenBuilder, new TokenLocation(tokenIndex, start, (ushort)length));

            tokenIndex++;
            tokenBuilder.Length = 0; // Reset the shared builder ready for the next token.
        }
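Two design points stand out here: the StringBuilder is shared across tokens and reset (tokenBuilder.Length = 0) rather than reallocated, and the TokenStore.MergeOrAdd name suggests repeated tokens are merged, each occurrence contributing another TokenLocation.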
Code Example #6
File: BasicTokenizer.cs Project: jangocheng/lifti
        private void Process(
            TokenStore processedWords,
            ref int wordIndex,
            ref int start,
            int endOffset,
            StringBuilder wordBuilder,
            ReadOnlySpan<char> input)
        {
            for (var i = 0; i < input.Length; i++)
            {
                var current = input[i];
                if (this.IsWordSplitCharacter(current))
                {
                    if (wordBuilder.Length > 0)
                    {
                        this.CaptureWord(processedWords, wordIndex, start, i + endOffset, wordBuilder);
                        wordIndex++;
                        wordBuilder.Length = 0;
                    }

                    start = i + endOffset + 1;
                }
                else
                {
                    foreach (var processed in this.inputPreprocessorPipeline.Process(current))
                    {
                        wordBuilder.Append(processed);
                    }
                }
            }

            if (wordBuilder.Length > 0)
            {
                this.CaptureWord(processedWords, wordIndex, start, input.Length + endOffset, wordBuilder);
                wordIndex++;
                wordBuilder.Length = 0;
            }

            // start is passed by ref: leave it at the end of this input so a
            // subsequent call can continue from there.
            endOffset += input.Length;
            start      = endOffset;
        }
Code Example #7
        public IEnumerable<Token> Process(ReadOnlySpan<char> input)
        {
            var processedWords = new TokenStore(); // TODO Pool?

            var wordIndex   = 0;
            var start       = 0;
            var wordBuilder = new StringBuilder();
            var hash        = new TokenHash();

            for (var i = 0; i < input.Length; i++)
            {
                var current = input[i];
                if (this.IsWordSplitCharacter(current))
                {
                    if (wordBuilder.Length > 0)
                    {
                        CaptureWord(processedWords, hash, wordIndex, start, i, wordBuilder);
                        wordIndex++;
                        wordBuilder.Length = 0;
                        hash = new TokenHash(); // Start a fresh rolling hash for the next word.
                    }

                    start = i + 1;
                }
                else
                {
                    foreach (var processed in this.inputPreprocessorPipeline.Process(current))
                    {
                        wordBuilder.Append(processed);
                        hash = hash.Combine(processed);
                    }
                }
            }

            if (wordBuilder.Length > 0)
            {
                CaptureWord(processedWords, hash, wordIndex, start, input.Length, wordBuilder);
            }

            return processedWords.ToList();
        }
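This variant keeps a rolling TokenHash, combining each preprocessed character as it is appended, so a word's hash is already available when the word is captured and never has to be recomputed from the builder.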
Code Example #8
File: BasicTokenizer.cs Project: jangocheng/lifti
        public IEnumerable<Token> Process(IEnumerable<string> inputs)
        {
            if (inputs is null)
            {
                return Enumerable.Empty<Token>();
            }

            var processedWords = new TokenStore();
            var wordIndex      = 0;
            var start          = 0;
            var wordBuilder    = new StringBuilder();
            var endOffset      = 0;

            foreach (var input in inputs)
            {
                Process(processedWords, ref wordIndex, ref start, endOffset, wordBuilder, input.AsSpan());
                endOffset += input.Length; // Later inputs continue at offsets past the end of this one.
            }

            return processedWords.ToList();
        }
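A usage sketch for the multi-string overload (again with a hypothetical tokenizer variable); per the endOffset accumulation above, tokens from the second string receive offsets continuing past the end of the first:

        var tokens = tokenizer.Process(new[] { "first part", "second part" });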
Code Example #9
        /// <inheritdoc />
        public IReadOnlyList<Token> Process(IEnumerable<DocumentTextFragment> document)
        {
            if (document is null)
            {
                return Array.Empty<Token>();
            }

            var processedTokens = new TokenStore();
            var tokenIndex      = 0;
            var tokenBuilder    = new StringBuilder();

            foreach (var documentFragment in document)
            {
                this.Process(
                    documentFragment.Text.Span,
                    ref tokenIndex,
                    documentFragment.Offset,
                    processedTokens,
                    tokenBuilder);
            }

            return processedTokens.ToList();
        }
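Unlike Example #8, this overload does not accumulate an offset itself: each fragment carries its own Offset, which is passed through as the startOffset, so a document's fragments need not be contiguous.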
Code Example #10
        private static void CaptureWord(TokenStore processedWords, TokenHash hash, int wordIndex, int start, int end, StringBuilder wordBuilder)
        {
            var length = end - start;

            processedWords.MergeOrAdd(hash, wordBuilder, new WordLocation(wordIndex, start, length));
        }