예제 #1
0
        /// <summary>
        /// Creates a tokenizer of the kind named by <paramref name="options"/> and configures it with those options.
        /// </summary>
        /// <param name="options">The options whose <c>TokenizerKind</c> selects the tokenizer and which are passed to its <c>Configure</c> method.</param>
        /// <returns>The configured tokenizer.</returns>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="options"/> is <c>null</c>.</exception>
        public virtual ITokenizer Create(TokenizationOptions options)
        {
            // Fail fast with a descriptive exception rather than a NullReferenceException
            // when options.TokenizerKind is read below.
            if (options is null)
            {
                throw new ArgumentNullException(nameof(options));
            }

            var tokenizer = CreateTokenizer(options.TokenizerKind);
            tokenizer.Configure(options);

            return tokenizer;
        }
예제 #2
0
        /// <summary>
        /// Builds the tokenizer selected by the options' <c>TokenizerKind</c> and applies the options to it.
        /// </summary>
        /// <param name="options">The tokenization options; must not be <c>null</c>.</param>
        /// <returns>The tokenizer after <c>Configure</c> has been called on it.</returns>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="options"/> is <c>null</c>.</exception>
        public virtual ITokenizer Create(TokenizationOptions options)
        {
            var settings = options ?? throw new ArgumentNullException(nameof(options));

            var instance = CreateTokenizer(settings.TokenizerKind);
            instance.Configure(settings);

            return instance;
        }
예제 #3
0
        /// <summary>
        /// Cuts a sentence into segments, wraps each segment in a <c>Token</c>, and passes the
        /// resulting list to <c>CorrectTokenPosition</c> before returning it.
        /// </summary>
        /// <param name="sentence">The text handed to the segmenter and to <c>CorrectTokenPosition</c>.</param>
        /// <param name="options">The tokenization options; not read by this method.</param>
        /// <returns>One token per segment, in the order the segmenter produced them.</returns>
        public List<Token> Tokenize(string sentence, TokenizationOptions options)
        {
            // Init() runs before the segmenter is touched; presumably it prepares the segmenter (defined elsewhere).
            Init();

            var segments = segmenter.Cut(sentence);
            var result = segments
                .Select(segment => new Token { Text = segment })
                .ToList();

            CorrectTokenPosition(sentence, result);

            return result;
        }
예제 #4
0
        /// <summary>
        /// Splits a document into words. Depending on <paramref name="options"/>, HTML tags are removed and
        /// numbers, URLs, e-mail addresses, dollar signs and user names are replaced by brace-delimited
        /// placeholders (for example <c>{number}</c>) before the text is split on punctuation.
        /// </summary>
        /// <param name="text">The document.</param>
        /// <param name="options">The tokenization options selecting which replacements are applied.</param>
        /// <param name="punctuationCharacters">The characters considered as punctuation; the text is split on each of them.</param>
        /// <returns>The tokens, excluding any that are null, empty or whitespace-only.</returns>
        public static string[] Tokenize(
            string text,
            TokenizationOptions options  = TokenizationOptions.All,
            string punctuationCharacters = PunctuationCharacters)
        {
            // Placeholders are wrapped in these markers while splitting and only turned into
            // braces afterwards (presumably so the punctuation split cannot break them apart).
            const string open  = "⋘";
            const string close = "⋙";

            var numberTag   = open + "number" + close;
            var urlTag      = open + "httpaddr" + close;
            var emailTag    = open + "emailaddr" + close;
            var dollarTag   = open + "dollar" + close;
            var usernameTag = open + "username" + close;

            // The replacements run in a fixed order; e-mail addresses are handled before
            // user names so the '@' inside an address is consumed first.
            if ((options & TokenizationOptions.StripHtml) != 0)
            {
                text = Regex.Replace(text, "<[^<>]+>", string.Empty);
            }

            if ((options & TokenizationOptions.StripNumbers) != 0)
            {
                text = Regex.Replace(text, "[0-9]+", numberTag);
            }

            if ((options & TokenizationOptions.StripUrls) != 0)
            {
                text = Regex.Replace(text, @"(http|https)://[^\s]*", urlTag);
            }

            if ((options & TokenizationOptions.StripEmailAddresses) != 0)
            {
                text = Regex.Replace(text, @"[^\s]+@[^\s]+", emailTag);
            }

            if ((options & TokenizationOptions.StripMonetary) != 0)
            {
                text = Regex.Replace(text, "[$]+", dollarTag);
            }

            if ((options & TokenizationOptions.StripUsernames) != 0)
            {
                text = Regex.Replace(text, @"@[^\s]+", usernameTag);
            }

            // Split on punctuation, restore the braces, and drop blank pieces.
            var separators = punctuationCharacters.ToCharArray();

            return text.Split(separators)
                .Select(piece => piece.Replace(open, "{").Replace(close, "}"))
                .Where(piece => !string.IsNullOrWhiteSpace(piece))
                .ToArray();
        }
예제 #5
0
        /// <summary>
        /// Initializes a new instance of the <see cref="Tokenizer"/> class.
        /// </summary>
        /// <param name="tokenizationOptions">The tokenization options for this instance.</param>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="tokenizationOptions"/> is <c>null</c>.</exception>
        public Tokenizer(TokenizationOptions tokenizationOptions)
        {
            if (tokenizationOptions is null)
            {
                throw new ArgumentNullException(nameof(tokenizationOptions));
            }

            this.Options = tokenizationOptions;

            // A stemmer is only created when stemming is requested.
            if (tokenizationOptions.Stemming)
            {
                this.stemmer = new PorterStemmer();
            }

            // The split-character set stays null when no additional characters are configured.
            if (tokenizationOptions.AdditionalSplitCharacters.Count > 0)
            {
                this.additionalSplitChars = new HashSet<char>(tokenizationOptions.AdditionalSplitCharacters);
            }
            else
            {
                this.additionalSplitChars = null;
            }

            this.inputPreprocessorPipeline = new InputPreprocessorPipeline(tokenizationOptions);
        }
예제 #6
0
        /// <summary>
        /// Initializes a new instance of the <see cref="InputPreprocessorPipeline"/> class,
        /// registering the preprocessors that the supplied options call for.
        /// </summary>
        /// <param name="options">The tokenization options; must not be <c>null</c>.</param>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="options"/> is <c>null</c>.</exception>
        public InputPreprocessorPipeline(TokenizationOptions options)
        {
            var settings = options ?? throw new ArgumentNullException(nameof(options));

            // Registration order: accent normalization first, then case normalization.
            if (settings.AccentInsensitive)
            {
                this.inputPreprocessors.Add(new LatinCharacterNormalizer());
            }

            if (settings.CaseInsensitive)
            {
                this.inputPreprocessors.Add(new CaseInsensitiveNormalizer());
            }
        }
예제 #7
0
 /// <summary>
 /// Initializes a new instance of the <see cref="FakeTokenizer"/> class that stores the given options.
 /// </summary>
 /// <param name="options">The tokenization options assigned to <c>Options</c>; stored as-is, without validation.</param>
 public FakeTokenizer(TokenizationOptions options) => this.Options = options;
예제 #8
0
 /// <summary>
 /// Accepts tokenization options without acting on them.
 /// </summary>
 /// <param name="options">The tokenization options; ignored by this implementation.</param>
 public void Configure(TokenizationOptions options)
 {
     // No-op: the body is empty, so the supplied options have no effect here.
 }
 /// <summary>
 /// Initializes a new instance of the <see cref="TokenExApiController"/> class.
 /// </summary>
 /// <param name="tokenizationOptions">The tokenization options kept in <c>_tokenizationOptions</c>; stored as-is, without validation.</param>
 public TokenExApiController(TokenizationOptions tokenizationOptions) =>
     _tokenizationOptions = tokenizationOptions;
 /// <summary>
 /// Initializes a new instance of the <see cref="TokenExGateway"/> class.
 /// </summary>
 /// <param name="tokenizationOptions">The tokenization options kept in <c>_tokenizationOptions</c>; stored as-is, without validation.</param>
 protected TokenExGateway(TokenizationOptions tokenizationOptions)
 {
     _tokenizationOptions = tokenizationOptions;
 }