Beispiel #1
0
        /// <summary>
        /// Runs <c>TextParser.Parse</c> over <c>TextParserTexts.TestTextOldVersion</c> using
        /// options configured with the <c>TextNewLineTokenizer</c> and the question-title analyzer.
        /// </summary>
        public void ParseTest()
        {
            var options = new TextParseOptions();
            options.UseTokenizer<TextNewLineTokenizer>();
            QuestionTitleOptions(options);

            // The parse result is not inspected here; the call only exercises the parser.
            TextParser.Parse(TextParserTexts.TestTextOldVersion, options);
        }
Beispiel #2
0
        /// <summary>
        /// Registers a <c>TextParserTypes.QuestionTitle</c> analyzer on the given parse options
        /// and attaches two scanners to it: a previous-text scanner configured with a regex
        /// pattern, and a <c>TextTokenNextEmptyTextScanner</c>.
        /// </summary>
        /// <param name="parseOptions">Parse options that receive the question-title analyzer.</param>
        private void QuestionTitleOptions(TextParseOptions parseOptions)
        {
            // Regex-based scanner; the pattern is optional leading digits followed by "-савол"
            // (presumably a numbered question heading such as "12-савол" - confirm against test data).
            var precedingTextScanner = new TextTokenPreviousTextScanner();
            precedingTextScanner.WithText("^[0-9]*-савол");
            precedingTextScanner.WithTextScannerType(TextTokenTextScannerType.Regex);

            var titleAnalyzer = new TextTokenAnalyzer(TextParserTypes.QuestionTitle);
            parseOptions.AddAnalyzer(titleAnalyzer);

            // Scanners are attached after the analyzer has been registered, in this order.
            titleAnalyzer.UseScanner(precedingTextScanner);
            titleAnalyzer.UseScanner<TextTokenNextEmptyTextScanner>();
        }
Beispiel #3
0
        /// <summary>
        /// Splits a string into tokens, normalizes each token according to the supplied
        /// options, filters out unwanted tokens, and merges duplicates into a single entry
        /// that carries an occurrence count and the positions at which the token was seen.
        /// </summary>
        /// <param name="data">String to tokenize.</param>
        /// <param name="options">
        /// Text parse options supplying the split characters, the token manipulation flags,
        /// the punctuation characters, the stop words and the allowed token length range.
        /// </param>
        /// <returns>
        /// Distinct tokens ordered by descending occurrence count. Each position is the index
        /// of the occurrence within the split (pre-filter) sequence; for a token seen more
        /// than once the most recent position comes first. The list is empty when no token
        /// survives filtering.
        /// </returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="options"/> or <paramref name="data"/> is null.
        /// </exception>
        public static List<Token> GetTokens(string data, TextParseOptions options)
        {
            if (options == null)
            {
                throw new ArgumentNullException(nameof(options));
            }
            if (data == null)
            {
                // Previously surfaced as a NullReferenceException from data.Split.
                throw new ArgumentNullException(nameof(data));
            }

            string[] rawTokens = data.Split(options.SplitCharacters, StringSplitOptions.RemoveEmptyEntries);
            Dictionary<string, Token> dict = new Dictionary<string, Token>();

            for (int i = 0; i < rawTokens.Length; i++)
            {
                string token = rawTokens[i];

                if (options.TokenManipulation.SetLowerCase)
                {
                    // Invariant casing keeps the resulting keys independent of the current culture.
                    token = token.ToLowerInvariant();
                }
                if (options.TokenManipulation.ReduceWhitespace)
                {
                    token = TextNormalizer.ReduceWhitespace(token);
                }
                if (options.TokenManipulation.RemovePunctuation)
                {
                    token = TextNormalizer.RemovePunctuation(options.PunctuationCharacters, token);
                }
                if (options.TokenManipulation.RemoveNumbers)
                {
                    token = TextNormalizer.RemoveNumbers(token);
                }
                if (options.TokenManipulation.RemoveStopWords)
                {
                    token = TextNormalizer.RemoveStopWords(options.StopWords, token);
                }

                // Normalization may strip a token down to nothing (e.g. pure punctuation or
                // digits); such a token carries no content and must not become a key.
                if (String.IsNullOrEmpty(token))
                {
                    continue;
                }
                if (token.Length < options.TokenLength.Min || token.Length > options.TokenLength.Max)
                {
                    continue;
                }
                if (options.StopWords.Contains(token))
                {
                    continue;
                }

                Token existing;
                if (dict.TryGetValue(token, out existing))
                {
                    // Build a replacement entry rather than mutating in place, so the result
                    // does not depend on whether Token is a reference or a value type.
                    Token merged = new Token();
                    merged.Value = existing.Value;
                    merged.Count = existing.Count + 1;
                    merged.Positions = new List<long>(existing.Positions.Count + 1);

                    // Newest position first, followed by the earlier ones.
                    merged.Positions.Add(i);
                    merged.Positions.AddRange(existing.Positions);

                    dict[token] = merged;
                }
                else
                {
                    Token first = new Token();
                    first.Value = token;
                    first.Count = 1;
                    first.Positions.Add(i);

                    dict.Add(token, first);
                }
            }

            return dict.Values.OrderByDescending(u => u.Count).ToList();
        }