public void ParseTest()
{
    // Smoke run: parse the "old version" sample text with the new-line tokenizer and the
    // question-title analyzer configured by QuestionTitleOptions. The parse result is not
    // inspected; the call only has to complete.
    var sampleText = TextParserTexts.TestTextOldVersion;

    var options = new TextParseOptions();
    options.UseTokenizer<TextNewLineTokenizer>();
    QuestionTitleOptions(options);

    _ = TextParser.Parse(sampleText, options);
}
/// <summary>
/// Registers a <c>TextParserTypes.QuestionTitle</c> token analyzer on the supplied parse
/// options and attaches two scanners to it: a <c>TextTokenPreviousTextScanner</c> configured
/// with a regex pattern, followed by a <c>TextTokenNextEmptyTextScanner</c>.
/// </summary>
/// <param name="parseOptions">Parse options that receive the analyzer.</param>
private void QuestionTitleOptions(TextParseOptions parseOptions)
{
    // Read as a regex: optional leading digits followed by the literal "-савол".
    // NOTE(review): how the scanner applies the pattern is defined outside this method.
    const string titleMarkerPattern = "^[0-9]*-савол";

    var analyzer = new TextTokenAnalyzer(TextParserTypes.QuestionTitle);
    parseOptions.AddAnalyzer(analyzer);

    var precedingTextScanner = new TextTokenPreviousTextScanner();
    precedingTextScanner.WithText(titleMarkerPattern);
    precedingTextScanner.WithTextScannerType(TextTokenTextScannerType.Regex);
    analyzer.UseScanner(precedingTextScanner);

    analyzer.UseScanner<TextTokenNextEmptyTextScanner>();
}
/// <summary>
/// Get tokens from a string.
/// Splits <paramref name="data"/> on <c>options.SplitCharacters</c> (empty entries removed),
/// applies the enabled <c>options.TokenManipulation</c> steps to each piece, drops pieces
/// rejected by the length and stop-word filters, and aggregates the remainder by value.
/// </summary>
/// <param name="data">String.</param>
/// <param name="options">Text parse options.</param>
/// <returns>
/// List of tokens, one per distinct value, ordered by descending count. Each token's
/// <c>Positions</c> holds the indexes (within the split result) at which the value occurred,
/// most recent occurrence first. The list is empty when no piece survives filtering.
/// </returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="options"/> or <paramref name="data"/> is null.
/// </exception>
public static List<Token> GetTokens(string data, TextParseOptions options)
{
    if (options == null)
    {
        throw new ArgumentNullException(nameof(options));
    }

    if (data == null)
    {
        // Previously surfaced as a NullReferenceException from data.Split.
        throw new ArgumentNullException(nameof(data));
    }

    string[] pieces = data.Split(options.SplitCharacters, StringSplitOptions.RemoveEmptyEntries);
    Dictionary<string, Token> tokensByValue = new Dictionary<string, Token>();

    for (int i = 0; i < pieces.Length; i++)
    {
        string token = pieces[i];
        if (String.IsNullOrEmpty(token))
        {
            continue;
        }

        if (options.TokenManipulation.SetLowerCase)
        {
            // NOTE(review): ToLower() is culture-sensitive; kept as-is to preserve existing output.
            token = token.ToLower();
        }

        if (options.TokenManipulation.ReduceWhitespace)
        {
            token = TextNormalizer.ReduceWhitespace(token);
        }

        if (options.TokenManipulation.RemovePunctuation)
        {
            token = TextNormalizer.RemovePunctuation(options.PunctuationCharacters, token);
        }

        if (options.TokenManipulation.RemoveNumbers)
        {
            token = TextNormalizer.RemoveNumbers(token);
        }

        if (options.TokenManipulation.RemoveStopWords)
        {
            token = TextNormalizer.RemoveStopWords(options.StopWords, token);
        }

        // Normalization can strip a piece down to nothing; such a piece is not a token.
        if (String.IsNullOrEmpty(token))
        {
            continue;
        }

        if (token.Length < options.TokenLength.Min || token.Length > options.TokenLength.Max)
        {
            continue;
        }

        if (options.StopWords.Contains(token))
        {
            continue;
        }

        Token existing;
        if (tokensByValue.TryGetValue(token, out existing))
        {
            // Repeat occurrence: bump the count and keep positions newest-first.
            existing.Count++;
            existing.Positions.Insert(0, i);
        }
        else
        {
            Token created = new Token();
            created.Value = token;
            created.Count = 1;
            created.Positions.Add(i);
            tokensByValue.Add(token, created);
        }
    }

    return tokensByValue.Values.OrderByDescending(u => u.Count).ToList();
}