/// <summary>
/// Normalizes the supplied raw words and stores the surviving ones in <c>StrippedWords</c>.
/// </summary>
/// <remarks>
/// Every non-empty word has its <c>Rgx</c> matches replaced with an empty string and is
/// lower-cased. Words that are then contained in <c>StopWords</c>, or that are blank, are
/// dropped. The remaining words have their <c>Apos</c> matches replaced with an empty string
/// before being collected.
/// </remarks>
/// <param name="rawAnalysisWords">The words to clean; null or empty entries are skipped.</param>
private void StripAnalysisText(List<string> rawAnalysisWords)
{
    var kept = new List<string>();

    foreach (string raw in rawAnalysisWords)
    {
        if (string.IsNullOrEmpty(raw))
        {
            continue;
        }

        string lowered = Rgx.IsMatch(raw)
            ? Rgx.Replace(raw, "").ToLower()
            : raw.ToLower();

        // The stop-word lookup runs before the blank check, and before Apos is applied.
        if (StopWords.Contains(lowered.ToLower()) || string.IsNullOrWhiteSpace(lowered))
        {
            continue;
        }

        kept.Add(Apos.IsMatch(lowered)
            ? Apos.Replace(lowered, "").ToLower()
            : lowered);
    }

    StrippedWords = kept;
}
/// <summary>
/// Counts how often each word occurs in the given text, leaving out stop words.
/// </summary>
/// <param name="text">
/// The text to analyse. It is passed through <c>GetPlainText</c> before words are matched
/// with <c>WordRegexPattern</c>.
/// </param>
/// <returns>A dictionary mapping each lower-cased word to its number of occurrences.</returns>
/// <exception cref="ArgumentNullException">
/// Thrown when <paramref name="text"/> is null or empty.
/// </exception>
public Dictionary<string, int> GetWordOccurancesFromText(string text)
{
    if (string.IsNullOrEmpty(text))
    {
        throw new ArgumentNullException(nameof(text));
    }

    var plainText = GetPlainText(text);
    var counts = new Dictionary<string, int>();

    foreach (Match match in Regex.Matches(plainText, WordRegexPattern))
    {
        var word = match.Value.ToLower();
        if (StopWords.Contains(word, StringComparer.OrdinalIgnoreCase))
        {
            continue;
        }

        // TryGetValue leaves 'seen' at 0 for a word not counted yet.
        int seen;
        counts.TryGetValue(word, out seen);
        counts[word] = seen + 1;
    }

    return counts;
}
/// <summary>
/// Computes a weight for every retained word of <paramref name="text"/>.
/// </summary>
/// <remarks>
/// Words come from <c>FilterCutByPos</c> when <paramref name="allowPos"/> is not empty and
/// from <c>Segmenter.Cut</c> otherwise. Words that are null/empty, shorter than two characters
/// after trimming, or contained in <c>StopWords</c> (compared lower-cased) are skipped.
/// Each remaining word's occurrence count is multiplied by its <c>IdfFreq</c> entry
/// (presumably falling back to <c>MedianIdf</c> when absent; see <c>GetDefault</c>) divided by
/// the total number of retained occurrences.
/// </remarks>
/// <param name="text">The text to analyse.</param>
/// <param name="allowPos">Filter values handed to <c>FilterCutByPos</c>; may be empty.</param>
/// <returns>The retained words mapped to their weights.</returns>
private IDictionary<string, double> GetWordIfidf(string text, IEnumerable<string> allowPos)
{
    IEnumerable<string> tokens;
    if (allowPos.IsNotEmpty())
    {
        tokens = FilterCutByPos(text, allowPos);
    }
    else
    {
        tokens = Segmenter.Cut(text);
    }

    // Term frequency: count the occurrences of every usable token.
    var weights = new Dictionary<string, double>();
    foreach (var token in tokens)
    {
        var usable = !string.IsNullOrEmpty(token)
                     && token.Trim().Length >= 2
                     && !StopWords.Contains(token.ToLower());
        if (usable)
        {
            weights[token] = weights.GetDefault(token, 0.0) + 1.0;
        }
    }

    // Scale each count; the key snapshot allows writing to the dictionary while looping.
    var total = weights.Values.Sum();
    foreach (var key in weights.Keys.ToList())
    {
        weights[key] *= IdfFreq.GetDefault(key, MedianIdf) / total;
    }

    return weights;
}
/// <summary>
/// Builds one posting per distinct indexed word found in <c>Tokens</c>.
/// </summary>
/// <remarks>
/// Token positions are 1-based and advance for every token, including the skipped ones.
/// Each posting accumulates its <c>Frequency</c> and a comma-separated list of positions in
/// <c>Indexes</c>. When <paramref name="dbContext"/> is not null, unknown words are added to
/// <c>IndexWord</c> and all postings are added to <c>Posting</c>; nothing is saved here.
/// </remarks>
/// <param name="dbContext">Optional database context; null skips all database work.</param>
/// <param name="words">
/// Optional whitelist; when not null, only words contained in it are indexed.
/// </param>
/// <returns>The postings that were built.</returns>
public IEnumerable<Posting> Index(sqliteContext dbContext, List<string> words)
{
    var postingsByWord = new Dictionary<string, Posting>();
    int position = 0;

    foreach (var token in Tokens)
    {
        var word = token.ToString().ToLower();
        position++;

        if (StopWords.Contains(word) || (words != null && !words.Contains(word)))
        {
            continue;
        }

        // With a null context the lookup yields null and the Add below is a no-op.
        if (dbContext?.IndexWord.Find(word) == null)
        {
            dbContext?.IndexWord.Add(new IndexWord { Word = word });
        }

        Posting current;
        if (!postingsByWord.TryGetValue(word, out current))
        {
            current = new Posting { Word = word, DocumentName = name, Indexes = "" };
            postingsByWord.Add(word, current);
        }

        current.Frequency++;
        if (current.Indexes != "")
        {
            current.Indexes += ",";
        }

        current.Indexes += position;
    }

    foreach (var entry in postingsByWord.Values)
    {
        dbContext?.Posting.Add(entry);
    }

    return postingsByWord.Values;
}
/// <summary>
/// Decides whether a term should be left out of "more like" comparisons.
/// </summary>
/// <remarks>
/// A term is noise when it is shorter than <c>MinWordLen</c> or longer than <c>MaxWordLen</c>
/// (each limit applies only when greater than zero), or when <c>StopWords</c> is set and
/// contains the term.
/// </remarks>
/// <param name="term">The word being considered.</param>
/// <returns>
/// <c>true</c> if the term should be ignored, <c>false</c> if it should be used in further analysis.
/// </returns>
private bool IsNoiseWord(string term)
{
    int length = term.Length;

    bool outsideLengthLimits =
        (MinWordLen > 0 && length < MinWordLen) ||
        (MaxWordLen > 0 && length > MaxWordLen);

    return outsideLengthLimits || (StopWords != null && StopWords.Contains(term));
}
/// <summary>
/// Tells whether a chat message should trigger a stop.
/// </summary>
/// <remarks>
/// Only messages from moderators other than "streamelements" (case-insensitive) qualify.
/// The message is split on single spaces; a stop is signalled when any piece, lower-cased,
/// is contained in <c>StopWords</c> or equals <c>TwitchClientManager.Name.Value</c>
/// ignoring case.
/// </remarks>
/// <param name="chatMessage">The message to inspect.</param>
/// <returns><c>true</c> when the message qualifies and contains a trigger word.</returns>
public Boolean ShouldStop(ChatMessage chatMessage)
{
    if (!chatMessage.IsModerator
        || chatMessage.Username.Equals("streamelements", StringComparison.OrdinalIgnoreCase))
    {
        return false;
    }

    foreach (string word in chatMessage.Message.Split(' '))
    {
        if (StopWords.Contains(word.ToLower()))
        {
            return true;
        }

        if (word.Equals(TwitchClientManager.Name.Value, StringComparison.OrdinalIgnoreCase))
        {
            return true;
        }
    }

    return false;
}
/// <summary>
/// Cleans the text using the stop-word dictionary and stems the words that remain.
/// </summary>
/// <remarks>
/// Each token produced by splitting on <c>Separators</c> goes through these steps in order:
/// digits are removed; the token is dropped when it is blank or (lower-cased) contained in
/// <c>StopWords</c>; stop symbols are stripped; the token is lower-cased; finally it is stemmed
/// with the Russian or English stemmer depending on <c>Verifications.GetFontType</c>. Tokens
/// classified as <c>FontType.Other</c> or <c>FontType.Numbers</c> are discarded.
/// </remarks>
/// <param name="text">The raw text that needs to be canonicalized.</param>
/// <returns>The lower-cased, stemmed words of the text.</returns>
/// <exception cref="ArgumentOutOfRangeException">
/// Thrown when <c>Verifications.GetFontType</c> returns a value this method does not handle.
/// </exception>
private static List<string> TextPurify(string text)
{
    // The pattern is the same for every token, so the regex is built once per call
    // instead of once per token.
    var stopSymbols = new Regex("[0-9/|_!@#$%^&*()_+=?:;.,{}№><«»'\"`~" + @"\\[\]– -]*");

    var cyrillicStemmer = new RussianStemmer();
    var latinStemmer = new EnglishStemmer();
    var stemmedLowerCaseTokens = new List<string>();

    // Split the whole text into separate words and process them one by one.
    foreach (var rawToken in text.Split(Separators))
    {
        // Drop every digit from the word.
        var withoutDigits = new string(rawToken.Where(c => !char.IsDigit(c)).ToArray());

        // Skip stop words and empty elements.
        if (StopWords.Contains(withoutDigits.ToLower()) || string.IsNullOrWhiteSpace(withoutDigits))
        {
            continue;
        }

        // Strip the stop symbols, then bring the word to lower case.
        var token = stopSymbols.Replace(withoutDigits, "").ToLower();

        switch (Verifications.GetFontType(token))
        {
            case FontType.Cyrillic:
                stemmedLowerCaseTokens.Add(cyrillicStemmer.Stem(token));
                break;
            case FontType.Latin:
                stemmedLowerCaseTokens.Add(latinStemmer.Stem(token));
                break;
            case FontType.Other:
            case FontType.Numbers:
                break;
            default:
                throw new ArgumentOutOfRangeException();
        }
    }

    return stemmedLowerCaseTokens;
}
/// <summary>
/// Checks whether a pair passes the given flag filter, the minimum length and the stop-word list.
/// </summary>
/// <param name="allowPos">The flags that are accepted.</param>
/// <param name="wp">The pair whose <c>Flag</c> and <c>Word</c> are examined.</param>
/// <returns>
/// <c>true</c> when <paramref name="allowPos"/> contains the pair's flag, the trimmed word has
/// at least two characters, and the lower-cased word is not contained in <c>StopWords</c>.
/// </returns>
public bool PairFilter(IEnumerable<string> allowPos, Pair wp)
{
    if (!allowPos.Contains(wp.Flag))
    {
        return false;
    }

    if (wp.Word.Trim().Length < 2)
    {
        return false;
    }

    return !StopWords.Contains(wp.Word.ToLower());
}
/// <summary>
/// Checks whether a pair passes <c>DefaultPosFilter</c>, the minimum length and the stop-word list.
/// </summary>
/// <param name="wp">The pair whose <c>Flag</c> and <c>Word</c> are examined.</param>
/// <returns>
/// <c>true</c> when <c>DefaultPosFilter</c> contains the pair's flag, the trimmed word has at
/// least two characters, and the lower-cased word is not contained in <c>StopWords</c>.
/// </returns>
public bool PairFilter(Pair wp)
{
    if (!DefaultPosFilter.Contains(wp.Flag))
    {
        return false;
    }

    if (wp.Word.Trim().Length < 2)
    {
        return false;
    }

    return !StopWords.Contains(wp.Word.ToLower());
}
/// <summary>
/// Produces a cleaned, lower-cased, space-separated version of the content without stop words.
/// </summary>
/// <remarks>
/// HTML is stripped via <c>Utils.StripHtml</c> when requested. The characters
/// <c>\ | ( ) [ ] * ? } { ^ +</c> are removed, the text is split on spaces and line breaks,
/// and every word longer than one character that is not contained in <c>StopWords</c> is
/// appended followed by a single space (so a non-empty result ends with a space).
/// </remarks>
/// <param name="content">
/// The content.
/// </param>
/// <param name="removeHtml">
/// Whether HTML should be stripped from the content first.
/// </param>
/// <returns>
/// The clean content.
/// </returns>
private static string CleanContent(string content, bool removeHtml)
{
    if (removeHtml)
    {
        content = Utils.StripHtml(content);
    }

    // Symbols removed from the content, in the order they are applied.
    var symbols = new[] { "\\", "|", "(", ")", "[", "]", "*", "?", "}", "{", "^", "+" };
    foreach (var symbol in symbols)
    {
        content = content.Replace(symbol, string.Empty);
    }

    var cleaned = new StringBuilder();
    var pieces = content.Split(new[] { ' ', '\n', '\r' }, StringSplitOptions.RemoveEmptyEntries);
    foreach (var piece in pieces)
    {
        var word = piece.ToLowerInvariant().Trim();
        if (word.Length > 1 && !StopWords.Contains(word))
        {
            cleaned.Append(word).Append(' ');
        }
    }

    return cleaned.ToString();
}