예제 #1
0
        private void StripAnalysisText(List <string> rawAnalysisWords)
        {
            List <string> strippedList = new List <string>();

            foreach (string s in rawAnalysisWords)
            {
                string p;
                string q;
                if (!string.IsNullOrEmpty(s))
                {
                    if (Rgx.IsMatch(s))
                    {
                        p = Rgx.Replace(s, "").ToLower();
                    }
                    else
                    {
                        p = s.ToLower();
                    }

                    if (!StopWords.Contains(p.ToLower()) && !string.IsNullOrWhiteSpace(p))
                    {
                        if (Apos.IsMatch(p))
                        {
                            q = Apos.Replace(p, "").ToLower();
                            strippedList.Add(q);
                        }
                        else
                        {
                            strippedList.Add(p);
                        }
                    }
                }
            }
            StrippedWords = strippedList;
        }
        public Dictionary <string, int> GetWordOccurancesFromText(string text)
        {
            if (string.IsNullOrEmpty(text))
            {
                throw new ArgumentNullException(nameof(text));
            }

            text = GetPlainText(text);

            var result  = new Dictionary <string, int>();
            var matches = Regex.Matches(text, WordRegexPattern);

            foreach (Match word in matches)
            {
                var key = word.Value.ToLower();
                if (!StopWords.Contains(key, StringComparer.OrdinalIgnoreCase))
                {
                    if (result.ContainsKey(key))
                    {
                        result[key]++;
                    }
                    else
                    {
                        result.Add(key, 1);
                    }
                }
            }
            return(result);
        }
예제 #3
0
        private IDictionary <string, double> GetWordIfidf(string text, IEnumerable <string> allowPos)
        {
            IEnumerable <string> words = null;

            if (allowPos.IsNotEmpty())
            {
                words = FilterCutByPos(text, allowPos);
            }
            else
            {
                words = Segmenter.Cut(text);
            }

            // Calculate TF
            var freq = new Dictionary <string, double>();

            foreach (var word in words)
            {
                var w = word;
                if (string.IsNullOrEmpty(w) || w.Trim().Length < 2 || StopWords.Contains(w.ToLower()))
                {
                    continue;
                }
                freq[w] = freq.GetDefault(w, 0.0) + 1.0;
            }
            var total = freq.Values.Sum();

            foreach (var k in freq.Keys.ToList())
            {
                freq[k] *= IdfFreq.GetDefault(k, MedianIdf) / total;
            }

            return(freq);
        }
예제 #4
0
        public IEnumerable <Posting> Index(sqliteContext dbContext, List <string> words)
        {
            Dictionary <string, Posting> postings = new Dictionary <string, Posting>();
            int index = 0;

            foreach (var token in Tokens)
            {
                var word = token.ToString().ToLower();
                index++;
                if (StopWords.Contains(word))
                {
                    continue;
                }
                if (words != null && !words.Contains(word))
                {
                    continue;
                }

                if (dbContext?.IndexWord.Find(word) == null)
                {
                    dbContext?.IndexWord.Add(new IndexWord()
                    {
                        Word = word
                    });
                }

                if (!postings.ContainsKey(word))
                {
                    postings.Add(word, new Posting()
                    {
                        Word         = word,
                        DocumentName = name,
                        Indexes      = ""
                    });
                }

                var p = postings[word];
                p.Frequency++;
                if (p.Indexes != "")
                {
                    p.Indexes += ",";
                }
                p.Indexes += index;
            }

            foreach (var posting in postings)
            {
                dbContext?.Posting.Add(posting.Value);
            }
            return(postings.Values);
        }
예제 #5
0
        /// <summary>
        /// determines if the passed term is likely to be of interest in "more like" comparisons
        /// </summary>
        /// <param name="term"> The word being considered </param>
        /// <returns> <c>true</c> if should be ignored, <c>false</c> if should be used in further analysis </returns>
        private bool IsNoiseWord(string term)
        {
            int len = term.Length;

            if (MinWordLen > 0 && len < MinWordLen)
            {
                return(true);
            }
            if (MaxWordLen > 0 && len > MaxWordLen)
            {
                return(true);
            }
            return(StopWords != null && StopWords.Contains(term));
        }
예제 #6
0
        public Boolean ShouldStop(ChatMessage chatMessage)
        {
            if (!chatMessage.IsModerator)
            {
                return(false);
            }

            if (chatMessage.Username.Equals("streamelements", StringComparison.OrdinalIgnoreCase))
            {
                return(false);
            }

            String[] messageWords = chatMessage.Message.Split(' ');

            return(messageWords.Any(word => StopWords.Contains(word.ToLower()) || word.Equals(TwitchClientManager.Name.Value, StringComparison.OrdinalIgnoreCase)));
        }
        /// <summary>
        /// Очищает текст используя базу (словарь) стоп-слов
        /// </summary>
        /// <param name="text">Неочищенный текст, который нужно канонизировать</param>
        /// <returns>Коллекцию слов из текста, которые готовы к употреблению =)</returns>
        private static List <string> TextPurify(string text)
        {
            //разделяем ввесь текст на отдельные слова
            var rawTokens = text.Split(Separators).ToList();

            //проходимся по этому списку слов в linq-выражении
            var canonedTokens = rawTokens.Select(word => word.ToCharArray().Where(n => !char.IsDigit(n)).ToArray()).Select(purified => new string(purified)).ToList();

            //из этой коллекции удаляем все пустые элементы и стоп-слова используя linq
            canonedTokens.RemoveAll(item => StopWords.Contains(item.ToLower()) || string.IsNullOrWhiteSpace(item));

            //также удаляются все стоп-символы из слов в коллекции
            var purifiedTokens = (from item in canonedTokens let regex = new Regex("[0-9/|_!@#$%^&*()_+=?:;.,{}№><«»'\"`~" + @"\\[\]– -]*") select regex.Replace(item, "")).ToList();

            //устанавливаются все слова в Lower Case
            var purifiedLowerCaseTokens = purifiedTokens.Select(purifiedToken => purifiedToken.ToLower()).ToList();

            var stemmedLowerCaseTokens = new List <string>();
            var cyrillicStemmer        = new RussianStemmer();
            var latinStemmer           = new EnglishStemmer();

            foreach (var purifiedLowerCaseToken in purifiedLowerCaseTokens)
            {
                switch (Verifications.GetFontType(purifiedLowerCaseToken))
                {
                case FontType.Cyrillic:
                    stemmedLowerCaseTokens.Add(cyrillicStemmer.Stem(purifiedLowerCaseToken));
                    break;

                case FontType.Latin:
                    stemmedLowerCaseTokens.Add(latinStemmer.Stem(purifiedLowerCaseToken));
                    break;

                case FontType.Other:
                case FontType.Numbers:
                    break;

                default:
                    throw new ArgumentOutOfRangeException();
                }
            }

            return(stemmedLowerCaseTokens);
        }
예제 #8
0
 public bool PairFilter(IEnumerable <string> allowPos, Pair wp)
 {
     return(allowPos.Contains(wp.Flag) &&
            wp.Word.Trim().Length >= 2 &&
            !StopWords.Contains(wp.Word.ToLower()));
 }
예제 #9
0
 public bool PairFilter(Pair wp)
 {
     return(DefaultPosFilter.Contains(wp.Flag) &&
            wp.Word.Trim().Length >= 2 &&
            !StopWords.Contains(wp.Word.ToLower()));
 }
예제 #10
0
        /// <summary>
        /// Removes stop words and HTML from the specified string.
        /// </summary>
        /// <param name="content">
        /// The content.
        /// </param>
        /// <param name="removeHtml">
        /// The remove Html.
        /// </param>
        /// <returns>
        /// The clean content.
        /// </returns>
        private static string CleanContent(string content, bool removeHtml)
        {
            if (removeHtml)
            {
                content = Utils.StripHtml(content);
            }

            content =
                content.Replace("\\", string.Empty).Replace("|", string.Empty).Replace("(", string.Empty).Replace(
                    ")", string.Empty).Replace("[", string.Empty).Replace("]", string.Empty).Replace("*", string.Empty).
                Replace("?", string.Empty).Replace("}", string.Empty).Replace("{", string.Empty).Replace(
                    "^", string.Empty).Replace("+", string.Empty);

            var words = content.Split(new[] { ' ', '\n', '\r' }, StringSplitOptions.RemoveEmptyEntries);
            var sb    = new StringBuilder();

            foreach (var word in
                     words.Select(t => t.ToLowerInvariant().Trim()).Where(word => word.Length > 1 && !StopWords.Contains(word)))
            {
                sb.AppendFormat("{0} ", word);
            }

            return(sb.ToString());
        }