예제 #1
0
 /// <summary>
 /// Splits <paramref name="text"/> on spaces, punctuation, line breaks, tabs, brackets, slashes and
 /// decimal digits, stems every non-empty token with <c>RussianStemmer</c>, and then runs the result
 /// through <c>TextChange.JoinNo</c> followed by <c>TextChange.RemovePrepositions</c>.
 /// </summary>
 /// <param name="text">Text to tokenize.</param>
 /// <returns>The array produced by <c>TextChange.RemovePrepositions</c>.</returns>
 public static string[] SplitToWords(string text)
 {
     RussianStemmer stemmer = new RussianStemmer();

     // Every character of this literal acts as a token delimiter.
     char[] delimiters = " ,.!\"?+-:;\n\r\t)(/0123456789".ToCharArray();
     string[] tokens = text.Split(delimiters, StringSplitOptions.RemoveEmptyEntries);

     // Stem each token, keeping the original order.
     string[] stems = Array.ConvertAll(tokens, token => stemmer.Stem(token));

     return TextChange.RemovePrepositions(TextChange.JoinNo(stems));
 }
 /// <summary>
 /// Copies the <c>I_p2</c> and <c>I_pV</c> state from another stemmer and then lets the base class
 /// copy its own state.
 /// </summary>
 /// <param name="other">Stemmer whose state is copied into this instance.</param>
 private void copy_from(RussianStemmer other)
 {
     I_p2 = other.I_p2;
     I_pV = other.I_pV;
     // The unqualified call resolved to this very overload and recursed until the stack overflowed.
     // NOTE(review): assumes the base class (SnowballProgram in Snowball-generated stemmers)
     // declares copy_from - confirm against the class declaration.
     base.copy_from(other);
 }
예제 #3
0
        /// <summary>
        /// Builds a <c>Post</c> from the HTML of a post page: publication date, title, hubs, content,
        /// word-stem frequencies, tags, counters, author data, translation info and comments.
        /// </summary>
        /// <param name="postContentHtmlnode">Node to read the post from; may be null.</param>
        /// <returns>The parsed post, or null when <paramref name="postContentHtmlnode"/> is null.</returns>
        /// <remarks>
        /// Apart from hubs, tags, the original-author link and comments, the queried nodes are
        /// dereferenced without null checks, so a page missing one of them still throws.
        /// </remarks>
        public static Post Parse(HtmlNode postContentHtmlnode)
        {
            if (postContentHtmlnode == null)
                return null;

            string published = postContentHtmlnode.SelectSingleNode("//div[@class='published']").InnerText;
            // The timestamp uses Russian month names, so it must be parsed with the Russian culture;
            // relying on the machine's current culture fails on non-Russian systems.
            CultureInfo provider = CultureInfo.GetCultureInfo("ru-RU");
            string format = "d MMMM yyyy в HH:mm";
            DateTime publishedDate = DateTime.ParseExact(published.Trim(), format, provider, DateTimeStyles.AllowWhiteSpaces);

            string title = postContentHtmlnode.SelectSingleNode("//span[@class='post_title']").InnerText;
            var hubnodes = postContentHtmlnode.SelectNodes("//a[@class='hub ']");
            List<string> hubs = new List<string>();
            if (hubnodes != null)
                hubs = hubnodes.Select(x => x.InnerText).ToList();

            // Query the content node once and read both representations from it.
            var contentNode = postContentHtmlnode.SelectSingleNode("//div[@class='content html_format']");
            string postHtml = contentNode.InnerHtml;
            string postText = contentNode.InnerText;

            // Frequency of every distinct stem in the post text.
            Regex wordsRegex = new Regex(@"\w+");
            RussianStemmer stemmer = new RussianStemmer();
            var stems = wordsRegex
                .Matches(postText)
                .Cast<Match>()
                .Select(x => stemmer.Stem(x.Value))
                .GroupBy(x => x)
                .Select(x => new Stem() { id = Guid.NewGuid(), Word = x.Key, Frequency = x.Count() });

            // A post without a tag list must yield an empty tag collection instead of a NullReferenceException.
            var tagListNode = postContentHtmlnode.SelectSingleNode("//ul[@class='tags']");
            var tagNodes = tagListNode != null ? tagListNode.SelectNodes("//a[@rel='tag']") : null;
            List<string> tags = new List<string>();
            if (tagNodes != null)
                tags = tagNodes.Select(x => x.InnerText).ToList();

            // Empty counters are treated as zero.
            string pageViews = postContentHtmlnode.SelectSingleNode("//div[@class='pageviews']").InnerText;
            if (string.IsNullOrEmpty(pageViews))
                pageViews = "0";
            string favCount = postContentHtmlnode.SelectSingleNode("//div[@class='favs_count']").InnerText;
            if (string.IsNullOrEmpty(favCount))
                favCount = "0";
            string author = postContentHtmlnode.SelectSingleNode("//div[@class='author']").SelectSingleNode(".//a").InnerText;
            string authorRating = postContentHtmlnode.SelectSingleNode("//span[@class='rating']").InnerText;
            string commentsCount = postContentHtmlnode.SelectSingleNode("//span[@id='comments_count']").InnerText;
            if (string.IsNullOrEmpty(commentsCount))
                commentsCount = "0";

            // The post id is the part of the element id that follows the first underscore.
            string postId = postContentHtmlnode.SelectSingleNode("//div[@class='post shortcuts_item'] | //div[@class='post translation shortcuts_item']").GetAttributeValue("id", null);
            postId = postId.Split('_')[1];

            // A post is flagged as a translation when an original-author link is present.
            string originalAuthor = null;
            string originalSource = null;
            bool translation = false;
            var originalAuthorNode = postContentHtmlnode.SelectSingleNode("//div[@class='original-author']/a");
            if (originalAuthorNode != null)
            {
                translation = true;
                originalAuthor = originalAuthorNode.InnerText;
                originalSource = originalAuthorNode.GetAttributeValue("href", null);
            }

            // Comments that Comment.Parse rejects (returns null for) are skipped.
            List<Comment> comments = new List<Comment>();
            var commentNodes = postContentHtmlnode.SelectNodes("//div[@class='comment_item']");
            if (commentNodes != null)
                foreach (var commentNode in commentNodes)
                {
                    Comment comment = Comment.Parse(commentNode, postId);
                    if (comment != null)
                        comments.Add(comment);
                }

            return new Post
            {
                Author = author,
                AuthorRating = authorRating,
                CommentsCount = int.Parse(commentsCount),
                Comments = comments,
                FavoritesCount = int.Parse(favCount),
                Hubs = hubs,
                id = postId,
                PageViews = int.Parse(pageViews),
                PostHtml = postHtml,
                PostText = postText,
                Published = publishedDate,
                Tags = tags,
                Title = title,
                OriginalAuthor = originalAuthor,
                OriginalSource = originalSource,
                Translation = translation,
                Stems = new List<Stem>(stems)
            };
        }
 /// <summary>
 /// Re-initializes the shared state: creates new <c>StWdsEng</c>, <c>StWdsRus</c>,
 /// <c>EnglishStemmer</c> and <c>RussianStemmer</c> instances, stores the supplied list in
 /// <c>ts</c> and starts an empty <c>Tag</c> collection.
 /// </summary>
 /// <param name="t">List stored in <c>ts</c> by reference (it is not copied).</param>
 public static void InitParams(List<string> t)
 {
     Eng = new StWdsEng();
     Rus = new StWdsRus();
     EngStem = new EnglishStemmer();
     RusStem = new RussianStemmer();
     // The empty list that used to be allocated here was overwritten immediately, so it is gone.
     ts = t;
     Tag = new List<List<Word>>();
 }
 /// <summary>
 /// Re-initializes the shared state from an array: creates new <c>StWdsEng</c>, <c>StWdsRus</c>,
 /// <c>EnglishStemmer</c> and <c>RussianStemmer</c> instances, fills a new <c>ts</c> list with
 /// the array elements and starts an empty <c>Tag</c> collection.
 /// </summary>
 /// <param name="t">Elements copied into the new <c>ts</c> list.</param>
 public static void InitParams(string[] t)
 {
     Eng = new StWdsEng();
     Rus = new StWdsRus();

     EngStem = new EnglishStemmer();
     RusStem = new RussianStemmer();

     // ts gets its own list; the array contents are appended to it.
     ts = new List<string>();
     ts.AddRange(t);

     Tag = new List<List<Word>>();
 }
 /// <summary>
 /// Re-initializes the shared state: creates new <c>StWdsEng</c>, <c>StWdsRus</c>,
 /// <c>EnglishStemmer</c> and <c>RussianStemmer</c> instances, stores the supplied list in
 /// <c>ts</c>, and resets <c>fullWdsCollection</c> and <c>Tag</c> to two-element arrays of empty lists.
 /// </summary>
 /// <param name="t">List stored in <c>ts</c> by reference (it is not copied).</param>
 public static void InitParams(List<string> t)
 {
     Eng = new StWdsEng();
     Rus = new StWdsRus();
     EngStem = new EnglishStemmer();
     RusStem = new RussianStemmer();
     // The empty list that used to be allocated here was overwritten immediately, so it is gone.
     ts = t;
     // Both arrays have exactly two slots, each starting with an empty list.
     fullWdsCollection = new List<List<string>>[]
     {
         new List<List<string>>(),
         new List<List<string>>()
     };
     Tag = new List<List<Word>>[]
     {
         new List<List<Word>>(),
         new List<List<Word>>()
     };
 }
예제 #7
0
 /// <summary>
 /// Trims and lower-cases a term and stems it with the stemmer that matches <c>_language</c>.
 /// </summary>
 /// <param name="term">Term to stem; may be null.</param>
 /// <returns>
 /// The stem for English, German, Russian and French; the trimmed, lower-cased term for any other
 /// language; null when the term is null or empty after trimming.
 /// </returns>
 private string StemTerm(string term)
 {
     // A null term used to throw on Trim() before the emptiness check below could run.
     if (term == null) return null;

     // Invariant lower-casing keeps the result independent of the machine culture
     // (for example, the Turkish culture maps "I" to a dotless "ı").
     string massagedTerm = term.Trim().ToLowerInvariant();
     if (String.IsNullOrEmpty(massagedTerm)) return null;

     switch (_language) {
         case ConfigHandler.Language.English:
             EnglishWord englishWord = new EnglishWord(massagedTerm);
             return englishWord.Stem;
         case ConfigHandler.Language.German:
             GermanStemmer germanStemmer = new GermanStemmer();
             return germanStemmer.Stem(massagedTerm);
         case ConfigHandler.Language.Russian:
             RussianStemmer russianStemmer = new RussianStemmer();
             return russianStemmer.Stem(massagedTerm);
         case ConfigHandler.Language.French:
             FrenchStemmer frenchStemmer = new FrenchStemmer();
             return frenchStemmer.Stem(massagedTerm);
         default:
             // No stemmer for this language: return the normalized term unchanged.
             return massagedTerm;
     }
 }
 /// <summary>
 /// Copies the <c>I_p2</c> and <c>I_pV</c> state from another stemmer and then lets the base class
 /// copy its own state.
 /// </summary>
 /// <param name="other">Stemmer whose state is copied into this instance.</param>
 private void copy_from(RussianStemmer other)
 {
     I_p2 = other.I_p2;
     I_pV = other.I_pV;
     // The unqualified call resolved to this very overload and recursed until the stack overflowed.
     // NOTE(review): assumes the base class (SnowballProgram in Snowball-generated stemmers)
     // declares copy_from - confirm against the class declaration.
     base.copy_from(other);
 }
예제 #9
0
        /// <summary>
        /// Loads a news page, extracts its description, article text, word-stem counts, theme, title
        /// and links, and saves everything through <c>repository.SaveDocument</c>.
        /// </summary>
        /// <param name="urlPage">Address of the page; a null value is reported and ignored.</param>
        /// <remarks>
        /// Nothing is saved when the page cannot be loaded, when the article container is missing,
        /// or when the article boundaries would describe an invalid text range. Progress and
        /// failures are reported through <c>mess.WriteMessage</c>.
        /// </remarks>
        public void ParsePage(string urlPage)
        {
            if (urlPage == null)
            {
                mess.WriteMessage(String.Format("Передана пустая ссылка"));
                return;
            }
            Dictionary<string, object> allInformation = new Dictionary<string, object>();
            allInformation.Add("url", urlPage);
            // Patterns for the start of the article; the quantifiers are bounded so other comments are not matched.
            Regex beginText = new Regex("<!--.*testcom.{0,5}news.{0,50}-->");
            Regex beginText2 = new Regex("<!-- СТАТЬЯ -->");
            // Pattern for the end of the article.
            Regex endText = new Regex("(Ссылки по теме)|(Сайты по теме)|(<!-- social -->)");

            DateTime dateArticle = ConvertSubUriInDateTime(urlPage);
            allInformation.Add("date", dateArticle);

            HtmlDocument newsPage = GetPage(urlPage);
            if (newsPage == null)
            {
                mess.WriteMessage(String.Format("Не удалось загрузить страницу: {0}", urlPage));
                return;
            }

            allInformation.Add("articleHtml", newsPage.DocumentNode.InnerHtml);

            var text = newsPage.DocumentNode.SelectSingleNode("//td[@class='statya']");
            var title = newsPage.DocumentNode.SelectSingleNode("/html/head/title");
            var description = newsPage.DocumentNode.SelectSingleNode("/html/head/meta[1]");
            // A page without the meta element gets the same empty default as a meta without "content".
            string descrTest = description != null ? description.GetAttributeValue("content", "") : "";
            allInformation.Add("description", descrTest);

            // Without the article container there is no text to extract.
            if (text == null)
            {
                mess.WriteMessage(String.Format("Не удалось выделить основной текст: {0}", urlPage));
                return;
            }

            string badText = text.InnerText;
            var matchBegin = beginText.Match(badText);
            if (!matchBegin.Success)
                matchBegin = beginText2.Match(badText);
            var matchEnd = endText.Match(badText);

            // A failed match reports Index 0 and Length 0, so a missing start marker still yields the
            // text from the beginning. A missing end marker, or one located before the start marker,
            // gives a negative length that would make Substring throw.
            int articleStart = matchBegin.Index + matchBegin.Length;
            int articleLength = matchEnd.Index - articleStart;
            if (!matchBegin.Success || !matchEnd.Success || articleLength < 0)
            {
                mess.WriteMessage(String.Format("Не удалось выделить основной текст: {0}", urlPage));
            }
            if (articleLength < 0)
                return;

            StringBuilder articleText = new StringBuilder(badText.Substring(articleStart, articleLength));
            // After the replacement this is the clean article text.
            articleText.Replace("\n", "");

            allInformation.Add("articleText", articleText.ToString());

            // Stem extraction: count how often every stem of a Cyrillic word occurs.
            RussianStemmer rusStemmer = new RussianStemmer();
            Regex separators = new Regex("[А-Яа-я]+");
            var matches = separators.Matches(articleText.ToString());
            Dictionary<string, int> words = new Dictionary<string, int>();
            foreach (Match item in matches)
            {
                string stem = rusStemmer.Stem(item.Value);
                int seen;
                // TryGetValue leaves 0 in 'seen' for a new stem, so one lookup covers both cases.
                words.TryGetValue(stem, out seen);
                words[stem] = seen + 1;
            }

            // Theme extraction: the text between the first two colons of the page title.
            // A missing title or a title with fewer than two colons yields an empty theme.
            Regex themeRegEx = new Regex(":");
            string pageTitle = title != null ? title.InnerText : String.Empty;
            var themeMatches = themeRegEx.Matches(pageTitle);
            string theme = String.Empty;
            if (themeMatches.Count >= 2)
                theme = pageTitle.Substring(themeMatches[0].Index + 1, themeMatches[1].Index - themeMatches[0].Index - 1).Trim();

            allInformation.Add("theme", theme);

            // Headline extraction; a page without the h2 element gets an empty headline.
            var h2Title = newsPage.DocumentNode.SelectSingleNode("/html/body/table[@class='peredovica']/tr[1]/td[@id='pacman']/table/tr/td/h2");
            string titleText = h2Title != null ? h2Title.InnerText : String.Empty;

            allInformation.Add("title", titleText);

            // Extraction of the "links on the theme" section.
            var linksOnTheme = newsPage.DocumentNode.SelectNodes("/html/body/table[@class='peredovica']/tr[1]/td[@id='pacman']/table/tr/td/p[@class='links']/a");
            List<string> links = new List<string>();
            if (linksOnTheme != null)
            {
                foreach (var item in linksOnTheme)
                {
                    string href = item.GetAttributeValue("href", "");
                    if (href != String.Empty)
                        links.Add(href);
                }
            }

            allInformation.Add("wordBase", words);

            allInformation.Add("themeLinks", links);

            // Extraction of the links inside the article text.
            var linksInArticle = newsPage.DocumentNode.SelectNodes("/html/body/table[@class='peredovica']/tr[1]/td[@id='pacman']/table/tr/td/a");
            List<string> listLinksInArticle = new List<string>();

            if (linksInArticle != null)
            {
                foreach (var item in linksInArticle)
                {
                    string href = item.GetAttributeValue("href", "");
                    // Skip empty hrefs before prefixing; checking afterwards stored a bare
                    // "http://lenta.ru" for every anchor without an href.
                    if (href == String.Empty)
                        continue;
                    if (!href.StartsWith("http://lenta.ru", StringComparison.Ordinal))
                        href = String.Format("http://lenta.ru{0}", href);
                    listLinksInArticle.Add(href);
                }
            }

            allInformation.Add("articleLinks", listLinksInArticle);

            // Save to Mongo.
            repository.SaveDocument(allInformation);

            mess.WriteMessage(String.Format("Сохранили: {0}", urlPage));
        }
예제 #10
0
        /// <summary>
        /// Builds a <c>Comment</c> from a comment node: id, parent id, author, score, text,
        /// word-stem frequencies and timestamp.
        /// </summary>
        /// <param name="commentNode">Node of a single comment; may be null.</param>
        /// <param name="postId">Id of the post the comment belongs to; stored as <c>PostId</c>.</param>
        /// <returns>
        /// The parsed comment, or null when <paramref name="commentNode"/> is null or has no "id" attribute.
        /// </returns>
        public static Comment Parse(HtmlNode commentNode, string postId)
        {
            if (commentNode == null)
                return null;

            string commentId = commentNode.GetAttributeValue("id", null);
            if (commentId == null)
                return null;

            // The comment id is the part of the element id that follows the first underscore.
            commentId = commentId.Split('_')[1];

            // A missing parent marker is treated like a missing attribute: the comment has no parent.
            var parentNode = commentNode.SelectSingleNode("./span[@class='parent_id']");
            string parentId = parentNode != null ? parentNode.GetAttributeValue("data-parent_id", null) : null;
            string user = commentNode.SelectSingleNode("./div[@class='info  ']/a[@class='username']").InnerText;
            string score = commentNode.SelectSingleNode("./div[@class='info  ']/div[@class='voting   ']/*/span[@class='score']").InnerText;

            string text = commentNode.SelectSingleNode("./div[contains(@class,'message html_format ')]").InnerText.Trim();

            // Frequency of every distinct stem in the comment text.
            Regex wordsRegex = new Regex(@"\w+");
            RussianStemmer stemmer = new RussianStemmer();
            var stems = wordsRegex
                .Matches(text)
                .Cast<Match>()
                .Select(x => stemmer.Stem(x.Value))
                .GroupBy(x => x)
                .Select(x => new Stem() { id = Guid.NewGuid(), Word = x.Key, Frequency = x.Count() });

            string timeStr = commentNode.SelectSingleNode("./div[@class='info  ']/time").InnerText;
            // The timestamp uses Russian month names, so it must be parsed with the Russian culture;
            // relying on the machine's current culture fails on non-Russian systems.
            CultureInfo provider = CultureInfo.GetCultureInfo("ru-RU");
            string[] formats = new string[] { "d MMMM yyyy в HH:mm", "d MMMM yyyy в HH:mm (комментарий был изменён)" };
            DateTime time = DateTime.ParseExact(timeStr, formats, provider, DateTimeStyles.AllowWhiteSpaces);

            // Normalize the score: empty means zero, a leading '+' is dropped and the
            // typographic dash is turned into an ASCII minus sign.
            if (string.IsNullOrEmpty(score))
                score = "0";
            score = score.Trim('+');
            score = score.Replace("–", "-");

            return new Comment()
            {
                id = commentId,
                ParentId = (parentId == null || parentId == "0") ? null : parentId,
                PostId = postId,
                // The sign was normalized to ASCII above, so parse independently of the machine culture.
                Score = int.Parse(score, CultureInfo.InvariantCulture),
                Time = time,
                User = user,
                Text = text,
                Stems = new List<Stem>(stems)
            };
        }