/// <summary>
/// Splits <paramref name="text"/> on whitespace, punctuation and digit characters, stems every
/// fragment with a <see cref="RussianStemmer"/>, then runs the stems through
/// <c>TextChange.JoinNo</c> and <c>TextChange.RemovePrepositions</c>.
/// </summary>
/// <param name="text">Text to tokenize.</param>
/// <returns>The array produced by <c>TextChange.RemovePrepositions</c>.</returns>
public static string[] SplitToWords(string text)
{
    var russianStemmer = new RussianStemmer();

    // Digits are delimiters too, so purely numeric fragments never become tokens.
    char[] delimiters = " ,.!\"?+-:;\n\r\t)(/0123456789".ToCharArray();
    string[] tokens = text.Split(delimiters, StringSplitOptions.RemoveEmptyEntries);

    string[] stems = Array.ConvertAll<string, string>(tokens, token => russianStemmer.Stem(token));

    return TextChange.RemovePrepositions(TextChange.JoinNo(stems));
}
/// <summary>
/// Copies the <c>I_p2</c> and <c>I_pV</c> fields from <paramref name="other"/> and then lets the
/// base class copy the rest of its state.
/// </summary>
/// <param name="other">Stemmer to copy the state from.</param>
private void copy_from(RussianStemmer other)
{
    I_p2 = other.I_p2;
    I_pV = other.I_pV;

    // An unqualified copy_from(other) binds to this very method and recurses until the stack
    // overflows, so the call is explicitly routed to the base class.
    // NOTE(review): assumes the base class declares an accessible copy_from — confirm.
    base.copy_from(other);
}
/// <summary>
/// Builds a <see cref="Post"/> from the HTML of a post page: publication date, title, hubs,
/// body (HTML and plain text), word stems with their frequencies, tags, counters, author data,
/// translation source and comments.
/// </summary>
/// <param name="postContentHtmlnode">Node the XPath queries are issued from; may be <c>null</c>.</param>
/// <returns>
/// The populated post, or <c>null</c> when <paramref name="postContentHtmlnode"/> is <c>null</c>.
/// </returns>
/// <exception cref="FormatException">
/// The publication date or one of the counters is not in the expected format.
/// </exception>
public static Post Parse(HtmlNode postContentHtmlnode)
{
    if (postContentHtmlnode == null)
    {
        return null;
    }

    // NOTE(review): every expression below starts with "//", which in XPath selects from the
    // document root rather than from the node the query is called on — confirm this is intended.

    // The format embeds the Russian word "в" and a full month name, so the text has to be parsed
    // with the Russian culture; relying on the host's current culture fails on non-Russian systems.
    CultureInfo russianCulture = CultureInfo.GetCultureInfo("ru-RU");
    string published = postContentHtmlnode.SelectSingleNode("//div[@class='published']").InnerText;
    DateTime publishedDate = DateTime.ParseExact(
        published.Trim(),
        "d MMMM yyyy в HH:mm",
        russianCulture,
        DateTimeStyles.AllowWhiteSpaces);

    string title = postContentHtmlnode.SelectSingleNode("//span[@class='post_title']").InnerText;

    var hubNodes = postContentHtmlnode.SelectNodes("//a[@class='hub ']");
    List<string> hubs = hubNodes == null
        ? new List<string>()
        : hubNodes.Select(x => x.InnerText).ToList();

    // Look the content node up once and read both representations from it.
    HtmlNode contentNode = postContentHtmlnode.SelectSingleNode("//div[@class='content html_format']");
    string postHtml = contentNode.InnerHtml;
    string postText = contentNode.InnerText;

    // One Stem entry per distinct stemmed word, carrying the number of occurrences.
    RussianStemmer stemmer = new RussianStemmer();
    List<Stem> stems = new Regex(@"\w+")
        .Matches(postText)
        .Cast<Match>()
        .Select(m => stemmer.Stem(m.Value))
        .GroupBy(s => s)
        .Select(g => new Stem() { id = Guid.NewGuid(), Word = g.Key, Frequency = g.Count() })
        .ToList();

    // A post without a tag list simply has no tags.
    HtmlNode tagList = postContentHtmlnode.SelectSingleNode("//ul[@class='tags']");
    var tagNodes = tagList == null ? null : tagList.SelectNodes("//a[@rel='tag']");
    List<string> tags = tagNodes == null
        ? new List<string>()
        : tagNodes.Select(x => x.InnerText).ToList();

    int pageViews = ReadCounter(postContentHtmlnode, "//div[@class='pageviews']");
    int favoritesCount = ReadCounter(postContentHtmlnode, "//div[@class='favs_count']");
    int commentsCount = ReadCounter(postContentHtmlnode, "//span[@id='comments_count']");

    string author = postContentHtmlnode
        .SelectSingleNode("//div[@class='author']")
        .SelectSingleNode(".//a")
        .InnerText;
    string authorRating = postContentHtmlnode.SelectSingleNode("//span[@class='rating']").InnerText;

    // The element id has the form "<prefix>_<id>"; only the part after the underscore is kept.
    string postId = postContentHtmlnode
        .SelectSingleNode("//div[@class='post shortcuts_item'] | //div[@class='post translation shortcuts_item']")
        .GetAttributeValue("id", null);
    postId = postId.Split('_')[1];

    // A post counts as a translation when it links to an original author.
    string originalAuthor = null;
    string originalSource = null;
    bool translation = false;
    var originalAuthorNode = postContentHtmlnode.SelectSingleNode("//div[@class='original-author']/a");
    if (originalAuthorNode != null)
    {
        translation = true;
        originalAuthor = originalAuthorNode.InnerText;
        originalSource = originalAuthorNode.GetAttributeValue("href", null);
    }

    List<Comment> comments = new List<Comment>();
    var commentNodes = postContentHtmlnode.SelectNodes("//div[@class='comment_item']");
    if (commentNodes != null)
    {
        foreach (var commentNode in commentNodes)
        {
            Comment comment = Comment.Parse(commentNode, postId);
            if (comment != null)
            {
                comments.Add(comment);
            }
        }
    }

    return new Post
    {
        Author = author,
        AuthorRating = authorRating,
        CommentsCount = commentsCount,
        Comments = comments,
        FavoritesCount = favoritesCount,
        Hubs = hubs,
        id = postId,
        PageViews = pageViews,
        PostHtml = postHtml,
        PostText = postText,
        Published = publishedDate,
        Tags = tags,
        Title = title,
        OriginalAuthor = originalAuthor,
        OriginalSource = originalSource,
        Translation = translation,
        Stems = stems
    };
}

/// <summary>
/// Reads an integer counter from the node selected by <paramref name="xpath"/>; a missing node
/// or blank text counts as zero.
/// </summary>
private static int ReadCounter(HtmlNode root, string xpath)
{
    HtmlNode counterNode = root.SelectSingleNode(xpath);
    string raw = counterNode == null ? string.Empty : counterNode.InnerText.Trim();
    return raw.Length == 0 ? 0 : int.Parse(raw);
}
/// <summary>
/// Re-initializes the static state: assigns new instances to <c>Eng</c>, <c>Rus</c>,
/// <c>EngStem</c>, <c>RusStem</c> and <c>Tag</c>, and stores <paramref name="t"/> in <c>ts</c>.
/// </summary>
/// <param name="t">List to store in <c>ts</c>. It is kept by reference, not copied.</param>
public static void InitParams(List<string> t)
{
    Eng = new StWdsEng();
    Rus = new StWdsRus();
    EngStem = new EnglishStemmer();
    RusStem = new RussianStemmer();

    // Previously an empty list was allocated here and immediately overwritten by this
    // assignment; the dead allocation is gone, the resulting state is the same.
    ts = t;

    Tag = new List<List<Word>>();
}
/// <summary>
/// Re-initializes the static state: assigns new instances to <c>Eng</c>, <c>Rus</c>,
/// <c>EngStem</c>, <c>RusStem</c> and <c>Tag</c>, and fills a new <c>ts</c> list with the
/// elements of <paramref name="t"/>.
/// </summary>
/// <param name="t">Items copied into the new <c>ts</c> list.</param>
public static void InitParams(string[] t)
{
    Eng = new StWdsEng();
    Rus = new StWdsRus();
    EngStem = new EnglishStemmer();
    RusStem = new RussianStemmer();

    // The array is copied, so later changes to it do not show up in ts.
    ts = new List<string>();
    ts.AddRange(t);

    Tag = new List<List<Word>>();
}
/// <summary>
/// Re-initializes the static state: assigns new instances to <c>Eng</c>, <c>Rus</c>,
/// <c>EngStem</c> and <c>RusStem</c>, stores <paramref name="t"/> in <c>ts</c>, and resets
/// <c>fullWdsCollection</c> and <c>Tag</c> to two-element arrays of empty lists.
/// </summary>
/// <param name="t">List to store in <c>ts</c>. It is kept by reference, not copied.</param>
public static void InitParams(List<string> t)
{
    Eng = new StWdsEng();
    Rus = new StWdsRus();
    EngStem = new EnglishStemmer();
    RusStem = new RussianStemmer();

    // Previously an empty list was allocated here and immediately overwritten by this
    // assignment; the dead allocation is gone, the resulting state is the same.
    ts = t;

    fullWdsCollection = new List<List<string>>[]
    {
        new List<List<string>>(),
        new List<List<string>>()
    };

    Tag = new List<List<Word>>[]
    {
        new List<List<Word>>(),
        new List<List<Word>>()
    };
}
/// <summary>
/// Trims and lower-cases <paramref name="term"/> and stems it with the stemmer that matches
/// <c>_language</c>.
/// </summary>
/// <param name="term">Term to stem; may be <c>null</c>.</param>
/// <returns>
/// The stem for English, German, Russian and French; the trimmed, lower-cased term for any other
/// language; <c>null</c> when the term is <c>null</c> or contains only white space.
/// </returns>
private string StemTerm(string term)
{
    // A null term is treated like an empty one instead of failing with a NullReferenceException.
    if (term == null)
    {
        return null;
    }

    // Invariant lower-casing keeps the result independent of the host culture
    // (under a Turkish culture "I" would otherwise become a dotless "ı").
    string massagedTerm = term.Trim().ToLowerInvariant();
    if (massagedTerm.Length == 0)
    {
        return null;
    }

    switch (_language)
    {
        case ConfigHandler.Language.English:
            return new EnglishWord(massagedTerm).Stem;

        case ConfigHandler.Language.German:
            return new GermanStemmer().Stem(massagedTerm);

        case ConfigHandler.Language.Russian:
            return new RussianStemmer().Stem(massagedTerm);

        case ConfigHandler.Language.French:
            return new FrenchStemmer().Stem(massagedTerm);

        default:
            // No stemmer for this language: hand back the normalized term.
            return massagedTerm;
    }
}
/// <summary>
/// Downloads a news article, extracts its description, text, word stems, theme, headline and
/// links, and hands the collected values to <c>repository.SaveDocument</c>.
/// </summary>
/// <param name="urlPage">
/// Address of the article page. A <c>null</c> value is reported through <c>mess</c> and ignored.
/// </param>
/// <remarks>
/// Nothing is saved when the page cannot be loaded or the article text cannot be located; both
/// situations are reported through <c>mess</c>. A missing description, theme or headline is
/// stored as an empty string.
/// </remarks>
public void ParsePage(string urlPage)
{
    if (urlPage == null)
    {
        mess.WriteMessage("Передана пустая ссылка");
        return;
    }

    Dictionary<string, object> allInformation = new Dictionary<string, object>();
    allInformation.Add("url", urlPage);

    // Start-of-article markers. The gaps in the first pattern are length-limited so that
    // unrelated comments are not picked up.
    Regex beginText = new Regex("<!--.*testcom.{0,5}news.{0,50}-->");
    Regex beginText2 = new Regex("<!-- СТАТЬЯ -->");
    // End-of-article markers.
    Regex endText = new Regex("(Ссылки по теме)|(Сайты по теме)|(<!-- social -->)");

    DateTime dateArticle = ConvertSubUriInDateTime(urlPage);
    allInformation.Add("date", dateArticle);

    HtmlDocument newsPage = GetPage(urlPage);
    if (newsPage == null)
    {
        mess.WriteMessage(String.Format("Не удалось загрузить страницу: {0}", urlPage));
        return;
    }

    allInformation.Add("articleHtml", newsPage.DocumentNode.InnerHtml);

    var text = newsPage.DocumentNode.SelectSingleNode("//td[@class='statya']");
    var title = newsPage.DocumentNode.SelectSingleNode("/html/head/title");
    var description = newsPage.DocumentNode.SelectSingleNode("/html/head/meta[1]");

    string descrTest = description == null ? "" : description.GetAttributeValue("content", "");
    allInformation.Add("description", descrTest);

    // Locate the article body between the start and end markers. The end marker is searched
    // only after the start marker, so an earlier occurrence cannot produce a negative length.
    string badText = text == null ? null : text.InnerText;
    Match matchBegin = Match.Empty;
    Match matchEnd = Match.Empty;
    int bodyStart = 0;
    if (badText != null)
    {
        matchBegin = beginText.Match(badText);
        if (!matchBegin.Success)
        {
            matchBegin = beginText2.Match(badText);
        }

        if (matchBegin.Success)
        {
            bodyStart = matchBegin.Index + matchBegin.Length;
            matchEnd = endText.Match(badText, bodyStart);
        }
    }

    if (!matchBegin.Success || !matchEnd.Success)
    {
        // Without both boundaries there is no article text to cut out, so stop here instead of
        // slicing with meaningless offsets.
        mess.WriteMessage(String.Format("Не удалось выделить основной текст: {0}", urlPage));
        return;
    }

    // Plain article text with line feeds removed.
    string articleText = badText.Substring(bodyStart, matchEnd.Index - bodyStart).Replace("\n", "");
    allInformation.Add("articleText", articleText);

    // Stem extraction: count how often every stem of a Cyrillic word occurs.
    // NOTE(review): the character class does not include "ё"/"Ё" — confirm that is acceptable.
    RussianStemmer rusStemmer = new RussianStemmer();
    Regex separators = new Regex("[А-Яа-я]+");
    Dictionary<string, int> words = new Dictionary<string, int>();
    foreach (Match item in separators.Matches(articleText))
    {
        string stem = rusStemmer.Stem(item.Value);
        int occurrences;
        words.TryGetValue(stem, out occurrences);
        words[stem] = occurrences + 1;
    }

    // Theme extraction: the theme is the part of the page title between the first two colons.
    // A title with fewer than two colons yields an empty theme.
    string pageTitle = title == null ? string.Empty : title.InnerText;
    int firstColon = pageTitle.IndexOf(':');
    int secondColon = firstColon < 0 ? -1 : pageTitle.IndexOf(':', firstColon + 1);
    string theme = secondColon < 0
        ? string.Empty
        : pageTitle.Substring(firstColon + 1, secondColon - firstColon - 1).Trim();
    allInformation.Add("theme", theme);

    // Headline extraction.
    var h2Title = newsPage.DocumentNode.SelectSingleNode("/html/body/table[@class='peredovica']/tr[1]/td[@id='pacman']/table/tr/td/h2");
    string titleText = h2Title == null ? string.Empty : h2Title.InnerText;
    allInformation.Add("title", titleText);

    // Extraction of the "related links" section.
    var linksOnTheme = newsPage.DocumentNode.SelectNodes("/html/body/table[@class='peredovica']/tr[1]/td[@id='pacman']/table/tr/td/p[@class='links']/a");
    List<string> links = new List<string>();
    if (linksOnTheme != null)
    {
        foreach (var item in linksOnTheme)
        {
            string href = item.GetAttributeValue("href", "");
            if (href != String.Empty)
            {
                links.Add(href);
            }
        }
    }

    allInformation.Add("wordBase", words);
    allInformation.Add("themeLinks", links);

    // Extraction of the links inside the article text.
    var linksInArticle = newsPage.DocumentNode.SelectNodes("/html/body/table[@class='peredovica']/tr[1]/td[@id='pacman']/table/tr/td/a");
    List<string> listLinksInArticle = new List<string>();
    if (linksInArticle != null)
    {
        foreach (var item in linksInArticle)
        {
            string href = item.GetAttributeValue("href", "");

            // Skip anchors without a target before any prefixing; otherwise an empty href
            // would be stored as the bare site address.
            if (href == String.Empty)
            {
                continue;
            }

            // Only site-relative links get the site prefix; absolute links are kept unchanged.
            bool isAbsolute = href.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                || href.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
            if (!isAbsolute)
            {
                href = String.Format("http://lenta.ru{0}", href);
            }

            listLinksInArticle.Add(href);
        }
    }

    allInformation.Add("articleLinks", listLinksInArticle);

    // Save to Mongo.
    repository.SaveDocument(allInformation);
    mess.WriteMessage(String.Format("Сохранили: {0}", urlPage));
}
/// <summary>
/// Builds a <see cref="Comment"/> from a comment node: id, parent id, user, score, text, word
/// stems with their frequencies and the time stamp.
/// </summary>
/// <param name="commentNode">Node of a single comment; may be <c>null</c>.</param>
/// <param name="postId">Identifier of the post the comment belongs to; stored as is.</param>
/// <returns>
/// The populated comment, or <c>null</c> when <paramref name="commentNode"/> is <c>null</c> or
/// has no <c>id</c> attribute.
/// </returns>
/// <exception cref="FormatException">The time stamp or the score is not in the expected format.</exception>
public static Comment Parse(HtmlNode commentNode, string postId)
{
    if (commentNode == null)
    {
        return null;
    }

    string commentId = commentNode.GetAttributeValue("id", null);
    if (commentId == null)
    {
        return null;
    }

    // The element id has the form "<prefix>_<id>"; only the part after the underscore is kept.
    commentId = commentId.Split('_')[1];

    // A comment without a parent marker is treated as a top-level comment.
    HtmlNode parentNode = commentNode.SelectSingleNode("./span[@class='parent_id']");
    string parentId = parentNode == null ? null : parentNode.GetAttributeValue("data-parent_id", null);

    string user = commentNode.SelectSingleNode("./div[@class='info ']/a[@class='username']").InnerText;
    string score = commentNode.SelectSingleNode("./div[@class='info ']/div[@class='voting ']/*/span[@class='score']").InnerText;
    string text = commentNode.SelectSingleNode("./div[contains(@class,'message html_format ')]").InnerText.Trim();

    // One Stem entry per distinct stemmed word, carrying the number of occurrences.
    RussianStemmer stemmer = new RussianStemmer();
    List<Stem> stems = new Regex(@"\w+")
        .Matches(text)
        .Cast<Match>()
        .Select(m => stemmer.Stem(m.Value))
        .GroupBy(s => s)
        .Select(g => new Stem() { id = Guid.NewGuid(), Word = g.Key, Frequency = g.Count() })
        .ToList();

    // The formats embed Russian words and a full month name, so the text has to be parsed with
    // the Russian culture; relying on the host's current culture fails on non-Russian systems.
    string timeStr = commentNode.SelectSingleNode("./div[@class='info ']/time").InnerText;
    string[] formats = new string[]
    {
        "d MMMM yyyy в HH:mm",
        "d MMMM yyyy в HH:mm (комментарий был изменён)"
    };
    DateTime time = DateTime.ParseExact(
        timeStr,
        formats,
        CultureInfo.GetCultureInfo("ru-RU"),
        DateTimeStyles.AllowWhiteSpaces);

    // Normalize the score to a plain ASCII number: blank means zero, a leading "+" is dropped
    // and the en dash used as a minus sign becomes "-".
    score = score.Trim();
    if (score.Length == 0)
    {
        score = "0";
    }

    score = score.Trim('+');
    score = score.Replace("–", "-");

    return new Comment()
    {
        id = commentId,
        ParentId = (parentId == null || parentId == "0") ? null : parentId,
        PostId = postId,
        // The text now uses ASCII signs only, so it is parsed culture-independently.
        Score = int.Parse(score, CultureInfo.InvariantCulture),
        Time = time,
        User = user,
        Text = text,
        Stems = stems
    };
}