public static bool isWeeklyNewspaper(string htmlText) { try { // if containts the word ejenedelnik we need this var parser = new HtmlParser(); var node = parser.Parse(htmlText); var finder = new FindTagsVisitor(tag => tag.Name == "h1" && tag.Attributes.ContainsKey("class") && tag.Attributes.ContainsValue("title")); node.AcceptVisitor(finder); var titleNode = (Majestic13.HtmlNode.Text)finder.Result[0].Children[0]; var title = titleNode.Value; Regex regEx = new Regex(@"(Еженедельник \""Аргументы и Факты\"" №)"); Match m = regEx.Match(title); if (m.Success) { LogServices.WriteProgressLog("it is weekly "); return(true); } else { return(false); } } catch (Exception) { LogServices.WriteProgressLog("исключение"); return(false); } }
public static List <string> extractLinks(string htmlNewspaperText) { try { var links = new List <string>(); var parser = new HtmlParser(); var node = parser.Parse(htmlNewspaperText); var areaFinder = new FindTagsVisitor(tag => tag.Name == "h2" && tag.Attributes.ContainsKey("class") && tag.Attributes.ContainsValue("data_title mbottom10")); node.AcceptVisitor(areaFinder); var linksarea = areaFinder.Result; foreach (Majestic13.HtmlNode.Tag tag in linksarea) { var linkFinder = new FindTagsVisitor(t => t.Name == "a" && t.Attributes.ContainsKey("href")); tag.AcceptVisitor(linkFinder); var link = linkFinder.Result[0].Attributes.Values.FirstOrDefault(); links.Add(link.ToString()); } LogServices.WriteProgressLog("extracted links"); return(links); } catch (Exception) { return(null); } }
public static string getEditionName(string htmlText) { try { var parser = new HtmlParser(); var node = parser.Parse(htmlText); var edNameFinder = new FindTagsVisitor(tag => tag.Name == "div" && tag.Attributes.ContainsKey("class") && tag.Attributes.ContainsValue("newspaper_new")); node.AcceptVisitor(edNameFinder); if (edNameFinder.Result.Count > 0) { var hrefTag = (Majestic13.HtmlNode.Tag)edNameFinder.Result[0].Children[3]; var edNameTag = (Majestic13.HtmlNode.Text)hrefTag.Children[0]; return(edNameTag.Value); } else { var edNameFinder1 = new FindTagsVisitor(tag => tag.Name == "span" && tag.Attributes.ContainsKey("class") && tag.Attributes.ContainsValue("artic_num_box")); node.AcceptVisitor(edNameFinder1); var aTag = (Majestic13.HtmlNode.Tag)edNameFinder1.Result[1].Children[3]; var edNameTag = (Majestic13.HtmlNode.Text)aTag.Children[0]; return(edNameTag.Value); } } catch (Exception) { return("Название не может быть загружено из-за исключения"); } }
public static DateTime getQADate(string htmlText) { var parser = new HtmlParser(); var node = parser.Parse(htmlText); var finder = new FindTagsVisitor(tag => tag.Name == "time"); node.AcceptVisitor(finder); var dateTag = (Majestic13.HtmlNode.Text)finder.Result[0].Children[0]; var datestring = dateTag.Value; return(DateTime.ParseExact(datestring, "HH:mm dd/MM/yyyy", null)); }
public static string getQATitle(string htmlText) { var parser = new HtmlParser(); var node = parser.Parse(htmlText); var finder = new FindTagsVisitor(tag => tag.Name == "h1" && tag.Attributes.ContainsKey("class") && tag.Attributes.ContainsValue("title")); node.AcceptVisitor(finder); var titleTag = (Majestic13.HtmlNode.Text)finder.Result[0].Children[0]; var title = titleTag.Value; return(title); }
public static string getQAEditionName(string htmlText) { var parser = new HtmlParser(); var node = parser.Parse(htmlText); var finder = new FindTagsVisitor(tag => tag.Name == "div" && tag.Attributes.ContainsKey("class") && tag.Attributes.ContainsValue("autors_box")); node.AcceptVisitor(finder); var lastIndex = finder.Result[0].Children.Count - 2; var divTag = (Majestic13.HtmlNode.Tag)finder.Result[0].Children[lastIndex]; var aTag = (Majestic13.HtmlNode.Tag)divTag.Children[3]; var name = (Majestic13.HtmlNode.Text)aTag.Children[0]; return(name.Value); }
public static string getQAContent(string htmlText) { var content = ""; var parser = new HtmlParser(); var node = parser.Parse(htmlText); var finder = new FindTagsVisitor(tag => tag.Name == "div" && tag.Attributes.ContainsKey("class") && tag.Attributes.ContainsValue("vo_o_text")); node.AcceptVisitor(finder); var contextTag = finder.Result[0]; content = RequestServices.ContinueUntilOnlyTextLeft(contextTag, content); LogServices.WriteProgressLog("Fetched content of article "); return(content); }
public static string getArticleAuthor(string htmlText) { try { var author = "Автор не известен"; var parser = new HtmlParser(); var node = parser.Parse(htmlText); var authorFieldFinder = new FindTagsVisitor(tag => tag.Name == "div" && tag.Attributes.ContainsKey("class") && tag.Attributes.ContainsValue("material_topline_info")); node.AcceptVisitor(authorFieldFinder); if (authorFieldFinder.Result.Count > 0) { var spanTag = (Majestic13.HtmlNode.Tag)authorFieldFinder.Result[0].Children[3]; var authorFinder = new FindTagsVisitor(tag => tag.Name == "a" && tag.Attributes.ContainsKey("href")); spanTag.AcceptVisitor(authorFinder); if (authorFinder.Result.Count > 0) { author = ""; for (int i = 0; i < authorFinder.Result.Count; i++) { var authorText = (Majestic13.HtmlNode.Text)authorFinder.Result[i].Children[0]; var authorToBeAdded = authorText.Value; author = author + " " + authorToBeAdded; } } } else { var authorFieldFinder2 = new FindTagsVisitor(tag => tag.Name == "div" && tag.Attributes.ContainsKey("class") && tag.Attributes.ContainsValue("autors_box")); node.AcceptVisitor(authorFieldFinder2); var aTag = (Majestic13.HtmlNode.Tag)authorFieldFinder2.Result[0].Children[2]; var aText = (Majestic13.HtmlNode.Text)aTag.Children[0]; author = aText.Value; } return(author); } catch (Exception) { return("Автор не может быть загружен из-за исключения"); } }
public static string getQAAuthor(string htmlText) { var author = "Автор не известен"; var parser = new HtmlParser(); var node = parser.Parse(htmlText); var authorFieldFinder = new FindTagsVisitor(tag => tag.Name == "div" && tag.Attributes.ContainsKey("class") && tag.Attributes.ContainsValue("autors_box")); node.AcceptVisitor(authorFieldFinder); var authorTag = (Majestic13.HtmlNode.Tag)authorFieldFinder.Result[0].Children[1]; if (authorTag.Attributes.ContainsValue("icon autors_icon")) { var autorTag = (Majestic13.HtmlNode.Tag)authorFieldFinder.Result[0].Children[2]; var authorName = (Majestic13.HtmlNode.Text)autorTag.Children[0]; author = authorName.Value; } return(author); }
public static string getArticleContent(string htmlText) { try { var content = ""; var parser = new HtmlParser(); var node = parser.Parse(htmlText); var contentFinder = new FindTagsVisitor(tag => tag.Name == "article"); node.AcceptVisitor(contentFinder); var resultTag = new Majestic13.HtmlNode.Tag(); resultTag = (Majestic13.HtmlNode.Tag)contentFinder.Result[0]; content = ContinueUntilOnlyTextLeft(resultTag, content); LogServices.WriteProgressLog("Fetched content of article "); return(content); } catch (Exception) { return("Содержимое не может быть загружено из-за исключения"); } }
public static string getArticleTitle(string htmlText) { try { var title = "Без заголовка"; var parser = new HtmlParser(); var node = parser.Parse(htmlText); var titleFinder = new FindTagsVisitor(tag => tag.Name == "h1" && tag.Attributes.ContainsKey("class") && tag.Attributes.ContainsValue("material_title increase_text")); node.AcceptVisitor(titleFinder); if (titleFinder.Result.Count > 0 && titleFinder.Result[0].Children.Count > 0) { var titleTag = (Majestic13.HtmlNode.Text)titleFinder.Result[0].Children[0]; title = titleTag.Value; } return(title); } catch (Exception) { return("Название не может быть загружено из-за исключения"); } }
public static DateTime getArticleDate(string htmlText) { try { var date = "Date not known"; var parser = new HtmlParser(); var node = parser.Parse(htmlText); var dateFieldFinder = new FindTagsVisitor(tag => tag.Name == "time"); node.AcceptVisitor(dateFieldFinder); if (dateFieldFinder.Result.Count > 0) { var timeTag = (Majestic13.HtmlNode.Tag)dateFieldFinder.Result[0]; var text = (Majestic13.HtmlNode.Text)timeTag.Children[0]; date = text.Value; } else { var dateFieldFinder1 = new FindTagsVisitor(tag => tag.Name == "div" && tag.Attributes.ContainsKey("class") && tag.Attributes.ContainsValue("material_topline_info")); node.AcceptVisitor(dateFieldFinder1); var spanTag = (Majestic13.HtmlNode.Tag)dateFieldFinder1.Result[0].Children[1]; var text = (Majestic13.HtmlNode.Text)spanTag.Children[1]; date = text.Value; } if (date.Contains("сегодня")) { return(DateTime.Now); } else { return(DateTime.ParseExact(date, "HH:mm dd/MM/yyyy", null)); } } catch (Exception) { return(DateTime.Now); } }