/// <summary>
/// Parses a small document that contains an anchor with text and checks
/// that the parser's result is a tag node.
/// </summary>
public void CaseA()
{
    const string markup = "<html><body><a href=\"#1\">foo</a></body></html>";

    var root = new HtmlParser().Parse(markup);

    Assert.That(root is HtmlNode.Tag);
}
/// <summary>
/// Reports whether the page's title marks it as the weekly edition.
/// </summary>
/// <param name="htmlText">HTML of the page to inspect.</param>
/// <returns>
/// <c>true</c> when the text of the first child of the first
/// <c>h1</c> whose <c>class</c> is "title" matches the weekly-edition pattern;
/// <c>false</c> otherwise, including when any exception is thrown while
/// parsing or navigating the document.
/// </returns>
public static bool isWeeklyNewspaper(string htmlText)
{
    try
    {
        // Only pages whose title contains the word "Еженедельник" are of interest.
        var parser = new HtmlParser();
        var node = parser.Parse(htmlText);

        // Compare the class attribute itself: ContainsValue("title") would also
        // accept an h1 where some *other* attribute happens to equal "title".
        var finder = new FindTagsVisitor(tag =>
            tag.Name == "h1"
            && tag.Attributes.ContainsKey("class")
            && tag.Attributes["class"] == "title");
        node.AcceptVisitor(finder);

        // A missing h1 or a non-text first child throws and is handled below.
        var titleNode = (Majestic13.HtmlNode.Text)finder.Result[0].Children[0];
        var title = titleNode.Value;

        var regEx = new Regex(@"(Еженедельник \""Аргументы и Факты\"" №)");
        if (regEx.Match(title).Success)
        {
            LogServices.WriteProgressLog("it is weekly ");
            return true;
        }

        return false;
    }
    catch (Exception)
    {
        // Best effort: any failure is logged and treated as "not weekly".
        LogServices.WriteProgressLog("исключение");
        return false;
    }
}
/// <summary>
/// Parses a document whose only content is a self-closing <c>br</c> and
/// checks that the parser's result is a tag node.
/// </summary>
public void CaseB()
{
    const string markup = "<html><br/></html>";

    var root = new HtmlParser().Parse(markup);

    Assert.That(root is HtmlNode.Tag);
}
/// <summary>
/// Parses the HTML file stored at <c>Resources\356.html</c> and checks
/// that the parser's result is a tag node.
/// </summary>
public void CaseC()
{
    var markup = File.ReadAllText(@"Resources\356.html");

    var root = new HtmlParser().Parse(markup);

    Assert.That(root is HtmlNode.Tag);
}
/// <summary>
/// Extracts the title text of a Q&amp;A page.
/// </summary>
/// <param name="htmlText">HTML of the page.</param>
/// <returns>
/// The value of the text node that is the first child of the first
/// <c>h1</c> whose <c>class</c> is "title".
/// </returns>
/// <remarks>
/// Nothing is guarded here: if no such <c>h1</c> exists, it has no children,
/// or its first child is not a text node, the resulting exception propagates.
/// </remarks>
public static string getQATitle(string htmlText)
{
    var parser = new HtmlParser();
    var node = parser.Parse(htmlText);

    // Compare the class attribute itself: ContainsValue("title") would also
    // accept an h1 where some *other* attribute happens to equal "title".
    var finder = new FindTagsVisitor(tag =>
        tag.Name == "h1"
        && tag.Attributes.ContainsKey("class")
        && tag.Attributes["class"] == "title");
    node.AcceptVisitor(finder);

    var titleText = (Majestic13.HtmlNode.Text)finder.Result[0].Children[0];
    return titleText.Value;
}
/// <summary>
/// Reads the publication timestamp of a Q&amp;A page.
/// </summary>
/// <param name="htmlText">HTML of the page.</param>
/// <returns>
/// The text of the first child of the first <c>time</c> element, parsed
/// with the exact format "HH:mm dd/MM/yyyy".
/// </returns>
/// <remarks>
/// Nothing is guarded here: a missing <c>time</c> element, a non-text first
/// child, or text in a different format results in an exception.
/// </remarks>
public static DateTime getQADate(string htmlText)
{
    var root = new HtmlParser().Parse(htmlText);

    var timeTags = new FindTagsVisitor(t => t.Name == "time");
    root.AcceptVisitor(timeTags);

    var firstChild = timeTags.Result[0].Children[0];
    var stamp = ((Majestic13.HtmlNode.Text)firstChild).Value;

    // NOTE(review): a null provider means the current culture is used, and
    // '/' and ':' in the format are culture-sensitive separators — confirm
    // this parses as intended on machines with a non-English culture.
    return DateTime.ParseExact(stamp, "HH:mm dd/MM/yyyy", null);
}
/// <summary>
/// Extracts the text content of a Q&amp;A page.
/// </summary>
/// <param name="htmlText">HTML of the page.</param>
/// <returns>
/// Whatever <c>RequestServices.ContinueUntilOnlyTextLeft</c> produces for the
/// first <c>div</c> whose <c>class</c> is "vo_o_text", starting from an
/// empty string.
/// </returns>
/// <remarks>
/// If no such <c>div</c> exists the resulting exception propagates.
/// </remarks>
public static string getQAContent(string htmlText)
{
    var parser = new HtmlParser();
    var node = parser.Parse(htmlText);

    // Compare the class attribute itself: ContainsValue("vo_o_text") would also
    // accept a div where some *other* attribute happens to have that value.
    var finder = new FindTagsVisitor(tag =>
        tag.Name == "div"
        && tag.Attributes.ContainsKey("class")
        && tag.Attributes["class"] == "vo_o_text");
    node.AcceptVisitor(finder);

    var contentTag = finder.Result[0];
    var content = RequestServices.ContinueUntilOnlyTextLeft(contentTag, "");

    LogServices.WriteProgressLog("Fetched content of article ");
    return content;
}
/// <summary>
/// Extracts the edition name from the authors box of a Q&amp;A page.
/// </summary>
/// <param name="htmlText">HTML of the page.</param>
/// <returns>
/// The text of the first child of the fourth child (index 3) of the
/// second-to-last child of the first <c>div</c> whose <c>class</c> is
/// "autors_box".
/// </returns>
/// <remarks>
/// The lookup is purely positional and unguarded: any deviation from the
/// expected structure results in an exception.
/// </remarks>
public static string getQAEditionName(string htmlText)
{
    var parser = new HtmlParser();
    var node = parser.Parse(htmlText);

    // Compare the class attribute itself: ContainsValue("autors_box") would also
    // accept a div where some *other* attribute happens to have that value.
    var finder = new FindTagsVisitor(tag =>
        tag.Name == "div"
        && tag.Attributes.ContainsKey("class")
        && tag.Attributes["class"] == "autors_box");
    node.AcceptVisitor(finder);

    var authorsBox = finder.Result[0];

    // Count - 2 is the second-to-last child, not the last one.
    var secondToLast = authorsBox.Children.Count - 2;
    var divTag = (Majestic13.HtmlNode.Tag)authorsBox.Children[secondToLast];
    var aTag = (Majestic13.HtmlNode.Tag)divTag.Children[3];
    var name = (Majestic13.HtmlNode.Text)aTag.Children[0];
    return name.Value;
}
/// <summary>
/// Extracts the author name from the authors box of a Q&amp;A page.
/// </summary>
/// <param name="htmlText">HTML of the page.</param>
/// <returns>
/// The text of the first child of the authors box's third child (index 2)
/// when the box's second child (index 1) carries an attribute whose value is
/// "icon autors_icon"; otherwise the default "Автор не известен".
/// </returns>
/// <remarks>
/// The lookup is positional and unguarded: a missing authors box or an
/// unexpected child layout results in an exception.
/// </remarks>
public static string getQAAuthor(string htmlText)
{
    var author = "Автор не известен";

    var parser = new HtmlParser();
    var node = parser.Parse(htmlText);

    // Compare the class attribute itself: ContainsValue("autors_box") would also
    // accept a div where some *other* attribute happens to have that value.
    var authorFieldFinder = new FindTagsVisitor(tag =>
        tag.Name == "div"
        && tag.Attributes.ContainsKey("class")
        && tag.Attributes["class"] == "autors_box");
    node.AcceptVisitor(authorFieldFinder);

    var authorsBox = authorFieldFinder.Result[0];
    var iconTag = (Majestic13.HtmlNode.Tag)authorsBox.Children[1];

    // NOTE(review): this checks the value of *any* attribute; presumably the
    // class attribute is meant — confirm before tightening.
    if (iconTag.Attributes.ContainsValue("icon autors_icon"))
    {
        var nameTag = (Majestic13.HtmlNode.Tag)authorsBox.Children[2];
        var authorName = (Majestic13.HtmlNode.Text)nameTag.Children[0];
        author = authorName.Value;
    }

    return author;
}
/// <summary>
/// Parses a GitHub issue page and returns the issue's title and body.
/// </summary>
/// <param name="html">HTML of the issue page.</param>
/// <returns>
/// <c>null</c> when <paramref name="html"/> is null or empty; otherwise the
/// title, a line break, and the body. Either part is an empty string when its
/// element or text node is not found.
/// </returns>
private string GetIssueTitleAndDescription(string html)
{
    if (string.IsNullOrEmpty(html))
    {
        return null;
    }

    var root = new HtmlParser().Parse(html);

    // Title: first child of the first matching h2, if that child is text.
    var titleFinder = new FindTagsVisitor(t =>
        t.Name == "h2"
        && t.Attributes.ContainsKey("class")
        && t.Attributes["class"] == "discussion-topic-title");
    root.AcceptVisitor(titleFinder);

    var title = string.Empty;
    if (titleFinder.Result != null && titleFinder.Result.Count > 0)
    {
        var titleText = titleFinder.Result.First().Children.FirstOrDefault() as HtmlNode.Text;
        if (titleText != null)
        {
            title = titleText.Value;
        }
    }

    // Body: first text node inside the first <p> directly under the comment div.
    var bodyFinder = new FindTagsVisitor(t =>
        t.Name == "div"
        && t.Attributes.ContainsKey("class")
        && t.Attributes["class"] == "js-comment-body comment-body markdown-body markdown-format");
    root.AcceptVisitor(bodyFinder);

    var body = string.Empty;
    if (bodyFinder.Result != null && bodyFinder.Result.Count > 0)
    {
        var paragraph = bodyFinder.Result.First().Children
            .OfType<HtmlNode.Tag>()
            .FirstOrDefault(t => t.Name == "p");
        if (paragraph != null)
        {
            var bodyText = paragraph.Children.OfType<HtmlNode.Text>().FirstOrDefault();
            if (bodyText != null)
            {
                body = bodyText.Value;
            }
        }
    }

    return string.Concat(title, Environment.NewLine, body);
}
/// <summary>
/// Extracts the title of an article page.
/// </summary>
/// <param name="htmlText">HTML of the page.</param>
/// <returns>
/// The text of the first child of the first <c>h1</c> whose <c>class</c> is
/// "material_title increase_text"; "Без заголовка" when no such <c>h1</c>
/// exists or it has no children; a fixed error message when any exception
/// is thrown.
/// </returns>
public static string getArticleTitle(string htmlText)
{
    try
    {
        var title = "Без заголовка";

        var parser = new HtmlParser();
        var node = parser.Parse(htmlText);

        // Compare the class attribute itself: ContainsValue(...) would also accept
        // an h1 where some *other* attribute happens to have that value.
        var titleFinder = new FindTagsVisitor(tag =>
            tag.Name == "h1"
            && tag.Attributes.ContainsKey("class")
            && tag.Attributes["class"] == "material_title increase_text");
        node.AcceptVisitor(titleFinder);

        if (titleFinder.Result.Count > 0 && titleFinder.Result[0].Children.Count > 0)
        {
            // A non-text first child throws and is handled by the catch below.
            var titleText = (Majestic13.HtmlNode.Text)titleFinder.Result[0].Children[0];
            title = titleText.Value;
        }

        return title;
    }
    catch (Exception)
    {
        return "Название не может быть загружено из-за исключения";
    }
}
/// <summary>
/// Reads the publication timestamp of an article page.
/// </summary>
/// <param name="htmlText">HTML of the page.</param>
/// <returns>
/// The date text parsed with the exact format "HH:mm dd/MM/yyyy". The text is
/// taken from the first <c>time</c> element when one exists, otherwise from
/// the second child of the second child of the first <c>div</c> whose
/// <c>class</c> is "material_topline_info". Returns <c>DateTime.Now</c> when
/// the text contains "сегодня" or when any exception is thrown.
/// </returns>
public static DateTime getArticleDate(string htmlText)
{
    try
    {
        // Both branches below assign the value, so no placeholder is needed.
        string date;

        var parser = new HtmlParser();
        var node = parser.Parse(htmlText);

        var dateFieldFinder = new FindTagsVisitor(tag => tag.Name == "time");
        node.AcceptVisitor(dateFieldFinder);

        if (dateFieldFinder.Result.Count > 0)
        {
            var timeTag = (Majestic13.HtmlNode.Tag)dateFieldFinder.Result[0];
            var text = (Majestic13.HtmlNode.Text)timeTag.Children[0];
            date = text.Value;
        }
        else
        {
            // Compare the class attribute itself: ContainsValue(...) would also accept
            // a div where some *other* attribute happens to have that value.
            var toplineFinder = new FindTagsVisitor(tag =>
                tag.Name == "div"
                && tag.Attributes.ContainsKey("class")
                && tag.Attributes["class"] == "material_topline_info");
            node.AcceptVisitor(toplineFinder);

            var spanTag = (Majestic13.HtmlNode.Tag)toplineFinder.Result[0].Children[1];
            var text = (Majestic13.HtmlNode.Text)spanTag.Children[1];
            date = text.Value;
        }

        if (date.Contains("сегодня"))
        {
            return DateTime.Now;
        }

        // NOTE(review): a null provider means the current culture is used, and
        // '/' and ':' in the format are culture-sensitive separators — confirm
        // this parses as intended on machines with a non-English culture.
        return DateTime.ParseExact(date, "HH:mm dd/MM/yyyy", null);
    }
    catch (Exception)
    {
        // Best effort: any failure falls back to the current time.
        return DateTime.Now;
    }
}
/// <summary>
/// Extracts the text content of an article page.
/// </summary>
/// <param name="htmlText">HTML of the page.</param>
/// <returns>
/// Whatever <c>ContinueUntilOnlyTextLeft</c> produces for the first
/// <c>article</c> element, starting from an empty string; a fixed error
/// message when any exception is thrown (for example when the page has no
/// <c>article</c> element).
/// </returns>
public static string getArticleContent(string htmlText)
{
    try
    {
        var parser = new HtmlParser();
        var node = parser.Parse(htmlText);

        var contentFinder = new FindTagsVisitor(tag => tag.Name == "article");
        node.AcceptVisitor(contentFinder);

        // Take the match directly; the original first allocated an empty Tag
        // that was immediately overwritten.
        var articleTag = (Majestic13.HtmlNode.Tag)contentFinder.Result[0];
        var content = ContinueUntilOnlyTextLeft(articleTag, "");

        LogServices.WriteProgressLog("Fetched content of article ");
        return content;
    }
    catch (Exception)
    {
        return "Содержимое не может быть загружено из-за исключения";
    }
}
/// <summary>
/// Gets the plain text of an HTML document.
/// </summary>
/// <param name="html">HTML to parse.</param>
/// <returns>
/// The string accumulated by <c>GenerateRawText</c> for the parsed document.
/// </returns>
private string GetRawText(string html)
{
    var root = new HtmlParser().Parse(html);

    var text = new StringBuilder();
    GenerateRawText(root, text);

    return text.ToString();
}
/// <summary>
/// Processes the given HTML and returns the processed HTML.
/// Per the original author's note, the result contains no style, css, or
/// script content, nor certain other invalid HTML tags; that filtering is
/// done by <c>ProcessBodyElement</c>, which is not part of this method.
/// </summary>
/// <param name="html">HTML to process.</param>
/// <returns>
/// <c>null</c> when <paramref name="html"/> is null or empty; otherwise the
/// output accumulated from every <c>body</c> element found in the document.
/// </returns>
private string GetAbbreviationHtml(string html)
{
    if (string.IsNullOrEmpty(html))
    {
        return null;
    }

    // Parse the document into a tree.
    var root = new HtmlParser().Parse(html);

    // Collect the body elements of the document.
    var bodyFinder = new FindTagsVisitor(tag => tag.Name == "body");
    root.AcceptVisitor(bodyFinder);

    // Process every body element that was found, appending to one buffer.
    var output = new StringBuilder();
    foreach (var body in bodyFinder.Result)
    {
        ProcessBodyElement(body as HtmlNode.Tag, output);
    }

    return output.ToString();
}
/// <summary>
/// Extracts the author name(s) of an article page.
/// </summary>
/// <param name="htmlText">HTML of the page.</param>
/// <returns>
/// When a <c>div</c> whose <c>class</c> is "material_topline_info" exists: the
/// texts of all <c>a</c> elements with an <c>href</c> found under its fourth
/// child (index 3), each preceded by a single space, or "Автор не известен"
/// when there are none. Otherwise: the text of the first child of the third
/// child (index 2) of the first <c>div</c> whose <c>class</c> is "autors_box".
/// A fixed error message is returned when any exception is thrown.
/// </returns>
public static string getArticleAuthor(string htmlText)
{
    try
    {
        var author = "Автор не известен";

        var parser = new HtmlParser();
        var node = parser.Parse(htmlText);

        // Compare the class attribute itself: ContainsValue(...) would also accept
        // a div where some *other* attribute happens to have that value.
        var authorFieldFinder = new FindTagsVisitor(tag =>
            tag.Name == "div"
            && tag.Attributes.ContainsKey("class")
            && tag.Attributes["class"] == "material_topline_info");
        node.AcceptVisitor(authorFieldFinder);

        if (authorFieldFinder.Result.Count > 0)
        {
            var spanTag = (Majestic13.HtmlNode.Tag)authorFieldFinder.Result[0].Children[3];

            var authorFinder = new FindTagsVisitor(tag => tag.Name == "a" && tag.Attributes.ContainsKey("href"));
            spanTag.AcceptVisitor(authorFinder);

            if (authorFinder.Result.Count > 0)
            {
                // Each name is prefixed with a space, so the result starts with
                // one; kept as is because callers may rely on the exact string.
                author = "";
                for (int i = 0; i < authorFinder.Result.Count; i++)
                {
                    var authorText = (Majestic13.HtmlNode.Text)authorFinder.Result[i].Children[0];
                    author = author + " " + authorText.Value;
                }
            }
        }
        else
        {
            // Fallback layout: same class-attribute comparison as above.
            var authorsBoxFinder = new FindTagsVisitor(tag =>
                tag.Name == "div"
                && tag.Attributes.ContainsKey("class")
                && tag.Attributes["class"] == "autors_box");
            node.AcceptVisitor(authorsBoxFinder);

            var aTag = (Majestic13.HtmlNode.Tag)authorsBoxFinder.Result[0].Children[2];
            var aText = (Majestic13.HtmlNode.Text)aTag.Children[0];
            author = aText.Value;
        }

        return author;
    }
    catch (Exception)
    {
        return "Автор не может быть загружен из-за исключения";
    }
}
/// <summary>
/// Extracts the edition name of an article page.
/// </summary>
/// <param name="htmlText">HTML of the page.</param>
/// <returns>
/// When a <c>div</c> whose <c>class</c> is "newspaper_new" exists: the text of
/// the first child of its fourth child (index 3). Otherwise: the text of the
/// first child of the fourth child (index 3) of the <em>second</em>
/// <c>span</c> whose <c>class</c> is "artic_num_box". A fixed error message is
/// returned when any exception is thrown.
/// </returns>
public static string getEditionName(string htmlText)
{
    try
    {
        var parser = new HtmlParser();
        var node = parser.Parse(htmlText);

        // Compare the class attribute itself: ContainsValue(...) would also accept
        // a div where some *other* attribute happens to have that value.
        var edNameFinder = new FindTagsVisitor(tag =>
            tag.Name == "div"
            && tag.Attributes.ContainsKey("class")
            && tag.Attributes["class"] == "newspaper_new");
        node.AcceptVisitor(edNameFinder);

        if (edNameFinder.Result.Count > 0)
        {
            var hrefTag = (Majestic13.HtmlNode.Tag)edNameFinder.Result[0].Children[3];
            var edNameText = (Majestic13.HtmlNode.Text)hrefTag.Children[0];
            return edNameText.Value;
        }

        // Fallback layout: same class-attribute comparison as above.
        var numBoxFinder = new FindTagsVisitor(tag =>
            tag.Name == "span"
            && tag.Attributes.ContainsKey("class")
            && tag.Attributes["class"] == "artic_num_box");
        node.AcceptVisitor(numBoxFinder);

        // Index 1: the second matching span is the one used here.
        var aTag = (Majestic13.HtmlNode.Tag)numBoxFinder.Result[1].Children[3];
        var fallbackText = (Majestic13.HtmlNode.Text)aTag.Children[0];
        return fallbackText.Value;
    }
    catch (Exception)
    {
        return "Название не может быть загружено из-за исключения";
    }
}
/// <summary>
/// Collects article links from a newspaper index page.
/// </summary>
/// <param name="htmlNewspaperText">HTML of the index page.</param>
/// <returns>
/// For every <c>h2</c> whose <c>class</c> is "data_title mbottom10", the
/// <c>href</c> of the first <c>a</c> element (with an <c>href</c>) found under
/// it, in document-visit order. Returns <c>null</c> when any exception is
/// thrown, for example when one of those headings contains no link.
/// </returns>
public static List<string> extractLinks(string htmlNewspaperText)
{
    try
    {
        var links = new List<string>();

        var parser = new HtmlParser();
        var node = parser.Parse(htmlNewspaperText);

        // Compare the class attribute itself: ContainsValue(...) would also accept
        // an h2 where some *other* attribute happens to have that value.
        var areaFinder = new FindTagsVisitor(tag =>
            tag.Name == "h2"
            && tag.Attributes.ContainsKey("class")
            && tag.Attributes["class"] == "data_title mbottom10");
        node.AcceptVisitor(areaFinder);

        foreach (Majestic13.HtmlNode.Tag heading in areaFinder.Result)
        {
            var linkFinder = new FindTagsVisitor(t => t.Name == "a" && t.Attributes.ContainsKey("href"));
            heading.AcceptVisitor(linkFinder);

            // Read href explicitly. Taking the first attribute value returned
            // whatever attribute happened to come first (e.g. class or title).
            var link = linkFinder.Result[0].Attributes["href"];
            links.Add(link.ToString());
        }

        LogServices.WriteProgressLog("extracted links");
        return links;
    }
    catch (Exception)
    {
        // Callers receive null on failure; kept for backward compatibility.
        return null;
    }
}