예제 #1
0
 /// <summary>
 /// Parses a small document containing an anchor inside a body and asserts
 /// that the parser's result is an <c>HtmlNode.Tag</c>.
 /// </summary>
 public void CaseA()
 {
     const string markup = "<html><body><a href=\"#1\">foo</a></body></html>";

     var root = new HtmlParser().Parse(markup);
     bool rootIsTag = root is HtmlNode.Tag;

     Assert.That(rootIsTag);
 }
예제 #2
0
        /// <summary>
        /// Checks whether the title heading of a page matches the weekly-newspaper pattern.
        /// </summary>
        /// <param name="htmlText">HTML of the page to inspect.</param>
        /// <returns>
        /// <c>true</c> when the text of the first <c>h1</c> with <c>class="title"</c> matches the
        /// pattern; <c>false</c> otherwise, including when parsing or the lookup throws.
        /// </returns>
        public static bool isWeeklyNewspaper(string htmlText) {
            try
            {
                var parser = new HtmlParser();
                var node = parser.Parse(htmlText);
                // Compare the class attribute itself: ContainsValue("title") would also accept
                // a tag whose class is something else but which has another attribute equal to "title".
                var finder = new FindTagsVisitor(tag => tag.Name == "h1" && tag.Attributes.ContainsKey("class") && tag.Attributes["class"] == "title");
                node.AcceptVisitor(finder);
                // No bounds checks here on purpose: a missing heading or a non-text first child
                // throws and is reported as "not weekly" by the catch block below.
                var titleNode = (Majestic13.HtmlNode.Text)finder.Result[0].Children[0];
                var title = titleNode.Value;
                Regex regEx = new Regex(@"(Еженедельник \""Аргументы и Факты\"" №)");

                if (regEx.IsMatch(title))
                {
                    LogServices.WriteProgressLog("it is weekly ");
                    return true;
                }

                return false;
            }
            catch (Exception)
            {
                LogServices.WriteProgressLog("исключение");
                return false;
            }
        }
예제 #3
0
 /// <summary>
 /// Parses a document whose only content is a self-closing <c>br</c> and asserts
 /// that the parser's result is an <c>HtmlNode.Tag</c>.
 /// </summary>
 public void CaseB()
 {
     const string markup = "<html><br/></html>";

     var root = new HtmlParser().Parse(markup);
     bool rootIsTag = root is HtmlNode.Tag;

     Assert.That(rootIsTag);
 }
예제 #4
0
 /// <summary>
 /// Parses the HTML stored in the <c>Resources\356.html</c> file and asserts
 /// that the parser's result is an <c>HtmlNode.Tag</c>.
 /// </summary>
 public void CaseC()
 {
     string markup = File.ReadAllText(@"Resources\356.html");

     var root = new HtmlParser().Parse(markup);
     bool rootIsTag = root is HtmlNode.Tag;

     Assert.That(rootIsTag);
 }
예제 #5
0
 /// <summary>
 /// Returns the text of the first <c>h1</c> heading whose <c>class</c> attribute is <c>title</c>.
 /// </summary>
 /// <param name="htmlText">HTML of the page to read.</param>
 /// <returns>The value of the heading's first child, which must be a text node.</returns>
 public static string getQATitle(string htmlText) {
     var parser = new HtmlParser();
     var node = parser.Parse(htmlText);
     // Compare the class attribute itself: ContainsValue("title") would also accept
     // a tag whose class is something else but which has another attribute equal to "title".
     var finder = new FindTagsVisitor(tag => tag.Name == "h1" && tag.Attributes.ContainsKey("class")
         && tag.Attributes["class"] == "title");
     node.AcceptVisitor(finder);
     // No bounds checks: the indexers and the cast fail if no heading matched
     // or if its first child is not a text node.
     var titleTag = (Majestic13.HtmlNode.Text)finder.Result[0].Children[0];
     return titleTag.Value;
 }
예제 #6
0
 /// <summary>
 /// Parses a date from the text of the first <c>time</c> tag found in the given HTML.
 /// </summary>
 /// <param name="htmlText">HTML of the page to read.</param>
 /// <returns>The date parsed with the exact format <c>HH:mm dd/MM/yyyy</c>.</returns>
 public static DateTime getQADate(string htmlText)
 {
     var root = new HtmlParser().Parse(htmlText);
     var timeFinder = new FindTagsVisitor(tag => tag.Name == "time");
     root.AcceptVisitor(timeFinder);

     // No bounds checks: the indexers and the cast fail if there is no time tag
     // or if its first child is not a text node.
     var firstTimeTag = timeFinder.Result[0];
     var timeText = (Majestic13.HtmlNode.Text)firstTimeTag.Children[0];

     // NOTE(review): a null format provider means the current culture is used, so the "/"
     // in the format follows that culture's date separator - confirm this is intended.
     DateTime parsed = DateTime.ParseExact(timeText.Value, "HH:mm dd/MM/yyyy", null);
     return parsed;
 }
예제 #7
0
 /// <summary>
 /// Collects the content of the first <c>div</c> whose <c>class</c> attribute is <c>vo_o_text</c>.
 /// </summary>
 /// <param name="htmlText">HTML of the page to read.</param>
 /// <returns>
 /// The string produced by <c>RequestServices.ContinueUntilOnlyTextLeft</c> for that tag,
 /// starting from an empty accumulator.
 /// </returns>
 public static string getQAContent(string htmlText)
 {
     var parser = new HtmlParser();
     var node = parser.Parse(htmlText);
     // Compare the class attribute itself: ContainsValue("vo_o_text") would also accept
     // a tag whose class is something else but which has another attribute with that value.
     var finder = new FindTagsVisitor(tag => tag.Name == "div" && tag.Attributes.ContainsKey("class")
         && tag.Attributes["class"] == "vo_o_text");
     node.AcceptVisitor(finder);
     // No bounds check: the indexer fails if no such div exists.
     var contextTag = finder.Result[0];
     var content = RequestServices.ContinueUntilOnlyTextLeft(contextTag, "");
     LogServices.WriteProgressLog("Fetched content of article ");
     return content;
 }
예제 #8
0
 /// <summary>
 /// Reads a name from inside the first <c>div</c> whose <c>class</c> attribute is <c>autors_box</c>.
 /// </summary>
 /// <param name="htmlText">HTML of the page to read.</param>
 /// <returns>
 /// The text of the first child of the fourth child of the box's second-to-last child.
 /// </returns>
 public static string getQAEditionName(string htmlText)
 {
     var parser = new HtmlParser();
     var node = parser.Parse(htmlText);
     // Compare the class attribute itself: ContainsValue("autors_box") would also accept
     // a tag whose class is something else but which has another attribute with that value.
     var finder = new FindTagsVisitor(tag => tag.Name == "div" && tag.Attributes.ContainsKey("class")
         && tag.Attributes["class"] == "autors_box");
     node.AcceptVisitor(finder);
     // No bounds checks: the indexers and casts below fail if the page
     // does not have the expected structure.
     var authorsBox = finder.Result[0];
     // The wanted div is the second-to-last child of the box, not the last one.
     var secondToLastIndex = authorsBox.Children.Count - 2;
     var divTag = (Majestic13.HtmlNode.Tag)authorsBox.Children[secondToLastIndex];
     var aTag = (Majestic13.HtmlNode.Tag)divTag.Children[3];
     var name = (Majestic13.HtmlNode.Text)aTag.Children[0];
     return name.Value;
 }
예제 #9
0
 /// <summary>
 /// Reads the author name from the first <c>div</c> whose <c>class</c> attribute is <c>autors_box</c>.
 /// </summary>
 /// <param name="htmlText">HTML of the page to read.</param>
 /// <returns>
 /// The text of the box's third child when its second child carries the value
 /// <c>icon autors_icon</c> in some attribute; otherwise the default "author unknown" string.
 /// </returns>
 public static string getQAAuthor(string htmlText)
 {
     var author = "Автор не известен";
     var parser = new HtmlParser();
     var node = parser.Parse(htmlText);
     // Compare the class attribute itself: ContainsValue("autors_box") would also accept
     // a tag whose class is something else but which has another attribute with that value.
     var authorFieldFinder = new FindTagsVisitor(tag => tag.Name == "div" && tag.Attributes.ContainsKey("class")
         && tag.Attributes["class"] == "autors_box");
     node.AcceptVisitor(authorFieldFinder);
     // No bounds checks: the indexers and casts below fail if the page
     // does not have the expected structure.
     var authorsBox = authorFieldFinder.Result[0];
     var iconTag = (Majestic13.HtmlNode.Tag)authorsBox.Children[1];
     // NOTE(review): this accepts "icon autors_icon" as the value of any attribute;
     // presumably the class attribute is meant - confirm before tightening.
     if (iconTag.Attributes.ContainsValue("icon autors_icon"))
     {
         var nameTag = (Majestic13.HtmlNode.Tag)authorsBox.Children[2];
         var authorName = (Majestic13.HtmlNode.Text)nameTag.Children[0];
         author = authorName.Value;
     }
     return author;
 }
        /// <summary>
        /// Parses a GitHub issue page and returns the issue's title and body.
        /// </summary>
        /// <param name="html">HTML of the issue page.</param>
        /// <returns>
        /// The title and the body joined by <see cref="Environment.NewLine"/>; a part that is not
        /// found is left empty. Returns <c>null</c> when <paramref name="html"/> is null or empty.
        /// </returns>
        private string GetIssueTitleAndDescription(string html)
        {
            if (string.IsNullOrEmpty(html))
            {
                return null;
            }

            var root = new HtmlParser().Parse(html);

            // Issue title: first child of the first matching h2, if that child is a text node.
            var titleFinder = new FindTagsVisitor(x => x.Name == "h2" && x.Attributes.ContainsKey("class") && x.Attributes["class"] == "discussion-topic-title");
            root.AcceptVisitor(titleFinder);

            var title = string.Empty;
            if (titleFinder.Result != null && titleFinder.Result.Count > 0)
            {
                var headingText = titleFinder.Result.First().Children.FirstOrDefault() as HtmlNode.Text;
                if (headingText != null)
                {
                    title = headingText.Value;
                }
            }

            // Issue body: first text node of the first p tag inside the first matching comment div.
            var bodyFinder = new FindTagsVisitor(x => x.Name == "div" && x.Attributes.ContainsKey("class") && x.Attributes["class"] == "js-comment-body comment-body markdown-body markdown-format");
            root.AcceptVisitor(bodyFinder);

            var body = string.Empty;
            if (bodyFinder.Result != null && bodyFinder.Result.Count > 0)
            {
                var paragraph = bodyFinder.Result.First().Children
                    .OfType<HtmlNode.Tag>()
                    .FirstOrDefault(t => t.Name == "p");
                if (paragraph != null)
                {
                    var paragraphText = paragraph.Children.OfType<HtmlNode.Text>().FirstOrDefault();
                    if (paragraphText != null)
                    {
                        body = paragraphText.Value;
                    }
                }
            }

            return title + Environment.NewLine + body;
        }
예제 #11
0
        /// <summary>
        /// Returns the text of the first <c>h1</c> whose <c>class</c> attribute is
        /// <c>material_title increase_text</c>.
        /// </summary>
        /// <param name="htmlText">HTML of the article page.</param>
        /// <returns>
        /// The heading text; the default "untitled" string when no such heading with children exists;
        /// a fixed error string when an exception is thrown.
        /// </returns>
        public static string getArticleTitle(string htmlText) {
            try
            {
                var title = "Без заголовка";
                var parser = new HtmlParser();
                var node = parser.Parse(htmlText);
                // Compare the class attribute itself: ContainsValue(...) would also accept a tag whose
                // class is something else but which has another attribute with that value.
                var titleFinder = new FindTagsVisitor(tag => tag.Name == "h1" && tag.Attributes.ContainsKey("class")
                    && tag.Attributes["class"] == "material_title increase_text");
                node.AcceptVisitor(titleFinder);
                if (titleFinder.Result.Count > 0 && titleFinder.Result[0].Children.Count > 0)
                {
                    // The cast throws when the first child is not a text node; the catch handles it.
                    var titleTag = (Majestic13.HtmlNode.Text)titleFinder.Result[0].Children[0];
                    title = titleTag.Value;
                }
                return title;
            }
            catch (Exception)
            {
                return "Название не может быть загружено из-за исключения";
            }
        }
예제 #12
0
        /// <summary>
        /// Extracts the article date from the first <c>time</c> tag, or, when there is none, from the
        /// first <c>div</c> whose <c>class</c> attribute is <c>material_topline_info</c>.
        /// </summary>
        /// <param name="htmlText">HTML of the article page.</param>
        /// <returns>
        /// The date parsed with the exact format <c>HH:mm dd/MM/yyyy</c>; <see cref="DateTime.Now"/>
        /// when the date text contains "сегодня" or when any exception is thrown.
        /// </returns>
        public static DateTime getArticleDate(string htmlText)
        {
            try
            {
                string date;
                var parser = new HtmlParser();
                var node = parser.Parse(htmlText);
                var dateFieldFinder = new FindTagsVisitor(tag => tag.Name == "time");
                node.AcceptVisitor(dateFieldFinder);
                if (dateFieldFinder.Result.Count > 0)
                {
                    var timeTag = (Majestic13.HtmlNode.Tag)dateFieldFinder.Result[0];
                    var text = (Majestic13.HtmlNode.Text)timeTag.Children[0];
                    date = text.Value;
                }
                else
                {
                    // Compare the class attribute itself: ContainsValue(...) would also accept a tag whose
                    // class is something else but which has another attribute with that value.
                    var dateFieldFinder1 = new FindTagsVisitor(tag => tag.Name == "div" && tag.Attributes.ContainsKey("class")
                        && tag.Attributes["class"] == "material_topline_info");
                    node.AcceptVisitor(dateFieldFinder1);
                    // Fallback layout: the date is the second child of the div's second child.
                    var spanTag = (Majestic13.HtmlNode.Tag)dateFieldFinder1.Result[0].Children[1];
                    var text = (Majestic13.HtmlNode.Text)spanTag.Children[1];
                    date = text.Value;
                }

                if (date.Contains("сегодня"))
                {
                    return DateTime.Now;
                }

                // NOTE(review): a null format provider means the current culture is used, so the "/"
                // in the format follows that culture's date separator - confirm this is intended.
                return DateTime.ParseExact(date, "HH:mm dd/MM/yyyy", null);
            }
            catch (Exception)
            {
                return DateTime.Now;
            }
        }
예제 #13
0
        /// <summary>
        /// Collects the content of the first <c>article</c> tag in the given HTML.
        /// </summary>
        /// <param name="htmlText">HTML of the article page.</param>
        /// <returns>
        /// The string produced by <c>ContinueUntilOnlyTextLeft</c> for that tag, starting from an empty
        /// accumulator; a fixed error string when any exception is thrown.
        /// </returns>
        public static string getArticleContent(string htmlText)
        {
            try
            {
                var parser = new HtmlParser();
                var node = parser.Parse(htmlText);
                var contentFinder = new FindTagsVisitor(tag => tag.Name == "article");
                node.AcceptVisitor(contentFinder);
                // Take the match directly; the original allocated a throw-away Tag first.
                // A page without an article tag throws here and is handled by the catch below.
                var resultTag = (Majestic13.HtmlNode.Tag)contentFinder.Result[0];
                var content = ContinueUntilOnlyTextLeft(resultTag, "");
                LogServices.WriteProgressLog("Fetched content of article ");
                return content;
            }
            catch (Exception)
            {
                return "Содержимое не может быть загружено из-за исключения";
            }
        }
 /// <summary>
 /// Gets the plain text of an HTML document.
 /// </summary>
 /// <param name="html">HTML to parse.</param>
 /// <returns>The text that <c>GenerateRawText</c> appended for the parsed document.</returns>
 private string GetRawText(string html)
 {
     var root = new HtmlParser().Parse(html);

     // GenerateRawText writes into the builder it is given.
     var rawText = new StringBuilder();
     GenerateRawText(root, rawText);

     return rawText.ToString();
 }
        /// <summary>
        /// Processes the given HTML and returns the processed HTML.
        /// Per the original author's note, the result contains no style, css or script content
        /// and omits some other invalid HTML tags.
        /// </summary>
        /// <param name="html">HTML to process.</param>
        /// <returns>
        /// The text built by <c>ProcessBodyElement</c> over every <c>body</c> tag found,
        /// or <c>null</c> when <paramref name="html"/> is null or empty.
        /// </returns>
        private string GetAbbreviationHtml(string html)
        {
            if (string.IsNullOrEmpty(html))
            {
                return null;
            }

            // Parse the document into a tree.
            var root = new HtmlParser().Parse(html);

            // Collect the body elements of the document.
            var bodyFinder = new FindTagsVisitor(x => x.Name == "body");
            root.AcceptVisitor(bodyFinder);

            // Process every body element into one shared builder.
            var output = new StringBuilder();
            foreach (var body in bodyFinder.Result)
            {
                ProcessBodyElement(body as HtmlNode.Tag, output);
            }

            return output.ToString();
        }
예제 #16
0
        /// <summary>
        /// Extracts the author name(s) of an article.
        /// </summary>
        /// <remarks>
        /// Looks first at the <c>div</c> whose <c>class</c> attribute is <c>material_topline_info</c> and
        /// joins the text of every <c>a</c> tag with an <c>href</c> found under its fourth child. When no
        /// such div exists, falls back to the third child of the <c>div</c> with class <c>autors_box</c>.
        /// </remarks>
        /// <param name="htmlText">HTML of the article page.</param>
        /// <returns>
        /// The author text (each name is prefixed with a space in the multi-author form); the default
        /// "author unknown" string when the top-line div has no author links; a fixed error string
        /// when any exception is thrown.
        /// </returns>
        public static string getArticleAuthor(string htmlText) {
            try
            {
                var author = "Автор не известен";
                var parser = new HtmlParser();
                var node = parser.Parse(htmlText);
                // Compare the class attribute itself: ContainsValue(...) would also accept a tag whose
                // class is something else but which has another attribute with that value.
                var authorFieldFinder = new FindTagsVisitor(tag => tag.Name == "div" && tag.Attributes.ContainsKey("class")
                    && tag.Attributes["class"] == "material_topline_info");
                node.AcceptVisitor(authorFieldFinder);

                if (authorFieldFinder.Result.Count > 0)
                {
                    var spanTag = (Majestic13.HtmlNode.Tag)authorFieldFinder.Result[0].Children[3];
                    var authorFinder = new FindTagsVisitor(tag => tag.Name == "a" && tag.Attributes.ContainsKey("href"));
                    spanTag.AcceptVisitor(authorFinder);
                    if (authorFinder.Result.Count > 0)
                    {
                        author = "";
                        for (int i = 0; i < authorFinder.Result.Count; i++)
                        {
                            var authorText = (Majestic13.HtmlNode.Text)authorFinder.Result[i].Children[0];
                            // Output format is kept as-is: every name is preceded by one space.
                            author = author + " " + authorText.Value;
                        }
                    }
                }
                else
                {
                    // Same class-attribute comparison for the fallback layout.
                    var authorFieldFinder2 = new FindTagsVisitor(tag => tag.Name == "div" && tag.Attributes.ContainsKey("class")
                        && tag.Attributes["class"] == "autors_box");
                    node.AcceptVisitor(authorFieldFinder2);
                    var aTag = (Majestic13.HtmlNode.Tag)authorFieldFinder2.Result[0].Children[2];
                    var aText = (Majestic13.HtmlNode.Text)aTag.Children[0];
                    author = aText.Value;
                }

                return author;
            }
            catch (Exception)
            {
                return "Автор не может быть загружен из-за исключения";
            }
        }
예제 #17
0
        /// <summary>
        /// Extracts the edition name of an article.
        /// </summary>
        /// <remarks>
        /// Uses the fourth child of the first <c>div</c> whose <c>class</c> attribute is
        /// <c>newspaper_new</c>; when no such div exists, uses the fourth child of the second
        /// <c>span</c> whose <c>class</c> attribute is <c>artic_num_box</c>.
        /// </remarks>
        /// <param name="htmlText">HTML of the article page.</param>
        /// <returns>
        /// The text of the selected tag's first child, or a fixed error string when any exception
        /// is thrown.
        /// </returns>
        public static string getEditionName(string htmlText)
        {
            try
            {
                var parser = new HtmlParser();
                var node = parser.Parse(htmlText);
                // Compare the class attribute itself: ContainsValue(...) would also accept a tag whose
                // class is something else but which has another attribute with that value.
                var edNameFinder = new FindTagsVisitor(tag => tag.Name == "div" && tag.Attributes.ContainsKey("class")
                    && tag.Attributes["class"] == "newspaper_new");
                node.AcceptVisitor(edNameFinder);
                if (edNameFinder.Result.Count > 0)
                {
                    var hrefTag = (Majestic13.HtmlNode.Tag)edNameFinder.Result[0].Children[3];
                    var edNameTag = (Majestic13.HtmlNode.Text)hrefTag.Children[0];
                    return edNameTag.Value;
                }
                else
                {
                    // Same class-attribute comparison for the fallback layout.
                    var edNameFinder1 = new FindTagsVisitor(tag => tag.Name == "span" && tag.Attributes.ContainsKey("class")
                        && tag.Attributes["class"] == "artic_num_box");
                    node.AcceptVisitor(edNameFinder1);
                    // The second matching span is the one used here.
                    var aTag = (Majestic13.HtmlNode.Tag)edNameFinder1.Result[1].Children[3];
                    var edNameTag = (Majestic13.HtmlNode.Text)aTag.Children[0];
                    return edNameTag.Value;
                }
            }
            catch (Exception)
            {
                return "Название не может быть загружено из-за исключения";
            }
        }
예제 #18
0
        /// <summary>
        /// Collects the link target of the first anchor inside every <c>h2</c> whose <c>class</c>
        /// attribute is <c>data_title mbottom10</c>.
        /// </summary>
        /// <param name="htmlNewspaperText">HTML of the newspaper page.</param>
        /// <returns>
        /// The <c>href</c> values in document order; headings without a usable link are skipped.
        /// Returns <c>null</c> when any exception is thrown.
        /// </returns>
        public static List<string> extractLinks(string htmlNewspaperText)
        {
            try
            {
                var links = new List<string>();
                var parser = new HtmlParser();
                var node = parser.Parse(htmlNewspaperText);
                // Compare the class attribute itself: ContainsValue(...) would also accept a tag whose
                // class is something else but which has another attribute with that value.
                var areaFinder = new FindTagsVisitor(tag => tag.Name == "h2" && tag.Attributes.ContainsKey("class") && tag.Attributes["class"] == "data_title mbottom10");
                node.AcceptVisitor(areaFinder);
                foreach (Majestic13.HtmlNode.Tag tag in areaFinder.Result)
                {
                    var linkFinder = new FindTagsVisitor(t => t.Name == "a" && t.Attributes.ContainsKey("href"));
                    tag.AcceptVisitor(linkFinder);
                    if (linkFinder.Result.Count == 0)
                    {
                        // A heading without a link must not discard the links already collected.
                        continue;
                    }

                    // Read href explicitly: the first attribute value is not necessarily the link
                    // (e.g. when class or title precedes href in the tag).
                    var link = linkFinder.Result[0].Attributes["href"];
                    if (link != null)
                    {
                        links.Add(link.ToString());
                    }
                }
                LogServices.WriteProgressLog("extracted links");
                return links;
            }
            catch (Exception)
            {
                return null;
            }
        }