Пример #1
0
 /// <summary>
 /// Passes every direct child of <paramref name="node"/> to <c>ConvertTo</c>,
 /// in collection order, sharing the same <paramref name="webPageContent"/>.
 /// </summary>
 /// <param name="node">Node whose child nodes are visited.</param>
 /// <param name="webPageContent">Result object handed on to each child visit.</param>
 private static void ConvertContentTo(HtmlNode node, WebPageContent webPageContent)
 {
     HtmlNodeCollection children = node.ChildNodes;
     for (int index = 0; index < children.Count; index++)
     {
         ConvertTo(children[index], webPageContent);
     }
 }
Пример #2
0
        /// <summary>
        /// Visits <paramref name="node"/> and, for a text node whose parent is named
        /// "title", stores its text and source position in
        /// <paramref name="webPageContent"/>. Document nodes and elements with
        /// children are descended into via <c>ConvertContentTo</c>; every other
        /// node type is ignored.
        /// </summary>
        /// <param name="node">Node to inspect.</param>
        /// <param name="webPageContent">
        /// Receives <c>Element</c>, <c>Line</c> and <c>LinePosition</c>. Each matching
        /// text node overwrites whatever a previous match stored.
        /// </param>
        private static void ConvertTo(HtmlNode node, WebPageContent webPageContent)
        {
            HtmlNodeType kind = node.NodeType;

            if (kind == HtmlNodeType.Document)
            {
                ConvertContentTo(node, webPageContent);
                return;
            }

            if (kind == HtmlNodeType.Element)
            {
                // Nothing to descend into for childless elements.
                if (node.HasChildNodes)
                {
                    ConvertContentTo(node, webPageContent);
                }
                return;
            }

            if (kind != HtmlNodeType.Text)
            {
                return;
            }

            // Only text sitting directly under a node named "title" is of interest;
            // text under any other parent (script, style, body, ...) is skipped.
            if (node.ParentNode.Name != "title")
            {
                return;
            }

            string rawText = ((HtmlTextNode)node).Text;

            // Skip text that is in fact a special closing node emitted as text.
            if (HtmlNode.IsOverlappedClosingElement(rawText))
            {
                return;
            }

            // Trim first, then decode HTML entities (same order as before).
            webPageContent.Element = HtmlEntity.DeEntitize(rawText.Trim());
            webPageContent.Line = node.Line;
            webPageContent.LinePosition = node.LinePosition;
        }
Пример #3
0
 /// <summary>
 /// Creates a new <see cref="WebPageContent"/>, runs <c>ConvertTo</c> over the
 /// crawled page's HTML document starting at its document node, and returns
 /// the resulting object.
 /// </summary>
 /// <returns>The <see cref="WebPageContent"/> instance passed through <c>ConvertTo</c>.</returns>
 public WebPageContent GetTitle()
 {
     WebPageContent titleContent = new WebPageContent();
     HtmlNode documentRoot = _crawledPage.HtmlDocument.DocumentNode;

     ConvertTo(documentRoot, titleContent);

     return titleContent;
 }