/// <summary>
/// Passes every direct child of <paramref name="node"/> to
/// <c>ConvertTo</c>, in the order the children appear in <c>ChildNodes</c>.
/// </summary>
/// <param name="node">Node whose children are visited.</param>
/// <param name="webPageContent">Result object handed through to each child visit.</param>
private static void ConvertContentTo(HtmlNode node, WebPageContent webPageContent)
{
    foreach (HtmlNode child in node.ChildNodes)
    {
        ConvertTo(child, webPageContent);
    }
}
/// <summary>
/// Visits <paramref name="node"/> and, recursively, its descendants, copying
/// the text found directly inside a <c>title</c> element into
/// <paramref name="webPageContent"/>.
/// </summary>
/// <remarks>
/// The walk does not stop after a match, so each qualifying text node
/// overwrites whatever an earlier one stored. Node types other than
/// Document, Element and Text are ignored.
/// </remarks>
/// <param name="node">Node to inspect.</param>
/// <param name="webPageContent">
/// Receives the trimmed, de-entitized text together with the line and
/// line position of the text node it came from.
/// </param>
private static void ConvertTo(HtmlNode node, WebPageContent webPageContent)
{
    HtmlNodeType kind = node.NodeType;

    // The document root carries no text of its own; just descend.
    if (kind == HtmlNodeType.Document)
    {
        ConvertContentTo(node, webPageContent);
        return;
    }

    // Elements are only interesting for what they contain.
    if (kind == HtmlNodeType.Element)
    {
        if (node.HasChildNodes)
        {
            ConvertContentTo(node, webPageContent);
        }

        return;
    }

    if (kind != HtmlNodeType.Text)
    {
        return;
    }

    // Only text whose immediate parent is a <title> element is kept.
    if (node.ParentNode.Name != "title")
    {
        return;
    }

    string rawText = ((HtmlTextNode)node).Text;

    // Skip text that is really a special closing element emitted as text.
    if (HtmlNode.IsOverlappedClosingElement(rawText))
    {
        return;
    }

    webPageContent.Element = HtmlEntity.DeEntitize(rawText.Trim());
    webPageContent.Line = node.Line;
    webPageContent.LinePosition = node.LinePosition;
}
/// <summary>
/// Builds a <see cref="WebPageContent"/> by walking the crawled page's HTML
/// document from its root node with <c>ConvertTo</c>.
/// </summary>
/// <returns>
/// A new <see cref="WebPageContent"/>; it is returned as constructed when the
/// traversal assigns nothing to it.
/// </returns>
public WebPageContent GetTitle()
{
    WebPageContent title = new WebPageContent();
    var root = _crawledPage.HtmlDocument.DocumentNode;

    ConvertTo(root, title);

    return title;
}