/// <summary>
/// Builds the content tree out of <see cref="HtmlDocument"/>.
/// </summary>
/// <remarks>
/// The document is queried with the XPath <c>//*[text()][count(*)=0]</c>, i.e. elements that
/// have at least one text-node child and no element children. For every such element that
/// passes <c>isTagAcceptable</c> and whose processed, trimmed inner text is not empty, a clone
/// of the node is added to the tree under the node's XPath with <c>/</c> replaced by <c>\</c>.
/// </remarks>
/// <param name="htmlDoc">The HTML document</param>
/// <param name="__name">The name of the root</param>
/// <param name="allowTitle">Forwarded to <c>isTagAcceptable</c>; presumably lets title elements through (confirm against that helper).</param>
/// <param name="allowMeta">Forwarded to <c>isTagAcceptable</c>; presumably lets meta elements through (confirm against that helper).</param>
/// <returns>The tree rooted at <paramref name="__name"/>, populated with the accepted text-bearing nodes.</returns>
public static nodeTree buildTree(this HtmlDocument htmlDoc, string __name, Boolean allowTitle = false, Boolean allowMeta = false)
{
    nodeTree output = new nodeTree(__name, htmlDoc);

    XPathNodeIterator iterator = htmlDoc.CreateNavigator().Select("//*[text()][count(*)=0]");

    while (iterator.MoveNext())
    {
        // The tree needs the underlying HtmlNode, which is only reachable through
        // HtmlNodeNavigator. Skip anything else instead of dereferencing a failed cast.
        HtmlNodeNavigator htmlNavigator = iterator.Current as HtmlNodeNavigator;
        if (htmlNavigator == null)
        {
            continue;
        }

        var node = htmlNavigator.CurrentNode;

        // Tree paths use '\' as separator, so the node XPath is converted accordingly.
        string sp = node.XPath.Replace("/", "\\");

        if (!isTagAcceptable(node, null, allowTitle, allowMeta))
        {
            continue;
        }

        string cn = node.InnerText.htmlContentProcess().Trim();
        if (cn.isNullOrEmptyString())
        {
            // Nodes without any text left after processing are not part of the content tree.
            continue;
        }

        // The node is cloned before it is stored in the tree.
        graphWrapNode<htmlWrapper> nn = output.Add(sp, node.Clone());
        nn.item.content = cn;
        nn.item.xPath = sp;
        nn.item.path = nn.path;
    }

    return output;
}
/// <summary>
/// Splits the given HTML document into content blocks and wraps each one in an <c>imbMCBlock</c>.
/// </summary>
/// <remarks>
/// The content tree is built with <c>buildTree(name, true, false)</c>, divided with
/// <c>getBlocks(targetBlockCount)</c> and scored with <c>CalculateScores()</c>. Resulting blocks
/// are named <c>B001</c>, <c>B002</c>, ... in enumeration order.
/// </remarks>
/// <param name="html">The HTML document to process</param>
/// <param name="name">The name passed to <c>buildTree</c> as the root name of the content tree</param>
/// <returns>One <c>imbMCBlock</c> per block returned by <c>getBlocks</c>; empty when there are none.</returns>
public List<imbMCBlock> process(HtmlDocument html, String name)
{
    List<imbMCBlock> result = new List<imbMCBlock>();

    // NOTE(review): this tree is never read afterwards. The construction is kept because the
    // nodeTree constructor is not visible here and may have side effects - confirm and remove.
    nodeTree documentTree = new nodeTree("document", html);

    var contentTree = html.buildTree(name, true, false);
    var scoredBlocks = contentTree.getBlocks(targetBlockCount);
    scoredBlocks.CalculateScores();

    Int32 ordinal = 0;
    foreach (nodeBlock block in scoredBlocks)
    {
        ordinal += 1;

        // Names are 1-based and zero-padded to three digits.
        result.Add(new imbMCBlock
        {
            name = "B" + ordinal.ToString("D3"),
            blockModel = block
        });
    }

    return result;
}