Example #1
0
        /// <summary>
        /// Builds the content tree out of <see cref="HtmlDocument"/>
        /// </summary>
        /// <remarks>
        /// Only elements that have at least one text-node child and no element children are visited
        /// (XPath <c>//*[text()][count(*)=0]</c>). A visited element is added to the tree when
        /// <c>isTagAcceptable</c> accepts it and its processed, trimmed inner text is not empty.
        /// The tree path of an added node is the element's XPath with <c>/</c> replaced by <c>\</c>.
        /// </remarks>
        /// <param name="htmlDoc">The HTML document</param>
        /// <param name="__name">The name of the root</param>
        /// <param name="allowTitle">Forwarded unchanged to <c>isTagAcceptable</c>; presumably lets <c>title</c> elements through - confirm against that helper.</param>
        /// <param name="allowMeta">Forwarded unchanged to <c>isTagAcceptable</c>; presumably lets <c>meta</c> elements through - confirm against that helper.</param>
        /// <returns>The tree created for <paramref name="htmlDoc"/>; nothing is added to it when no element qualifies.</returns>
        public static nodeTree buildTree(this HtmlDocument htmlDoc, string __name, Boolean allowTitle = false, Boolean allowMeta = false)
        {
            nodeTree output = new nodeTree(__name, htmlDoc);

            // Elements with at least one text child and no element children.
            XPathNodeIterator iterator = htmlDoc.CreateNavigator().Select("//*[text()][count(*)=0]");

            while (iterator.MoveNext())
            {
                HtmlNodeNavigator htmlNavigator = iterator.Current as HtmlNodeNavigator;
                if (htmlNavigator == null)
                {
                    // Not an HtmlNodeNavigator, so there is no HtmlNode to read: skip instead of dereferencing null.
                    continue;
                }

                var node = htmlNavigator.CurrentNode;

                if (!isTagAcceptable(node, null, allowTitle, allowMeta))
                {
                    continue;
                }

                string cn = node.InnerText.htmlContentProcess().Trim();
                if (cn.isNullOrEmptyString())
                {
                    continue;
                }

                // The XPath is only needed for nodes that are actually added, so it is computed here.
                string sp = node.XPath.Replace("/", "\\");

                // A clone is stored, not the node that belongs to htmlDoc.
                graphWrapNode <htmlWrapper> nn = output.Add(sp, node.Clone());
                nn.item.content = cn;
                nn.item.xPath   = sp;
                nn.item.path    = nn.path;
            }

            return(output);
        }
Example #2
0
        /// <summary>
        /// Builds the content tree of <paramref name="html"/>, splits it into blocks, scores them
        /// and wraps every block into an <see cref="imbMCBlock"/>.
        /// </summary>
        /// <param name="html">The HTML document to process.</param>
        /// <param name="name">Passed as the first argument to <c>buildTree</c> (presumably the name of the tree root - confirm against that method).</param>
        /// <returns>
        /// One <see cref="imbMCBlock"/> per block, in enumeration order, named with a 1-based,
        /// three-digit counter (<c>B001</c>, <c>B002</c>, ...). The list is empty when no blocks are produced.
        /// </returns>
        public List <imbMCBlock> process(HtmlDocument html, String name)
        {
            // NOTE(review): the two flags are positional; presumably allowTitle = true, allowMeta = false - confirm against buildTree.
            var contentTree   = html.buildTree(name, true, false);
            var contentBlocks = contentTree.getBlocks(targetBlockCount);

            // Scores are calculated before the blocks are wrapped.
            contentBlocks.CalculateScores();

            List <imbMCBlock> output = new List <imbMCBlock>();
            Int32 c = 0;

            foreach (nodeBlock bl in contentBlocks)
            {
                c++;

                imbMCBlock mcBlock = new imbMCBlock();
                mcBlock.name       = "B" + c.ToString("D3");
                mcBlock.blockModel = bl;

                output.Add(mcBlock);
            }

            return(output);
        }