Beispiel #1
0
        /// <summary>
        /// Loads a set of anchors whose attributes are written with stray slashes, colons
        /// and unquoted values, prints the formatted result, and runs the LinqCompare helper.
        /// </summary>
        public void AttributeParsing()
        {
            string markup = @"            
            <html>
	            <body>
		            <a id=01 /class/red >class red</a>
		            <a id=02 /class/= /red >class = red</a>
		            <a id=03 class// / = /red >class = red</a>
		            <a id=04    class / = /red >class = red</a>
		            <a id=05 class   = /red >class='/red'</a>
		            <a id=06 class= /red >class='/red'</a>
		            <a id=07 clas:s= red >clas:s='red'</a>
		            <a id=08 class:= red >class:='red'</a>
		            <a id=09 class= 'red'/ >class='red'</a>
		            <a id=10 class=red/ >class='red/'</a>
		            <a id=11 class=red/>class='red/'</a>
	            </body>
            </html>";

            XmlDocument parsed = XHtmlLoader.LoadHtml(markup);

            // Dump the parsed tree so the attribute handling can be inspected by eye
            var formatted = ToFormattedString(parsed);
            Console.WriteLine(formatted);

            // Run the shared LinqCompare helper against the same input
            LinqCompare(parsed, markup);
        }
Beispiel #2
0
        /// <summary>
        /// Checks that br, img and hr come out without children, while a
        /// "&lt;p/&gt;" followed by text and "&lt;/p&gt;" still receives its text as a child.
        /// </summary>
        public void SelfClosingTags()
        {
            string markup = @"           
            <html><body>
            <h1>Hello World</h1>
            Some text <br> Some more text <img src='foobar.jpg'> more text <hr><a>foo</a>
            <p/> non self-closing </p>
            </body></html>";

            XmlDocument parsed = XHtmlLoader.LoadHtml(markup);
            Console.WriteLine(parsed.OuterXml);

            // img: src attribute is kept, nothing is nested underneath
            XmlNode img = parsed.SelectSingleNode("//img");
            string  src = img.Attributes["src"].Value;
            Assert.AreEqual("foobar.jpg", src);
            Assert.IsNull(img.FirstChild);

            // br: no attributes and nothing nested underneath
            XmlNode lineBreak      = parsed.SelectSingleNode("//br");
            int     lineBreakAttrs = lineBreak.Attributes.Count;
            Assert.AreEqual(0, lineBreakAttrs);
            Assert.IsNull(lineBreak.FirstChild);

            // hr: no attributes and nothing nested underneath
            XmlNode rule      = parsed.SelectSingleNode("//hr");
            int     ruleAttrs = rule.Attributes.Count;
            Assert.AreEqual(0, ruleAttrs);
            Assert.IsNull(rule.FirstChild);

            // p: not treated as self-closing, so the trailing text becomes its first child
            XmlNode paragraph = parsed.SelectSingleNode("//p");
            string  firstText = paragraph.FirstChild.InnerText;
            Assert.AreEqual(" non self-closing ", firstText);
        }
Beispiel #3
0
        /// <summary>
        /// Loads anchors with protocol-relative, root-relative, document-relative, absolute
        /// and parent-relative hrefs using a BaseUrl, and asserts the resulting href values.
        /// </summary>
        public void FullyQualifyUrls()
        {
            string markup = @"            
            <html><body>
            <a id='1' href='//foobar.com'>hello</a>
            <a id='2' href='/helloworld.html'>hello</a>
            <a id='3' href='helloworld.html'>hello</a>
            <a id='4' href='http://blah.com/helloworld.html'>hello</a>
            <a id='5' href='../helloworld.html'>hello</a>
            <a id='6' href='/wiki/Wikipedia:Introduction'>hello</a>

            </body></html>";

            ParserOptions parserOptions = new ParserOptions
            {
                BaseUrl = "http://www.foobar.com/products/cat1/someprod.html"
            };

            XmlDocument parsed = XHtmlLoader.LoadHtml(markup, parserOptions);
            Console.WriteLine(parsed.OuterXml);

            // Each href is read and checked in document order
            string protocolRelative = parsed.SelectSingleNode("//a[@id='1']/@href").Value;
            Assert.AreEqual("http://foobar.com/", protocolRelative);

            string rootRelative = parsed.SelectSingleNode("//a[@id='2']/@href").Value;
            Assert.AreEqual("http://www.foobar.com/helloworld.html", rootRelative);

            string documentRelative = parsed.SelectSingleNode("//a[@id='3']/@href").Value;
            Assert.AreEqual("http://www.foobar.com/products/cat1/helloworld.html", documentRelative);

            string alreadyAbsolute = parsed.SelectSingleNode("//a[@id='4']/@href").Value;
            Assert.AreEqual("http://blah.com/helloworld.html", alreadyAbsolute);

            string parentRelative = parsed.SelectSingleNode("//a[@id='5']/@href").Value;
            Assert.AreEqual("http://www.foobar.com/products/helloworld.html", parentRelative);

            string colonInPath = parsed.SelectSingleNode("//a[@id='6']/@href").Value;
            Assert.AreEqual("http://www.foobar.com/wiki/Wikipedia:Introduction", colonInPath);

            // Run the shared LinqCompare helper with the same options
            LinqCompare(parsed, markup, parserOptions);
        }
Beispiel #4
0
        /// <summary>
        /// Writes the sample multi-byte HTML document to a file using <paramref name="encoding"/>,
        /// loads that file back through XHtmlLoader.LoadWebPageAsync, and asserts that the
        /// body of the re-loaded document matches the body of the original.
        /// </summary>
        /// <param name="fileName">Name of the file to create inside the SampleData directory.</param>
        /// <param name="encoding">Encoding used by the StreamWriter that saves the document.</param>
        /// <param name="defaultEncoding">Value assigned to LoaderOptions.DefaultEncoding for the re-load.</param>
        public void TestReadingEncodedFile(string fileName, System.Text.Encoding encoding, System.Text.Encoding defaultEncoding)
        {
            // Set some options, so that we can know if things are working...
            LoaderOptions loaderOptions = new LoaderOptions();

            loaderOptions.DetectEncoding  = true;
            loaderOptions.DefaultEncoding = defaultEncoding;
            loaderOptions.ParserOptions.IncludeMetaData = true;

            // Load multi-byte html file into memory
            XmlDocument doc = XHtmlLoader.LoadHtml(_sampleMultiByteHtml);

            // Locate the SampleData directory three levels above the assembly directory.
            // Path.Combine picks the separator for the host OS instead of a hard-coded '\'.
            string projectRoot = (new DirectoryInfo(AssemblyDirectory)).Parent.Parent.Parent.FullName;
            string sampleDir   = Path.Combine(projectRoot, "SampleData");

            // CreateDirectory does nothing when the directory already exists,
            // so no separate existence check is required.
            Directory.CreateDirectory(sampleDir);

            // Create Encoded file
            string fullName = Path.Combine(sampleDir, fileName);

            using (TextWriter sw = new StreamWriter(File.Create(fullName), encoding))
            {
                doc.Save(sw);
            }

            // Re-load into memory
            XmlDocument doc2 = XHtmlLoader.LoadWebPageAsync("file://" + fullName, loaderOptions).Result;

            Console.WriteLine("Reading file: " + fileName);
            Console.WriteLine(doc2.OuterXml);
            Assert.AreEqual(doc.SelectSingleNode("//body").OuterXml, doc2.SelectSingleNode("//body").OuterXml);
        }
        /// <summary>
        /// Runs a row query that selects the div's class attribute from the test HTML and
        /// asserts that the ampersand in the result appears as "&amp;amp;" in the InnerXml.
        /// </summary>
        public void XML_Characters_InResults()
        {
            // Print the parsed source document for reference
            XmlDocument parsed = XHtmlLoader.LoadHtml(_testHTML);
            Console.WriteLine(ToFormattedString(parsed));

            // Execute the query and capture the serialized result
            const string rowQuery = "<row xpath='//body/div/@class'></row>";
            var          selected = QueryEngine.SelectOnHtml(_testHTML, rowQuery);
            string       actual   = selected.InnerXml;

            Assert.AreEqual("<row>This &amp; is a test</row>", actual);
        }
Beispiel #6
0
    /// <summary>
    /// Demo entry point: parses an inline HTML string and prints its OuterXml, then
    /// loads http://wikipedia.org and prints the text of its title element.
    /// </summary>
    static void Main(string[] args)
    {
        // Part 1: parse markup held in a string
        const string inlineHtml = "<html><head><title>Hello World!</title><body><h1>Hello World</h1><p>This is a test</body>";
        XmlDocument  fromString = XHtmlLoader.LoadHtml(inlineHtml);
        Console.WriteLine("OuterXml is: " + fromString.OuterXml);

        // Part 2: fetch a page (blocking on the async call) and read its title
        XmlDocument fromWeb   = XHtmlLoader.LoadWebPageAsync("http://wikipedia.org").Result;
        XmlNode     titleNode = fromWeb.SelectSingleNode("//title");
        Console.WriteLine("Title is: " + titleNode.InnerText);
    }
Beispiel #7
0
        /// <summary>
        /// Smoke test: loads a minimal document, prints its OuterXml and runs the LinqCompare helper.
        /// </summary>
        public void HelloWorldBasicTest()
        {
            string markup = @"
            <html><body>
            <h1>Hello World</h1>
            </body></html>";
            XmlDocument parsed = XHtmlLoader.LoadHtml(markup);

            // Print the serialized document
            Console.WriteLine(parsed.OuterXml);

            // Run the shared LinqCompare helper against the same input
            LinqCompare(parsed, markup);
        }
Beispiel #8
0
        /// <summary>
        /// Loads a document with a head element nested inside a paragraph in the body and
        /// asserts that no head element ends up as a direct child of body.
        /// </summary>
        public void HeadTagInsideBodyTag()
        {
            string markup = @"
            <html><body>
            <h1>Hello World</h1><p> para <head>somehead</head> end para </p>
            </body></html>";

            XmlDocument parsed = XHtmlLoader.LoadHtml(markup);
            Console.WriteLine(parsed.OuterXml);

            // The <head> that appears inside the body must be ignored
            XmlNode headUnderBody = parsed.SelectSingleNode("//body/head");
            Assert.IsNull(headUnderBody);

            // Run the shared LinqCompare helper against the same input
            LinqCompare(parsed, markup);
        }
Beispiel #9
0
        /// <summary>
        /// Parses <paramref name="html"/> with XHtmlLoader and, when an XPath is supplied,
        /// collects the InnerText of every node it selects from the document element.
        /// </summary>
        /// <param name="html">Markup to parse.</param>
        /// <param name="xpath">Optional XPath; when null the document is only parsed.</param>
        /// <returns>InnerText of each selected node, or an empty array when no XPath is given.</returns>
        static string[] XHtmlKit_ParseAndSearch(string html, string xpath = null)
        {
            List<string> texts  = new List<string>();
            XmlDocument  parsed = XHtmlLoader.LoadHtml(html);

            // Parse-only mode: nothing to search for
            if (xpath == null)
            {
                return texts.ToArray();
            }

            XmlNodeList matches = parsed.DocumentElement.SelectNodes(xpath);
            foreach (XmlNode match in matches)
            {
                texts.Add(match.InnerText);
            }

            return texts.ToArray();
        }
Beispiel #10
0
        /// <summary>
        /// Loads markup whose b and i end tags are closed in the wrong order and asserts
        /// that the b element serializes as a properly nested "&lt;b&gt;&lt;i&gt;italics&lt;/i&gt;&lt;/b&gt;".
        /// </summary>
        public void FormattingTags()
        {
            string markup = @"            
            <html><body>
            <h1>Hello World</h1>
            Some text <b><i>italics</b></i>
            </body></html>";

            XmlDocument parsed = XHtmlLoader.LoadHtml(markup);
            Console.WriteLine(parsed.OuterXml);

            // The <i> element is expected to sit inside the <b> element
            XmlNode bold = parsed.SelectSingleNode("//b");
            Assert.AreEqual("<b><i>italics</i></b>", bold.OuterXml);

            // Run the shared LinqCompare helper against the same input
            LinqCompare(parsed, markup);
        }
Beispiel #11
0
        /// <summary>
        /// Loads markup where "&lt;ahref='http://..." is missing the space after the tag name
        /// and asserts the element name produced for it.
        /// </summary>
        public void TagWithInvalidXmlChar()
        {
            string markup = @"
            <html>
                <body>
                    <ahref='http://yyk.familydoctor.com.cn/1389/' target='_blank'>南方医科大学南方医院</a>
                </body>
            </html>";

            XmlDocument parsed = XHtmlLoader.LoadHtml(markup);
            Console.WriteLine(ToFormattedString(parsed));

            // The first child of body is expected to carry a name in which '=', ''' and ':'
            // show up as _x003D_, _x0027_ and _x003A_
            XmlNode firstInBody = parsed.SelectSingleNode("//body").FirstChild;
            Assert.AreEqual("ahref_x003D__x0027_http_x003A_", firstInBody.Name);

            // Run the shared LinqCompare helper against the same input
            LinqCompare(parsed, markup);
        }
Beispiel #12
0
        /// <summary>
        /// Loads a document whose title is opened in lower case and closed with an
        /// upper-case end tag, and asserts the title text is read correctly.
        /// </summary>
        public void TitleRCDataParsingCaps()
        {
            string markup = @"            
            <html><head>
                <title>This is a Title!</TITLE>
                </head>
            <body>
            <h1>Hello World</h1>
            </body></html>";

            XmlDocument parsed = XHtmlLoader.LoadHtml(markup);
            Console.WriteLine(parsed.OuterXml);

            // The </TITLE> end tag must be matched case-insensitively
            string titleText = parsed.SelectSingleNode("//title/text()").Value;
            Assert.AreEqual("This is a Title!", titleText);

            // Run the shared LinqCompare helper against the same input
            LinqCompare(parsed, markup);
        }
Beispiel #13
0
        /// <summary>
        /// Loads markup that has no opening html or head tag and asserts that the resulting
        /// document still has the html / head / title structure.
        /// </summary>
        public void NoOuterHtmlTag()
        {
            string markup = @"
            <title>a title</title> <body>
            <h1>Hello World</h1>
            Some_&nbsp;_text > hello &gt; &copy; &#169; <b><i>italics</b></i>
            </body></html>";

            XmlDocument parsed = XHtmlLoader.LoadHtml(markup);
            Console.WriteLine(ToFormattedString(parsed));

            // Root must be <html> ...
            XmlElement root = parsed.DocumentElement;
            Assert.IsTrue(root.Name == "html");

            // ... whose first child is <head> ...
            XmlNode head = root.FirstChild;
            Assert.IsTrue(head.Name == "head");

            // ... whose first child is <title>
            XmlNode title = head.FirstChild;
            Assert.IsTrue(title.Name == "title");

            // Run the shared LinqCompare helper against the same input
            LinqCompare(parsed, markup);
        }
Beispiel #14
0
        /// <summary>
        /// Loads a document whose body tag carries a class attribute and asserts the
        /// attribute value is present on the parsed body element.
        /// </summary>
        public void BodyTagwithAttributes()
        {
            string markup = @"
            <html>
                <head> 
                    <script src='/js/mobileredirect.js'></script>
                </head>
                <body class='foo'>
                    <h1>Hello World</h1>
                </body>
            </html>";

            XmlDocument parsed = XHtmlLoader.LoadHtml(markup);
            Console.WriteLine(parsed.OuterXml);

            // The class attribute written on <body> must be kept
            XmlNode body      = parsed.SelectSingleNode("//body");
            string  bodyClass = body.Attributes["class"].Value;
            Assert.AreEqual("foo", bodyClass);

            // Run the shared LinqCompare helper against the same input
            LinqCompare(parsed, markup);
        }
Beispiel #15
0
        /// <summary>
        /// Loads a document containing a second html tag inside the body and asserts how
        /// its attributes are merged onto the document element.
        /// </summary>
        public void HtmlTagWithAttributes()
        {
            string markup = @"
            <html lang='en'><body>
            <h1>Hello World</h1><html lang='fr' style='green'>
            </body></html>";

            XmlDocument parsed = XHtmlLoader.LoadHtml(markup);
            Console.WriteLine(ToFormattedString(parsed));

            // The nested <html> only contributes attributes to the root <html>:
            // the existing 'lang' value stays 'en', while the new 'style' attribute is added.
            XmlElement root = parsed.DocumentElement;
            Assert.IsTrue(root.Name == "html");

            string lang = root.Attributes["lang"].Value;
            Assert.AreEqual("en", lang);

            string style = root.Attributes["style"].Value;
            Assert.AreEqual("green", style);

            // Run the shared LinqCompare helper against the same input
            LinqCompare(parsed, markup);
        }
Beispiel #16
0
        /// <summary>
        /// Loads a document whose script body contains the text "&lt;table&gt;" and asserts
        /// that this text is still present in the script's text node.
        /// </summary>
        public void ScriptRCDataParsing()
        {
            string markup = @"            
            <html><body>
            <h1>Hello World</h1><script>
                ga('create', 'UA-40765809-1', {
                  'allowLinker': true,
                  'cookiePath': '/finance'
                });
                ga('send', 'pageview<table>');
            </script>
            </body></html>";

            XmlDocument parsed = XHtmlLoader.LoadHtml(markup);
            Console.WriteLine(parsed.OuterXml);

            // The "<table>" inside the script must survive as plain text of the script element
            string scriptText    = parsed.SelectSingleNode("//script/text()").Value;
            bool   hasTableToken = scriptText.Contains("<table>");
            Assert.IsTrue(hasTableToken);

            // Run the shared LinqCompare helper against the same input
            LinqCompare(parsed, markup);
        }
Beispiel #17
0
        /// <summary>
        /// Loads a document where a meta tag in the head wraps an anchor, and asserts the
        /// anchor is not a child of meta but is found directly under body.
        /// </summary>
        public void HeadTagWithNestedTags()
        {
            string markup = @"
            <html>
                <head>
                    <meta> <a> foobar </a> blah </meta>
                </head>
                <body>
            <h1>Hello World</h1>
            </body></html>";

            XmlDocument parsed = XHtmlLoader.LoadHtml(markup);
            Console.WriteLine(parsed.OuterXml);

            // meta is self-closing, so the anchor must not be nested under it
            XmlNode anchorUnderMeta = parsed.SelectSingleNode("//head/meta/a");
            Assert.IsNull(anchorUnderMeta);

            // Instead the anchor is expected as a direct child of body
            XmlNode anchorUnderBody = parsed.SelectSingleNode("//body/a");
            Assert.IsNotNull(anchorUnderBody);

            // Run the shared LinqCompare helper against the same input
            LinqCompare(parsed, markup);
        }
Beispiel #18
0
        /// <summary>
        /// Loads an anchor with a root-relative href while FullyQualifyUrls is false and
        /// asserts the href is left exactly as written, even though a BaseUrl is set.
        /// </summary>
        public void FullyQualifyUrls2()
        {
            string markup = @"
            <html>
            <body>
                <a href='/bakery/bread.html'>Bread</a>
            </body>
            </html>";

            ParserOptions parserOptions = new ParserOptions
            {
                BaseUrl          = "http://foobar.com",
                FullyQualifyUrls = false
            };

            XmlDocument parsed = XHtmlLoader.LoadHtml(markup, parserOptions);
            Console.WriteLine(parsed.OuterXml);

            // With the flag off, the relative href must come through untouched
            string href = parsed.SelectSingleNode("//a/@href").Value;
            Assert.AreEqual("/bakery/bread.html", href, "Ensure we don't fully qualify. We set flag to false");

            // Run the shared LinqCompare helper with the same options
            LinqCompare(parsed, markup, parserOptions);
        }
Beispiel #19
0
        /// <summary>
        /// Loads a document with a comment before the html tag and stray text after the
        /// closing html tag, and asserts where each of them lands in the parsed document.
        /// </summary>
        public void BeforeAndAfterFragments()
        {
            string markup = @"
            <!----- some comment ----->
            <html>
            <body>
                <h1>Hello World</h1>
                <p>Some_&nbsp;_text > hello &gt; &copy; &#169; <b><i>italics</b></i> 
                qrs
                </p>
            </body>
            </html>some after text";

            XmlDocument parsed = XHtmlLoader.LoadHtml(markup);
            Console.WriteLine(parsed.OuterXml);

            // The leading comment must be the first node of the document
            XmlNode firstNode = parsed.FirstChild;
            Assert.IsTrue(firstNode.NodeType == XmlNodeType.Comment);

            // The text after </html> must end up as the last child of body
            XmlNode lastInBody = parsed.SelectSingleNode("//body").LastChild;
            Assert.AreEqual("some after text", lastInBody.InnerText);

            // Run the shared LinqCompare helper against the same input
            LinqCompare(parsed, markup);
        }
Beispiel #20
0
 /// <summary>
 /// Extension method that loads an HTML string into an existing <see cref="XmlDocument"/>
 /// by forwarding to XHtmlLoader.LoadHtml with a freshly constructed ParserOptions.
 /// </summary>
 /// <param name="doc">Document that receives the parsed content.</param>
 /// <param name="html">HTML markup to parse.</param>
 public static void LoadHtml(this XmlDocument doc, string html)
 {
     // The reader is only needed for the duration of the call, so dispose it afterwards.
     // NOTE(review): assumes XHtmlLoader.LoadHtml finishes reading before it returns - confirm.
     using (StringReader reader = new StringReader(html))
     {
         XHtmlLoader.LoadHtml(doc, reader, new ParserOptions());
     }
 }