/// <summary>
/// Parses a set of anchors whose attributes use irregular syntax (stray slashes,
/// colons inside names, unquoted values), prints the formatted result, and hands
/// the document plus the original markup to LinqCompare.
/// </summary>
public void AttributeParsing()
{
    string markup = @" <html> <body> <a id=01 /class/red >class red</a> <a id=02 /class/= /red >class = red</a> <a id=03 class// / = /red >class = red</a> <a id=04 class / = /red >class = red</a> <a id=05 class = /red >class='/red'</a> <a id=06 class= /red >class='/red'</a> <a id=07 clas:s= red >clas:s='red'</a> <a id=08 class:= red >class:='red'</a> <a id=09 class= 'red'/ >class='red'</a> <a id=10 class=red/ >class='red/'</a> <a id=11 class=red/>class='red/'</a> </body> </html>";

    XmlDocument parsed = XHtmlLoader.LoadHtml(markup);
    string formatted = ToFormattedString(parsed);
    Console.WriteLine(formatted);

    // Linq check
    LinqCompare(parsed, markup);
}
/// <summary>
/// Checks how void elements (br, img, hr) and a "&lt;p/&gt;" written with a
/// trailing slash come out of the parser.
/// </summary>
public void SelfClosingTags()
{
    string markup = @" <html><body> <h1>Hello World</h1> Some text <br> Some more text <img src='foobar.jpg'> more text <hr><a>foo</a> <p/> non self-closing </p> </body></html>";

    XmlDocument parsed = XHtmlLoader.LoadHtml(markup);
    Console.WriteLine(parsed.OuterXml);

    // <img>: keeps its src attribute and has no child nodes
    XmlNode img = parsed.SelectSingleNode("//img");
    string imgSource = img.Attributes["src"].Value;
    Assert.AreEqual("foobar.jpg", imgSource);
    Assert.IsNull(img.FirstChild);

    // <br>: no attributes, no child nodes
    XmlNode lineBreak = parsed.SelectSingleNode("//br");
    int lineBreakAttributeCount = lineBreak.Attributes.Count;
    Assert.AreEqual(0, lineBreakAttributeCount);
    Assert.IsNull(lineBreak.FirstChild);

    // <hr>: no attributes, no child nodes
    XmlNode rule = parsed.SelectSingleNode("//hr");
    int ruleAttributeCount = rule.Attributes.Count;
    Assert.AreEqual(0, ruleAttributeCount);
    Assert.IsNull(rule.FirstChild);

    // <p/>: not treated as self-closing, so the following text becomes its child
    XmlNode paragraph = parsed.SelectSingleNode("//p");
    string paragraphText = paragraph.FirstChild.InnerText;
    Assert.AreEqual(" non self-closing ", paragraphText);
}
/// <summary>
/// Loads anchors with protocol-relative, root-relative, document-relative,
/// absolute and parent-relative hrefs using a BaseUrl, and checks each href
/// in the resulting document against its expected fully qualified form.
/// </summary>
public void FullyQualifyUrls()
{
    string markup = @" <html><body> <a id='1' href='//foobar.com'>hello</a> <a id='2' href='/helloworld.html'>hello</a> <a id='3' href='helloworld.html'>hello</a> <a id='4' href='http://blah.com/helloworld.html'>hello</a> <a id='5' href='../helloworld.html'>hello</a> <a id='6' href='/wiki/Wikipedia:Introduction'>hello</a> </body></html>";

    ParserOptions parserOptions = new ParserOptions();
    parserOptions.BaseUrl = "http://www.foobar.com/products/cat1/someprod.html";

    XmlDocument parsed = XHtmlLoader.LoadHtml(markup, parserOptions);
    Console.WriteLine(parsed.OuterXml);

    // Anchor id -> href expected after qualification, checked in document order
    string[][] expectations =
    {
        new string[] { "1", "http://foobar.com/" },
        new string[] { "2", "http://www.foobar.com/helloworld.html" },
        new string[] { "3", "http://www.foobar.com/products/cat1/helloworld.html" },
        new string[] { "4", "http://blah.com/helloworld.html" },
        new string[] { "5", "http://www.foobar.com/products/helloworld.html" },
        new string[] { "6", "http://www.foobar.com/wiki/Wikipedia:Introduction" }
    };

    foreach (string[] expectation in expectations)
    {
        string anchorId = expectation[0];
        string expectedHref = expectation[1];
        XmlNode hrefNode = parsed.SelectSingleNode("//a[@id='" + anchorId + "']/@href");
        Assert.AreEqual(expectedHref, hrefNode.Value);
    }

    // Linq check
    LinqCompare(parsed, markup, parserOptions);
}
/// <summary>
/// Round-trips the sample multi-byte document through a file written with a
/// given encoding: parses the in-memory sample, saves it to the SampleData
/// directory using <paramref name="encoding"/>, reloads it through
/// XHtmlLoader.LoadWebPageAsync with encoding detection enabled, and asserts
/// that the body of both documents is identical.
/// </summary>
/// <param name="fileName">Name of the file to create inside the SampleData directory.</param>
/// <param name="encoding">Encoding used when writing the file to disk.</param>
/// <param name="defaultEncoding">Value assigned to LoaderOptions.DefaultEncoding for the reload.</param>
public void TestReadingEncodedFile(string fileName, System.Text.Encoding encoding, System.Text.Encoding defaultEncoding)
{
    // Set some options, so that we can know if things are working...
    LoaderOptions loaderOptions = new LoaderOptions();
    loaderOptions.DetectEncoding = true;
    loaderOptions.DefaultEncoding = defaultEncoding;
    loaderOptions.ParserOptions.IncludeMetaData = true;

    // Load multi-byte html file into memory
    XmlDocument doc = XHtmlLoader.LoadHtml(_sampleMultiByteHtml);

    // Ensure Sample directory exists. Path.Combine picks the platform's
    // separator instead of a hard-coded backslash, and CreateDirectory is a
    // no-op when the directory is already there.
    string projectRoot = new DirectoryInfo(AssemblyDirectory).Parent.Parent.Parent.FullName;
    string sampleDir = Path.Combine(projectRoot, "SampleData");
    Directory.CreateDirectory(sampleDir);

    // Create Encoded file (disposing the writer also closes the file stream)
    string fullName = Path.Combine(sampleDir, fileName);
    using (TextWriter sw = new StreamWriter(File.Create(fullName), encoding))
    {
        doc.Save(sw);
    }

    // Re-load into memory
    XmlDocument doc2 = XHtmlLoader.LoadWebPageAsync("file://" + fullName, loaderOptions).Result;
    Console.WriteLine("Reading file: " + fileName);
    Console.WriteLine(doc2.OuterXml);

    Assert.AreEqual(doc.SelectSingleNode("//body").OuterXml, doc2.SelectSingleNode("//body").OuterXml);
}
/// <summary>
/// Prints the parsed form of _testHTML, then runs a one-row query selecting
/// //body/div/@class through QueryEngine.SelectOnHtml and checks the InnerXml
/// of the result.
/// </summary>
public void XML_Characters_InResults()
{
    XmlDocument parsed = XHtmlLoader.LoadHtml(_testHTML);
    string formatted = ToFormattedString(parsed);
    Console.WriteLine(formatted);

    string rowQuery = "<row xpath='//body/div/@class'></row>";
    string actualXml = QueryEngine.SelectOnHtml(_testHTML, rowQuery).InnerXml;

    Assert.AreEqual("<row>This & is a test</row>", actualXml);
}
/// <summary>
/// Sample entry point: parses an inline HTML string and prints its OuterXml,
/// then loads http://wikipedia.org and prints the text of its title element.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // Html string -> XmlDocument
    string inlineHtml = "<html><head><title>Hello World!</title><body><h1>Hello World</h1><p>This is a test</body>";
    XmlDocument fromString = XHtmlLoader.LoadHtml(inlineHtml);
    Console.WriteLine("OuterXml is: " + fromString.OuterXml);

    // Web page -> XmlDocument (blocks until the download has finished)
    XmlDocument fromWeb = XHtmlLoader.LoadWebPageAsync("http://wikipedia.org").Result;
    XmlNode titleNode = fromWeb.SelectSingleNode("//title");
    string pageTitle = titleNode.InnerText;
    Console.WriteLine("Title is: " + pageTitle);
}
/// <summary>
/// Loads a single self-closed element carrying three unquoted attributes as a
/// fragment and checks that every attribute value is available on the node.
/// </summary>
public void AttributeParsingMultiple()
{
    string fragment = @"<settings id=01 class=red foo=bar />";

    XmlDocument target = new XmlDocument();
    XHtmlLoader.LoadHtmlFragment(target, fragment);
    Console.WriteLine(ToFormattedString(target));

    XmlNode settings = target.SelectSingleNode("//settings");
    XmlAttributeCollection attributes = settings.Attributes;

    Assert.AreEqual("01", attributes["id"].Value);
    Assert.AreEqual("red", attributes["class"].Value);
    Assert.AreEqual("bar", attributes["foo"].Value);
}
/// <summary>
/// Smallest round trip: parses a minimal page, prints its OuterXml and passes
/// the document and markup to LinqCompare.
/// </summary>
public void HelloWorldBasicTest()
{
    string markup = @" <html><body> <h1>Hello World</h1> </body></html>";

    XmlDocument parsed = XHtmlLoader.LoadHtml(markup);
    Console.WriteLine(parsed.OuterXml);

    // Linq check
    LinqCompare(parsed, markup);
}
/// <summary>
/// Parses a page that has a &lt;head&gt; element inside a paragraph in the body
/// and checks that no head element ends up as a direct child of body.
/// </summary>
public void HeadTagInsideBodyTag()
{
    string markup = @" <html><body> <h1>Hello World</h1><p> para <head>somehead</head> end para </p> </body></html>";

    XmlDocument parsed = XHtmlLoader.LoadHtml(markup);
    Console.WriteLine(parsed.OuterXml);

    // The <head> found inside the body must be ignored
    XmlNode headUnderBody = parsed.SelectSingleNode("//body/head");
    Assert.IsNull(headUnderBody);

    // Linq check
    LinqCompare(parsed, markup);
}
/// <summary>
/// Parses <paramref name="html"/> with XHtmlLoader and, when an XPath is given,
/// collects the InnerText of every node it selects from the document element.
/// </summary>
/// <param name="html">Markup to parse.</param>
/// <param name="xpath">XPath evaluated against the document element; null skips the search.</param>
/// <returns>The InnerText of each matched node, or an empty array when <paramref name="xpath"/> is null.</returns>
static string[] XHtmlKit_ParseAndSearch(string html, string xpath = null)
{
    List<string> matches = new List<string>();

    // The document is always parsed, even when there is nothing to search for
    XmlDocument parsed = XHtmlLoader.LoadHtml(html);
    if (xpath == null)
    {
        return matches.ToArray();
    }

    XmlNodeList selected = parsed.DocumentElement.SelectNodes(xpath);
    foreach (XmlNode hit in selected)
    {
        matches.Add(hit.InnerText);
    }

    return matches.ToArray();
}
/// <summary>
/// Sample scraper: downloads one page of the CodeProject "latest articles"
/// listing and converts every article row into an <c>Article</c>.
/// </summary>
/// <param name="pageNum">Page number appended to the listing url as <c>pgnum</c>.</param>
/// <returns>One Article per matched table row; fields that are missing in the page are empty strings.</returns>
public static async Task<Article[]> GetCodeProjectArticlesAsync(int pageNum = 1)
{
    // Get web page as an XHtml document using XHtmlKit
    string listingUrl = "https://www.codeproject.com/script/Articles/Latest.aspx?pgnum=" + pageNum;
    XmlDocument listing = await XHtmlLoader.LoadWebPageAsync(listingUrl);

    // Article rows are anchored on a table with a robust @class attribute
    XmlNodeList rows = listing.SelectNodes("//table[contains(@class,'article-list')]/tr[@valign]");

    List<Article> collected = new List<Article>();
    foreach (XmlNode row in rows)
    {
        // Any of these lookups can come back null when the row lacks that field
        XmlNode categoryNode = row.SelectSingleNode("./td[1]//a/text()");
        XmlNode titleNode = row.SelectSingleNode(".//div[@class='title']/a/text()");
        XmlNode dateNode = row.SelectSingleNode(".//div[contains(@class,'modified')]/text()");
        XmlNode ratingNode = row.SelectSingleNode(".//div[contains(@class,'rating-stars')]/@title");
        XmlNode descriptionNode = row.SelectSingleNode(".//div[@class='description']/text()");
        XmlNode authorNode = row.SelectSingleNode(".//div[contains(@class,'author')]/text()");

        // Comma-separated tag list; the separator is only written once the
        // buffer already holds some text
        StringBuilder tagList = new StringBuilder();
        foreach (XmlNode tagText in row.SelectNodes(".//div[@class='t']/a/text()"))
        {
            if (tagList.Length > 0)
            {
                tagList.Append(",");
            }
            tagList.Append(tagText.Value);
        }

        // Create the data structure we want
        Article item = new Article();
        item.Category = categoryNode == null ? string.Empty : categoryNode.Value;
        item.Title = titleNode == null ? string.Empty : titleNode.Value;
        item.Author = authorNode == null ? string.Empty : authorNode.Value;
        item.Description = descriptionNode == null ? string.Empty : descriptionNode.Value;
        item.Rating = ratingNode == null ? string.Empty : ratingNode.Value;
        item.Date = dateNode == null ? string.Empty : dateNode.Value;
        item.Tags = tagList.ToString();

        collected.Add(item);
    }

    return collected.ToArray();
}
/// <summary>
/// Parses mis-nested formatting tags (&lt;b&gt;&lt;i&gt;...&lt;/b&gt;&lt;/i&gt;)
/// and checks that the resulting b element properly wraps the i element.
/// </summary>
public void FormattingTags()
{
    string markup = @" <html><body> <h1>Hello World</h1> Some text <b><i>italics</b></i> </body></html>";

    XmlDocument parsed = XHtmlLoader.LoadHtml(markup);
    Console.WriteLine(parsed.OuterXml);

    // <i> has to end up nested inside <b>
    XmlNode bold = parsed.SelectSingleNode("//b");
    string boldXml = bold.OuterXml;
    Assert.AreEqual("<b><i>italics</i></b>", boldXml);

    // Linq check
    LinqCompare(parsed, markup);
}
/// <summary>
/// Loads markup containing html and body tags as a fragment into an empty
/// document and checks that the div becomes the document element.
/// </summary>
public void LoadHtmlFragment()
{
    string fragment = @" <html> <div> <h1>Hello World</h1> <body foo='bar'> </div></html>";

    XmlDocument target = new XmlDocument();
    XHtmlLoader.LoadHtmlFragment(target, fragment);
    Console.WriteLine(target.OuterXml);

    // Fragment loading must not insert html, head or body nodes
    string rootName = target.DocumentElement.Name;
    Assert.AreEqual("div", rootName);
}
/// <summary>
/// Parses an anchor whose tag name runs straight into its href ("&lt;ahref=...")
/// and checks the encoded element name produced for it.
/// </summary>
public void TagWithInvalidXmlChar()
{
    string markup = @" <html> <body> <ahref='http://yyk.familydoctor.com.cn/1389/' target='_blank'>南方医科大学南方医院</a> </body> </html>";

    XmlDocument parsed = XHtmlLoader.LoadHtml(markup);
    Console.WriteLine(ToFormattedString(parsed));

    // Characters that are not valid in an XML name show up as _xHHHH_ escapes
    XmlNode body = parsed.SelectSingleNode("//body");
    string firstChildName = body.FirstChild.Name;
    Assert.AreEqual("ahref_x003D__x0027_http_x003A_", firstChildName);

    // Linq check
    LinqCompare(parsed, markup);
}
/// <summary>
/// Parses markup that has no opening html or head tag and checks that the
/// document still comes out as html / head / title.
/// </summary>
public void NoOuterHtmlTag()
{
    string markup = @" <title>a title</title> <body> <h1>Hello World</h1> Some_ _text > hello > © © <b><i>italics</b></i> </body></html>";

    XmlDocument parsed = XHtmlLoader.LoadHtml(markup);
    Console.WriteLine(ToFormattedString(parsed));

    // Walk down the first-child chain: html -> head -> title
    XmlElement root = parsed.DocumentElement;
    Assert.IsTrue(root.Name == "html");
    Assert.IsTrue(root.FirstChild.Name == "head");
    Assert.IsTrue(root.FirstChild.FirstChild.Name == "title");

    // Linq check
    LinqCompare(parsed, markup);
}
/// <summary>
/// Parses a title opened as &lt;title&gt; and closed as &lt;/TITLE&gt; and
/// checks that the title text is extracted intact.
/// </summary>
public void TitleRCDataParsingCaps()
{
    string markup = @" <html><head> <title>This is a Title!</TITLE> </head> <body> <h1>Hello World</h1> </body></html>";

    XmlDocument parsed = XHtmlLoader.LoadHtml(markup);
    Console.WriteLine(parsed.OuterXml);

    // The closing </TITLE> has to be matched case-insensitively
    XmlNode titleText = parsed.SelectSingleNode("//title/text()");
    Assert.AreEqual("This is a Title!", titleText.Value);

    // Linq check
    LinqCompare(parsed, markup);
}
/// <summary>
/// Parses a page with a second &lt;html&gt; tag inside the body and checks how
/// its attributes are merged onto the root html element.
/// </summary>
public void HtmlTagWithAttributes()
{
    string markup = @" <html lang='en'><body> <h1>Hello World</h1><html lang='fr' style='green'> </body></html>";

    XmlDocument parsed = XHtmlLoader.LoadHtml(markup);
    Console.WriteLine(ToFormattedString(parsed));

    // The nested <html> tag only contributes attributes to the root element:
    // 'lang' is already set and keeps its first value, while the new 'style'
    // attribute is added.
    XmlElement root = parsed.DocumentElement;
    Assert.IsTrue(root.Name == "html");
    Assert.AreEqual("en", root.Attributes["lang"].Value);
    Assert.AreEqual("green", root.Attributes["style"].Value);

    // Linq check
    LinqCompare(parsed, markup);
}
/// <summary>
/// Parses a page whose body tag carries a class attribute (after a head that
/// contains a script) and checks that the attribute is kept on the body node.
/// </summary>
public void BodyTagwithAttributes()
{
    string markup = @" <html> <head> <script src='/js/mobileredirect.js'></script> </head> <body class='foo'> <h1>Hello World</h1> </body> </html>";

    XmlDocument parsed = XHtmlLoader.LoadHtml(markup);
    Console.WriteLine(parsed.OuterXml);

    XmlNode body = parsed.SelectSingleNode("//body");
    string bodyClass = body.Attributes["class"].Value;
    Assert.AreEqual("foo", bodyClass);

    // Linq check
    LinqCompare(parsed, markup);
}
/// <summary>
/// Loads a page with a BaseUrl but with FullyQualifyUrls switched off and
/// checks that a root-relative href is left exactly as written.
/// </summary>
public void FullyQualifyUrls2()
{
    string markup = @" <html> <body> <a href='/bakery/bread.html'>Bread</a> </body> </html>";

    ParserOptions parserOptions = new ParserOptions();
    parserOptions.BaseUrl = "http://foobar.com";
    parserOptions.FullyQualifyUrls = false;

    XmlDocument parsed = XHtmlLoader.LoadHtml(markup, parserOptions);
    Console.WriteLine(parsed.OuterXml);

    // The href must stay relative because qualification is disabled
    XmlNode href = parsed.SelectSingleNode("//a/@href");
    Assert.AreEqual("/bakery/bread.html", href.Value, "Ensure we don't fully qualify. We set flag to false");

    // Linq check
    LinqCompare(parsed, markup, parserOptions);
}
/// <summary>
/// Parses a head whose meta tag is written as if it had children and checks
/// where the nested anchor ends up.
/// </summary>
public void HeadTagWithNestedTags()
{
    string markup = @" <html> <head> <meta> <a> foobar </a> blah </meta> </head> <body> <h1>Hello World</h1> </body></html>";

    XmlDocument parsed = XHtmlLoader.LoadHtml(markup);
    Console.WriteLine(parsed.OuterXml);

    // meta is self-closing, so nothing may be nested under it
    XmlNode anchorUnderMeta = parsed.SelectSingleNode("//head/meta/a");
    Assert.IsNull(anchorUnderMeta);

    // The <a> tag belongs under the body instead
    XmlNode anchorUnderBody = parsed.SelectSingleNode("//body/a");
    Assert.IsNotNull(anchorUnderBody);

    // Linq check
    LinqCompare(parsed, markup);
}
/// <summary>
/// Parses a script block whose JavaScript contains the text "&lt;table&gt;" and
/// checks that it is still part of the script's text node.
/// </summary>
public void ScriptRCDataParsing()
{
    string markup = @" <html><body> <h1>Hello World</h1><script> ga('create', 'UA-40765809-1', { 'allowLinker': true, 'cookiePath': '/finance' }); ga('send', 'pageview<table>'); </script> </body></html>";

    XmlDocument parsed = XHtmlLoader.LoadHtml(markup);
    Console.WriteLine(parsed.OuterXml);

    // The <table> inside the script must be kept as raw script text
    XmlNode scriptText = parsed.SelectSingleNode("//script/text()");
    bool keepsTableText = scriptText.Value.Contains("<table>");
    Assert.IsTrue(keepsTableText);

    // Linq check
    LinqCompare(parsed, markup);
}
/// <summary>
/// Parses a page that has a comment before the html tag and loose text after
/// the closing html tag, and checks where both end up in the document.
/// </summary>
public void BeforeAndAfterFragments()
{
    string markup = @" <!----- some comment -----> <html> <body> <h1>Hello World</h1> <p>Some_ _text > hello > © © <b><i>italics</b></i> qrs </p> </body> </html>some after text";

    XmlDocument parsed = XHtmlLoader.LoadHtml(markup);
    Console.WriteLine(parsed.OuterXml);

    // The leading comment is the first node of the document...
    Assert.IsTrue(parsed.FirstChild.NodeType == XmlNodeType.Comment);

    // ...and the trailing text is the last child of the body
    XmlNode body = parsed.SelectSingleNode("//body");
    Assert.AreEqual("some after text", body.LastChild.InnerText);

    // Linq check
    LinqCompare(parsed, markup);
}
/// <summary>
/// Loads a fragment underneath an existing "foo" element and checks that the
/// parent is unchanged and receives exactly three child nodes.
/// </summary>
public void LoadHtmlFragmentInSubElem()
{
    string fragment = @" <div> <h1>Hello World</h1> <body foo='bar'> </div> <p>hello</p> </foo>abc";

    XmlDocument owner = new XmlDocument();
    XmlElement parent = owner.CreateElement("foo");
    owner.AppendChild(parent);

    XHtmlLoader.LoadHtmlFragment(parent, fragment);
    Console.WriteLine(ToFormattedString(owner));

    // No html, head or body nodes may be inserted, the fragment must load, and
    // the stray </foo> in the fragment must not close our root node.
    Assert.AreEqual("foo", parent.Name);
    int childCount = parent.ChildNodes.Count;
    Assert.AreEqual(3, childCount);
}
/// <summary>
/// Extension method: parses an HTML string into <paramref name="doc"/> by
/// forwarding to XHtmlLoader.LoadHtml with a new ParserOptions instance.
/// </summary>
/// <param name="doc">Document that receives the parsed content.</param>
/// <param name="html">HTML text to parse.</param>
public static void LoadHtml(this XmlDocument doc, string html)
{
    StringReader htmlReader = new StringReader(html);
    ParserOptions defaultOptions = new ParserOptions();
    XHtmlLoader.LoadHtml(doc, htmlReader, defaultOptions);
}
/// <summary>
/// Extension method: loads the page at <paramref name="url"/> into
/// <paramref name="doc"/> by awaiting XHtmlLoader.LoadWebPageAsync with a new
/// LoaderOptions instance.
/// </summary>
/// <param name="doc">Document that receives the loaded page.</param>
/// <param name="url">Address of the page to load.</param>
/// <returns>A task that completes when the underlying load has completed.</returns>
public static async Task LoadWebPageAsync(this XmlDocument doc, string url)
{
    LoaderOptions defaultOptions = new LoaderOptions();
    await XHtmlLoader.LoadWebPageAsync(doc, url, defaultOptions);
}
/// <summary>
/// Extension method: parses an HTML fragment underneath <paramref name="node"/>
/// by forwarding to XHtmlLoader.LoadHtmlFragment with a new ParserOptions
/// instance.
/// </summary>
/// <param name="node">Node that receives the parsed fragment.</param>
/// <param name="html">HTML fragment text to parse.</param>
public static void LoadHtmlFragment(this XmlNode node, string html)
{
    StringReader fragmentReader = new StringReader(html);
    ParserOptions defaultOptions = new ParserOptions();
    XHtmlLoader.LoadHtmlFragment(node, fragmentReader, defaultOptions);
}
/// <summary>
/// Simple breadth-first crawler built on XHtmlKit. Reads crawler settings from
/// the command line, loads every queued url as an XHtml document, optionally
/// saves each document as an .xml file below the output directory, and queues
/// the links found on each page until the configured depth is reached.
/// </summary>
/// <param name="args">
/// Command-line arguments. They are joined into a single &lt;settings ... /&gt;
/// element and parsed with XHtmlKit; the attributes read are url, depth,
/// urlfilter, outputdir, encoding and includemetadata.
/// </param>
/// <exception cref="UriFormatException">The configured root url is not a valid absolute url.</exception>
static void Crawl(string[] args)
{
    // Start from the defaults provided by Settings
    Settings crawlerSettings = new Settings();

    // Get command-line settings - use XHtmlKit parser. Why not.
    if (args.Length > 0)
    {
        string settingsHtml = "<settings " + string.Join(" ", args) + " />";
        XmlDocument settingsDoc = new XmlDocument();

        // NOTE(review): the whole command line is lower-cased before parsing, so
        // attribute values (url, outputdir, ...) lose their original casing too.
        XHtmlLoader.LoadHtmlFragment(settingsDoc, settingsHtml.ToLower());
        XmlElement settings = settingsDoc.DocumentElement;

        crawlerSettings.Url = GetSettingOrDefault(settings, "url", crawlerSettings.Url);
        crawlerSettings.UrlFilter = GetSettingOrDefault(settings, "urlfilter", crawlerSettings.UrlFilter);
        crawlerSettings.OutputDir = GetSettingOrDefault(settings, "outputdir", crawlerSettings.OutputDir);
        crawlerSettings.Encoding = GetSettingOrDefault(settings, "encoding", crawlerSettings.Encoding);
        crawlerSettings.IncludeMetaData = (settings.Attributes["includemetadata"] != null);

        // A malformed depth keeps the default instead of aborting the crawl
        XmlAttribute depthAttribute = settings.Attributes["depth"];
        if (depthAttribute != null)
        {
            int depth;
            if (int.TryParse(depthAttribute.Value, out depth))
            {
                crawlerSettings.Depth = depth;
            }
            else
            {
                Console.WriteLine("Ignoring invalid depth '" + depthAttribute.Value + "', using " + crawlerSettings.Depth);
            }
        }
    }

    // Gather HtmlParser and HtmlClient options
    LoaderOptions xhtmlLoaderOptions = new LoaderOptions();
    xhtmlLoaderOptions.ParserOptions.IncludeMetaData = crawlerSettings.IncludeMetaData;
    xhtmlLoaderOptions.DetectEncoding = (crawlerSettings.Encoding == null);
    if (crawlerSettings.Encoding != null)
    {
        xhtmlLoaderOptions.DefaultEncoding = System.Text.Encoding.GetEncoding(crawlerSettings.Encoding);
    }

    // 'todo' queue plus the set of urls that have already been queued. Marking
    // a url when it is queued (not when it is dequeued) keeps a link that
    // appears on several pages from being crawled more than once.
    Queue<Link> urlsToCrawl = new Queue<Link>();
    HashSet<string> urlsSeen = new HashSet<string>(StringComparer.OrdinalIgnoreCase);

    // Fail fast on an unusable root url
    new Uri(crawlerSettings.Url);

    // Add the root url to the todo list
    urlsToCrawl.Enqueue(new Link { Url = crawlerSettings.Url });
    urlsSeen.Add(crawlerSettings.Url);
    Console.WriteLine("Crawling Url = " + crawlerSettings.Url + " Depth = " + crawlerSettings.Depth + " OutputDir = '" + crawlerSettings.OutputDir + "' UrlFilter = '" + crawlerSettings.UrlFilter + "'");

    // Crawl all urls on the 'todo' list
    while (urlsToCrawl.Count > 0)
    {
        Link currentUrl = urlsToCrawl.Dequeue();
        Console.Write(currentUrl.Url);

        // Crawl the Url using XHtmlKit
        XmlDocument xhtmlDoc;
        try
        {
            xhtmlDoc = XHtmlLoader.LoadWebPageAsync(currentUrl.Url, xhtmlLoaderOptions).Result;
            Console.WriteLine(", [OK]");
        }
        catch (Exception ex)
        {
            // Blocking on .Result wraps failures; report the inner cause when
            // there is one, otherwise the exception's own message.
            Exception cause = ex.InnerException ?? ex;
            Console.WriteLine(", [Error], " + cause.Message);
            continue;
        }

        // Get title from the XHtml document
        XmlNode title = xhtmlDoc.SelectSingleNode("//title");
        if (title != null)
        {
            currentUrl.PageTitle = title.InnerText.Trim();
        }

        // Save the XHtml file to disk (best effort)
        SaveCrawledPage(xhtmlDoc, currentUrl.Url, crawlerSettings.OutputDir);

        // If we are at the Max Depth, we won't crawl deeper
        if (currentUrl.Depth >= crawlerSettings.Depth)
        {
            continue;
        }

        // Base used to turn relative hrefs into absolute urls
        Uri currentPageUri;
        bool canResolveRelative = Uri.TryCreate(currentUrl.Url, UriKind.Absolute, out currentPageUri);

        // Add sub-links to the 'todo' list
        int numSubLinks = 0;
        foreach (XmlNode subLinkElem in xhtmlDoc.SelectNodes("//a"))
        {
            // Don't add empty links
            XmlAttribute href = subLinkElem.Attributes["href"];
            if (href == null || string.IsNullOrWhiteSpace(href.InnerText))
            {
                continue;
            }

            // Get the sub-link
            string sublink = href.InnerText;
            if (!Uri.IsWellFormedUriString(sublink, UriKind.RelativeOrAbsolute))
            {
                continue;
            }

            // Relative links are resolved against the current page; links that
            // cannot be made absolute are skipped rather than throwing.
            Uri subUri;
            if (!Uri.TryCreate(sublink, UriKind.Absolute, out subUri))
            {
                if (!canResolveRelative || !Uri.TryCreate(currentPageUri, sublink, out subUri))
                {
                    continue;
                }
                sublink = subUri.AbsoluteUri;
            }

            // Don't add links that don't match the UrlFilter
            if (!string.IsNullOrWhiteSpace(crawlerSettings.UrlFilter) && !sublink.Contains(crawlerSettings.UrlFilter))
            {
                continue;
            }

            // Don't add links that we have already crawled or queued...
            if (!urlsSeen.Add(sublink))
            {
                continue;
            }

            // Add the sub-link
            urlsToCrawl.Enqueue(new Link { Url = sublink, LinkText = subLinkElem.InnerText.Trim(), Depth = (currentUrl.Depth + 1) });
            numSubLinks++;
        }
        currentUrl.SubLinks = numSubLinks;

        // Todo - put the currentUrl metadata somewhere...
    }
}

/// <summary>
/// Returns the trimmed value of the named attribute, or <paramref name="fallback"/>
/// when the attribute is missing or its value is null, empty or white space.
/// </summary>
private static string GetSettingOrDefault(XmlElement settings, string name, string fallback)
{
    XmlAttribute attribute = settings.Attributes[name];
    if (attribute == null || string.IsNullOrWhiteSpace(attribute.Value))
    {
        return fallback;
    }
    return attribute.Value.Trim();
}

/// <summary>
/// Saves a crawled document as "&lt;outputRoot&gt;/&lt;host&gt;/&lt;path-and-query&gt;.xml".
/// Does nothing when no output directory is configured; any failure is
/// reported on the console and otherwise ignored so the crawl can continue.
/// </summary>
private static void SaveCrawledPage(XmlDocument xhtmlDoc, string url, string outputRoot)
{
    if (string.IsNullOrWhiteSpace(outputRoot))
    {
        return;
    }

    try
    {
        Uri pageUri = new Uri(url.ToLower());
        string fileName = pageUri.PathAndQuery.Trim();

        // Replace invalid characters
        foreach (char invalid in System.IO.Path.GetInvalidFileNameChars())
        {
            fileName = fileName.Replace(invalid, '.');
        }
        fileName = fileName.Trim('.');

        // Set default file name
        if (string.IsNullOrWhiteSpace(fileName))
        {
            fileName = "default";
        }

        // Add xml extension
        fileName = fileName + ".xml";

        // Ensure output directory exists (CreateDirectory is a no-op if it does)
        string outputDir = System.IO.Path.Combine(outputRoot, pageUri.Host);
        System.IO.Directory.CreateDirectory(outputDir);

        // Save file
        xhtmlDoc.Save(System.IO.Path.Combine(outputDir, fileName));
    }
    catch (Exception ex)
    {
        Console.WriteLine("Error saving document: " + ex.Message);
    }
}