public void TestReadingEncodedFile(string fileName, System.Text.Encoding encoding, System.Text.Encoding defaultEncoding)
{
    // Set some options, so that we can know if things are working...
    LoaderOptions loaderOptions = new LoaderOptions();
    loaderOptions.DetectEncoding = true;
    loaderOptions.DefaultEncoding = defaultEncoding;
    loaderOptions.ParserOptions.IncludeMetaData = true;

    // Load multi-byte html file into memory
    XmlDocument doc = XHtmlLoader.LoadHtml(_sampleMultiByteHtml);

    // Ensure Sample directory exists
    string sampleDir = (new DirectoryInfo(AssemblyDirectory)).Parent.Parent.Parent.FullName + "\\SampleData\\";
    if (!Directory.Exists(sampleDir))
    {
        Directory.CreateDirectory(sampleDir);
    }

    // Create the encoded file
    string fullName = sampleDir + fileName;
    using (TextWriter sw = new StreamWriter(File.Create(fullName), encoding))
    {
        doc.Save(sw);
    }

    // Re-load into memory
    XmlDocument doc2 = XHtmlLoader.LoadWebPageAsync("file://" + fullName, loaderOptions).Result;

    Console.WriteLine("Reading file: " + fileName);
    Console.WriteLine(doc2.OuterXml);
    Assert.AreEqual(doc.SelectSingleNode("//body").OuterXml, doc2.SelectSingleNode("//body").OuterXml);
}
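The test depends on an `AssemblyDirectory` helper and a `_sampleMultiByteHtml` field that aren't shown above. A minimal sketch of what they might look like, assuming the usual reflection-based pattern for locating the test assembly; the sample markup here is purely illustrative:

// Hypothetical helpers assumed by the test above - not part of XHtmlKit itself.
// Any multi-byte content works; this sample is illustrative only.
private static string _sampleMultiByteHtml =
    "<html><head><title>今日は世界</title></head><body><h1>今日は世界</h1></body></html>";

public static string AssemblyDirectory
{
    get
    {
        // Directory of the currently executing test assembly
        string location = System.Reflection.Assembly.GetExecutingAssembly().Location;
        return System.IO.Path.GetDirectoryName(location);
    }
}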
static void Main(string[] args)
{
    // Load an Html string into an XmlDocument
    XmlDocument doc1 = XHtmlLoader.LoadHtml("<html><head><title>Hello World!</title><body><h1>Hello World</h1><p>This is a test</body>");
    Console.WriteLine("OuterXml is: " + doc1.OuterXml);

    // Load a web page into an XmlDocument
    XmlDocument doc2 = XHtmlLoader.LoadWebPageAsync("http://wikipedia.org").Result;
    string title = doc2.SelectSingleNode("//title").InnerText;
    Console.WriteLine("Title is: " + title);
}
/// <summary>
/// Sample scraper
/// </summary>
public static async Task<Article[]> GetCodeProjectArticlesAsync(int pageNum = 1)
{
    List<Article> results = new List<Article>();

    // Get the web page as an XHtml document using XHtmlKit
    string url = "https://www.codeproject.com/script/Articles/Latest.aspx?pgnum=" + pageNum;
    XmlDocument page = await XHtmlLoader.LoadWebPageAsync(url);

    // Select all articles, anchoring on a table node with a robust @class attribute
    var articles = page.SelectNodes("//table[contains(@class,'article-list')]/tr[@valign]");

    // Get each article
    foreach (XmlNode a in articles)
    {
        // Extract article data - be aware that some fields may have no results
        var category = a.SelectSingleNode("./td[1]//a/text()");
        var title = a.SelectSingleNode(".//div[@class='title']/a/text()");
        var date = a.SelectSingleNode(".//div[contains(@class,'modified')]/text()");
        var rating = a.SelectSingleNode(".//div[contains(@class,'rating-stars')]/@title");
        var desc = a.SelectSingleNode(".//div[@class='description']/text()");
        var author = a.SelectSingleNode(".//div[contains(@class,'author')]/text()");

        // Gather the article's tags into a comma-separated list
        XmlNodeList tagNodes = a.SelectNodes(".//div[@class='t']/a/text()");
        StringBuilder tags = new StringBuilder();
        foreach (XmlNode tagNode in tagNodes)
        {
            tags.Append((tags.Length > 0 ? "," : "") + tagNode.Value);
        }

        // Create the data structure we want
        Article article = new Article
        {
            Category = category != null ? category.Value : string.Empty,
            Title = title != null ? title.Value : string.Empty,
            Author = author != null ? author.Value : string.Empty,
            Description = desc != null ? desc.Value : string.Empty,
            Rating = rating != null ? rating.Value : string.Empty,
            Date = date != null ? date.Value : string.Empty,
            Tags = tags.ToString()
        };

        // Add to results
        results.Add(article);
    }

    return results.ToArray();
}
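The `Article` class itself isn't shown above. A minimal sketch, with the property names taken directly from the object initializer in the scraper (all plain strings):

// Minimal sketch of the Article data structure assumed by the scraper above
public class Article
{
    public string Category { get; set; }
    public string Title { get; set; }
    public string Author { get; set; }
    public string Description { get; set; }
    public string Rating { get; set; }
    public string Date { get; set; }
    public string Tags { get; set; }
}

Calling the scraper is then straightforward; for example, from an async context:

// Fetch page 1 of the latest articles and print the titles
Article[] articles = await GetCodeProjectArticlesAsync(1);
foreach (Article article in articles)
    Console.WriteLine(article.Title);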
static void Crawl(string[] args)
{
    // Get crawler settings from the command-line
    Settings crawlerSettings = new Settings();

    // Parse command-line settings - use the XHtmlKit parser. Why not.
    if (args.Length > 0)
    {
        string settingsHtml = "<settings " + string.Join(" ", args) + " />";
        XmlDocument settingsDoc = new XmlDocument();
        XHtmlLoader.LoadHtmlFragment(settingsDoc, settingsHtml.ToLower());
        XmlElement settings = settingsDoc.DocumentElement;

        crawlerSettings.Url = (settings.Attributes["url"] != null && !string.IsNullOrWhiteSpace(settings.Attributes["url"].Value)) ? settings.Attributes["url"].Value.Trim() : crawlerSettings.Url;
        crawlerSettings.Depth = settings.Attributes["depth"] != null ? Convert.ToInt32(settings.Attributes["depth"].Value) : crawlerSettings.Depth;
        crawlerSettings.UrlFilter = (settings.Attributes["urlfilter"] != null && !string.IsNullOrWhiteSpace(settings.Attributes["urlfilter"].Value)) ? settings.Attributes["urlfilter"].Value.Trim() : crawlerSettings.UrlFilter;
        crawlerSettings.OutputDir = (settings.Attributes["outputdir"] != null && !string.IsNullOrWhiteSpace(settings.Attributes["outputdir"].Value)) ? settings.Attributes["outputdir"].Value.Trim() : crawlerSettings.OutputDir;
        crawlerSettings.Encoding = (settings.Attributes["encoding"] != null && !string.IsNullOrWhiteSpace(settings.Attributes["encoding"].Value)) ? settings.Attributes["encoding"].Value.Trim() : crawlerSettings.Encoding;
        crawlerSettings.IncludeMetaData = (settings.Attributes["includemetadata"] != null);
    }

    // Gather HtmlParser and HtmlClient options
    LoaderOptions xhtmlLoaderOptions = new LoaderOptions();
    xhtmlLoaderOptions.ParserOptions.IncludeMetaData = crawlerSettings.IncludeMetaData;
    xhtmlLoaderOptions.DetectEncoding = (crawlerSettings.Encoding == null);
    xhtmlLoaderOptions.DefaultEncoding = crawlerSettings.Encoding != null ? System.Text.Encoding.GetEncoding(crawlerSettings.Encoding) : xhtmlLoaderOptions.DefaultEncoding;

    // Create 'todo' and 'done' lists
    Queue<Link> urlsToCrawl = new Queue<Link>();
    HashSet<string> urlsCrawled = new HashSet<string>();
    Uri baseUri = new Uri(crawlerSettings.Url.ToLower());

    // Add the root url to the todo list
    urlsToCrawl.Enqueue(new Link { Url = crawlerSettings.Url });

    Console.WriteLine("Crawling Url = " + crawlerSettings.Url + " Depth = " + crawlerSettings.Depth + " OutputDir = '" + crawlerSettings.OutputDir + "' UrlFilter = '" + crawlerSettings.UrlFilter + "'");

    // Crawl all urls on the 'todo' list
    while (urlsToCrawl.Count > 0)
    {
        Link currentUrl = urlsToCrawl.Dequeue();
        Console.Write(currentUrl.Url);
        urlsCrawled.Add(currentUrl.Url.ToLower());

        // Crawl the Url using XHtmlKit
        XmlDocument xhtmlDoc;
        try
        {
            xhtmlDoc = XHtmlLoader.LoadWebPageAsync(currentUrl.Url, xhtmlLoaderOptions).Result;
            Console.WriteLine(", [OK]");
        }
        catch (Exception ex)
        {
            Console.WriteLine(", [Error], " + (ex.InnerException != null ? ex.InnerException.Message : ""));
            continue;
        }

        // Get the title from the XHtml document
        var title = xhtmlDoc.SelectSingleNode("//title");
        if (title != null)
        {
            currentUrl.PageTitle = title.InnerText.Trim();
        }

        // Save the XHtml file to disk
        try
        {
            if (!string.IsNullOrWhiteSpace(crawlerSettings.OutputDir))
            {
                Uri currentUri = new Uri(currentUrl.Url.ToLower());
                string fileName = currentUri.PathAndQuery.Trim();

                // Replace invalid characters
                foreach (char c in System.IO.Path.GetInvalidFileNameChars())
                {
                    fileName = fileName.Replace(c.ToString(), ".");
                }
                fileName = fileName.Trim(new char[] { '.' });

                // Set a default file name
                if (string.IsNullOrWhiteSpace(fileName))
                {
                    fileName = "default";
                }

                // Add the xml extension
                fileName = fileName + ".xml";

                // Ensure the output directory exists
                string outputDir = crawlerSettings.OutputDir + "\\" + currentUri.Host;
                if (!System.IO.Directory.Exists(outputDir))
                {
                    System.IO.Directory.CreateDirectory(outputDir);
                }

                // Save the file
                xhtmlDoc.Save(outputDir + "\\" + fileName);
            }
        }
        catch (Exception ex)
        {
            Console.WriteLine("Error saving document: " + ex.Message);
        }

        // Get sub-links from the XHtml
        var subLinkElems = xhtmlDoc.SelectNodes("//a");

        // If we are at the max depth, don't crawl any deeper
        if (currentUrl.Depth >= crawlerSettings.Depth)
        {
            continue;
        }

        // Add sub-links to the 'todo' list
        int numSubLinks = 0;
        foreach (XmlNode subLinkElem in subLinkElems)
        {
            // Don't add empty links
            if (subLinkElem.Attributes["href"] == null || string.IsNullOrWhiteSpace(subLinkElem.Attributes["href"].InnerText))
            {
                continue;
            }

            // Get the sub-link, skipping malformed urls
            string sublink = subLinkElem.Attributes["href"].InnerText;
            if (!Uri.IsWellFormedUriString(sublink.ToLower(), UriKind.RelativeOrAbsolute))
            {
                continue;
            }

            // Resolve relative links against the base url - new Uri(sublink)
            // on its own throws on relative urls
            Uri subUri = new Uri(baseUri, sublink.ToLower());
            sublink = subUri.AbsoluteUri;

            // Don't add links that don't match the UrlFilter
            if (!string.IsNullOrWhiteSpace(crawlerSettings.UrlFilter) && !sublink.Contains(crawlerSettings.UrlFilter))
            {
                continue;
            }

            // Don't add links that we have already crawled...
            if (urlsCrawled.Contains(sublink.ToLower()))
            {
                continue;
            }

            // Add the sub-link
            urlsToCrawl.Enqueue(new Link { Url = sublink, LinkText = subLinkElem.InnerText.Trim(), Depth = (currentUrl.Depth + 1) });
            numSubLinks++;
        }
        currentUrl.SubLinks = numSubLinks;

        // Todo - put the currentUrl metadata somewhere...
    }
}
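The crawler leans on two small data classes, `Settings` and `Link`, that aren't shown above. A plausible sketch, inferring the members from how the crawler uses them; the defaults here are assumptions, not the sample project's actual values:

// Plausible sketches of the crawler's data classes, inferred from usage above.
// Default values are hypothetical; the real sample project may differ.
public class Settings
{
    public string Url = "https://en.wikipedia.org";  // hypothetical default root url
    public int Depth = 0;                            // 0 = crawl the root page only
    public string UrlFilter = "";
    public string OutputDir = "";
    public string Encoding = null;                   // null means auto-detect
    public bool IncludeMetaData = false;
}

public class Link
{
    public string Url;
    public string LinkText;
    public string PageTitle;
    public int Depth = 0;
    public int SubLinks = 0;
}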
public static async Task LoadWebPageAsync(this XmlDocument doc, string url)
{
    await XHtmlLoader.LoadWebPageAsync(doc, url, new LoaderOptions());
}
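Because this is an extension method on `XmlDocument`, it can be called directly on a document instance from any async context (assuming the containing static class is in scope via a using directive):

// Example usage of the extension method
XmlDocument doc = new XmlDocument();
await doc.LoadWebPageAsync("http://wikipedia.org");
Console.WriteLine(doc.SelectSingleNode("//title").InnerText);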