Beispiel #1
0
        /// <summary>
        /// Writes the sample multi-byte HTML document to disk using <paramref name="encoding"/>,
        /// loads it back through <c>XHtmlLoader.LoadWebPageAsync</c> and asserts that the
        /// <c>body</c> element read back is identical to the one of the in-memory document.
        /// </summary>
        /// <param name="fileName">Name of the file created inside the <c>SampleData</c> directory.</param>
        /// <param name="encoding">Encoding used when writing the file to disk.</param>
        /// <param name="defaultEncoding">Value assigned to <c>LoaderOptions.DefaultEncoding</c> for the re-load.</param>
        public void TestReadingEncodedFile(string fileName, System.Text.Encoding encoding, System.Text.Encoding defaultEncoding)
        {
            // Set some options, so that we can know if things are working...
            LoaderOptions loaderOptions = new LoaderOptions();

            loaderOptions.DetectEncoding  = true;
            loaderOptions.DefaultEncoding = defaultEncoding;
            loaderOptions.ParserOptions.IncludeMetaData = true;

            // Load multi-byte html file into memory
            XmlDocument doc = XHtmlLoader.LoadHtml(_sampleMultiByteHtml);

            // Ensure the sample directory exists. Path.Combine picks the platform's separator,
            // and CreateDirectory is a no-op when the directory is already there.
            string projectDir = (new DirectoryInfo(AssemblyDirectory)).Parent.Parent.Parent.FullName;
            string sampleDir  = Path.Combine(projectDir, "SampleData");

            Directory.CreateDirectory(sampleDir);

            // Create the encoded file; disposing the writer also closes the underlying stream
            string fullName = Path.Combine(sampleDir, fileName);

            using (TextWriter sw = new StreamWriter(File.Create(fullName), encoding))
            {
                doc.Save(sw);
            }

            // Re-load into memory
            XmlDocument doc2 = XHtmlLoader.LoadWebPageAsync("file://" + fullName, loaderOptions).Result;

            Console.WriteLine("Reading file: " + fileName);
            Console.WriteLine(doc2.OuterXml);
            Assert.AreEqual(doc.SelectSingleNode("//body").OuterXml, doc2.SelectSingleNode("//body").OuterXml);
        }
Beispiel #2
0
    /// <summary>
    /// Demonstrates the two basic entry points of the loader: parsing an HTML string
    /// and loading a web page, each into an <see cref="XmlDocument"/>.
    /// </summary>
    /// <param name="args">Command-line arguments; not used.</param>
    static void Main(string[] args)
    {
        // Load Html string into an XmlDocument
        XmlDocument doc1 = XHtmlLoader.LoadHtml("<html><head><title>Hello World!</title><body><h1>Hello World</h1><p>This is a test</body>");

        Console.WriteLine("OuterXml is: " + doc1.OuterXml);

        // Load web page into an XmlDocument
        XmlDocument doc2 = XHtmlLoader.LoadWebPageAsync("http://wikipedia.org").Result;

        // SelectSingleNode returns null when nothing matches, so a page without a
        // <title> element must not be dereferenced blindly.
        XmlNode titleNode = doc2.SelectSingleNode("//title");
        string  title     = titleNode != null ? titleNode.InnerText : string.Empty;

        Console.WriteLine("Title is: " + title);
    }
Beispiel #3
0
        /// <summary>
        /// Sample scraper: loads one page of the CodeProject "Latest" article listing and
        /// converts every matched table row into an <see cref="Article"/>.
        /// </summary>
        /// <param name="pageNum">Value appended to the <c>pgnum</c> query-string parameter.</param>
        /// <returns>One <see cref="Article"/> per matched row; fields that are missing in the page are empty strings.</returns>
        public static async Task <Article[]> GetCodeProjectArticlesAsync(int pageNum = 1)
        {
            // Fetch the listing page as an XHtml document using XHtmlKit
            XmlDocument listing = await XHtmlLoader.LoadWebPageAsync("https://www.codeproject.com/script/Articles/Latest.aspx?pgnum=" + pageNum);

            List <Article> found = new List <Article>();

            // Every article is a row of the table carrying the 'article-list' class
            foreach (XmlNode row in listing.SelectNodes("//table[contains(@class,'article-list')]/tr[@valign]"))
            {
                // Any of these lookups may come back null when the row lacks that field
                XmlNode categoryNode = row.SelectSingleNode("./td[1]//a/text()");
                XmlNode titleNode    = row.SelectSingleNode(".//div[@class='title']/a/text()");
                XmlNode dateNode     = row.SelectSingleNode(".//div[contains(@class,'modified')]/text()");
                XmlNode ratingNode   = row.SelectSingleNode(".//div[contains(@class,'rating-stars')]/@title");
                XmlNode descNode     = row.SelectSingleNode(".//div[@class='description']/text()");
                XmlNode authorNode   = row.SelectSingleNode(".//div[contains(@class,'author')]/text()");

                // Collect the tags as a comma separated list
                StringBuilder tagList = new StringBuilder();
                foreach (XmlNode tagText in row.SelectNodes(".//div[@class='t']/a/text()"))
                {
                    if (tagList.Length > 0)
                    {
                        tagList.Append(",");
                    }
                    tagList.Append(tagText.Value);
                }

                // Missing nodes become empty strings in the result
                found.Add(new Article
                {
                    Category    = categoryNode == null ? string.Empty : categoryNode.Value,
                    Title       = titleNode == null ? string.Empty : titleNode.Value,
                    Author      = authorNode == null ? string.Empty : authorNode.Value,
                    Description = descNode == null ? string.Empty : descNode.Value,
                    Rating      = ratingNode == null ? string.Empty : ratingNode.Value,
                    Date        = dateNode == null ? string.Empty : dateNode.Value,
                    Tags        = tagList.ToString()
                });
            }

            return found.ToArray();
        }
Beispiel #4
0
        /// <summary>
        /// Crawls a web site breadth-first starting at the configured root url: every page is
        /// loaded with XHtmlKit, optionally saved to disk as an .xml file, and its anchors are
        /// queued until the configured depth is reached.
        /// </summary>
        /// <param name="args">
        /// Command-line settings written as attribute pairs; they are wrapped into a
        /// <c>&lt;settings ... /&gt;</c> element and parsed. Names read: url, depth, urlfilter,
        /// outputdir, encoding, includemetadata.
        /// </param>
        static void Crawl(string[] args)
        {
            // Get crawler settings from the command-line
            Settings crawlerSettings = new Settings();

            // Get command-line settings - use XHtmlKit parser. Why not.
            if (args.Length > 0)
            {
                string      settingsHtml = "<settings " + string.Join(" ", args) + " />";
                XmlDocument settingsDoc  = new XmlDocument();

                // Invariant lower-casing: a culture-sensitive ToLower() would, under e.g. tr-TR,
                // turn the 'I' of "IncludeMetaData" into a dotless i and break the lookups below.
                // NOTE(review): this lower-cases the attribute values (url, outputdir) as well.
                XHtmlLoader.LoadHtmlFragment(settingsDoc, settingsHtml.ToLowerInvariant());
                XmlElement settings = settingsDoc.DocumentElement;

                crawlerSettings.Url   = ReadSetting(settings, "url", crawlerSettings.Url);
                crawlerSettings.Depth = settings.Attributes["depth"] != null ? Convert.ToInt32(settings.Attributes["depth"].Value) : crawlerSettings.Depth;

                crawlerSettings.UrlFilter       = ReadSetting(settings, "urlfilter", crawlerSettings.UrlFilter);
                crawlerSettings.OutputDir       = ReadSetting(settings, "outputdir", crawlerSettings.OutputDir);
                crawlerSettings.Encoding        = ReadSetting(settings, "encoding", crawlerSettings.Encoding);
                crawlerSettings.IncludeMetaData = (settings.Attributes["includemetadata"] != null);
            }

            // Gather HtmlParser and HtmlClient options
            LoaderOptions xhtmlLoaderOptions = new LoaderOptions();

            xhtmlLoaderOptions.ParserOptions.IncludeMetaData = crawlerSettings.IncludeMetaData;
            xhtmlLoaderOptions.DetectEncoding  = (crawlerSettings.Encoding == null);
            xhtmlLoaderOptions.DefaultEncoding = crawlerSettings.Encoding != null ? System.Text.Encoding.GetEncoding(crawlerSettings.Encoding) : xhtmlLoaderOptions.DefaultEncoding;

            // Create 'todo' and 'done' lists
            Queue <Link>     urlsToCrawl = new Queue <Link>();
            HashSet <string> urlsCrawled = new HashSet <string>();

            // Constructing the Uri up front makes a missing or malformed root url fail
            // before any crawling starts.
            Uri baseUri = new Uri(crawlerSettings.Url.ToLowerInvariant());

            // Add the root url to the todo list
            urlsToCrawl.Enqueue(new Link {
                Url = crawlerSettings.Url
            });

            Console.WriteLine("Crawling Url = " + crawlerSettings.Url + " Depth = " + crawlerSettings.Depth + " OutputDir = '" + crawlerSettings.OutputDir + "' UrlFilter = '" + crawlerSettings.UrlFilter + "'");

            // Crawl all urls on the 'todo' list
            while (urlsToCrawl.Count > 0)
            {
                Link currentUrl = urlsToCrawl.Dequeue();

                // A url can be queued several times before its first visit (links are only
                // checked against visited pages when queued). HashSet.Add returns false for
                // a repeat, so each url is loaded once.
                if (!urlsCrawled.Add(currentUrl.Url.ToLowerInvariant()))
                {
                    continue;
                }

                Console.Write(currentUrl.Url);

                // Crawl the Url using XHtmlKit
                XmlDocument xhtmlDoc;
                try
                {
                    xhtmlDoc = XHtmlLoader.LoadWebPageAsync(currentUrl.Url, xhtmlLoaderOptions).Result;
                    Console.WriteLine(", [OK]");
                }
                catch (Exception ex)
                {
                    // .Result wraps failures in an AggregateException; fall back to the
                    // exception itself when there is no inner one so the reason is never blank.
                    Console.WriteLine(", [Error], " + (ex.InnerException ?? ex).Message);
                    continue;
                }

                // Get title from the XHtml document
                var title = xhtmlDoc.SelectSingleNode("//title");
                if (title != null)
                {
                    currentUrl.PageTitle = title.InnerText.Trim();
                }

                // Save the XHtml file to disk
                if (!string.IsNullOrWhiteSpace(crawlerSettings.OutputDir))
                {
                    SavePage(xhtmlDoc, currentUrl.Url, crawlerSettings.OutputDir);
                }

                // If we are at the Max Depth, we won't crawl deeper
                if (currentUrl.Depth >= crawlerSettings.Depth)
                {
                    continue;
                }

                // Relative links are resolved against the page they were found on;
                // pageUri stays null when the page url is not an absolute uri.
                Uri pageUri;
                Uri.TryCreate(currentUrl.Url, UriKind.Absolute, out pageUri);

                // Add sub-links to the 'todo' list
                int numSubLinks = 0;
                foreach (XmlNode subLinkElem in xhtmlDoc.SelectNodes("//a"))
                {
                    // Don't add empty links
                    XmlAttribute href = subLinkElem.Attributes["href"];
                    if (href == null || string.IsNullOrWhiteSpace(href.InnerText))
                    {
                        continue;
                    }

                    // Get the sub-link
                    string sublink = href.InnerText;
                    if (!Uri.IsWellFormedUriString(sublink.ToLowerInvariant(), UriKind.RelativeOrAbsolute))
                    {
                        continue;
                    }

                    // Turn the href into an absolute url; skip it when that is not possible
                    string absoluteLink = ResolveLink(pageUri, sublink);
                    if (absoluteLink == null)
                    {
                        continue;
                    }

                    // Don't add links that don't match the UrlFilter (case-insensitive, like the 'done' list)
                    if (!string.IsNullOrWhiteSpace(crawlerSettings.UrlFilter) && absoluteLink.IndexOf(crawlerSettings.UrlFilter, StringComparison.OrdinalIgnoreCase) < 0)
                    {
                        continue;
                    }

                    // Don't add links that we have already crawled...
                    if (urlsCrawled.Contains(absoluteLink.ToLowerInvariant()))
                    {
                        continue;
                    }

                    // Add the sub-link
                    urlsToCrawl.Enqueue(new Link {
                        Url = absoluteLink, LinkText = subLinkElem.InnerText.Trim(), Depth = (currentUrl.Depth + 1)
                    });
                    numSubLinks++;
                }

                currentUrl.SubLinks = numSubLinks;

                // Todo - put the currentUrl metadata somewhere...
            }
        }

        /// <summary>
        /// Returns the trimmed value of attribute <paramref name="name"/>, or
        /// <paramref name="fallback"/> when the attribute is absent or blank.
        /// </summary>
        private static string ReadSetting(XmlElement settings, string name, string fallback)
        {
            XmlAttribute attribute = settings.Attributes[name];

            return (attribute != null && !string.IsNullOrWhiteSpace(attribute.Value)) ? attribute.Value.Trim() : fallback;
        }

        /// <summary>
        /// Saves <paramref name="xhtmlDoc"/> below <paramref name="outputRoot"/> in a directory named
        /// after the url's host; failures are reported on the console and do not stop the crawl.
        /// </summary>
        private static void SavePage(XmlDocument xhtmlDoc, string url, string outputRoot)
        {
            try
            {
                Uri    currentUri = new Uri(url.ToLowerInvariant());
                string fileName   = currentUri.PathAndQuery.Trim();

                // Replace invalid characters
                foreach (char c in System.IO.Path.GetInvalidFileNameChars())
                {
                    fileName = fileName.Replace(c.ToString(), ".");
                }
                fileName = fileName.Trim(new char[] { '.' });

                // Set default file name
                if (string.IsNullOrWhiteSpace(fileName))
                {
                    fileName = "default";
                }

                // Add xml extension
                fileName = fileName + ".xml";

                // Ensure output directory exists (CreateDirectory does nothing when it already does)
                string outputDir = System.IO.Path.Combine(outputRoot, currentUri.Host);
                System.IO.Directory.CreateDirectory(outputDir);

                // Save file
                xhtmlDoc.Save(System.IO.Path.Combine(outputDir, fileName));
            }
            catch (Exception ex)
            {
                Console.WriteLine("Error saving document: " + ex.Message);
            }
        }

        /// <summary>
        /// Resolves <paramref name="href"/> against <paramref name="pageUri"/> (or parses it as an
        /// absolute uri when no page uri is available); returns null when it cannot be resolved.
        /// </summary>
        private static string ResolveLink(Uri pageUri, string href)
        {
            Uri  resolved;
            bool ok = pageUri != null
                ? Uri.TryCreate(pageUri, href, out resolved)
                : Uri.TryCreate(href, UriKind.Absolute, out resolved);

            return ok ? resolved.AbsoluteUri : null;
        }
Beispiel #5
0
 /// <summary>
 /// Extension overload that loads the page at <paramref name="url"/> into
 /// <paramref name="doc"/> using a freshly constructed <c>LoaderOptions</c>.
 /// </summary>
 /// <param name="doc">Document handed to <c>XHtmlLoader.LoadWebPageAsync</c>.</param>
 /// <param name="url">Address of the page to load.</param>
 /// <returns>A task that completes when the underlying load has finished.</returns>
 public static async Task LoadWebPageAsync(this XmlDocument doc, string url)
 {
     // No caller-supplied options on this overload: use a default-constructed instance
     LoaderOptions defaultOptions = new LoaderOptions();

     await XHtmlLoader.LoadWebPageAsync(doc, url, defaultOptions);
 }