/// <summary>
/// Crawl the page at the specified URI for content
/// </summary>
/// <param name="uri">URI of the page to scrape</param>
/// <param name="requestDelayMs">Time in milliseconds to wait after the request to avoid overloading the website</param>
/// <returns>Information found while crawling the page</returns>
private PageInfo CrawlPage(Uri uri, int requestDelayMs)
{
    PageInfo pageInfo = new PageInfo();
    RobotsTxt robotsTxt = TryGetRobotsTxt(uri);

    // Test whether a crawler is allowed to crawl this page
    if (robotsTxt.IsAllowed(uri))
    {
        Console.WriteLine("Parsing page: " + uri);

        // Crawl the page for information
        PageParser pageParser = new PageParser();
        pageInfo = pageParser.Parse(uri);

        // Do not overload the website
        Console.WriteLine("Sleeping thread for " + requestDelayMs + "ms to avoid overloading the website.");
        Thread.Sleep(requestDelayMs);

        // Remove any links that violate the robots.txt file of the current domain
        List<Uri> validLinks = new List<Uri>();
        foreach (Uri link in pageInfo.Links)
        {
            // External domain: this host's robots.txt does not apply, keep the link
            if (link.Host != uri.Host)
            {
                validLinks.Add(link);
                continue;
            }

            // Same domain: keep the link only if the crawler is allowed to access the page
            if (robotsTxt.IsAllowed(link))
            {
                validLinks.Add(link);
            }
        }
        // Replace the discovered links with the filtered list; without this
        // assignment the robots.txt filtering above would have no effect
        pageInfo.Links = validLinks;
    }
    else
    {
        Console.WriteLine("Skipping page: " + uri);
    }

    // Ensure no malicious input is sent to the database
    return SanitizePageInfo(pageInfo);
}
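
// CrawlPage depends on a TryGetRobotsTxt helper that is not shown here. Below is a
// minimal sketch of what such a helper could look like: it downloads robots.txt once
// per host and caches the parsed result, so a page and its same-domain links share a
// single fetch. The RobotsTxt(string) constructor, the "allow everything on failure"
// fallback, and the use of WebClient are assumptions, not part of the original code.
// Requires using System.Collections.Generic; and using System.Net;.
private readonly Dictionary<string, RobotsTxt> robotsTxtCache = new Dictionary<string, RobotsTxt>();

private RobotsTxt TryGetRobotsTxt(Uri uri)
{
    // Reuse a previously downloaded robots.txt for this host
    if (robotsTxtCache.TryGetValue(uri.Host, out RobotsTxt cached))
    {
        return cached;
    }

    RobotsTxt robotsTxt;
    try
    {
        // robots.txt always lives at the root of the host
        Uri robotsUri = new Uri(uri.GetLeftPart(UriPartial.Authority) + "/robots.txt");
        using (WebClient client = new WebClient())
        {
            robotsTxt = new RobotsTxt(client.DownloadString(robotsUri));
        }
    }
    catch (WebException)
    {
        // No robots.txt, or it could not be fetched: treat everything as allowed
        robotsTxt = new RobotsTxt(string.Empty);
    }

    robotsTxtCache[uri.Host] = robotsTxt;
    return robotsTxt;
}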
/// <summary>
/// Start crawling the web
/// </summary>
/// <param name="minCrawlDelay">Minimum time in seconds between two requests to the same host</param>
/// <param name="maxCrawlDelay">Maximum time in seconds between two requests to the same host</param>
/// <param name="database">Database to save indexed pages into</param>
public void Start(int minCrawlDelay, int maxCrawlDelay, Database database)
{
    //#DEBUG: simply exit once we have processed a thousand batches of URLs
    int i = 1000;
    while (i-- > 0)
    {
        // Look for any discovered URLs and crawl them, 100 at a time
        string[] urls = database.GetUncrawledUrls(100);
        foreach (string url in urls)
        {
            // Random delay, converted from seconds to milliseconds,
            // to avoid overloading websites
            int crawlDelayMs = RandomNumberGenerator.Next(minCrawlDelay * 1000, maxCrawlDelay * 1000);

            // Crawl the page for information and links
            PageInfo pageInfo = CrawlPage(new Uri(url), crawlDelayMs);
            if (pageInfo.Uri != null && pageInfo.Links != null)
            {
                // Save the crawled page
                database.AddOrUpdateCrawledPage(pageInfo);

                // Convert the discovered links from URI to string, keeping only
                // well-formed absolute URIs that robots.txt allows us to crawl
                List<string> links = new List<string>();
                foreach (Uri link in pageInfo.Links)
                {
                    RobotsTxt robotsTxt = TryGetRobotsTxt(link);
                    if (Uri.IsWellFormedUriString(link.ToString(), UriKind.Absolute) && robotsTxt.IsAllowed(link))
                    {
                        links.Add(link.ToString());
                    }
                    else
                    {
                        Console.WriteLine("Incorrectly formed URI string or denied by robots.txt: " + link);
                    }
                }

                // Save the newly discovered URLs in the "pending" database
                database.TryAddPendingUrls(links.ToArray());
            }
        }
    }
}
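
// An example of how these methods could be driven, assuming they live in a class
// named Crawler and that Database can be constructed as shown; both of those names
// and the seed URL are illustrative assumptions, not part of the original code.
// A few seed URLs must be pending before Start has anything to crawl.
public static void Main()
{
    Database database = new Database(); // hypothetical: connects to the URL/page store
    database.TryAddPendingUrls(new[] { "https://example.com/" });

    Crawler crawler = new Crawler();
    // Wait between 2 and 10 seconds between requests to the same host
    crawler.Start(minCrawlDelay: 2, maxCrawlDelay: 10, database: database);
}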