// Setting up bot config
public void setup_abot()
{
    CrawlConfiguration crawlConfig = new CrawlConfiguration();
    crawlConfig.CrawlTimeoutSeconds = 150;
    crawlConfig.MaxConcurrentThreads = 25;
    crawlConfig.IsExternalPageCrawlingEnabled = false;
    crawlConfig.MaxCrawlDepth = 1;
    crawlConfig.MaxPagesToCrawl = 1000;
    crawlConfig.UserAgentString = "abot v1.0 http://code.google.com/p/abot";

    // Pass null for every optional dependency so PoliteWebCrawler falls back to its defaults.
    crawler = new PoliteWebCrawler(crawlConfig, null, null, null, null, null, null, null, null);
    crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
    crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;

    crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
    {
        // Allow only urls that contain five consecutive digits or a "text=" query parameter.
        Regex rx = new Regex(@"\d{5}");
        if (!rx.IsMatch(pageToCrawl.Uri.ToString()) && !pageToCrawl.Uri.ToString().Contains("text="))
            return new CrawlDecision { Allow = false, Reason = "Want only comlinks" };

        return new CrawlDecision { Allow = true, Reason = "OK Link" };
    });
}
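// The two handlers wired up above are not shown in the snippet. Below is a minimal
// sketch of what they could look like, assuming the Abot 1.x event args
// (PageCrawlStartingArgs / PageCrawlCompletedArgs) and a using for System.Net;
// adjust the logging to your own needs.
void crawler_ProcessPageCrawlStarting(object sender, PageCrawlStartingArgs e)
{
    PageToCrawl pageToCrawl = e.PageToCrawl;
    Console.WriteLine("About to crawl link {0} which was found on page {1}",
        pageToCrawl.Uri.AbsoluteUri, pageToCrawl.ParentUri.AbsoluteUri);
}

void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
{
    CrawledPage crawledPage = e.CrawledPage;
    if (crawledPage.WebException != null || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
        Console.WriteLine("Crawl of page failed {0}", crawledPage.Uri.AbsoluteUri);
    else
        Console.WriteLine("Crawl of page succeeded {0}", crawledPage.Uri.AbsoluteUri);
}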
private static IWebCrawler GetCustomBehaviorUsingLambdaWebCrawler()
{
    IWebCrawler crawler = new PoliteWebCrawler();

    // Register a lambda expression that will make Abot not crawl any url that has the word "ghost" in it.
    // For example http://a.com/ghost would not get crawled if the link were found during the crawl.
    // If you set the log4net log level to "DEBUG" you will see a log message when any page is not allowed to be crawled.
    // NOTE: This lambda is run after the regular ICrawlDecisionMaker.ShouldCrawlPage method is run.
    crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
    {
        if (pageToCrawl.Uri.AbsoluteUri.Contains("ghost"))
            return new CrawlDecision { Allow = false, Reason = "Scared of ghosts" };

        return new CrawlDecision { Allow = true };
    });

    // Register a lambda expression that will tell Abot not to download the page content for any page after the 5th.
    // Abot will still make the http request but will not read the raw content from the stream.
    // NOTE: This lambda is run after the regular ICrawlDecisionMaker.ShouldDownloadPageContent method is run.
    crawler.ShouldDownloadPageContent((crawledPage, crawlContext) =>
    {
        if (crawlContext.CrawledCount >= 5)
            return new CrawlDecision { Allow = false, Reason = "We already downloaded the raw page content for 5 pages" };

        return new CrawlDecision { Allow = true };
    });

    // Register a lambda expression that will tell Abot not to crawl links on any page that is not internal to the root uri.
    // NOTE: This lambda is run after the regular ICrawlDecisionMaker.ShouldCrawlPageLinks method is run.
    crawler.ShouldCrawlPageLinks((crawledPage, crawlContext) =>
    {
        if (!crawledPage.IsInternal)
            return new CrawlDecision { Allow = false, Reason = "We dont crawl links of external pages" };

        return new CrawlDecision { Allow = true };
    });

    return crawler;
}
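// Hypothetical usage of the factory above: build the crawler, point it at a seed uri,
// and inspect the CrawlResult. The uri here is a placeholder, not from the original code.
IWebCrawler customCrawler = GetCustomBehaviorUsingLambdaWebCrawler();
CrawlResult result = customCrawler.Crawl(new Uri("http://example.com/"));

if (result.ErrorOccurred)
    Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
else
    Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);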
private void Crawl()
{
    ThreadPool.QueueUserWorkItem((state) =>
    {
        PoliteWebCrawler crawler = new PoliteWebCrawler();
        crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
        crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
        crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
        crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

        crawler.ShouldCrawlPage((crawledPage, crawlContext) =>
        {
            // Allow the root page and any page under the site; reject everything else.
            CrawlDecision decision = new CrawlDecision();
            var uri = crawledPage.Uri.ToString();
            if (crawledPage.IsRoot || uri.StartsWith("http://www.tingchina.com/"))
            {
                decision.Allow = true;
            }
            else
            {
                decision.Allow = false;
                decision.Reason = "Just erge pages!";
            }
            return decision;
        });

        // Pass the CancellationTokenSource so the crawl can be cancelled from outside this thread.
        CrawlResult result = crawler.Crawl(new Uri("http://www.tingchina.com/"), cancellationTokenSource);

        if (result.ErrorOccurred)
        {
            NextUrl = result.ErrorException.Message;
            Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
        }
        else
        {
            Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
        }

        Console.ReadLine();
    }, null);
}
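// The method above references a cancellationTokenSource field and two "disallowed"
// handlers that are not shown. A minimal sketch, assuming the Abot 1.x event args
// (PageCrawlDisallowedArgs / PageLinksCrawlDisallowedArgs) and a using for System.Threading:
private readonly CancellationTokenSource cancellationTokenSource = new CancellationTokenSource();

void crawler_PageCrawlDisallowed(object sender, PageCrawlDisallowedArgs e)
{
    PageToCrawl pageToCrawl = e.PageToCrawl;
    Console.WriteLine("Did not crawl page {0} due to {1}", pageToCrawl.Uri.AbsoluteUri, e.DisallowedReason);
}

void crawler_PageLinksCrawlDisallowed(object sender, PageLinksCrawlDisallowedArgs e)
{
    CrawledPage crawledPage = e.CrawledPage;
    Console.WriteLine("Did not crawl the links on page {0} due to {1}", crawledPage.Uri.AbsoluteUri, e.DisallowedReason);
}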