/// <summary>
/// Verifies the state of a <c>CrawlDecision</c> created with the parameterless
/// constructor: crawling is not allowed, the reason is an empty string, and
/// neither stop flag is set.
/// </summary>
public void Constructor_ValidUri_CreatesInstance()
{
    var decision = new CrawlDecision();

    Assert.IsFalse(decision.Allow);
    Assert.AreEqual(string.Empty, decision.Reason);
    Assert.IsFalse(decision.ShouldHardStopCrawl);
    Assert.IsFalse(decision.ShouldStopCrawl);
}
//public static Dictionary<string, int> garmentTypes;

/// <summary>
/// Console entry point. Configures log4net, builds the default web crawler,
/// wires up the crawl event handlers and the per-site URL filter, opens the
/// database connection and then runs a synchronous crawl from a fixed start URI.
/// </summary>
/// <param name="args">Command-line arguments; not read by this method.</param>
static void Main(string[] args)
{
    Console.WriteLine("howdy!");

    log4net.Config.XmlConfigurator.Configure();

    // Crawl start point. Other start points that have been used:
    //   http://www.uniqlo.com/us/women.html      (UNIQLO - women)
    //   http://www.uniqlo.com/us/men.html        (UNIQLO - men)
    //   http://www.hm.com/us/products/ladies     (H&M - women)
    //   http://www.hm.com/us/products/men        (H&M - men)
    Uri uriToCrawl = new Uri("http://www.zara.com/us/en/collection-ss16/woman/outerwear/view-all-c719012.html");

    IWebCrawler crawler = GetDefaultWebCrawler();

    // Subscribe to the asynchronous events; there are also synchronous versions of each.
    // This is where data about specific events of the crawl is processed.
    crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
    crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
    crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
    crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

    // Per-site URL filter. Pages on a known shop are rejected unless their URL
    // contains one of that shop's allowed fragments; pages on any other site are allowed.
    crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
    {
        string url = pageToCrawl.Uri.AbsoluteUri;
        CrawlDecision decision = new CrawlDecision { Allow = true, Reason = "OK" };

        // H&M rules
        if (url.Contains("hm.com"))
        {
            decision = new CrawlDecision { Allow = false, Reason = "Not good!" };
            if (url.Contains("http://www.hm.com/us/product/")
                || url.Contains("http://www.hm.com/us/products/men")
                || url.Contains("http://www.hm.com/us/products/ladies"))
            {
                return new CrawlDecision { Allow = true, Reason = "OK!" };
            }
        }

        // UNIQLO rules
        if (url.Contains("uniqlo"))
        {
            decision = new CrawlDecision { Allow = false, Reason = "Not good!" };
            if (url.Contains("http://www.uniqlo.com/us/women.html")
                || url.Contains("http://www.uniqlo.com/us/women/")
                || url.Contains("http://www.uniqlo.com/us/product/")
                || url.Contains("http://www.uniqlo.com/us/men.html")
                || url.Contains("http://www.uniqlo.com/us/men/"))
            {
                return new CrawlDecision { Allow = true, Reason = "OK!" };
            }
        }

        // ZARA rules
        if (url.Contains("zara"))
        {
            decision = new CrawlDecision { Allow = false, Reason = "Not good!" };
            // The "collection-ss16/" fragment also matches everything under
            // "collection-ss16/man/", so no separate check is needed for it.
            if (url.Contains("http://www.zara.com/us/en/collection-ss16/")
                || url.Contains("http://www.zara.com/us/en/sale/woman")
                || url.Contains("http://www.zara.com/us/en/sale/man/"))
            {
                return new CrawlDecision { Allow = true, Reason = "OK!" };
            }
        }

        return decision;
    });

    // The connection string (including credentials) is supplied through the
    // SUPERFASHION_DB_CONNECTION_STRING environment variable instead of being
    // embedded in source. Credentials that were previously committed here should
    // be treated as compromised and rotated.
    // When the variable is not set the connection string is empty, so Open()
    // fails and the error is printed by the catch block, as with any other
    // connection failure.
    string connectionString = Environment.GetEnvironmentVariable("SUPERFASHION_DB_CONNECTION_STRING");
    myConnection = new SqlConnection(connectionString ?? string.Empty);
    try
    {
        myConnection.Open();
        Console.WriteLine("DB OK!");
    }
    catch (Exception e)
    {
        // Best effort: report the failure and carry on with the crawl.
        Console.WriteLine(e.ToString());
    }

    // Start the crawl. This is a synchronous call.
    CrawlResult result = crawler.Crawl(uriToCrawl);

    // Report the outcome instead of silently discarding the result.
    if (result.ErrorOccurred)
    {
        Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
    }
    else
    {
        Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
    }

    // Keep the console window open in debug mode.
    Console.WriteLine("Press any key to exit.");
    Console.ReadKey();
}
/// <summary>
/// Copies a stop request carried by <paramref name="decision"/> onto the crawl
/// context. A hard stop takes precedence: when it is set, the regular stop flag
/// is not examined.
/// </summary>
/// <param name="decision">The decision whose stop flags and reason are read.</param>
protected virtual void SignalCrawlStopIfNeeded(CrawlDecision decision)
{
    if (decision.ShouldHardStopCrawl)
    {
        _logger.InfoFormat("Decision marked crawl [Hard Stop] for site [{0}], [{1}]", _crawlContext.RootUri, decision.Reason);
        _crawlContext.IsCrawlHardStopRequested = decision.ShouldHardStopCrawl;
        return;
    }

    if (!decision.ShouldStopCrawl)
    {
        // Neither flag is set: nothing to signal.
        return;
    }

    _logger.InfoFormat("Decision marked crawl [Stop] for site [{0}], [{1}]", _crawlContext.RootUri, decision.Reason);
    _crawlContext.IsCrawlStopRequested = decision.ShouldStopCrawl;
}
/// <summary>
/// Queues a thread-pool work item that crawls http://www.tingchina.com/ with a
/// <c>PoliteWebCrawler</c>, restricted to the root page and pages whose URL
/// starts with that site address. When the crawl finishes, the outcome is
/// written to the console; on error the error message is also stored in
/// <c>NextUrl</c>.
/// </summary>
private void Crawl()
{
    ThreadPool.QueueUserWorkItem((state) =>
    {
        PoliteWebCrawler crawler = new PoliteWebCrawler();

        crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
        crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
        crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
        crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

        crawler.ShouldCrawlPage((crawledPage, crawledContext) =>
        {
            CrawlDecision decision = new CrawlDecision();
            var uri = crawledPage.Uri.ToString();

            // URL prefixes are not linguistic text: compare ordinally so the
            // result does not depend on the current thread culture.
            if (crawledPage.IsRoot || uri.StartsWith("http://www.tingchina.com/", StringComparison.Ordinal))
            {
                decision.Allow = true;
            }
            else
            {
                decision.Allow = false;
                decision.Reason = "Just erge pages!";
            }

            return decision;
        });

        CrawlResult result = crawler.Crawl(new Uri("http://www.tingchina.com/"), cancellationTokenSource);

        if (result.ErrorOccurred)
        {
            NextUrl = result.ErrorException.Message;
            Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
        }
        else
        {
            Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
        }

        // Blocks this work item until a line is read from standard input.
        Console.ReadLine();
    }, null);
}
/// <summary>
/// Delegate passed to the WebCrawler that consults the blacklist when deciding
/// whether a page should be crawled. If the base domain of
/// <paramref name="pageToCrawl"/> is blacklisted, the returned decision has
/// Allow set to false and a Reason naming the domain; otherwise Allow is true.
/// </summary>
/// <param name="pageToCrawl">The page whose Uri supplies the domain to check.</param>
/// <param name="crawlContext">The current crawl context; not used by this method.</param>
/// <returns>A new CrawlDecision describing whether the page may be crawled.</returns>
/// <remarks>Per the original author's notes, the default
/// CrawlDecisionMaker.ShouldCrawlPage() runs first and this delegate is called
/// afterwards, so there is no need to override the default CrawlDecisionMaker
/// (see about line ~721 in WebCrawler.cs).</remarks>
public CrawlDecision ShouldCrawlPage(PageToCrawl pageToCrawl, CrawlContext crawlContext)
{
    var baseDomain = pageToCrawl.Uri.GetBaseDomain();

    if (!_repo.IsBlackListed(baseDomain))
    {
        return new CrawlDecision { Allow = true };
    }

    return new CrawlDecision
    {
        Allow = false,
        Reason = string.Format("The domain {0} is blacklisted", baseDomain)
    };
}