Example #1
0
 public void Constructor_ValidUri_CreatesInstance()
 {
     // Act: default-construct the decision object.
     CrawlDecision unitUnderTest = new CrawlDecision();

     // Assert: a fresh decision disallows nothing yet — Allow is false,
     // Reason is empty, and neither stop flag is raised.
     // Assert.IsFalse is preferred over Assert.AreEqual(false, ...):
     // it avoids boxing and produces a clearer failure message.
     Assert.IsFalse(unitUnderTest.Allow);
     Assert.AreEqual("", unitUnderTest.Reason);
     Assert.IsFalse(unitUnderTest.ShouldHardStopCrawl);
     Assert.IsFalse(unitUnderTest.ShouldStopCrawl);
 }
Example #2
0
        //public static Dictionary<string, int> garmentTypes;

        /// <summary>
        /// Entry point: configures logging, builds a crawler restricted to the
        /// supported store sites (H&amp;M, UNIQLO, Zara), opens the database
        /// connection, and runs a synchronous crawl from the Zara start page.
        /// </summary>
        static void Main(string[] args)
        {
            Console.WriteLine("howdy!");

            log4net.Config.XmlConfigurator.Configure();

            // Crawl starting point: the Zara SS16 women's outerwear listing.
            // (Previously-tested start URLs for H&M, UNIQLO and individual
            // product pages were removed as dead commented-out code.)
            Uri uriToCrawl = new Uri("http://www.zara.com/us/en/collection-ss16/woman/outerwear/view-all-c719012.html");

            IWebCrawler crawler = GetDefaultWebCrawler();

            // Subscribe to the asynchronous crawl events (synchronous versions
            // of each also exist). This is where per-page processing happens.
            crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
            crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
            crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
            crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

            // Whitelist-style crawl filter: for the three known store domains
            // only the listed URL prefixes are crawled; any other domain is
            // allowed through unchanged.
            crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
            {
                // Read the URL once instead of re-evaluating AbsoluteUri per check.
                string url = pageToCrawl.Uri.AbsoluteUri;

                // H&M rules: product pages and the men/ladies category roots.
                if (url.Contains("hm.com"))
                {
                    if (url.Contains("http://www.hm.com/us/product/") ||
                        url.Contains("http://www.hm.com/us/products/men") ||
                        url.Contains("http://www.hm.com/us/products/ladies"))
                        return new CrawlDecision { Allow = true, Reason = "OK!" };
                    return new CrawlDecision { Allow = false, Reason = "Not good!" };
                }

                // UNIQLO rules: women/men category roots and product pages.
                if (url.Contains("uniqlo"))
                {
                    if (url.Contains("http://www.uniqlo.com/us/women.html") ||
                        url.Contains("http://www.uniqlo.com/us/women/") ||
                        url.Contains("http://www.uniqlo.com/us/product/") ||
                        url.Contains("http://www.uniqlo.com/us/men.html") ||
                        url.Contains("http://www.uniqlo.com/us/men/"))
                        return new CrawlDecision { Allow = true, Reason = "OK!" };
                    return new CrawlDecision { Allow = false, Reason = "Not good!" };
                }

                // ZARA rules: SS16 collection and sale sections.
                if (url.Contains("zara"))
                {
                    if (url.Contains("http://www.zara.com/us/en/collection-ss16/") ||
                        url.Contains("http://www.zara.com/us/en/sale/woman") ||
                        url.Contains("http://www.zara.com/us/en/sale/man/") ||
                        url.Contains("http://www.zara.com/us/en/collection-ss16/man/"))
                        return new CrawlDecision { Allow = true, Reason = "OK!" };
                    return new CrawlDecision { Allow = false, Reason = "Not good!" };
                }

                // Unknown domain: allow by default (original behavior).
                return new CrawlDecision { Allow = true, Reason = "OK" };
            });

            // SECURITY NOTE(review): credentials are hard-coded in source.
            // Move this connection string (user id / password) into secure
            // configuration (e.g. environment variable or secret store).
            myConnection = new SqlConnection("user id=marakas;" +
                                       "password=M@rakas69!;server=yzf0vdv9dr.database.windows.net;" +
                                       "Trusted_Connection=False;Encrypt=True;" +
                                       "database=superfashiondb_db; " +
                                       "connection timeout=30");
            try
            {
                myConnection.Open();
                Console.WriteLine("DB OK!");
            }
            catch (Exception e)
            {
                // Best-effort: report the DB failure but still attempt the crawl
                // (original behavior — the connection error is not fatal here).
                Console.WriteLine(e.ToString());
            }

            // Start the crawl. This is a synchronous call; it returns only when
            // the crawl completes.
            CrawlResult result = crawler.Crawl(uriToCrawl);

            // Keep the console window open in debug mode.
            Console.WriteLine("Press any key to exit.");
            Console.ReadKey();
        }
Example #3
0
 protected virtual void SignalCrawlStopIfNeeded(CrawlDecision decision)
 {
     // Mirror any stop request carried on the decision onto the crawl
     // context. A hard stop takes precedence over a regular stop; at most
     // one of the two flags is raised per call.
     if (decision.ShouldHardStopCrawl)
     {
         _logger.InfoFormat("Decision marked crawl [Hard Stop] for site [{0}], [{1}]", _crawlContext.RootUri, decision.Reason);
         _crawlContext.IsCrawlHardStopRequested = true;
         return;
     }

     if (!decision.ShouldStopCrawl)
         return;

     _logger.InfoFormat("Decision marked crawl [Stop] for site [{0}], [{1}]", _crawlContext.RootUri, decision.Reason);
     _crawlContext.IsCrawlStopRequested = true;
 }
Example #4
0
        /// <summary>
        /// Starts a crawl of tingchina.com on a thread-pool thread so the
        /// caller is not blocked. Only the root page and URLs under
        /// http://www.tingchina.com/ are crawled; the result (or error) is
        /// written to the console and, on error, to <c>NextUrl</c>.
        /// </summary>
        private void Crawl()
        {
            ThreadPool.QueueUserWorkItem((state) =>
            {
                PoliteWebCrawler crawler = new PoliteWebCrawler();
                crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
                crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
                crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
                crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

                crawler.ShouldCrawlPage((crawledPage, crawledContext) =>
                {
                    // Restrict the crawl to the root page and same-site URLs.
                    CrawlDecision decision = new CrawlDecision();
                    var uri = crawledPage.Uri.ToString();
                    // Ordinal comparison: URL prefixes are not linguistic text (CA1310).
                    if (crawledPage.IsRoot || uri.StartsWith("http://www.tingchina.com/", StringComparison.Ordinal))
                    {
                        decision.Allow = true;
                    }
                    else
                    {
                        decision.Allow = false;
                        decision.Reason = "Just erge pages!";
                    }
                    return decision;
                });

                CrawlResult result = crawler.Crawl(new Uri("http://www.tingchina.com/"), cancellationTokenSource);

                if (result.ErrorOccurred)
                {
                    NextUrl = result.ErrorException.Message;
                    Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
                }
                else
                {
                    Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
                }
                // BUG FIX: removed Console.ReadLine() here — it pinned a
                // thread-pool thread indefinitely after the crawl finished and
                // competed with the rest of the app for console input.
            }, null);
        }
Example #5
0
        /// <summary>
        /// Delegate passed to WebCrawler which calls functionality to look at
        /// blacklisted urls when deciding whether a page should be crawled or not.
        /// If the <paramref name="pageToCrawl"/> Domain is blacklisted, then the
        /// CrawlDecision.Allow is set to false.  This delegate is called after the default
        /// CrawlDecisionMaker.ShouldCrawlPage() method is called.
        /// </summary>
        /// <param name="pageToCrawl">Page whose base domain is checked against the blacklist.</param>
        /// <param name="crawlContext">Crawl context (unused here; required by the delegate signature).</param>
        /// <returns>A CrawlDecision allowing the crawl unless the domain is blacklisted.</returns>
        /// <remarks>The default CrawlDecisionMaker.ShouldCrawlPage() method is called first, but then
        /// this method will be called.  No reason to override the default CrawlDecisionMaker, see
        /// about line ~721 in WebCrawler.cs</remarks>
        public CrawlDecision ShouldCrawlPage(PageToCrawl pageToCrawl, CrawlContext crawlContext)
        {
            var domain = pageToCrawl.Uri.GetBaseDomain();

            // Deny blacklisted domains with an explanatory reason; return
            // directly instead of threading a null-initialized local through
            // both branches.
            if (_repo.IsBlackListed(domain))
            {
                return new CrawlDecision
                {
                    Allow = false,
                    Reason = string.Format("The domain {0} is blacklisted", domain)
                };
            }

            return new CrawlDecision { Allow = true };
        }