Example #1
        public MangaTests()
        {
            var memoryCache = new MemoryCache(new MemoryCacheOptions());

            _source  = new UnionMangasSource();
            _crawler = new UnionMangasCrawler(_source, memoryCache);
        }
Example #2
 public Brawa(IClock clock, IWebCrawler webCrawler)
     : base(
         clock,
         webCrawler,
         new Uri("https://www.brawa.de"))
 {
 }
Example #3
 public Hornby(IClock clock, IWebCrawler webCrawler)
     : base(
         clock,
         webCrawler,
         new Uri("https://www.hornby.com"))
 {
 }
Example #4
        public void Dispose()
        {
            var disposable = _provider as IDisposable;
            if (disposable != null)
                disposable.Dispose();
            _provider = null;

            disposable = _repo as IDisposable;
            if (disposable != null)
                disposable.Dispose();
            _repo = null;

            if (_crawler != null)
            {
                if (IsAsync)
                {
                    _crawler.PageCrawlStartingAsync -= crawler_ProcessPageCrawlStarting;
                    _crawler.PageCrawlCompletedAsync -= crawler_ProcessPageCrawlCompleted;
                    _crawler.PageCrawlDisallowedAsync -= crawler_PageCrawlDisallowed;
                    _crawler.PageLinksCrawlDisallowedAsync -= crawler_PageLinksCrawlDisallowed;
                }
                else
                {
                    _crawler.PageCrawlStarting -= crawler_ProcessPageCrawlStarting;
                    _crawler.PageCrawlCompleted -= crawler_ProcessPageCrawlCompleted;
                    _crawler.PageCrawlDisallowed -= crawler_PageCrawlDisallowed;
                    _crawler.PageLinksCrawlDisallowed -= crawler_PageLinksCrawlDisallowed;
                }
                _crawler = null;
            }
        }        
Example #5
        private static IWebCrawler GetCustomBehaviorUsingLambdaWebCrawler()
        {
            IWebCrawler crawler = GetDefaultWebCrawler();

            //Register a lambda expression that will make Abot not crawl any url that has the word "ghost" in it.
            //For example http://a.com/ghost would not get crawled if the link were found during the crawl.
            //If you set the log4net log level to "DEBUG" you will see a log message when any page is not allowed to be crawled.
            //NOTE: This lambda is run after the regular ICrawlDecisionMaker.ShouldCrawlPage method is run.
            crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
            {
                if (pageToCrawl.Uri.AbsoluteUri.Contains("ghost"))
                    return new CrawlDecision { Allow = false, Reason = "Scared of ghosts" };

                return new CrawlDecision { Allow = true };
            });

            //Register a lambda expression that will tell Abot to not download the page content for any page after the 5th.
            //Abot will still make the http request but will not read the raw content from the stream.
            //NOTE: This lambda is run after the regular ICrawlDecisionMaker.ShouldDownloadPageContent method is run.
            crawler.ShouldDownloadPageContent((crawledPage, crawlContext) =>
            {
                if (crawlContext.CrawledCount >= 5)
                    return new CrawlDecision { Allow = false, Reason = "We already downloaded the raw page content for 5 pages" };

                return new CrawlDecision { Allow = true };
            });

            //Register a lambda expression that will tell Abot to not crawl links on any page that is not internal to the root uri.
            //NOTE: This lambda is run after the regular ICrawlDecisionMaker.ShouldCrawlPageLinks method is run.
            crawler.ShouldCrawlPageLinks((crawledPage, crawlContext) =>
            {
                if (!crawledPage.IsInternal)
                    return new CrawlDecision { Allow = false, Reason = "We dont crawl links of external pages" };

                return new CrawlDecision { Allow = true };
            });

            return crawler;
        }
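A minimal usage sketch for the crawler configured above (not part of the original example; the seed URL is a placeholder, and the synchronous Crawl overload is the one used by the other examples on this page):

            // Hypothetical caller; the seed URL is a placeholder.
            IWebCrawler crawler = GetCustomBehaviorUsingLambdaWebCrawler();
            CrawlResult result  = crawler.Crawl(new Uri("http://example.com"));

            if (result.ErrorOccurred)
                Console.WriteLine("Crawl of {0} completed with error: {1}",
                                  result.RootUri.AbsoluteUri, result.ErrorException.Message);
            else
                Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);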
Example #6
 public CategoriesParser(ILog logger, IWebCrawler crawler, ICategoryRepository categories, IUnitOfWork uow)
 {
     this.logger = logger;
     this.crawler = crawler;
     this.categories = categories;
     this.uow = uow;
 }
Example #7
        public async Task GetResult(IWebCrawler crawler, IList <string> results)
        {
            var result = await crawler.Get(fixture.Create <string>());

            output.WriteLine(result.Body);
            results.Add(result.Body);
        }
Example #8
        public ExtractorService(IServiceProvider serviceProvider, IHttpContextAccessor contextAccessor) : base(contextAccessor)
        {
            _webCrawler   = serviceProvider.GetRequiredService <IWebCrawler>();
            _imageService = serviceProvider.GetRequiredService <IImageService>();

            Configure();
        }
Example #9
        public CrawlResult Crawl()
        {
            IWebCrawler crawler = InitCrawler();

            Uri uriToCrawl = new Uri("http://rabota.ua/jobsearch/vacancy_list"); //http://rabota.ua/jobsearch/vacancy_list?pg=1000

            crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
            {
                if (pageToCrawl.Uri.AbsoluteUri.Contains(@"rabota.ua/jobsearch/vacancy_list") &&
                    !pageToCrawl.Uri.AbsoluteUri.Contains(@"period"))
                {
                    return new CrawlDecision { Allow = true };
                }

                return new CrawlDecision { Allow = false, Reason = "Parse only job pages" };
            });

            crawler.PageCrawlStartingAsync        += crawler_ProcessPageCrawlStarting;
            crawler.PageCrawlCompletedAsync       += crawler_ProcessPageCrawlCompleted;
            crawler.PageCrawlDisallowedAsync      += crawler_PageCrawlDisallowed;
            crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

            CrawlResult result = crawler.Crawl(uriToCrawl);

            return result;
        }
Example #10
 public ModellbahnshopLippe(IWebCrawler webCrawler, Uri startPage)
     : base(SystemClock.Instance,
            webCrawler,
            new Uri(@"https://www.modellbahnshop-lippe.com"),
            startPage)
 {
 }
Example #11
 public DataProcessor(IConsoleManager consoleManager, IStringHelper stringHelper, IWebCrawler webCrawler, IExcelLogger excelLogger, ILogger logger)
 {
     _consoleManager = consoleManager;
     _stringHelper   = stringHelper;
     _webCrawler     = webCrawler;
     _excelLogger    = excelLogger;
     _logger         = logger;
 }
Example #12
        private IWebCrawler CreateCrawlerInstance()
        {
            IWebCrawler crawler = _crawlerFactory.CreateInstance();

            crawler.PageCrawlCompleted += (s, e) => ProcessCrawledPage(e.CrawlContext, e.CrawledPage);

            return crawler;
        }
Example #13
 public DataProcessor(IConsoleManager consoleManager, IStringHelper stringHelper, IWebCrawler webCrawler, IExcelLogger excelLogger, ILogger logger)
 {
     _consoleManager = consoleManager;
     _stringHelper = stringHelper;
     _webCrawler = webCrawler;
     _excelLogger = excelLogger;
     _logger = logger;
 }
Example #14
 public MfcCrawler(IocContainer ioc, IUserRepository userRepository, IPrinter printer,
                   ICommunityDecider communityDecider, int waitTime = 200)
 {
     _userRepository   = userRepository;
     _printer          = printer;
     _communityDecider = communityDecider;
     crawler           = ioc.SteamCrawler(1000, waitTime);
 }
Example #15
        protected BaseWrapper(IClock clock, IWebCrawler webCrawler, Uri baseUri)
        {
            Parser  = new HtmlParser();
            BaseUri = baseUri;

            _clock      = clock;
            _webCrawler = webCrawler ??
                          throw new ArgumentNullException(nameof(webCrawler));
        }
Example #16
 public void SetUp()
 {
     var resourceLoader = new ResourceLoader();
     _rootFolder = ConfigurationManager.AppSettings["PathToTestFolder"];
     var fileSaver = new FileSaver();
     var pageLocator = new ResourceLocationManager(_rootFolder);
     
     _webCrawler = new WebCrawlerService.WebCrawler();
 }
Example #17
        private static IWebCrawler GetCustomBehaviorUsingLambdaWebCrawler()
        {
            IWebCrawler crawler = GetManuallyConfiguredWebCrawler(siteToCrawl);

            //Register a lambda expression that will make Abot not crawl any url that has the word "ghost" in it.
            //For example http://a.com/ghost would not get crawled if the link were found during the crawl.
            //If you set the log4net log level to "DEBUG" you will see a log message when any page is not allowed to be crawled.
            //NOTE: This lambda is run after the regular ICrawlDecisionMaker.ShouldCrawlPage method is run.
            crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
            {
                //if (!pageToCrawl.Uri.AbsoluteUri.Contains("chicken") && !pageToCrawl.Uri.AbsoluteUri.Contains("Chicken"))
                if (!pageToCrawl.Uri.AbsoluteUri.Contains(category.Replace(" ", "+")) || /*pageToCrawl.Uri.AbsoluteUri.Contains("navid")||*/ pageToCrawl.Uri.AbsoluteUri.Contains("_KG") || pageToCrawl.Uri.AbsoluteUri.Contains("_EA"))
                    return new CrawlDecision { Allow = false, Reason = "I only crawl the right pages" };

                return new CrawlDecision { Allow = true };
            });

            //Register a lambda expression that will tell Abot to not download the page content for any page after the 5th.
            //Abot will still make the http request but will not read the raw content from the stream.
            //NOTE: This lambda is run after the regular ICrawlDecisionMaker.ShouldDownloadPageContent method is run.

            /*crawler.ShouldDownloadPageContent((crawledPage, crawlContext) =>
             * {
             *  if (crawlContext.CrawledCount >= 5)
             *      return new CrawlDecision { Allow = false, Reason = "We already downloaded the raw page content for 5 pages" };
             *
             *  return new CrawlDecision { Allow = true };
             * });*/

            //Register a lambda expression that will tell Abot to not crawl links on any page that is not internal to the root uri.
            //NOTE: This lambda is run after the regular ICrawlDecisionMaker.ShouldCrawlPageLinks method is run.
            crawler.ShouldCrawlPageLinks((crawledPage, crawlContext) =>
            {
                CrawlDecision decision = new CrawlDecision { Allow = true };

                if (crawledPage.Content.Bytes.Length < 100)
                    return new CrawlDecision { Allow = false, Reason = "Just crawl links in pages that have at least 100 bytes" };

                return decision;
            });

            return crawler;
        }
Example #18
 public WebTester(CancellationTokenSource tokenSource, TestOptions options)
 {
     Context = new WebTestContext
     {
         TestOptions             = options ?? new TestOptions(),
         CancellationTokenSource = tokenSource ?? new CancellationTokenSource()
     };
     BlockInit();
     _crawler = new CrawlerLight();
     _crawler.PageCrawlCompletedAsync += _crawler_PageCrawlCompleted;
 }
Example #19
        public Crawler(string argUrl)
        {
            TheWebCrawler = GetManuallyConfiguredWebCrawler();

            TheWebCrawler.PageCrawlCompleted += PageCrawlCompletedEvent;
            TheWebCrawler.PageCrawlDisallowed += PageCrawlDisallowedEvent;
            TheWebCrawler.PageCrawlStarting += PageCrawlStartingEvent;
            TheWebCrawler.PageLinksCrawlDisallowed += PageLinksCrawlDisallowedEvent;

            var crawlResult = TheWebCrawler.Crawl(new Uri(argUrl));
        }
Example #20
        public Crawler(string argUrl)
        {
            TheWebCrawler = GetManuallyConfiguredWebCrawler();

            TheWebCrawler.PageCrawlCompleted       += PageCrawlCompletedEvent;
            TheWebCrawler.PageCrawlDisallowed      += PageCrawlDisallowedEvent;
            TheWebCrawler.PageCrawlStarting        += PageCrawlStartingEvent;
            TheWebCrawler.PageLinksCrawlDisallowed += PageLinksCrawlDisallowedEvent;

            var crawlResult = TheWebCrawler.Crawl(new Uri(argUrl));
        }
Example #21
        private static IWebCrawler GetCustomBehaviorUsingLambdaWebCrawler()
        {
            IWebCrawler crawler = GetDefaultWebCrawler();

            crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
            {
                if (pageToCrawl.Uri.AbsoluteUri.Contains("p.html") || pageToCrawl.Uri.AbsoluteUri.Contains("p.html"))
                    return new CrawlDecision { Allow = true };

                return new CrawlDecision { Allow = false, Reason = "Incorrect subdomain" };
            });


            crawler.ShouldDownloadPageContent((crawledPage, crawlContext) =>
            {
                if (crawlContext.CrawledCount >= 5)
                    return new CrawlDecision { Allow = false, Reason = "We already downloaded the raw page content for 5 pages" };

                return new CrawlDecision { Allow = true };
            });

            crawler.ShouldCrawlPageLinks((crawledPage, crawlContext) =>
            {
                if (!crawledPage.IsInternal)
                    return new CrawlDecision { Allow = false, Reason = "We dont crawl links of external pages" };

                return new CrawlDecision { Allow = true };
            });

            return crawler;
        }
Example #22
 /// <summary>
 /// Downloads the comment-page content for a given product from the ceneo.pl and skapiec.pl services
 /// </summary>
 public HttpCommentGeter(string productId, IStatisctics statistic)
 {
     product = new Product();
     string pageName = "http://www.ceneo.pl/" + productId + "#tab=reviews";
     fillProductPropertis(product, pageName);
     m_webCrawlerCeneo = new CeneoWebCrawler(pageName);
     m_webCrawlerCeneo.getPagesContent( statistic, product);
     ILinkToProductFinder productFinder = new SkapiecLinkToProductFinder();
     string foundProduct = productFinder.getLinkToProduct(product);
     if (foundProduct != null)
     {
         m_webCrawlerSkapiec = new SkapiecWebCrawler("http://www.skapiec.pl" + productFinder.getLinkToProduct(product) + "#opinie");
         m_webCrawlerSkapiec.getPagesContent(statistic, product);
     }
 }
Example #23
        public void CrawlAndAssert(IWebCrawler crawler)
        {
            crawler.PageCrawlCompletedAsync += crawler_PageCrawlCompleted;

            CrawlResult result = crawler.Crawl(_rootUri);

            Assert.IsNull(result.ErrorException);
            Assert.IsFalse(result.ErrorOccurred);
            Assert.AreSame(_rootUri, result.RootUri);

            List<Discrepancy> descrepancies = GetDescrepancies();
            PrintDescrepancies(descrepancies);

            Assert.AreEqual(0, descrepancies.Count, "There were discrepancies between expected and actual crawl results. See ouput window for details.");
            Assert.IsTrue(result.Elapsed.TotalSeconds < _maxSecondsToCrawl, string.Format("Elapsed Time to crawl {0}, over {1} second threshold", result.Elapsed.TotalSeconds, _maxSecondsToCrawl));
        }
Example #24
        public DomainCrawlResult Consume(Domain domain, CancellationTokenSource cancellationToken)
        {
            if (domain == null)
            {
                throw new ArgumentNullException("domain");
            }

            if (cancellationToken == null)
            {
                throw new ArgumentNullException("cancellationToken");
            }

            IEnumerable <ICrawlProcessor> processors = _processorProvider.GetProcessors().ToList();//have to .ToList() since the deferred execution will cause a new instance of each processor to be created with every page
            IWebCrawler crawler = CreateCrawlerInstance();

            DomainCrawlResult domainCrawlResult = new DomainCrawlResult();

            domainCrawlResult.Domain = domain;
            try
            {
                crawler.CrawlBag.GoDaddyProcessorContext = new ProcessorContext
                {
                    Domain = domain,
                    PrimaryPersistenceProvider = _processorContext.PrimaryPersistenceProvider,
                    BackupPersistenceProvider  = _processorContext.BackupPersistenceProvider,
                    CrawlProcessors            = processors
                };

                domainCrawlResult.CrawlResult = crawler.Crawl(domain.Uri, cancellationToken);

                ProcessCrawledDomain(domainCrawlResult.CrawlResult.CrawlContext);
            }
            catch (Exception ex)
            {
                string errorMessage = string.Format("Exception occurred while crawling [{0}], error: [{1}]", domain.Uri.AbsoluteUri, ex.Message);
                domainCrawlResult.CrawlResult = new CrawlResult {
                    ErrorException = ex
                };

                _logger.ErrorFormat(errorMessage, ex);
                //TODO Statsg fatal error occurred during crawl
                StatsGLoggerAppender.LogItem(StatLogType.CrawlDaddy_FatalErrorOccured, _config);
            }

            LogCrawlResult(domainCrawlResult.CrawlResult);
            return domainCrawlResult;
        }
Example #25
        public bool InitializeCrawler(string seedUrl, int sessionId, int crawlerId, CrawlConfiguration config)
        {
            _config = config;

            //check if a crawl is already defined
            var existingRun = _repo.GetCrawl(sessionId, crawlerId);

            if (existingRun != null)
            {
                var mssg = string.Format("CrawlerRun exists with sessionId: {0} and crawlerId: {1}; cancelling run ...", sessionId, crawlerId);
                _logger.Error(mssg);
                return false;
            }
            Seed = new Uri(seedUrl);
            CrawlerDefinition = new CrawlerRun()
            {
                SessionId  = sessionId,
                SeedUrl    = Seed.AbsoluteUri,
                CrawlerId  = crawlerId,
                BaseDomain = Seed.GetBaseDomain()
            };
            _repo.AddCrawl(CrawlerDefinition);
            _scheduler = new MyScheduler(new LogicProvider(), CrawlerDefinition, _repo);

            _crawler = new PoliteWebCrawler(_config, null, null, _scheduler, null, null, null, null, null);
            _crawler.CrawlBag.SessionId = CrawlerDefinition.SessionId;
            _crawler.CrawlBag.CrawlerId = CrawlerDefinition.CrawlerId;
            _crawler.ShouldScheduleLink(ShouldScheduleLink);
            _crawler.ShouldCrawlPage(ShouldCrawlPage);

            if (IsAsync)
            {
                _crawler.PageCrawlStartingAsync        += crawler_ProcessPageCrawlStarting;
                _crawler.PageCrawlCompletedAsync       += crawler_ProcessPageCrawlCompleted;
                _crawler.PageCrawlDisallowedAsync      += crawler_PageCrawlDisallowed;
                _crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;
            }
            else
            {
                _crawler.PageCrawlStarting        += crawler_ProcessPageCrawlStarting;
                _crawler.PageCrawlCompleted       += crawler_ProcessPageCrawlCompleted;
                _crawler.PageCrawlDisallowed      += crawler_PageCrawlDisallowed;
                _crawler.PageLinksCrawlDisallowed += crawler_PageLinksCrawlDisallowed;
            }

            return true;
        }
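A hedged sketch of how this initializer might be driven (not part of the original example; the "CrawlWorker" type name, the seed URL, and the configuration values are placeholders, and only the InitializeCrawler signature above comes from the example):

            // Hypothetical driver; type name, seed URL, and config values are placeholders.
            var config = new CrawlConfiguration { MaxPagesToCrawl = 1000, MaxConcurrentThreads = 5 };
            var worker = new CrawlWorker();

            if (worker.InitializeCrawler("http://example.com", sessionId: 1, crawlerId: 1, config: config))
            {
                // The example stores the configured PoliteWebCrawler in _crawler; actually
                // starting the crawl (e.g. via an instance Crawl() method) would follow here.
            }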
Example #26
        public async Task CrawlAndAssert(IWebCrawler crawler)
        {
            crawler.PageCrawlCompleted += crawler_PageCrawlCompleted;

            CrawlResult result = await crawler.CrawlAsync(_rootUri);

            Assert.IsNull(result.ErrorException);
            Assert.IsFalse(result.ErrorOccurred);
            Assert.AreSame(_rootUri, result.RootUri);

            List <Discrepancy> descrepancies = GetDescrepancies();

            PrintDescrepancies(descrepancies);

            Assert.AreEqual(0, descrepancies.Count, "There were discrepancies between expected and actual crawl results. See ouput window for details.");
            Assert.IsTrue(result.Elapsed.TotalSeconds < _maxSecondsToCrawl, string.Format("Elapsed Time to crawl {0}, over {1} second threshold", result.Elapsed.TotalSeconds, _maxSecondsToCrawl));
        }
Example #27
        public CrawlerViewModel Crawl(CrawlerViewModel viewModel)
        {
            if (!Helper.IsValidUrl(viewModel.UrlToCrawl))
            {
                viewModel.ErrorMsg = String.Format(" Please enter mail adress");
                return viewModel;
            }

            allLinksOnPage = new List <Uri>();
            CrawlConfiguration config = new CrawlerNetConfig().Initalize();

            this.crawler = new PoliteWebCrawler(config);

            crawler.PageCrawlCompleted += crawler_PageCrawlCompleted;

            //

            CrawlResult result = crawler.Crawl(new Uri(viewModel.UrlToCrawl));

            if (result.ErrorOccurred)
            {
                viewModel.ErrorMsg = String.Format("Crawler completed with error: {0}", result.ErrorException.Message);
            }

            var isProd = Convert.ToBoolean(ConfigurationManager.AppSettings["IsProd"].ToString());

            if (isProd)
            {
                viewModel.CrawledLinks.AddRange(allLinksOnPage);
            }
            else
            {
                viewModel.CrawledLinks.AddRange(allLinksOnPage.Take(10));
            }

            viewModel.SuccessMsg = " Successfully Listed !";

            return viewModel;
        }
Example #28
        private static IWebCrawler GetCustomBehaviorUsingLambdaWebCrawler()
        {
            IWebCrawler crawler = GetDefaultWebCrawler();

            crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
            {
                if (pageToCrawl.Uri.AbsoluteUri.Contains("ru/news/"))
                    return new CrawlDecision { Allow = true };

                return new CrawlDecision { Allow = false };
            });

            crawler.ShouldCrawlPageLinks((crawledPage, crawlContext) =>
            {
                if (!crawledPage.IsInternal)
                    return new CrawlDecision { Allow = false, Reason = "We dont crawl links of external pages" };

                return new CrawlDecision { Allow = true };
            });

            return crawler;
        }
Example #29
        public CrawlResult Crawl()
        {
            IWebCrawler crawler = InitCrawler();

            //http://hh.ua/search/vacancy?no_magic=true&items_on_page=100&currency_code=UAH&clusters=true&page=0    //-- ALL
            // --- UA
            Uri uriToCrawl = new Uri("http://hh.ua/search/vacancy?no_magic=true&items_on_page=100&clusters=true&currency_code=UAH&area=5&page=0");


            //var urlPattern=@"^http://hh\.ua/search/vacancy\?no_magic=true&items_on_page=100&currency_code=UAH&clusters=true&page=[0-9]+$"; // -- ALL
            var urlPattern = @"^http://hh\.ua/search/vacancy\?no_magic=true&items_on_page=100&clusters=true&currency_code=UAH&area=5&page=[0-9]+$"; // -- UA

            crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
            {
                if (Regex.IsMatch(pageToCrawl.Uri.ToString(), urlPattern, RegexOptions.IgnoreCase))
                    return new CrawlDecision { Allow = true };

                return new CrawlDecision { Allow = false, Reason = "Parse only job pages" };
            });

            crawler.PageCrawlStartingAsync        += crawler_ProcessPageCrawlStarting;
            crawler.PageCrawlCompletedAsync       += crawler_ProcessPageCrawlCompleted;
            crawler.PageCrawlDisallowedAsync      += crawler_PageCrawlDisallowed;
            crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

            CrawlResult result = crawler.Crawl(uriToCrawl);

            return result;
        }
Example #30
        private static IWebCrawler GetCustomBehaviorUsingLambdaWebCrawler()
        {
            IWebCrawler crawler = GetManuallyConfiguredWebCrawler();

            crawler.ShouldDownloadPageContent((crawledPage, crawlContext) => new CrawlDecision {
                Allow = true
            });

            crawler.ShouldCrawlPageLinks((crawledPage, crawlContext) =>
            {
                if (!crawledPage.IsInternal)
                    return new CrawlDecision { Allow = false, Reason = "We dont crawl links of external pages" };

                return new CrawlDecision { Allow = true };
            });

            return crawler;
        }
Example #31
        public void Dispose()
        {
            var disposable = _provider as IDisposable;

            if (disposable != null)
            {
                disposable.Dispose();
            }
            _provider = null;

            disposable = _repo as IDisposable;
            if (disposable != null)
            {
                disposable.Dispose();
            }
            _repo = null;

            if (_crawler != null)
            {
                if (IsAsync)
                {
                    _crawler.PageCrawlStartingAsync        -= crawler_ProcessPageCrawlStarting;
                    _crawler.PageCrawlCompletedAsync       -= crawler_ProcessPageCrawlCompleted;
                    _crawler.PageCrawlDisallowedAsync      -= crawler_PageCrawlDisallowed;
                    _crawler.PageLinksCrawlDisallowedAsync -= crawler_PageLinksCrawlDisallowed;
                }
                else
                {
                    _crawler.PageCrawlStarting        -= crawler_ProcessPageCrawlStarting;
                    _crawler.PageCrawlCompleted       -= crawler_ProcessPageCrawlCompleted;
                    _crawler.PageCrawlDisallowed      -= crawler_PageCrawlDisallowed;
                    _crawler.PageLinksCrawlDisallowed -= crawler_PageLinksCrawlDisallowed;
                }
                _crawler = null;
            }
        }
Example #32
 public GroundForcesScraper(IWebCrawler webCrawler, IConsoleManager consoleManager)
 {
     _webCrawler = webCrawler;
     _consoleManager = consoleManager;
 }
Example #33
        private static IWebCrawler GetCustomBehaviorUsingLambdaWebCrawler()
        {
            IWebCrawler crawler = GetDefaultWebCrawler();

            //Register a lambda expression that will make Abot not crawl any url that has the word "ghost" in it.
            //For example http://a.com/ghost would not get crawled if the link were found during the crawl.
            //If you set the log4net log level to "DEBUG" you will see a log message when any page is not allowed to be crawled.
            //NOTE: This lambda is run after the regular ICrawlDecisionMaker.ShouldCrawlPage method is run.
            crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
            {
                CrawlDecision decision;

                //if (pageToCrawl.Uri == crawlContext.RootUri
                //    ||
                //    (pageToCrawl.Uri == new Uri((string)crawlContext.CrawlBag.PriceUri))
                //    )
                //    //pageToCrawl.Uri.PathAndQuery.Contains("tid=10301"))
                //{
                //    decision = new CrawlDecision { Allow = true };
                //}
                //else
                //{
                //    decision = new CrawlDecision { Allow = false,  Reason = "Not the home page or price info" };
                //}

                decision = new CrawlDecision {
                    Allow = true
                };
                Log($"ShouldCrawlPage {decision.Allow} ", pageToCrawl.Uri.AbsoluteUri);

                return decision;
                //if (pageToCrawl.Uri.AbsoluteUri.Contains("ghost"))
                //    return new CrawlDecision { Allow = false, Reason = "Scared of ghosts" };

                //return new CrawlDecision { Allow = true };
            });

            //Register a lambda expression that will tell Abot to not download the page content for any page after the 5th.
            //Abot will still make the http request but will not read the raw content from the stream.
            //NOTE: This lambda is run after the regular ICrawlDecisionMaker.ShouldDownloadPageContent method is run.
            crawler.ShouldDownloadPageContent((crawledPage, crawlContext) =>
            {
                CrawlDecision decision;


                if (crawledPage.Uri == crawlContext.RootUri ||
                    crawledPage.Uri.PathAndQuery.Contains("tid=10301"))
                {
                    decision = new CrawlDecision {
                        Allow = true
                    };
                }
                else
                {
                    decision = new CrawlDecision {
                        Allow = false, ShouldStopCrawl = true, Reason = "Not the home page or price info"
                    };
                }

                Log($"ShouldDownloadPageContent {decision.Allow} ", crawledPage.Uri.AbsoluteUri);

                return decision;

                //if (crawlContext.CrawledCount >= 5)
                //    return new CrawlDecision { Allow = false, Reason = "We already downloaded the raw page content for 5 pages" };


                //return new CrawlDecision { Allow = true };
            });

            //Register a lambda expression that will tell Abot to not crawl links on any page that is not internal to the root uri.
            //NOTE: This lambda is run after the regular ICrawlDecisionMaker.ShouldCrawlPageLinks method is run.
            crawler.ShouldCrawlPageLinks((crawledPage, crawlContext) =>
            {
                Log("ShouldCrawlPageLinks", crawledPage.Uri.AbsoluteUri);

                //if (crawledPage.Uri == crawlContext.RootUri)
                //{
                //    return new CrawlDecision { Allow = true };
                //}

                return new CrawlDecision {
                    Allow = false, Reason = "Links need to be crawled manually"
                };
            });

            return crawler;
        }
Example #34
 public RbotProvider(IWebCrawler Crawler)
 {
     this.Crawler = Crawler;
 }
Example #35
        public bool InitializeCrawler(string seedUrl, int sessionId, int crawlerId, CrawlConfiguration config)
        {
            _config = config;

            //check if a crawl is already defined
            var existingRun = _repo.GetCrawl(sessionId, crawlerId);
            if (existingRun != null)
            {
                var mssg = string.Format("CrawlerRun exists with sessionId: {0} and crawlerId: {1}; cancelling run ...", sessionId, crawlerId);
                _logger.Error(mssg);
                return false;
            }
            Seed = new Uri(seedUrl);
            CrawlerDefinition = new CrawlerRun()
            {
                SessionId = sessionId,
                SeedUrl = Seed.AbsoluteUri,
                CrawlerId = crawlerId,
                BaseDomain = Seed.GetBaseDomain()
            };
            _repo.AddCrawl(CrawlerDefinition);
            _scheduler = new MyScheduler(new LogicProvider(), CrawlerDefinition, _repo);

            _crawler = new PoliteWebCrawler(_config, null, null, _scheduler, null, null, null, null, null);
            _crawler.CrawlBag.SessionId = CrawlerDefinition.SessionId;
            _crawler.CrawlBag.CrawlerId = CrawlerDefinition.CrawlerId;
            _crawler.ShouldScheduleLink(ShouldScheduleLink);
            _crawler.ShouldCrawlPage(ShouldCrawlPage);

            if (IsAsync)
            {
                _crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
                _crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
                _crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
                _crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;
            }
            else
            {
                _crawler.PageCrawlStarting += crawler_ProcessPageCrawlStarting;
                _crawler.PageCrawlCompleted += crawler_ProcessPageCrawlCompleted;
                _crawler.PageCrawlDisallowed += crawler_PageCrawlDisallowed;
                _crawler.PageLinksCrawlDisallowed += crawler_PageLinksCrawlDisallowed;
            }

            return true;
        }
Example #36
 protected WrapperWithStartPage(IClock clock, IWebCrawler webCrawler, Uri baseUri, Uri startPage)
     : base(clock, webCrawler, baseUri)
 {
     StartPage = startPage;
 }
Example #37
        private static IWebCrawler GetCustomBehaviorUsingLambdaWebCrawler()
        {
            IWebCrawler crawler = GetDefaultWebCrawler();

            //Register a lambda expression that will make Abot not crawl any url that has the word "ghost" in it.
            //For example http://a.com/ghost would not get crawled if the link were found during the crawl.
            //If you set the log4net log level to "DEBUG" you will see a log message when any page is not allowed to be crawled.
            //NOTE: This lambda is run after the regular ICrawlDecisionMaker.ShouldCrawlPage method is run.
            crawler.ShouldCrawlPageDecisionMaker = (pageToCrawl, crawlContext) =>
            {
                if (pageToCrawl.Uri.AbsoluteUri.Contains("ghost"))
                    return new CrawlDecision { Allow = false, Reason = "Scared of ghosts" };

                return new CrawlDecision { Allow = true };
            };

            //Register a lambda expression that will tell Abot to not download the page content for any page after the 5th.
            //Abot will still make the http request but will not read the raw content from the stream.
            //NOTE: This lambda is run after the regular ICrawlDecisionMaker.ShouldDownloadPageContent method is run.
            crawler.ShouldDownloadPageContentDecisionMaker = (crawledPage, crawlContext) =>
            {
                if (crawlContext.CrawledCount >= 5)
                    return new CrawlDecision { Allow = false, Reason = "We already downloaded the raw page content for 5 pages" };

                return new CrawlDecision { Allow = true };
            };

            //Register a lambda expression that will tell Abot to not crawl links on any page that is not internal to the root uri.
            //NOTE: This lambda is run after the regular ICrawlDecisionMaker.ShouldCrawlPageLinks method is run.
            crawler.ShouldCrawlPageLinksDecisionMaker = (crawledPage, crawlContext) =>
            {
                if (!crawledPage.IsInternal)
                    return new CrawlDecision { Allow = false, Reason = "We don't crawl links of external pages" };

                return new CrawlDecision { Allow = true };
            };

            return crawler;
        }

        private static Uri GetSiteToCrawl(string[] args)
        {
            string userInputUrl = string.Empty;

            if (args.Length < 1)
            {
                System.Console.WriteLine("Please, enter ABSOLUTE url to crawl (for ex.: https://github.com ):");
                userInputUrl = System.Console.ReadLine();
            }
            else
            {
                userInputUrl = args[0];
            }

            var isAbsoluteUri = Uri.TryCreate(userInputUrl, UriKind.Absolute, out Uri result);

            if (string.IsNullOrWhiteSpace(userInputUrl) || !isAbsoluteUri)
            {
                throw new ApplicationException("Requare absolute url, without white spaces and not empty");
            }

            return result;
        }
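For completeness, a hypothetical entry point that wires the two helpers above together (not part of the original example; it assumes the synchronous Crawl overload used by the other examples on this page):

        public static void Main(string[] args)
        {
            // Hypothetical composition of the two helpers defined above.
            Uri siteToCrawl     = GetSiteToCrawl(args);
            IWebCrawler crawler = GetCustomBehaviorUsingLambdaWebCrawler();

            CrawlResult result = crawler.Crawl(siteToCrawl);

            System.Console.WriteLine("Crawl of {0} completed. Error occurred: {1}",
                                     result.RootUri.AbsoluteUri, result.ErrorOccurred);
        }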
Example #38
 public AdminController(IWebCrawler webCrawler, ApplicationDbContext db, ITime time)
 {
     _webCrawler = webCrawler;
     _db         = db;
     _time       = time;
 }
Example #39
 public RepositoryUnitTest(RawDataFixture rawDataFixture)
 {
     _rawDataFixture   = rawDataFixture;
     _globalWebCrawler = rawDataFixture.CreateGlobalWebCrawler(10);
     _detailWebCrawler = rawDataFixture.CreateDetailWebCrawler(10);
 }